/* code for nbody : sticky9.c 
   (with GRAPE, leap-flog scheme)
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#define DIM 3
#define REAL double
#define NMAX 270000 

#pragma goose func
double rsqrt(double r2)
{
  return 1.0/sqrt(r2);
}

void
force(double (*x)[3], double *m, double eps2,
      double (*a)[3], double *pot, int n)
{
    double r, r2, rinv, mrinv, mr3inv, dx[3];
    int i, j, k;

// #pragma omp parallel for private(j,dx,r2,rinv,mrinv,mr3inv,k)

#pragma goose parallel for precision ("double") loopcounter(i, j) \
nip_pack(1) // pack 4 IPs for better performance on GRAPE-DR, 1 on GTX280 and HD4850.

  for(i=0;i<n;i++) {
    for(k=0;k<3;k++) a[i][k] = 0.0;
    pot[i] = 0.0;
    for (j=0;j<n;j++) {
      for(k=0;k<3;k++) dx[k] = x[j][k] - x[i][k];
      r2 = dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] + eps2;
      rinv = rsqrt(r2);
      mrinv = m[j]*rinv;
      mr3inv = mrinv*rinv*rinv;
      a[i][0] += mr3inv * dx[0];
      a[i][1] += mr3inv * dx[1];
      a[i][2] += mr3inv * dx[2];      
      pot[i] -= mrinv;
    }
  }

  for(i=0;i<n;i++) pot[i] += m[i]/sqrt(eps2);
}

void get_cputime(double *laptime, double *splittime)
{
	struct timeval tval;
	struct timezone *dum=0;

	gettimeofday(&tval,dum);
	*laptime = tval.tv_sec + tval.tv_usec * 1.0e-6 - *splittime;
	*splittime = tval.tv_sec + tval.tv_usec * 1.0e-6 ;

}

void compare_force(REAL (*a0)[3], REAL (*a1)[3], int n)
{
    int i, k, nerr;
    double da2, a2, relerr;

    nerr = 0;
    for (i = 0; i < n; i++) {
	da2 = 0.0;
	a2 = 0.0;
	for (k = 0; k < 3; k++) {
	    da2 += (a0[i][k] - a1[i][k]) * (a0[i][k] - a1[i][k]);
	    a2 += a0[i][k] * a0[i][k];
	}
	relerr = sqrt(da2/a2);
	//	if (relerr > 1e-2) {
	if (relerr > 4e-8) {	  
#if 1
	    fprintf(stderr, "%6d g %8.5f %8.5f %8.5f\n", i, a0[i][0], a0[i][1], a0[i][2]);
	    fprintf(stderr, "       h %8.5f %8.5f %8.5f\n", a1[i][0], a1[i][1], a1[i][2]);
	    fprintf(stderr, "relative error: %22.15e\n", relerr);
#endif
	    nerr++;
	}
    }
    fprintf(stderr, "compare_force: # of error(s) larger than 1%%: %d\n", nerr);
}

void read_nemoformat(FILE *fp,
                     int *n,
                     int *dim,
                     REAL *t,
                     REAL m[NMAX],
                     REAL x[NMAX][DIM],
                     REAL v[NMAX][DIM])
{
    unsigned long int index,i;

    fscanf(fp,"%d%d%lf",n,dim,t);
    index = *n;
    for(i=0;i<index;i++) fscanf(fp,"%lf",&m[i]);
    for(i=0;i<index;i++)fscanf(fp,"%lf%lf%lf",&x[i][0],&x[i][1],&x[i][2]);
    for(i=0;i<index;i++)fscanf(fp,"%lf%lf%lf",&v[i][0],&v[i][1],&v[i][2]);
}	


void data_input(REAL x[NMAX][DIM],
                REAL v[NMAX][DIM],
                REAL m[NMAX],
                int *n,
                char filename[],
                REAL *time)
{
    int dum;
    FILE *fpinput;

    fpinput = fopen(filename,"r");
    if (!fpinput) {
      perror("data_input");
      exit(1);
    }

    read_nemoformat(fpinput,n,&dum,time,m,x,v);
    fclose(fpinput);

    printf("initialdata end\n");
}

void energy(REAL t,
            REAL pot[NMAX],
            REAL x[NMAX][DIM],
            REAL v[NMAX][DIM],
            REAL m[NMAX],
            int n,
            REAL init_ene,
            REAL icm[DIM])
{
    REAL total_pot=0,total_kin=0;
    int i,k;	
    REAL cm[DIM];

    for(i=0;i<n;i++) total_pot += m[i]*pot[i];
    total_pot *= 0.5;

    for(i=0;i<n;i++) total_kin += m[i]*(v[i][0]*v[i][0]+v[i][1]*v[i][1]+v[i][2]*v[i][2]);
    total_kin *= 0.5;

    for(k=0;k<DIM;k++) cm[k] = 0.0;
    for(i=0;i<n;i++){
	for(k=0;k<DIM;k++) cm[k] += x[i][k] * m[i];
    }

    printf("time = %g\n",t);
    printf("pot = %22.15e kin = %22.15e \n total= %22.15e ratio = %e\n",
	   total_pot,total_kin,total_pot+total_kin,total_kin/total_pot);
    printf(" DE = %e %g\n",(init_ene-(total_pot+total_kin))/(total_pot+total_kin),t);
    printf(" DCM= %22.15e %22.15e %22.15e \n",icm[0]-cm[0],icm[1]-cm[1],icm[2]-cm[2]);
}

void initial_energy(REAL pot[NMAX],
                    REAL x[NMAX][DIM],
                    REAL v[NMAX][DIM],
                    REAL m[NMAX],
                    int n,
                    REAL *init_ene,
                    REAL cm[DIM])
{
    REAL total_pot;
    REAL total_kin;
    int i,k;	
	
    total_pot = 0.0; 
    total_kin = 0.0; 

    for(i=0;i<n;i++)  total_pot += m[i]*pot[i];
    total_pot *= 0.5;

    for(i=0;i<n;i++) total_kin += m[i]*(v[i][0]*v[i][0]+v[i][1]*v[i][1]+v[i][2]*v[i][2]);
    total_kin *= 0.5;

    printf("pot = %22.15e kin = %22.15e \n total= %22.15e ratio = %e\n",
	   total_pot,total_kin,total_pot+total_kin, total_kin/total_pot);
    *init_ene = total_pot+ total_kin;

    for(k=0;k<DIM;k++) cm[k] = 0.0;
    for(i=0;i<n;i++){
	for(k=0;k<DIM;k++) cm[k] += x[i][k] * m[i];
    }
}

void push_velocity(REAL v[NMAX][DIM],
                   REAL a[NMAX][DIM],
                   REAL dt,
                   int n)
{
    int j,k;

    for(j=0;j<n;j++){
	for(k=0;k<DIM;k++) v[j][k] += dt*a[j][k];
    }
}

void push_position(REAL x[NMAX][DIM],
                   REAL v[NMAX][DIM],
                   REAL dt,
                   int n)
{
    int j,k;	

    for(j=0;j<n;j++){
	for(k=0;k<DIM;k++) x[j][k] += dt*v[j][k];
    }
}

main()
{
    static REAL x[NMAX][DIM];
    static REAL v[NMAX][DIM];
    static REAL m[NMAX];
    static REAL a[NMAX][DIM];
    static REAL pot[NMAX];
    static REAL ah[NMAX][DIM];
    static REAL poth[NMAX];
    REAL dt,eps,init_ene,time=0.0;
    REAL deouttime,eouttime,idtinv,endtime,epsinv;
    REAL icm[DIM],ratio;
    FILE *fp2;
    int n,i,k;
    char filename[200];
    double lt=0.0, st=0.0;
    double hlt=0.0, hst=0.0;
    double holdtime;
    double xsize=2.0,rotint=2.0,sustained=0.0;
    int simid;
    double gintrps;
    double peak;

    fp2 = fopen("inputpara9","r");
    if (!fp2) {
      perror("main");
      exit(1);
    }

    fscanf(fp2,"%lf%lf%lf%lf",&epsinv,&idtinv,&endtime,&deouttime);
    fscanf(fp2,"%s",filename);
    fclose(fp2);
    /******		(input	para)
			1.0/eps: softening parameter (9999=> eps=0)
			1.0/idt: initial timestep
			endtime: end time
			deoutime: interval of energy output
			filename: input file name
    ********/	

    dt = 1.0/idtinv;
    if(epsinv==9999.0){
	eps = 0.0;
    }else{
	eps = 1.0/epsinv;
    }

    printf("eps:%g dt:%g \ndeouttime:%g endtime:%g \n",
	   eps,dt,deouttime,endtime);
    printf("inputfile %s \n",filename);

    data_input(x,v,m,&n,filename,&time);

    printf("m %g x %g %g %g\n",m[0],x[0][0],x[0][1],x[0][2]);

    eouttime= deouttime + time;

    get_cputime(&hlt,&hst);
    holdtime = hlt;

    force(x,m,eps*eps,a,pot,n);

    initial_energy(pot,x,v,m,n,&init_ene,icm);
    fflush(stdout);

#if COMPARE_WITH_HOST
	force(x,m,eps*eps,ah,poth,n);
	compare_force(a, ah, n);
	//	for(i=0;i<n;i++) printf("i %d pot %e poth %e %e\n",i,pot[i],poth[i],pot[i]-poth[i]);
#endif
    get_cputime(&lt,&st);
    while(time < endtime){
	static int step = 0;
	push_velocity(v,a,0.5*dt,n);
	push_position(x,v,dt,n);

	time += dt;
//	fprintf(stderr, "step: %d\n", step++);

	force(x,m,eps*eps,a,pot,n);	

#if COMPARE_WITH_HOST
	force(x,m,eps*eps,ah,poth,n);
	compare_force(a, ah, n);
#endif
	push_velocity(v,a,0.5*dt,n);

       if( time >= eouttime) {
	    energy(time,pot,x,v,m,n,init_ene,icm);
	    eouttime += deouttime;
	    get_cputime(&lt,&st);
            printf("cputime %e %e %e\n",lt,st,st);
	    sustained = 38.0*((double)n)*((double)n)
		*(deouttime/dt)/(lt)/1e9;
	    //	    peak = 38.0 * g5_get_number_of_pipelines() * g5_get_pcibus_freq() / 1000.0;
            gintrps = ((double)n)*((double)n)*(deouttime/dt)/(lt)/1e9;
            printf("speed %g G interactions/s (%g Gflops)\n",
                   gintrps, 38.0*gintrps);
	    get_cputime(&lt,&st);
	    fflush(stdout);
	}
    }
}

