/*
 * code for nbody : sticky8.c
 * (4nd-order predictor-corrector scheme and Individual time step)
 *
 * this version predicts positions for ni particles on the host.
 * nj particles on the hardware.
 */

#include <stdlib.h>
#include "./sticky.h"

void force_on_ith_particle(int i,
                           REAL xi[DIM], REAL vi[DIM],
                           REAL x[NMAX][DIM], REAL v[NMAX][DIM], REAL m[NMAX], REAL eps,
                           REAL ai[DIM], REAL adoti[DIM], REAL *poti,
                           int n)
{
    int j,d,k;
    REAL r2,r3inv,r2inv,rinv,eps2,xdotv;
    REAL r5inv,xdotvr5inv,r3invdx;
    REAL r3invdvetc;
    REAL dx[DIM];
    REAL dv[DIM];
	
    for(k=0;k<DIM;k++){
        ai[k] = 0.0;
        adoti[k] = 0.0;
    }
    *poti = 0.0;
    eps2 = eps*eps;

    for(j=0;j<n;j++){
        if(j!=i){
	    r2 = eps2;
	    xdotv = 0.0;
	    for(d=0;d<DIM;d++){
                dx[d] = x[j][d] - xi[d];
                dv[d] = v[j][d] - vi[d];
                r2 += dx[d] * dx[d];
                xdotv += dx[d]*dv[d];
	    }
            r2inv = 1.0/r2;
            rinv = sqrt(r2inv);
            r3inv = r2inv*rinv;
            r5inv = r2inv*r2inv*rinv;
	    xdotvr5inv = 3.0*xdotv*r5inv;
	    for(d=0;d<DIM;d++){
                r3invdx = r3inv * dx[d];
                ai[d] += m[j] * r3invdx;
                r3invdvetc = r3inv * dv[d] - xdotvr5inv * dx[d]; 
                adoti[d] += m[j] * r3invdvetc;
	    }
	    *poti += -m[j]*rinv;					
        }
    }
}

void force_host(REAL x[NMAX][DIM], REAL v[NMAX][DIM], REAL m[NMAX], REAL eps,
                REAL a[NMAX][DIM], REAL adot[NMAX][DIM], REAL pot[NMAX],
                int n)
{
    int i,j,d,k;
    REAL r2,r3inv,r2inv,rinv,eps2,xdotv;
    REAL r5inv,xdotvr5inv,r3invdx;
    REAL r3invdvetc;
    REAL dx[DIM];
    REAL dv[DIM];
	
    for(j=0;j<n;j++){  
        for(k=0;k<DIM;k++){
	    a[j][k] = 0.0;
	    adot[j][k] = 0.0;
        }
        pot[j] = 0.0;
    }
    eps2 = eps*eps;

    for(i=0;i<n-1;i++){
        for(j=i+1;j<n;j++){
	    r2 = eps2;
	    xdotv = 0.0;
	    for(d=0;d<DIM;d++){
                dx[d] = x[j][d] - x[i][d];
                dv[d] = v[j][d] - v[i][d];
                r2 += dx[d] * dx[d];
                xdotv += dx[d]*dv[d];
	    }
            r2inv = 1.0/r2;
            rinv = sqrt(r2inv);
            r3inv = r2inv*rinv;
            r5inv = r2inv*r2inv*rinv;
	    xdotvr5inv = 3.0*xdotv*r5inv;
	    for(d=0;d<DIM;d++){
                r3invdx = r3inv * dx[d];
                a[i][d] += m[j] * r3invdx;
                a[j][d] += - m[i] * r3invdx;
                r3invdvetc = r3inv * dv[d] - xdotvr5inv * dx[d]; 
                adot[i][d] += m[j] * r3invdvetc;
                adot[j][d] += - m[i] * r3invdvetc;
	    }
	    pot[i] += m[j]*rinv;					
	    pot[j] += m[i]*rinv;					
        }
    }
    for(i=0;i<n;i++) pot[i] *= -1;    

}

void force(REAL x[NMAX][DIM], REAL v[NMAX][DIM], REAL m[NMAX], REAL t[NMAX], REAL eps,
           REAL a[NMAX][DIM], REAL adot[NMAX][DIM], REAL pot[NMAX],
           int n)
{
    int i,k;
    REAL error2;

    force_host(x,v,m,eps,a,adot,pot,n);

}

void energy(REAL time,
            REAL pot[NMAX],
            REAL x[NMAX][DIM], REAL v[NMAX][DIM], REAL m[NMAX],
            int n, REAL eps, REAL init_ene)
{
    REAL total_pot=0,total_kin=0;
    int i;	

    for(i=0;i<n;i++){
        total_pot += 0.5*m[i]*pot[i];
    }

    for(i=0;i<n;i++){ 
        total_kin += m[i]*(v[i][0]*v[i][0]+v[i][1]*v[i][1]+v[i][2]*v[i][2]);
    }
    total_kin *= 0.5;

    printf("t = %g\n",time);
    printf("pot = %22.15e kin = %22.15e \n total= %e ratio = %e\n",
           total_pot,total_kin,total_pot+total_kin,total_kin/total_pot);
    printf("   error = %e %g\n",(init_ene-(total_pot+total_kin))/init_ene,time);
}

void initial_energy(REAL pot[NMAX],
                    REAL x[NMAX][DIM], REAL v[NMAX][DIM], REAL m[NMAX],
                    int n, REAL eps, REAL *init_ene)
{
    REAL total_pot=0.0,total_kin=0.0;
    int i;	

    for(i=0;i<n;i++){
        total_pot += 0.5*m[i]*pot[i];
    }

    for(i=0;i<n;i++){ 
        total_kin += m[i]*(v[i][0]*v[i][0]+v[i][1]*v[i][1]+v[i][2]*v[i][2]);
    }
    total_kin *= 0.5;

    printf("pot = %22.15e kin = %22.15e \n total= %22.15e ratio = %e\n",
           total_pot,total_kin,total_pot+total_kin,total_kin/total_pot);
    *init_ene = total_pot+total_kin;
}

void predict(REAL time,
             REAL x1[DIM], REAL v1[DIM],
             REAL ti,
             REAL x0[DIM], REAL v0[DIM],
             REAL a0[DIM], REAL adot0[DIM])
{
    int k;	
    REAL dt2half,dt3over6,dt;

    dt = time - ti;
    dt2half = 0.5*dt*dt;
    dt3over6 = 1.0/3.0*dt*dt2half;

    for(k=0;k<DIM;k++) x1[k] = x0[k] + dt*v0[k] + dt2half*a0[k] + dt3over6*adot0[k];
    for(k=0;k<DIM;k++) v1[k] = v0[k] + dt*a0[k] + dt2half*adot0[k];
}

REAL mod(REAL x, REAL y)
{
    return ((x/y)-((int)(x/y)))/y;
}

void correct(REAL x1[DIM], REAL v1[DIM],
             REAL x0[DIM], REAL v0[DIM],
             REAL a0[DIM], REAL adot0[DIM],
             REAL a1[DIM], REAL adot1[DIM],
             REAL *dt, REAL time, REAL eta)
{
    int k,dum;	
    REAL dt3over6,dt4over24,dt5over120;
    REAL dtinv,dt2inv,dt3inv,nextdt;
    REAL a0mia1,ad04plad12,ad0plad1,a2[DIM],a3[DIM];
    REAL a1abs,adot1abs,a2dot1abs,a3dot1abs,a2dot1[DIM];

    dt3over6 = (*dt)*(*dt)*(*dt)/6.0;
    dt4over24 = dt3over6*(*dt)/4.0;
    dt5over120 = dt4over24*(*dt)/5.0;
    dtinv = 1.0/(*dt);
    dt2inv = dtinv*dtinv;
    dt3inv = dt2inv*dtinv;
    for(k=0;k<DIM;k++) {
        a0mia1 = a0[k]-a1[k];
        ad04plad12 = 4.0*adot0[k] + 2.0*adot1[k];
        ad0plad1 = adot0[k] + adot1[k];
        a2[k] = -6.0*a0mia1*dt2inv - ad04plad12*dtinv;
        a3[k] = 12.0*a0mia1*dt3inv + 6.0*ad0plad1*dt2inv;
        x1[k] +=  dt4over24*a2[k] + dt5over120*a3[k];
        v1[k] +=  dt3over6*a2[k] + dt4over24*a3[k];
    }

#ifndef SHAREDTIMESTEP
    a1abs = sqrt(a1[0]*a1[0]+a1[1]*a1[1]+a1[2]*a1[2]);
    adot1abs = sqrt(adot1[0]*adot1[0]+adot1[1]*adot1[1]+adot1[2]*adot1[2]);
    for(k=0;k<DIM;k++) a2dot1[k] = a2[k] + (*dt)*a3[k];	
    a2dot1abs = sqrt(a2dot1[0]*a2dot1[0]+a2dot1[1]*a2dot1[1]+a2dot1[2]*a2dot1[2]);
    a3dot1abs = sqrt(a3[0]*a3[0]+a3[1]*a3[1]+a3[2]*a3[2]);

    nextdt=	sqrt(eta*(a1abs*a2dot1abs+adot1abs*adot1abs)
                     /(adot1abs*a3dot1abs+a2dot1abs*a2dot1abs));

    if((nextdt < (*dt))&&(nextdt > 1.0e-8)){
        int power;
        power = log(nextdt)/log(2.0)-1;  
        *dt = pow(2.0,(double)power);
    }	  
    if((nextdt > 2.0*(*dt))&&(mod(time,2.0*(*dt))==0)&&((2.0*(*dt))<=MAXTIMESTEP)){
        *dt *= 2.0;
    }
#endif
}

static int clusterid=0;

void initial_timestep(REAL a[NMAX][DIM], REAL adot[NMAX][DIM],
                      REAL dt[NMAX], int n, REAL eta_s)
                      
{
    REAL a2,adot2;
    int power,i;

    for(i=0;i<n;i++){
        a2 = a[i][0]*a[i][0]+a[i][1]*a[i][1]+a[i][2]*a[i][2];
        adot2 = adot[i][0]*adot[i][0]+adot[i][1]*adot[i][1]+adot[i][2]*adot[i][2];
        if(adot2 == 0){
	    dt[i] = eta_s;
        }else{
	    dt[i] = eta_s*sqrt(a2/adot2);
        }
        power = log(dt[i])/log(2.0);  
        dt[i] = pow(2.0,(double)(power-1));		
        if(dt[i]>MAXTIMESTEP) dt[i] = MAXTIMESTEP;

#ifdef SHAREDTIMESTEP
        dt[i] = 1.0/512.0;
#endif
        /*printf("a2 adot2 dt %e %e %e\n",a2,adot2,dt[i]);*/
    }
}

#pragma goose func
double rsqrt(double r2)
{
    return 1.0/sqrt(r2);
}

int
main(void)
{

    static REAL x0[NMAX][DIM];
    static REAL v0[NMAX][DIM];
    static REAL a0[NMAX][DIM];
    static REAL adot0[NMAX][DIM];
    static REAL x1[NMAX][DIM];
    static REAL v1[NMAX][DIM];
    static REAL a1[NMAX][DIM];
    static REAL adot1[NMAX][DIM];
    static REAL m[NMAX];
    static REAL dti[NMAX];
    static REAL ti[NMAX];
    static REAL pot[NMAX];
    REAL eps,epsinv;
    char filename[200],radfile[100];

    REAL icm[DIM],init_ene,time=0.0,outtime,douttime,endtime;
    REAL deouttime,eouttime,eta_s,eta,maxm,alpha,rtidal,eadd,initm,ratio;
    static int index[NMAX],proflag;
    FILE *fp1,*fp2;

    int step,n,i,k,nts=0,nsame,isame,npipe,ni,prev_nsame;
    REAL lt=0,st=0,pst=0,st0=0,lt0=0;
    int nstep=0;

    int idi;
    double eps2;
    int j,d;
    REAL r2,r3inv,rinv,xdotv;
    REAL r5inv,xdotvr5inv,r3invdx,r3invdvetc;
    REAL dx[DIM],dv[DIM];
    REAL dt2half,dt3over6,dt;
    REAL x1j[DIM],v1j[DIM];
	     
    double suma[DIM],sumadot[DIM],tgrape,tcommj;
    struct previous prev;
    prev.flag = 0;
    prev.rc = prev.cod[0] = prev.cod[1] = prev.cod[2] = 0.0;
    prev.total_m = 1.0;

    fp2 = fopen("inputpara","r");
    if (fp2 == NULL) {
        char buf[256];
        //            sprintf(buf, "In %s main() fopen(\"%s\", \"r\") failed. ",
        //                    __FILE__, "inputpara");
        perror(buf);
        exit(1);
    }
    if (fp2 == NULL) {
        perror("main");
    }

    fscanf(fp2,"%lf%lf%lf%lf",&epsinv,&douttime,&endtime,&deouttime);
    fscanf(fp2,"%s%lf%lf",filename,&eta_s,&eta);
    /*                      (input  para)
                            1.0/eps: softening parameter (9999=> eps=0)
                            douttime: interval of output
                            endtime: end time
                            deoutime: interval of energy output
                            filename: name of input file 
                            eta_s: parameter for initial timestep determination
                            eta: parameter for timestep determination
    */
    fclose(fp2);

    get_cputime(&st,&lt);
	    
    if(epsinv==9999.0){
        eps = 0.0;
    }else{
        eps = 1.0/epsinv;
    }
    eps2 = eps*eps;

    data_input(x0,v0,m,icm,&n,filename,&time);
    printf("eta_s,eta %e %e n %d\n",eta_s,eta,n);
    initm = m[0];

    proflag=0;

    if(time==0.0){
        for(i=0;i<n;i++) ti[i] = 0.0;
    }else{
        for(i=0;i<n;i++) ti[i] = time;
    }	

    outtime = douttime + time;
    eouttime = deouttime + time;
    printf("eps:%g douttime:%g \ndeouttime:%g endtime:%g \n",
           eps,douttime,deouttime,endtime);
    printf("inputfile %s \n",filename);

    for(i=0;i<n;i++){
        for(k=0;k<DIM;k++){
	    a0[i][k] = 1.0;
	    adot0[i][k] = 1000.0;
        }
        pot[i] = -1.0;
    }
    force(x0,v0,m,ti,eps,a0,adot0,pot,n);
    printf("initial force\n");  

    /*	for(i=0;i<10;i++){
        printf("i a adot pot %d %f %f %f\n",i,a0[i][0],adot0[i][0],pot[i]);
	}
    */
	
    initial_energy(pot,x0,v0,m,n,eps,&init_ene);
    fflush(stdout);

    /*        radius(time,x0,v0,m,n,dmaindex,&prev,pot,"radfile");*/

    initial_timestep(a0,adot0,dti,n,eta_s);
    for(i=0;i<n;i++) index[i] = i;

    nsame = n;
    prev_nsame = n;
    do{
        REAL nextt;
        int ncor=0;

        sort_timestep_m(0,nsame-1,dti,index);

        time = ti[index[0]] + dti[index[0]];
        // printf("time ti dti0 %22.14e %22.14e %e %d\n",time,ti[index[0]],dti[index[0]],nsame);
        nts++; 

        isame = 0;
        do{
	    isame++;
	    nextt = ti[index[isame]] + dti[index[isame]];
        }while((time == nextt)&&(isame<n));
        nsame = isame;

        //	  for(j=0;j<n;j++)predict(time,x1[j],v1[j],ti[j],x0[j],v0[j],a0[j],adot0[j]);

        for(i=0;i<isame;i++){
	    idi = index[i];
	    predict(time,x1[idi],v1[idi],ti[idi],x0[idi],v0[idi],a0[idi],adot0[idi]);	  
        }

        // #pragma omp parallel for private (j,d,r2,xdotv,dx,dv,rinv,r3inv,r5inv,xdotvr5inv,dt,dt2half,dt3over6,x1j,v1j,k)

#pragma goose parallel for precision ("double-single") loopcounter(i, j) nip_pack(1) njp_write(prev_nsame)
        for(i=0;i<nsame;i++){
            for(d=0;d<DIM;d++){
                a1[index[i]][d] = 0.0;
                adot1[index[i]][d] = 0.0;
            }
            pot[index[i]] = 0.0;

            for(j=0;j<n;j++){
                dt = time - ti[index[j]];
                dt2half = 0.5*dt*dt;
                dt3over6 = 1.0/3.0*dt*dt2half;
                for(d=0;d<DIM;d++){
                    x1j[d] = x0[index[j]][d] + dt*v0[index[j]][d] + dt2half*a0[index[j]][d] + dt3over6*adot0[index[j]][d];
                    v1j[d] = v0[index[j]][d] + dt*a0[index[j]][d] + dt2half*adot0[index[j]][d];
                }	       

                r2 = eps2;
                xdotv = 0.0;
                for(d=0;d<DIM;d++){
                    dx[d] = x1j[d] - x1[index[i]][d];
                    dv[d] = v1j[d] - v1[index[i]][d];
                    r2 += dx[d] * dx[d];
                    xdotv += dx[d]*dv[d];
                }
                rinv = rsqrt(r2);
                if(index[i]==index[j]) rinv = 0.0;
                r3inv = rinv*rinv*rinv;
                r5inv = r3inv*rinv*rinv;
                xdotvr5inv = 3.0*xdotv*r5inv;
                for(d=0;d<DIM;d++){
                    a1[index[i]][d] += m[index[j]] * r3inv * dx[d];
                    adot1[index[i]][d] += m[index[j]] * (r3inv * dv[d] - xdotvr5inv * dx[d]); 
                }
                pot[index[i]] -= m[index[j]]*rinv;					
            }
        }

        for(i=0;i<nsame;i++){
            idi = index[i];
            //printf("i %d %d a %g %g %g adot1 %g %g %g pot %g\n",i,idi,a1[idi][0],a1[idi][1],a1[idi][2],adot1[idi][0],adot1[idi][1],adot1[idi][2],pot[idi]);
            ti[idi] = time;
	    
            correct(x1[idi],v1[idi],x0[idi],v0[idi],a0[idi],adot0[idi],a1[idi],adot1[idi],&dti[idi],time,eta);

            for(k=0;k<DIM;k++){ 
	        x0[idi][k] = x1[idi][k];
	        v0[idi][k] = v1[idi][k];
	        a0[idi][k] = a1[idi][k];
	        adot0[idi][k] = adot1[idi][k];
            }
        }

        get_cputime(&st0,&lt0);
        tgrape += st0;
        tcommj += st0;	  

        ncor = nsame;
        prev_nsame = nsame;

        nstep += ncor;
        if( time >= eouttime){
	    double tg;
  	    energy(time,pot,x1,v1,m,n,eps,init_ene);
	    eouttime += deouttime;
	    printf("time %g %d\n",time,ncor);
            printf("nts = %d nstep %d\n",nts,nstep);
	    get_cputime(&st,&lt);
	    printf("cputime %g %g %g\n",lt,st,lt-pst);
            printf("speed %g Gflops, %g nstep/s\n",
                   57.0*((double)n)*((double)nstep)/st/1e9,nstep/st);

	    nts=0;
	    nstep=0; 
	    pst = lt;
	    tgrape = 0;
	    tcommj = 0;	    
	    fflush(stdout);

        }
        ncor=0;

        if( time >= outtime) {
	    outtime += douttime;
            /*            radius(time,x1,v1,m,n,dmaindex,&prev,pot,"radfile");*/

            fp1 = fopen("nemoout","w");
            data_output(time,x1,v1,m,n,fp1);
            fclose(fp1);       

        }

    }while(time<endtime);
    exit(0);
}

