#include <stdio.h>
#include <stdlib.h>
#include <g6util.h>
#include <math.h>
#include "direct.h"

/*
 * Configure the g6 library's coordinate range and scaling factors.
 *
 * xmin, xmax: bounds of the coordinate range handed to the library for both
 *             the i- and j-particle positions.
 *
 * The position scale is chosen so that the width (xmax - xmin) corresponds
 * to 2^64 units; every other scale is derived from it.  The mass scale is
 * fixed at 1.
 */
static void
set_range(double xmin, double xmax)
{
    const double unit_mass = 1.0;
    const double pos_scale = pow(2.0, 64.0) / (xmax - xmin);
    const double pos_scale_sq = pos_scale * pos_scale;
    const double acc_scale = pos_scale_sq / unit_mass;

    /* Coordinate range is shared by the j (source) and i (sink) sides. */
    g6_set_range_xj(xmin, xmax);
    g6_set_range_xi(xmin, xmax);

    /* Velocities use the same scale as positions. */
    g6_set_scale_vj(pos_scale);
    g6_set_scale_vi(pos_scale);

    /* The squared softening scales as length squared. */
    g6_set_scale_epsi2(pos_scale_sq);
    g6_set_scale_mj(unit_mass);

    /* Jerk is given the same scale as acceleration. */
    g6_set_scale_acc(acc_scale);
    g6_set_scale_jerk(acc_scale);
}

/*
 * Compute acceleration and jerk on all n particles through the g6 library.
 *
 * mj, xj, vj: masses, positions and velocities of the n particles; the same
 *             arrays serve as both j-particles (sources) and i-particles.
 * eps:        softening length; eps*eps is passed to the library for every
 *             pipeline.
 * a, jerk:    outputs, overwritten with the accumulated results.
 * p:          not read or written by this function.
 * n:          number of particles; the program exits with status 2 when it
 *             exceeds NMAX.
 *
 * The j-particles are sent in chunks of at most g6_get_jmemsize() entries.
 * For each chunk, the i-particles are processed in groups of at most
 * g6_get_number_of_pipelines() entries, and the partial results of every
 * chunk are summed into a[] and jerk[].
 */
void
calc_gravity4(double *mj, double (*xj)[3], double (*vj)[3],
              double eps, double (*a)[3], double *p, double (*jerk)[3], int n)
{
    static double acc_part[NMAX][3];
    static double jerk_part[NMAX][3];
    static double soft2[NMAX], last_eps;
    static int acc_shift[NMAX];
    static int jerk_shift[NMAX];
    static int need_init = 1;
    const int pipes = g6_get_number_of_pipelines();
    const int jcap = g6_get_jmemsize();
    int ibase, jbase, lane, axis;

    (void)p;

    if (n > NMAX) {
        fprintf(stderr, "%s too large n (%d)\n", __FILE__, n);
        exit(2);
    }

    /* Per-pipeline exponent shifts handed to g6_set_ip(); the values read
       back are later undone by multiplying with 2^-shift. */
    for (lane = 0; lane < pipes; lane++) {
        acc_shift[lane] = 150;
        jerk_shift[lane] = 118;
    }

    /* Refresh the per-pipeline squared softening only when eps changes. */
    if (need_init || eps != last_eps) {
        const double eps_sq = eps * eps;

        need_init = 0;
        last_eps = eps;
        for (lane = 0; lane < pipes; lane++) {
            soft2[lane] = eps_sq;
        }
    }

    for (ibase = 0; ibase < n; ibase++) {
        for (axis = 0; axis < 3; axis++) {
            a[ibase][axis] = 0.0;
            jerk[ibase][axis] = 0.0;
        }
    }

    for (jbase = 0; jbase < n; jbase += jcap) {
        /* The last chunk may be shorter than the j-memory capacity. */
        const int nj = (jbase + jcap > n) ? n - jbase : jcap;

        g6_set_jp(0, nj, mj + jbase, xj + jbase, vj + jbase);
        g6_set_n(nj);

        for (ibase = 0; ibase < n; ibase += pipes) {
            /* The last group may not fill every pipeline. */
            const int ni = (ibase + pipes > n) ? n - ibase : pipes;

            g6_set_ip(ni, &xj[ibase], &vj[ibase], soft2, acc_shift, jerk_shift);
            g6_run();
            g6_get_fout(ni, &acc_part[ibase], &jerk_part[ibase]);

            /* Undo the exponent shift of each pipeline's result. */
            for (lane = 0; lane < ni; lane++) {
                const double acc_unshift = pow(2.0, -acc_shift[lane]);
                const double jerk_unshift = pow(2.0, -jerk_shift[lane]);

                for (axis = 0; axis < 3; axis++) {
                    acc_part[ibase + lane][axis] *= acc_unshift;
                    jerk_part[ibase + lane][axis] *= jerk_unshift;
                }
            }
        }

        /* Add this j-chunk's contribution to the running totals. */
        for (ibase = 0; ibase < n; ibase++) {
            for (axis = 0; axis < 3; axis++) {
                a[ibase][axis] += acc_part[ibase][axis];
                jerk[ibase][axis] += jerk_part[ibase][axis];
            }
        }
    }
}

// just for performance measurement.
// Returns the clock frequency, in MHz, used to estimate the peak speed.
// The return type is spelled out as double: the value is a floating-point
// constant and "implicit int" declarations are not valid since C99.
static double
get_pcibus_freq(void)
{
    return 133.0; // clock frequency of the pipeline (in MHz).
}

/*
 * Driver program.
 *
 * usage: <prog> <infile> <outfile> <endtime>
 *
 * Reads the particle set from <infile> with readnbody(), advances it with a
 * fixed time step dt = 0.01 and softening eps = 0.02, using calc_gravity4()
 * for the forces, and writes the final state to <outfile> with writenbody().
 * Every `interval` steps the energies and a speed estimate are printed to
 * stdout; progress dots go to stderr.
 *
 * Returns 0 on completion; exits with status 2 when too few arguments are
 * given.
 */
int
main(int argc, char **argv)
{
    static double mj[NMAX], xj[NMAX][3], vj[NMAX][3];
    static double a[NMAX][3], p[NMAX], jerk[NMAX][3];
    double xmax;
    double t, dt, endt;
    double eps;
    double e, e0, ke, pe;
    double ideal_interval;
    int n;
    int nstep, step;
    int interval;
    double peak;
    double sustained = 0.0;
    double lt = 0.0, st = 0.0;

    if (argc < 4) {
        fprintf(stderr, "usage: %s <infile> <outfile> <endtime>\n",  argv[0]);
        exit(2);
    }

    srand48(1234);
    xmax = 64.0;

    endt = atof(argv[3]);
    eps = 0.02;
    dt = 0.01;
    t = 0.0;
    nstep = (int)(endt / dt);
    readnbody(&n, mj, xj, vj, argv[1]);
    fprintf(stderr, "n: %d outfile: %s endtime: %f\n", n, argv[2], endt);

    /*
     * Reporting interval: 500 steps for n = 10000, scaled by (10000/n)^2,
     * but never more than a tenth of the run.  The comparison is done in
     * double so that a huge ideal value (small n) is never converted to int,
     * and the result is clamped to at least 1 because `step % interval`
     * below would otherwise divide by zero for short runs (nstep < 10) or
     * very large n.
     */
    ideal_interval = 500 * (10000.0 / n) * (10000.0 / n);
    interval = nstep / 10;
    if (ideal_interval < interval) {
        interval = (int)ideal_interval;
    }
    if (interval < 1) {
        interval = 1;
    }
    fprintf(stderr, "interval: %d\n", interval);

    g6_open();
    set_range(-xmax, +xmax);
    get_cputime(&lt, &st);
    calc_gravity4(mj, xj, vj, eps, a, p, jerk, n);
    /* NOTE(review): calc_gravity4() in this file never writes p[], so
       energy() receives the zero-initialised static array here. */
    energy(mj, vj, p, n, &ke, &pe);
    e0 = ke + pe;
    printf("ke: %f\n", ke);
    fflush(stdout);

    /* The loop runs for step = 1 .. nstep-1. */
    for (step = 1; step < nstep; step++) {
        /* Half-step velocity update, full-step position update, new forces,
           then the second half-step velocity update. */
        push_velocity(vj, a, 0.5 * dt, n);
        push_position(xj, vj, a, dt, n);
        t = t + dt;
        calc_gravity4(mj, xj, vj, eps, a, p, jerk, n);
        push_velocity(vj, a, 0.5 * dt, n);
#ifdef ANIM
        plot_star(xj, n, t, 0.3, mj, mj[0]);
#endif /* ANIM */
        /* Ten progress dots per reporting interval. */
        if (interval > 10 && step % (interval / 10) == 0) {
            fprintf(stderr, ".");
        }
        if (step % interval == 0) {
            energy(mj, vj, p, n, &ke, &pe);
            e = ke + pe;

            get_cputime(&lt, &st);
            printf("\ncputime: %e %e\n", lt, st);
            /* Speed estimate: 38 operations per particle pair per step
               (presumably the conventional count for this kind of
               interaction -- confirm), over `interval` steps in lt seconds
               (assumes lt is the time since the previous get_cputime()
               call). */
            sustained = 38.0 * ((double)n) * ((double)n)
                * interval / lt / 1e9;
            peak = 38.0 * 2
                * get_pcibus_freq() / 1000.0;
            printf("speed: %g Gflops (%4.1f %%)\n",
                   sustained, sustained / peak * 100.0);
            printf("step: %d time: %e\n", step, t);
            printf("    e: % 15.13E   de: % 15.13E\n", e, e - e0);
            printf("   ke: % 15.13E   pe: % 15.13E\n", ke, pe);
            printf("ke/pe: % 15.13E\n\n", ke / pe);
            fflush(stdout);
            /* Second call so the reporting above is not counted in the next
               measurement (assuming get_cputime() returns lap times). */
            get_cputime(&lt, &st);
        }
    }
    g6_close();
    writenbody(n, mj, xj, vj, argv[2]);
    return 0;
}
