#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "gravperf.h"

#pragma goose func
double rsqrt(double r2)
{
  return 1.0/sqrt(r2);
}

void
calc_gravity(int ni, int nj, double *m, double (*x)[3],
	     double eps, double (*a)[3], double *pot)
{
    int i, j, k;
    double r2, rinv, mrinv, mr3inv, dx[3];
    double eps2 = eps * eps;

#pragma omp parallel for private(i,j,k,dx,r2,rinv,mrinv,mr3inv)
#pragma goose parallel for loopcounter(i, j) precision ("double") nip_pack(1)
    for (i = 0; i < ni; i++) {
        for (k = 0; k < 3; k++) {
            a[i][k] = 0.0;
        }
        pot[i] = 0.0;
        for (j = 0; j < nj; j++) {
            for (k = 0; k < 3; k++) {
                dx[k] = x[j][k] - x[i][k];
            }
            r2 = dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] + eps2;
            rinv = rsqrt(r2);
            mrinv = m[j] * rinv;
            mr3inv = mrinv * rinv * rinv;
            for (k = 0; k < 3; k++) {
                a[i][k] += mr3inv * dx[k];
            }
            pot[i] -= mrinv;
        }
    }
    for (i=0; i < ni; i++) {
        pot[i] += m[i]/eps;
    }
}

int
main(int argc, char **argv)
{
    static double *mj = NULL, (*xj)[3] = NULL, (*vj)[3] = NULL;
    static double (*a)[3] = NULL, *p = NULL;
    double time, dt, endt;;
    double eps;
    double e, e0, ke, pe;
    double lt = 0.0, st = 0.0;
    int i, j, k, ni, nj;
    int nstep, step, interval;
    double dinterval;
    double gintrps;
    double cm[3], cm0[3];

    eps = 0.02;
    dt = 0.01;
    endt = 1.0;
    time = 0.0;
    nstep = endt/dt;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <infile> [ni]\n",  argv[0]);
        exit(1);
    }
    readnbody(&nj, &mj, &xj, &vj, &a, &p, argv[1]);
    ni = argc < 3 ? nj : atoi(argv[2]);
    if (ni > nj) {
        fprintf(stderr, "ni (=%d) exceeded nj (=%d). abort.\n", ni, nj);
        exit(1);
    }

    fprintf(stderr, "ni: %d  nj:%d\n", ni, nj);
    dinterval = 500.0 * (10000.0/ni) * (10000.0/nj);
    interval = dinterval;
    if (dinterval * 10.0 > nstep) {
	interval = nstep / 10;
    }
    interval = interval < 1 ? 1 : interval;
    fprintf(stderr, "interval: %d\n", interval);

    // the 1st step.
    get_cputime(&lt,&st);
    calc_gravity(ni, nj, mj, xj, eps, a, p);
    energy(mj, vj, p, nj, &ke, &pe);
    e0 = ke+pe;
    printf("ke: %f  pe: %f  e0: %f\n", ke, pe, e0);

    // the time-integration loop.
    for (step = 1; step < nstep; step++) {
        push_velocity(vj, a, 0.5*dt, nj);
        push_position(xj, vj, a, dt, nj);
        time = time + dt;
        calc_gravity(ni, nj, mj, xj, eps, a, p);
        push_velocity(vj, a, 0.5*dt, nj);
        if (interval > 10 && step % (interval/10) == 0) {
            fprintf(stderr, ".");
        }
        if (step % interval == 0) {
	    get_cputime(&lt,&st);
            gintrps = (double)ni * (double)nj * (double)interval / lt /1e9;
            center_of_mass(mj, xj, nj, cm);
            energy(mj, vj, p, nj, &ke, &pe);
            e = ke+pe;
            printf("step: %d time: %e\n", step, time);
            printf("e: %e de: %e\n", e, e-e0);
            printf("ke: %e pe: %e\n", ke, pe);
            printf("ke/pe: %e\n", ke/pe);
            printf("cm : %22.15e %22.15e %22.15e\n", cm[0], cm[1], cm[2]);
            printf("dcm: %22.15e %22.15e %22.15e\n",
                   cm[0] - cm0[0], cm[1] - cm0[1], cm[2] - cm0[2]);
            for (k = 0; k < 3; k++) {
                cm0[k] = cm[k];
            }
            printf("CPU time: %e\n", lt);
            printf("speed: %e Ginteraction/s (%6.2f Gflops)\n\n",
                   gintrps, 38.0 * gintrps);
	    get_cputime(&lt,&st);
        }
    }
    //    writenbody(n, mj, xj, vj, argv[2]);
}

