#include "gravperf_0.vsmgen.h"
#include "singutil.h"

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "gravperf.h"

#pragma goose func
double
rsqrt(double r2)
{
    return 1.0 / sqrt(r2);
}

void
calc_gravity(int ni, int nj, double *m, double (*x)[3], double eps, double (*a)[3], double *pot)
{
    int i, j, k;
    double r2, rinv, mrinv, mr3inv, dx[3];
    double eps2 = eps * eps;

    for (i = 0; i < ni; i++) {

        for (k = 0; k < 3; k++) {
            a[i][k] = 0.0;
        }

        pot[i] = 0.0;
    }
    {
        enum
        {
            npe = 32,
            nbb = 16,
            nvec = 4,
        };

        static struct f0_0_grape_j_particle_struct jp[nbb];
        static struct f0_0_grape_i_particle_struct ip[npe * nvec];
        static struct f0_0_grape_result_struct result[npe * nvec];
        int npipe, nn;
        int i, ii, j, jj, vid;

        static int firstcall = 1;
        if (firstcall) {
            f0_0_grape_init();  // open only once in a run.
            firstcall = 0;
        }

        npipe = npe * nvec;
        if (SING_devpara() == I_DEVPARA) {
            npipe *= SING_ndevice();
        }

        for (j = 0; j < nj; j += nbb) {
            for (jj = 0; jj < nbb; jj++) {
                for (vid = 0; vid < nvec; vid++) {
                    if (j + jj < nj) {
                        jp[jj].x_j_0_[vid] = x[j + jj][0];
                        jp[jj].x_j_1_[vid] = x[j + jj][1];
                        jp[jj].x_j_2_[vid] = x[j + jj][2];
                        jp[jj].m_j_[vid] = m[j + jj];
                    }
                    else {
                        jp[jj].x_j_0_[vid] = 0.0;
                        jp[jj].x_j_1_[vid] = 0.0;
                        jp[jj].x_j_2_[vid] = 0.0;
                        jp[jj].m_j_[vid] = 0.0;
                    }
                }
            }
            SING_EM_write(0, j, nbb, jp);
        }

        for (i = 0; i < SING_ndevice(); i++, j += nbb) {
            SING_EM_write(0, j, nbb, jp);
        }

        for (i = 0; i < ni; i += npipe * 2) {
            if ((i + npipe * 2) > ni) {
                nn = (ni - i - 1) / 2 + 1;
            }
            else {
                nn = npipe;
            }
            for (ii = 0; ii < nn; ii++) {
                ip[ii].x_i_0_ = x[i + ii * 2][0];
                ip[ii].x_i_1_ = x[i + ii * 2][1];
                ip[ii].x_i_2_ = x[i + ii * 2][2];
                ip[ii].x_id20767__0_ = x[i + ii * 2 + 1][0];
                ip[ii].x_id20767__1_ = x[i + ii * 2 + 1][1];
                ip[ii].x_id20767__2_ = x[i + ii * 2 + 1][2];
                ip[ii].eps2 = eps2;
            }
            f0_0_send_vsm_constants();
            f0_0_register_i_particle_conversion();
            f0_0_send_i_particle(ip, nn);
            f0_0_grape_run((nj - 1) / nbb + 1);
            f0_0_get_result(result);
            for (ii = 0; ii < nn; ii++) {
                a[i + ii * 2][0] = result[ii].a_i_0_;
                a[i + ii * 2][1] = result[ii].a_i_1_;
                a[i + ii * 2][2] = result[ii].a_i_2_;
                pot[i + ii * 2] = result[ii].pot_i_;
                a[i + ii * 2 + 1][0] = result[ii].a_id20767__0_;
                a[i + ii * 2 + 1][1] = result[ii].a_id20767__1_;
                a[i + ii * 2 + 1][2] = result[ii].a_id20767__2_;
                pot[i + ii * 2 + 1] = result[ii].pot_id20767_;
            }
        }

    }

    for (i = 0; i < ni; i++) {
        pot[i] += m[i] / eps;
    }
}

int
main(int argc, char **argv)
{
    static double mj[NMAX], xj[NMAX][3], vj[NMAX][3];
    static double a[NMAX][3], p[NMAX];
    double time, dt, endt;;
    double eps;
    double e, e0, ke, pe;
    double lt = 0.0, st = 0.0;
    int i, j, k, ni, nj;
    int nstep, step, interval;
    double dinterval;
    double gintrps;
    double cm[3], cm0[3];

    eps = 0.02;
    dt = 0.01;
    endt = 1.0;
    time = 0.0;
    nstep = endt / dt;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <infile> [ni]\n", argv[0]);
        exit(1);
    }
    readnbody(&nj, mj, xj, vj, argv[1]);
    ni = argc < 3 ? nj : atoi(argv[2]);
    if (nj > NMAX) {
        fprintf(stderr, "nj (=%d) exceeded NMAX (=%d). abort.\n", nj, NMAX);
        exit(1);
    }
    if (ni > nj) {
        fprintf(stderr, "ni (=%d) exceeded nj (=%d). abort.\n", ni, nj);
        exit(1);
    }

    fprintf(stderr, "ni: %d  nj:%d\n", ni, nj);
    dinterval = 500.0 * (10000.0 / ni) * (10000.0 / nj);
    interval = dinterval;
    if (dinterval * 10.0 > nstep) {
        interval = nstep / 10;
    }
    fprintf(stderr, "interval: %d\n", interval);

    get_cputime(&lt, &st);
    calc_gravity(ni, nj, mj, xj, eps, a, p);
    energy(mj, vj, p, nj, &ke, &pe);
    e0 = ke + pe;
    printf("ke: %f  pe: %f  e0: %f\n", ke, pe, e0);

    for (step = 1; step < nstep; step++) {
        push_velocity(vj, a, 0.5 * dt, nj);
        push_position(xj, vj, a, dt, nj);
        time = time + dt;
        calc_gravity(ni, nj, mj, xj, eps, a, p);
        push_velocity(vj, a, 0.5 * dt, nj);
        if (interval > 10 && step % (interval / 10) == 0) {
            fprintf(stderr, ".");
        }
        if (step % interval == 0) {
            get_cputime(&lt, &st);
            gintrps = (double) ni *(double) nj *(double) interval / lt / 1e9;
            center_of_mass(mj, xj, nj, cm);
            energy(mj, vj, p, nj, &ke, &pe);
            e = ke + pe;
            printf("step: %d time: %e\n", step, time);
            printf("e: %e de: %e\n", e, e - e0);
            printf("ke: %e pe: %e\n", ke, pe);
            printf("ke/pe: %e\n", ke / pe);
            printf("cm : %22.15e %22.15e %22.15e\n", cm[0], cm[1], cm[2]);
            printf("dcm: %22.15e %22.15e %22.15e\n", cm[0] - cm0[0], cm[1] - cm0[1], cm[2] - cm0[2]);
            for (k = 0; k < 3; k++) {
                cm0[k] = cm[k];
            }
            printf("CPU time: %e\n", lt);
            printf("speed: %e Ginteraction/s (%6.2f Gflops)\n\n", gintrps, 38.0 * gintrps);
            get_cputime(&lt, &st);
        }
    }

}

