#include <gcalutil.h>
#include "gravperf_0.h"

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "gravperf.h"

#pragma goose func
double
rsqrt(double r2)
{
    return 1.0 / sqrt(r2);
}

void
calc_gravity(int ni, int nj, double *m, double (*x)[3], double eps, double (*a)[3], double *pot)
{
    int i, j, k;
    double r2, rinv, mrinv, mr3inv, dx[3];
    double eps2 = eps * eps;

    for (i = 0; i < ni; i++) {

        for (k = 0; k < 3; k++) {
            a[i][k] = 0.0;
        }

        pot[i] = 0.0;
    }
    {
        enum
        {
            MINDOMAINH = 2,
        };
        int bufidx_;
        int npack_istride_ = 2 * 1;
        int ni_div_npack_ = (ni - 1) / 2 + 1;
        int i0_ = 0;
        int npipe_ = (ni - i0_ - 1) / npack_istride_ + 1;       // # of threads to be dispatched.
        int newbufsize_;
        int devid_ = 0;         // currently Goose supports a single-device system only,
        int moduleid_ = 0;      // and uses one kernel-module per device so far.
        int looplen_ = 2;

        int nj_save_ = nj;
        int njtry_ = (nj - 0 + 1) / 1;  // round up number of j-loop trials to a multiple of looplen_.
        nj = 0 + ((njtry_ - 1) / looplen_ + 1) * looplen_ * 1;

        CALdomain jdomain_, idomain_;
        static int bufsize_ = 0;
        static int jpsize_ = 0;
        static int ipsize_ = 0;
        static double *doublebuf_ = NULL;

        jdomain_.width = 256;
        jdomain_.height = (nj - 1) / (jdomain_.width * looplen_) + 1;
        if (jdomain_.height < MINDOMAINH) {
            jdomain_.height = MINDOMAINH;
        }
        idomain_.width = 64;
        idomain_.height = (npipe_ - 1) / (idomain_.width * looplen_) + 1;
        if (idomain_.height < MINDOMAINH) {
            idomain_.height = MINDOMAINH;
        }

        // initialize the device.
        static int firstcall_ = 1;
        if (firstcall_) {
            firstcall_ = 0;
            GCAL_openMC(devid_, moduleid_, f0_0_kernelsrc_, 19, 11);
        }

        // (re)alloc buf.
        newbufsize_ = (npipe_ > nj) ? npipe_ : nj;
        if (newbufsize_ > bufsize_) {
            bufsize_ = newbufsize_;
            doublebuf_ = (double *) realloc(doublebuf_, sizeof(double) * bufsize_);
            if (!doublebuf_) {
                perror("doublebuf_.");
                exit(1);
            }
        }

        // (re)allocate device-memory.
        if (nj > jpsize_) {
            if (jpsize_ > 0) {
                GCAL_freeMC(devid_, 7);
                GCAL_freeMC(devid_, 8);
                GCAL_freeMC(devid_, 9);
                GCAL_freeMC(devid_, 10);

            }
            jpsize_ = nj;
            GCAL_mallocMC(devid_, moduleid_, 7, gcalCtypeDouble2, gcalMemHostToDevice, jdomain_.width, jdomain_.height);        // x[j][0]
            GCAL_mallocMC(devid_, moduleid_, 8, gcalCtypeDouble2, gcalMemHostToDevice, jdomain_.width, jdomain_.height);        // x[j][1]
            GCAL_mallocMC(devid_, moduleid_, 9, gcalCtypeDouble2, gcalMemHostToDevice, jdomain_.width, jdomain_.height);        // x[j][2]
            GCAL_mallocMC(devid_, moduleid_, 10, gcalCtypeDouble2, gcalMemHostToDevice, jdomain_.width, jdomain_.height);       // m[j]

        }
        if (npipe_ > ipsize_) {
            if (ipsize_ > 0) {
                GCAL_freeMC(devid_, 0);
                GCAL_freeMC(devid_, 1);
                GCAL_freeMC(devid_, 2);
                GCAL_freeMC(devid_, 3);
                GCAL_freeMC(devid_, 4);
                GCAL_freeMC(devid_, 5);
                GCAL_freeMC(devid_, 6);
                GCAL_freeMC(devid_, 11);
                GCAL_freeMC(devid_, 12);
                GCAL_freeMC(devid_, 13);
                GCAL_freeMC(devid_, 14);
                GCAL_freeMC(devid_, 15);
                GCAL_freeMC(devid_, 16);
                GCAL_freeMC(devid_, 17);
                GCAL_freeMC(devid_, 18);

            }
            ipsize_ = npipe_;
            GCAL_mallocMC(devid_, moduleid_, 0, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // x[i][0]
            GCAL_mallocMC(devid_, moduleid_, 1, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // x[i][1]
            GCAL_mallocMC(devid_, moduleid_, 2, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // x[i][2]
            GCAL_mallocMC(devid_, moduleid_, 3, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // x[i+1][0]
            GCAL_mallocMC(devid_, moduleid_, 4, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // x[i+1][1]
            GCAL_mallocMC(devid_, moduleid_, 5, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // x[i+1][2]
            GCAL_mallocMC(devid_, moduleid_, 6, gcalCtypeDouble2, gcalMemHostToDevice, idomain_.width, idomain_.height);        // eps2
            GCAL_mallocMC(devid_, moduleid_, 11, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // a[i][0]
            GCAL_mallocMC(devid_, moduleid_, 12, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // a[i][1]
            GCAL_mallocMC(devid_, moduleid_, 13, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // a[i][2]
            GCAL_mallocMC(devid_, moduleid_, 14, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // pot[i]
            GCAL_mallocMC(devid_, moduleid_, 15, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // a[i+1][0]
            GCAL_mallocMC(devid_, moduleid_, 16, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // a[i+1][1]
            GCAL_mallocMC(devid_, moduleid_, 17, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // a[i+1][2]
            GCAL_mallocMC(devid_, moduleid_, 18, gcalCtypeDouble2, gcalMemDeviceToHost, idomain_.width, idomain_.height);       // pot[i+1]

        }

        // copy JPs from the host memory to the device memory.
        for (j = 0, bufidx_ = 0; j < nj_save_; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[j][0];
        }
        for (j = nj_save_; j < nj; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = 0.0;
        }
        GCAL_memcpyMC(devid_, 7, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (j = 0, bufidx_ = 0; j < nj_save_; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[j][1];
        }
        for (j = nj_save_; j < nj; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = 0.0;
        }
        GCAL_memcpyMC(devid_, 8, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (j = 0, bufidx_ = 0; j < nj_save_; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[j][2];
        }
        for (j = nj_save_; j < nj; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = 0.0;
        }
        GCAL_memcpyMC(devid_, 9, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (j = 0, bufidx_ = 0; j < nj_save_; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) m[j];
        }
        for (j = nj_save_; j < nj; j += 1, bufidx_++) {
            doublebuf_[bufidx_] = 0.0;
        }
        GCAL_memcpyMC(devid_, 10, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);

        // copy IPs from the host memory to the device memory.
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[i * 2][0];
        }
        GCAL_memcpyMC(devid_, 0, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[i * 2][1];
        }
        GCAL_memcpyMC(devid_, 1, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[i * 2][2];
        }
        GCAL_memcpyMC(devid_, 2, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[i * 2 + 1][0];
        }
        GCAL_memcpyMC(devid_, 3, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[i * 2 + 1][1];
        }
        GCAL_memcpyMC(devid_, 4, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) x[i * 2 + 1][2];
        }
        GCAL_memcpyMC(devid_, 5, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            doublebuf_[bufidx_] = (double) eps2;
        }
        GCAL_memcpyMC(devid_, 6, sizeof(double) * bufidx_, doublebuf_, gcalMemHostToDevice);

        // launch the kernel.
        GCAL_launchMC(devid_, moduleid_, 19, nj, looplen_, &jdomain_, &idomain_);

        // copy results from the device memory to the host memory.
        GCAL_memcpyMC(devid_, 11, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            a[i * 2][0] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 12, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            a[i * 2][1] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 13, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            a[i * 2][2] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 14, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            pot[i * 2] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 15, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            a[i * 2 + 1][0] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 16, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            a[i * 2 + 1][1] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 17, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            a[i * 2 + 1][2] = (double) doublebuf_[bufidx_];
        }
        GCAL_memcpyMC(devid_, 18, sizeof(double) * npipe_, doublebuf_, gcalMemDeviceToHost);
        for (i = 0, bufidx_ = 0; i < ni_div_npack_; i += 1, bufidx_++) {
            pot[i * 2 + 1] = (double) doublebuf_[bufidx_];
        }

        nj = nj_save_;          // restore nj.

    }

    for (i = 0; i < ni; i++) {
        pot[i] += m[i] / eps;
    }
}

int
main(int argc, char **argv)
{
    static double mj[NMAX], xj[NMAX][3], vj[NMAX][3];
    static double a[NMAX][3], p[NMAX];
    double time, dt, endt;;
    double eps;
    double e, e0, ke, pe;
    double lt = 0.0, st = 0.0;
    int i, j, k, ni, nj;
    int nstep, step, interval;
    double dinterval;
    double gintrps;
    double cm[3], cm0[3];

    eps = 0.02;
    dt = 0.01;
    endt = 1.0;
    time = 0.0;
    nstep = endt / dt;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <infile> [ni]\n", argv[0]);
        exit(1);
    }
    readnbody(&nj, mj, xj, vj, argv[1]);
    ni = argc < 3 ? nj : atoi(argv[2]);
    if (nj > NMAX) {
        fprintf(stderr, "nj (=%d) exceeded NMAX (=%d). abort.\n", nj, NMAX);
        exit(1);
    }
    if (ni > nj) {
        fprintf(stderr, "ni (=%d) exceeded nj (=%d). abort.\n", ni, nj);
        exit(1);
    }

    fprintf(stderr, "ni: %d  nj:%d\n", ni, nj);
    dinterval = 500.0 * (10000.0 / ni) * (10000.0 / nj);
    interval = dinterval;
    if (dinterval * 10.0 > nstep) {
        interval = nstep / 10;
    }
    fprintf(stderr, "interval: %d\n", interval);

    get_cputime(&lt, &st);
    calc_gravity(ni, nj, mj, xj, eps, a, p);
    energy(mj, vj, p, nj, &ke, &pe);
    e0 = ke + pe;
    printf("ke: %f  pe: %f  e0: %f\n", ke, pe, e0);

    for (step = 1; step < nstep; step++) {
        push_velocity(vj, a, 0.5 * dt, nj);
        push_position(xj, vj, a, dt, nj);
        time = time + dt;
        calc_gravity(ni, nj, mj, xj, eps, a, p);
        push_velocity(vj, a, 0.5 * dt, nj);
        if (interval > 10 && step % (interval / 10) == 0) {
            fprintf(stderr, ".");
        }
        if (step % interval == 0) {
            get_cputime(&lt, &st);
            gintrps = (double) ni *(double) nj *(double) interval / lt / 1e9;
            center_of_mass(mj, xj, nj, cm);
            energy(mj, vj, p, nj, &ke, &pe);
            e = ke + pe;
            printf("step: %d time: %e\n", step, time);
            printf("e: %e de: %e\n", e, e - e0);
            printf("ke: %e pe: %e\n", ke, pe);
            printf("ke/pe: %e\n", ke / pe);
            printf("cm : %22.15e %22.15e %22.15e\n", cm[0], cm[1], cm[2]);
            printf("dcm: %22.15e %22.15e %22.15e\n", cm[0] - cm0[0], cm[1] - cm0[1], cm[2] - cm0[2]);
            for (k = 0; k < 3; k++) {
                cm0[k] = cm[k];
            }
            printf("CPU time: %e\n", lt);
            printf("speed: %e Ginteraction/s (%6.2f Gflops)\n\n", gintrps, 38.0 * gintrps);
            get_cputime(&lt, &st);
        }
    }

}

