#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <sys/resource.h>
#define NMAX 8192

void get_cputime(double *laptime, double *splittime)
{
  struct timeval tval;
  struct timezone *dum=0;
  gettimeofday(&tval,dum);
  *laptime = tval.tv_sec + tval.tv_usec * 1.0e-6 - *splittime;
  *splittime = tval.tv_sec + tval.tv_usec * 1.0e-6 ;
}

#pragma goose func
double rsqrt(double r2)
{
  return 1.0/sqrt(r2);
}

main(int argc, char **argv)
{
  static double x[NMAX][3];
  static double a[NMAX][3];
  static double c6[NMAX];
  static double c12[NMAX];  
  static int ilist[NMAX];
  static int jlist[NMAX];

  double r2, rinv,r2inv,r8inv,r14inv;
  int i, j, k,n,times,ii;
  double dx[3];
  double c6ij,c12ij,fvdw;
  double lt,st,total_time=0.0;

  if(argc == 3){
    sscanf(argv[1],"%d",&n);
    sscanf(argv[2],"%d",&times);    
  }else if(argc == 2){
    sscanf(argv[1],"%d",&n);
    times = 1;
  }else{
    n = 256;
    times = 1;
  }
  printf("n %d, %d times\n",n,times);

  for(ii = 0;ii< times; ii++){

    for(i=0;i<n;i++){
      x[i][0] = drand48()-0.5;
      x[i][1] = drand48()-0.5;
      x[i][2] = drand48()-0.5;
      c6[i] = 1.0e-3;
      c12[i] = 1.0e-6;    
      ilist[i] = n-1-i;
      jlist[i] = i;      
    }

    get_cputime(&lt,&st);

// #pragma omp parallel for private(j,k,dx,r2,rinv,r2inv,r8inv,r14inv,c6ij,c12ij,fvdw)

#pragma goose parallel for precision ("double") loopcounter(i, j) \
        result (a[ilist[i]][0..2])

    for(i=0;i<n;i++) {
      for(k=0;k<3;k++) a[ilist[i]][k] = 0.0;
      for (j=0;j<n;j++) {
        for(k=0;k<3;k++) dx[k] = x[jlist[j]][k] - x[ilist[i]][k];
        r2 = dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2];
        rinv = rsqrt(r2);
        if(ilist[i]==jlist[j]) rinv = 0.0; 
        r2inv = rinv*rinv;
        r8inv = r2inv*r2inv*r2inv*r2inv;
        r14inv = r8inv*r8inv*r2;
        c6ij = c6[ilist[i]]*c6[jlist[j]];
        c12ij = c12[ilist[i]]*c12[jlist[j]];
        fvdw = 12.0*c12ij*r14inv - 6.0*c6ij*r8inv;
        a[ilist[i]][0] += fvdw * dx[0];
        a[ilist[i]][1] += fvdw * dx[1];
        a[ilist[i]][2] += fvdw * dx[2];      
      }
    }
    get_cputime(&lt,&st); total_time += lt;
  }

  for(i=0;i<n;i++){
    printf("i %d a %22.14e %22.14e %22.14e\n",i,a[i][0],a[i][1],a[i][2]);
  }
  printf("totol time: %g\n",total_time);
}



