#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include "hibutil.h"

// Host-side source buffers used when data is sent by PIO write instead of DMA.
// They were once static arrays (kept below for reference); main() now
// allocates them with calloc(), GRAPE7X_DMABUF_BYTES>>3 64-bit words each.
//static UINT64 piowbuf[GRAPE7X_DMABUF_BYTES>>3]; // should be larger than (GRAPE7X/E_DMABUF_BYTES>>3)
//static UINT64 piowbuf2[NHIB][GRAPE7X_DMABUF_BYTES>>3];
static UINT64 *piowbuf;        // single-device tests
static UINT64 *piowbuf2[NHIB]; // one buffer per HIB for the multi-device tests

/*
 * Test-mode handlers. All of them receive main()'s argc/argv unchanged,
 * so argv[1] is the test ID and handler arguments start at argv[2].
 */

/* status display, reset and raw register/memory access */
static void showstatus(int argc, char **argv);
static void stopdmaw(int argc, char **argv);
static void clearfifo(int argc, char **argv);
static void showdmastatus(int argc, char **argv);
static void configread(int argc, char **argv);
static void configwrite(int argc, char **argv);
static void regread(int argc, char **argv);
static void regwrite(int argc, char **argv);
static void pioread(int argc, char **argv);
static void piowrite(int argc, char **argv);

/* data-transfer checks and performance measurements */
static void dmatest(int argc, char **argv);
static void dmaperf(int argc, char **argv);
static void dmawperf(int argc, char **argv);
static void dmarperf(int argc, char **argv);
static void resetbackend(int argc, char **argv);
static void rawperf(int argc, char **argv);
static void dmaperf2(int argc, char **argv);
static void dmawperf2(int argc, char **argv);
static void dmarperf2(int argc, char **argv);

/* configuration ROM maintenance */
static void eraserom(int argc, char **argv);
static void writerom(int argc, char **argv);
static void readromid(int argc, char **argv);

/* pipeline clock (PLL) configuration */
static void writepllconf(int argc, char **argv);

/* helpers */
static void showusage(int argc, char **argv);
static void get_cputime(double *laptime, double *sprittime);

/*
 * One entry of the test-mode table.
 *   func  : handler invoked with main()'s argc/argv.
 *   usage : one-line description printed by showusage().
 *
 * The handler pointer carries a full prototype so that calls through it
 * are type-checked (an empty parameter list "()" is unchecked in C99/C11
 * and means "(void)" from C23 on).
 */
typedef struct {
    void (*func)(int argc, char **argv);
    char *usage;
} TestMode;

/*
 * Table of test modes. The index of an entry is the <test_program_ID>
 * given as argv[1]; showusage() prints the table in this order.
 */
static TestMode testmode[] = {
    { showstatus,    "show contents of config & HIB-local registers [devid]" },
    { stopdmaw,      "reset DMA and FIFO [devid]" },
    { clearfifo,     "clear HIB-internal FIFO [devid]" },
    { showdmastatus, "show DMA status [devid]" },
    { configread,    "read config register <addr> [devid]" },
    { configwrite,   "write config register <addr> <val> [devid]" },
    { regread,       "read HIB local registers mapped to BAR0 <addr> [devid]" },
    { regwrite,      "write HIB local registers mapped to BAR0 <addr> <val> [devid]" },
    { pioread,       "read backend memory space mapped to BAR1 <addr> [devid]" },
    { piowrite,      "write backend memory space mapped to BAR1 <addr> <val> [devid]" },
    { dmatest,       "check DMA read/write function <size> <sendfunc> [devid] (host <-> HIB)" },
    { dmaperf,       "measure DMA performance <sendfunc> [devid] (host <-> HIB)" },
    { dmawperf,      "measure DMA write performance [devid] (host <- HIB; bypass internal FIFO)" },
    { dmarperf,      "measure DMA read performance <sendfunc> [devid] (host -> HIB; bypass internal FIFO)" },
    { resetbackend,  "reset backend [devid]" },
    { rawperf,       "raw PIO r/w & DMA r/w [devid]" },
    { dmaperf2,      "measure DMA performance with multiple HIBs <sendfunc> (host <-> HIBs internal FIFO)" },
    { dmawperf2,     "measure DMA write performance with multiple HIBs (host <- HIBs; bypass internal FIFO)" },
    { dmarperf2,     "measure DMA read performance with multiple HIBs <sendfunc> (host -> HIBs; bypass internal FIFO)" },
    { eraserom,      "erase configuration ROM (EPCS64) [devid]" },
    { writerom,      "write .rpd to configuration ROM (EPCS64) <rpd-file> [devid]" },
    { readromid,     "read configuration ROM ID (0x10:EPCS1 0x12:EPCS4 0x14:EPCS16 0x16:EPCS64) [devid]" },
    { writepllconf,  "set pipeline clock frequency to (PCI-X_bus_freq * M / N) <M> <N> [devid]" },
};

/*
 * Entry point: argv[1] selects an entry of testmode[]; the selected handler
 * is called with the unmodified argc/argv.
 *
 * The test ID must be a complete decimal number within the table range.
 * Anything else (missing, non-numeric, trailing garbage, out of range)
 * prints the usage and exits with status 1, instead of silently running
 * test 0 as atoi() would for non-numeric input.
 *
 * The PIO-write staging buffers (piowbuf, piowbuf2[]) are allocated here,
 * zero-filled, before the handler runs.
 */
int
main(int argc, char **argv)
{
    const size_t nmodes = sizeof(testmode)/sizeof(testmode[0]);
    char *endp = NULL;
    long mode;
    int i;

    if (argc < 2) {
        showusage(argc, argv);
        exit (1);
    }

    /* out-of-range input saturates to LONG_MIN/LONG_MAX and is rejected below. */
    mode = strtol(argv[1], &endp, 10);
    if (endp == argv[1] || *endp != '\0' || mode < 0 || (size_t)mode >= nmodes) {
        showusage(argc, argv);
        exit (1);
    }

    piowbuf = (UINT64 *)calloc(GRAPE7X_DMABUF_BYTES>>3, sizeof(*piowbuf));
    if (! piowbuf ) {
        fprintf(stderr, "Out of memory.\n");
        exit (1);
    }
    for (i = 0; i < NHIB; i++) {
        piowbuf2[i] = (UINT64 *)calloc(GRAPE7X_DMABUF_BYTES>>3, sizeof(*piowbuf2[i]));
        if (! piowbuf2[i] ) {
            fprintf(stderr, "Out of memory.\n");
            exit (1);
        }
    }

    testmode[mode].func(argc, argv);
    exit (0);
}

/*
 * Print the command synopsis followed by the numbered list of test modes
 * (index and usage text of every testmode[] entry) to stderr.
 * argc is unused; argv[0] supplies the program name.
 */
static void
showusage(int argc, char **argv)
{
    const int count = sizeof(testmode)/sizeof(testmode[0]);
    int id = 0;

    fprintf(stderr, "usage: %s <test_program_ID>\n", argv[0]);
    while (id < count) {
        fprintf(stderr, "  %2d) %s\n", id, testmode[id].usage);
        id++;
    }
}

#if 1 // set 1 by default. set 0 to test loopback backend logic.

/*
 * write data to the HIB-internal FIFO, then read back.
 */
static void
dmatest(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, j, ntry, nng, size0, off;
    int size; /* in 32-bit words */
    double lt = 0.0, st = 0.0;

    if (argc < 4) {
	showusage(argc, argv);
	exit(1);
    }

    if (argc > 4) {
	devid = atoi(argv[4]);
	if (NHIB < devid+1) {
	    fprintf(stderr,
		    "too large devid(= %d).\n",
		    devid);
	    exit(1);
	}
    }

    fprintf(stderr, "\n# check hib[%d] DMA read/write (host <-> HIB internal FIFO)\n\n", devid);

    size = atoi(argv[2]);
    fprintf(stderr, "size %d\n", size);

    sendfunc = atoi(argv[3]);

    h = hib_openMC(devid);
    if (size < 0 || size > (h->dmabuf_bytes>>2)) {
	fprintf(stderr, "inappropriate size %d\n", size);
	exit(1);
    }

    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);
    switch (sendfunc) {
      case SENDFUNC_DMAR:
	fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	rbuf = h->dmar_buf;
	wbuf = h->dmaw_buf;
	break;
      case SENDFUNC_PIOW:
	fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
	rbuf = piowbuf;
	wbuf = h->dmaw_buf;
	break;
      default:
	fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
	exit(1);
    }

    printf("clear DMA buf...\n");

    srand48(time(NULL));
    for (i = 0; i < size+10; i++) {
	rbuf[i] = 0x123456789abc0000ll|i;
        //	rbuf[i] = lrand48() << 32 | lrand48();;
	wbuf[i] = 0xfedcba9876540000ll|i;
	//      fprintf(stderr, "rbuf[0x%02x]: 0x%016llx\n", i, rbuf[i]);
    }

    printf("DMA read size: %d words (%d bytes)\n", size, size*8);
    srand48(time(NULL));
    printf("will dmar...\n");

    hib_sendMC(devid, size, rbuf);
    hib_recvMC(devid, size, wbuf);

    for (nng = 0, i = 0; i < size+2; i++) {
	fprintf(stdout, "rbuf[%04d]: 0x%016llx  wbuf[%04d]: 0x%016llx",
		i, rbuf[i], i, wbuf[i]);
	if (wbuf[i] != rbuf[i] && i < size) {
	    nng++;
	    fprintf(stdout, " NG\n");
	}
	else {
	    fprintf(stdout, " \n");
	}
	if (i+1 == size) {
	    fprintf(stdout, "---- transfer size reached ----\n");
	}
    }
    printf("done\n %d words (%d bytes).\n", size, size*8);
    if (nng) {
	fprintf(stderr, "NG %d words\n", nng);
    }
    else {
	fprintf(stderr, "OK\n");
    }
    for (i = 0; i < size; i++) {
	rbuf[i] = 0;
	wbuf[i] = 0;
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

#else

/*
 * write data to the backend, then read back.
 * works only if a loopback logic is implemented as the backend.
 */
static void
dmatest(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, j, ntry, nng, size0, off;
    int size; /* in 32-bit words */
    double lt = 0.0, st = 0.0;

    if (argc < 4) {
	showusage(argc, argv);
	exit(1);
    }

    if (argc > 4) {
	devid = atoi(argv[4]);
	if (NHIB < devid+1) {
	    fprintf(stderr,
		    "too large devid(= %d).\n",
		    devid);
	    exit(1);
	}
    }

    fprintf(stderr, "\n# check hib[%d] DMA read/write (host <-> backend)\n\n", devid);

    size = atoi(argv[2]);
    fprintf(stderr, "size %d\n", size);

    sendfunc = atoi(argv[3]);

    h = hib_openMC(devid);
    if (size < 0 || size > (h->dmabuf_bytes>>2)) {
	fprintf(stderr, "inappropriate size %d\n", size);
	exit(1);
    }

    //    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);
    switch (sendfunc) {
      case SENDFUNC_DMAR:
	fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> backend)\n", devid);
	rbuf = h->dmar_buf;
	wbuf = h->dmaw_buf;
	break;
      case SENDFUNC_PIOW:
	fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> backend)\n", devid);
	hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
	rbuf = piowbuf;
	wbuf = h->dmaw_buf;
	break;
      default:
	fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
	exit(1);
    }

    printf("clear DMA buf...\n");

    srand48(time(NULL));
    for (i = 0; i < size+10; i++) {
	rbuf[i] = 0x123456789abc0000ll|i;
        //	rbuf[i] = lrand48() << 32 | lrand48();;
	wbuf[i] = 0xfedcba9876540000ll|i;
	//      fprintf(stderr, "rbuf[0x%02x]: 0x%016llx\n", i, rbuf[i]);
    }

    printf("DMA read size: %d words (%d bytes)\n", size, size*8);
    srand48(time(NULL));
    printf("will dmar...\n");

    hib_sendMC(devid, size, rbuf);
    hib_recvMC(devid, size, wbuf);

    for (nng = 0, i = 0; i < size+2; i++) {
	fprintf(stdout, "rbuf[%04d]: 0x%016llx  wbuf[%04d]: 0x%016llx",
		i, rbuf[i], i, wbuf[i]);
	if (wbuf[i] != rbuf[i] && i < size) {
	    nng++;
	    fprintf(stdout, " NG\n");
	}
	else {
	    fprintf(stdout, " \n");
	}
	if (i+1 == size) {
	    fprintf(stdout, "---- transfer size reached ----\n");
	}
    }
    printf("done\n %d words (%d bytes).\n", size, size*8);
    if (nng) {
	fprintf(stderr, "NG %d words\n", nng);
    }
    else {
	fprintf(stderr, "OK\n");
    }
    for (i = 0; i < size; i++) {
	rbuf[i] = 0;
	wbuf[i] = 0;
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

#endif


/*
 * Measure send+receive throughput between the host and the HIB-internal FIFO.
 *
 * usage: <prog> <test_ID> <sendfunc> [devid]
 *   sendfunc : SENDFUNC_DMAR or SENDFUNC_PIOW; anything else aborts.
 *   devid    : HIB to test (default 0); must be in [0, NHIB).
 *
 * For each transfer size (32..512 64-bit words, doubling) about nloop words
 * are sent with hib_sendMC() and received with hib_recvMC(); the elapsed
 * time reported by get_cputime() and the combined throughput are printed.
 */
static void
dmaperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, j;
    int size; /* in 64-bit words */
    double lt = 0.0, st = 0.0;
    const double nloop = 1e7; /* words moved per direction for each size */

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 3) {
        devid = atoi(argv[3]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }

    sendfunc = atoi(argv[2]);

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);

    switch (sendfunc) {
      case SENDFUNC_DMAR:
        fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
        rbuf = h->dmar_buf;
        wbuf = h->dmaw_buf;
        break;
      case SENDFUNC_PIOW:
        fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        rbuf = piowbuf;
        wbuf = h->dmaw_buf;
        break;
      default:
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    /* NOTE(review): in PIOW mode rbuf is piowbuf (GRAPE7X_DMABUF_BYTES>>3
     * words); this assumes h->dmabuf_bytes does not exceed that - confirm. */
    for (i = 0; i < (h->dmabuf_bytes>>3); i++) {
        rbuf[i] = 0x123456789abc0000ll|i;
        wbuf[i] = 0xfedcba9876540000ll|i;
    }

    for (size = 32; size <= 512; size *= 2) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            hib_sendMC(devid, size, rbuf);
            hib_recvMC(devid, size, wbuf);
        }
        get_cputime(&lt, &st);
        /* factor 2: both directions are counted. */
        printf("size: % 5d DMA read & write: % 4.1f sec  % 7.2f MB/s\n",
               size*8, lt, 2*sizeof(UINT64)*nloop/1e6/lt);
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}


/*
 * Measure DMA write (host <- HIB) throughput in TESTMODE_REFDESIGN_RAM.
 *
 * usage: <prog> <test_ID> [devid]
 *   devid : HIB to test (default 0); must be in [0, NHIB).
 *
 * For each transfer size (128..4096 64-bit words, doubling) about nloop
 * words are received with hib_recvMC(); elapsed time and throughput are
 * printed to stdout.
 */
static void
dmawperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int i, j;
    int size; /* in 64-bit words */
    double lt = 0.0, st = 0.0;
    const double nloop = 1e7; /* words received for each size */

    if (argc < 2) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 2) {
        devid = atoi(argv[2]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
    rbuf = h->dmar_buf;
    wbuf = h->dmaw_buf;

    for (i = 0; i < (h->dmabuf_bytes>>3); i++) {
        rbuf[i] = 0x123456789abc0000ll|i;
        wbuf[i] = 0xfedcba9876540000ll|i;
    }

    fprintf(stderr, "\n# hib[%d] DMA write (host <- HIB)\n", devid);
    for (size = 128; size <= 4096; size *= 2) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            hib_recvMC(devid, size, wbuf);
        }
        get_cputime(&lt, &st);
        printf("size: %d DMA write: %f sec  %f MB/s\n",
               size*8, lt, sizeof(UINT64)*nloop/1e6/lt);
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}


/*
 * Measure host -> HIB throughput in TESTMODE_REFDESIGN_RAM.
 *
 * usage: <prog> <test_ID> <sendfunc> [devid]
 *   sendfunc : SENDFUNC_DMAR or SENDFUNC_PIOW; anything else aborts.
 *   devid    : HIB to test (default 0); must be in [0, NHIB).
 *
 * For each transfer size (32..4096 64-bit words, doubling) about nloop
 * words are sent with hib_sendMC(); elapsed time and throughput are
 * printed to stdout. The result line says "DMA read" for both send modes.
 */
static void
dmarperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int sendfunc;
    int i, j;
    int size; /* in 64-bit words */
    double lt = 0.0, st = 0.0;
    const double nloop = 1e7; /* words sent for each size */

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 3) {
        devid = atoi(argv[3]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }

    sendfunc = atoi(argv[2]);

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);

    switch (sendfunc) {
      case SENDFUNC_DMAR:
        fprintf(stderr, "\n# hib[%d] DMA read (host -> HIB)\n", devid);
        rbuf = h->dmar_buf;
        wbuf = h->dmaw_buf;
        break;
      case SENDFUNC_PIOW:
        fprintf(stderr, "\n# hib[%d] PIO write (host -> HIB)\n", devid);
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        rbuf = piowbuf;
        wbuf = h->dmaw_buf;
        break;
      default:
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    /* NOTE(review): in PIOW mode rbuf is piowbuf (GRAPE7X_DMABUF_BYTES>>3
     * words); this assumes h->dmabuf_bytes does not exceed that - confirm. */
    for (i = 0; i < (h->dmabuf_bytes>>3); i++) {
        rbuf[i] = 0x123456789abc0000ll|i;
        wbuf[i] = 0xfedcba9876540000ll|i;
    }

    for (size = 32; size <= 4096; size *= 2) {
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            hib_sendMC(devid, size, rbuf);
        }
        get_cputime(&lt, &st);
        printf("size: %d DMA read: %f sec  %f MB/s\n",
               size*8, lt, sizeof(UINT64)*nloop/1e6/lt);
    }
    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

static void
dmaperf2(int argc, char **argv)
{
    Hib *h[NHIB];
    UINT64 *wbuf[NHIB], *rbuf[NHIB];
    int devid = 0;
    int sendfunc;
    int i, j, ntry, nng, size0, off;
    int size; /* in 32-bit words */
    double lt = 0.0, st = 0.0;
    double nloop;

    if (argc < 3) {
	showusage(argc, argv);
	exit(1);
    }
    sendfunc = atoi(argv[2]);

    for (devid = 0; devid < NHIB; devid++) {
        h[devid] = hib_openMC(devid);
	hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_FIFO);

	switch (sendfunc) {
	  case SENDFUNC_DMAR:
	    fprintf(stderr, "\n# hib[%d] DMA read, and then DMA write (host <-> HIB internal FIFO)\n", devid);
            rbuf[devid] = h[devid]->dmar_buf;
            wbuf[devid] = h[devid]->dmaw_buf;
	    break;
	  case SENDFUNC_PIOW:
	    fprintf(stderr, "\n# hib[%d] PIO write, and then DMA write (host <-> HIB internal FIFO)\n", devid);
	    hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
	    rbuf[devid] = piowbuf2[devid];
	    wbuf[devid] = h[devid]->dmaw_buf;
	    break;
	  default:
	    fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
	    exit(1);
	}
    }

    nloop = 1e7;

    for (size = 32; size <= 512; size *= 2) { // size in 8-byte word
	get_cputime(&lt, &st);
	for (j = 0; j < nloop/size; j++) {
	    for (devid = 0; devid < NHIB; devid++) {
		hib_sendMC(devid, size, rbuf[devid]);
	    }
	    for (devid = 0; devid < NHIB; devid++) {
		hib_recvMC(devid, size, wbuf[devid]);
	    }
	}
	get_cputime(&lt, &st);
	printf("size: % 5d byte   DMA read & write: % 4.1f sec  % 7.2f MB/s\n",
	       size*8, lt, NHIB*sizeof(UINT64)*nloop/1e6/lt);
    }

    for (devid = 0; devid < NHIB; devid++) {
	hib_set_test_modeMC(devid, TESTMODE_NONE);
	hib_closeMC(devid);
    }
}

/*
 * Measure DMA write (host <- HIB) throughput with all HIBs
 * (devid 0 .. NHIB-1) running concurrently in TESTMODE_REFDESIGN_RAM.
 *
 * usage: <prog> <test_ID>
 *
 * For each transfer size (32..4096 8-byte words, doubling) a DMA write is
 * started on every board with hib_start_dmawMC() and then completed on
 * every board with hib_finish_dmawMC(). The result line is labelled
 * "DMA write", since nothing is sent by this test.
 */
static void
dmawperf2(int argc, char **argv)
{
    Hib *h[NHIB];
    UINT64 *wbuf[NHIB];
    int devid = 0;
    int j;
    int size; /* in 8-byte words */
    double lt = 0.0, st = 0.0;
    const double nloop = 1e7; /* words received per board for each size */

    fprintf(stderr, "\n# DMA write (host <- HIB) using all hibs simultaneously.\n");
    for (devid = 0; devid < NHIB; devid++) {
        h[devid] = hib_openMC(devid);
        hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
        wbuf[devid] = h[devid]->dmaw_buf;
    }

    for (size = 32; size <= 4096; size *= 2) { // size in 8-byte word
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
#if 0
            /* blocking variant: one board after another. */
            for (devid = 0; devid < NHIB; devid++) {
                hib_recvMC(devid, size, wbuf[devid]);
            }
#else
            /* start all transfers before waiting for any of them. */
            for (devid = 0; devid < NHIB; devid++) {
                hib_start_dmawMC(devid, size, wbuf[devid]);
            }
            for (devid = 0; devid < NHIB; devid++) {
                hib_finish_dmawMC(devid);
            }
#endif
        }
        get_cputime(&lt, &st);
        printf("size: % 5d byte   DMA write: % 4.1f sec  % 7.2f MB/s\n",
               size*8, lt, NHIB*sizeof(UINT64)*nloop/1e6/lt);
    }

    for (devid = 0; devid < NHIB; devid++) {
        hib_set_test_modeMC(devid, TESTMODE_NONE);
        hib_closeMC(devid);
    }
}

/*
 * Measure host -> HIB throughput with all HIBs (devid 0 .. NHIB-1) in
 * TESTMODE_REFDESIGN_RAM.
 *
 * usage: <prog> <test_ID> <sendfunc>
 *   sendfunc : SENDFUNC_DMAR (hib_start_dmarMC on every board, then
 *              hib_finish_dmarMC on every board) or SENDFUNC_PIOW
 *              (hib_piowMC on every board); anything else aborts before
 *              any board is opened.
 *
 * The result line is labelled "DMA read" or "PIO write" according to the
 * selected send function, since nothing is received by this test.
 */
static void
dmarperf2(int argc, char **argv)
{
    Hib *h[NHIB];
    UINT64 *rbuf[NHIB];
    const char *label;
    int devid = 0;
    int sendfunc;
    int j;
    int size; /* in 8-byte words */
    double lt = 0.0, st = 0.0;
    const double nloop = 1e7; /* words sent per board for each size */

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    sendfunc = atoi(argv[2]);

    switch (sendfunc) {
      case SENDFUNC_DMAR:
        fprintf(stderr, "\n# DMA read (host -> HIB) using all hibs simultaneously.\n");
        label = "DMA read";
        break;
      case SENDFUNC_PIOW:
        fprintf(stderr, "\n# PIO write (host -> HIB) using all hibs simultaneously.\n");
        label = "PIO write";
        break;
      default:
        fprintf(stderr, "invalid sendfunc: %d. abort.\n", sendfunc);
        exit(1);
    }

    /* sendfunc is known to be valid from here on. */
    for (devid = 0; devid < NHIB; devid++) {
        h[devid] = hib_openMC(devid);
        hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);

        if (sendfunc == SENDFUNC_PIOW) {
            hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
            rbuf[devid] = piowbuf2[devid];
        }
        else {
            rbuf[devid] = h[devid]->dmar_buf;
        }
    }

    for (size = 32; size <= 4096; size *= 2) { // size in 8-byte word
        get_cputime(&lt, &st);
        for (j = 0; j < nloop/size; j++) {
            if (sendfunc == SENDFUNC_DMAR) {
                /* start all transfers before waiting for any of them. */
                for (devid = 0; devid < NHIB; devid++) {
                    hib_start_dmarMC(devid, size, rbuf[devid]);
                }
                for (devid = 0; devid < NHIB; devid++) {
                    hib_finish_dmarMC(devid);
                }
            }
            else {
                for (devid = 0; devid < NHIB; devid++) {
                    hib_piowMC(devid, size, rbuf[devid]);
                }
            }
        }
        get_cputime(&lt, &st);
        printf("size: % 5d byte   %s: % 4.1f sec  % 7.2f MB/s\n",
               size*8, label, lt, NHIB*sizeof(UINT64)*nloop/1e6/lt);
    }

    for (devid = 0; devid < NHIB; devid++) {
        hib_set_test_modeMC(devid, TESTMODE_NONE);
        hib_closeMC(devid);
    }
}

/*
 * Print bus information and the configuration registers of one HIB
 * (argv[2], if given) or of all HIBs (devid 0 .. NHIB-1) to stderr.
 *
 * For HIB_GRAPE7X boards the bus frequency and PCI/PCI-X mode are decoded
 * from the PLDA core status register (config offset 0x44); other boards
 * are reported as 125 MHz PCIe. The first 16 config dwords are dumped;
 * for dwords 4..7 the value with its low 4 bits cleared is shown as well.
 * Dwords 16..23 are dumped additionally when the board is in PCI-X mode.
 */
static void
showstatus(int argc, char **argv)
{
    Hib *h;
    int i, j;
    int plda_csr, busmode;
    int devid;
    double freq;
    int hib0, nhibs;

    hib0 = 0;
    nhibs = NHIB;
    if (argc > 2) {
        hib0 = atoi(argv[2]);
        if (hib0 < 0 || NHIB <= hib0) {
            fprintf(stderr, "invalid devid(= %d).\n", hib0);
            exit(1);
        }
        nhibs = 1;
    }

    for (j = 0; j < nhibs; j++) {
        devid = hib0 + j;
        h = hib_openMC(devid);

        /* reset per board: only GRAPE7X boards can report PCI-X mode, and
         * busmode is tested below for every board type. */
        busmode = 0;
        freq = 0.0;

        fprintf(stderr, "## hib%d:\n", devid);

        if (h->type == HIB_GRAPE7X) { // XHIB
            plda_csr = hib_config_readMC(devid, 0x44); /* PLDA Core Status Register at 0x44 */
            switch ((plda_csr>>28)&0x3) {
              case 0:
                freq = 33.0;
                break;
              case 1:
                freq = 66.0;
                break;
              case 2:
                freq = 100.0;
                break;
              case 3:
                freq = 133.0;
                break;
            }
            busmode = (plda_csr>>30)&0x1;
            fprintf(stderr, "PCI bus freq.: %3.0f MHz  Bus mode: %s \n",
                    freq, busmode ? "PCI-X" : "PCI");
        }
        else {
            fprintf(stderr, "PCI bus freq.: %3.0f MHz  Bus mode: %s \n",
                    125.0, "PCIe");
        }
        fprintf(stderr, "configuration register:\n");
        for (i = 0; i < 16; i++) {
            /* read each register once so both columns show the same value. */
            unsigned int val = hib_config_readMC(devid, i*4);

            if (4 <= i && i <= 7) {
                fprintf(stderr, "0x%08x: 0x%08x 0x%08x\n",
                        i*4, val, (val>>4)<<4);
            }
            else {
                fprintf(stderr, "0x%08x: 0x%08x\n", i*4, val);
            }
        }
        if (busmode) {
            fprintf(stderr, "---- PCI-X extension ----\n");
            for (; i < 24; i++) {
                fprintf(stderr, "0x%08x: 0x%08x\n",
                        i*4, hib_config_readMC(devid, i*4));
            }
        }
        hib_closeMC(devid);
    }
}

/*
 * Read one configuration register and print it to stderr.
 *
 * usage: <prog> <test_ID> <addr> [devid]
 *   addr  : register address, parsed as hexadecimal.
 *   devid : HIB to access (default 0); must be in [0, NHIB).
 */
static void
configread(int argc, char **argv)
{
    int devid = 0;
    unsigned long int addr;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 3) {
        devid = atoi(argv[3]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }
    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);

    fprintf(stderr, "hib[%d] config 0x%08lx: 0x%08x\n",
            devid, addr, hib_config_readMC(devid, addr));

    hib_closeMC(devid);
}

/*
 * Write one configuration register.
 *
 * usage: <prog> <test_ID> <addr> <val> [devid]
 *   addr, val : parsed as hexadecimal.
 *   devid     : HIB to access (default 0); must be in [0, NHIB).
 *
 * The address and value are echoed to stderr before the write.
 */
static void
configwrite(int argc, char **argv)
{
    int devid = 0;
    unsigned long int addr, val;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }

    if (argc > 4) {
        devid = atoi(argv[4]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }
    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);
    val = strtoul(argv[3], (char**)NULL, 16);
    fprintf(stderr, "write to hib[%d] config 0x%08lx value 0x%08lx\n",
            devid, addr, val);
    hib_config_writeMC(devid, addr, val);
    hib_closeMC(devid);
}

/*
 * Read one HIB-local register with hib_mem_readMC() and print it to stderr.
 *
 * usage: <prog> <test_ID> <addr> [devid]
 *   addr  : register address, parsed as hexadecimal.
 *   devid : HIB to access (default 0); must be in [0, NHIB).
 */
static void
regread(int argc, char **argv)
{
    int devid = 0;
    unsigned long int addr;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 3) {
        devid = atoi(argv[3]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }
    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);
    fprintf(stderr, "hib[%d] 0x%08lx: 0x%08x\n",
            devid, addr, hib_mem_readMC(devid, addr));
    /* 64-bit variant, kept for reference:
    fprintf(stderr, "hib[%d] 0x%08lx: 0x%016x\n",
            devid, addr, hib_mem_readMC64(devid, addr));
    */

    hib_closeMC(devid);
}

/*
 * Write one HIB-local register with hib_mem_writeMC().
 *
 * usage: <prog> <test_ID> <addr> <val> [devid]
 *   addr, val : parsed as hexadecimal.
 *   devid     : HIB to access (default 0); must be in [0, NHIB).
 *
 * The address and value are echoed to stderr before the write.
 */
static void
regwrite(int argc, char **argv)
{
    int devid = 0;
    unsigned long int addr, val;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 4) {
        devid = atoi(argv[4]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }
    hib_openMC(devid);
    addr = strtoul(argv[2], (char**)NULL, 16);
    val = strtoul(argv[3], (char**)NULL, 16);
    fprintf(stderr, "write to hib[%d] 0x%08lx value 0x%08lx\n",
            devid, addr, val);
    hib_mem_writeMC(devid, addr, val);
    hib_closeMC(devid);
}

/*
 * Write the DMA reset bit to the DMA status register of one HIB (argv[2],
 * if given) or of all HIBs (devid 0 .. NHIB-1).
 *
 * The mask is built from an unsigned 1 so that a reset bit at position 31
 * (0x80000000, as in the older hard-coded value) does not shift into the
 * sign bit of an int.
 */
static void
stopdmaw(int argc, char **argv)
{
    Hib *h;
    int j;
    int devid;
    int hib0, nhibs;

    hib0 = 0;
    nhibs = NHIB;
    if (argc > 2) {
        hib0 = atoi(argv[2]);
        if (hib0 < 0 || NHIB <= hib0) {
            fprintf(stderr, "invalid devid(= %d).\n", hib0);
            exit(1);
        }
        nhibs = 1;
    }
    fprintf(stderr, "hib0: %d    nhibs: %d\n", hib0, nhibs);

    fprintf(stderr, "stop DMA channel operation (host <- HIB)");
    for (j = 0; j < nhibs; j++) {
        devid = hib0 + j;
        h = hib_openMC(devid);
        fprintf(stderr, "\n#\n# stop hib[%d]\n#\n", devid);
        //	hib_mem_writeMC(devid, h->dmastat, 0x80000000);
        hib_mem_writeMC(devid, h->dmastat, (1u << h->dmastat_dma_reset_bit));
        hib_closeMC(devid);
    }
}

/*
 * Return the DMA1 data count of board <devid>, in bytes, read from the
 * dma1misc register. Exits with status 2 for an unknown board type.
 */
static int
getdma1datacnt(int devid, Hib *h)
{
    int nbytes = 0;

    if (h->type == HIB_GRAPE7X) {
        // in PCI-X core, datacnt is counted in 64-bit word unit.
        nbytes = 8 * (hib_mem_readMC(devid, h->dma1misc) & 0x7ff);
    }
    else if (h->type == HIB_GRAPE7E) {
        // in PCIe core, datacnt is counted in byte unit.
        nbytes = (hib_mem_readMC(devid, h->dma1misc) & 0x1fff);
    }
    else {
        fprintf(stderr, "getdma1datacnt: unknown Hib type: %d\n", h->type);
        exit(2);
    }
    return nbytes;
}

static void
clearfifo(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int i, j;
    int devid;
    int datacnt;

    int hib0, nhibs;

    hib0 = 0;
    nhibs = NHIB;
    if (argc > 2) {
	hib0 = atoi(argv[2]);
	nhibs = 1;
    }
    fprintf(stderr, "hib0: %d    nhibs: %d\n", hib0, nhibs);

    fprintf(stderr, "clear HIB-internal FIFO \n");
    for (j = 0; j < nhibs; j++) {
	devid = hib0 + j;
        h = hib_openMC(devid);

	fprintf(stderr, "\n#\n# clear hib[%d] FIFO ", devid);
	while (datacnt = getdma1datacnt(devid, h)) {
	    fprintf(stderr, "dma1 datacnt: %d byte(s)\n", datacnt);
	    usleep(3000000/datacnt);
	    hib_recvMC(devid, 1, wbuf);
	}

	fprintf(stderr, "... done.\n#\n");

	hib_closeMC(devid);
    }
}

/*
 * Fetch the two SRAM write-lock flags of board <devid> into wlock[0] and
 * wlock[1] (each 0 or 1). The register and bit positions depend on the
 * board type; an unknown type terminates the program with status 2.
 */
static void
getwlock(int devid, Hib *h, int wlock[2])
{
    if (h->type == HIB_GRAPE7X) {
        /* both flags are taken from dma0misc: bits 25 and 26. */
        wlock[0] = (hib_mem_readMC(devid, h->dma0misc) >> 25) & 1;
        wlock[1] = (hib_mem_readMC(devid, h->dma0misc) >> 26) & 1;
    }
    else if (h->type == HIB_GRAPE7E) {
        /* bit 14 of dma0misc and of dma1misc respectively. */
        wlock[0] = (hib_mem_readMC(devid, h->dma0misc) >> 14) & 1;
        wlock[1] = (hib_mem_readMC(devid, h->dma1misc) >> 14) & 1;
    }
    else {
        fprintf(stderr, "getwlock: unknown Hib type: %d\n", h->type);
        exit(2);
    }
}

/*
 * Print the PIO-write and DMA0/DMA1 status of one HIB (argv[2], if given)
 * or of all HIBs (devid 0 .. NHIB-1) to stderr.
 *
 * A given devid must be in [0, NHIB), as for the single-device commands.
 * Every field is fetched with its own register read, so the values of one
 * report are not a single atomic snapshot.
 */
static void
showdmastatus(int argc, char **argv)
{
    Hib *h;
    int j;
    int devid;
    int hib0, nhibs;
    int wlock[2];

    hib0 = 0;
    nhibs = NHIB;
    if (argc > 2) {
        hib0 = atoi(argv[2]);
        if (hib0 < 0 || NHIB <= hib0) {
            fprintf(stderr, "invalid devid(= %d).\n", hib0);
            exit(1);
        }
        nhibs = 1;
    }
    fprintf(stderr, "hib0: %d    nhibs: %d\n", hib0, nhibs);

    fprintf(stderr, "show DMA status \n");
    for (j = 0; j < nhibs; j++) {
        devid = hib0 + j;
        h = hib_openMC(devid);
        getwlock(devid, h, wlock);
        fprintf(stderr, "\n#\n# hib[%d]\n#\n", devid);
        fprintf(stderr, "PIO write (host->HIB)\n");
        fprintf(stderr, "    swap_sram: %d\n", (hib_mem_readMC(devid, h->dma0misc)>>(h->dma0misc_swap_sram_bit))&1);
        fprintf(stderr, "    sram0_wlock: %d\n", wlock[0]);
        fprintf(stderr, "    sram1_wlock: %d\n", wlock[1]);
        fprintf(stderr, "    sram_wcnt: %d\n", (hib_mem_readMC(devid, h->dma0misc)>>(h->dma0misc_sram_wcnt_bit))&0x1ff); // !!!
        fprintf(stderr, "\n");

        fprintf(stderr, "DMA0 (host->HIB)\n");
        fprintf(stderr, "    dma0_done: %d\n", (hib_mem_readMC(devid, h->dma0misc) >> (h->dma0misc_done_bit))&1);
        fprintf(stderr, "    data to be transferred:    %d byte\n", hib_mem_readMC(devid, h->dma0size));
        fprintf(stderr, "    command & status register: 0x%08x\n", hib_mem_readMC(devid, h->dma0cmd));
        fprintf(stderr, "\n");

        fprintf(stderr, "DMA1 (host<-HIB)\n");
        fprintf(stderr, "    dma1_done: %d\n", (hib_mem_readMC(devid, h->dma1misc) >> (h->dma1misc_done_bit))&1);
        fprintf(stderr, "    m_dma1_datacnt:            %d byte\n",
                getdma1datacnt(devid, h));
        fprintf(stderr, "    data to be transferred:    %d byte\n", hib_mem_readMC(devid, h->dma1size));
        fprintf(stderr, "    command & status register: 0x%08x\n", hib_mem_readMC(devid, h->dma1cmd));
        fprintf(stderr, "\n");

        hib_closeMC(devid);
    }
}

/*
 * Read one 64-bit word from the backend memory window (h->backend) and
 * print it to stderr.
 *
 * usage: <prog> <test_ID> <addr> [devid]
 *   addr  : byte address, parsed as hexadecimal and rounded down to a
 *           multiple of 8.
 *   devid : HIB to access (default 0); must be in [0, NHIB).
 *
 * NOTE(review): the address is not checked against the size of the backend
 * window, which is not visible from this function.
 */
static void
pioread(int argc, char **argv)
{
    Hib *h;
    int devid = 0;
    unsigned long int baddr, waddr;

    if (argc < 3) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 3) {
        devid = atoi(argv[3]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }
    h = hib_openMC(devid);
    /* strtoul matches the type of baddr and the parsing done in piowrite(). */
    baddr = strtoul(argv[2], (char**)NULL, 16);

    waddr = baddr >> 3; /* index of the 64-bit word */
    baddr = waddr << 3; /* aligned byte address, for display */
    fprintf(stderr, "backend[%d] 0x%08lx: 0x%016llx\n",
            devid, baddr, h->backend[waddr]);

    hib_closeMC(devid);
}

/*
 * Write one 64-bit word to the backend memory window (h->backend).
 *
 * usage: <prog> <test_ID> <addr> <val> [devid]
 *   addr  : byte address, parsed as hexadecimal and rounded down to a
 *           multiple of 8.
 *   val   : 64-bit value, parsed as hexadecimal.
 *   devid : HIB to access (default 0); must be in [0, NHIB).
 *
 * NOTE(review): the address is not checked against the size of the backend
 * window, which is not visible from this function.
 */
static void
piowrite(int argc, char **argv)
{
    Hib *h;
    int devid = 0;
    unsigned long int baddr, waddr;
    UINT64 val;

    if (argc < 4) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 4) {
        devid = atoi(argv[4]);
        if (devid < 0 || NHIB <= devid) {
            fprintf(stderr, "invalid devid(= %d).\n", devid);
            exit(1);
        }
    }
    h = hib_openMC(devid);
    baddr = strtoul(argv[2], (char**)NULL, 16);
    val = strtoull(argv[3], (char**)NULL, 16);

    waddr = baddr >> 3; /* index of the 64-bit word */
    baddr = waddr << 3; /* aligned byte address, for display */
    fprintf(stderr, "write to backend[%d] 0x%08lx value 0x%016llx\n",
            devid, baddr, val);
    h->backend[waddr] = val;
    hib_closeMC(devid);
}

#define MEGA (1e6)
static void
rawperf(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    int i, j, ntry, nng, size0, off;
    int size; /* in 64-bit words */
    double sized, ratio, nloop;
    double lt = 0.0, st = 0.0;
    UINT64 *b;

    if (argc < 2) {
	showusage(argc, argv);
	exit(1);
    }

    h = hib_openMC(devid);
    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
    rbuf = h->dmar_buf;
    wbuf = h->dmaw_buf;
    ratio = 1.01;

#if 1
    // DMA read
    nloop = 1e5;
    printf("\n#\n# DMA read\n#\n");
    for (sized = size = 2; size < 1024*16+1; sized *= ratio, size = sized) {
	get_cputime(&lt, &st);
	for (j = 0; j < nloop/size; j++) {
	    hib_sendMC(devid, size, rbuf);
	}
	get_cputime(&lt, &st);
	printf("%ld byte    %f sec    %f MB/s\n",
               size*sizeof(UINT64), lt, nloop*sizeof(UINT64)/MEGA/lt);
	fflush(stdout);
    }

    // DMA write
    nloop = 1e5;
    printf("\n#\n# DMA write\n#\n");
    for (sized = size = 2; size < 1024*16+1; sized *= ratio, size = sized) {
	get_cputime(&lt, &st);
	for (j = 0; j < nloop/size; j++) {
	    hib_recvMC(devid, size, wbuf);
	}
	get_cputime(&lt, &st);
	printf("%ld byte    %f sec    %f MB/s\n",
               size*sizeof(UINT64), lt, nloop*sizeof(UINT64)/MEGA/lt);
	fflush(stdout);
    }
#endif

    // PIO write
    nloop = 1e5;

    hib_set_test_modeMC(devid, TESTMODE_REFDESIGN_RAM);
    hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
    printf("\n#\n# PIO write\n#\n");

    for (sized = size = 2; size < 1024*16+1; sized *= ratio, size = sized) {
	get_cputime(&lt, &st);
	for (j = 0; j < nloop/size; j++) {
	    hib_sendMC(devid, size, piowbuf);
	}
	get_cputime(&lt, &st);
	printf("%ld byte    %f sec    %f MB/s\n",
               size*sizeof(UINT64), lt, nloop*sizeof(UINT64)/MEGA/lt);
	fflush(stdout);
    }

#if 0 // never turn this flag on for PCIe.
    // PIO read
    nloop = 1e4;

    printf("\n#\n# PIO read\n#\n");
    for (sized = size = 2; size < 1024+1; sized *= ratio, size = sized) {
	get_cputime(&lt, &st);
	for (j = 0; j < nloop/size; j++) {
	    for (i = 0; i < size; i++) {
	        piowbuf[i] = h->backend[i];
	    }
	    // _mm_mfence();
	}
	get_cputime(&lt, &st);
	printf("%ld byte    %f sec    %f MB/s\n",
               size*sizeof(UINT64), lt, nloop*sizeof(UINT64)/MEGA/lt);
	fflush(stdout);
    }
#endif


    hib_set_test_modeMC(devid, TESTMODE_NONE);
    hib_closeMC(devid);
}

static void
eraserom(int argc, char **argv)
{
    Hib *h;
    UINT64 *wbuf, *rbuf;
    int devid = 0;
    volatile UINT32 asmi;

    if (NHIB > 1 && argc < 3) {
        showusage(argc, argv);
        exit (1);
    }
    if (argc > 2) {
        devid = atoi(argv[2]);
	if (NHIB < devid+1) {
	    fprintf(stderr,
		    "too large devid(= %d).\n",
		    devid);
	    exit(1);
	}
    }
    h = hib_openMC(devid);
    fprintf(stderr, "\nerasing configuration ROM of hib[%d].\n"
            "this would take a few minutes. be patient.", devid);

    UINT32 var = h->asmi_cmd_eb;
    hib_mem_writeMC(devid, h->asmi, var);
    do {
        asmi = hib_mem_readMC(devid, h->asmi);
        sleep(3);
        fprintf(stderr, ".");
    }
    while (asmi & (1 << h->asmi_busy_bit));

    if (asmi & (1 << h->asmi_error_bit)) {
	fprintf(stderr, "failed for unknown reason.\n");
	exit(1);
    }

    fprintf(stderr, "done.\n\n");
    hib_closeMC(devid);
}

// Return `in` with the order of its eight bits mirrored
// (bit 0 <-> bit 7, bit 1 <-> bit 6, ...).
static char
reverse_byte(char in)
{
    unsigned char src = (unsigned char)in;
    unsigned char mirrored = 0;
    int bit;

    // walk the source bits from LSB to MSB and drop each one into the
    // mirrored position of the result.
    for (bit = 0; bit < 8; bit++) {
        if (src & (1u << bit)) {
            mirrored |= (unsigned char)(1u << (7 - bit));
        }
    }

    return (char)mirrored;
}

// Program the configuration ROM of one HIB with the contents of a file.
//
// argv[2] : path of the image file to write.
// argv[3] : device id; optional when NHIB == 1, required otherwise.
//
// At most (1<<24)-1 bytes are read from the file.  The data is sent in
// pages of up to 256 bytes: each byte is pushed into the ASMI register
// OR-ed with h->asmi_cmd_wp, then a word holding the start address and
// page length is written to flush the page, and the busy bit is polled
// until the flush finishes.
//
// Exits with status 1 on bad arguments, on file/allocation errors, or
// when the ASMI error bit is set after a page flush.
static void
writerom(int argc, char **argv)
{
    Hib *h;
    unsigned int i, addr, size, pagesize, sizemax;
    int devid = 0;
    FILE *fp;
    const char *poffile;
    char *pofdata;
    volatile UINT32 asmi;

    if (argc < 3 || (NHIB > 1 && argc < 4)) {
        showusage(argc, argv);
        exit(1);
    }
    // use the argument in place: copying it into a fixed buffer with
    // strncpy() could truncate the path and leave it unterminated.
    poffile = argv[2];
    if (argc > 3) {
        devid = atoi(argv[3]);
        // atoi() happily returns negative numbers; reject them before they
        // are handed to hib_openMC().
        if (devid < 0) {
            fprintf(stderr,
                    "invalid devid(= %d).\n",
                    devid);
            exit(1);
        }
        if (NHIB < devid+1) {
            fprintf(stderr,
                    "too large devid(= %d).\n",
                    devid);
            exit(1);
        }
    }

    // the image is raw binary data, so open it in binary mode.
    fp = fopen(poffile, "rb");
    if (!fp) {
        perror("writerom");
        exit(1);
    }
    sizemax = (1<<24)-1; // 16Mbyte
    pofdata = malloc(sizemax);
    if (!pofdata) {
        fprintf(stderr, "writerom: out of memory.\n");
        fclose(fp);
        exit(1);
    }
    size = (unsigned int)fread(pofdata, sizeof(char), sizemax, fp);
    if (ferror(fp)) {
        // never flash a partially read image.
        fprintf(stderr, "writerom: failed to read %s\n", poffile);
        fclose(fp);
        free(pofdata);
        exit(1);
    }
    fclose(fp);
    fprintf(stderr, "read %u byte in %s\n", size, poffile);

    h = hib_openMC(devid);
    fprintf(stderr, "\nwriting %s to configuration ROM of hib[%d]...\n\n",
            poffile, devid);

    pagesize = 256;
    for (addr = 0; addr < size; addr += pagesize) {
        UINT32 var;

        // the last page may be shorter than a full page.
        if (pagesize + addr > size) {
            pagesize = size - addr;
        }

        // stage1: fill up ASMI-megafunction-internal fifo.
        for (i = 0; i < pagesize; i++) {
            var = (h->asmi_cmd_wp | (0xff & pofdata[addr + i])); // don't reverse the bit order. the hardware handles it.
            hib_mem_writeMC(devid, h->asmi, var);

#if 1 // just to wait for a moment in order to make sure the written byte is send to the fifo
            do {
                asmi = hib_mem_readMC(devid, h->asmi);
            }
            while (asmi & (1 << h->asmi_busy_bit));
#endif

        }

        // stage2: flush the fifo to the ROM.
        // NOTE(review): a full 256-byte page yields 0 in the low byte
        // (0xff & 256 == 0); presumably the hardware reads 0 as 256 -- confirm.
        var = (addr << 8) | (0xff & pagesize);
        hib_mem_writeMC(devid, h->asmi, var); // writing start address will start flushing automatically.
        do {
            asmi = hib_mem_readMC(devid, h->asmi);
        }
        while (asmi & (1 << h->asmi_busy_bit));

        if (asmi & (1 << h->asmi_error_bit)) {
            fprintf(stderr, "failed for unknown reason.\n");
            exit(1);
        }

        // progress report every 256 KiB.
        if (addr % (1<<18) == 0) {
            fprintf(stderr, "%5.1f MB (%2u%%) done\n",
                    (double)(addr)/(1<<20), 100*addr/size);
        }
    }
    fprintf(stderr, "done.\n\n");
    hib_closeMC(devid);
    free(pofdata);
}

// Read and print the silicon id of the configuration ROM of one HIB.
//
// argv[2] (optional) selects the device id; device 0 is used otherwise.
// The command word stored in h->asmi_cmd_rsid is written to the ASMI
// register, the busy bit is polled until it clears, and the id is taken
// from bits 15:8 of a final read of the same register.  The raw register
// value and the decoded ROM name are printed to stdout.
static void
readromid(int argc, char **argv)
{
    Hib *h;
    int devid = 0;
    const char *romname;
    volatile UINT32 asmi, id;

    if (argc > 2) {
        devid = atoi(argv[2]);
        // atoi() happily returns negative numbers; reject them before they
        // are handed to hib_openMC().
        if (devid < 0) {
            fprintf(stderr,
                    "invalid devid(= %d).\n",
                    devid);
            exit(1);
        }
        if (NHIB < devid+1) {
            fprintf(stderr,
                    "too large devid(= %d).\n",
                    devid);
            exit(1);
        }
    }
    h = hib_openMC(devid);
    fprintf(stderr, "\nreading configuration ROM id of hib[%d]...", devid);

    UINT32 var = h->asmi_cmd_rsid;
    hib_mem_writeMC(devid, h->asmi, var);
    do {
        asmi = hib_mem_readMC(devid, h->asmi);
    }
    while (asmi & (1 << h->asmi_busy_bit));

    // read once more after the busy bit has cleared and decode that value.
    asmi = hib_mem_readMC(devid, h->asmi);
    id = ((asmi >> 8) & 0xff);

    fprintf(stderr, "done.\n\n");
    printf("asmi: 0x%08x\n", asmi);
    hib_closeMC(devid);

    // map the id byte to a device name.
    switch (id) {
      case 0x10:
        romname = "EPCS1";
        break;
      case 0x12:
        romname = "EPCS4";
        break;
      case 0x14:
        romname = "EPCS16";
        break;
      case 0x16:
        romname = "EPCS64";
        break;
      default:
        romname = "unknown";
        break;
    }
    printf("id:0x%02x  rom:%s\n", id, romname);
}

#if 0
  -- PLL Reconfiguration Control Register (40h)
  -- 
  --         write             read
  -- (31:25) not used
  -- (24:16) datain
  -- (15:12) not used
  -- (11: 8) counter_type
  -- ( 7: 6) not used
  -- ( 5: 3) counter_param
  -- (2)     reconfig
  -- (1)     read_param
  -- (0)     write_param       busy

  type   val
  M      0x0
  N      0x1
  CP/LF  0x2
  VCO    0x3
  C0     0x4
  C1     0x5

  param         val    width
  high_count    0x0    8
  low_count     0x1    8
  bypass_M&N    0x2    1
  bypass_counter 0x4   1
  odd_division  0x5    1
  vco_pos_scale 0x2    1

  data            width

#endif

// Write one parameter to the pll scan cache.
//
// The command word places `data` at bit 16, `type` (counter type) at
// bit 8 and `param` (counter parameter) at bit 3, and sets the write
// strobe bit given by h->pllconf_write_bit.  After issuing the command
// the PLL configuration register is polled until its busy bit clears; a
// '.' is then printed to stderr as a progress mark.
static void
writepllparam(int devid, Hib *h, int type, int param, int data)
{
    UINT32 status;
    UINT32 command;

    command = (data << 16)
            | (type << 8)
            | (param << 3)
            | (1 << h->pllconf_write_bit);
    hib_mem_writeMC(devid, h->pllconf, command);

    // spin until the hardware reports that it has consumed the command.
    for (;;) {
        status = hib_mem_readMC(devid, h->pllconf);
        if (!(status & (1 << h->pllconf_busy_bit))) {
            break;
        }
    }

    fprintf(stderr, ".");

    // one extra register read after completion; no delay is needed here.
    status = hib_mem_readMC(devid, h->pllconf);
}

// Reconfigure the pipeline-clock PLL of one HIB.
//
// argv[2] : M, 1..511
// argv[3] : N, 1..511
// argv[4] : device id; optional when NHIB == 1, required otherwise.
//
// M is written to the high and low counts of the M counter and N to the
// high and low counts of the C0 counter (both with bypass cleared) via
// writepllparam(); the reconfig bit is then set and the busy bit polled
// until the PLL has taken the new settings.
//
// Exits with status 1 on bad arguments or out-of-range values.
static void
writepllconf(int argc, char **argv)
{
    Hib *h;
    int devid = 0, m = 1, n = 1;
    UINT32 pllconf;
    UINT32 val;

    if (argc < 4 || (NHIB > 1 && argc < 5)) {
        showusage(argc, argv);
        exit(1);
    }
    if (argc > 4) {
        devid = atoi(argv[4]);
        // atoi() happily returns negative numbers; reject them before they
        // are handed to hib_openMC().
        if (devid < 0) {
            fprintf(stderr,
                    "invalid devid(= %d).\n",
                    devid);
            exit(1);
        }
        if (NHIB < devid+1) {
            fprintf(stderr,
                    "too large devid(= %d).\n",
                    devid);
            exit(1);
        }
    }
    m = atoi(argv[2]);
    n = atoi(argv[3]);

    // the data field of the PLL register is 9 bits wide, hence 511.
    if (m < 1 || 511 < m) {
        fprintf(stderr, "M out of range. it should be in the range of 1..511.\n");
        exit(1);
    }
    if (n < 1 || 511 < n) {
        fprintf(stderr, "N out of range. it should be in the range of 1..511.\n");
        exit(1);
    }

    fprintf(stderr, "\nset hib[%d] pipeline clock frequency to (PCI-X_bus_freq * %d / %d).",
            devid, m, n);

    h = hib_openMC(devid);

#if 1
    // type, param, data
    writepllparam(devid, h, 0x0, 0x2, 0x0); // M bypass: 0
    writepllparam(devid, h, 0x0, 0x0, m); // M high_count: m
    writepllparam(devid, h, 0x0, 0x1, m); // M low_count: m

    writepllparam(devid, h, 0x4, 0x4, 0x0); // C0 bypass: 0
    writepllparam(devid, h, 0x4, 0x0, n); // C0 high_count: n
    writepllparam(devid, h, 0x4, 0x1, n); // C0 low_count: n
#else
    writepllparam(devid, h, 0x0, 0x2, 0x1); // M bypass: 1
    writepllparam(devid, h, 0x4, 0x4, 0x1); // C0 bypass: 1
    writepllparam(devid, h, 0x5, 0x4, 0x1); // C1 bypass: 1
#endif

    // reconfigure the pll.
    val = 0x1 << h->pllconf_reconfig_bit;
    hib_mem_writeMC(devid, h->pllconf, val);
    do {
        pllconf = hib_mem_readMC(devid, h->pllconf);
    }
    while (pllconf & (1 << h->pllconf_busy_bit));

    fprintf(stderr, "done.\n\n");
    hib_closeMC(devid);
}

// Reset the backend attached to one HIB, or to every HIB.
//
// With no extra argument all NHIB devices are reset in turn; when
// argv[2] is given only that device is reset.  For each device the
// dmastat register is written with the bit selected by
// h->dmastat_reset_backend_bit set.
//
// Exits with status 1 when the requested device id is out of range.
static void
resetbackend(int argc, char **argv)
{
    Hib *h;
    int j;
    int devid;
    int hib0, nhibs;

    hib0 = 0;
    nhibs = NHIB;
    if (argc > 2) {
        hib0 = atoi(argv[2]);
        // validate the id the same way the other commands do, so that an
        // out-of-range value never reaches hib_openMC().
        if (hib0 < 0) {
            fprintf(stderr,
                    "invalid devid(= %d).\n",
                    hib0);
            exit(1);
        }
        if (NHIB < hib0+1) {
            fprintf(stderr,
                    "too large devid(= %d).\n",
                    hib0);
            exit(1);
        }
        nhibs = 1;
    }
    fprintf(stderr, "hib0: %d    nhibs: %d\n", hib0, nhibs);

    fprintf(stderr, "\nreset backend\n\n");
    for (j = 0; j < nhibs; j++) {
        devid = hib0 + j;
        h = hib_openMC(devid);
        fprintf(stderr, "\n#\n# reset hib[%d] backend ... ", devid);
        // an earlier version wrote the literal 0x40000000 here instead of
        // deriving the mask from dmastat_reset_backend_bit.
        hib_mem_writeMC(devid, h->dmastat, (1<<h->dmastat_reset_backend_bit));
        fprintf(stderr, "done.\n\n");
        hib_closeMC(devid);
    }
}


// Stopwatch helper built on gettimeofday(); despite its name it measures
// wall-clock time, not CPU time.
//
// On return:
//   *laptime   = current time of day in seconds (with microsecond fraction)
//   *splittime = that current time minus the value *laptime held on entry
//
// Calling it twice with the same pair of variables therefore leaves the
// seconds elapsed between the two calls in *splittime.
static void
get_cputime(double *splittime, double *laptime)
{
    struct timeval tv;
    double now;

    gettimeofday(&tv, NULL);
    now = tv.tv_sec + tv.tv_usec/1000000.0;

    // order matters: the previous stamp must be consumed before it is
    // overwritten with the new one.
    *splittime = now - *laptime;
    *laptime = now;
}
