/*
 * <libname>.c: <prefix> user library.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <errno.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <<prefix>util.h>
#define PGUTIL_HEADER
#include "pgemu.h"
#undef PGUTIL_HEADER

/*
 * ONEHALF: 0.0 when built with ICC_RCD defined, 0.5 otherwise.
 * NOTE(review): presumably the rounding offset used by the numeric-format
 * conversion code; no use is visible in this part of the file -- confirm.
 */
#ifdef ICC_RCD
#define ONEHALF (0.0) // Intel CC with -rcd switch
#else
#define ONEHALF (0.5) // standard C
#endif

/*
 * Minimum packet length in 32-bit words. Shorter packets are zero-padded
 * up to this length before being sent.
 * This may need to be set to 32 or larger on some hosts.
 */
#define DUMMYSIZE (32)

/*
 * Zero-fill Rbuf[devid][] from index n up to DUMMYSIZE, advancing n.
 * 'Rbuf' and 'devid' must be in scope at the call site, and n must be a
 * modifiable lvalue. Wrapped in do { } while (0) so that the macro acts
 * as exactly one statement, which keeps it safe in an unbraced if/else.
 */
#define PAD_DUMMY_DATA(n) do { while ((n) < DUMMYSIZE) { Rbuf[devid][(n)++] = 0; } } while (0)

/*
 * Print a printf-style message to stderr when lv <= warn_level.
 * The caller supplies the terminating semicolon; the macro expands to a
 * single statement, so 'if (c) WARN(...); else ...' parses as intended.
 */
#define WARN(lv, fmt, args...) do { if ((lv) <= warn_level) fprintf(stderr, fmt, ## args); } while (0)
static int warn_level = 2; /* warning message output level. the higher the more verbose.
                              0: no warning (may cause wrong result with g7pkg/scripts/check.csh)
                              1: minimum
                              2: default
                              3: for debugging purpose
                           */

/*
 * maximum number of JPs, IPs and FOs that can be transferred by
 * a single DMA transaction.
 */
#define IFIFOSIZE  (512-1)    // input fifo size in 64-bit word.
#define OFIFOSIZE  (1024-1)   // output fifo size in 64-bit word.
#define NJPWORDMAX (HIB_DMABUF_BYTES/4) // max JP DMA size in 32-bit word. 32k-word= 128kB = 32 pages = 8k particles.
#define NIPMAX     (IFIFOSIZE)
#define NFOMAX     (OFIFOSIZE)

/*
 * per-device limits. Njpwordmax, Nipmax and Nfomax are assigned in
 * recalculate_iobuf_attributesMC(); Njpmax is not assigned anywhere
 * in the visible part of this file.
 */
static UINT32 Njpmax[NHIB];
static UINT32 Njpwordmax[NHIB];
static UINT32 Nipmax[NHIB];
static UINT32 Nfomax[NHIB];

Hib *H[NHIB];              /* device handles, set from hib_openMC() in <prefix>_openMC() */
static UINT32 *Rbuf[NHIB]; /* DMA read buffer */
static UINT32 *Wbuf[NHIB]; /* DMA write buffer */

/* fall back to the larger of the two board-specific sizes when the build does not define it. */
#ifndef HIB_DMABUF_BYTES
#define HIB_DMABUF_BYTES (GRAPE7X_DMABUF_BYTES > GRAPE7E_DMABUF_BYTES ? GRAPE7X_DMABUF_BYTES : GRAPE7E_DMABUF_BYTES)
#endif

static UINT32 Pbuf[NHIB][HIB_DMABUF_BYTES/4]; /* PIO write buffer (128kB).
                                                 do not define the size with HIB_PIOWBUF_BYTES.
                                                 it denotes size of the hardware buffer in HIB.
                                                 here we are defining a buffer on the main memory,
                                                 that needs the same size as that of DMA's. */
static UINT32 Nbodies[NHIB];    // per-chip JP count set by <prefix>_set_nMC. 0 means "nothing to calculate".
static UINT32 Nretrieved[NHIB]; // number of calculation results retrieved by the last <prefix>_get_foutMC
static UINT32 Ni[NHIB];         // ni given to the last <prefix>_set_ipMC.
static UINT32 Jpsize[NHIB]; // JP packet size in 64-bit word.
static UINT32 Ipsize[NHIB]; // IP packet size in 64-bit word.
static UINT32 Fosize[NHIB]; // FO packet size in 64-bit word.

// <prefix>_npipes[] below has a fixed size of 8 and is indexed by device id.
#if NHIB > 8
#error NHIB must not exceed 8
#endif
// number of pipelines per device. 0 also serves as the "not opened yet" marker in <prefix>_openMC.
static int <prefix>_npipes[8] = {
    0, 0, 0, 0, 0, 0, 0, 0, 
};

static int <prefix>_ncards = 0;  // number of cards in use. 0 means "not allocated yet" (see init_envs).
static int <prefix>_cards[NHIB]; // non-zero if the card of that device id is in use.
static int <prefix>_sendfunc = SENDFUNC_PIOW; // data transfer method to the device: PIO write or DMA read.

// values encoded in board_info register
static UINT32 <prefix>_model[NHIB];
static UINT32 <prefix>_product[NHIB];
static UINT32 <prefix>_nchip[NHIB];
static UINT32 <prefix>_jmemsize[NHIB];

// packet tags written into the first 32-bit word of each packet (assigned in <prefix>_openMC).
static UINT32 Ipaddr[NHIB];    //     ip packet
static UINT32 Jpaddr[NHIB];    //     jp packet
static UINT32 Calcaddr[NHIB];  //     calc command
static UINT32 Foregaddr[NHIB]; //     fo register
static UINT32 Ipregaddr[NHIB]; //     ip register
static UINT32 Coeffaddr[NHIB];  //     coefficient register
/*
    pg_ctl.vhd local space address map:
    ------------------------------------------------------------------
    hib_data
    31..28    63..48    47..32
    ------------------------------------------------------------------
    0000      0x0000     ndip * ni        IP packet header
    0100      jaddr      ndjp * nj        JP packet header
    1000      ndip*npipe N(16)            calc
    1100      ndip(16)   ni(16)           IP register
    1110      ndfo(16)   ni(16)           FO register
    1101      data(32)                    general-purpose registers
                                          for constant coefficients
                                          such as eta & rcut.
                                          addr:(63:56) data(55:32)
    ------------------------------------------------------------------

    coefficient register space address map:
    ------------------------------------------------------------------
    hib_data
    63..56    55..32
    ------------------------------------------------------------------
    00000000  param0
    00000001  param1
    00000010  param2
     ...       ...
    ------------------------------------------------------------------
 */

/*
 * local functions
 */
static void   init_envs(void);
static void   init_boardinfoMC(int devid);
static void   recalculate_iobuf_attributesMC(int devid);
static void   set_regMC(int devid, UINT32 addr, UINT32 val);
static void   initialize_scale_factorMC(int devid);
static UINT64 compose_float(UINT64 sign, UINT64 exp, UINT64 man, int wexp, int wman);
static void decompose_float(UINT64 src, int wexp, int wman, UINT64 *signp, UINT64 *expp, UINT64 *manp);

// COEFF conversion
<COEFFCONV_PROTOTYPE>
// JP conversion
<JPCONV_PROTOTYPE>
// IP conversion
<IPCONV_PROTOTYPE>
// FO conversion
<FOCONV_PROTOTYPE>

static void unpack_foutMC(int devid, int ni, <FOUNPACKARGS>);

<COEFFRANGE_DEFINITION>
<JPRANGE_DEFINITION>
<IPRANGE_DEFINITION>
<FORANGE_DEFINITION>

/*
 * Open all cards in use.
 * Calls init_envs() first, then <prefix>_openMC() for every device id
 * whose entry in <prefix>_cards[] is non-zero.
 */
void
<prefix>_open(void)
{
    int devid = 0;

    init_envs();
    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_openMC(devid);
        }
        devid++;
    }
}

/*
 * Open the card of device id 'devid' and prepare it for use.
 *
 * On the first open of a device (<prefix>_npipes[devid] == 0) the board
 * information, packet tags and I/O buffer attributes are initialized.
 * On every open the send function and buffers are selected, DMA is
 * stopped, the backend is reset and the IP/FO registers are written.
 */
void
<prefix>_openMC(int devid)
{
    const char *funcname;

    init_envs();
    H[devid] = hib_openMC(devid);

    if (<prefix>_npipes[devid] == 0) { // open for the first time.
        init_boardinfoMC(devid);

        // packet tags (see the local space address map).
        Ipaddr[devid]    = 0x00000000; // ip packet
        Jpaddr[devid]    = 0x40000000; // jp packet
        Calcaddr[devid]  = 0x80000000; // calc command
        Ipregaddr[devid] = 0xc0000000; // ip register
        Coeffaddr[devid] = 0xd0000000; // coefficient register
        Foregaddr[devid] = 0xe0000000; // fo register

        recalculate_iobuf_attributesMC(devid);
        initialize_scale_factorMC(devid);
    }

    if (<prefix>_sendfunc == SENDFUNC_DMAR) {
        funcname = "DMA read";
    }
    else {
        funcname = "PIO write";
    }
    WARN(3, "<prefix>_sendfunc: %s\n", funcname);

    /* using the DMA read buffer as PIO write buffer would degrade
     * performance, since that buffer is marked up as 'non-cached'.
     * PIO write therefore uses Pbuf on the main memory instead.
     */
    if (<prefix>_sendfunc != SENDFUNC_PIOW) {
        hib_set_sendfuncMC(devid, SENDFUNC_DMAR);
        Rbuf[devid] = (UINT32 *)(H[devid]->dmar_buf);
    }
    else {
        hib_set_sendfuncMC(devid, SENDFUNC_PIOW);
        Rbuf[devid] = Pbuf[devid];
    }
    Wbuf[devid] = (UINT32 *)(H[devid]->dmaw_buf);

    // stop DMA, then reset backend.
    hib_mem_writeMC(devid, H[devid]->dmastat, (1<<H[devid]->dmastat_dma_reset_bit));
    hib_mem_writeMC(devid, H[devid]->dmastat, (1<<H[devid]->dmastat_reset_backend_bit));

    // packet size goes to bits 31..16, number of pipelines to the lower bits.
    set_regMC(devid, Ipregaddr[devid], (Ipsize[devid] << 16) | <prefix>_npipes[devid]);
    set_regMC(devid, Foregaddr[devid], (Fosize[devid] << 16) | <prefix>_npipes[devid]);

    WARN(2, "<prefix>[%d] opened.\n", devid);
}

/*
 * Close all cards in use, i.e. call <prefix>_closeMC() for every device
 * id whose entry in <prefix>_cards[] is non-zero.
 */
void
<prefix>_close(void)
{
    int devid = 0;

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_closeMC(devid);
        }
        devid++;
    }
}

/*
 * Close the card of device id 'devid' by calling hib_closeMC().
 */
void
<prefix>_closeMC(int devid)
{
    hib_closeMC(devid);
}

<FOUND_COEFFSET>

/*
 * Set the coefficients on all cards in use: forwards the arguments to
 * <prefix>_set_coeffMC() for every device id enabled in <prefix>_cards[].
 */
void
<prefix>_set_coeff(<COEFFARGS>)
{
    int ic;

    for (ic = 0; ic < NHIB; ic++) {
        if (<prefix>_cards[ic] != 0) {
            <prefix>_set_coeffMC(ic<COEFFCALL>);
        }
    }
}

/*
 * Set the coefficients on the card of device id 'devid'.
 * The body is supplied by the generator: <COEFFCONV> converts the numerical
 * format and <COEFFPACK> packs (and presumably sends) the values.
 * The locals i, k, nword and nword0 are declared for the generated code.
 */
void
<prefix>_set_coeffMC(int devid, <COEFFARGS>)
{
    int i, k, nword, nword0;
    UINT64 <COEFFVARS>;

<COEFFCONV>

    /*
     * pack COEFFs.
     */
<COEFFPACK>
}

</FOUND_COEFFSET>

<FOUND_JPSET>
/*
 * Distribute nj j-particles over all cards in use.
 *
 * Each card receives ceil(nj / <prefix>_ncards) particles, except that the
 * count is clamped so that the total never exceeds nj; trailing cards may
 * therefore receive fewer particles, or none at all.
 * j0 is the index of the first particle for the current card; it is
 * referenced by the generated argument list <JPCALL>.
 *
 * Does nothing when no card is in use. This also avoids an integer
 * division by zero when <prefix>_ncards is 0.
 */
void
<prefix>_set_jp(int adr, int nj, <JPARGS>)
{
    int ic;
    int j0, njj;

    if (<prefix>_ncards <= 0) return; // no card to send to.

    j0 = 0;
    njj = (nj + <prefix>_ncards - 1) / <prefix>_ncards;

    for (ic = 0; ic < NHIB; ic++) {
	if (<prefix>_cards[ic] == 0) continue;
	if (nj < j0 + njj) { // fewer than njj particles remain.
	    njj = nj - j0;
	}
	<prefix>_set_jpMC(ic, adr, njj<JPCALL>);
	j0 += njj;
    }
}

/*
 * Send nj j-particles to the card of device id 'devid', starting at
 * j-memory address 'adr'.
 *
 * The particles are divided over the chips of the card, nnj = ceil(nj / nchip)
 * per chip. Slots beyond nj are filled by the generated <JPCLEAR> code.
 * Data are accumulated in Rbuf[devid] and flushed with hib_sendMC() whenever
 * the buffer is about to overflow, and once more at the end.
 * For models 2, 3 and 6 the chip id is written to the IP register before
 * each chip's JP packet, and chip id 0 (broadcast) is written back at the end.
 *
 * nnj is computed in signed arithmetic: <prefix>_nchip[] is unsigned, so
 * (nj-1)/nchip with nj == 0 would otherwise wrap around to a huge value on
 * multi-chip boards. With nj <= 0, nnj is 0 and only packet headers are sent.
 */
void
<prefix>_set_jpMC(int devid, int adr, int nj, <JPARGS>)
{
    int nword, nword0;
    int nempty_cycle = 4;
    int nword_almost_overflow = Njpwordmax[devid] - (2 * 2 + nempty_cycle * 2 + Jpsize[devid] * 2);
                                                 // (packet header + empty cycle) * 2 + JP size in 32-bitword.
    int jsent, jbuffered, jindex;
    int ic, cid, nnj, i, k;
    UINT64 <JPVARS>;

    // number of JPs per chip. never negative, 0 when there is nothing to send.
    nnj = (nj > 0) ? (nj - 1) / (int)<prefix>_nchip[devid] + 1 : 0;

    cid = 0;       // stays 0 for models that do not use a chip id (model100 & 800).
    jsent = 0;     // number of JPs sent.
    jbuffered = 0; // number of JPs packed to the DMA buffer.
    nword = 0;     // number of data words packed to the DMA buffer.

    for (ic = 0; ic < <prefix>_nchip[devid]; ic++) {

        jindex = 0; // index uniquely assigned to each JP in a chip.

        // IP reg packet:
        switch (<prefix>_model[devid]) {
          case 2: // chipid of model600 & 300d starts from 1.
          case 6:
            cid = ic + 1;
            break;
          case 3: // that of model300 starts from 4.
            cid = ic + 4;
            break;
          default:
            // chip id is not used for model100 & 800.
            break;
        }
        switch (<prefix>_model[devid]) {
          case 2:
          case 3:
          case 6:
            for (i = 0; i < nempty_cycle; i++) {
                Rbuf[devid][nword++] = 0;
            }
            Rbuf[devid][nword++] = Ipregaddr[devid];
            Rbuf[devid][nword++] = cid<<28 | Ipsize[devid]<<16 | <prefix>_npipes[devid]; // write chip id to IP reg.
            for (i = 0; i < nempty_cycle; i++) {
                Rbuf[devid][nword++] = 0;
            }
            break;
          default:
            // nothing to do for model100 & 800.
            break;
        }

        // JP packet tag:
        Rbuf[devid][nword++] = Jpaddr[devid];
        Rbuf[devid][nword++] = ((long long int)(adr) << 16) | Jpsize[devid] * nnj;

        // JP packet body:
        while (jindex < nnj) {

            // convert numerical format.
            if (jsent < nj) {
<JPCONV>
            }
            else { // clear garbage in the memory of the last pFPGA.
<JPCLEAR>
            }

            /*
             * pack a JP.
             */
            int nword0 = nword;
<JPPACK>
#if 0
            Rbuf[devid][nword++] = (0xfff & (jindex+1))<<17 | (0x1ffff & mj); // offset jindex by 1.
#endif


            jsent++;                // reset when this function begins.
            jbuffered++;            // reset when Rbuf is flushed.
            jindex++;               // reset when ic is incremented.

            // DMA buffer is full. flush to the HIB.
            if (nword >= nword_almost_overflow) {
                PAD_DUMMY_DATA(nword);
                hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
                jbuffered = 0;
                nword = 0;
            }

        } // nnj loop

    } // ic loop

    // flush data remaining in the buffer.
    if (nword > 0) {
        PAD_DUMMY_DATA(nword);
        hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

#if 0
	{
	    int i;
	    fprintf(stderr, "JP\n");
	    for (i = 0; i < nword; i++) {
		fprintf(stderr, "Rbuf[%d][%2d]:0x%08x\n",
			devid, i, Rbuf[devid][i]);
	    }
	    fprintf(stderr, "\n");
	}
#endif

        jbuffered = 0;
        nword = 0;
    }

    // write chip id 0 (means broadcast) to IP reg.
    switch (<prefix>_model[devid]) {
      case 2:
      case 3:
      case 6:
	set_regMC(devid, Ipregaddr[devid], 0<<28 | Ipsize[devid]<<16 | <prefix>_npipes[devid]);
        break;
      default:
        // nothing to do for model100 & 800.
        break;
    }

}
</FOUND_JPSET>

<FOUND_IPSET>
/*
 * Send the same ni i-particles to all cards in use: forwards the arguments
 * to <prefix>_set_ipMC() for every device id enabled in <prefix>_cards[].
 */
void
<prefix>_set_ip(int ni, <IPARGS>)
{
    int ic;

    for (ic = 0; ic < NHIB; ic++) {
        if (<prefix>_cards[ic] != 0) {
            <prefix>_set_ipMC(ic, ni<IPCALL>);
        }
    }
}

/*
 * Send ni i-particles to the card of device id 'devid'.
 *
 * Returns immediately, without recording ni, when Nbodies[devid] is 0.
 * Terminates the program with exit(1) when ni exceeds Nipmax[devid].
 * Otherwise ni is stored in Ni[devid] for <prefix>_runMC(), and one IP
 * packet (tag, total size, then the particles packed by the generated
 * <IPCONV>/<IPPACK> code) is built in Rbuf[devid] and sent.
 */
void
<prefix>_set_ipMC(int devid, int ni, <IPARGS>)
{
    int i, k, nword, nword0;
    UINT64 <IPVARS>;

    if (Nbodies[devid] == 0) return; // nothing to calculate on this card.

    if (ni > Nipmax[devid]) {
	fprintf(stderr, "<prefix>_set_ip: too large ni (%d). "
                "should not be larger than %d. abort.\n", ni, Nipmax[devid]);
	exit(1);
    }
    Ni[devid] = ni;

    // IP packet header: tag, then the packet size (Ipsize * ni).
    nword = 0;
    Rbuf[devid][nword++] = Ipaddr[devid];
    Rbuf[devid][nword++] = Ipsize[devid] * ni;

    for (i = 0; i < ni; i++) {
<IPCONV>
        /*
         * pack an IP.
         */
	int nword0 = nword; // shadows the outer nword0: start of this IP in Rbuf.
<IPPACK>
    }
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]); // length is given in 64-bit words.

#if 0
    {
	int i;
	fprintf(stderr, "IP\n");
	for (i = 0; i < nword; i++) {
	    fprintf(stderr, "Rbuf[%d][%2d]:0x%08x\n",
		    devid, i, Rbuf[devid][i]);
	}
	fprintf(stderr, "\n");
    }
#endif
}
</FOUND_IPSET>

/*
 * Start calculation on all cards in use, i.e. call <prefix>_runMC() for
 * every device id enabled in <prefix>_cards[].
 */
void
<prefix>_run(void)
{
    int devid = 0;

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_runMC(devid);
        }
        devid++;
    }
}

/*
 * Start calculation on the card of device id 'devid'.
 *
 * Does nothing when Nbodies[devid] is 0. Otherwise a calc command packet
 * is sent, and a DMA write is kicked off to receive the results for the
 * number of i-particles recorded by the last <prefix>_set_ipMC().
 */
void
<prefix>_runMC(int devid)
{
    int ni = Ni[devid];
    int npipes, nrounded, nword;

    if (Nbodies[devid] == 0) return;

    npipes = <prefix>_npipes[devid];

    /* set N and run */
    Rbuf[devid][0] = Calcaddr[devid];
    Rbuf[devid][1] = ((Ipsize[devid] * npipes) << 16) | Nbodies[devid];
    nword = 2;
    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);

    /* kick off DMA write.
     * ni must be rounded up to a multiple of the number of pipelines.
     */
    nrounded = ((ni-1)/npipes + 1) * npipes;
    nword = sizeof(long long)/sizeof(int) * Fosize[devid] * nrounded; // size in 32-bit words.
    hib_start_dmawMC(devid, (nword+1)/2, (UINT64*)Wbuf[devid]);
}

/*
 * Set the number of j-particles, nj, distributed over all cards in use.
 *
 * The split is the same as in <prefix>_set_jp(): each card gets
 * ceil(nj / <prefix>_ncards) particles, clamped so that the total never
 * exceeds nj; trailing cards may get fewer particles, or none.
 *
 * Does nothing when no card is in use. This also avoids an integer
 * division by zero when <prefix>_ncards is 0.
 */
void
<prefix>_set_n(int nj)
{
    int ic;
    int j0, njj;

    if (<prefix>_ncards <= 0) return; // no card to configure.

    j0 = 0;
    njj = (nj + <prefix>_ncards - 1) / <prefix>_ncards;

    for (ic = 0; ic < NHIB; ic++) {
	if (<prefix>_cards[ic] == 0) continue;
	if (nj < j0 + njj) { // fewer than njj particles remain.
	    njj = nj - j0;
	}
	<prefix>_set_nMC(ic, njj);
	j0 += njj;
    }
}

/*
 * Set the number of j-particles, n, for the card of device id 'devid'.
 *
 * Nbodies[devid] holds the per-chip count, ceil(n / nchip).
 * n <= 0 is stored as 0, which the other functions take as "nothing to
 * calculate on this card". <prefix>_nchip[] is unsigned, so evaluating
 * (n-1)/nchip with n == 0 would wrap around to a huge value on multi-chip
 * boards; that case is therefore handled separately.
 */
void
<prefix>_set_nMC(int devid, int n)
{
    if (n > 0) {
        Nbodies[devid] = (n-1)/<prefix>_nchip[devid] + 1;
    }
    else {
        Nbodies[devid] = 0;
    }
}

<FOUND_FOSET>
/*
 * Retrieve the calculation results for ni i-particles from all cards in
 * use and accumulate them into the output arguments.
 *
 * Only the #else branch is compiled: the outputs are cleared by <FOCLEAR>,
 * then for each card the raw results are unpacked with unpack_foutMC() and
 * converted/accumulated by <FOACCUM_CONV>. The disabled #if 0 branch sums
 * raw values first and converts once using the first card in use.
 */
void
<prefix>_get_fout(int ni, <FOARGS>)
{
#if 0 // a run on multiple card returns the same result with
      // that on a single card. works only for fout of type int.

    /*
     * 1) retrieve 'fout's from all cards.
     * 2) sums them up.
     * 3) convert their numerical format. for the conversion,
     *    the scale factor of the first card is applied to all 'fout's.
     */
<FOVARS>
    int ic, i, k, devid;

    for (i = 0; i < ni; i++) {
<FOCLEAR>
    }
    for (ic = 0; ic < NHIB; ic++) {
	if (<prefix>_cards[ic] == 0) continue;
	unpack_foutMC(ic, ni<FOCALL_RAW>); // copy from DMAW buf to each array.
        for (i = 0; i < ni; i++) {
<FOACCUM>
        }
    }
    for (ic = 0; ic < NHIB; ic++) {
	if (<prefix>_cards[ic] != 0) break;
    }
    devid = ic; // device id of the first card in use.

    // convert numerical format.
    for (i = 0; i < ni; i++) {
<FOCONV>
    }

#else // result of a run on multiple cards and 
      // that on a single card is not exactly the same.
      // works fout of type float as well as int.

    int ic, i, k;
<FOVARS>
    // clear the outputs before accumulation.
    for (i = 0; i < ni; i++) {
<FOCLEAR>
    }

    for (ic = 0; ic < NHIB; ic++) {
	if (<prefix>_cards[ic] == 0) continue;
	unpack_foutMC(ic, ni<FOCALL_RAW>); // copy from DMAW buf to each array.
        for (i = 0; i < ni; i++) {
<FOACCUM_CONV>
        }
    }

#endif

}

/*
 * Retrieve the calculation results for ni i-particles from the card of
 * device id 'devid'.
 *
 * Terminates the program with exit(1) when ni exceeds Nfomax[devid].
 * The raw results are unpacked with unpack_foutMC() and then converted by
 * the generated <FOCONV_MC> code.
 */
void
<prefix>_get_foutMC(int devid, int ni, <FOARGS>)
{
    int i, k;
<FOVARS_MC>

    if (ni > Nfomax[devid]) {
	fprintf(stderr, "<prefix>_get_foutMC: too large ni (%d). abort.\n", ni);
	exit(1);
    }

    unpack_foutMC(devid, ni<FOCALL_RAW>); // copy from DMAW buf to each array.

    // convert numerical format.
    for (i = 0; i < ni; i++) {
<FOCONV_MC>
    }
}

/*
 * Copy the raw results for ni i-particles of device 'devid' into the
 * output arrays.
 *
 * When Nbodies[devid] is 0, no calculation has been done; the outputs are
 * cleared by <FOUNPACKCLEAR> and Nretrieved[devid] is set to 0.
 * Otherwise the function waits for completion of the DMA write, lets
 * <FOUNPACK> unpack each FO, and sets Nretrieved[devid] to ni.
 */
static void
unpack_foutMC(int devid, int ni, <FOUNPACKARGS>)
{
    int i, k, nword;

    if (Nbodies[devid] == 0) { // no calculation done, and no valid data in Wbuf.
        for (i = 0; i < ni; i++) {
<FOUNPACKCLEAR>
        }
        Nretrieved[devid] = 0;
        return;
    }

    /* wait DMA write completion */
    hib_finish_dmawMC(devid);

    nword = 0;
    for (i = 0; i < ni; i++) {
        /*
         * unpack an FO.
         */
	int nword0 = nword; // start of this FO; presumably used by the generated code below.
<FOUNPACK>
    }

#if 0
     for (i = 0; i < nword; i++) {
	 fprintf(stderr, "Wbuf[%d][%d]:0x%08x\n", devid, i, Wbuf[devid][i]);
     }
#endif

    Nretrieved[devid] = ni;
}

</FOUND_FOSET>

/*
 * Select the cards to be used.
 *
 * c must point to NHIB ints; a non-zero c[i] enables the card of device
 * id i. <prefix>_ncards is set to the number of enabled cards, so that
 * <prefix>_set_jp() and <prefix>_set_n() divide the particles only among
 * the cards that actually receive them.
 */
void
<prefix>_set_cards(int *c)
{
    int i, nc = 0;

    for (i = 0; i < NHIB; i++) {
	<prefix>_cards[i] = c[i];
	if (c[i] != 0) { // count enabled cards only.
	    nc++;
	}
    }
    <prefix>_ncards = nc;
}

/*
 * Copy the card selection into c (NHIB ints are written), and report the
 * device ids in use as a space-separated list at warning level 2.
 */
void
<prefix>_get_cards(int *c)
{
    int devid;
    int nprinted = 0;

    for (devid = 0; devid < NHIB; devid++) {
        c[devid] = <prefix>_cards[devid];
    }

    WARN(2, "use <prefix>_cards[");
    for (devid = 0; devid < NHIB; devid++) {
        if (<prefix>_cards[devid] != 0) {
            if (nprinted > 0) { // separator goes between ids only.
                WARN(2, " ");
            }
            WARN(2, "%d", devid);
            nprinted++;
        }
    }
    WARN(2, "]\n");
}

/*
 * Returns NHIB, the compile-time number of card slots. Note that this is
 * not the number of cards currently in use (<prefix>_ncards).
 */
int
<prefix>_get_number_of_cards(void)
{
    return NHIB;
}

/*
 * Returns the smallest value of <prefix>_get_number_of_pipelinesMC() among
 * the cards in use, or 65536 when no card is in use.
 */
int
<prefix>_get_number_of_pipelines(void)
{
    int devid;
    int smallest = 65536; // any large number will do.

    for (devid = 0; devid < NHIB; devid++) {
        int np;

        if (<prefix>_cards[devid] == 0) continue;
        np = <prefix>_get_number_of_pipelinesMC(devid);
        smallest = (np < smallest) ? np : smallest;
    }

    return smallest;
}

/*
 * Returns the sum of <prefix>_get_jmemsizeMC() over all cards in use.
 */
int
<prefix>_get_jmemsize(void)
{
    int devid = 0;
    int total = 0;

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            total += <prefix>_get_jmemsizeMC(devid);
        }
        devid++;
    }
    return total;
}

/*
 * Returns Nipmax[devid], i.e. the largest ni accepted by <prefix>_set_ipMC()
 * for this device (not the hardware pipeline count <prefix>_npipes[devid]).
 */
int
<prefix>_get_number_of_pipelinesMC(int devid)
{
    return Nipmax[devid];
}

/*
 * Returns <prefix>_jmemsize[devid], which is set in init_boardinfoMC().
 */
int
<prefix>_get_jmemsizeMC(int devid)
{
    return <prefix>_jmemsize[devid];
}

<FOUND_IPSET>
<FOUND_FOSET>
/*
 * Calculate the results for ni i-particles, using all cards in use.
 *
 * The i-particles are processed in chunks of at most
 * <prefix>_get_number_of_pipelines() particles: each chunk is sent with
 * <prefix>_set_ip(), calculated with <prefix>_run() and retrieved with
 * <prefix>_get_fout(). 'off' is the index of the first particle of the
 * chunk; it is presumably referenced by the generated argument lists
 * <IPCALL_OFF> and <FOCALL_OFF>.
 */
void
<prefix>_calculate_fout_on_ip(<IPARGS>, <FOARGS>, int ni)
{
    int off, nii, np;

    np = <prefix>_get_number_of_pipelines();

    for (off = 0; off < ni; off += np) {
	nii = np;
	if (off+nii > ni) { // the last chunk may be shorter.
	    nii = ni - off;
	}

	<prefix>_set_ip(nii<IPCALL_OFF>);
	<prefix>_run();
	<prefix>_get_fout(nii<FOCALL_OFF>);
    }
}
</FOUND_FOSET>
</FOUND_IPSET>

/*
 *
 * local functions
 *
 */

/*
 * initialize variables used for "standard" functions (i.e. non-MC functions).
 * this initialization is not necessary for "primitive" functions (MC functions).
 *
 * On the first call only:
 *   <PREFIX>_WARNLEVEL  a non-negative integer sets warn_level. An empty or
 *                       blank value is ignored.
 *   <PREFIX>_SENDFUNC   "DMAR" selects DMA read instead of PIO write.
 *
 * On every call while no card is allocated (<prefix>_ncards == 0):
 *   <PREFIX>_CARDS      space-separated list of device ids to use. Each id
 *                       is counted once even if listed repeatedly. An id out
 *                       of range terminates the program with exit(2).
 *                       All cards are used if the variable is not set.
 *
 * The strings returned by getenv() are only read, never modified.
 */
static void
init_envs(void)
{
    int ic;
    int dummy[NHIB];
    const char *p;
    static int firstcall = 1;

    if (firstcall) {
        firstcall = 0;
	p = getenv("<PREFIX>_WARNLEVEL");
        if (p) {
            p += strspn(p, " ");  // skip leading blanks.
            if (*p != '\0') {     // there is something to parse.
                int tmp = atoi(p);
                if (0 <= tmp) {
                    warn_level = tmp;
                }
            }
            WARN(3, "warn_level: %d\n", warn_level);
        }
        hib_set_warn_level(warn_level);

	p = getenv("<PREFIX>_SENDFUNC");
        if (p) {
	    if (0 == strcmp("DMAR", p)) {
		<prefix>_sendfunc = SENDFUNC_DMAR;
	    }
        }
    }

    if (<prefix>_ncards == 0) {
        /* cards are not allocated yet.
           try to allocate cards specified by environment variable "<PREFIX>_CARDS".
           try to allocate all cards, if <PREFIX>_CARDS is not set. */

	p = getenv("<PREFIX>_CARDS");
	if (p) { // parse <PREFIX>_CARDS
	    for (ic = 0; ic < NHIB; ic++) {
		<prefix>_cards[ic] = 0;
	    }
            for (;;) {
                p += strspn(p, " ");   // move to the head of the next token.
                if (*p == '\0') break;
                ic = atoi(p);          // parsing stops at the end of the token.
                if (ic < 0 || ic >= NHIB) {
                    fprintf(stderr, "<PREFIX>_CARDS have device_id out of range: %d\n", ic);
                    exit(2);
                }
                if (<prefix>_cards[ic] == 0) { // do not count the same card twice.
                    <prefix>_cards[ic] = 1;
                    <prefix>_ncards++;
                }
                p += strcspn(p, " ");  // skip the rest of this token.
            }
	}
	else { // <PREFIX>_CARDS is not set
	    <prefix>_ncards = NHIB;
	    for (ic = 0; ic < NHIB; ic++) {
		<prefix>_cards[ic] = 1;
	    }
	}
        <prefix>_get_cards(dummy); // called for its side effect: reports the cards in use.
    }

}

/*
 * Read the board_info register of device 'devid' and set up the per-device
 * board parameters.
 *
 * Bits 27..24 of the register give <prefix>_model[devid], bits 31..28
 * give <prefix>_product[devid]. <prefix>_nchip[devid] is derived from
 * product and model. An unknown product or model terminates the program
 * with exit(2). The board description is reported at warning level 1.
 *
 * NOTE(review): <prefix>_npipes[devid] and <prefix>_jmemsize[devid] are set
 * only for GRAPE-7 (product 1 and 2); the GRAPE-DR branch (product 3)
 * leaves them untouched -- confirm that this is intended.
 */
static void
init_boardinfoMC(int devid)
{
    int tmp; // not used in the visible code.
    UINT32 binfo;

    binfo = hib_mem_readMC(devid, H[devid]->boardinfo);
    <prefix>_model[devid] = (binfo >> 24) & 0xf;
    <prefix>_product[devid] = (binfo >> 28) & 0xf;
    switch (<prefix>_product[devid]) {
      case 1:
        WARN(1,"GRAPE-7(PCI-X) ");
	break;
      case 2:
        WARN(1,"GRAPE-7(PCIe) ");
	break;
      case 3:
        WARN(1,"GRAPE-DR ");
	break;
      default:
        fprintf(stderr,"init_boardinfoMC: <prefix>_product[%d]=%d  unknown product.\n",
		devid, <prefix>_product[devid]);
	exit(2);
    }

    switch (<prefix>_product[devid]) {
      case 1: // GRAPE-7
      case 2:
        switch (<prefix>_model[devid]) {
          case 1:
            <prefix>_nchip[devid] = 1;
            WARN(1, "model100 ");
            break;
          case 3:
            <prefix>_nchip[devid] = 3;
            WARN(1, "model300 [4-6] ");
            break;
          case 2:
            <prefix>_nchip[devid] = 3;
            WARN(1, "model300 D[1-3] ");
            break;
          case 6:
            <prefix>_nchip[devid] = 6;
            WARN(1, "model600 ");
            break;
          case 8:
            <prefix>_nchip[devid] = 1;
            WARN(1, "model800 ");
            break;
          default:
            fprintf(stderr,"init_boardinfoMC: <prefix>_model[%d]=%d  unknown model.\n",
                    devid, <prefix>_model[devid]);
            exit(2);
        }
        WARN(1, " <prefix>_nchip:%d ", <prefix>_nchip[devid]);

        // <prefix>_jmemsize & <prefix>_npipes are not read out from board_info registers.
        // they are embedded into the source code by pgdl2lib.
        //
        <prefix>_jmemsize[devid]   = <JWORDS> * <prefix>_nchip[devid];
        <prefix>_npipes[devid] = <NPIPE>;
        WARN(1, "<prefix>_npipes:%d <prefix>_jmemsize:%d\n",
             <prefix>_npipes[devid], <prefix>_jmemsize[devid]);
        break;

      case 3: // GRAPE-DR
        switch (<prefix>_model[devid]) {
          case 1:
            <prefix>_nchip[devid] = 1;
            WARN(1, "TB1 ");
            break;
          case 2:
            <prefix>_nchip[devid] = 1;
            WARN(1, "TB2 ");
            break;
          case 3:
            <prefix>_nchip[devid] = 1;
            WARN(1, "TB3 ");
            break;
          case 4:
            <prefix>_nchip[devid] = 4;
            WARN(1, "TB4 ");
            break;
          default:
            fprintf(stderr,"init_boardinfoMC: <prefix>_model[%d]=%d  unknown model.\n",
                    devid, <prefix>_model[devid]);
            exit(2);
        }
        break;
    }
}

static void
recalculate_iobuf_attributesMC(int devid)
{
    Jpsize[devid] = <JPSIZE>;
    Ipsize[devid] = <IPSIZE>;
    Fosize[devid] = <FOSIZE>;

    Nfomax[devid] = OFIFOSIZE / Fosize[devid];
    Njpwordmax[devid] = NJPWORDMAX - 100;
    Nipmax[devid] = IFIFOSIZE / Ipsize[devid];
    Nfomax[devid] = OFIFOSIZE / Fosize[devid];
    Nipmax[devid] = Nipmax[devid] / <prefix>_npipes[devid] * <prefix>_npipes[devid];
    Nfomax[devid] = Nfomax[devid] / <prefix>_npipes[devid] * <prefix>_npipes[devid];

    if (Nipmax[devid] > Nfomax[devid]) {
	Nipmax[devid] = Nfomax[devid];
    }
    else {
	Nfomax[devid] = Nipmax[devid];
    }
    WARN(3, "Fosize:%d Nfomax:%d\n", Fosize[devid], Nfomax[devid]);
}

/*
 * Send a two-word packet (addr, val) to device 'devid'.
 * The packet is built in Rbuf[devid], zero-padded up to DUMMYSIZE words,
 * and sent with hib_sendMC(), whose length is given in 64-bit words.
 */
static void
set_regMC(int devid, UINT32 addr, UINT32 val)
{
    int nword;

    Rbuf[devid][0] = addr;
    Rbuf[devid][1] = val;
    nword = 2;

    PAD_DUMMY_DATA(nword);
    hib_sendMC(devid, (nword+1)/2, (UINT64*)Rbuf[devid]);
}

/*
 * Placeholder for the initialization of per-device scale factors.
 * Currently does nothing; the disabled line only sketches the intended form.
 */
static void
initialize_scale_factorMC(int devid)
{
#if 0 // !!! should look like;
    Ximin[devid] = 0.0;
#endif
}

/*
 * Pack sign, exponent and mantissa into one word:
 *   bit  (wexp + wman)         sign (lowest bit of 'sign')
 *   bits (wexp + wman - 1)..wman  exponent (lowest wexp bits of 'exp')
 *   bits (wman - 1)..0         mantissa (lowest wman bits of 'man')
 * wexp and wman are the field widths in bits.
 */
static UINT64
compose_float(UINT64 sign, UINT64 exp, UINT64 man, int wexp, int wman)
{
    const UINT64 one = (UINT64)1;
    const UINT64 expmask = (one << wexp) - 1;
    const UINT64 manmask = (one << wman) - 1;

    return ((sign & one) << (wexp + wman))
         | ((exp & expmask) << wman)
         | (man & manmask);
}

/*
 * Split a word packed by compose_float() into its fields:
 *   *signp  bit (wexp + wman) of src
 *   *expp   the wexp bits above the mantissa
 *   *manp   the lowest wman bits of src
 * wexp and wman are the field widths in bits.
 */
static void
decompose_float(UINT64 src, int wexp, int wman, UINT64 *signp, UINT64 *expp, UINT64 *manp)
{
    const UINT64 one = (UINT64)1;
    const UINT64 expmask = (one << wexp) - 1;
    const UINT64 manmask = (one << wman) - 1;

    *signp = (src >> (wexp + wman)) & one;
    *expp  = (src >> wman) & expmask;
    *manp  = src & manmask;
}

<COEFFCONV_DEFINITION>
<JPCONV_DEFINITION>
<IPCONV_DEFINITION>
<FOCONV_DEFINITION>

<OLD_API_EXISTS>
/*
 * API for backward compatibility.
 */
</OLD_API_EXISTS>

<OLD_API_FOUND_ETA>
/* backward-compatible wrapper: passes eta to <prefix>_set_coeff(). */
void
<prefix>_set_eta(double eta)
{
    <prefix>_set_coeff(eta);
}

/* backward-compatible wrapper: passes eta to <prefix>_set_coeffMC() for device 'devid'. */
void
<prefix>_set_etaMC(int devid, double eta)
{
    <prefix>_set_coeffMC(devid, eta);
}
</OLD_API_FOUND_ETA>

<OLD_API_FOUND_XMJ>
/*
 * backward-compatible wrapper around <prefix>_set_jp().
 * j0 is not used directly here; it is presumably referenced by the
 * generated argument list <JPCALL>.
 */
void
<prefix>_set_xmj(int adr, int nj, double (*xj)[3], double *mj)
{
    int j0 = 0;
    <prefix>_set_jp(adr, nj<JPCALL>);
}

/*
 * backward-compatible wrapper around <prefix>_set_jpMC().
 * j0 is not used directly here; it is presumably referenced by the
 * generated argument list <JPCALL>.
 */
void
<prefix>_set_xmjMC(int devid, int adr, int nj, double (*xj)[3], double *mj)
{
    int j0 = 0;
    <prefix>_set_jpMC(devid, adr, nj<JPCALL>);
}
</OLD_API_FOUND_XMJ>
<OLD_API_FOUND_EPS2>

/*
 * squared softening length of each i-particle, per device.
 * filled by the <prefix>_set_eps* functions, and sent together with the
 * positions by <prefix>_set_xi / <prefix>_set_xiMC.
 */
static double Eps2[NHIB][NIPMAX];

/*
 * Send the positions of ni i-particles, together with the stored Eps2
 * values of each card, to all cards in use.
 * ni must not exceed NIPMAX (checked with assert).
 */
void
<prefix>_set_xi(int ni, double (*xi)[3])
{
    int devid = 0;

    assert(ni <= NIPMAX);

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_set_ipMC(devid, ni, xi, Eps2[devid]);
        }
        devid++;
    }
}

/*
 * Send the positions of ni i-particles, together with the stored
 * Eps2[devid] values, to the card of device id 'devid'.
 */
void
<prefix>_set_xiMC(int devid, int ni, double (*xi)[3])
{
    <prefix>_set_ipMC(devid, ni, xi, Eps2[devid]);
}

/*
 * Store the softening lengths eps[0..ni-1] for all cards in use.
 * ni must not exceed NIPMAX (checked with assert).
 */
void
<prefix>_set_eps(int ni, double *eps)
{
    int devid = 0;

    assert(ni <= NIPMAX);

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_set_epsMC(devid, ni, eps);
        }
        devid++;
    }
}

/*
 * Store the squares of the softening lengths eps[0..ni-1] into Eps2[devid].
 * ni must not exceed NIPMAX (checked with assert).
 */
void
<prefix>_set_epsMC(int devid, int ni, double *eps)
{
    double *dst = Eps2[devid];
    int idx = 0;

    assert(ni <= NIPMAX);

    while (idx < ni) {
        dst[idx] = eps[idx] * eps[idx];
        idx++;
    }
}

/*
 * Store the squared softening lengths eps2[0..ni-1] for all cards in use.
 * ni must not exceed NIPMAX (checked with assert).
 */
void
<prefix>_set_eps2(int ni, double *eps2)
{
    int devid = 0;

    assert(ni <= NIPMAX);

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_set_eps2MC(devid, ni, eps2);
        }
        devid++;
    }
}

/*
 * Copy the squared softening lengths eps2[0..ni-1] into Eps2[devid].
 * ni must not exceed NIPMAX (checked with assert).
 */
void
<prefix>_set_eps2MC(int devid, int ni, double *eps2)
{
    double *dst = Eps2[devid];
    int idx = 0;

    assert(ni <= NIPMAX);

    while (idx < ni) {
        dst[idx] = eps2[idx];
        idx++;
    }
}

/*
 * Use the same softening length eps for all i-particles on all cards in use.
 */
void
<prefix>_set_eps_to_all(double eps)
{
    int devid = 0;

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_set_eps_to_allMC(devid, eps);
        }
        devid++;
    }
}

/*
 * Set all NIPMAX entries of Eps2[devid] to eps squared.
 */
void
<prefix>_set_eps_to_allMC(int devid, double eps)
{
    double *dst = Eps2[devid];
    int idx = 0;

    while (idx < NIPMAX) {
        dst[idx] = eps * eps;
        idx++;
    }
}

/*
 * Use the same squared softening length eps2 for all i-particles on all
 * cards in use.
 */
void
<prefix>_set_eps2_to_all(double eps2)
{
    int devid = 0;

    while (devid < NHIB) {
        if (<prefix>_cards[devid] != 0) {
            <prefix>_set_eps2_to_allMC(devid, eps2);
        }
        devid++;
    }
}

/*
 * Set all NIPMAX entries of Eps2[devid] to eps2.
 */
void
<prefix>_set_eps2_to_allMC(int devid, double eps2)
{
    double *dst = Eps2[devid];
    int idx = 0;

    while (idx < NIPMAX) {
        dst[idx] = eps2;
        idx++;
    }
}

</OLD_API_FOUND_EPS2>

<OLD_API_FOUND_A>

/*
 * backward-compatible wrapper around <prefix>_get_fout().
 * whether p is passed on is decided by the generated <OLD_API_P>.
 */
void
<prefix>_get_force(int ni, double (*a)[3], double *p)
{
    <prefix>_get_fout(ni, a <OLD_API_P>);
}

/*
 * backward-compatible wrapper around <prefix>_get_foutMC().
 * whether p is passed on is decided by the generated <OLD_API_P>.
 */
void
<prefix>_get_forceMC(int devid, int ni, double (*a)[3], double *p)
{
    <prefix>_get_foutMC(devid, ni, a <OLD_API_P>);
}

</OLD_API_FOUND_A>

<OLD_API_FOUND_EPS2_A>

/*
 * Calculate the forces a[] and potentials p[] on ni i-particles at
 * positions xi[], using all cards in use.
 *
 * The i-particles are processed in chunks of at most
 * <prefix>_get_number_of_pipelines() particles: each chunk is sent with
 * <prefix>_set_xi(), calculated with <prefix>_run() and retrieved with
 * <prefix>_get_force().
 */
void
<prefix>_calculate_force_on_x(double (*xi)[3], double (*a)[3], double *p, int ni)
{
    const int np = <prefix>_get_number_of_pipelines();
    int start = 0;

    while (start < ni) {
        int chunk = np;

        if (start + chunk > ni) { // the last chunk may be shorter.
            chunk = ni - start;
        }

        <prefix>_set_xi(chunk, xi + start);
        <prefix>_run();
        <prefix>_get_force(chunk, a + start, p + start);

        start += np;
    }
}
</OLD_API_FOUND_EPS2_A>
