/*
   This program tests the communications channels in a parallel computer
   to see if they have similar speeds.

   You'd think that vendors would have such a diagnostic; you'd be wrong.

   The method is to pass a token between pairs of processors, for all 
   immediate neighbors.  This tests ONLY neighbor links; it does not
   test pass-through effects (so, for a circuit-switched or packet-switched
   system, not all routes are tested).  The times are compared; routes
   whose times vary greatly from the average are flagged.
 */

#include "tools.h"
#include "comm/comm.h"
#include "system/time/usec.h"
#include <math.h>
#include <stdio.h>
#if defined(intelnx)
#include <sys/types.h>
#include <nxinfo.h>
#define PHYNODE() nxinfo.physnode
#else
#define PHYNODE() PImytid
#endif

/* Forward references.  These are old-style (non-prototype) declarations:
   they give only the return type, so the compiler does not check the
   arguments at the call sites.  Every routine named here is defined later
   in this file. */
void OrderNbrs(), TokenTestSync(), TokenTestASync(), TokenTestForce(),
     GenerateReport(), DrawHistogram(), DrawDanceHall(), DrawMesh();
double RemoveOutliers();

/*
   Program entry point.  Hands worker() and the command line to PICall;
   all of the actual testing is done in worker().

   NOTE(review): PICall presumably starts worker() on every processor;
   that behavior is defined by the communication library, not by this file.
 */
int main( argc, argv )
int  argc;
char **argv;
{
int worker();
PICall( worker, argc, argv );
return 0;
}

#define MAX_NBRS 100
/*
   worker - body of the link test as run on one processor.

   Command-line options (defaults in parentheses):
       -rtol <tol>          relative tolerance for flagging a link (0.025)
       -reps <n>            round trips timed per link (100)
       -size <len max incr> message lengths to test (64 64 64)
       -async               use TokenTestASync instead of TokenTestSync
       -force               use TokenTestForce
       -graph               also emit plot commands (to stdout)
       -fname <file>        emit plot commands to <file>; implies -graph
       -nx <n>              draw a mesh n nodes wide rather than the 
                            "dance hall" diagram
       -help                print a usage line (on processor 0) and return

   For every message length, each neighbor link is timed in the order 
   chosen by OrderNbrs, and then GenerateReport (and, when graphing, one of
   the Draw routines) is called.

   Returns 0 on success and 1 if the -size increment is not positive.

   NOTE(review): nbrs[] and badnbrs[] hold MAX_NBRS entries, and PIGetNbrs
   is not told that limit; this assumes no processor has more than MAX_NBRS
   neighbors -- confirm against PIGetNbrs.
 */
int worker( argc, argv )
int  argc;
char **argv;
{
double       *times;
int          nnbrs, nbrs[MAX_NBRS], mtype, badnbrs[MAX_NBRS];
int          len, reps, nbr, sval[3], ctype, myid;
double       rtol;
FILE         *graphfile = stdout;
char         graphfname[256];
int          do_graph = 0, nx = 0;

/* Get test parameters */
if (SYArgHasName( &argc, argv, 1, "-help" )) {
    if (PImytid != 0) return 0;
    fprintf( stderr,
    "%s -size len max incr", argv[0] );
    fprintf( stderr,
    " -rtol <tolerance> -reps <repititions> -async -force -graph -nx n\n" );
    return 0;
    }
    
myid    = PImytid;
rtol    = 0.025;
reps    = 100;
ctype   = 0;
sval[0] = 64;
sval[1] = 64;
sval[2] = 64;
SYArgGetDouble( &argc, argv, 1, "-rtol", &rtol );
SYArgGetInt(    &argc, argv, 1, "-reps", &reps );
SYArgGetIntVec( &argc, argv, 1, "-size", 3, sval );

/* A zero or negative increment would keep the length loop below from ever
   reaching its upper bound, so reject it before anything is opened or
   allocated. */
if (sval[2] <= 0) {
    if (myid == 0) 
	fprintf( stderr, "%s: -size increment must be positive (got %d)\n",
		 argv[0], sval[2] );
    return 1;
    }

if (SYArgHasName( &argc, argv, 1, "-async" )) ctype = 1;
if (SYArgHasName( &argc, argv, 1, "-force" )) ctype = 2;
SYArgGetInt( &argc, argv, 1, "-nx", &nx );
do_graph = SYArgHasName( &argc, argv, 1, "-graph" );
if (SYArgGetString( &argc, argv, 1, "-fname", graphfname, 256 )) {
    do_graph  = 1;
    graphfile = fopen( graphfname, "w" );
    /* Fall back to stdout if the file cannot be opened */
    if (!graphfile) graphfile = stdout;
    }

/* Get the neighbors */
nnbrs = PIGetNbrs( PImytid, nbrs );

/* need to compute the message type to use and order so that nodes don't
   block.  Note that everyone already knows all the processors that they
   will be communicating with. */
OrderNbrs( nnbrs, nbrs );

times = (double *)MALLOC( nnbrs * sizeof(double) );   CHKPTRV(times,1);

for (len=sval[0]; len<=sval[1]; len+=sval[2]) {
    /* For each neighbor, start the test.  
     */
    for (nbr = 0; nbr < nnbrs; nbr++) {
	/* Both ends of a link use the smaller of the two ids as the 
	   message type */
        mtype = (nbrs[nbr] > myid) ? myid : nbrs[nbr];
        switch (ctype) {
        case 0:
            TokenTestSync( nbrs[nbr], times + nbr, mtype, len, reps );  break;
        case 1:
            TokenTestASync( nbrs[nbr], times + nbr, mtype, len, reps ); break;
        case 2:
            TokenTestForce( nbrs[nbr], times + nbr, mtype, len, reps ); break;
        default:
            break;
            }
	}
    /* Generate report */
    GenerateReport( nbrs, nnbrs, times, rtol, len, reps, badnbrs, do_graph );
    if (do_graph) {
	if (nx == 0) 
	    DrawDanceHall( graphfile, nbrs, nnbrs, times, rtol, badnbrs );
	else 
	    DrawMesh( graphfile, nbrs, nnbrs, times, rtol, badnbrs, nx ); 
	}
    }
    
FREE( times );
/* Close the graph file if we opened one */
if (graphfile != stdout) fclose( graphfile );
return 0;
}

/* To get the timing right, the low processor sends to the high processor
   to start the test, then the timing starts */
void TokenTestSync( nbr, time, phase, len, reps )
int    nbr, phase, len, reps;
double *time;
{
int          myid, i;
SYusc_time_t t1, t2;
char         *sbuf, *rbuf;

myid = PImytid;
MSGALLOCRECV(rbuf,len,char);
MSGALLOCSEND(sbuf,len,char);

if (myid < nbr) {
    SENDSYNC( phase, sbuf, 0, nbr, MSG_OTHER );
    RECVSYNC( phase, rbuf, len, MSG_OTHER );
    SYusc_clock( &t1 );
    for (i=0; i<reps; i++) {
	SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
	RECVSYNC( phase, rbuf, len, MSG_OTHER );
	}
    SYusc_clock( &t2 );
    *time = SYuscDiff( &t1, &t2 );
    }
else {
    RECVSYNC( phase, rbuf, len, MSG_OTHER );
    SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
    SYusc_clock( &t1 );
    for (i=0; i<reps; i++) {
	RECVSYNC( phase, rbuf, len, MSG_OTHER );
	SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
	}
    SYusc_clock( &t2 );
    *time = SYuscDiff( &t1, &t2 );
    }
MSGFREERECV(rbuf);
MSGFREESEND(sbuf);
}

/*
   TokenTestASync - time a token exchange with one neighbor, posting each
   receive with RECVASYNC and completing it with RECVWAIT.  Sends still use
   SENDSYNC.

   Input parameters:
       nbr   - id of the neighbor at the other end of the link
       phase - message type used for every message in this test
       len   - message length
       reps  - number of round trips to time

   Output parameter:
       time  - elapsed time (from SYuscDiff) for the timed exchanges

   As in TokenTestSync, the lower-numbered processor starts with a 
   zero-length message and the clock starts after that start-up exchange.

   NOTE(review): on the higher-numbered side one receive is posted and
   waited for even when reps is 0, while the lower-numbered side then sends
   nothing after the start-up exchange; this looks like it would hang for
   reps < 1 -- confirm.
 */
void TokenTestASync( nbr, time, phase, len, reps )
int    nbr, phase, len, reps;
double *time;
{
int            myid, i;
SYusc_time_t   t1, t2;
char           *sbuf, *rbuf;
ASYNCRecvId_t  rid;

myid = PImytid;
MSGALLOCRECV(rbuf,len,char);
MSGALLOCSEND(sbuf,len,char);

if (myid < nbr) {
    /* Lower-numbered end: start-up exchange, then reps round trips */
    SENDSYNC( phase, sbuf, 0, nbr, MSG_OTHER );
    RECVSYNC( phase, rbuf, len, MSG_OTHER );
    SYusc_clock( &t1 );
    for (i=0; i<reps; i++) {
	/* Post the receive for the reply before sending the token */
    	RECVASYNC( phase, rbuf, len, MSG_OTHER, rid );
	SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
	RECVWAIT( phase, rbuf, len, MSG_OTHER, rid );
	}
    SYusc_clock( &t2 );
    *time = SYuscDiff( &t1, &t2 );
    }
else {
    /* Higher-numbered end: the receive for the first token is posted 
       before the start-up reply is sent, i.e. before the clock starts */
    RECVSYNC( phase, rbuf, len, MSG_OTHER );
    RECVASYNC( phase, rbuf, len, MSG_OTHER, rid );
    SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
    SYusc_clock( &t1 );
    /* Each pass completes the outstanding receive, echoes the token, and
       posts the receive for the next one; the last of the reps tokens is
       handled after the loop so that no extra receive is left posted */
    for (i=0; i<reps-1; i++) {
	RECVWAIT( phase, rbuf, len, MSG_OTHER, rid );
	SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
	RECVASYNC( phase, rbuf, len, MSG_OTHER, rid );
	}
    RECVWAIT( phase, rbuf, len, MSG_OTHER, rid );
    SENDSYNC( phase, sbuf, len, nbr, MSG_OTHER );
    SYusc_clock( &t2 );
    *time = SYuscDiff( &t1, &t2 );
    }
MSGFREERECV(rbuf);
MSGFREESEND(sbuf);
}

/* 
   TokenTestForce - token test selected by the -force option.

   The intent is to use the async code with a "force" modifier on the 
   messages; that variant has not been written.  Until it is, run the 
   plain async test so that *time is always set.  An empty body here would
   leave the caller's entry in its times array unset, and that entry is
   read when the report is generated.

   Parameters are the same as for TokenTestASync.

   NOTE(review): the results in -force mode are therefore identical to 
   -async mode; replace this once a forced-message variant exists.
 */
void TokenTestForce( nbr, time, phase, len, reps )
int    nbr, phase, len, reps;
double *time;
{
TokenTestASync( nbr, time, phase, len, reps );
}

/*
   This routine analyzes the results, looking for unusually fast or
   slow links.  A histogram of the times is produced as well when any
   link is out of range.

   Input parameters:
       nbrs     - ids of this processor's neighbors
       nnbrs    - number of neighbors (may be 0)
       times    - times[i] is the measured time for the link to nbrs[i]
       rtol     - relative tolerance; a link is flagged when its time 
                  differs from the average by more than rtol * average
       len      - message length used for this set of times
       reps     - number of round trips that were timed
       do_graph - if nonzero, report lines are prefixed with '#'

   Output parameter:
       badnbrs  - badnbrs[i] = 1 if nbrs[i] is out-of-range, 0 otherwise.

   This routine uses global operations over ALLPROCS, so every processor
   must call it.
 */
void GenerateReport( nbrs, nnbrs, times, rtol, len, reps, badnbrs, do_graph )
int    *nbrs, nnbrs, len, reps, *badnbrs, do_graph;
double *times, rtol;
{
int    i, nlinks, j;
int    cnt = 0;
double rlow, rhigh;
double mintime, maxtime, avetime, wtime;

/* Get some information on the global times.  We are assuming that
   all neighbors should have the same speed of links.

   The local values start from neutral elements rather than times[0] so 
   that a processor with no neighbors does not read times[0] and does not
   disturb the global max/min/sum. */
maxtime = -HUGE_VAL;
mintime = HUGE_VAL;
avetime = 0.0;
for (i=0; i<nnbrs; i++) {
    avetime += times[i];
    if (times[i] > maxtime) maxtime = times[i];
    if (times[i] < mintime) mintime = times[i];
    }

GDMAX( &maxtime, 1, &wtime, ALLPROCS );
GDMIN( &mintime, 1, &wtime, ALLPROCS );
PIgdsum( &avetime, 1, &wtime, ALLPROCS );
nlinks  = nnbrs;
PIgisum( &nlinks, 1, &i, ALLPROCS );

for (i=0; i<nnbrs; i++) 
    badnbrs[i] = 0;

/* With no links anywhere there is no average to compute (and dividing by
   nlinks would be a division by zero).  nlinks is the global count, so 
   this is expected to be the same decision on every processor. */
if (nlinks <= 0) {
    if (PImytid == 0) 
	printf( "%cNo links were tested\n", do_graph ? '#' : ' ' );
    return;
    }
avetime /= nlinks;

/*
   Here we could try to remove outliers and recompute the average
   (that is, discard any local values that are well out-of-range, then
   recompute the average time on the remaining links.  Do this until
   no further values are discarded.
*/

/* Look for nodes that are away from the mean */
if (maxtime - mintime >= rtol * avetime) {
    /* Compute a new average */
    avetime = RemoveOutliers( times, nnbrs, 
			      mintime, maxtime, 2*rtol, avetime );
    
    /* Somebody is bad */
    rlow  = avetime * (1.0 - rtol);
    rhigh = avetime * (1.0 + rtol);
    if (PImytid == 0) 
      printf( "%cNode[PhysNode] Nbr         Time  AverageTime        %%Diff\n",
	      do_graph ? '#' : ' ' );
    /* NOTE(review): the loop runs one step past the last processor id;
       presumably GTOKEN needs that final call to finish passing the token
       -- confirm against the GTOKEN documentation. */
    for (j=0; j<=PInumtids; j++) {
    	/* If we could pass a data-value with the token, we could
    	   pass the error count along.  If we were the first one,
    	   we could issue an error message. */
    	if (GTOKEN(ALLPROCS,j)) {
            for (i = 0; i < nnbrs; i++) {
                if (times[i] < rlow || times[i] > rhigh) {
                    cnt ++;
             	    printf( "%c%4d[%8d] %3d %12.2e %12.2e %12.2e\n", 
			    do_graph ? '#' : ' ', 
			    j, PHYNODE(), nbrs[i], times[i], avetime, 
	                    100.0*(times[i] - avetime)/avetime );
		    fflush( stdout );
		    badnbrs[i] = 1;
	            }
                }
            }
        }
    }
    
PIgisum(&cnt,1,&i,ALLPROCS);
if (PImytid == 0) {
    if (do_graph) printf( "#" );
    if (cnt == 0) printf( "All links within range\n" );
    else          printf( "%d links are out-of-range\n", cnt );
    if (do_graph) printf( "#" );
    /* Multiply in double: len * reps can overflow an int for long 
       messages and many repetitions */
    printf( "For message size = %d, Average rate = %.0f bytes/sec\n",
	    len, 2.0 * (double)len * (double)reps / avetime );
    }
if (cnt) {
    /* Use at most 40 bins, halving until there are no more than two bins
       per link */
    int nbin = 40;
    while (nbin > nlinks * 2) nbin /= 2;
    if (PImytid == 0) 
	printf( "\n%cHistogram by time on each link\n", do_graph ? '#' : ' ' );
    DrawHistogram( times, nnbrs, nbin, stdout, ALLPROCS, mintime, maxtime, 
		   do_graph );
    }
/* Data for a plot of processors versus time would also be interesting,
   perhaps to an auxiliary file */
}


/* 
   Algorithm for ordering the partners.
   This is an ordering that can be computed entirely locally for
   each processor.  A similar ordering is used in BlockComm for
   synchronous communication.

   The idea is to break the communication up into phases between processors
   that differ in a current bit position (mask).  Only those processors
   may communicate in that phase.

   This algorithm is NOT guaranteed to generate an optimal ordering.
   It does do so for hypercubes and for meshes with even dimension.
   Meshes with odd dimension will have a poor schedule (roughly
   proportional to the diameter of the mesh).

   Input parameter:
       nnbrs - number of entries in nbrs

   Input/output parameter:
       nbrs  - on input, the neighbor ids; on output, the same ids in the
               order in which they should be tested.  Ids are assumed to
               be non-negative (-1 is used internally to mark entries 
               that have already been placed).

   An entry that never differs from this processor's id in any bit (that 
   is, an entry equal to our own id) cannot be placed by any phase; such
   entries are appended at the end instead of being searched for forever.
 */

void OrderNbrs( nnbrs, nbrs )
int  nnbrs, *nbrs;
{
unsigned int mask = 0x1;
int          i;
int          *newnbrs, cnt, myid;

/* Nothing to order (and nothing to allocate) */
if (nnbrs <= 0) return;

myid = PImytid;
newnbrs = (int *)MALLOC( nnbrs * sizeof(int) );    CHKPTR(newnbrs);

/* Sort by increasing node number */
SYIsort( nnbrs, nbrs );

cnt     = 0;
/* The mask is unsigned so that shifting it past the top bit is well
   defined and gives 0, which ends the search */
while (cnt < nnbrs && mask != 0) {
    for (i=0; i<nnbrs; i++) {
	if (nbrs[i] >= 0 && (((unsigned int)(myid ^ nbrs[i])) & mask) &&
	    myid > nbrs[i]) {
	    /* Masters */
	    newnbrs[cnt++] = nbrs[i];
	    nbrs[i] = -1;
	    }
	}
    for (i=0; i<nnbrs; i++) {
	if (nbrs[i] >= 0 && (((unsigned int)(myid ^ nbrs[i])) & mask)) {
	    /* Slaves */
	    newnbrs[cnt++] = nbrs[i];
	    nbrs[i] = -1;
	    }
	}
    mask <<= 1;
    }

/* Anything still unplaced matched our id in every bit; keep it, last */
for (i=0; i<nnbrs && cnt<nnbrs; i++) {
    if (nbrs[i] >= 0) {
	newnbrs[cnt++] = nbrs[i];
	nbrs[i] = -1;
	}
    }

/* cnt == nnbrs unless the input contained negative ids */
MEMCPY(nbrs,newnbrs,cnt*sizeof(int));
FREE(newnbrs);
}

/*
    Draw a histogram on FILE fp using (double) data and nbin bins

    Input parameters:
        data     - local values to be binned
        n        - number of local values
        nbin     - number of bins
        fp       - file to draw on (only written where PSISROOT(procset) 
                   is true)
        procset  - processor set over which the bin counts are summed
        dmin     - value that maps to the first bin
        dmax     - value that maps to the last bin
        do_graph - if nonzero, each output line is prefixed with '#'

    The counts are combined with PIgisum, so every processor in procset
    must call this routine.
 */     
void DrawHistogram( data, n, nbin, fp, procset, dmin, dmax, do_graph )
double  *data, dmin, dmax;
int     n, nbin, do_graph;
FILE    *fp;
ProcSet *procset;
{
char   *line;
int    *bins, *work;
int    i, j, ib;
int    maxcnt;
double range, pos;

/* No bins, nothing to draw.  NOTE(review): assumes nbin is the same on 
   every processor, so that all of them skip the global sum together. */
if (nbin <= 0) return;

bins = (int *)MALLOC( nbin * 2 * sizeof(int) );   CHKPTR(bins);
work = bins + nbin;
line = (char *)MALLOC( nbin + 1 );                CHKPTR(line);

for (i=0; i<nbin; i++)
    bins[i] = 0;

range = dmax - dmin;
for (i=0; i<n; i++) {
    /* Scale into [0,nbin-1].  If all values are equal (range is 0) 
       everything goes in the first bin rather than dividing by zero, and
       values outside [dmin,dmax] are clamped to the end bins so that the
       index can never leave the array. */
    if (range > 0.0) pos = (nbin - 1) * (data[i] - dmin) / range;
    else             pos = 0.0;
    if (!(pos > 0.0))         ib = 0;
    else if (pos >= nbin - 1) ib = nbin - 1;
    else                      ib = (int)pos;
    bins[ib]++;
    }
PIgisum( bins, nbin, work, procset );

if (PSISROOT(procset)) {
    maxcnt = 0;
    for (i=0; i<nbin; i++) {
        line[i] = ' ';
        if (bins[i] > maxcnt) maxcnt = bins[i];
        }
    line[nbin] = 0;
    /* Draw from the top row down.  A column gets its '*' on the row equal
       to its count and keeps it on every row below, since line is never
       cleared. */
    for (j=maxcnt; j>0; j--) {
    	for (i=0; i<nbin; i++)
    	    if (bins[i] == j) line[i] = '*';
    	fprintf( fp, "%c%s\n", do_graph ? '#' : ' ', line );
        }
    fprintf( fp, "%cmin = %12.2e max = %12.2e\n", 
	     do_graph ? '#' : ' ', dmin, dmax );
    }
FREE( bins );
FREE( line );
}

/* Compute a new average time after discarding times that are away
   from the average 

   Input parameters:
       times   - local link times
       n       - number of local link times
       mintime - (unused)
       maxtime - (unused)
       rtol    - relative tolerance; a time is kept if it is within 
                 rtol * avetime of avetime
       avetime - current average time

   Returns the average of the times that were kept, taken over ALLPROCS.  
   If no time on any processor is kept, the tolerance is doubled and the 
   scan is repeated.  If avetime or rtol is not positive, no time could 
   ever be kept, so avetime is returned unchanged.

   This routine uses global sums, so every processor must call it.
   NOTE(review): the early return assumes that avetime and rtol have the 
   same values on every processor -- confirm against the caller.
 */
double RemoveOutliers( times, n, mintime, maxtime, rtol, avetime )
double *times, mintime, maxtime, rtol, avetime;
int    n;
{
int    cnt, i;
double sum, work;

/* avetime must be declared double above: left out of the declaration 
   list it would default to int, which does not match the double that the
   caller passes. */

/* The test below is |t - avetime| < rtol * avetime, which can never hold
   when the right-hand side is not positive; doubling rtol would not help,
   so do not start the loop at all. */
if (!(avetime > 0.0) || !(rtol > 0.0)) return avetime;

cnt = 0;
sum = 0.0;
while (cnt == 0) {
    for (i=0; i<n; i++) {
	if (fabs( times[i] - avetime ) < rtol * avetime ) {
	    cnt++;
	    sum += times[i];
	    }
	}
    PIgdsum( &sum, 1, &work, ALLPROCS );
    PIgisum( &cnt, 1, &i,    ALLPROCS );
    /* If cnt is STILL 0, EVERYTHING was an outlier, so we prepare to 
       try again with twice the tolerance */
    rtol *= 2.0;
    }
return sum / cnt;
}

/*
   The following code (not yet written) 
   is a first attempt to help generate graphical 
   output for the output from this routine.  The idea is

   If the topology is known and has limited connectivity (i.e., mesh), 
   then draw it.  For this, we need to know our location in the mesh.

   If the topology is not known or is highly connected, generate one of 
   two plots:
       for each node, a separate plot of all connections
       a "dance hall" diagram.

   The graphs should draw in-range lines as dotted and out-of-range lines
   as solids

   Note that since the data is distributed, these are parallel routines.
 */

/* 
   DrawConnection - write the plot commands for one link.

   Writes the two end points, one "x y" pair per line, followed by a 
   "join" command.  
   type == 0 (in range) adds the "dash" modifier to the join; any other
   value (out of range) adds nothing.
 */
void DrawConnection( fp, fromx, fromy, tox, toy, type )
FILE *fp;
int  fromx, fromy, tox, toy, type;
{
const char *modifier;

if (type) modifier = "";
else      modifier = "dash";

fprintf( fp, "%d %d\n", fromx, fromy );
fprintf( fp, "%d %d\n", tox, toy );
fprintf( fp, "join %s\n", modifier );
}

/* 
   DrawNode - write the plot commands for one node.

   The node is drawn as a closed square extending 2 units either side of
   (xc,yc): the four corners followed by the first corner again, then a 
   "join".  A "text" command then places the number num at (xc,yc).
 */
void DrawNode( fp, xc, yc, num )
FILE *fp;
int  xc, yc, num;
{
int xlo = xc - 2, xhi = xc + 2;
int ylo = yc - 2, yhi = yc + 2;

/* Outline, ending where it started */
fprintf( fp, "%d %d\n", xlo, ylo );
fprintf( fp, "%d %d\n", xhi, ylo );
fprintf( fp, "%d %d\n", xhi, yhi );
fprintf( fp, "%d %d\n", xlo, yhi );
fprintf( fp, "%d %d\n", xlo, ylo );
fprintf( fp, "join\n" );

/* Label */
fprintf( fp, "text x %d y %d '%d'\n", xc, yc, num );
}

/*
   DrawDanceHall - write a "dance hall" diagram of the links to fp.

   Every processor is drawn twice, once in a row at y = 2 and once in a 
   row at y = 12, at x = 5 * (processor id).  Each link is drawn from the 
   processor in the lower row to its neighbor in the upper row; the line 
   style comes from badnbrs (see DrawConnection).

   Input parameters:
       fp      - file to write the plot commands to
       nbrs    - ids of this processor's neighbors
       nnbrs   - number of neighbors
       times   - (unused)
       rtol    - (unused)
       badnbrs - badnbrs[k] is nonzero if the link to nbrs[k] is out of 
                 range

   Processor 0 writes the plot header; the processors then take turns 
   writing their own part, using PIgtoken, so every processor must call
   this routine.
 */
void DrawDanceHall( fp, nbrs, nnbrs, times, rtol, badnbrs )
FILE   *fp;
int    *nbrs, nnbrs, *badnbrs;
double *times, rtol;
{
int i, k;

if (PImytid == 0) {
    fprintf( fp, "set limits x -3 %d y -1 15\naxis all off\n", PInumtids * 5 );
    }
/* NOTE(review): the loop runs one step past the last processor id;
   presumably PIgtoken needs that final call -- confirm. */
for (i=0; i<=PInumtids; i++) {
    if (PIgtoken( ALLPROCS, i )) {
	fflush( fp );
	/* Move to the end of the file.  The result is deliberately not
	   checked: the seek cannot succeed when fp is not seekable, and in
	   that case the output simply continues where it is. */
	fseek( fp, 0L, SEEK_END );
	DrawNode( fp, i*5, 2,  i );
	DrawNode( fp, i*5, 12, i );
	for (k=0; k<nnbrs; k++) {
	    DrawConnection( fp, i*5, 4, nbrs[k]*5, 10, badnbrs[k] );
	    }
	fflush( fp );
	}
    }
}

/*
   DrawMesh - write a diagram of the links to fp, with the processors laid
   out as a mesh that is nx processors wide.

   Processor i is drawn at column i % nx and row i / nx, with a spacing of
   5 units.  Links within a row and links between rows are drawn with a
   small offset that depends on their direction, so that the two 
   directions of a link do not fall on top of each other.  The line style
   comes from badnbrs (see DrawConnection).

   Input parameters:
       fp      - file to write the plot commands to
       nbrs    - ids of this processor's neighbors
       nnbrs   - number of neighbors
       times   - (unused)
       rtol    - (unused)
       badnbrs - badnbrs[k] is nonzero if the link to nbrs[k] is out of 
                 range
       nx      - width of the mesh; must be positive

   Processor 0 writes the plot header; the processors then take turns 
   writing their own part, using PIgtoken, so every processor must call
   this routine.
 */
void DrawMesh( fp, nbrs, nnbrs, times, rtol, badnbrs, nx )
FILE   *fp;
int    *nbrs, nnbrs, *badnbrs, nx;
double *times, rtol;
{
int i, k, ifrom, jfrom, ito, jto;

/* nx is used as a divisor throughout; without a positive width there is
   no layout to draw.  NOTE(review): assumes nx is the same on every
   processor, so that all of them return together. */
if (nx <= 0) return;

if (PImytid == 0) {
    fprintf( fp, "set limits x -3 %d y -3 %d\naxis all off\n", nx * 5, 
	    (PInumtids / nx) * 5 - 2 );
    }
/* NOTE(review): the loop runs one step past the last processor id;
   presumably PIgtoken needs that final call -- confirm. */
for (i=0; i<=PInumtids; i++) {
    if (PIgtoken( ALLPROCS, i )) {
	fflush( fp );
	ifrom = i % nx;
	jfrom = i / nx;

	/* Move to the end of the file.  The result is deliberately not
	   checked: the seek cannot succeed when fp is not seekable, and in
	   that case the output simply continues where it is. */
	fseek( fp, 0L, SEEK_END );
	DrawNode( fp, ifrom*5, jfrom*5,  i );
	for (k=0; k<nnbrs; k++) {
	    ito = nbrs[k] % nx;
	    jto = nbrs[k] / nx;
	    if (jfrom == jto) {
		/* Neighbor is in the same row */
		if (ifrom > ito) 
		    DrawConnection( fp, ito*5+2, jto*5 - 1, 
				    ifrom*5-2, jfrom*5 - 1, badnbrs[k] );
		else
		    DrawConnection( fp, ifrom*5+2, jfrom*5 + 1, ito*5-2, 
				    jto*5 + 1,  badnbrs[k] );
		}
	    else if (jfrom < jto) 
		/* Neighbor is in a higher-numbered row */
		DrawConnection( fp, ifrom*5+1, jfrom*5 + 2, 
			        ito*5+1, jto*5 - 2, badnbrs[k] );
	    else 
		/* Neighbor is in a lower-numbered row */
		DrawConnection( fp, ito*5 - 1, jto*5 + 2, 
			        ifrom*5 - 1, jfrom*5 - 2, badnbrs[k] );
	    }
	fflush( fp );
	}
    }
}

/* Placeholder: takes no arguments and does nothing.  Nothing in the visible
   part of this file calls it. */
void DrawOneToAll( )
{
}
