/*
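 * adjlog2.c:  Program to take multiple alog logfiles, extract events
 *             for synchronizing the clocks, and generate adjusted times.
 *             The adjusted logs are written to <file>.new (the code that
 *             replaces the original files is currently commented out), so
 *             that other alog tools can be used on them.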
 *
 * -e n defines synchronization events
 * -a1 n -a2 m -b1 k define pair-exchange events used to compute clock offsets
 * (There are predefined values; these allow the user to define their own)
 *
 * Algorithm:
 *     Build a matrix of time events; solve it for the offset and skew for
 *     each clock.  For the first pass, this "matrix" will have just the 
 *     "synchronization" events.
 *
 * This version uses floating-point (IEEE-double) to do the computations,
 * since 53 bits is enough to hold most expected clock values
 *
 * This is the formula:
 * Processor 0 has the standard clock.
 * The time is formed as:
 * 
 * gtime = (ltime - ltime0) * (1 + dp) + gtime0
 *
 * where 1+dp is a clock skew (if the clocks all run at the exact same
 * rate, dp == 0).  The values that are computed are gtime0 and dp; we
 * define ltime0 as the time of the first synchronization observation.
 * To compute dp, we record the times at two points in the program
 * where the processors have been synchronized (note that synchronization
 * is not exact).  Let these two points be ls0 and ls1, with gs0 and gs1
 * the points for the global reference clock (processor 0).  Then we
 * have:
 * 
 *            gs1 - gs0   gs1 - ls1 + ls1 - (gs0 - ls0 + ls0)
 * (1 + dp) = --------- = -----------------------------------
 *            ls1 - ls0              ls1 - ls0
 *
 *                gs1 - ls1 - (gs0 - ls0)
 *          = 1 + -----------------------
 *                      ls1 - ls0
 * 
 * I'd really like to measure dp directly rather than by forming the
 * difference.  
 *
 * Now, to compute gtime0 for each processor, we can approximate it
 * by taking ltime0 (the time of the first sync).  However, this is
 * only approximate.  We can adjust this value by doing various 
 * pairwise exchanges; see ComputeOffsets below, which uses the
 * a1/a2/b1 pair-exchange events when they are present.
 * 
 */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DO_NEGATIVE 1
#define IGNORE_NEGATIVE 2

#include "alog/elog.h"

/* Time values for syncs, one entry per log file (processor).
   lt0[i] is meant to hold the local time of the first sync event found in
   file i and lt1[i] the local time of the last one; both arrays are
   allocated in main and filled in by extract_timing. */
double *lt0, *lt1;

/* For now, we just handle a set of timing events (np-1 of them)
   between processor i and i+1 (processor 0 participates in only
   1 event).
   offsetevents[] is indexed by the ORIGINATING processor of an exchange:
   a1 and a2 are written while reading that processor's own log, b1 and p1
   are written while reading the log of the processor that answered. */
typedef struct {
    double  a1, b1, a2;         /* Times for the events */
    int     p0, p1;             /* processors that participated in
					 this time-exchange: p0 is the
					 partner id stored with the a1/a2
					 events, p1 is the index of the log
					 file in which the b1 event was
					 found */
    } OffsetEvents;
OffsetEvents *offsetevents;
/* Number of a2 events seen over all files (incremented in extract_timing) */
int noffsetevents = 0;

/* The global time is found by adding an offset and scaling by
   a skew.  Note that skew[i] holds the complete multiplicative factor
   (the quantity written "1 + dp" in the file header), not dp itself:
   GlobalTime computes (t - lt0[i]) * skew[i] + globaloffset[i]. */
double        *skew;
double        *globaloffset;

/* mintime holds the mintime for ALL runs; this can be used to 
   offset the values.  It is updated in extract_timing from events with a
   positive type that are neither sync nor pair-exchange events. */
double  mintime;

/* These hold user-defined synchronization events (-e option).
   syncep is the number of entries of syncevent[] in use. */
#define MAX_USERETYPES 100
static int syncevent[MAX_USERETYPES];
static int syncep=0;

/* These hold the 3 event types used to adjust the individual offsets
   (if not present, the synchronization events are used to compute the
   offsets).  They are filled from the -a1, -a2 and -b1 options; a1p, a2p
   and b1p count the entries in use in the corresponding array.
 */
static int a1event[MAX_USERETYPES],
           a2event[MAX_USERETYPES],
           b1event[MAX_USERETYPES];
static int a1p = 0, a2p = 0, b1p = 0;

/* Defined at the end of this file; takes the number of processors */
void ComputeOffsets();
           
main(argc,argv)
int argc;
char *argv[];
{
	ELData *fin, *fout;
	int  np, i, nsync, nlsync;
	char headerfile[255];
	int pid;
        int firstfile;

	if ( argc <= 1 ) 
		usage( argv[0] );

	/* Look for user-defined events */
	for (i=1; i<argc; i++) {
	    if (strcmp(argv[i],"-e") == 0) 
		/* Test on MAX_USERTYPES */
		syncevent[syncep++] = atoi(argv[++i]);
	    else if (strcmp(argv[i],"-a1") == 0)
	    	a1event[a1p++] = atoi(argv[++i]);
	    else if (strcmp(argv[i],"-a2") == 0) 
	    	a2event[a2p++] = atoi(argv[++i]);
	    else if (strcmp(argv[i],"-b1") == 0) 
	    	b1event[b1p++] = atoi(argv[++i]);
	    else
		break;
	    }
	/* Figure out how many processors there are */
	np        = argc - i;
	firstfile = i;
	/* These could be allocated on demand */
	lt0          = (double *) malloc( np * sizeof(double) );
	lt1          = (double *) malloc( np * sizeof(double) );
	globaloffset = (double *) malloc( np * sizeof(double) );
	skew         = (double *) malloc( np * sizeof(double) );
	offsetevents = (OffsetEvents *) malloc( np * sizeof(OffsetEvents) );
	mintime      = HUGE;

	/* Loop through each file, looking for the synchronization events */
	for (i=0; i<np; i++) {
	    fin = ELopen( argv[firstfile+i], "r" );
	    nsync = extract_timing( i, fin );
	    if (i > 0 && nsync != nlsync) {
		fprintf( stderr, "Found differing numbers of syncs\n" );
		exit(0);
		}
	    nlsync = nsync;
	    ELclose( fin );
	    }
	/* If we didn't find enough events, we exit */
	if (nsync < 1) {
	    fprintf( stderr, 
		     "Not enough synchronization events to adjust logs\n" );
	    exit(0);
	    }
        if (nsync < 2) {
	    /* Just adjust the start times */
	    fprintf( stderr, "Can only adjust clock offsets\n" );
	    for (i=0; i<np; i++) 
		lt1[i] = lt0[i] + 1.0;
	    }

	/* Compute a "global clock" time */
	/* NOTE: if numer is changed, ComputeOffsets must be changed as well */
	for (i=0; i<np; i++) {
            skew[i]         = (lt1[0] - lt0[0]) / (lt1[i] - lt0[i]);
            globaloffset[i] = lt0[i];  /*   - mintime; */
	    }
	fprintf( stderr, "Summary of clock transformations:\n" );
	if (noffsetevents == np - 1) {
	    /* Print out the initial globaloffsets */
	    fprintf( stderr, "Global offsets from sync events are:\n" );
	    for (i=0; i<np; i++) {
		fprintf( stderr, "%4d  %14e\n", i, globaloffset[i] );
		}
	    }

	/* Use adjust events to compute a modified offset (if such events
	   are not present, the globaloffset values above will be used) */
	ComputeOffsets( np );

	/* Write a summary */
	for (i=0; i<np; i++) {
	    fprintf( stderr, "%4d  (t - %14e) (%14e)\n", 
		     i, lt0[i], skew[i] );
	    }
        
	/* Rewrite the log files using the clock adjustment */
	for (i=0; i<np; i++) {
	    sprintf( headerfile, "%s.new", argv[firstfile+i] );
/* 	    pid = getpid(); */
/* 	    sprintf(headerfile,"log.header.%d",pid); */
	    if ( (fout = ELopen(headerfile,"w")) == NULL ) {
		fprintf(stderr,"%s: unable to create temp file %s.\n",
			argv[0], headerfile );
		exit(0);
		}
	    fin = ELopen( argv[firstfile+i], "r" );
	    if (!fin) {
		fprintf( stderr, "%s: Unable to open log file %s\n", 
			 argv[0], argv[firstfile+i] );
		exit(0);
		}
	    adjust_file( i, fin, fout, 0, nsync, argv[firstfile+i] );
	    ELclose( fin );
	    ELclose( fout );

	    /* move filename */
/* 	    unlink( argv[firstfile+i] );
	    link( headerfile, argv[firstfile+i] );
	    unlink( headerfile );  */
	    }
	
} /* main */

/*
   Extract timing data for the i'th log file 
 */
/*
   extract_timing - read log file number i and record its timing events.

   Input parameters:
   i  - index of the log file (processor) being read
   fd - log file, already opened for reading

   Side effects (all on file-level data):
   lt0[i], lt1[i]  - time of the first and of the last sync event
   offsetevents[]  - a1/a2/p0 of entry i; b1/p1 of the originator's entry
   noffsetevents   - incremented for every a2 event
   mintime         - lowered to the earliest time of any other event with
                     a positive type

   Returns the number of sync events found in the file.  The program is
   terminated if a b1 event violates the processor ordering rule below.
 */
int extract_timing( i, fd )
int    i;
ELData *fd;
{
ELEntry entry;
double  etime;
int     nsync = 0;
int     event;
int     is_sync_event(), is_a1_event(), is_a2_event(), is_b1_event();

while (1) {
    if (ELread(fd,&entry)) break;
    event = ELetype(&entry);
    etime = ELetime(fd,&entry);
    if (is_sync_event(event)) {
	/* The first sync event gives lt0; every later one overwrites lt1,
	   so that we save the LAST sync event there */
	if (nsync == 0) 
	    lt0[i] = etime;
	else 
	    lt1[i] = etime;
	nsync++;
	}
	/* For the offset events, the assumption is that each processor
	   (except for processor 0) is the ORIGINATOR of one offsetevent.
	   It MAY participate as the respondent (b1 event) for multiple
	   events, including having processor 0 respond to EVERYONE.
	   Finally, the (b1) processor has processor number SMALLER than
	   the (a1,a2) processor.  This makes the equations that need
	   to be solved for the offsets TRIANGULAR and easy.
	 */
    else if (is_a1_event(event)) {
    	offsetevents[i].a1 = etime;
    	offsetevents[i].p0 = entry.X.ALOG.i_data;
        }
    else if (is_a2_event(event)) {
    	offsetevents[i].a2 = etime;
    	offsetevents[i].p0 = entry.X.ALOG.i_data;
    	noffsetevents++;
        }
    else if (is_b1_event(event)) {
	/* This file (i) is the respondent; i_data names the originator,
	   which must have the larger processor number.  The test also
	   rejects negative ids.  NOTE(review): ids >= the number of log
	   files cannot be detected here because that count is not
	   available in this routine. */
    	if (entry.X.ALOG.i_data <= i) {
    	    fprintf( stderr,
	             "Improper offset event (responding processor %d\n", i );
    	    fprintf( stderr, "not lower numbered than originator %d)\n", 
		     entry.X.ALOG.i_data );
    	    exit(EXIT_FAILURE);
    	    }
    	offsetevents[entry.X.ALOG.i_data].b1 = etime;
    	offsetevents[entry.X.ALOG.i_data].p1 = i;
        }
    else if (event > 0) {
	if (mintime > etime) mintime = etime;
	}
    }
return nsync;
}

/*
   adjust_file - copy a log file, converting its times to the global clock.

   Input parameters:
   p            - index of the processor (log file) being converted
   fin          - input log, opened for reading
   fout         - output log, opened for writing
   leave_events - if zero, the predefined sync and pair-exchange events
                  (ALOG_EVENT_SYNC, ALOG_EVENT_PAIR_A1/A2/B1) are dropped
                  from the output
   nsync        - number of sync events (passed on to GlobalTime)
   fname        - name of the input file, used in error messages only

   Entries with a negative event type are copied with their time unchanged.
   The program is terminated if the converted times of the entries with a
   positive event type are not in non-decreasing order.

   Always returns 0.
 */
int adjust_file( p, fin, fout, leave_events, nsync, fname )
ELData *fin, *fout;
int    p, leave_events, nsync;
char   *fname;
{
ELEntry entry;
double GlobalTime();
double ltime, gtime;
double lasttime;
int    event;

/* lasttime is used to make sure that we don't mess up the log files without
   knowing it */
lasttime = 0; 
while (1) {
    if (ELread( fin, &entry )) break;
    event = ELetype( &entry );
    if (!leave_events && (event == ALOG_EVENT_SYNC ||
			  event == ALOG_EVENT_PAIR_A1 ||
			  event == ALOG_EVENT_PAIR_A2 ||
			  event == ALOG_EVENT_PAIR_B1)) continue;
    /* adjust to the global clock time; the local time is read from the
       input log and kept in a double for the call and the messages */
    ltime = ELetime( fin, &entry );
    gtime = GlobalTime( ltime, p, nsync );
    if (event > 0) {
	if (gtime < lasttime) {
	    fprintf( stderr, "Error computing global times\n" );
	    fprintf( stderr, "Times are not properly sorted\n" );
	    fprintf( stderr, "Last time was %f, current time is %f\n", 
		     lasttime, gtime );
	    fprintf( stderr, "(original new time is %f)\n", ltime );
	    fprintf( stderr, "processing file %s\n", fname );
	    exit(EXIT_FAILURE);
	    }
	else 
	    lasttime = gtime;
	}
    /* negative events are unchanged. */
    if (event >= 0) ELetimeLow(&entry) = gtime;
    ELwrite( fout, &entry );
    }
return 0;
}

/*
   usage - print a summary of the command line on stderr and terminate
   the program with a failure status (it is only called for a bad
   command line).

   Input parameter:
   a - program name, normally argv[0]
 */
int usage( a )
char *a;
{
	fprintf(stderr,"%s: %s [options] infile1 infile2 ...\n",a,a);
	fprintf(stderr,"  writes copies of the log files with synchronized\n");
	fprintf(stderr,"  clocks to infile1.new infile2.new ...\n");
	fprintf(stderr,"  options (each may be repeated):\n");
	fprintf(stderr,"    -e n    event type n is a synchronization event\n");
	fprintf(stderr,"    -a1 n   event type n is a pair-exchange a1 event\n");
	fprintf(stderr,"    -a2 n   event type n is a pair-exchange a2 event\n");
	fprintf(stderr,"    -b1 n   event type n is a pair-exchange b1 event\n");

	exit(EXIT_FAILURE);
}

/* This routine allows the user to define MANY sync events */
/* Returns 1 if type is a synchronization event, i.e. the predefined
   ALOG_EVENT_SYNC or one of the syncep user-defined types stored in
   syncevent[]; returns 0 otherwise. */
int is_sync_event( type )
int type;
{
int k, found;

found = (type == ALOG_EVENT_SYNC);
/* Stop scanning the user-defined table at the first match */
for (k = 0; !found && k < syncep; k++)
    found = (type == syncevent[k]);
return found;
}

/* Returns 1 if type is an a1 pair-exchange event, i.e. the predefined
   ALOG_EVENT_PAIR_A1 or one of the a1p user-defined types stored in
   a1event[]; returns 0 otherwise. */
int is_a1_event( type )
int type;
{
int k, found;

found = (type == ALOG_EVENT_PAIR_A1);
/* Stop scanning the user-defined table at the first match */
for (k = 0; !found && k < a1p; k++)
    found = (type == a1event[k]);
return found;
}

/* Returns 1 if type is an a2 pair-exchange event, i.e. the predefined
   ALOG_EVENT_PAIR_A2 or one of the a2p user-defined types stored in
   a2event[]; returns 0 otherwise. */
int is_a2_event( type )
int type;
{
int k, found;

found = (type == ALOG_EVENT_PAIR_A2);
/* Stop scanning the user-defined table at the first match */
for (k = 0; !found && k < a2p; k++)
    found = (type == a2event[k]);
return found;
}

/* Returns 1 if type is a b1 pair-exchange event, i.e. the predefined
   ALOG_EVENT_PAIR_B1 or one of the b1p user-defined types stored in
   b1event[]; returns 0 otherwise. */
int is_b1_event( type )
int type;
{
int k, found;

found = (type == ALOG_EVENT_PAIR_B1);
/* Stop scanning the user-defined table at the first match */
for (k = 0; !found && k < b1p; k++)
    found = (type == b1event[k]);
return found;
}

/*
   GlobalTime - convert a local clock value of processor p to global time.

   Input parameters:
   time  - local time to convert
   p     - processor (log file) index into lt0[], skew[], globaloffset[]
   nsync - not used by this routine

   Returns (time - lt0[p]) * skew[p] + globaloffset[p].
 */
double GlobalTime( time, p, nsync )
double time;
int    p, nsync;
{
/* Local time elapsed since this processor's first sync point */
double elapsed = time - lt0[p];

return elapsed * skew[p] + globaloffset[p];
}

/*
    This routine takes offset events and solves for the offsets.  The
    approach is:

    Let the global time be given by (local_time - offset)*scale ,
    with a different offset and scale on each processor.  Each processor
    originates exactly one communication event (except processor 0),
    generating an a1 and a2 event.  A corresponding number of b1 events
    are generated, but note that one processor may have more than 1 b1
    event (if using Dunnigan's synchronization, there will be np-1 b1 events
    on processor 0, and none anywhere else).

    These events are:

   pi   a1 (send to nbr)                        (recv) a2
   pj                     (recv) b1 (send back)

    We base the analysis on the assumption that in the GLOBAL time
    representation, a2-a1 is twice the time to do a (send) and
    a (recv).  This is equivalent to assuming that global((a1+a2)/2) ==
    global(b1).  Then, with the unknowns the offsets (the scales
    are assumed known from the syncevent calculation), the matrix is

    1
    -s0 s1
       ....
       -sj ... si

    where si is the scale for the i'th processor (note s0 = 1).
    The right hand sides are (1/2)(a1(i)+a2(i)) *s(i) - b1(j)*s(j).
    Because of the triangular nature of the matrix, this reduces to

       o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j))

    Note that if s(i)==s(j) and b1 == (a1+a2)/2, this gives o(i)==o(j).

    This works with ANY triangular matrix; we can use a master-slave
    version (all exchange with processor 0), a log-tree version
    (everyone exchanges with binary tree parent), or a linear version
    (2p+1 exchanges with 2p).  Others are possible.    
 */
/*
   ComputeOffsets - refine globaloffset[] from the pair-exchange events.

   Input parameter:
   np - number of processors (log files)

   Nothing is changed unless exactly np-1 exchanges were recorded
   (noffsetevents == np - 1); a message is printed on stderr instead.
   globaloffset[0] is always kept as computed from the sync events.
   For i = 1..np-1 the offset is recomputed from the exchange that
   processor i originated, using the already final offset of its
   partner j = offsetevents[i].p1, which therefore must satisfy
   0 <= j < i.  An exchange whose partner is outside that range is
   reported and skipped, leaving the sync-based offset of processor i.
 */
void ComputeOffsets( np )
int np;
{
int i, j;
double d1, delta;

/* If there aren't enough events, return */
if (noffsetevents != np - 1) {
    if (noffsetevents != 0) 
	fprintf( stderr, 
	   "Incorrect number of offset events to compute clock offsets\n" );
    else
	fprintf( stderr, "No clock offset events\n" );
    return;
    }

/* Take globaloffset[0] from sync */
for (i=1; i<np; i++) {
    j     = offsetevents[i].p1;
    /* j indexes skew[] and globaloffset[] below, and the triangular
       solve needs o(j) to be final already, so insist on 0 <= j < i */
    if (j < 0 || j >= i) {
	fprintf( stderr, 
		 "No valid offset exchange for processor %d (partner %d)\n",
		 i, j );
	continue;
	}
    /* o(i) = (a1(i)+a2(i))/2 - (s(j)/s(i)) * (b1(j)-o(j)) */
    d1    = (offsetevents[i].a2 + offsetevents[i].a1)/2;
    delta = (skew[j] / skew[i]) * (offsetevents[i].b1 - globaloffset[j] );

    globaloffset[i] = d1 - delta;
    }
}




