/*
------------------------------------------------------------------------
History
------------------------------------------------------------------------
*/
/*
Jan 23, 2000
   Started
*/
/*
------------------------------------------------------------------------
Usage
------------------------------------------------------------------------
*/
#define VERSION_STR " Version 0.92"
/*
Usage text printed by main when too few arguments are given: one string
per output line, terminated by a null pointer.  The literal 0 is used as
the terminator because no header defining NULL has been included yet at
this point in the file.
*/
char *Usage_m[] = {
"",
"   total [-d] [-v] [-N <slots>] <key-col> <data-col> <file>",
"",
"   Read text data <file> (or standard input if <file> is '-').  Each",
"   line contains a record consisting of whitespace delimited fields,",
"   some of which are numeric.  Use <key-col> (comma delimited list of",
"   columns starting at 1, or '-' for no key) as record keys, and for",
"   each unique set of record keys calculate the column values requested",
"   in <data-col>.  Lines starting with '#' are ignored.",
"",
"   -d         turn on debugging output",
"   -v         print the version and default number of hash slots, then exit",
"   -N <slots> number of hash table slots (one or greater)",
"",
"   <key-col>  comma delimited list of column numbers OR just '-'",
"              to indicate no keys, this groups all file records together.",
"   <data-col> comma delimited list of column numbers, each followed by",
"              an optional character:",
"                 s  sum (the default)",
"                 a  average",
"                 d  standard deviation",
"                 m  minimum",
"                 x  maximum",
"                 f  first value",
"                 l  last value",
"              Or instead of a column number, the letter 'n' to count",
"              the number of records.",
"",
"   Examples:",
"",
"     Input file:    burt dog  10 5",
"                    burt fish  1 1",
"                    bill dog   2 2",
"                    burt dog   5 5",
"                    bill fish  3 1",
"",
"     Command:       total 1,2 3,4 input.fil",
"",
"     Output:        burt dog  15 10",
"                    burt fish  1  1",
"                    bill dog   2  2",
"                    bill fish  3  1",
"",
"     Command:       total 2 4,3,n input.fil",
"",
"     Output:        dog  12 17 3",
"                    fish  2  4 2",
"",
"     Command:       total - n input.fil",
"",
"     Output:        5",
"",
VERSION_STR,
0
};
                   
/*
------------------------------------------------------------------------
Include files
------------------------------------------------------------------------
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include "hash.h"

/*
------------------------------------------------------------------------
DEFINES
------------------------------------------------------------------------
*/
/*  Capacity of the key, data and field-pointer arrays in main; get_fields
    stops storing field pointers once this many have been found  */
#define MAX_COL 256
/*  Size in bytes of the input line buffer and of the composed key buffer  */
#define NSTR    4192
/*  Statistic type codes; not referenced anywhere else in this file  */
#define V0      0
#define MIN     1
#define MAX     2
#define AVG     3
#define DEV     4
#define NDTYPE  5

/*  Byte type used when casting key and data buffers for the hash calls  */
#define U_CHAR unsigned char

/*  Default number of slots in hash table (can be overridden with -N)  */
#define N_HASH_SLOTS  500000



/*
------------------------------------------------------------------------
Debugging Macros
------------------------------------------------------------------------
*/
/*
WRITEMSG: print the current source file and line number to stdout and
flush stdout so the message is written out immediately.

The body is wrapped in do { } while (0) so the macro behaves as a single
statement (safe inside an unbraced if/else); follow it with a semicolon.
*/
#define WRITEMSG \
   do { \
      printf ("In file %s at line %i.\n", __FILE__, __LINE__); \
      fflush (stdout); \
   } while (0)

/*
WRITEVAR: print "FILE <file> LINE <line> :<name> = <value>" to stdout and
flush stdout.

VAR_NAME  expression to print; its source text is used as the label.
VAR_TYPE  bare printf conversion for the value, written without quotes
          (for example %d or %s); it is stringified to form the format.
*/
#define WRITEVAR(VAR_NAME,VAR_TYPE) \
   do { \
      printf ("FILE %s LINE %i :", __FILE__, __LINE__); \
      printf ("%s = ", #VAR_NAME); \
      printf (#VAR_TYPE, (VAR_NAME) ); \
      printf ("\n"); \
      fflush (stdout); \
   } while (0)

/*
------------------------------------------------------------------------
File variables
------------------------------------------------------------------------
*/
/*  Lines whose first non-blank character equals comment_char_m[0] are
    skipped by is_comment  */
char *comment_char_m = "#";
/*  Writable backing storage for the field separator.  main stores a new
    separator character through field_sep_m, which is undefined behaviour
    if the pointer refers to a string literal, so it points here instead  */
static char field_sep_store_m[2] = " ";
/*  Separator placed between key fields when main builds a multi-column key  */
char *field_sep_m    = field_sep_store_m;
/*  Non-zero once the -d option has been given  */
int  debug_m = 0;


/*
------------------------------------------------------------------------
------------------------------------------------------------------------
*/
/*
------------------------------------------------------------------------
------------------------------------------------------------------------
*/
/*
------------------------------------------------------------------------
Local Function Prototypes
------------------------------------------------------------------------
*/
/*  Character and line classification  */
int is_whitespace (int c);
int is_comment (char *s);

/*  Search the parallel (column, action) lists; -1 when there is no match  */
int find_action_col (int *list_col, int *list_action,
                     int col, int action, int nlist);

/*  Split a line in place into fields; returns the number of fields  */
int get_fields (char *s, char *fptr[]);

/*  Output and argument helpers  */
void prformat (double d);
int isnumber (char *str);

/*
------------------------------------------------------------------------
Main function
------------------------------------------------------------------------
*/
main (int argc, char *argv[]) {
	int arg;
	int nkey, ndata, nproc, nfield;
	int nrdata;
	int ikey, idata, iproc;
	int lastchr;
	int key_col[MAX_COL]; 
	int data_col[MAX_COL],   data_action[MAX_COL];
	int proc_col[2*MAX_COL], proc_action[2*MAX_COL];
	int i;
	char *keystr, *datastr, *ptr;
	char buffer [NSTR];
	double *databuf = NULL;
	double *rdata   = NULL;
	double  val;
	char keybuff[NSTR];
	char *fptr[MAX_COL];
	long   cnt;
	hlist_t **hkeydata = NULL;
	hlist_t *t         = NULL;
	FILE *fin;
	int res;
	int isum;
	double avg;
	int max_col;
	int optchar;
	int iline;
	int nhashslots = N_HASH_SLOTS;

	/*  Read command line options  */
	/*  Read options  */
	while (-1 != (optchar=getopt(argc,argv,"N:dv"))) {
		switch (optchar) {
			case '?':
			return 1;
		/*  Debugging option  */
		case 'd':
			debug_m = 1;
			printf ("Debugging mode is on\n");
			break;
		case 'v':
			printf ("Version %s (compiled %s)\n", VERSION_STR,__DATE__);
			printf ("Default number of hash slots = %d\n", N_HASH_SLOTS);
			return 0;
			break;
		case 'N':
			nhashslots = atoi(optarg);
			if (nhashslots<1) {
				printf (
				"ERROR: Number of hash slots (-N%d) must be one or greater\n", 
				nhashslots);
				exit(1);
			}
			break;
		default:
			return 1;
		}
	}

	/*  Print Usage  */
	if (argc<4) {
		char **ptr = Usage_m;
		printf ("%s\n", *ptr);
		while (*ptr) {
			printf ("%s\n", *(ptr++));
		}
		return 0;
	}

	/*  Read options  */
	while (EOF!=(arg=getopt(argc,argv,"F:"))) {
		switch (arg) {
			case 'F':
				field_sep_m[0] = optarg[0];
				field_sep_m[1] = '\0';
				break;
		}
	}

	
	/*  Parse key column arguement : a '-' means no key  */
	nkey=0;
	keystr = strdup (argv[optind++]);
	if (strcmp("-",keystr)) {
		ptr=strtok(keystr,",");
		while (ptr) {
			if (! isnumber(ptr) ) {
				printf ("ERROR: Key column argument is not numeric\n");
				return 1;
			}
			if (0>(val=atoi(ptr)-1)) {
				printf ("ERROR: Key column argument is 0\n");
				return 1;
			}
			key_col[nkey++] = atoi(ptr)-1;
			ptr = strtok(NULL,",");
		}
	}

	/*  
	Parse data column argument 

	data_col[] contains column index, data_action[] contains code n,a,s,d
   which stands for count, average, sum, standard deviation

	Note that legal parameter 'n' will result in column number -1.
	Need to take precautions.
	*/
	ndata=0;
	datastr = strdup (argv[optind++]);
	ptr=strtok(datastr,",");
	while (ptr) {
		/*  Get trailing character if a,m,n,s,x  */
		lastchr = ptr[strlen(ptr)-1];
		if (strchr("amnsxlf", lastchr)) {
			data_action[ndata] = lastchr;
			ptr[strlen(ptr)-1] = '\0';
		} else {
			data_action[ndata] = 's';
		}
		data_col[ndata++] = atoi(ptr)-1;
		ptr=strtok(NULL,",");
	}

	/*  Determine actual data processes from requested data  */	
	nproc=0;
	proc_col   [nproc  ] =   0;
	proc_action[nproc++] = 'n';
	for (i=0;i<ndata;i++) {

		/*  Check for duplicate action  */
		if (-1!=find_action_col
				(proc_col, proc_action, data_col[i], data_action[i],nproc) )
			continue;

		/*  Save data column action in process  */
		switch (data_action[i]) {
			/*  Field counts happens automatically  */
			case 'n':
				break;
			/*  Standard deviation requires summation also  */
			case 'd':
				proc_action[nproc  ] = 'd';
				proc_col   [nproc++] = data_col[i];
				/*  Include summation for this column if not already present  */
				if (! find_action_col 
						(proc_col, proc_action, data_col[i], 's', nproc) ) {
					proc_action[nproc  ] = 's';
					proc_col   [nproc++] = data_col[i];
				}
				break;
			case 'm':
			case 'x':
			case 'a':
			case 's':
			case 'f':
			case 'l':
				proc_action[nproc  ] = data_action[i];
				proc_col   [nproc++] = data_col[i];
				break;
		}
	}


	/*  Open file  (use stdin if file "-")  */
	if (!strcmp("-",argv[optind])) {
		fin = stdin;
	} else {
		fin = fopen (argv[optind], "rt");
		if (NULL==fin) {
			printf ("ERROR:  Cannot open input file <%s>\n", argv[optind]);
			return 1;
		}
	}

	/*  Find maximum key,data column  */
	max_col=0;
	for (idata=0;idata<ndata;idata++) 
		if (max_col<data_col[idata]) max_col = data_col[idata];
	for (ikey=0;ikey<nkey;ikey++) 
		if (max_col<key_col[ikey]) max_col = key_col[ikey];
	

	/*  Allocate data storage  */
	databuf  = (double *) calloc (nproc, sizeof(double));

	/*  Initialize hash table  */
	hkeydata = init_htable(nhashslots);

	/*  Read file  */
if (debug_m) {
	iline=0;
}
	while (NULL!=fgets(buffer,NSTR,fin)) {
		buffer[strlen(buffer)-1] = '\0';
if (debug_m) {
	iline++;
	if ((iline%10000) == 0) 
		printf ("debug: nhash %d line %d <%s>\n", getcount(), iline, buffer);
}
		if (is_comment(buffer))  continue;
		nfield = get_fields (buffer, fptr);
		/*  max_col runs from 0..n-1, while nfield runs from 1..n  */
		if (max_col>=nfield) continue;

		/*  Form key  */
		if (nkey)
			strcpy (keybuff, fptr[key_col[0]]);
		else
			keybuff[0]='\0';
		for (ikey=1;ikey<nkey;ikey++) {
			strcat (keybuff, field_sep_m);
			strcat (keybuff, fptr[key_col[ikey]]);
		}

		/*  Lookup key, process entry  */
		if ( 
			finddata (hkeydata,(U_CHAR *)&keybuff
				,strlen(keybuff)+1,(U_CHAR **)&rdata,&nrdata) 
			) {
			rdata[0] = rdata[0] + 1;
			for (iproc=1;iproc<nproc;iproc++) {
				val = atof(fptr[proc_col[iproc]]);
				switch (proc_action[iproc]) {
					case 'a':
					case 's':
						rdata[iproc] += val;
						break;
					case 'd':
						rdata[iproc] += val*val;
						break;
					case 'n':
						rdata[iproc] = rdata[iproc] + 1;
						break;
					case 'm':
						if (val<rdata[iproc]) rdata[iproc] = val;
						break;
					case 'x':
						if (val>rdata[iproc]) rdata[iproc] = val;
						break;
					/*  'f' means looking for first entry, its already there */
					case 'f':
						break;
					/*  'l' -> last entry  */
					case 'l':
						rdata[iproc] = val;
						break;
				}
			}
			
		/*  No key found, create new entry  */
		} else {
			databuf[0] = 1;
			for (iproc=1;iproc<nproc;iproc++) {
				val = atof(fptr[proc_col[iproc]]);
				switch (proc_action[iproc]) {
					case 'a':
					case 's':
						databuf[iproc] = val;
						break;
					case 'd':
						databuf[iproc] = val*val;
						break;
					case 'n':
						databuf[iproc] = 1;
						break;
					case 'm':
					case 'x':
					case 'f':
					case 'l':
						databuf[iproc] = val;
						break;
				}
			}
      	addnode (hkeydata, (U_CHAR *) &keybuff, strlen(keybuff)+1, 
				(U_CHAR *) databuf, nproc*sizeof(databuf[0]));

		}  /* add new entry  */

	}   /*  read file  */
		
	fclose(fin);


	/*  Write results  */
	getfirst(hkeydata);
	while ((t=getnext(hkeydata))) {
		/*  Print key  */
		printf ("%s", t->key);
		/*  Print requested fields  */
		rdata = (double *) t->data;
		for (idata=0;idata<ndata;idata++) {
			/*  If data action is 'n', just print first element  */
			if (data_action[idata]=='n') {
				printf (" %.0lf", rdata[0]);
				continue;
			}
			/*  Other actions  */
			iproc = find_action_col 
				(proc_col, proc_action, data_col[idata], data_action[idata], nproc);
			switch (data_action[idata]) {
				/*  Print sum, min, max as is  */
				case 's':
				case 'm':
				case 'x':
				case 'f':
				case 'l':
					prformat (rdata[iproc]);
					break;
				/*  Calculate average  */
				case 'a':
					if (rdata[0]==0) prformat(0.0);
					else             prformat(rdata[iproc]/rdata[0]);
					break;
				/*  Calculate standard deviation */
				case 'd':
					if (rdata[0]==0) prformat(0.0);
					else {
						/*  Find corresponding sum value  */
						isum = find_action_col
							(proc_col, proc_action, data_col[idata], 's', nproc);
						if (isum==-1) {
							fprintf (stderr, 
								"INTERNAL ERROR:  No sum stored for standard deviation calculation\n");
							exit(2);
						}
						avg = rdata[isum]/rdata[0];
						prformat (sqrt((rdata[iproc]-avg*avg)/rdata[0]));
					}
			}  /* end of data_action[] switch  */
		}  /*  end of data for this hash table element  */
		printf ("\n");
	}  /*  end of hash loop  */
		
	return 0;
}
	
/*
------------------------------------------------------------------------
Local functions
------------------------------------------------------------------------
*/
/*
Return 1 if c is a space, tab, newline or carriage return, else 0.

The string terminator is rejected explicitly: strchr also matches the
terminating '\0' of the set it searches, which would otherwise make
is_whitespace('\0') report true.
*/
int is_whitespace(int c) {
	if ('\0'==c) return 0;
	return (NULL!=strchr(" \t\n\r",c));
}

/*
Return non-zero if the first non-whitespace character of s equals the
first character of comment_char_m, else 0.  The string is not modified.
*/
int is_comment(char *s) {
	char *p;

	/*  Advance to the first character that is not whitespace (or to the
	    terminator if the line is all whitespace)  */
	for (p = s; *p != '\0'; p++) {
		if (!is_whitespace(*p)) break;
	}
	return (comment_char_m[0] == *p);
}

/*
Split the line s in place into whitespace separated fields.

The first whitespace character after each field is overwritten with a
string terminator and fptr[i] is set to point at the start of field i
inside s.  At most MAX_COL fields are stored; scanning stops there.

Returns the number of fields stored in fptr.
*/
int get_fields (char *s, char **fptr) {
	int count;
	char *cur = s;

	/*  Step over any whitespace in front of the first field  */
	while (*cur != '\0' && is_whitespace(*cur)) cur++;

	for (count = 0; *cur != '\0' && count < MAX_COL; count++) {
		fptr[count] = cur;
		/*  Run to the end of this field  */
		while (*cur != '\0' && !is_whitespace(*cur)) cur++;
		/*  Terminate the field unless it ended at the end of the line  */
		if (*cur != '\0') {
			*cur = '\0';
			cur++;
		}
		/*  Step over the remaining whitespace before the next field  */
		while (*cur != '\0' && is_whitespace(*cur)) cur++;
	}
	return count;
}


/*
Search the parallel arrays list_col[] / list_action[] (nlist entries each)
for the first entry whose column equals col and whose action equals action.

Returns the index of that entry, or -1 if there is none.
*/
int find_action_col 
(int *list_col, int *list_action, int col, int action, int nlist) { 
	int idx = 0;

	while (idx < nlist) {
		if (list_col[idx] == col && list_action[idx] == action) {
			return idx;
		}
		idx++;
	}
	return -1;
}


/*
Print d to stdout preceded by a single space: with no decimals when d
equals its rounded value (rint), otherwise in exponent notation.
*/
void prformat(double d) {
	int is_whole = (d == rint(d));

	if (!is_whole) {
		printf (" %e", d);
		return;
	}
	printf (" %.0lf", d);
}


/*
Return 1 if str consists solely of one or more decimal digits '0'..'9',
else 0.  Signs, decimal points and whitespace are not accepted, and the
empty string is not a number.
*/
int isnumber (char *str) {
	/*  An empty string would otherwise fall through the loop and be
	    reported as numeric  */
	if ('\0'==*str)
		return 0;
	for (; *str; str++) {
		if (*str<'0'  || *str>'9')
			return 0;
	}
	return 1;
}
