/**
 * Tool to generate a memory efficient hash table for a given input set.
 * (C) by Wolfgang Oertl 2005
 *
 * Terminology:
 * - hash table: a data set that maps keys to data.
 * - entry: a key/data pair
 * - hash value: a 32 bit integer computed from a hash key.
 * - hash size: the number of buckets.
 * - bucket: consists of zero or more entries.
 * - collision: multiple key/data pairs in one bucket.
 *
 * Limitations:
 * - If the input is stdin, only one pass can be made.
 * - The max. data size is 255 bytes
 * - keys cannot contain NUL bytes (this could be fixed easily; data can have
 *   NUL bytes).
 * - For a given input set and parameters, sometimes no valid output
 *   can be generated due to hash collisions.  In this case increase
 *   the hash table size.
 * - The actual keys are not stored; optionally the full hash value of the
 *   keys is stored, which should give sufficient protection against finding
 *   invalid keys.
 *
 * Version history:
 *  2005-07-18	first version
 *  2005-07-19	better command line parsing, many options, help.
 *  2008-03-03	more comments
 */

#include "config.h"

/* HASHFUNC must be a macro that expands to the name of the hash function to
 * use (presumably supplied by config.h or on the compiler command line --
 * confirm against the build system). */
#ifndef HASHFUNC
 #error "Please define HASHFUNC."
#endif

/* XSTR(x) yields the macro expansion of x as a string literal.  The extra
 * level through STR() is required because the # operator does not expand
 * its operand. */
#define XSTR(s) STR(s)
#define STR(s) #s


/* the actual hash function must be linked from another file.
 * p points to the key bytes, len is the key length in bytes; the return
 * value is the hash of the key. */
unsigned int HASHFUNC(const char *p, int len);

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

/* name of the input file, or "-" for stdin; set by main() */
char *ifname = NULL;
/* stream receiving the generated C code; main() only sets it when "real"
 * is nonzero */
FILE *ofile = NULL;
/* bucket count printed in detect_collision()'s collision message.  Note that
 * build_hash() takes a parameter of the same name, which hides this variable
 * inside that function. */
int hash_size;
/* running offset into the generated data string: store() advances it by the
 * number of bytes emitted, build_hash() resets it to 1 for every run */
int curr_ofs;
/* nonzero: really write the output; zero: only compute sizes and statistics.
 * main() sets it when exactly one table size is requested. */
int real = 0;
/* name of the generated "struct hash_info" variable (option -n) */
char *structname = "hash_info_test";

/* number of slots in the bucket-length histogram that build_hash() prints
 * with its statistics; buckets with more entries are counted in the last
 * slot.  This only limits that diagnostic output. */
#define MAX_BUCKET_LEN 50

/* 1=add keys as comments to output */
int debug = 0;

/* store the full hash value with each entry?  This (almost) avoids false
 * lookups at the expense of a larger output. */
int full_hash = 0;

/* While reading the input, construct a temporary hash table in memory using
 * this structure.  All entries that land in the same bucket are chained
 * through "next". */
struct hash_item {
	struct hash_item *next;	/* next entry of the same bucket, or NULL */
	unsigned int hash_value;	/* value returned by HASHFUNC for the key */
	char *key;		/* heap copy of the key, NUL terminated */
	char *value;		/* may contain NUL bytes! */
	int value_len;		/* length of the value */
};

/* Begin one hash bucket: open the string literal that will hold its bytes.
 * Nothing is written unless real output was requested. */
void store_start()
{
	if (!real)
		return;

	fputs("\t\"", ofile);
}

/* Finish one hash bucket: close the string literal opened by store_start().
 * Nothing is written unless real output was requested. */
void store_end()
{
	if (!real)
		return;

	fputs("\"\n", ofile);
}

/**
 * Get an item (part of a hash bucket) out the door.
 *
 * Accounts for "cnt" bytes in curr_ofs and, if real output is enabled,
 * appends them to the currently open C string literal in "ofile".
 *
 * cnt:  number of bytes at "data"; values <= 0 are ignored.
 * data: the raw bytes, which may include NUL bytes.
 *
 * Every byte is written straight to the stream, so there is no limit on
 * "cnt" (an escaped byte takes up to four output characters).
 */
void store(int cnt, unsigned char *data)
{
	int i;
	unsigned char c;

	if (cnt <= 0)
		return;

	/* the offset is tracked even on dry runs, for the size statistics */
	curr_ofs += cnt;

	if (!real)
		return;

	for (i = 0; i < cnt; i++) {
		c = data[i];
		if (c == '"' || c == '\\') {
			/* would end the literal resp. start an escape */
			fputc('\\', ofile);
			fputc(c, ofile);
		} else if (c >= ' ' && c < 127 && c != '(' && c != '?') {
			fputc(c, ofile);
		} else {
			/* Non-printable bytes go out as three digit octal
			 * escapes; three digits keep a following digit from
			 * being absorbed into the escape.  '?' is written the
			 * same way so that no trigraph ("??x") can appear in
			 * the output; '(' is kept escaped as before. */
			fprintf(ofile, "\\%03o", (unsigned int) c);
		}
	}
}


/**
 * Examine the bucket whose first entry is "hi" and decide how its entries
 * can be told apart.
 *
 * Returns -1 if the entries cannot be distinguished (an unresolvable
 * collision; a message is printed to stderr).  Otherwise returns the number
 * (0..3) of the hash value byte to store with each entry; for a bucket with
 * a single entry, and in full hash mode, this is always 0.
 *
 * On success "*count" receives the number of entries in the bucket; on
 * failure it is left untouched.
 */
int detect_collision(struct hash_item *hi, int *count)
{
	struct hash_item *a, *b;
	const unsigned char *bytes_a, *bytes_b;
	int byte_idx, entries, clash;

	/* A lone entry needs no discriminator at all. */
	if (hi->next == NULL) {
		*count = 1;
		return 0;
	}

	if (full_hash) {
		/* All 32 bits get stored, so the complete hash values only
		 * have to be pairwise different. */
		entries = 0;
		for (a = hi; a != NULL; a = a->next) {
			entries++;
			for (b = a->next; b != NULL; b = b->next) {
				if (a->hash_value == b->hash_value) {
					fprintf(stderr, "Collision.\n");
					return -1;
				}
			}
		}
		*count = entries;
		return 0;
	}

	/* One byte of the hash value has to suffice: take the first byte
	 * position at which all entries differ pairwise.  Quadratic in the
	 * bucket length, which is expected to be tiny. */
	for (byte_idx = 0; byte_idx < 4; byte_idx++) {
		clash = 0;
		entries = 0;
		for (a = hi; a != NULL && !clash; a = a->next) {
			entries++;
			bytes_a = (const unsigned char *) &a->hash_value;
			for (b = a->next; b != NULL; b = b->next) {
				bytes_b = (const unsigned char *) &b->hash_value;
				if (bytes_a[byte_idx] == bytes_b[byte_idx]) {
					clash = 1;
					break;
				}
			}
		}
		if (!clash) {
			*count = entries;
			return byte_idx;
		}
	}

	fprintf(stderr, "Size %d: Collision!\n", hash_size);
	return -1;
}



/* Free every entry of every bucket and then the bucket array itself.
 * A NULL array is accepted. */
static void free_hash_array(struct hash_item **hash_array, int hash_size)
{
	struct hash_item *hi, *next;
	int i;

	if (!hash_array)
		return;

	for (i = 0; i < hash_size; i++) {
		for (hi = hash_array[i]; hi; hi = next) {
			next = hi->next;
			free(hi->key);
			free(hi->value);
			free(hi);
		}
	}
	free(hash_array);
}

/* Decode one input line in place, stopping at the newline or end of string:
 * "\\" becomes a single backslash and a backslash followed by one to three
 * octal digits becomes the byte with that value.  A backslash followed by
 * anything else is kept literally.  The result is NUL terminated; the
 * return value points at that terminator, which is needed by the caller
 * because the decoded text may itself contain NUL bytes. */
static char *unescape_line(char *line)
{
	char *s = line, *d = line;
	unsigned int value;
	int digits;

	while (*s && *s != '\n') {
		if (*s != '\\') {
			*d++ = *s++;
			continue;
		}
		s++;
		if (*s == '\\') {
			*d++ = *s++;
			continue;
		}

		/* Consume only the digits that are really there, so that a
		 * short or malformed escape can never step past the end of
		 * the line. */
		value = 0;
		for (digits = 0; digits < 3 && *s >= '0' && *s <= '7'; digits++)
			value = value * 8 + (unsigned int) (*s++ - '0');

		if (digits == 0) {
			*d++ = '\\';
			continue;
		}
		*d++ = (char) value;
	}

	*d = 0;
	return d;
}

/**
 * Build the hash table for the given size.
 *
 * Reads "key,data" lines from the file named by ifname ("-" = stdin), sorts
 * them into hash_size buckets and prints statistics to stderr.  If the
 * global "real" is set, the table is also written to "ofile" as C source;
 * otherwise no output is produced and only the sizes are computed.
 *
 * Returns 0 on success and 1 on error (unreadable input, data that is too
 * long, out of memory, or a bucket whose entries cannot be told apart).
 * All memory is released on every path, so the function can be called
 * repeatedly.
 *
 * NOTE(review): a collision for one particular size is reported with the
 * same return value as a permanent error -- confirm that callers scanning a
 * range of sizes really want to stop there.
 */
int build_hash(int hash_size)
{
	/* the data length is stored in a single byte */
	enum { MAX_DATA_LEN = 255 };

	FILE *ifile;
	struct hash_item **hash_array = NULL, *hi;
	int *offsets = NULL;
	unsigned int hash, bucket, nbuckets;
	int line, collisions = 0, key_len, data_len;
	int read_failed = 0, rc = 1;
	char *s, *d, *pos, buffer[400];
	unsigned char len_char;
	/* one entry: up to 4 bytes of hash value, the length byte, the data */
	unsigned char data[4 + 1 + MAX_DATA_LEN];
	int bytenr, i, len2, count, buckets_written = 0;
	const char *ofs_type;
	int ofs_size, this_line, cnt;
	int bucket_length[MAX_BUCKET_LEN];

	if (hash_size <= 0) {
		fprintf(stderr, "Invalid hash size %d\n", hash_size);
		return 1;
	}
	nbuckets = (unsigned int) hash_size;

	if (*ifname == '-' && ifname[1] == 0)
		ifile = stdin;
	else {
		ifile = fopen(ifname, "r");
		if (!ifile) {
			fprintf(stderr, "Can't open input file %s: %s\n",
				ifname, strerror(errno));
			return 1;
		}
	}

	/* build a simple hash */
	hash_array = calloc(nbuckets, sizeof *hash_array);
	if (!hash_array) {
		fprintf(stderr, "Out of memory\n");
		fclose(ifile);
		return 1;
	}

	for (line = 0; fgets(buffer, sizeof buffer, ifile); line++) {
		if (buffer[0] == '\n')
			continue;

		/* replace \nnn with single characters */
		d = unescape_line(buffer);
		s = buffer;

		pos = strchr(s, ',');
		if (!pos) {
			fprintf(stderr, "%d: line without comma, ignoring\n",
				line);
			continue;
		}
		*pos++ = 0;

		/* s: start of key
		 * pos: start of data; pos-1=end of key
		 * d: end of data + 1
		 */
		key_len = (int) ((pos - 1) - s);
		data_len = (int) (d - pos);

		if (data_len > MAX_DATA_LEN) {
			fprintf(stderr, "%d: line with data length > 255 "
				"(%d).\n", line, data_len);
			read_failed = 1;
			break;
		}

		hash = HASHFUNC(s, key_len);
		bucket = hash % nbuckets;

		/* make item; never request 0 bytes so that a NULL result
		 * always means failure */
		hi = malloc(sizeof *hi);
		if (hi) {
			hi->key = malloc((size_t) key_len + 1);
			hi->value = malloc(data_len > 0 ? (size_t) data_len : 1);
		}
		if (!hi || !hi->key || !hi->value) {
			if (hi) {
				free(hi->key);
				free(hi->value);
				free(hi);
			}
			fprintf(stderr, "Out of memory\n");
			read_failed = 1;
			break;
		}
		memcpy(hi->key, s, (size_t) key_len + 1);
		memcpy(hi->value, pos, (size_t) data_len);
		hi->value_len = data_len;
		hi->hash_value = hash;

		/* prepend to the bucket's chain */
		hi->next = hash_array[bucket];
		if (hi->next)
			collisions++;
		hash_array[bucket] = hi;
	}

	if (!read_failed && ferror(ifile)) {
		fprintf(stderr, "Error reading input file %s\n", ifname);
		read_failed = 1;
	}
	fclose(ifile);
	if (read_failed)
		goto out;

	/* generate result buffer */
	offsets = calloc(nbuckets, sizeof *offsets);
	if (!offsets) {
		fprintf(stderr, "Out of memory\n");
		goto out;
	}

	if (real) {
		fprintf(ofile, "#include \"hash-simple.h\"\n");
		fprintf(ofile, "static const unsigned char data_string[] =\n");
	}

	/* offsets start at 1; an offset of 0 marks an empty bucket */
	curr_ofs = 1;
	for (hash = 0; hash < nbuckets; hash++) {

		hi = hash_array[hash];
		if (!hi)
			continue;

		offsets[hash] = curr_ofs;

		/* optionally add debug info */
		if (real && debug) {
			fprintf(ofile, "// hash %u, ofs %d, entries:",
				hash, curr_ofs);
			for (; hi; hi = hi->next)
				fprintf(ofile, " %s", hi->key);
			fprintf(ofile, "\n");
			hi = hash_array[hash];
		}

		/* -1 is a collision; anything else outside 0..3 would be a
		 * bug in detect_collision() */
		bytenr = detect_collision(hi, &count);
		if (bytenr < 0 || bytenr > 3)
			goto out;

		/* count-1 has to fit into the whole byte (full hash) or
		 * into its lower six bits (byte number in the upper two) */
		if (count > (full_hash ? 256 : 64)) {
			fprintf(stderr, "Overfull hash bucket\n");
			goto out;
		}
		if (full_hash)
			len_char = (unsigned char) (count - 1);
		else
			len_char = (unsigned char)
				(((count - 1) & 0x3f) + (bytenr << 6));

		/* start the bucket */
		store_start();
		store(1, &len_char);

		/* make the entries of this bucket */
		for (hi = hash_array[hash]; hi; hi = hi->next) {

			/* part or full hash value */
			len2 = 0;
			if (full_hash) {
				memcpy(data, &hi->hash_value, 4);
				len2 = 4;
			} else if (count > 1) {
				data[0] = ((unsigned char*)&hi->hash_value)
					[bytenr];
				len2 = 1;
			}

			/* length of data, data */
			i = hi->value_len;
			data[len2++] = (unsigned char) i;
			memcpy(data + len2, hi->value, (size_t) i);
			len2 += i;

			store(len2, data);
		}
		store_end();
		buckets_written++;
	}
	if (real) {
		/* without any bucket the initializer would be missing and
		 * the generated file would not compile */
		if (!buckets_written)
			fputs("\t\"\"\n", ofile);
		fprintf(ofile, ";\n\n");
	}

	/* what is the offset size needed? */
	if (curr_ofs < 256) {
		ofs_type = "unsigned char";
		ofs_size = 1;
	} else if (curr_ofs < 65536) {
		ofs_type = "unsigned short";
		ofs_size = 2;
	} else {
		ofs_type = "unsigned int";
		ofs_size = 4;
	}

	/* generate pointers */
	if (real) {
		fprintf(ofile, "static const %s offset_table[] = {\n",
			ofs_type);

		this_line = 0;
		for (hash = 0; hash < nbuckets; hash++) {
			fprintf(ofile, "%d,", offsets[hash]);
			if (++this_line > 10) {
				fprintf(ofile, "\n");
				this_line = 0;
			}
		}

		fprintf(ofile, "};\n\n");
	}

	/* print statistics */
	fprintf(stderr, "sz: %6d, d: %6d, i: %6d, t: %6d, c: %6d -",
		hash_size,				/* buckets */
		curr_ofs,				/* data size */
		hash_size * ofs_size,			/* index size*/
		curr_ofs + hash_size * ofs_size,	/* total size */
		collisions);				/* collision count */

	/* show a histogram of the item count in the buckets. 0=empty */
	memset(bucket_length, 0, sizeof(bucket_length));

	for (hash = 0; hash < nbuckets; hash++) {
		cnt = 0;
		for (hi = hash_array[hash]; hi; hi = hi->next)
			cnt++;
		if (cnt >= MAX_BUCKET_LEN)
			cnt = MAX_BUCKET_LEN - 1;
		bucket_length[cnt]++;
	}

	/* find highest count */
	for (cnt = MAX_BUCKET_LEN - 1; cnt >= 0; cnt--)
		if (bucket_length[cnt])
			break;

	for (i = 0; i <= cnt; i++)
		fprintf(stderr, " %d=%d", i, bucket_length[i]);
	fprintf(stderr, "\n");

	/* generate the meta structure of this hash */
	if (real) {
		fprintf(ofile, "unsigned int %s(const char*, int);\n",
			XSTR(HASHFUNC));
		fprintf(ofile, "const struct hash_info %s = "
			"{\n\t%d,\n\t%d,\n\t%d,\n\t%s,\n\tdata_string,"
			"\n\t(const unsigned char*) offset_table\n};\n",
			structname, full_hash, hash_size, ofs_size,
			XSTR(HASHFUNC));
	}

	rc = 0;

out:
	/* free all structures -- important for multiple runs. */
	free(offsets);
	free_hash_array(hash_array, hash_size);

	return rc;
}

/* Print the usage summary and the list of options to stdout. */
void show_help()
{
	static const char help_text[] =
"Usage: generate [OPTIONS] [FILE]\n"
"Generate a static hash table as compileable C file for a given input.\n"
"With no FILE, or when FILE is -, read standard input.\n"
"\n"
"Valid input lines are a key, followed by a comma and the data.\n"
"The resulting table does not contain the keys and possibly not\n"
"even the hash values, so that random strings might be found.\n"
"\n"
"By default, one hash table is generated and written to stdout.  To rapidly\n"
"try different hash table sizes, you can use the s, e and i options.\n"
"\n"
"Statistics displayed are: sz (size of hash table, i.e. number of entries),\n"
"d (size of data in bytes), i (size of index in bytes), t (total size),\n"
"c (collision count), followed by a list of bucket sizes and counters.\n"
"\n"
"  -h        show this help\n"
"  -s n      set the size of the hash table to generate\n"
"  -e n      end size of the range of hash tables\n"
"  -i n      interval (default is 1) of the size\n"
"  -o file   write output to this file instead of stdout\n"
"  -d        debug: add keys as comments to output file\n"
"  -n str    name of the generated structure (default = hash_info_test)\n"
"  -f        store the full hash value with each entry\n"
"\n"
"Copyright (C) 2005 Wolfgang Oertl under the terms of the GNU GPLv2\n"
"Please send suggestions, bugs reports and praise to:\n"
"wolfgang.oertl@gmx.at\n";

	/* the text holds no conversion specifications, so it can be
	 * written verbatim */
	fputs(help_text, stdout);
}

int main(int argc, char **argv)
{
	int sz, size1 = -1, size2 = -1, interval=1, rc;
	const char *ofname = NULL;

	while ((rc = getopt(argc, argv, "s:e:i:o:n:hdf")) != -1) {
		switch (rc) {
			case 's':
				size1 = atoi(optarg);
				break;
			case 'e':
				size2 = atoi(optarg);
				break;
			case 'i':
				interval = atoi(optarg);
				if (interval <= 0) {
					fprintf(stderr,
						"Interval can't be <= 0\n");
					return 1;
				}
				break;
			case 'd':
				debug = 1;
				break;
			case 'o':
				ofname = optarg;
				break;
			case 'h':
				show_help();
				return 0;
			case 'n':
				structname = optarg;
				break;
			case 'f':
				full_hash = 1;
				break;
			default:
				fprintf(stderr, "Unknown option character %c\n",
					rc);
			case '?':
				return 1;
		}
	}

	if (size1 <= 2) {
		fprintf(stderr, "Please specify a valid size with -s\n");
		return 1;
	}

	ifname = (optind == argc) ? "-" : argv[optind];

	if (size2 == -1)
		size2 = size1;

	real = (size1 == size2);

	if (!real) {
		if (ofname)
			/* not real, but anyway an output file name given? */
			fprintf(stderr, "Warning: no output will be generated.\n");
		if (ifname[0] == '-' && ifname[1] == 0
				&& size1 + interval <= size2) {
			fprintf(stderr, "Error: when input is stdin, only one "
				"pass can be made.\n");
			return 1;
		}
	} else {
		if (ofname) {
			ofile = fopen(ofname, "w");
			if (!ofile) {
				fprintf(stderr, "Can't open output file %s: %s\n",
					ofname, strerror(errno));
				return 1;
			}
		} else {
			ofile = stdout;
		}
	}


	for (sz = size1; sz <= size2; sz += interval) {
		rc = build_hash(sz);
		if (rc)
			break;
	}
	
	return 0;
}

