/*
** Modular Logfile Analyzer
** Copyright 2000 Jan Kneschke <jan@kneschke.de>
**
** Homepage: http://www.modlogan.org
**

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version, and provided that the above
    copyright and permission notice is included with all distributed
    copies of this or derived software.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA

**
** $Id: parse.c,v 1.76 2003/04/18 18:34:38 ostborn Exp $
*/

#define _GNU_SOURCE

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <errno.h>

#include "config.h"

#include "mlocale.h"
#include "mplugins.h"
#include "mrecord.h"
#include "mdatatypes.h"
#include "datatypes/query/datatype.h"
#include "datatypes/record/datatype.h"
#include "misc.h"

#include "plugin_config.h"


/* Converts a three-letter English month abbreviation (ignoring case) to
 * its month number 1..12, else it returns 0.
 *
 * Only the first three characters are examined, so the abbreviation may be
 * followed by further text (e.g. "Jan/2000:00:01:18").  All three letters
 * have to match; strings shorter than three characters yield 0. */
static inline int shortmonth_to_number(const char *month) {
        static const char names[12][4] = {
                "jan", "feb", "mar", "apr", "may", "jun",
                "jul", "aug", "sep", "oct", "nov", "dec"
        };
        int i;

        for (i = 0; i < 12; i++) {
                /* OR-ing with 32 folds ASCII upper case to lower case.
                 * A NUL byte becomes ' ' and never matches a letter, so
                 * the && chain stops before reading past the end of a
                 * short string. */
                if ((month[0] | 32) == names[i][0] &&
                    (month[1] | 32) == names[i][1] &&
                    (month[2] | 32) == names[i][2]) {
                        return i + 1;
                }
        }

        return 0;
}

/* Parses a fixed-layout CLF timestamp and stores it, converted to UTC,
 * in record->timestamp.
 *
 *   01/Jan/2000:00:01:18 +0100
 *   0  3   7    12 15 18 21       <- offsets of the individual fields
 *
 * ext_conf is not used.
 *
 * Returns M_RECORD_NO_ERROR on success, or M_RECORD_CORRUPT if the string
 * is too short for the fixed layout or the month name is not recognised.
 * A missing timezone suffix is treated as +0000.
 */
int parse_timestamp(mconfig *ext_conf, const char *str, mlogrec *record) {
	struct tm tm;
	size_t len;
	int month;
	long tz = 0; /* "+hhmm" read as one decimal number, e.g. +0100 -> 100 */

	if (str == NULL) return M_RECORD_CORRUPT;

	/* every field below is read from a fixed offset, so make sure the
	 * date and time part ("01/Jan/2000:00:01:18", 20 chars) is there */
	len = strlen(str);
	if (len < 20) return M_RECORD_CORRUPT;

	month = shortmonth_to_number(str + 3);
	if (month == 0) return M_RECORD_CORRUPT;

	/* don't hand uninitialised members (tm_isdst, ...) to mkutctime() */
	memset(&tm, 0, sizeof(tm));

	tm.tm_mday = strtol(str, NULL, 10);
	tm.tm_mon  = month - 1;
	tm.tm_year = strtol(str + 7, NULL, 10) - 1900;
	tm.tm_hour = strtol(str + 12, NULL, 10);
	tm.tm_min  = strtol(str + 15, NULL, 10);
	tm.tm_sec  = strtol(str + 18, NULL, 10);

	if (len > 21) {
		tz = strtol(str + 21, NULL, 10);
	}

	/* We ignore TZ here */
	record->timestamp = mkutctime(&tm);

	/* transform timestamp to UTC: split hhmm into hours and minutes
	 * (multiplying hhmm by 36 is only correct for whole-hour offsets) */
	record->timestamp -= (tz / 100) * 3600 + (tz % 100) * 60;

	return M_RECORD_NO_ERROR;
}

/* Splits a User-Agent string into a browser part and an operating-system
 * part and stores them in the extended-CLF record attached to 'record'
 * (req_useragent and req_useros).
 *
 * The string is rewritten by the first rule of conf->match_useragent for
 * which substitute() returns a result; that result is expected to have the
 * form "<useragent>;<os>".  Results are remembered in conf->ua_cache
 * (UA_CACHE_SIZE slots, keyed by the raw string); on a miss the slot with
 * the smallest timestamp is replaced.
 *
 * record->ext has to point to a mlogrec_web whose ext points to a
 * mlogrec_web_extclf.  A NULL 'str' is accepted and ignored.
 *
 * Always returns 0.
 */
int parse_useragent(mconfig *ext_conf,const char *str, mlogrec *record) {
	config_input *conf = ext_conf->plugin_conf;
	mlogrec_web *recweb = record->ext;
	mlogrec_web_extclf *recext = recweb->ext;
	mlist *l;
	char *ua_os = NULL;
	char *sep;
	char *new_key, *new_ua_os;
	int i, str_len;
	int oldest_index;

	if (!str) return 0;

	str_len = strlen(str);

	/* check the cache for an entry */
	for (i = 0; i < UA_CACHE_SIZE; i++) {
		const char *cached = conf->ua_cache[i].ua_os;
		const char *cached_sep;

		if (!conf->ua_cache[i].key || !cached) continue;
		if (0 != strcmp(conf->ua_cache[i].key, str)) continue;

		cached_sep = strchr(cached, ';');
		if (cached_sep) {
			/* same rule as for a fresh match below: an empty
			 * part leaves the corresponding field untouched */
			if (cached_sep != cached) {
				buffer_strcpy_len(recext->req_useragent, cached, cached_sep - cached);
			}
			if (cached_sep[1]) {
				buffer_strcpy(recext->req_useros, cached_sep + 1);
			}
		}

		conf->ua_cache[i].timestamp = record->timestamp;

		/* we had a cache hit */
		return 0;
	}

	for (l = conf->match_useragent; l; l = l->next) {
		mdata *data = l->data;

		if (!data) continue;

		if (NULL != (ua_os = substitute(ext_conf,
						data->data.match.match, data->data.match.study,
						data->key, str, str_len))) {
			break;
		}
	}

	/* no rule matched: nothing to store, nothing to cache */
	if (!ua_os) return 0;

	/* find the slot with the smallest timestamp; the comparison is made
	 * against the current candidate, not against slot 0 only */
	oldest_index = 0;
	for (i = 1; i < UA_CACHE_SIZE; i++) {
		if (conf->ua_cache[i].timestamp < conf->ua_cache[oldest_index].timestamp) {
			oldest_index = i;
		}
	}

	/* replace cache entry (the unsplit "<useragent>;<os>" is cached) */
	new_key = strdup(str);
	new_ua_os = strdup(ua_os);

	free(conf->ua_cache[oldest_index].ua_os);
	free(conf->ua_cache[oldest_index].key);

	if (new_key && new_ua_os) {
		conf->ua_cache[oldest_index].key = new_key;
		conf->ua_cache[oldest_index].ua_os = new_ua_os;
		conf->ua_cache[oldest_index].timestamp = record->timestamp;
	} else {
		/* out of memory: leave the slot empty instead of half filled,
		 * a key without a value would break the lookup above */
		free(new_key);
		free(new_ua_os);
		conf->ua_cache[oldest_index].key = NULL;
		conf->ua_cache[oldest_index].ua_os = NULL;
	}

	sep = strchr(ua_os, ';');
	if (sep) {
		*sep++ = '\0';

		if (*ua_os) {
			buffer_strcpy(recext->req_useragent, ua_os);
		}
		if (*sep) {
			buffer_strcpy(recext->req_useros, sep);
		}
	} else {
		fprintf(stderr, "%s.%d: incorrect match for %s, ';' is missing in group-string\n", __FILE__, __LINE__, ua_os);
	}
	free(ua_os);

	return 0;
}

/* Splits a request line "METHOD URI [PROTOCOL]" and stores the parts in
 * 'record': req_method, req_url, req_getvars (only if the URI contains a
 * '?') and req_protocol (only if a protocol part is present).
 *
 * Hand-written equivalent of
 *
 *    method     uri                           get-vars    protocol
 * "^([A-Za-z]+) (?:https?://[^/]*|)(.+?(?:\\?(.*?)|))(?: +(HTTP/.*?)|)$"
 *
 * A leading "http://host" or "https://host" is stripped from the URI.
 * req_url keeps the query string; req_getvars receives a copy of the text
 * after the first '?'.
 *
 * ext_conf is not used.
 *
 * Returns M_RECORD_NO_ERROR on success, M_RECORD_IGNORED if the request is
 * just "-", and M_RECORD_CORRUPT if the line cannot be split.
 */
int parse_url(mconfig *ext_conf,const char *str, mlogrec_web *record) {
	const char *method_end, *uri, *get, *protocol, *protocol_end;
	int len = strlen(str);

	if (len == 1 && *str == '-') {
/* if the url is '-' we don't have to cry about it.
 * if someone knows what a url == '-' is good for, tell me please.
 * doing it this way should suppress the warning.
 */
		return M_RECORD_IGNORED;
	}

	if (len < 2) return M_RECORD_CORRUPT;

	/* 3 parts, 3rd is optional */
	method_end = strchr(str, ' ');
	if (!method_end) return M_RECORD_CORRUPT;

	uri = method_end + 1;

	/* absolute URI: drop scheme and host */
	if (strncmp(uri, "http", 4) == 0) {
		int j = 4;

		if (uri[j] == 's') j++;

		if (strncmp(uri + j, "://", 3) == 0) {
			j += 3;

			/* skip everything excluding the / for the relative URI;
			 * start right behind "://" so that neither the string
			 * terminator nor the first '/' can be jumped over */
			while (uri[j] && uri[j] != '/') j++;

			uri += j;
		}
	}

	/* Ignore spaces around protocol if any */
	protocol_end = str + len - 1;
	while (*protocol_end == ' ') {
		protocol_end--;
		if (protocol_end == str) return M_RECORD_CORRUPT;
	}

	/* the protocol starts at the last space behind the URI's first char */
	protocol = NULL;
	if (protocol_end > uri) {
		protocol = memrchr(str, ' ', protocol_end - str);
		if (protocol && protocol <= uri) protocol = NULL;
	}

	if (protocol) {
		buffer_strcpy_len(record->req_url, uri, protocol - uri);

		/* only look for '?' inside the URI, never in the protocol,
		 * otherwise the length below would become negative */
		get = memchr(uri, '?', protocol - uri);
		if (get) {
			buffer_strcpy_len(record->req_getvars, get+1, protocol - (get + 1));
		}

		/* NOTE(review): 'protocol' points at the separating space, so
		 * the stored protocol starts with that space - confirm that
		 * consumers expect this before changing it */
		buffer_strcpy_len(record->req_protocol, protocol, protocol_end + 1 - protocol);
	} else {
		buffer_strcpy(record->req_url, uri);
		if ((get = strchr(uri, '?'))) {
			buffer_strcpy(record->req_getvars, get+1);
		}
	}

	buffer_strcpy_len(record->req_method, str, method_end - str);

	return M_RECORD_NO_ERROR;

}

/* Stores the referrer in 'record'.
 *
 * ref_url receives the complete string (query part included); if the
 * string contains a '?', everything behind the first one is additionally
 * copied to ref_getvars.
 *
 * ext_conf is not used.  Always returns 0.
 */
int parse_referrer(mconfig *ext_conf,const char *str, mlogrec_web_extclf *record) {
	char *query = strchr(str, '?');

	buffer_strcpy(record->ref_url, str);

	if (query != NULL) {
		buffer_strcpy(record->ref_getvars, query + 1);
	}

	return 0;
}

/* Parses one log line (held in 'b') with the fixed CLF regular expression
 * conf->match_clf and fills 'record'.
 *
 * record->ext is turned into a web record if it is not one already.  The
 * capture groups are used as follows:
 *
 *   1  host (stored as IP or as name, depending on is_ip())
 *   3  user            4  timestamp       5  request line
 *   6  status          7  transferred bytes
 *   9  referrer       10  user agent      (only if 11 groups matched)
 *
 * Returns M_RECORD_NO_ERROR on success, M_RECORD_CORRUPT if the line does
 * not match or a field is malformed, M_RECORD_IGNORED if a field parser
 * asks to skip the line, and M_RECORD_HARD_ERROR otherwise.
 */
int parse_record_pcre(mconfig *ext_conf, mlogrec *record, buffer *b) {
/* parenthesised: "3 * N" has to expand to 3 * (20 + 1), a multiple of 3 */
#define N (20 + 1)
	const char **list = NULL;
	int ovector[3 * N], n;
	config_input *conf = ext_conf->plugin_conf;
	mlogrec_web *recweb = NULL;
	int ret;

	if (record->ext_type != M_RECORD_TYPE_WEB) {
		if (record->ext_type != M_RECORD_TYPE_UNSET) {
			mrecord_free_ext(record);
		}

		record->ext_type = M_RECORD_TYPE_WEB;
		record->ext = mrecord_init_web();
	}

	recweb = record->ext;

	if (recweb == NULL) return M_RECORD_HARD_ERROR;

	if (strncmp("format=", b->ptr, 7) == 0) {
		fprintf(stderr, "%s.%d: detected a NetScape Server Log - perhaps it goes wrong\n", __FILE__, __LINE__);
		fprintf(stderr, "%s.%d: use the netscape plugin instead\n", __FILE__, __LINE__);
		return M_RECORD_HARD_ERROR;
	}

/* parse a CLF record */
	if ((n = pcre_exec(conf->match_clf, conf->match_clf_extra, b->ptr, b->used - 1, 0, 0, ovector, 3 * N)) < 0) {
		if (n == PCRE_ERROR_NOMATCH) {
			M_DEBUG1(ext_conf->debug_level, M_DEBUG_SECTION_PARSING, M_DEBUG_LEVEL_ERRORS,
				 "string doesn't match: %s\n", b->ptr);
			return M_RECORD_CORRUPT;
		} else {
			M_DEBUG1(ext_conf->debug_level, M_DEBUG_SECTION_PARSING, M_DEBUG_LEVEL_ERRORS,
				 "execution error while matching: %d\n", n);
			return M_RECORD_HARD_ERROR;
		}
	}

	/* list[0] is the whole match and list[7] is read below, so at least
	 * 8 entries are required; list[n] is only the terminating NULL */
	if (n < 8) {
		fprintf(stderr, "%s.%d: Matched fields below minimum: %d\n", __FILE__, __LINE__, n);
		return M_RECORD_HARD_ERROR;
	}

	if (pcre_get_substring_list(b->ptr, ovector, n, &list) != 0) {
		fprintf(stderr, "%s.%d: pcre_get_substring_list failed on %s\n", __FILE__, __LINE__, b->ptr);
		return M_RECORD_HARD_ERROR;
	}

	/* from here on every exit goes through 'done' to release the list */

	if (is_ip(list[1])) {
		buffer_strcpy(recweb->req_host_ip, (char *)list[1]);
	} else {
		buffer_strcpy(recweb->req_host_name, (char *)list[1]);
	}

	buffer_strcpy(recweb->req_user, (char *)list[3]);

	ret = parse_timestamp(ext_conf, list[4], record);
	switch (ret) {
	case M_RECORD_NO_ERROR:
		break;
	case M_RECORD_HARD_ERROR:
		fprintf(stderr, "%s.%d: parse_timestamp died on %s\n", __FILE__, __LINE__, b->ptr);
		goto done;
	case M_RECORD_IGNORED:
	case M_RECORD_CORRUPT:
		goto done;
	default:
		fprintf(stderr, "%s.%d: parse_timestamp return a unknown ret-value on %d\n",
			__FILE__, __LINE__,
			ret);
		ret = M_RECORD_HARD_ERROR;
		goto done;
	}

	ret = parse_url(ext_conf, list[5], recweb);
	switch (ret) {
	case M_RECORD_NO_ERROR:
		break;
	case M_RECORD_HARD_ERROR:
		fprintf(stderr, "%s.%d: parse_url died on %s\n", __FILE__, __LINE__, b->ptr);
		goto done;
	case M_RECORD_IGNORED:
	case M_RECORD_CORRUPT:
		goto done;
	default:
		fprintf(stderr, "%s.%d: parse_url return a unknown ret-value: %d\n",
			__FILE__, __LINE__,
			ret);
		ret = M_RECORD_CORRUPT;
		goto done;
	}

	recweb->req_status = strtol(list[6], NULL,10);
	recweb->xfersize = strtol(list[7], NULL,10);

	ret = M_RECORD_NO_ERROR;

	/* referrer & useragent */
	if (n == 11) {
		mlogrec_web_extclf *recext;

		recext = mrecord_init_web_extclf();
		recweb->ext_type = M_RECORD_TYPE_WEB_EXTCLF;
		recweb->ext = recext;

		if (recext != NULL) {
			if (parse_referrer(ext_conf, list[9], recext) == -1) {
				mrecord_free_web_extclf(recext);
				/* don't leave a pointer to freed memory behind */
				recweb->ext = NULL;
				fprintf(stderr, "%s.%d: parse_referrer died on %s\n", __FILE__, __LINE__, b->ptr);
				ret = M_RECORD_CORRUPT;
				goto done;
			}

			if (parse_useragent(ext_conf, list[10], record) == -1) {
				mrecord_free_web_extclf(recext);
				recweb->ext = NULL;
				fprintf(stderr, "%s.%d: parse_useragent died on %s\n", __FILE__, __LINE__, b->ptr);
				ret = M_RECORD_CORRUPT;
				goto done;
			}
		}
	} else if (n == 10) {
		/* squid
		 *
		 * NOTE(review): the extension is tagged as SQUID but created
		 * with mrecord_init_web_extclf() and none of its fields is
		 * filled - kept as found, please confirm the intent */
		recweb->ext_type = M_RECORD_TYPE_WEB_SQUID;
		recweb->ext = mrecord_init_web_extclf();
	}

done:
	free(list);

	return ret;
#undef  N
}

int parse_record_dynamic(mconfig *ext_conf, mlogrec *record, buffer *b) {
#define N 20 + 1
	const char **list;
	int ovector[3 * N], n, i, j, jlen;
	config_input *conf = ext_conf->plugin_conf;
	mlogrec_web *recweb = NULL;
	mlogrec_web_extclf *recext = NULL;
	int ret;

	/* remove the carriage return */
	if (b->ptr[b->used-2] == '\r') b->ptr[--b->used-1] = '\0';

	if (conf->match_clf == NULL) return M_RECORD_HARD_ERROR;
	
	if (record->ext_type != M_RECORD_TYPE_WEB) {
		if (record->ext_type != M_RECORD_TYPE_UNSET) {
			mrecord_free_ext(record);
		}
		
		record->ext_type = M_RECORD_TYPE_WEB;
		record->ext = mrecord_init_web();
	}

	recweb = record->ext;

	if (recweb->ext_type != M_RECORD_TYPE_WEB_EXTCLF) {
#if 0
		if (record->ext_type != M_RECORD_TYPE_WEB_UNSET) {
			mrecord_free_web_ext(record);
		}
#endif
		
		recweb->ext = mrecord_init_web_extclf();
		recweb->ext_type = M_RECORD_TYPE_WEB_EXTCLF;
	}
	recext = recweb->ext;

	if (recweb == NULL) return M_RECORD_HARD_ERROR;

/* parse a CLF record */
	if ((n = pcre_exec(conf->match_clf, conf->match_clf_extra, b->ptr, b->used - 1, 0, 0, ovector, 3 * N)) < 0) {
		if (n == PCRE_ERROR_NOMATCH) {
			/* FIXME! 
			   depend output on debuglevel
			   fprintf(stderr, "%s.%d: string doesn't match: %s\n", __FILE__, __LINE__, b->ptr); */  
			return M_RECORD_CORRUPT;
		} else {
			fprintf(stderr, "%s.%d: execution error while matching: %d\n", __FILE__, __LINE__, n);
			return M_RECORD_HARD_ERROR;
		}
	}

	pcre_get_substring_list(b->ptr, ovector, n, &list);
	
	ret = M_RECORD_NO_ERROR;

	for (i = 0; (i < n-1) && (ret == M_RECORD_NO_ERROR); i++) {
		switch (conf->trans_fields[i]) {
		case M_CLF_FIELD_TIMESTAMP:
			switch ( (ret = parse_timestamp(ext_conf, (char *)list[i+1], record)) ) {
			case M_RECORD_HARD_ERROR:
				fprintf(stderr, "%s.%d: parse_timestamp died on %s\n", __FILE__, __LINE__, b->ptr);
				break;
			case M_RECORD_NO_ERROR:
			case M_RECORD_IGNORED:
			case M_RECORD_CORRUPT:
				break;
			default:
				fprintf(stderr, "%s.%d: *args* on %s\n", __FILE__, __LINE__, b->ptr);
				ret = M_RECORD_HARD_ERROR;
				break;
			}

			break;
		case M_CLF_FIELD_REQ_HOST:
			if (is_ip(list[i+1])) {
				if (!recweb->req_host_ip) {
					buffer_strcpy(recweb->req_host_ip, (char *)list[i+1]);
				}
			} else {
				buffer_strcpy(recweb->req_host_name, (char *)list[i+1]);
			}
			break;
		case M_CLF_FIELD_REMOTE_IP:
			if (!recweb->req_host_ip) {
				buffer_strcpy(recweb->req_host_ip, (char *)list[i+1]);
			}
			break;
		case M_CLF_FIELD_USERNAME:
			buffer_strcpy(recweb->req_user, (char *)list[i+1]);
			break;
		case M_CLF_FIELD_STATUS:
			recweb->req_status = strtol(list[i+1], NULL, 10);
			break;
		case M_CLF_FIELD_DURATION:
			recext->duration = strtol(list[i+1], NULL, 10);
			break;
		case M_CLF_FIELD_BYTES_SEND:
			recweb->xfersize = strtod(list[i+1], NULL);
			break;
		case M_CLF_FIELD_SERVER_PORT:
			buffer_strcpy(recext->srv_port, (char *)list[i+1]);
			break;
		case M_CLF_FIELD_SERVER_IP:
			buffer_strcpy(recext->srv_host, (char *)list[i+1]);
			jlen = recext->srv_host->used;
			for(j = 0; j < jlen; j++) {
				recext->srv_host->ptr[j] = tolower(recext->srv_host->ptr[j]);
			}
			break;
		case M_CLF_FIELD_REQUEST:
			switch ( (ret = parse_url(ext_conf, list[i+1], recweb)) ) {
			case M_RECORD_HARD_ERROR:
				fprintf(stderr, "%s.%d: parse_url died on %s\n", __FILE__, __LINE__, b->ptr);
				break;
			case M_RECORD_NO_ERROR:
			case M_RECORD_IGNORED:
			case M_RECORD_CORRUPT:
				break;
			default:
				fprintf(stderr, "%s.%d: *args* on %s\n", __FILE__, __LINE__, b->ptr);
				ret = M_RECORD_HARD_ERROR;
				break;
			}
			break;
		case M_CLF_FIELD_USER_AGENT:
			switch ((ret = parse_useragent(ext_conf, list[i+1], record))) {
			case M_RECORD_HARD_ERROR:
				fprintf(stderr, "%s.%d: parse_useragent died on %s\n", __FILE__, __LINE__, b->ptr);
				break;
			case M_RECORD_NO_ERROR:
			case M_RECORD_IGNORED:
			case M_RECORD_CORRUPT:
				break;
			default:
				fprintf(stderr, "%s.%d: *args* on %s\n", __FILE__, __LINE__, b->ptr);
				ret = M_RECORD_HARD_ERROR;
				break;
			}
			break;
		case M_CLF_FIELD_REFERRER:
			switch ((ret = parse_referrer(ext_conf, list[i+1], recext))) {
			case M_RECORD_HARD_ERROR:
				fprintf(stderr, "%s.%d: parse_referrer died on %s\n", __FILE__, __LINE__, b->ptr);
				break;
			case M_RECORD_NO_ERROR:
			case M_RECORD_IGNORED:
			case M_RECORD_CORRUPT:
				break;
			default:
				fprintf(stderr, "%s.%d: *args* on %s\n", __FILE__, __LINE__, b->ptr);
				ret = M_RECORD_HARD_ERROR;
				break;
			}
			break;
		/* no mapping */
		default:
			break;
		}
	}

	free(list);

	return ret;
#undef  N
}


/* TODO: handle "!\n" as an expected line wrap
 *
 * ...bla!\n
 * saber\n
 *
 * becomes
 *
 * ...blasaber\n
 *
 */

/* Reads the next line of the input file into conf->buf and parses it
 * into 'record'.
 *
 * The line is handed to parse_record_dynamic() if conf->format is set and
 * to parse_record_pcre() otherwise; the parser's result is returned
 * unchanged.  For M_RECORD_CORRUPT the offending line is additionally
 * passed to M_DEBUG1 at warning level.
 *
 * Returns M_RECORD_HARD_ERROR if 'record' is NULL and M_RECORD_EOF if
 * mgets() returns NULL.
 */
int mplugins_input_clf_get_next_record(mconfig *ext_conf, mlogrec *record) {
	config_input *conf = ext_conf->plugin_conf;
	int status;

	if (!record) return M_RECORD_HARD_ERROR;

	/* fill the line buffer */
	if (mgets(&(conf->inputfile), conf->buf) == NULL) return M_RECORD_EOF;

	status = conf->format
		? parse_record_dynamic(ext_conf, record, conf->buf)
		: parse_record_pcre(ext_conf, record, conf->buf);

	if (status != M_RECORD_CORRUPT) return status;

	M_DEBUG1(ext_conf->debug_level, M_DEBUG_SECTION_PARSING, M_DEBUG_LEVEL_WARNINGS,
		 "affected Record: %s\n",
		 conf->buf->ptr
		 );

	return status;
}
