/* $Id: regex.c,v 1.6 2001/11/30 10:10:24 ukai Exp $ */
/* 
 * regex: Regular expression pattern match library
 * 
 * by A.ITO, December 1989
 */

#ifdef REGEX_DEBUG
#include <sys/types.h>
#include <malloc.h>
#endif				/* REGEX_DEBUG */
#include <ctype.h>
#include <gc.h>
#include "fm.h"
#include "myctype.h"
#include "regex.h"

#ifdef USE_M17N
#ifdef USE_UNICODE
#include "ucs.h"
#endif
#endif

/*
 * set_longchar: build a longchar from the character that starts at `str'.
 *
 * With USE_M17N, a lead byte with the high bit set is handed to
 * wtf_parse1() and the result is tagged RE_TYPE_WCHAR_T; its ccs is
 * reduced with WC_CCS_SET(), or (with USE_UNICODE) forced to WC_CCS_UCS4
 * after converting a UCS-tag code with wc_ucs_tag_to_ucs().
 * In every other case the single byte is stored and tagged RE_TYPE_CHAR.
 */
longchar
set_longchar(char *str)
{
    unsigned char *cur = (unsigned char *)str;
    longchar lc;

#ifdef USE_M17N
    if ((*cur & 0x80) != 0) {
	lc.wch = wtf_parse1(&cur);
#ifdef USE_UNICODE
	if (!WC_CCS_IS_UNICODE(lc.wch.ccs)) {
	    lc.wch.ccs = WC_CCS_SET(lc.wch.ccs);
	}
	else {
	    if (WC_CCS_SET(lc.wch.ccs) == WC_CCS_UCS_TAG)
		lc.wch.code = wc_ucs_tag_to_ucs(lc.wch.code);
	    lc.wch.ccs = WC_CCS_UCS4;
	}
#else
	lc.wch.ccs = WC_CCS_SET(lc.wch.ccs);
#endif
	lc.type = RE_TYPE_WCHAR_T;
	return lc;
    }
#endif
    /* plain single-byte character */
    lc.ch = *cur;
    lc.type = RE_TYPE_CHAR;
    return lc;
}

/* Regex used by the entry points that take no Regex argument:
 * regexCompile(), regexMatch() and matchedPosition(). */
static Regex DefaultRegex;
/* Shorthands for the atom array and the character storage of DefaultRegex. */
#define CompiledRegex DefaultRegex.re
#define Cstorage DefaultRegex.storage

/* Next free longchar slot of the regex being compiled; newRegex() points
 * it at regex->storage before it starts parsing. */
static longchar *st_ptr;

/* Forward declarations of the internal matching routines defined below. */
static int regmatch(regexchar *, char *, int, int, char **);
static int regmatch1(regexchar *, longchar *);
static int matchWhich(longchar *, longchar *);
static int match_longchar(longchar *, longchar *, int mode);
static int match_range_longchar(longchar *, longchar *, longchar *);

/* 
 * regexCompile: compile `ex' into the file-wide default regex.
 *
 * `igncase' is passed through to newRegex().  The return value is the
 * message newRegex() stored: NULL on success, an error text otherwise.
 */
char *
regexCompile(char *ex, int igncase)
{
    char *errmsg;

    (void)newRegex(ex, igncase, &DefaultRegex, &errmsg);
    return errmsg;
}

/*
 * newRegex: compile the regular expression `ex' into `regex'.
 *
 * Recognised syntax: `.' (any character), `^' and `$' (anchors),
 * `*' (repeat the preceding atom), `[...]' / `[^...]' (character class,
 * `a-b' ranges, `\' escapes) and `\c' (literal c).  When `igncase' is
 * non-zero, literal atoms are flagged RE_IGNCASE.
 *
 * regex: destination; when NULL a new Regex is allocated from the GC heap.
 * msg:   when non-NULL, receives NULL on success or a static error text.
 *
 * Returns the compiled regex, or NULL on error.
 *
 * NOTE(review): the bounds below assume regex->re has REGEX_MAX elements
 * and regex->storage has STORAGE_MAX elements -- confirm against regex.h.
 */
Regex *
newRegex(char *ex, int igncase, Regex *regex, char **msg)
{
    char *p;
    char *errmsg;
    longchar *r;
    longchar *st_limit;		/* storage slots at or past this are off limits */
    regexchar *re;		/* last atom emitted, NULL while there is none */
    regexchar *re_next;		/* next free atom slot */
    regexchar *re_limit;	/* slot reserved for the end mark */
    int m, n;

    if (regex == NULL) {
	/* The Regex holds pointers (position/lposition), so it must not
	 * be allocated as pointer-free ("atomic") memory. */
	regex = (Regex *)GC_malloc(sizeof(Regex));
	if (regex == NULL) {
	    errmsg = "Out of memory";
	    goto fail;
	}
    }
    /* Derive every cursor and limit from the regex actually being
     * compiled, and only after a possible allocation. */
    re = NULL;
    re_next = regex->re;
    re_limit = &regex->re[REGEX_MAX - 1];
    st_ptr = regex->storage;
    st_limit = &regex->storage[STORAGE_MAX - 1];

    for (p = ex; *p != '\0'; p++) {
	switch (*p) {
	case '.':
	case '$':
	case '^':
	    if (re_next >= re_limit)
		goto too_long;
	    re = re_next++;
	    re->pattern = NULL;
	    if (*p == '.')
		re->mode = RE_ANY;
	    else if (*p == '$')
		re->mode = RE_END;
	    else
		re->mode = RE_BEGIN;
	    break;
	case '*':
	    /* `*' needs a preceding atom that can be repeated */
	    if (re == NULL ||
		(!(re->mode & RE_ANY) && re->pattern == NULL)) {
		errmsg = "Invalid regular expression";
		goto fail;
	    }
	    re->mode |= RE_ANYTIME;
	    break;
	case '[':
	    r = st_ptr;
	    p++;
	    if (*p == '^') {
		p++;
		m = RE_EXCEPT;
	    }
	    else
		m = RE_WHICH;
	    while (1) {
		if (*p == '\\')
		    p++;
		else if (*p == ']')
		    break;
		else if (*p == '-') {
		    /* range markers consume storage too */
		    if (st_ptr >= st_limit)
			goto too_long;
		    (st_ptr++)->type = RE_WHICH_RANGE;
		    p++;
		    continue;
		}
		if (*p == '\0') {
		    errmsg = "Missing ]";
		    goto fail;
		}
		if (st_ptr >= st_limit)
		    goto too_long;
		n = get_mclen(p);
		*(st_ptr++) = set_longchar(p);
		p += n;
	    }
	    if (st_ptr >= st_limit || re_next >= re_limit)
		goto too_long;
	    (st_ptr++)->type = RE_TYPE_END;
	    re = re_next++;
	    re->pattern = r;
	    re->mode = m;
	    break;
	case '\\':
	    p++;
	    if (*p == '\0') {
		errmsg = "terminated after '\\'.";
		goto fail;
	    }
	    /* FALLTHROUGH: the escaped character is a literal */
	default:
	    if (st_ptr >= st_limit || re_next >= re_limit)
		goto too_long;
	    re = re_next++;
	    n = get_mclen(p);
	    *st_ptr = set_longchar(p);
	    p += n - 1;		/* the for loop adds the last byte */
	    re->pattern = st_ptr++;
	    re->mode = RE_NORMAL;
	    if (igncase)
		re->mode |= RE_IGNCASE;
	    break;
	}
    }
    /* re_next never passes re_limit, so the end mark stays in bounds */
    re_next->mode = RE_ENDMARK;
    if (msg)
	*msg = NULL;
    return regex;

  too_long:
    errmsg = "Regular expression too long";
  fail:
    if (msg)
	*msg = errmsg;
    return NULL;
}

/* 
 * regexMatch: match regular expression
 */
int
regexMatch(char *str, int len, int firstp)
{
    return RegexMatch(&DefaultRegex, str, len, firstp);
}

/*
 * RegexMatch: search `str' for the first position where `re' matches.
 *
 * len:    number of bytes to search; 0 means strlen(str).
 * firstp: non-zero when the start of `str' may satisfy a `^' anchor.
 *
 * Returns 1 after recording the match start in re->position (the match
 * end is stored in re->lposition by regmatch()), 0 when nothing matches
 * or `str' is NULL, and -1 if regmatch() reports -1.
 */
int
RegexMatch(Regex *re, char *str, int len, int firstp)
{
    char *cur, *end;
    int result;

    if (str == NULL)
	return 0;

    re->position = NULL;
    end = str + ((len == 0) ? strlen(str) : len);
    cur = str;
    while (cur < end) {
	/* only the very first position counts as "beginning" */
	result = regmatch(re->re, cur, end - cur,
			  firstp && (cur == str), &re->lposition);
	if (result == 1) {
	    re->position = cur;
	    return 1;
	}
	if (result == -1) {
	    re->position = NULL;
	    return -1;
	}
	/* step over one (possibly multibyte) character */
	cur += get_mclen(cur);
    }
    return 0;
}

/* 
 * MatchedPosition: report the span recorded in `re'.
 *
 * Stores re->position in *first and re->lposition in *last.
 * RegexMatch() resets re->position to NULL on entry and sets it to the
 * match start when it returns 1.
 */
void
MatchedPosition(Regex *re, char **first, char **last)
{
    *first = re->position;
    *last = re->lposition;
}

/*
 * matchedPosition: report the span recorded in the file-wide default
 * regex; same contract as MatchedPosition().
 */
void
matchedPosition(char **first, char **last)
{
    MatchedPosition(&DefaultRegex, first, last);
}

/* 
 * Internal routines
 */
/*
 * regmatch: try to match the atom sequence `re' at the start of `str'.
 *
 * len:     number of bytes of `str' that may be consumed.
 * firstp:  non-zero when a `^' atom is allowed to succeed.
 * lastpos: when non-NULL, receives the end of the match (NULL when
 *          there is no match).
 *
 * Returns 1 on match, 0 otherwise.  A `*' atom is handled by trying the
 * rest of the pattern after every repetition count and keeping the last
 * (longest) success.
 */
static int
regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos)
{
    char *p = str, *ep = str + len;
    char *lpos, *llpos = NULL;
    longchar k;
    int n;

    if (lastpos != NULL)
	*lastpos = NULL;
#ifdef REGEX_DEBUG
    debugre(re, str);
#endif				/* REGEX_DEBUG */
    while ((re->mode & RE_ENDMARK) == 0) {
	if (re->mode & RE_BEGIN) {
	    if (!firstp)
		return 0;
	    re++;
	}
	else if (re->mode & RE_ANYTIME) {
	    short ok = 0;

	    for (;;) {
		/* does the remainder match after this many repeats? */
		if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) {
		    llpos = lpos;
		    ok = 1;
		}
		if (p >= ep)
		    break;
		n = get_mclen(p);
		k = set_longchar(p);
		if (!regmatch1(re, &k))
		    break;
		p += n;
	    }
	    if (lastpos != NULL)
		*lastpos = llpos;
	    return ok;
	}
	else if (re->mode & RE_END) {
	    if (lastpos != NULL)
		*lastpos = p;
	    return (p >= ep);
	}
	else {
	    /* A single-character atom needs a character inside the
	     * subject; never decode or match anything at or past ep. */
	    if (p >= ep)
		return 0;
	    n = get_mclen(p);
	    k = set_longchar(p);
	    p += n;
	    if (!regmatch1(re, &k))
		return 0;
	    re++;
	}
    }
    if (lastpos != NULL)
	*lastpos = p;
    return 1;
}

/*
 * regmatch1: test one character `c' against the single atom `re'.
 *
 * Dispatches on the match-mode bits of re->mode; returns non-zero when
 * the character is accepted and 0 for an unknown mode.
 */
static int
regmatch1(regexchar * re, longchar * c)
{
    int kind = re->mode & RE_MATCHMODE;

    if (kind == RE_ANY)
	return 1;
    if (kind == RE_NORMAL)
	return match_longchar(re->pattern, c, re->mode);
    if (kind == RE_WHICH)
	return matchWhich(re->pattern, c);
    if (kind == RE_EXCEPT)
	return !matchWhich(re->pattern, c);
    return 0;
}

/*
 * matchWhich: test `c' against a compiled character class.
 *
 * `pattern' is a list of longchars terminated by an RE_TYPE_END entry;
 * "lo, RE_WHICH_RANGE, hi" triples denote a range.  A range marker that
 * is not part of such a triple (as produced by "[-a]" or "[a-]") is
 * treated as a literal '-'; only its type field is ever set, so its
 * character value must not be compared.
 *
 * Returns 1 when `c' is a member of the class, 0 otherwise.
 */
static int
matchWhich(longchar * pattern, longchar * c)
{
    longchar *p = pattern;
    int ans = 0;

    while (p->type != RE_TYPE_END) {
	if (p->type == RE_WHICH_RANGE) {
	    /* unpaired '-' */
	    if (c->type == RE_TYPE_CHAR && c->ch == '-') {
		ans = 1;
		break;
	    }
	    p++;
	}
	else if ((p + 1)->type == RE_WHICH_RANGE &&
		 (p + 2)->type != RE_TYPE_END &&
		 (p + 2)->type != RE_WHICH_RANGE) {
	    if (match_range_longchar(p, p + 2, c)) {
		ans = 1;
		break;
	    }
	    p += 3;
	}
	else {
	    if (match_longchar(p, c, 0)) {
		ans = 1;
		break;
	    }
	    p++;
	}
    }
#ifdef REGEX_DEBUG
    printf(" -> %d\n", ans);
#endif				/* REGEX_DEBUG */
    return ans;
}

/*
 * match_longchar: compare pattern character `a' with subject character `b'.
 *
 * With USE_M17N the types must agree, and wide characters match only
 * when both ccs and code are equal.  For single-byte characters the
 * comparison ignores case when `mode' has RE_IGNCASE set and both
 * characters satisfy IS_ALPHA().
 */
static int
match_longchar(longchar * a, longchar * b, int mode)
{
#ifdef USE_M17N
    if (a->type != b->type)
	return 0;
    if (a->type == RE_TYPE_WCHAR_T) {
	if (a->wch.ccs != b->wch.ccs)
	    return 0;
	return a->wch.code == b->wch.code;
    }
#endif
    if ((mode & RE_IGNCASE) && IS_ALPHA(a->ch) && IS_ALPHA(b->ch))
	return tolower(a->ch) == tolower(b->ch);
    return a->ch == b->ch;
}

/*
 * match_range_longchar: test whether `c' lies in the inclusive range
 * from `a' to `b'.
 *
 * With USE_M17N all three must have the same type, and wide characters
 * must additionally share one ccs before their codes are compared.
 */
static int
match_range_longchar(longchar * a, longchar * b, longchar * c)
{
#ifdef USE_M17N
    if (a->type != b->type || a->type != c->type)
	return 0;
    if (a->type == RE_TYPE_WCHAR_T) {
	int same_ccs = (a->wch.ccs == c->wch.ccs && c->wch.ccs == b->wch.ccs);
	int in_range = (a->wch.code <= c->wch.code
			&& c->wch.code <= b->wch.code);

	return same_ccs && in_range;
    }
#endif
    if (c->ch < a->ch)
	return 0;
    return c->ch <= b->ch;
}

#ifdef REGEX_DEBUG
/*
 * lc2c: render a compiled character class as printable text (debug aid).
 *
 * Reads entries of `x' up to its RE_TYPE_END entry and writes them into
 * a static buffer: '-' for a range marker, the byte itself for a
 * single-byte character and, with USE_M17N, "[ccs]code" in hexadecimal
 * for anything else.  Output is truncated to STORAGE_MAX characters.
 *
 * Returns the static buffer; it is overwritten by the next call.
 */
char *
lc2c(longchar * x)
{
    static char y[STORAGE_MAX + 1];
    int i;			/* index into x (input) */
    int j = 0;			/* index into y (output) */
#ifdef USE_M17N
    char buf[20];
    int l;
#endif

    /* Input and output advance independently: one wide entry expands
     * to several output characters. */
    for (i = 0; i < STORAGE_MAX && j < STORAGE_MAX
	 && x[i].type != RE_TYPE_END; i++) {
	if (x[i].type == RE_WHICH_RANGE)
	    y[j++] = '-';
	else if (x[i].type == RE_TYPE_CHAR)
	    y[j++] = x[i].ch;
	else {
#ifdef USE_M17N
	    /* two 32-bit values in hex need at most 19 characters */
	    sprintf(buf, "[%x]%x", (unsigned int)x[i].wch.ccs,
		    (unsigned int)x[i].wch.code);
	    l = strlen(buf);
	    if (j + l > STORAGE_MAX)
		break;
	    strcpy(&y[j], buf);
	    j += l;
#else
	    y[j++] = '?';	/* unknown entry type */
#endif
	}
    }
    y[j] = '\0';
    return y;
}

/*
 * debugre: print a one-line description of the compiled atoms starting
 * at `re', up to the RE_ENDMARK atom, on standard output (debug aid).
 *
 * `s' is not used.
 */
void
debugre(regexchar *re, char *s)
{
    for (; !(re->mode & RE_ENDMARK); re++) {
	if (re->mode & RE_BEGIN) {
	    printf("Begin ");
	    continue;
	}
	else if (re->mode & RE_END) {
	    printf("End ");
	    continue;
	}
	if (re->mode & RE_ANYTIME)
	    printf("Anytime-");

	switch (re->mode & RE_MATCHMODE) {
	case RE_ANY:
	    printf("Any ");
	    break;
	case RE_NORMAL:
	    /* %c takes an int, so print the character member rather
	     * than passing the whole longchar. */
	    if (re->pattern->type == RE_TYPE_CHAR)
		printf("Match-to'%c' ", re->pattern->ch);
#ifdef USE_M17N
	    else
		printf("Match-to'[%x]%x' ",
		       (unsigned int)re->pattern->wch.ccs,
		       (unsigned int)re->pattern->wch.code);
#endif
	    break;
	case RE_WHICH:
	    printf("One-of\"%s\" ", lc2c(re->pattern));
	    break;
	case RE_EXCEPT:
	    printf("Other-than\"%s\" ", lc2c(re->pattern));
	    break;
	default:
	    printf("Unknown ");
	}
    }
    putchar('\n');
}

#endif				/* REGEX_DEBUG */
