/* Copyright (C) 2000-2007 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <sys/stat.h>
#include <stdio.h>
#include <errno.h>
#include <math.h>

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_uniconv.h"
#include "udm_searchtool.h"
#include "udm_boolean.h"
#include "udm_stopwords.h"
#include "udm_word.h"
#include "udm_vars.h"
#include "udm_db.h"
#include "udm_db_int.h"
#include "udm_hash.h"
#include "udm_parsehtml.h"
#include "udm_store.h"
#include "udm_doc.h"
#include "udm_conf.h"
#include "udm_result.h"
#include "udm_log.h"
#include "udm_sgml.h"
#include "udm_mutex.h"
#include "udm_chinese.h"
#include "udm_synonym.h"


/*
  Mutable state carried through the query parser in this file
  (UdmPrepare, UdmStackItemListAddCmd, UdmStackItemListAddWord).
*/
typedef struct udm_stack_parser_state_st
{
  int secno;                 /* Section number copied into every word added; set from a "name:" / "secnoN:" prefix */
  int secno_match_type;      /* The ':' or '=' that followed the section prefix; stored as item.arg of commands */
  int next_word_match_type;  /* Match type to use for the next word; operators reset or override it */
  int use_numeric_operators; /* Non-zero: '<' and '>' select numeric LT/GT matching */
  int word_match;            /* Default match type of the query (from the "wm" variable) */
  int nphrasecmd;            /* Number of phrase commands seen so far; odd means "inside a phrase" */
  int auto_phrase;           /* Non-zero while inside an automatically opened phrase */
  int dehyphenate;           /* "Dehyphenate" setting; selects the tokenizer and the word's phrwidth */
  int strip_accents;         /* "StripAccents" setting */
  int phrpos;                /* Position of the next word inside the current phrase */
  size_t SubstringMatchMinWordLength; /* Shorter words fall back from begin/substr/end to full match */
  const char *lang;          /* Language (from the "g" variable) passed to the stopword lookup */
} UDM_STACK_PARSER_STATE;



/*
  Translate a run of separator characters into boolean-stack commands.

  Res    - result whose ItemList receives the commands
  state  - parser state; next_word_match_type, nphrasecmd and auto_phrase
           are updated as operators are seen
  lex    - separator characters (one int per character)
  length - number of characters in lex

  Returns the result of the last UdmStackItemListAdd() call,
  or UDM_OK if no command was added.
*/
static int
UdmStackItemListAddCmd(UDM_RESULT *Res,
                       UDM_STACK_PARSER_STATE *state,
                       int *lex, size_t length)
{
  int rc= UDM_OK;
  size_t i;
  UDM_STACK_ITEM item;

  for (i= 0; i < length; i++)
  {
    switch (lex[i])
    {
      case '&':
      case '+':
        item.cmd= UDM_STACK_AND;
        state->next_word_match_type= state->word_match;
        break;
      case '|':
        item.cmd= UDM_STACK_OR;
        state->next_word_match_type= state->word_match;
        break;
      case '~':
        item.cmd= UDM_STACK_NOT;
        state->next_word_match_type= state->word_match;
        break;
      case '(':
        item.cmd= UDM_STACK_LEFT;
        state->next_word_match_type= state->word_match;
        break;
      case ')':
        item.cmd= UDM_STACK_RIGHT;
        state->next_word_match_type= state->word_match;
        break;
      case '"':
        item.cmd= UDM_STACK_PHRASE;
        state->next_word_match_type= state->word_match;
        state->nphrasecmd++;
        break;
      case '<':
        /*
          Only changes how the next word is matched. There is no stack
          command for it, so nothing must be pushed: item.cmd would be
          uninitialized or left over from a previous character.
        */
        state->next_word_match_type= state->use_numeric_operators ?
                                     UDM_MATCH_NUMERIC_LT : state->word_match;
        continue;
      case '>':
        /* Same as '<': affects the next word only, pushes no command. */
        state->next_word_match_type= state->use_numeric_operators ?
                                     UDM_MATCH_NUMERIC_GT : state->word_match;
        continue;
      default:
        /* A non-auto-phrase separator closes a pending auto-phrase. */
        if (state->auto_phrase && !UdmAutoPhraseChar(lex[i]))
        {
          int quot= '"';
          UdmStackItemListAddCmd(Res, state, &quot, 1);
          state->auto_phrase= 0;
        }
        continue;
    }

    /* Ignore all operators if we are in a phrase, except phrase end. */
    if (!(state->nphrasecmd % 2) || item.cmd == UDM_STACK_PHRASE)
    {
      item.arg= state->secno_match_type;
      rc= UdmStackItemListAdd(&Res->ItemList, &item);
      Res->ItemList.ncmds++;
    }
  }
  return rc;
}


/*
  Return the match type to use for a word of wlen characters.
  Begin/substring/end matching is downgraded to a full-word match
  when the word is shorter than SubstringMatchMinWordLength;
  any other pending match type is returned unchanged.
*/
static int
UdmWordMatchApplyLength(UDM_STACK_PARSER_STATE *S, size_t wlen)
{
  if (wlen < S->SubstringMatchMinWordLength)
  {
    switch (S->next_word_match_type)
    {
      case UDM_MATCH_BEGIN:
      case UDM_MATCH_SUBSTR:
      case UDM_MATCH_END:
        return UDM_MATCH_FULL;
      default:
        break;
    }
  }
  return S->next_word_match_type;
}


/*
  Add one query word to the boolean stack and to the wide word list.

  query  - agent; its Conf supplies stopwords and word length limits
  Res    - result whose ItemList and WWList are extended
  state  - parser state (phrase tracking, section, match type)
  length - word length as measured by the tokenizer
  lt     - points at the character following the word in the query
  wrd    - the word converted to the local charset, 0-terminated

  The word itself is always added to Res->WWList. Stopwords (and words
  outside the configured length limits, for full-word matching) are marked
  UDM_STACK_STOP and returned early: no forms are generated for them and
  Res->WWList.nuniq is not advanced.
  NOTE(review): that early return also skips the phrpos reset at the end
  of a phrase - confirm this is intended.

  Returns UDM_OK, or the error code of UdmAllForms().
*/
static int
UdmStackItemListAddWord(UDM_AGENT *query, UDM_RESULT *Res,
                        UDM_STACK_PARSER_STATE *state,
                        size_t length, int *lt, char *wrd)
{
  UDM_WIDEWORD OWord;
  UDM_WIDEWORDLIST Forms;
  UDM_STACK_ITEM item;
  int origin, rc, lt_auto_phrase= UdmAutoPhraseChar(lt[0]);
  size_t phrlen= 0;
  int end_of_phrase= 0;
  int final_word_match= UdmWordMatchApplyLength(state, length);

  if (Res->WWList.nuniq >= UDM_MAXWORDPERQUERY-1)
    return UDM_OK;

  if (state->nphrasecmd % 2) /* in phrase */
  {
    if (((state->auto_phrase && !lt_auto_phrase) || lt[0] == '"'))
    {
      /* End of auto- or non-auto-phrase */
      phrlen= state->phrpos + 1;
      end_of_phrase= 1;
    }
  }
  else /* not in phrase */
  {
    if (lt_auto_phrase)
    {
      /* Start of auto-phrase: emit an opening phrase command */
      int quot= '"';
      state->auto_phrase= 1;
      UdmStackItemListAddCmd(Res, state, &quot, 1);
      phrlen= 0;
    }
    else
      phrlen= 1; /* Single word */
  }

  item.cmd= UDM_STACK_WORD;
  item.arg= Res->WWList.nuniq;
  rc= UdmStackItemListAdd(&Res->ItemList, &item);

  /*
    Check stopword only when full word.
    Substring searches should not exclude them.
  */
  if (final_word_match == UDM_MATCH_FULL &&
      (UdmStopListFind(&query->Conf->StopWords, wrd, state->lang) ||
       query->Conf->WordParam.min_word_len > length ||
       query->Conf->WordParam.max_word_len < length))
  {
    origin= UDM_WORD_ORIGIN_STOP;
    Res->ItemList.items[Res->ItemList.nitems - 1].cmd= UDM_STACK_STOP;
  }
  else
  {
    origin= UDM_WORD_ORIGIN_QUERY;
  }

  if (phrlen > 1)
  {
    /* Phrase end found: set phrlen for all words in the same phrase */
    UDM_WIDEWORD *W= &Res->WWList.Word[Res->WWList.nwords-1];
    for (; W >= Res->WWList.Word && W->phrlen == 0; W--)
      W->phrlen= phrlen;
  }

  bzero((void*) &OWord, sizeof(OWord));
  OWord.len= strlen(wrd);
  OWord.order= Res->WWList.nuniq;
  OWord.phrpos= state->phrpos;
  OWord.phrlen= phrlen;
  OWord.count= 0;
  OWord.word= wrd;
  OWord.origin = origin;
  OWord.match= final_word_match;
  OWord.secno= state->secno;
  OWord.phrwidth= state->dehyphenate ? 1 : 0;

  UdmWideWordListAdd(&Res->WWList, &OWord);
  if (state->nphrasecmd % 2)
    state->phrpos++;

  if (origin == UDM_WORD_ORIGIN_STOP)
    return UDM_OK;

  UdmWideWordListInit(&Forms);
  if (UDM_OK != (rc= UdmAllForms(query,&Forms,&OWord)))
  {
    /* Release whatever forms were collected before the failure */
    UdmWideWordListFree(&Forms);
    return rc;
  }

  {
    UDM_WIDEWORD FWord;
    size_t frm;
    bzero((void*) &FWord, sizeof(FWord));
    for (frm= 0; frm < Forms.nwords ; frm++)
    {
      FWord.order= Res->WWList.nuniq;
      FWord.phrpos= OWord.phrpos; /* state->phrpos is already changed here */
      FWord.phrlen= phrlen;
      FWord.count= 0;
      FWord.word= Forms.Word[frm].word;
      FWord.len= Forms.Word[frm].len;
      FWord.origin = Forms.Word[frm].origin;
      FWord.match= state->next_word_match_type;
      FWord.secno= state->secno;
      UdmWideWordListAdd(&Res->WWList,&FWord);
/*    UdmLog(query, UDM_LOG_DEBUG, "Word form: [%d] %s", FWord.origin, wrd);*/
    }
  }
  UdmWideWordListFree(&Forms);
  if (end_of_phrase)
    state->phrpos= 0;
  Res->WWList.nuniq++;
  return rc;
}


/*
  Parse the query string (variable "q") into the boolean search stack
  and the wide word list of Res.

  Steps:
    - read parser options from query->Conf->Vars;
    - convert "q" from the browser charset to the internal int-per-character
      form, and store local-charset copies of "q" and "qprev" back into Vars;
    - optionally strip accents, lowercase and segment the query;
    - tokenize: separators become stack commands, words become stack words;
      a word followed by ':' or '=' that names a section (or "secnoN")
      only switches the current section;
    - close a phrase that was left open.

  Returns UDM_OK, or the error code of UdmStackItemListAddWord().
  NOTE(review): failure to allocate the two word buffers returns 0, as
  before; the remaining UdmMalloc() results are still not checked.
*/
int UdmPrepare(UDM_AGENT * query,UDM_RESULT *Res)
{
  UDM_CHARSET * browser_cs, * local_cs, *sys_int;
  int  ctype, rc= UDM_OK;
  int *ustr, *uend, *lt, *lex;
  size_t ulen;
  const char * txt = UdmVarListFindStr(&query->Conf->Vars,"q","");
  const char * qprev = UdmVarListFindStr(&query->Conf->Vars,"qprev","");
  const char   *seg=  UdmVarListFindStr(&query->Conf->Vars, "Segmenter", NULL);
  char *ltxt;
  size_t wlen, llen, obytes;
  char *wrd;
  int *uwrd;
  UDM_CONV uni_lc, bc_uni, bc_lc;
  const char *lang;
  UDM_STACK_PARSER_STATE state;
  int *(*UniGetSepToken)(UDM_UNIDATA *unidata,
                         int *str, int *strend, int **last, int *ctype0);
  UDM_UNIDATA *unidata= query->Conf->unidata;

  state.secno= 0;
  state.secno_match_type= 0;
  state.use_numeric_operators= UdmVarListFindBool(&query->Conf->Vars, "UseNumericOperators", 0);
  state.dehyphenate= UdmVarListFindBool(&query->Conf->Vars, "Dehyphenate", 0);
  state.strip_accents= UdmVarListFindBool(&query->Conf->Vars, "StripAccents", 0);
  state.nphrasecmd= 0;
  state.word_match= UdmMatchMode(UdmVarListFindStr(&query->Conf->Vars, "wm", "wrd"));
  state.next_word_match_type= state.word_match;
  state.lang= UdmVarListFindStr(&query->Conf->Vars, "g", NULL);
  state.auto_phrase= 0;
  state.phrpos= 0;
  state.SubstringMatchMinWordLength=
    (size_t) UdmVarListFindInt(&query->Conf->Vars, "SubstringMatchMinWordLength", 0);
  UniGetSepToken= state.dehyphenate ? UdmUniGetSepToken2 :UdmUniGetSepToken;

  /* Per-word scratch buffers: local-charset bytes and internal characters */
  if ((wrd = (char*)UdmMalloc(query->Conf->WordParam.max_word_len * 12 + 1)) == NULL) return 0;
  if ((uwrd = (int*)UdmMalloc(sizeof(int) * (query->Conf->WordParam.max_word_len + 1))) == NULL) { UDM_FREE(wrd); return 0; }


  if (!(browser_cs = query->Conf->bcs))
    browser_cs=UdmGetCharSet("iso-8859-1");

  if(!(local_cs = query->Conf->lcs))
    local_cs=UdmGetCharSet("iso-8859-1");

  sys_int= &udm_charset_sys_int;

  UdmConvInit(&bc_uni,browser_cs,sys_int,UDM_RECODE_HTML);
  UdmConvInit(&uni_lc,sys_int,local_cs,UDM_RECODE_HTML);
  UdmConvInit(&bc_lc,browser_cs,local_cs,UDM_RECODE_HTML);

  ulen=strlen(txt);
  ustr=(int*)(UdmMalloc((sizeof(int))*(ulen+1)));
  obytes= UdmConv(&bc_uni,(char*)ustr,sizeof(ustr[0])*(ulen+1),txt,ulen+1);
  obytes-= sizeof(int); /* do not count the converted terminator */

  /* Create copy of query, converted into LocalCharset (for UdmTrack) */
  llen = ulen * 14 + 1;
  ltxt=(char*)UdmMalloc(llen);
  obytes= UdmConv(&uni_lc, ltxt, llen, (char*) ustr, obytes);
  ltxt[obytes]='\0';
  UdmVarListReplaceStr(&query->Conf->Vars,"q",ltxt);  /* "q-lc" was here */
  UDM_FREE(ltxt);

  llen = strlen(qprev);
  ltxt=(char*)UdmMalloc(llen*14+1);
  obytes= UdmConv(&bc_lc,ltxt,llen*14+1,qprev,llen);
  ltxt[obytes]='\0';
  UdmVarListReplaceStr(&query->Conf->Vars,"qprev",ltxt);
  UDM_FREE(ltxt);

  /* Parse query and build boolean search stack*/
  if (state.strip_accents)
    UdmUniStrStripAccents(unidata, ustr);
  UdmUniStrToLower(unidata, ustr);
  switch(browser_cs->family)
  {
    case UDM_CHARSET_CHINESE_SIMPLIFIED:
    case UDM_CHARSET_CHINESE_TRADITIONAL: lang = "zh"; break;
    case UDM_CHARSET_JAPANESE: lang = "ja"; break;
    case UDM_CHARSET_THAI: lang = "th"; break;
    default: lang = "";
  }

  ustr= UdmUniSegment(query, ustr, lang, seg);
  uend= ustr + UdmUniLen(ustr); /* TODO: get rid of it */

  lex= UniGetSepToken(unidata, ustr, uend, &lt , &ctype);
  for ( ;lex; lex= UniGetSepToken(unidata, NULL, uend, &lt, &ctype))
  {
    size_t copylen;

    wlen=lt-lex;
    /*
      uwrd holds at most max_word_len characters plus a terminator, so the
      conversion must be limited to what was actually copied; passing
      wlen+1 characters would read past the end of uwrd for long tokens.
    */
    copylen= udm_min(wlen, query->Conf->WordParam.max_word_len);
    memcpy(uwrd, lex, copylen * sizeof(int));
    uwrd[copylen]= 0;
    UdmConv(&uni_lc, wrd, query->Conf->WordParam.max_word_len * 12,
            (char*)uwrd, sizeof(uwrd[0]) * (copylen + 1));
    UdmTrim(wrd, " \t\r\n");

    if (ctype == UDM_UNI_SEPAR)
    {
      UdmStackItemListAddCmd(Res, &state, lex, wlen);
    }
    else
    {
      UDM_VAR *Section;

      /* "section:" / "section=" / "secnoN:" prefixes select a section */
      if (lt[0] == ':' || lt[0] == '=')
      {
        if ((Section= UdmVarListFind(&query->Conf->Sections, wrd)))
        {
          state.secno= Section->section;
          state.secno_match_type= lt[0];
          continue;
        }
        if (wlen > 5 && !strncmp(wrd, "secno", 5))
        {
          state.secno= atoi(wrd + 5);
          state.secno_match_type= lt[0];
          continue;
        }
      }

      if (UDM_OK != (rc= UdmStackItemListAddWord(query, Res, &state,
                                                 wlen, lt, wrd)))
        goto ret; /* release the work buffers on the error path, too */
    }
  }

  /* Close a phrase that the query left open */
  if (state.nphrasecmd & 1)
  {
    UDM_STACK_ITEM item;
    item.cmd= UDM_STACK_PHRASE;
    item.arg= 0;
    UdmStackItemListAdd(&Res->ItemList, &item);
    Res->ItemList.ncmds++;
  }
  Res->WWList.wm= state.word_match;
  Res->WWList.strip_noaccents= state.strip_accents;
  rc= UDM_OK;

ret:
  UDM_FREE(ustr);
  UDM_FREE(uwrd);
  UDM_FREE(wrd);
  return rc;
}


/*
  Collapse runs of adjacent items with the same site_id into one item,
  in place. Each surviving item carries the summed per_site of its run
  and the fields of the preferred document of the run: higher coord,
  then higher pop_rank, then lower-or-equal url_id of the later item.
  List->nitems is updated to the number of surviving items.
*/
void UdmURLDataGroupBySite(UDM_URLDATALIST *List)
{
  UDM_URLDATA *keep, *cur, *stop;
  uint4 total;

  if (!List->nitems)
    return;

  keep= List->Item;
  stop= List->Item + List->nitems;
  total= keep->per_site;

  for (cur= keep + 1; cur < stop; cur++)
  {
    if (keep->site_id != cur->site_id)
    {
      /* Next site: finish the current run and start a new one */
      keep->per_site= total;
      *++keep= *cur;
      total= cur->per_site;
      continue;
    }

    /* Same site: accumulate, then decide whether cur replaces keep */
    total+= cur->per_site;

    if (keep->coord > cur->coord)
      continue;
    if (keep->coord == cur->coord)
    {
      if (keep->pop_rank > cur->pop_rank)
        continue;
      if (keep->pop_rank == cur->pop_rank && keep->url_id < cur->url_id)
        continue;
    }

    keep->url_id=        cur->url_id;
    keep->coord=         cur->coord;
    keep->last_mod_time= cur->last_mod_time;
    keep->pop_rank=      cur->pop_rank;
    keep->url=           cur->url;
    keep->section=       cur->section;
  }

  keep->per_site= total;
  List->nitems= keep - List->Item + 1;
}


/******** Convert a category string (base-36 digits) into 32-bit numbers *************/

/*
  Decode the first 12 characters of hex_str as two 6-digit base-36 numbers.

  hi, lo   - receive the two halves with the string right-padded by '0'
             (leading '0' characters are blanked before parsing)
  fhi, flo - if both are non-NULL, receive the two halves with the string
             right-padded by 'Z' instead
*/
void UdmDecodeHex8Str(const char *hex_str, uint4 *hi,
                      uint4 *lo, uint4 *fhi, uint4 *flo)
{
  char padded[33], hi_part[17], lo_part[17];
  char *p;
  size_t n;

  /* Take at most 12 characters, then pad with zeros to at least 12 */
  for (n= 0; n < 12 && hex_str[n] != '\0'; n++)
    padded[n]= hex_str[n];
  padded[n]= '\0';
  strcat(padded, "000000000000");

  /* Blank out leading zeros; strtoul() skips the blanks */
  for (p= padded; *p == '0'; p++)
    *p= ' ';

  memcpy(hi_part, padded, 6);
  hi_part[6]= '\0';
  memcpy(lo_part, padded + 6, 6);
  lo_part[6]= '\0';

  *hi= (uint4) strtoul(hi_part, (char **) NULL, 36);
  *lo= (uint4) strtoul(lo_part, (char **) NULL, 36);

  if (fhi == NULL || flo == NULL)
    return;

  /* Same again, padded with the largest base-36 digit */
  for (n= 0; n < 12 && hex_str[n] != '\0'; n++)
    padded[n]= hex_str[n];
  padded[n]= '\0';
  strcat(padded, "ZZZZZZZZZZZZ");

  memcpy(hi_part, padded, 6);
  hi_part[6]= '\0';
  memcpy(lo_part, padded + 6, 6);
  lo_part[6]= '\0';

  *fhi= strtoul(hi_part, (char **) NULL, 36);
  *flo= strtoul(lo_part, (char **) NULL, 36);
}


/*
  Split a CGI query string into "name=value" pairs and add each pair to
  vars twice: under its own name and under "query.<name>".
  The string is SGML-unescaped as a whole first; each value is then
  CGI-unescaped. A pair without '=' gets an empty value.

  Agent is not used. Returns 0 on success, 1 if memory allocation failed.
*/
int UdmParseQueryString(UDM_AGENT * Agent,
                        UDM_VARLIST * vars,char * query_string)
{
  char *pair, *saveptr;
  char qname[256];
  size_t len= strlen(query_string);
  char *value= (char *) UdmMalloc(len + 7);
  char *copy= (char *) UdmStrdup(query_string);

  if (value == NULL || copy == NULL)
  {
    UDM_FREE(value);
    UDM_FREE(copy);
    return 1;
  }

  UdmSGMLUnescape(copy);

  for (pair= udm_strtok_r(copy, "&", &saveptr);
       pair;
       pair= udm_strtok_r(NULL, "&", &saveptr))
  {
    char empty[]= "";
    char *raw= empty;
    char *eq= strchr(pair, '=');

    if (eq)
    {
      /* Cut the pair into name and raw value */
      *eq= '\0';
      raw= eq + 1;
    }

    UdmUnescapeCGIQuery(value, raw);
    UdmVarListAddQueryStr(vars, pair, value);
    udm_snprintf(qname, 256, "query.%s", pair);
    UdmVarListAddQueryStr(vars, qname, value);
  }

  UDM_FREE(value);
  UDM_FREE(copy);
  return 0;
}


#if 0
/*
  Disabled code: everything up to the matching #endif is compiled out.

  Looks up a token (array of int characters) in the word list at the
  given phrase position, honouring each word's match type
  (full, begin, end, substring).

  NOTE(review): the body uses "wordlen", which is not declared here;
  the length parameter is named "toklen". The code would not compile
  if it were simply re-enabled.
*/
static UDM_WIDEWORD*
UdmWordInWWList(UDM_WIDEWORDLIST *List, int *tok, size_t toklen,
                int hlstop, size_t phrpos)
{
  size_t uw;
  
  for(uw=0; uw < List->nwords; uw++)
  {
    size_t slen;
    UDM_WIDEWORD *W= &List->Word[uw];
    /* Only words expected at this phrase position are candidates */
    if (W->phrpos != phrpos)
      continue;
    /* Stopwords match only when highlighting of stopwords is requested */
    if (!hlstop && W->origin == UDM_WORD_ORIGIN_STOP)
      continue;
    slen= W->ulen;
    if (wordlen < slen)
      continue;
    if (wordlen == slen && !UdmUniStrNCaseCmp(tok, W->uword, slen))
      return W;
      
    /* Token is longer than the word: try prefix/suffix/substring match */
    if (wordlen > slen) switch (W->match)
    {
      case UDM_MATCH_BEGIN:
        if (!UdmUniStrNCaseCmp(tok, W->uword, slen))
          return W;
        break;
      case UDM_MATCH_END:
        if (!UdmUniStrNCaseCmp(tok + wordlen - slen, W->uword, slen))
          return W;
        break;
      case UDM_MATCH_SUBSTR:
        {
          size_t l1, l2;
          /* Case-insensitive search of W->uword at every offset of tok */
          for (l1 = 0; l1 < wordlen; l1++)
          {
            if (l1 + slen > wordlen) break;
            for (l2 = 0; l2 < slen; l2++)
            {
              if (UdmUniToLower(tok[l1 + l2]) != UdmUniToLower(W->uword[l2]))
                break;
            }
            if (l2 == slen)
            {
              return W;
              break;
            }
          }
        }
        break;
    }
  }
  return NULL;
}
#endif


/*
  Find the query word that matches a document token.

  List       - query words; List->wm and List->strip_noaccents select
               how the comparison is prepared and performed
  tok/toklen - the token, one int per character
  uni_wcs    - converter from the internal form to the charset of the
               words stored in List
  lc_uni, uni_bc - not used here
  hlstop     - non-zero if stopwords may match
  phrpos     - only words with this phrase position are considered

  Returns the first matching word, or NULL.
*/
static UDM_WIDEWORD*
UdmWordInWWList2(UDM_WIDEWORDLIST *List,
                 int *tok, size_t toklen,
                 UDM_CONV *uni_wcs, UDM_CONV *lc_uni, UDM_CONV *uni_bc,
                 int hlstop, size_t phrpos)
{
  UDM_UNIDATA *unidata= udm_unidata_default; /* FIXME */
  char lcword[128]= "";
  size_t lclen= 0;
  size_t idx;

  /* A lowercased byte copy of the token is needed for non-full matching */
  if (List->wm != UDM_MATCH_FULL)
  {
    lclen= UdmConv(uni_wcs, lcword, sizeof(lcword) - 1,
                            (char*)tok, toklen*sizeof(int));
    lcword[lclen]= '\0';
    UdmStrToLowerExt(unidata, uni_wcs->to, lcword, lclen, UDM_RECODE_HTML);
  }

  for (idx= 0; idx < List->nwords; idx++)
  {
    UDM_WIDEWORD *W= &List->Word[idx];
    size_t need;

    if (W->phrpos != phrpos)
      continue;
    if (W->origin == UDM_WORD_ORIGIN_STOP && !hlstop)
      continue;

    if (W->match == UDM_MATCH_FULL)
    {
      int differs;
      if (List->strip_noaccents)
        differs= UdmStrCaseAccentCmp2(unidata, uni_wcs,
                                      (const char*) tok, toklen * sizeof(int),
                                      W->word, W->len);
      else
        differs= UdmStrCaseCmp2(unidata, uni_wcs,
                                (const char*) tok, toklen*sizeof(int),
                                W->word, W->len);
      if (!differs)
        return W;
      continue;
    }

    need= W->len;

    if (lclen < need)
      continue;

    if (lclen == need)
    {
      if (!memcmp(lcword, W->word, need))
        return W;
      continue;
    }

    /* Token is longer than the word: prefix, suffix or substring match */
    switch (W->match)
    {
      case UDM_MATCH_BEGIN:
        if (!memcmp(lcword, W->word, need))
          return W;
        break;
      case UDM_MATCH_END:
        if (!memcmp(lcword + lclen - need, W->word, need))
          return W;
        break;
      case UDM_MATCH_SUBSTR:
        if (strstr(lcword, W->word))
          return W;
        break;
      default:
        UDM_ASSERT(0); /* Impossible */
    }
  }
  return NULL;
}


/*
  Delete the highlight marker characters (codes 2 and 3) from the
  byte range [from, to) of str, compacting the range in place.
  For the internal int-per-character charset the range is processed
  as an array of int, otherwise byte by byte.
  Returns the new end offset of the range, in bytes from str.
*/
static size_t
UdmRemoveHl(UDM_CHARSET *cs, char *str, size_t from, size_t to)
{
  if (cs != &udm_charset_sys_int)
  {
    char *rd= str + from;
    char *stop= str + to;
    char *wr= rd;

    while (rd < stop)
    {
      char ch= *rd++;
      if (ch == 2 || ch == 3)
        continue;
      *wr++= ch;
    }
    return wr - str;
  }
  else
  {
    int *base= (int*) str;
    int *rd= base + from / sizeof(int);
    int *stop= base + to / sizeof(int);
    int *wr= rd;

    while (rd < stop)
    {
      int ch= *rd++;
      if (ch == 2 || ch == 3)
        continue;
      *wr++= ch;
    }
    return (wr - base) * sizeof(int);
  }
}


/*
  Append one token to the output buffer, wrapped in the highlight
  markers (codes 2 and 3) when "found" is non-NULL.

  uni_bc     - converter from the internal form to the output charset
  dst        - output buffer
  dstlen     - number of bytes already used in dst
  dstmaxlen  - total size of dst, in bytes
  tok/toklen - the token, one int per character

  Every write is limited to the space left after dstlen, so output that
  does not fit is truncated instead of written past the end of dst.
  Returns the new number of bytes used in dst.
*/
static size_t
UdmHlAppend(UDM_CONV *uni_bc, UDM_WIDEWORD *found,
            char *dst, size_t dstlen, size_t dstmaxlen,
            int *tok, size_t toklen)
{
  int i2= 2, i3= 3;
  size_t room;

  if (found)
  {
    room= dstlen < dstmaxlen ? dstmaxlen - dstlen : 0;
    dstlen+= UdmConv(uni_bc, dst + dstlen, room, (char*) &i2, sizeof(i2));
  }

  room= dstlen < dstmaxlen ? dstmaxlen - dstlen : 0;
  if (uni_bc->to == &udm_charset_sys_int)
  {
    /* Same representation on both sides: copy whole characters only */
    size_t nbytes= sizeof(*tok) * toklen;
    if (nbytes > room)
      nbytes= room - room % sizeof(*tok);
    memcpy(dst + dstlen, tok, nbytes);
    dstlen+= nbytes;
  }
  else
    dstlen+= UdmConv(uni_bc, dst + dstlen, room, (char*) tok, sizeof(*tok) * toklen);

  if (found)
  {
    room= dstlen < dstmaxlen ? dstmaxlen - dstlen : 0;
    dstlen+= UdmConv(uni_bc, dst + dstlen, room, (char*) &i3, sizeof(i3));
  }

  /*fprintf(stderr, "appended to '%.*s'\n", dstlen, dst);*/

  return dstlen;
}

/*
#define DEBUG_HL 0
*/

/*
  Convert src to the output charset, wrapping every token that matches
  a query word in highlight markers (codes 2 and 3).

  dst, dstmaxlen - output buffer and its size in bytes
  List           - query words; NULL means "convert only, no highlighting"
  src, srclen    - source text and its length in bytes
  uni_wcs        - internal form -> charset of the words in List
  lc_uni         - source charset -> internal form
  uni_bc         - internal form -> output charset
  hilight_stopwords - non-zero if stopwords may be highlighted

  Words of a phrase are highlighted only if the whole phrase is found in
  order; markers of a partially matched phrase are removed again.

  Returns the number of bytes written, not counting the terminator that
  is appended in the output charset while there is room for it.
*/

size_t
UdmHlConvertExtWithConv(char *dst, size_t dstmaxlen,
                        UDM_WIDEWORDLIST *List,
                        const char *src, size_t srclen,
                        UDM_CONV *uni_wcs, UDM_CONV *lc_uni, UDM_CONV *uni_bc,
                        int hilight_stopwords)
{
  int		*tok, *lt, ctype, *uni, *uend;
  int           i0= 0;
  size_t        dstlen= 0, dstlen_phr= 0, nfound=0, ulen;
  size_t        unimaxlen, expected_phrpos= 0;
  UDM_UNIDATA *unidata= udm_unidata_default; /* FIXME */

#ifdef DEBUG_HL
  fprintf(stderr, "wcs=%s fromcs=%s tocs=%s srclen=%d src='%s'\n",
          uni_wcs->to->name, lc_uni->from->name, uni_bc->to->name, srclen, src);
#endif


  /* Convert to unicode */
  unimaxlen= (srclen + 10) * sizeof(int);
  uni= (int *)UdmMalloc(unimaxlen);
  if (!uni)
  {
    /* Out of memory: return an empty, terminated string */
    UdmConv(uni_bc, dst, dstmaxlen, (char*) &i0, sizeof(i0));
    return 0;
  }
  /* Keep one slot of the buffer free for the terminator written below */
  ulen= UdmConv(lc_uni,(char*)uni, unimaxlen - sizeof(int), src, srclen) / sizeof(int);
  uni[ulen]= 0; /* terminator (this used to store the character '0') */
  uend= uni + ulen;

  /* Parse unicode string */
  for (tok= UdmUniGetSepToken(unidata, uni, uend, &lt, &ctype) ; tok ;
       tok= UdmUniGetSepToken(unidata, NULL, uend, &lt, &ctype))
  {
    size_t toklen= lt - tok;

    if (ctype == UDM_UNI_SEPAR || !List)
    {
      dstlen= UdmHlAppend(uni_bc, NULL, dst, dstlen, dstmaxlen, tok, toklen);
    }
    else
    {
      UDM_WIDEWORD *found= UdmWordInWWList2(List, tok, toklen,
                                            uni_wcs, lc_uni, uni_bc,
                                            hilight_stopwords, expected_phrpos);
      dstlen= UdmHlAppend(uni_bc, found, dst, dstlen, dstmaxlen, tok, toklen);
      if (found)
      {
        nfound++;
        if (found->phrpos + 1 == found->phrlen)
        {
          /* last in phrase found */
          expected_phrpos= 0;
          dstlen_phr= dstlen;
          nfound= 0;
        }
        else
        {
          /* middle in phrase found */
          expected_phrpos++;
        }
      }
      else
      {
        /* No word found on expected phrase position, rollback */
        if (nfound)
          dstlen= UdmRemoveHl(uni_bc->to, dst, dstlen_phr, dstlen);
        dstlen_phr= dstlen;
        expected_phrpos= 0;
        nfound= 0;
      }
    }
  }


#ifdef DEBUG_HL  
  fprintf(stderr, "end: expected_phrpos=%d dstlen=%d dstlen_phr=%d\n", expected_phrpos, dstlen, dstlen_phr);
#endif
  if (expected_phrpos > 0)
  {
    /* Roll back: incomplete last phrase */
    dstlen= UdmRemoveHl(uni_bc->to, dst, dstlen_phr, dstlen);
  }

  /* Terminate, limited to the space that is left after dstlen */
  UdmConv(uni_bc, dst + dstlen, dstlen < dstmaxlen ? dstmaxlen - dstlen : 0,
          (char*) &i0, sizeof(i0));
  UdmFree(uni);
  return dstlen;
}

/*
  Allocating variant of UdmHlConvertExtWithConv(): converts and highlights
  src into a newly allocated buffer of srclen * 14 + 10 bytes.
  Returns the buffer (the caller frees it), or NULL if src is NULL,
  srclen is 0, or the buffer cannot be allocated.
*/
static char*
UdmHlConvertExtWithConvDup(UDM_WIDEWORDLIST *List,
                           const char *src, size_t srclen,
                           UDM_CONV *uni_wcs, UDM_CONV *lc_uni, UDM_CONV *uni_bc,
                           int hlstop)
{
  size_t dstlen;
  char *dst;

  if (!src || !srclen)
    return NULL;

  dstlen= srclen * 14 + 10;
  dst= (char*)UdmMalloc(dstlen);
  if (!dst)
    return NULL; /* do not hand a NULL buffer to the converter */

  UdmHlConvertExtWithConv(dst, dstlen,
                          List, src, srclen,
                          uni_wcs, lc_uni, uni_bc, hlstop);
  return dst;
}


#ifdef NOT_YET
/*
  Compiled only when NOT_YET is defined.

  For character sets having "septok" and when lcs == bcs:
  highlights query words directly in the source charset, without
  converting through the internal form. Returns a newly allocated,
  0-terminated string, or NULL for a NULL or empty src.

  TODO: this function does not support HTML entities:
    &Auml;
    &#196;
    &#xC4;

  UdmWordInWWList2() should also be modified to understand entities
  when searching words.

  NOTE(review): UdmWordInWWList2() is called below with five arguments,
  while its definition in this file takes eight.
*/
static char *
UdmHlConvertExtQuick(UDM_WIDEWORDLIST *List, const char *src,
                     UDM_CHARSET *cs,
                     int hilight_stopwords)
{
  char        *dst;
  const char  *srcend, *tok, *lt;
  size_t      srclen, dstlen= 0, dstlen_phr= 0;
  size_t      dstmaxlen, expected_phrpos= 0;
  int ctype;

  if(!src || !(srclen = strlen(src)))
    return NULL;

  srcend= src + srclen;
  dstmaxlen= srclen * 14 + 10;
  dst= (char*)UdmMalloc(dstmaxlen + 1);

  for (tok= cs->septoken(cs, src, srcend, &lt, &ctype) ; tok ;
       tok= cs->septoken(cs, NULL, srcend, &lt, &ctype))
  {
    size_t toklen= lt - tok;

    /* Separators, and tokens too long for tmp[] below, are copied as is */
    if (ctype == UDM_UNI_SEPAR || !List || toklen > 127)
    {
      memcpy(dst + dstlen, tok, toklen);
      dstlen+= toklen;
    }
    else
    {
      UDM_WIDEWORD *found;
      {
        /* Look the token up in lowercased form */
        char tmp[128];
        memcpy(tmp, tok, toklen);
        tmp[toklen]= '\0';
        cs->lcase(cs, tmp, toklen);
        found= UdmWordInWWList2(List, tmp, toklen, hilight_stopwords, expected_phrpos);
      }

      /* Stop when the token plus two markers would not fit */
      if (dstlen + toklen + 2 >= dstmaxlen)
        break;
      
      if (found)
      {
        dst[dstlen++]= '\2';
        memcpy(dst + dstlen, tok, toklen);
        dstlen+= toklen;
        dst[dstlen++]= '\3';
      }
      else
      {
        memcpy(dst + dstlen, tok, toklen);
        dstlen+= toklen;
      }
      
      if (found)
      {
        if (found->phrpos + 1 == found->phrlen)
        {
          /* last in phrase found */
          expected_phrpos= 0;
          dstlen_phr= dstlen;
        }
        else
        {
          /* middle in phrase found */
          expected_phrpos++;
        }
      }
      else
      {
        /* No word found on expected phrase position, rollback */
        dstlen= UdmRemoveHl(cs, dst, dstlen_phr, dstlen);
        dstlen_phr= dstlen;
        expected_phrpos= 0;
      }
    }
  }

  if (expected_phrpos > 0)
  {
    /* Roll back: incomplete last phrase */
    dstlen= UdmRemoveHl(cs, dst, dstlen_phr, dstlen);
  }

  dst[dstlen]= '\0';
  return dst;
}
#endif


/*
  Charset-based front end of UdmHlConvertExtWithConv(): builds the three
  converters (lcs -> internal, internal -> bcs, internal -> wcs) and
  converts/highlights "length" bytes of src into dst (dstlen bytes).
  Returns what UdmHlConvertExtWithConv() returns.
*/
size_t UdmHlConvertExt(char *dst, size_t dstlen,
                       UDM_WIDEWORDLIST *List, UDM_CHARSET *wcs,
                       const char * src, size_t length,
                       UDM_CHARSET * lcs, UDM_CHARSET * bcs, int hlstop)
{
  UDM_CONV from_local, to_browser, to_words;
  size_t written;

  UdmConvInit(&from_local, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&to_browser, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);
  UdmConvInit(&to_words, &udm_charset_sys_int, wcs, UDM_RECODE_HTML);

  written= UdmHlConvertExtWithConv(dst, dstlen, List, src, length,
                                   &to_words, &from_local, &to_browser,
                                   hlstop);
  return written;
}

/*
  Charset-based front end of UdmHlConvertExtWithConvDup(): builds the three
  converters (internal -> wcs, lcs -> internal, internal -> bcs) and
  returns the converted/highlighted copy of src produced by it.
*/
static
char *UdmHlConvertExtDup(UDM_WIDEWORDLIST *List, UDM_CHARSET *wcs,
                         const char * src, size_t srclen,
                         UDM_CHARSET * lcs, UDM_CHARSET * bcs, int hlstop)
{
  UDM_CONV to_words, from_local, to_browser;
  char *copy;

  UdmConvInit(&to_words, &udm_charset_sys_int, wcs, UDM_RECODE_HTML);
  UdmConvInit(&from_local, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&to_browser, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);

  copy= UdmHlConvertExtWithConvDup(List, src, srclen,
                                   &to_words, &from_local, &to_browser,
                                   hlstop);
  return copy;
}


/*
  For PHP module compatibility.
  Converts the 0-terminated string src from lcs to bcs with query words
  highlighted (stopwords included); lcs is also used as the charset of
  the words in List.
  Returns a newly allocated string, or NULL if src is NULL or empty.
*/
char * UdmHlConvert(UDM_WIDEWORDLIST *List,const char * src,
                    UDM_CHARSET * lcs, UDM_CHARSET * bcs)
{
  /* strlen() must not see a NULL pointer; the callee maps length 0 to NULL */
  size_t srclen= src ? strlen(src) : 0;
  return UdmHlConvertExtDup(List, lcs, src, srclen, lcs, bcs, 1);
}


/*
  Convert a search result from the local charset (lcs) to the browser
  charset (bcs):
    - every word of Res->WWList is converted and replaced;
    - every document section except URL, CachedCopy and Content-Type is
      converted with query words highlighted, unless it is already
      flagged UDM_VARFLAG_HL;
    - every string variable of Conf->Vars except HlBeg and HlEnd is
      converted.
  "ExcerptStopword" (default 1) controls whether stopwords are highlighted.
  Always returns UDM_OK.
*/
int UdmConvert(UDM_ENV *Conf, UDM_RESULT *Res,
               UDM_CHARSET *lcs, UDM_CHARSET *bcs)
{
  size_t i;
  UDM_CONV lc_bc, lc_uni, uni_bc;
  int hlstop= UdmVarListFindBool(&Conf->Vars, "ExcerptStopword", 1);

  /* Init converters */
  UdmConvInit(&lc_bc,lcs,bcs,UDM_RECODE_HTML);
  UdmConvInit(&lc_uni, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&uni_bc, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);

  /* Convert word list */
  for (i= 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *W= &Res->WWList.Word[i];
    size_t srclen= strlen(W->word);
    size_t cap= srclen * 12 + 1;
    char *converted= (char*)UdmMalloc(cap);
    size_t outlen= UdmConv(&lc_bc, converted, cap, W->word, srclen);

    converted[outlen]= '\0';
    UDM_FREE(W->word);
    W->word= converted;
    W->len= outlen;
  }

  /* Convert document sections */
  for (i= 0; i < Res->num_rows; i++)
  {
    UDM_DOCUMENT *D= &Res->Doc[i];
    size_t sec;

    for (sec= 0; sec < D->Sections.nvars; sec++)
    {
      UDM_VAR *Var= &D->Sections.Var[sec];
      char *highlighted;

      /*
         A temporary fix to skip URL and CachedCopy:
         We will skip these sections for now.
         But this need a further fix in search.htm
         to distinguish two HTML formats:
         - HTML with &<>" escaped to &amp;&lt;&gt;&quot;
         - HTML with &<>" printed as is, no word hilight
         - HTML with &<>" printed as is, search word hilighted.
      */
      if (!strcasecmp(Var->name,"URL") ||
          !strcasecmp(Var->name,"CachedCopy") ||
          !strcasecmp(Var->name,"Content-Type"))
        continue;

      /* skip if highlight markers already exist - cluster node */
      if (Var->flags & UDM_VARFLAG_HL)
        continue;

      /* The words were converted to bcs above, hence uni_bc is passed twice */
      highlighted= UdmHlConvertExtWithConvDup(&Res->WWList,
                                              Var->val, Var->curlen,
                                              &uni_bc, &lc_uni, &uni_bc,
                                              hlstop);
      UDM_FREE(Var->val);
      Var->val= highlighted;
    }
  }

  /* Convert Env->Vars */
  for (i= 0; i < Conf->Vars.nvars; i++)
  {
    UDM_VAR *Var= &Conf->Vars.Var[i];
    size_t srclen;
    char *converted;

    if (UdmVarType(Var) != UDM_VAR_STR ||
        !strcasecmp(Var->name, "HlBeg") ||
        !strcasecmp(Var->name, "HlEnd"))
      continue;

    srclen= strlen(Var->val);
    converted= (char*)UdmMalloc(srclen * 12 + 1);

    /* srclen + 1: the terminator is converted along with the text */
    UdmConv(&lc_bc, converted, srclen * 12 + 1, Var->val, srclen + 1);
    UDM_FREE(Var->val);
    Var->val= converted;
  }

  return UDM_OK;
}


/*
  Lookup table indexed by byte value. A non-zero entry marks a byte that
  UdmRemoveHiLightDup() must inspect instead of copying it verbatim:
    0x00        end of string
    0x02, 0x03  highlight markers written before and after a found word
    0x26 ('&')  possible start of a numeric entity "&#NNN;"
*/
static char rm_hl_special[256]=
{
/*00*/  1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
/*10*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*20*/  0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,  /*  !"#$%&'()*+,-./ */
/*30*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0123456789:;<=>? */
/*40*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* @ABCDEFGHIJKLMNO */
/*50*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* PQRSTUVWXYZ[\]^_ */
/*60*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* `abcdefghijklmno */
/*70*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* pqrstuvwxyz{|}~  */
/*80*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*90*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*A0*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*B0*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*C0*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*D0*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*E0*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*F0*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};


/*
  Return a newly allocated copy of s with the highlight markers
  (codes 2 and 3) removed and decimal entities "&#NNN;" replaced by a
  single character: the character itself for codes 1..127, '?' otherwise.
  An '&' that does not start a well-formed decimal entity (including
  "&#;" without digits) is copied unchanged.

  The result is never longer than s. Returns NULL if the copy cannot be
  allocated; the caller frees the result.
*/
char* UdmRemoveHiLightDup(const char *s)
{
  size_t len=strlen(s)+1;
  char   *d, *res = (char*)UdmMalloc(len);

  if (!res)
    return NULL;

  for(d= res; ; s++)
  {
    /* Fast path: only a few bytes up to '&' need special handling */
    if ((((unsigned char) *s) <= '&') &&
        rm_hl_special[(unsigned char) *s])
    {
      switch(s[0])
      {
        case '\0':
          goto ex;
        case '\2':
        case '\3':
          break;
        case '&':
          if (s[1] == '#')
          {
            const char *e;
            int code= 0;

            for (e= s + 2; (*e >= '0') && (*e <= '9'); e++)
            {
              /*
                Stop accumulating once the value is out of the ASCII range:
                the result is '?' anyway, and a long run of digits must not
                overflow the int.
              */
              if (code < 128)
                code= code * 10 + (e[0] - '0');
            }
            /* Require at least one digit before the ';' */
            if (*e == ';' && e > s + 2)
            {
              /* Code 0 would terminate the result early, so map it to '?' */
              *d++= (code > 0 && code < 128) ? (char) code : '?';
              s= e;
              break;
            }
          }
          /* fall through */

        default:
          *d++=*s;
      }
    }
    else
      *d++= *s;
  }
ex:
  *d='\0';
  return res;
}



/*
  Serialize the categories of C into textbuf as a sequence of
  "<CAT id=... path=... link=... name=...>" lines (fields separated by
  tabs, lines ended by CR LF).

  textbuf - output buffer
  len     - size of textbuf in bytes; nothing is written if it is 0

  Always returns UDM_OK.
*/
int UdmCatToTextBuf(UDM_CATEGORY *C, char *textbuf, size_t len)
{
  char  *end= textbuf;
  size_t  i;

  if (!len)
    return UDM_OK; /* no room even for the terminator */

  textbuf[0]='\0';

  for(i = 0; i < C->ncategories; i++)
  {
    /* end always points at the terminator, so end - textbuf is the used length */
    udm_snprintf(end, len - (size_t) (end - textbuf),
                 "<CAT\tid=\"%d\"\tpath=\"%s\"\tlink=\"%s\"\tname=\"%s\">\r\n",
                 C->Category[i].rec_id, C->Category[i].path,
                 C->Category[i].link, C->Category[i].name);
    end+= strlen(end);
  }
  return UDM_OK;
}

/*
  Parse one "<CAT id=... path=... link=... name=...>" tag from textbuf and
  append it to C as a new category. Attributes other than id, path, link
  and name are ignored. Nothing is added if textbuf is NULL or does not
  start with an HTML tag.

  Returns UDM_OK, or 1 if the category array cannot be enlarged.
  NOTE(review): 1 is assumed to be the project's generic error code
  (UdmParseQueryString() in this file also returns 1 when allocation
  fails) - confirm.
*/
int UdmCatFromTextBuf(UDM_CATEGORY *C, char *textbuf)
{
  const char  *htok, *last;
  UDM_HTMLTOK  tag;
  UDM_CATITEM  *grown;
  size_t    i, c;

  if (textbuf == NULL) return UDM_OK;
  UdmHTMLTOKInit(&tag);

  htok=UdmHTMLToken(textbuf,&last,&tag);

  if(!htok || tag.type != UDM_HTML_TAG)
    return UDM_OK;

  /* Keep the old array valid if the reallocation fails */
  c= C->ncategories;
  grown= (UDM_CATITEM*)UdmRealloc(C->Category, sizeof(UDM_CATITEM) * (c + 1));
  if (!grown)
    return 1;
  C->Category= grown;
  bzero((void*)&C->Category[c], sizeof(UDM_CATITEM));

  for(i = 1; i < tag.ntoks; i++)
  {
    size_t  nlen = tag.toks[i].nlen;
    size_t  vlen = tag.toks[i].vlen;
    char  *name = UdmStrndup(tag.toks[i].name, nlen);
    char  *data = UdmStrndup(tag.toks[i].val, vlen);

    /* Skip the attribute if either copy could not be allocated */
    if (name && data)
    {
      /*
        The item was zeroed above, so copying at most size-1 bytes
        leaves every text field 0-terminated.
      */
      if (!strcmp(name, "id"))
      {
        C->Category[c].rec_id = atoi(data);
      }
      else if (!strcmp(name, "path"))
      {
        strncpy(C->Category[c].path, data, sizeof(C->Category[c].path) - 1);
      }
      else if (!strcmp(name, "link"))
      {
        strncpy(C->Category[c].link, data, sizeof(C->Category[c].link) - 1);
      }
      else if (!strcmp(name, "name"))
      {
        strncpy(C->Category[c].name, data, sizeof(C->Category[c].name) - 1);
      }
    }

    UDM_FREE(name);
    UDM_FREE(data);
  }

  C->ncategories++;
  return UDM_OK;
}


/*
  Publish information about the query words as variables:
    nwords          - number of words in WWList
    wordN.word, wordN.count, wordN.order, wordN.origin - one set per word
    WE - "word : count" (or "word : stopword") for query words, spelling
         forms, synonyms, collation forms and stopwords
    W  - "word : count / total" for query words, where total is the sum of
         the counts of all words with the same order; stopwords are listed
         as "word : stopword"
    WS - suggested query, set only if at least one suggestion was used:
         each query word with a zero count is replaced by its most
         frequent suggestion

  wordN.count is an empty string for words whose origin is neither of
  the groups listed for WE.
  Always returns UDM_OK.
*/
static int
UdmWWListWordInfo(UDM_VARLIST *Vars, UDM_WIDEWORDLIST *WWList)
{
  size_t  len, i, j, wsize;
  char  *wordinfo= NULL, *end;
  size_t  corder= (size_t)-1, ccount= 0;
  int have_suggestions= 0;

  /* Every word contributes its text plus a fixed amount of decoration */
  for(len = i = 0; i < WWList->nwords; i++)
    len += WWList->Word[i].len + 64;

  wsize= (1+len)*sizeof(char);
  wordinfo= (char*) UdmMalloc(wsize);
  *wordinfo= '\0';

  UdmVarListAddInt(Vars, "nwords", WWList->nwords);

  for(i = 0; i < WWList->nwords; i++)
  {
    char name[32], count[32];

    /* Origins not handled below must not publish stale or garbage text */
    count[0]= '\0';

    if (WWList->Word[i].origin == UDM_WORD_ORIGIN_QUERY ||
        WWList->Word[i].origin == UDM_WORD_ORIGIN_SPELL ||
        WWList->Word[i].origin == UDM_WORD_ORIGIN_SYNONYM ||
        WWList->Word[i].origin == UDM_WORD_ORIGIN_COLLATION)
    {
      if(wordinfo[0])
        strcat(wordinfo,", ");
      sprintf(UDM_STREND(wordinfo)," %s : %d", WWList->Word[i].word, WWList->Word[i].count);
      sprintf(count, "%d", WWList->Word[i].count);
    }
    else if (WWList->Word[i].origin == UDM_WORD_ORIGIN_STOP)
    {
      if(wordinfo[0])
        strcat(wordinfo,", ");
      sprintf(UDM_STREND(wordinfo)," %s : stopword", WWList->Word[i].word);
      strcpy(count, "stopword");
    }
    /* i is a size_t: convert it explicitly to match the %d conversion */
    sprintf(name, "word%d.word", (int) i);
    UdmVarListAddStr(Vars, name, WWList->Word[i].word);
    sprintf(name, "word%d.count", (int) i);
    UdmVarListAddStr(Vars, name, count);
    sprintf(name, "word%d.order", (int) i);
    UdmVarListAddInt(Vars, name, WWList->Word[i].order);
    sprintf(name, "word%d.origin", (int) i);
    UdmVarListAddInt(Vars, name, WWList->Word[i].origin);
  }

  UdmVarListReplaceStr(Vars, "WE", wordinfo);

  *wordinfo = '\0';
  for(i = 0; i < WWList->nwords; i++)
  {
    /* Total count of all words sharing this word's order */
    corder= WWList->Word[i].order;
    ccount= 0;
    for(j= 0; j < WWList->nwords; j++)
      if (WWList->Word[j].order == corder)
        ccount += WWList->Word[j].count;
    if (WWList->Word[i].origin == UDM_WORD_ORIGIN_STOP)
    {
      sprintf(UDM_STREND(wordinfo),"%s%s : stopword", (*wordinfo) ? ", " : "",  WWList->Word[i].word);
    }
    else if (WWList->Word[i].origin == UDM_WORD_ORIGIN_QUERY)
    {
      sprintf(UDM_STREND(wordinfo),"%s%s : %d / %d", 
        (*wordinfo) ? ", " : "", WWList->Word[i].word, WWList->Word[i].count, ccount);
    }
  }
  UdmVarListReplaceStr(Vars, "W", wordinfo);

  *wordinfo= '\0';
  end= wordinfo;
  for (i= 0; i < WWList->nwords; i++)
  {
    UDM_WIDEWORD *Wi= &WWList->Word[i];
    UDM_WIDEWORD *Wb= NULL; /* word to print at this position, if any */

    if (Wi->origin == UDM_WORD_ORIGIN_QUERY)
    {
      if (Wi->count > 0)
      {
        Wb= Wi;
      }
      else
      {
        /* Word not found: pick the most frequent suggestion of the same order */
        ccount= 0;
        for (j= 0; j < WWList->nwords; j++)
        {
          UDM_WIDEWORD *Wj= &WWList->Word[j];
          if (Wj->origin == UDM_WORD_ORIGIN_SUGGEST &&
              Wj->order == Wi->order && Wj->count > ccount)
          {
            ccount= WWList->Word[j].count;
            Wb= Wj;
            have_suggestions= 1;
          }
        }
      }
    }
    else if (Wi->origin == UDM_WORD_ORIGIN_STOP)
    {
      Wb= Wi;
    }

    if (Wb)
    {
      sprintf(end, "%s%s", wordinfo[0] ? " " : "", Wb->word);
      end= end + strlen(end);
    }
  }

  if (have_suggestions)
    UdmVarListReplaceStr(Vars, "WS", wordinfo);
  UDM_FREE(wordinfo);
  return UDM_OK;
}


/*
  Publish word statistics of a search result into the environment
  variables; see UdmWWListWordInfo() for the variables that are set.
  Returns the result of UdmWWListWordInfo().
*/
int UdmResWordInfo(UDM_ENV *Env, UDM_RESULT *Res)
{
  UDM_VARLIST *vars= &Env->Vars;
  UDM_WIDEWORDLIST *words= &Res->WWList;

  return UdmWWListWordInfo(vars, words);
}
