/*
 *   Copyright (C) 1997, 1998
 *   	Free Software Foundation, Inc.
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_WORDLIST_H
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <unistd.h>

#include <WordList.h>
#include <WordType.h>
#include <Configuration.h>

#include <sqlutil.h>
#include <hooks.h>
#include <html_content.h>

#include <strshift.h>
#include <rm.h>
#include <salloc.h>
#include <hash.h>
#include <md5str.h>
#include <isomap.h>
#include <mkdirp.h>

/*
 * Verbosity level for this file. hook_verbose() adds to it and
 * hook_rebuild_start() prints the query it runs when it is non-zero.
 */
static int verbose = 0;

/*
 * Default configuration entries handed to config.Defaults() by the
 * WebbaseMifluz constructor. The list is terminated by an entry
 * whose first member is 0.
 */
static ConfigDefaults defaults[] = {
  { "wordlist_extend", "true", 0 },
  { "wordlist_cache_size", "134217728", 0 },
  { "wordlist_page_size", "32768", 0 },
  { "wordlist_compress", "1", 0 },
  { "minimum_word_length", "1", 0 },
  { "maximum_word_length", "25", 0 },
  { 0 }
};

/*
 * Configuration object loaded with the defaults above and passed to
 * the WordList constructor by WebbaseMifluz.
 */
static Configuration	config;

class WebbaseMifluz
{
public:
  //
  // Construction/Destruction
  //
  WebbaseMifluz() {
    max_insert = 1024 * 1024 * 1024;
    insert_count = 0;
    config.Defaults(defaults);
    inv = new WordList(config);
  }
  ~WebbaseMifluz() {
    Close();
    delete inv;
  }

  int Open(webbase_t* base);
  int Close();

  int Validate(webbase_t*) { return OK; }

  char* Database(webbase_t* base) const;

  int  Insert(webbase_t* base, webbase_url_t* webbase_url);
  int  InsertChunk(int info, html_content_parsed_t* parsed);
  int  InsertContent(const char* buffer, int buffer_length);

  int  RebuildStart(webbase_t* base);

  int Watch(webbase_t* base) {
    int ret = OK;
    insert_count++;
    if(insert_count >= max_insert) {
      insert_count = 0;
      fprintf(stderr, "force validate after %d insertions\n", max_insert);
      ret = Validate(base);
    }
    return ret;
  }

  int insert_count;
  int max_insert;
  int server;
  WordReference wordRef;
  WordList* inv;
};

/*
 * Close the index, remove the file returned by Database() and open
 * the index again. Returns the result of Open().
 */
int WebbaseMifluz::RebuildStart(webbase_t* base)
{
  /*
   * The path is computed before the index is closed, as the
   * original sequence did.
   */
  char* index_path = Database(base);

  Close();
  unlink(index_path);
  free(index_path);

  const int status = Open(base);
  return status;
}

/*
 * Compute the path of the index file for base.
 *
 * If the FULLTEXTDB environment variable is set, a copy of its value
 * is returned. Otherwise the path is <root>/db/M<name>/<name> where
 * <root> is the value of ECILAROOT (or "." if unset) and <name> is
 * base->name; the directory <root>/db/M<name> is created with
 * mkdirp() as a side effect.
 *
 * The returned string is heap allocated, the caller must free() it.
 */
char* WebbaseMifluz::Database(webbase_t* base) const
{
  /*
   * getenv results and the "." literal are read only: keep them in
   * const pointers, separate from the buffer that is returned.
   */
  const char* override_path = getenv("FULLTEXTDB");
  if(override_path != 0)
    return strdup(override_path);

  const char* ecilaroot = getenv("ECILAROOT");
  if(!ecilaroot)
    ecilaroot = ".";

  /*
   * base->name appears twice in the final path, the 128 extra
   * bytes cover the fixed parts and the trailing null.
   */
  char* database = smalloc(strlen(ecilaroot) + strlen(base->name) * 2 + 128);
  /*
   * First build the directory name so that it can be created, then
   * overwrite the buffer with the full file name.
   */
  sprintf(database, "%s/db/M%s", ecilaroot, base->name);
  mkdirp(database, 0777);
  sprintf(database, "%s/db/M%s/%s", ecilaroot, base->name, base->name);

  return database;
}

/*
 * Open the index file returned by Database() in O_RDWR mode and
 * return the result of WordList::Open. If the environment variable
 * ECILA_MIFLUZ_MAX_INSERT is set, its integer value replaces
 * max_insert.
 */
int WebbaseMifluz::Open(webbase_t* base)
{
  char* path = Database(base);

  const char* limit = getenv("ECILA_MIFLUZ_MAX_INSERT");
  if(limit != 0)
    max_insert = atoi(limit);

  const int status = inv->Open(path, O_RDWR);

  /* The path belongs to us, see Database(). */
  free(path);

  return status;
}

/*
 * Reset the insertion counter used by Watch() and close the index.
 * Returns the result of WordList::Close.
 */
int WebbaseMifluz::Close()
{
  insert_count = 0;

  const int status = inv->Close();
  return status;
}

/*
 * Handle one chunk reported by the HTML parser: select the tag of
 * the insertion key according to info and insert the words of the
 * relevant buffer.
 *
 *  - HTML_CONTENT_META: buffer0 is compared (case insensitive
 *    prefix) with "keyword" and "description"; on a match the words
 *    of buffer1 are inserted, otherwise nothing is inserted.
 *  - HTML_CONTENT_TITLE, HTML_CONTENT_TEXT: the words of buffer0
 *    are inserted with the title or body tag.
 *  - any other value: the tag of the key is left as it is and the
 *    words of buffer0 are inserted.
 *
 * Returns 1 when nothing is inserted, the result of InsertContent()
 * otherwise.
 */
int WebbaseMifluz::InsertChunk(int info, html_content_parsed_t* parsed)
{
  WordKey& key = wordRef.Key();
  char* text = parsed->buffer0;
  int text_length = parsed->buffer0_length;

  if(info == HTML_CONTENT_META) {
    if(strncasecmp(parsed->buffer0, "keyword", 7) == 0) {
      key.SetTag(WEBBASE_URL_TAG_KEY);
    } else if(strncasecmp(parsed->buffer0, "description", 11) == 0) {
      key.SetTag(WEBBASE_URL_TAG_DESCRIPTION);
    } else {
      /* Not a meta field we are interested in. */
      return 1;
    }
    /* For meta chunks the content is in the second buffer. */
    text = parsed->buffer1;
    text_length = parsed->buffer1_length;
  } else if(info == HTML_CONTENT_TITLE) {
    key.SetTag(WEBBASE_URL_TAG_TITLE);
  } else if(info == HTML_CONTENT_TEXT) {
    key.SetTag(WEBBASE_URL_TAG_BODY);
  }

  if(!text)
    return 1;

  return InsertContent(text, text_length);
}

/*
 * Split the buffer into words and insert each of them in the index.
 *
 * The buffer is first handed to unaccent(). A word is a maximal run
 * of characters for which WordType::IsChar is true. For each word
 * the location of the insertion key is incremented, the word is
 * stored in the key and wordRef is inserted.
 *
 * Always returns 1, the result of WordList::Insert is not checked.
 */
int WebbaseMifluz::InsertContent(const char* buffer, int buffer_length)
{
  const WordType& wtype = inv->GetWordType();
  WordKey& key = wordRef.Key();
  const char* const limit = buffer + buffer_length;
  const char* cursor = buffer;

  unaccent(buffer, buffer_length);

  while(cursor < limit) {
    /* Skip what separates words. */
    while(cursor < limit && !wtype.IsChar(*cursor))
      ++cursor;

    /* Walk to the end of the word, if any. */
    const char* const first = cursor;
    while(cursor < limit && wtype.IsChar(*cursor))
      ++cursor;

    if(first != cursor) {
      key.GetLocation()++;
      key.SetWord(String(first, cursor - first));
      inv->Insert(wordRef);
    }
  }

  return 1;
}

/*
 * C callback redirecting to WebbaseMifluz member function.
 */
static int html_content_collect(int info, html_content_parsed_t* parsed, void* data)
{
  WebbaseMifluz* mifluz = (WebbaseMifluz*)data;
  return mifluz->InsertChunk(info, parsed);
}

/*
 * Index the document designated by webbase_url.
 *
 * The insertion context is cleared, then loaded with the rowid of
 * the URL and the current server. The HTML parser is run on the
 * file whose name is returned by uri_furi_string() for the URL and
 * calls back InsertChunk() through html_content_collect().
 *
 * Returns the result of html_content_parse(). base is not used.
 */
int WebbaseMifluz::Insert(webbase_t* base, webbase_url_t* webbase_url)
{
  /*
   * Reset the insertion context and identify the document.
   */
  WordKey& key = wordRef.Key();
  wordRef.Clear();
  key.SetURL(webbase_url->w_rowid);
  key.SetServer(server);

  /*
   * Describe the source to the HTML parser: a file name.
   */
  html_content_t parser_state;
  html_content_reset(&parser_state);
  parser_state.parser.info = HTML_SOURCE_FILENAME;

  /*
   * NOTE(review): the result of uri_furi_string is used without
   * being checked, confirm that it cannot be null.
   */
  char* source_path = uri_furi_string(webbase_url->w_url, strlen(webbase_url->w_url), URI_FURI_REAL_PATH);
  parser_state.parser.source = source_path;
  parser_state.parser.source_length = strlen(source_path);

  /*
   * Chunks are redirected to this object.
   */
  parser_state.content_callback = html_content_collect;
  parser_state.content_data = this;

  return html_content_parse(&parser_state);
}

/*
 * The single WebbaseMifluz object used by the hook_* functions.
 * Allocated by hook_init(), deleted and reset to 0 by hook_end().
 */
static WebbaseMifluz* mifluz = 0;

/*
 * Add level to the verbosity level of this file.
 */
void hook_verbose(int level)
{
  verbose = verbose + level;
}

/*
 * Allocate the WebbaseMifluz object on first call and open the
 * index of base. The result of Open() is not checked.
 */
void hook_init(webbase_t* base)
{
  if(mifluz == 0) {
    mifluz = new WebbaseMifluz();
  }

  mifluz->Open(base);
}

/*
 * Argument bundle for the row callback hook_rehook_start_fix, only
 * referenced from code that is currently disabled with #if 0.
 */
typedef struct walk_arg {
  webbase_t* base;	/* base in which URLs are searched */
  char* table;		/* table name used in the delete query */
  int base_num;		/* compared with the result of hook_url2base */
} walk_arg_t;

/*
 * DISABLED CODE: everything up to the matching #endif is not compiled.
 *
 * Row callback that reconciles one index row (E_FILE, FT_CID) with
 * the url table: the hookid of the URL is set to FT_CID when the URL
 * is found in the expected base, otherwise the row is deleted from
 * the index table.
 *
 * NOTE(review): it relies on names that are not defined in this
 * file (split_base, MIFLUZnormalize_path, MIFLUZexec, ...), check
 * them before enabling it again.
 */
#if 0
void hook_rehook_start_fix(Hash_Table* row, char* args)
{
  walk_arg_t* walk_arg = (walk_arg_t*)args;

  char* url_string = hash_get(row, "E_FILE");
  char* cid_string = hash_get(row, "FT_CID");
  int cid;
  
  if(!url_string) {
    fprintf(stderr, "missing E_FILE\n");
    return;
  }
  if(!cid_string) {
    fprintf(stderr, "missing FT_CID for %s\n", url_string);
    return;
  }
  cid = atoi(cid_string);

  if(!verbose) {
    static int count = 0;
    /*
     * Print a dot every 1000 calls.
     */
    if(count > 1000) {
      fprintf(stderr, ".");
      count = 0;
    }
    count++;
  }

  /*
   * Transform Mifluz strange paths into URL
   */
  MIFLUZnormalize_path(url_string);
  {
    /*
     * Search for URL in base
     */
    char* query = smalloc(strlen(url_string) + 128);
    char hookid_string[128 + 1];
    int hookid = 0;
    int split_ok = 1;
    int found;
    /*
     * NOTE(review): query is sized from the unquoted URL, confirm
     * that the quoted form cannot be longer than the 128 bytes of
     * slack allow.
     */
    sprintf(query, "select hookid from url where url = '%s'", sql_quote_char_simple(url_string));
    found = sql_select_value(&walk_arg->base->mysql, query, hookid_string, 128);
    /*
     * The subtracted offset is the one hook_rehook_start adds to
     * mark the hookids.
     */
    if(found)
      hookid = atoi(hookid_string) - (HOOK_MAX_RECORDS * 2);
    
    /*
     * If the bases are split, check that the table name ends with
     * the expected number.
     */
    if(split_base) {
      int base_num = hook_url2base(url_string, strlen(url_string));
      split_ok = (base_num == walk_arg->base_num);
    }
    /*
     * NOTE(review): the format has two %s but three arguments are
     * given, hookid_string is never printed.
     */
    if(verbose) fprintf(stderr, "%s %s ", url_string, cid_string, hookid_string);
    /*
     * If found in base and the split table is as expected
     */
    if(split_ok && found) {
      /*
       * If hookid compares equal, fine
       */
      if(hookid == cid) {
	if(verbose) fprintf(stderr, "(ok)\n");
      } else {
	/*
	 * If hookid mismatch just update the hookid in base
	 */
	if(verbose) fprintf(stderr, "(update hookid %d -> %d)\n", hookid, cid);
      }
      /*
       * Replace hookid either to fix or to remove mark
       */
      sprintf(query, "update url set hookid = %d where url = '%s'", cid, sql_quote_char_simple(url_string));
      smysql_query(&walk_arg->base->mysql, query);
    } else {
      /*
       * Deleted if not found or base number mismatch
       */
      if(!found) {
	if(verbose) fprintf(stderr, "(not in base)");
      } else if(!split_ok) {
	/*
	 * No need to update base, the mark is still there and
	 * the URL will be hooked correctly later on.
	 */
	if(verbose) fprintf(stderr, "(wrong base)");
      } else {
	if(verbose) fprintf(stderr, "(???)");
      }
      sprintf(query, "delete from %s where ft_cid = %s", walk_arg->table, cid_string); 
      MIFLUZexec(query);
      if(verbose) fprintf(stderr, " removed\n");
    }

    free(query);
  }
}
#endif

/*
 * Rehook one index table. The implementation is disabled with #if 0,
 * the function currently ignores its arguments and returns 1.
 *
 * NOTE(review): the disabled code refers to base and table, which
 * are no longer named in the parameter list.
 */
int hook_rehook_start_1(webbase_t*, char*)
{
#if 0
  char query[256];
  walk_arg_t walk_arg;
  walk_arg.base = base;
  walk_arg.table = table;
  walk_arg.base_num = hook_table2base(table);

  sprintf(query, "unprotect table %s", table);
  MIFLUZexec(query);
  sprintf(query, "set max_search_rows %d", HOOK_MAX_RECORDS);
  MIFLUZexec(query);
  sprintf(query, "select ft_cid,e_file from %s", table);
  MIFLUZwalk(query, hook_rehook_start_fix, (char*)&walk_arg);
  sprintf(query, "set max_search_rows %d", MIFLUZ_MAX_ROWS);
  MIFLUZexec(query);
  MIFLUZvalidate(table);
#endif
  return 1;
}

/*
 * Called by hook_rehook_end and hook_rebuild_end. Does nothing and
 * returns OK.
 */
static int hook_validate(webbase_t*)
{
  return OK;
}

/*
 * Start a rehook. The implementation is disabled with #if 0, the
 * function currently ignores its argument and returns 1.
 *
 * NOTE(review): the disabled code refers to base, which is no
 * longer named in the parameter list.
 */
int hook_rehook_start(webbase_t*)
{
#if 0
  char query[256];

  /*
   * First mark all hookids.
   */
  sprintf(query, "update url set hookid = hookid + %d where hookid < %d", HOOK_MAX_RECORDS * 2, HOOK_MAX_RECORDS * 2);
  if(verbose) fprintf(stderr, "%s\n", query);
  smysql_query(&base->mysql, query);

  if(split_base) {
    int i;
    char* table = smalloc(strlen(base->name) + 32);
    for(i = 0; i < SPLIT_MAX; i++) {
      sprintf(table, "%s%02d", base->name, i);
      if(verbose) fprintf(stderr, "handle index %s\n", table);
      hook_rehook_start_1(base, table);
    }
    free(table);
  } else {
    hook_rehook_start_1(base, base->name);
  }
#endif
  return 1;
}

/*
 * End of a rehook: return the result of hook_validate().
 */
int hook_rehook_end(webbase_t* base)
{
  const int status = hook_validate(base);
  return status;
}

/*
 * Start a rebuild: set the hookid of every row of the url table to
 * 0, then call RebuildStart() on the index and return its result.
 * The query is printed on stderr when verbose is set.
 */
int hook_rebuild_start(webbase_t* base)
{
  /* Writable copy of the query, it takes no parameter. */
  char reset_hookids[] = "update url set hookid = 0";

  if(verbose) {
    fprintf(stderr, "%s\n", reset_hookids);
  }
  smysql_query(&base->mysql, reset_hookids);

  return mifluz->RebuildStart(base);
}

/*
 * End of a rebuild: return the result of hook_validate().
 */
int hook_rebuild_end(webbase_t* base)
{
  const int status = hook_validate(base);
  return status;
}

/*
 * Tell if webbase_url must be indexed. Returns 1 if the
 * WEBBASE_URL_INFO_CONTENT bit of w_info is set, w_content_length
 * is greater than zero and webbase_url_robots_p() is false for the
 * URL. Returns 0 otherwise.
 */
int hook_ok(webbase_t*, webbase_url_t* webbase_url)
{
  if(!(webbase_url->w_info & WEBBASE_URL_INFO_CONTENT))
    return 0;

  if(!(webbase_url->w_content_length > 0))
    return 0;

  return !webbase_url_robots_p(webbase_url) ? 1 : 0;
}

/*
 * Buffers filled by hook_info_set(): info_field receives the list of
 * field names, info_value the list of values. The *_size variables
 * are the sizes handed to static_alloc().
 *
 * NOTE(review): both pointers start on a string literal although
 * they are not const, confirm that static_alloc() never writes to
 * or reallocates a pointer whose size is 0.
 */
static char* info_field = "";
static int info_field_size = 0;
static char* info_value = "";
static int info_value_size = 0;

/*
 * Parse info, a string of the form field=value;field=value..., and
 * rebuild the info_field and info_value buffers from it.
 *
 * For each entry ", <field>" is appended to info_field. If the
 * value starts with a digit ", <value>" is appended to info_value,
 * otherwise ", '<value>'". Parsing stops at the first entry that
 * contains no '='.
 *
 * Nothing is done, and the buffers are left untouched, if info is
 * null or empty. info itself is never modified, a private copy is
 * parsed.
 *
 * NOTE(review): values are enclosed in quotes without being
 * escaped, check how the buffers are used before feeding this
 * function with untrusted input.
 */
void hook_info_set(char* info)
{
  if(!info || info[0] == '\0')
    return;

  /*
   * The parser writes nulls in the string, work on a copy.
   */
  char* copy = strdup(info);
  if(!copy) {
    fprintf(stderr, "hook_info_set: cannot allocate memory\n");
    return;
  }
  int info_length = strlen(copy);

  static_alloc(&info_field, &info_field_size, info_length + 128);
  /*
   * An entry adds up to four characters (, space and two quotes)
   * to its value while it may use only two characters of the
   * input (= and ;): the value list can be up to twice as long as
   * the input.
   */
  static_alloc(&info_value, &info_value_size, info_length * 2 + 128);

  /*
   * Write positions, moved forward by the number of characters
   * sprintf reports.
   */
  char* field_out = info_field;
  char* value_out = info_value;
  *field_out = '\0';
  *value_out = '\0';

  char* p = copy;
  char* equal;
  while(p && (equal = strchr(p, '='))) {
    char* field = p;
    *equal = '\0';
    char* value = equal + 1;

    /*
     * The entry ends at the next ; or at the end of the string, in
     * which case p becomes null and the loop stops.
     */
    p = strchr(value, ';');
    if(p) {
      *p++ = '\0';
    }

    field_out += sprintf(field_out, ", %s", field);

    /*
     * isdigit is only defined for values representable as unsigned
     * char (and EOF), char may be negative.
     */
    if(isdigit((unsigned char)value[0])) {
      value_out += sprintf(value_out, ", %s", value);
    } else {
      value_out += sprintf(value_out, ", '%s'", value);
    }
  }

  free(copy);
}

/*
 * Count one insertion, see WebbaseMifluz::Watch. Returns its result.
 */
int hook_insert_watch(webbase_t* base)
{
  const int status = mifluz->Watch(base);
  return status;
}

/*
 * Index the document of webbase_url. Always returns 1, the result
 * of WebbaseMifluz::Insert is discarded.
 */
int hook_insert(webbase_t* base, webbase_url_t* webbase_url)
{
  (void)mifluz->Insert(base, webbase_url);

  return 1;
}

/*
 * Update hook. Ignores its arguments, does nothing and returns 1.
 */
int hook_update(webbase_t*, webbase_url_t*)
{
  return 1;
}

/*
 * Delete one entry by id. The implementation is disabled with #if 0,
 * the function currently ignores its arguments and returns 1.
 * hook_delete() passes the base, the hookid, the URL and the URL
 * length, in that order.
 *
 * NOTE(review): the disabled code refers to base, id, url and
 * url_length, which are no longer named in the parameter list.
 */
int hook_delete_id(webbase_t*, int, char*, int)
{
#if 0
  char query[64];
  char table[32];

  if(split_base) {
    unsigned char digeststr[16];
    int i;
    cmd5hash(url, url_length, digeststr);
    i = (digeststr[15] & 0xf);
    sprintf(table, "%s%02d", base->name, i);
    sprintf(query, "DELETE FROM %s%02d WHERE FT_CID = %d", base->name, i, id);
  } else {
    strcpy(table, base->name);
    sprintf(query, "DELETE FROM %s WHERE FT_CID = %d", base->name, id);
  }
  MIFLUZlock(table);
  MIFLUZexec(query);
#endif
  return 1;
}

/*
 * Fix hook. Ignores its arguments and does nothing.
 */
void hook_fix(webbase_t*, webbase_url_t*)
{
}

/*
 * Delete the entry of webbase_url: call hook_delete_id() with its
 * hookid and URL, then reset the hookid to 0. Always returns OK,
 * the result of hook_delete_id() is discarded.
 */
int hook_delete(webbase_t* base, webbase_url_t* webbase_url)
{
  char* url = webbase_url->w_url;
  const int url_length = strlen(url);

  hook_delete_id(base, webbase_url->w_hookid, url, url_length);
  webbase_url->w_hookid = 0;

  return OK;
}

/*
 * Release the WebbaseMifluz object, if any, and forget it so that
 * hook_init() allocates a new one.
 */
void hook_end()
{
  /* Deleting a null pointer has no effect, no test is needed. */
  delete mifluz;
  mifluz = 0;
}

/*
 * Record the server used by the next insertions. Ignored if the
 * WebbaseMifluz object is not allocated.
 */
void hook_server(int server)
{
  if(!mifluz)
    return;

  mifluz->server = server;
}

#endif /* HAVE_WORDLIST_H */
