/*
 *   Copyright (C) 1997, 1998
 *   	Free Software Foundation, Inc.
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_MIFLUZ_H

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <unistd.h>

#include <strshift.h>
#include <rm.h>
#include <salloc.h>
#include <hash.h>
#include <md5str.h>
#include <isomap.h>
#include <mkdirp.h>
#include <file_exists.h>

#include <sqlutil.h>
#include <html_content.h>
#include <WebbaseHookMifluz.h>

/*
 * Positions of the fields inside a word key, as used by the key.Set()
 * calls in this file. The order is the same as in the builtin default
 * wordlist_wordkey_description set in WebbaseHookMifluz::Parse
 * ("Word 8/Server 19/URL 32/Tag 5/Location 16").
 * NOTE(review): a configuration file that overrides that description
 * presumably has to keep the same field order -- confirm.
 */
#define POS_Word     0
#define POS_Server   1
#define POS_URL      2
#define POS_Tag      3
#define POS_Location 4 


extern "C" {
void* hooksmifluz_init();
}

//
// Dynamic loading symbol and initialization function
//
void* hooksmifluz_init()
{
  return (void*)new WebbaseHookMifluz();
}

//
// Forward declaration of the C callback installed as the HTML parser
// content_callback in WebbaseHookMifluz::Insert; it is defined further
// down in this file.
//
static int html_content_collect(int info, html_content_parsed_t* parsed, void* data);

//
// Start with nothing allocated: the context and word reference are
// created by Parse(), the inverted index by Open().
//
WebbaseHookMifluz::WebbaseHookMifluz()
{
  inv = 0;
  wordRef = 0;
  context = 0;
}

//
// Release the inverted index first (through Close()), then the word
// reference and the context created by Parse().
//
WebbaseHookMifluz::~WebbaseHookMifluz()
{
  Close();
  // delete on a null pointer is a no-op, no explicit test is needed.
  delete wordRef;
  delete context;
}

struct option* WebbaseHookMifluz::Options()
{
  static struct option long_options[] = {
    {"verbose_hooks", 0, 0, 0},
    {"hook_cache_size", 1, 0, 0},
    {"hook_page_size", 1, 0, 0},
    {"hook_compress", 0, 0, 0},
    {"index_file", 1, 0, 0},
    {"index_config", 1, 0, 0},
    {0, 0, 0, 0}
  };

  return long_options;
}

void WebbaseHookMifluz::Parse(int argc, char** argv)
{
  WebbaseGetopt::Parse(argc, argv);

  static ConfigDefaults defaults[] = {
    { "wordlist_extend", "true", 0 },
    { "wordlist_minimum_word_length", "1", 0 },
    { "wordlist_maximum_word_length", "25", 0 },
    { "wordlist_cache_size", "10000000", 0 }, // 10MB
    { "wordlist_cache_max", "100000000", 0 }, // 100MB

    { "wordlist_page_size", "8192", 0 },     // 8KB
    { "wordlist_compress", "1", 0 },
    { "wordlist_wordkey_description","Word 8/Server 19/URL 32/Tag 5/Location 16"},
    { 0, 0, 0 }
  };

  index_dir = DEFAULT_INDEX_DIR;
  if(index_base.empty())
    index_base = index_dir + String("/index");

  Configuration config;
  config.Defaults(defaults);

  if(index_config.empty())
    index_config = WordContext::ConfigFile();

  if(!index_config.empty()) {
    if(file_exists((char*)index_config)) {
      if(verbose) fprintf(stderr, "WebbaseHookMifluz::Parse: using mifluz configuration file %s\n", (char*)index_config);
      config.Read(index_config);
    } else {
      fprintf(stderr, "WebbaseHookMifluz::Parse: -index_config %s file does not exist\n", (char*)index_config);
    }
  } else {
    if(verbose) fprintf(stderr, "WebbaseHookMifluz::Parse: no -index_config parameter, using builtin defaults\n");
  }

  if(!cache_size.empty()) config.Add("wordlist_cache_size", cache_size);
  if(!page_size.empty()) config.Add("wordlist_page_size", page_size);
  if(!compress.empty()) config.Add("wordlist_compress", compress);

  context = new WordContext(config);
  wordRef = context->Word();
}

//
// Handle one long option. Options known to WebbaseHook take precedence;
// otherwise the option is matched against the ones specific to this hook.
// Returns 1 if the option was recognized, 0 otherwise.
//
int WebbaseHookMifluz::HandleOption(struct option* option)
{
  if(WebbaseHook::HandleOption(option)) {
    return 1;
  }

  const char* name = option->name;

  if(strcmp(name, "hook_cache_size") == 0) {
    cache_size = optarg;
    return 1;
  }
  if(strcmp(name, "hook_page_size") == 0) {
    page_size = optarg;
    return 1;
  }
  if(strcmp(name, "hook_compress") == 0) {
    // Plain flag: no argument to read.
    compress = "true";
    return 1;
  }
  if(strcmp(name, "index_config") == 0) {
    index_config = optarg;
    return 1;
  }
  if(strcmp(name, "index_file") == 0) {
    index_base = optarg;
    return 1;
  }

  return 0;
}

//
// Initialize the hook for the given base and open the index read/write.
//
// A failure to open the index (non zero return of Open(), the same test
// RebuildStart() uses) is reported on stderr instead of being silently
// dropped. The return value is left at 1 in every case so that existing
// callers see no change.
//
int WebbaseHookMifluz::Init(webbase_t* base)
{
  WebbaseHook::Init(base);

  int ret = Open(O_RDWR);
  if(ret != 0) {
    fprintf(stderr, "WebbaseHookMifluz::Init: cannot open index %s (error %d)\n", Database(), ret);
  }

  return 1;
}

//
// Tell whether a URL should be indexed. Returns 1 when the URL has the
// WEBBASE_URL_INFO_CONTENT flag set, a strictly positive content length
// and is not reported as a robots file by webbase_url_robots_p;
// returns 0 otherwise.
//
int WebbaseHookMifluz::Ok(webbase_url_t* webbase_url)
{
  if(!(webbase_url->w_info & WEBBASE_URL_INFO_CONTENT)) {
    return 0;
  }
  if(!(webbase_url->w_content_length > 0)) {
    return 0;
  }
  if(webbase_url_robots_p(webbase_url)) {
    return 0;
  }
  return 1;
}

//
// Index the document attached to a URL.
//
// The shared word reference is cleared and its key is primed with the
// URL row id and the current server id; the file holding the document is
// then run through the HTML parser, which calls back into InsertChunk()
// (via html_content_collect) for each piece of content.
// Returns whatever html_content_parse returns.
//
int WebbaseHookMifluz::Insert(webbase_url_t* webbase_url)
{
  if(verbose) {
    fprintf(stderr, "indexing %s\n", webbase_url->w_url);
  }

  //
  // Insertion context: fields common to every word of this document.
  //
  WordKey& key = wordRef->Key();
  wordRef->Clear();
  key.Set(POS_Server, server_id);
  key.Set(POS_URL, webbase_url->w_rowid);

  //
  // The parser reads the document from the file named by
  // webbase_url_file (HTML_SOURCE_FILENAME mode).
  //
  char* path = webbase_url_file(webbase_url);

  html_content_t walk;
  html_content_reset(&walk);
  walk.parser.info = HTML_SOURCE_FILENAME;
  walk.parser.source = path;
  walk.parser.source_length = strlen(path);
  walk.content_data = (void*)this;
  walk.content_callback = html_content_collect;

  return html_content_parse(&walk);
}

//
// Not implemented for this hook: does nothing and always returns 0.
//
int WebbaseHookMifluz::InsertWatch()
{
  return 0;
}

//
// Not implemented for this hook: the URL argument is ignored and the
// function always returns 0.
//
int WebbaseHookMifluz::Update(webbase_url_t*)
{
  return 0;
}

//
// Not implemented for this hook: the URL argument is ignored, nothing is
// removed from the index and the function always returns 0.
//
int WebbaseHookMifluz::Delete(webbase_url_t*)
{
  return 0;
}

//
// Not implemented for this hook: all three arguments are ignored and the
// function always returns 0.
//
int WebbaseHookMifluz::DeleteId(int, char*, int)
{
  return 0;
}

//
// Not implemented for this hook: does nothing and always returns 0.
//
int WebbaseHookMifluz::RehookStart()
{
  return 0;
}

//
// Not implemented for this hook: does nothing and always returns 0.
//
int WebbaseHookMifluz::RehookEnd()
{
  return 0;
}

//
// Begin a full rebuild: the index is closed, reopened with O_TRUNC so
// that it starts empty, and switched to batch mode.
// Returns 1 on success, or the non zero value returned by Open() if the
// index could not be reopened.
//
int WebbaseHookMifluz::RebuildStart()
{
  WebbaseHook::RebuildStart();

  //
  // Reset the index itself
  //
  Close();
  const int status = Open(O_RDWR | O_TRUNC);
  if(status != 0) {
    return status;
  }

  inv->BatchStart();
  return 1;
}

//
// Finish a full rebuild: leave batch mode and close the index.
// Always returns 1.
//
int WebbaseHookMifluz::RebuildEnd()
{
  WebbaseHook::RebuildEnd();
  //
  // inv is null whenever the index is closed (see Close()); do not
  // dereference it in that case.
  //
  if(inv) {
    inv->BatchEnd();
  }
  Close();
  return 1;
}

//
// Not implemented for this hook: the argument is ignored.
//
void WebbaseHookMifluz::InfoSet(char*)
{
}

//
// Record the identifier of the current server. Insert() stores it in the
// POS_Server field of the key of every word indexed afterwards.
//
void WebbaseHookMifluz::Server(int server_arg)
{
  server_id = server_arg;
}

//
// Not implemented for this hook: does nothing.
//
void WebbaseHookMifluz::End()
{
}

//
// Mifluz implementation
//

//
// insert helpers
//
/*
 * C callback redirecting to WebbaseHookMifluz member function.
 */
static int html_content_collect(int info, html_content_parsed_t* parsed, void* data)
{
  WebbaseHookMifluz* mifluz = (WebbaseHookMifluz*)data;
  return mifluz->InsertChunk(info, parsed);
}

//
// Handle one piece of content reported by the HTML parser.
//
// The kind of content (info) selects the tag stored in the word key and
// the buffer that holds the text to index:
//   - META: buffer0 is matched against "keyword" / "description"
//     (case insensitive prefix) and the text is taken from buffer1;
//     any other meta is skipped.
//   - TITLE, TEXT: the text is buffer0.
//   - anything else: buffer0 is indexed with the tag left untouched.
// Returns the result of InsertContent(), or 1 when there is nothing to
// index.
//
int WebbaseHookMifluz::InsertChunk(int info, html_content_parsed_t* parsed)
{
  WordKey& key = wordRef->Key();
  unsigned char* text = parsed->buffer0;
  int text_length = parsed->buffer0_length;

  if(info == HTML_CONTENT_META) {
    const char* meta_name = (const char*)parsed->buffer0;

    if(strncasecmp(meta_name, "keyword", 7) == 0) {
      key.Set(POS_Tag, WEBBASE_URL_TAG_KEY);
      text = parsed->buffer1;
      text_length = parsed->buffer1_length;
    } else if(strncasecmp(meta_name, "description", 11) == 0) {
      key.Set(POS_Tag, WEBBASE_URL_TAG_DESCRIPTION);
      text = parsed->buffer1;
      text_length = parsed->buffer1_length;
    } else {
      // Meta information that is not indexed.
      text = 0;
      text_length = 0;
    }
  } else if(info == HTML_CONTENT_TITLE) {
    key.Set(POS_Tag, WEBBASE_URL_TAG_TITLE);
  } else if(info == HTML_CONTENT_TEXT) {
    key.Set(POS_Tag, WEBBASE_URL_TAG_BODY);
  }

  if(!text) {
    return 1;
  }

  return InsertContent((const char*)text, text_length);
}

//
// Split a chunk of text into words and store each of them in the index.
//
// buffer        text to index; it is not modified, the work is done on a
//               private copy that is passed through unaccent() first.
// buffer_length number of bytes of buffer to consider.
//
// A word is a maximal run of bytes for which the word type of the
// context answers true to IsChar(). For each word the POS_Location field
// of the key is incremented (unless that would overflow, which is
// reported on stderr) and the word reference is stored with
// inv->Override().
//
// Returns 1 when the chunk was processed (including the case of an empty
// chunk), 0 if the working copy of the text could not be allocated.
//
int WebbaseHookMifluz::InsertContent(const char* buffer, int buffer_length)
{
  const WordType& wtype = context->GetType();
  WordKey& key = wordRef->Key();
  key.Set(POS_Word, WORD_KEY_VALUE_INVALID);
  key.Set(POS_Location, WORD_KEY_VALUE_INVALID);

  //
  // Nothing to tokenize. This also keeps a null pointer or a negative
  // length away from malloc/memcpy below.
  //
  if(buffer == 0 || buffer_length <= 0)
    return 1;

  unsigned char* buffer_tmp = (unsigned char*)malloc(buffer_length + 1);
  if(buffer_tmp == 0) {
    fprintf(stderr, "WebbaseHookMifluz::InsertContent: cannot allocate %d bytes\n", buffer_length + 1);
    return 0;
  }
  memcpy(buffer_tmp, buffer, buffer_length);
  buffer_tmp[buffer_length] = '\0';

  unaccent(buffer_tmp, buffer_length);

  const unsigned char* end = buffer_tmp + buffer_length;
  const unsigned char* p = buffer_tmp;
  while(p < end) {
    //
    // Skip separators, then walk to the end of the word.
    //
    const unsigned char* word_start = p;
    while(word_start < end && !wtype.IsChar(*word_start))
      word_start++;
    p = word_start;
    while(p < end && wtype.IsChar(*p))
      p++;

    if(word_start < p) {
      if(key.Overflow(POS_Location, 1)) {
        fprintf(stderr, "WebbaseHookMifluz::InsertContent: overflow location, check wordlist_wordkey_description definition\n");
      } else {
        key[POS_Location]++;
      }
      wordRef->SetWord(String((char*)word_start, p - word_start));
      //
      // Check for overflow after it occurred, hence the 0 argument of Overflow.
      //
      if(key.Overflow(POS_Word, 0)) {
        fprintf(stderr, "WebbaseHookMifluz::InsertContent: overflow word identifier, check wordlist_wordkey_description definition\n");
      }
      if(verbose > 3) fprintf(stderr, "WebbaseHookMifluz::InsertContent: %s\n", (char*)wordRef->Get());
      inv->Override(*wordRef);
    }
  }

  free((void *)buffer_tmp);
  return 1;
}

//
// Return the pathname of the database, i.e. index_base converted to a
// C string. The result is not a fresh allocation made by this function:
// the caller must NOT free it (Open() uses it without freeing it).
// NOTE(review): the pointer presumably refers to storage owned by
// index_base and stays valid only as long as index_base is not
// modified -- confirm against the String conversion operator.
//
const char* WebbaseHookMifluz::Database() const
{
  return (const char*)index_base;
}

int WebbaseHookMifluz::Open(int flags)
{
  if(!inv) {
    inv = context->List();

    const char* database = Database();

    int ret = inv->Open(database, flags);

    return ret;
  } else {
    return 0;
  }
}

//
// Destroy the inverted index object, if any, and forget about it so that
// a later Open() creates a new one. Always returns 1.
//
int WebbaseHookMifluz::Close()
{
  // delete on a null pointer is a no-op, so this is safe when already closed.
  delete inv;
  inv = 0;
  return 1;
}

#endif /* HAVE_MIFLUZ_H */
