/*
    Copyright (C) 2012 Oleksiy Chernyavskyy

    This file is part of XDClient.

    XDClient is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    XDClient is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with XDClient.  If not, see <http://www.gnu.org/licenses/>.
*/


#include <stdio.h>
#include <stdlib.h>
#define _GNU_SOURCE
#include <wchar.h>
#include <string.h>
#ifdef TARGET_FREEBSD
#include <sys/types.h>
#else
#include <stdint.h>
#endif
#include <dirent.h>
#include "index.h"
#include "file.h"
#include "xml_utf8.h"
#include "common.h"
#include "xdc.h"
#include "hash.h"
#include "utf8.h"
#include "conf.h"
#include "adt_id.h"
#include "md5.h"

/* File-scope scratch buffers. */
char scbuf[BUFSZ];      /* narrow-character scratch space */
wchar_t sbuf[BUFSZ];    /* wide-character scratch space; keyparts_fetch() and base_key_fetch() assemble key text here */
char scfname[MAXPATH];  /* idx_clean(): name of the directory entry being examined */
char scpath[MAXPATH];   /* idx_clean(): path built from the directory and an entry name */

/*
 * Print the header fields of an index descriptor to stdout, one
 * "name: value" line per field, followed by an empty line.
 *
 * idxfile - descriptor to dump; NULL is ignored.
 *
 * Every numeric argument is cast to the exact type named by its conversion
 * specification, so the output stays well defined whatever integer types
 * index_lt uses for these fields.  The path and MD5 lines are printed only
 * when the corresponding pointer is set.
 */
void print_idx_header(index_lt *idxfile)
{
  if (!idxfile)
    return;

  fwprintf(stdout, L"index format version: %u\n", (unsigned int) idxfile->idx_version);
  if (idxfile->index_path)
    fwprintf(stdout, L"index path: %s\n", idxfile->index_path);
  fwprintf(stdout, L"index file size: %lu\n", (unsigned long) idxfile->index_size);
  fwprintf(stdout, L"index body size: %lu\n", (unsigned long) idxfile->index_body_size);
  fwprintf(stdout, L"index body start: %lu\n", (unsigned long) idxfile->index_body_start);
  fwprintf(stdout, L"ktag index length: %u\n", (unsigned int) idxfile->ktag_index_len);
  fwprintf(stdout, L"mask length: %u\n", (unsigned int) idxfile->mask_len);
  fwprintf(stdout, L"dict file size: %lu\n", (unsigned long) idxfile->dict_size);
  fwprintf(stdout, L"dict mtime: %lu\n", (unsigned long) idxfile->dict_mtime);
  fwprintf(stdout, L"dict ctime: %lu\n", (unsigned long) idxfile->dict_ctime);
  if (idxfile->dict_md5_full)
    fwprintf(stdout, L"dict MD5 full: %s\n", idxfile->dict_md5_full);
  if (idxfile->dict_md5_part)
    fwprintf(stdout, L"dict MD5 part: %s\n", idxfile->dict_md5_part);
  fwprintf(stdout, L"set lang from: %d\n", (int) idxfile->set_lang_from);
  fwprintf(stdout, L"set lang to: %d\n", (int) idxfile->set_lang_to);
  fwprintf(stdout, L"full name pos: %lu\n", (unsigned long) idxfile->full_name_pos);
  fwprintf(stdout, L"full name len: %u\n", (unsigned int) idxfile->full_name_len);
  fwprintf(stdout, L"descr pos: %lu\n", (unsigned long) idxfile->descr_pos);
  fwprintf(stdout, L"descr len: %u\n", (unsigned int) idxfile->descr_len);
  fwprintf(stdout, L"articles number: %u\n", (unsigned int) idxfile->arnum);
  fwprintf(stdout, L"\n");
}

/*
 * Put an index descriptor into its empty state: all counters, sizes and
 * flags are zero and all pointers (including the list link) are NULL.
 * A NULL descriptor is ignored.
 */
void init_index_lt(index_lt *index)
{
  if (index == NULL)
    return;

  /* list link and owned strings */
  index->next = NULL;
  index->index_path = NULL;
  index->dict_md5_full = NULL;
  index->dict_md5_part = NULL;

  /* index file layout */
  index->idx_version = 0;
  index->index_size = 0;
  index->index_body_start = 0;
  index->index_body_size = 0;
  index->ktag_index_len = 0;
  index->mask_len = 0;
  index->arnum = 0;

  /* dictionary the index was built from */
  index->dict_size = 0;
  index->dict_mtime = 0;
  index->dict_ctime = 0;
  index->set_lang_from = 0;
  index->set_lang_to = 0;

  /* positions of the <full_name> and <description> texts */
  index->full_name_pos = 0;
  index->full_name_len = 0;
  index->descr_pos = 0;
  index->descr_len = 0;
}

/*
 * Release a single index descriptor together with the strings it owns
 * (path and both MD5 strings).  The `next` link is not followed.
 * A NULL descriptor is ignored.
 */
void free_index_lt(index_lt *index)
{
  PRDEBUG("enter")
  if (index == NULL)
	return;

  /* free(NULL) is a no-op, so unset members need no test */
  free(index->index_path);
  free(index->dict_md5_full);
  free(index->dict_md5_part);
  free(index);
  PRDEBUG("exit")
}

/* Write one header record: 16-bit tag, 16-bit payload size, then `size`
 * payload bytes.  Returns 1 when everything was written, 0 otherwise. */
static int idx_write_field(FILE *fp, uint16_t tag, const void *data, uint16_t size)
{
  if (fwrite(&tag, sizeof(tag), 1, fp) != 1)
    return 0;
  if (fwrite(&size, sizeof(size), 1, fp) != 1)
    return 0;
  if (size > 0 && fwrite(data, 1, size, fp) != size)
    return 0;
  return 1;
}

/* Write a record whose payload is a 32-bit position followed by a 16-bit
 * length (used for the full-name and description records). */
static int idx_write_pos_len(FILE *fp, uint16_t tag, uint32_t pos, uint16_t len)
{
  unsigned char payload[sizeof(pos) + sizeof(len)];

  memcpy(payload, &pos, sizeof(pos));
  memcpy(payload + sizeof(pos), &len, sizeof(len));
  return idx_write_field(fp, tag, payload, (uint16_t) sizeof(payload));
}

/*
 * Write the index header to `fp` as a sequence of tag/size/value records
 * and finish it with an empty HDIbody_start record.
 *
 * fp      - stream positioned where the header must start
 * idxfile - supplies version, key/mask lengths, language flags, name and
 *           description locations and the article count; on success its
 *           index_body_start is set to the stream position after the header
 * dict    - supplies size, mtime, ctime and the optional MD5 strings
 *
 * Records are written in host byte order.  The MD5 records are written only
 * when the corresponding string is set and md5_string2byte() returns a
 * digest; the name/description records only when both position and length
 * are non-zero.
 *
 * Returns 1 on success, 0 on a NULL argument or when any write fails.
 */
int write_idx_header(FILE *fp, index_lt *idxfile, fstat_lt *dict)
{
  uint32_t val32;
  uint16_t val16;
  uint8_t val8;
  md5_byte_t *digest;
  long body_start;
  int ok;

  if (!fp || !idxfile || !dict)
    return 0;

  val16 = (uint16_t) idxfile->idx_version;
  ok = idx_write_field(fp, HDIversion, &val16, sizeof(val16));

  val32 = (uint32_t) dict->size;
  ok = ok && idx_write_field(fp, HDIsize, &val32, sizeof(val32));

  val32 = (uint32_t) dict->mtime;
  ok = ok && idx_write_field(fp, HDImtime, &val32, sizeof(val32));

  val32 = (uint32_t) dict->ctime;
  ok = ok && idx_write_field(fp, HDIctime, &val32, sizeof(val32));

  if (dict->md5_full) {
    digest = md5_string2byte(dict->md5_full);
    if (digest) {
      ok = ok && idx_write_field(fp, HDImd5full, digest, (uint16_t) (sizeof(md5_byte_t) * 16));
      free(digest);
    }
  }

  if (dict->md5_part) {
    digest = md5_string2byte(dict->md5_part);
    if (digest) {
      ok = ok && idx_write_field(fp, HDImd5part, digest, (uint16_t) (sizeof(md5_byte_t) * 16));
      free(digest);
    }
  }

  val8 = (uint8_t) idxfile->ktag_index_len;
  ok = ok && idx_write_field(fp, HDIktag_index_len, &val8, sizeof(val8));

  val8 = (uint8_t) idxfile->mask_len;
  ok = ok && idx_write_field(fp, HDImask_len, &val8, sizeof(val8));

  val8 = (uint8_t) idxfile->set_lang_from;
  ok = ok && idx_write_field(fp, HDIset_lang_from, &val8, sizeof(val8));

  val8 = (uint8_t) idxfile->set_lang_to;
  ok = ok && idx_write_field(fp, HDIset_lang_to, &val8, sizeof(val8));

  if (idxfile->full_name_pos && idxfile->full_name_len)
    ok = ok && idx_write_pos_len(fp, HDIfull_name,
                                 (uint32_t) idxfile->full_name_pos,
                                 (uint16_t) idxfile->full_name_len);

  if (idxfile->descr_pos && idxfile->descr_len)
    ok = ok && idx_write_pos_len(fp, HDIdescription,
                                 (uint32_t) idxfile->descr_pos,
                                 (uint16_t) idxfile->descr_len);

  val32 = (uint32_t) idxfile->arnum;
  ok = ok && idx_write_field(fp, HDIarnum, &val32, sizeof(val32));

  /* empty record that marks the end of the header */
  ok = ok && idx_write_field(fp, HDIbody_start, NULL, 0);

  body_start = ftell(fp);
  if (!ok || body_start < 0)
    return 0;

  idxfile->index_body_start = body_start;

  return 1;
}

/*
 * Rewrite the header of an existing index file with the current properties
 * of `dict`, keeping the index body unchanged.
 *
 * The body (index_body_size bytes starting at index_body_start) is read into
 * memory, the file is removed and recreated, a fresh header is written with
 * write_idx_header() and the body is appended.  Afterwards the dictionary
 * size, times and MD5 strings cached in `idxfile` are refreshed from `dict`,
 * index_size is re-read from disk and the result is checked with
 * validate_index().
 *
 * Returns the result of validate_index() on success; 0 on NULL arguments,
 * when the index is not readable/writable, on allocation failure or on any
 * I/O error.  Nothing is removed unless the body was read completely.
 */
int update_idx_header(index_lt *idxfile, fstat_lt *dict)
{
  void *idx_buf;
  size_t body_size;
  FILE *fp;
  int ok;

  if (!idxfile || !dict)
    return 0;

  if (! can_rw(idxfile->index_path))
    return 0;

  body_size = (size_t) idxfile->index_body_size;
  idx_buf = NULL;
  if (body_size > 0) {
    idx_buf = malloc(body_size);
    if (!idx_buf)
      return 0;
  }

  /* the index is binary data, hence the "b" modes */
  fp = fopen(idxfile->index_path, "rb");
  if (!fp) {
    free(idx_buf);
    return 0;
  }
  ok = fseek(fp, (long) idxfile->index_body_start, SEEK_SET) == 0;
  if (ok && body_size > 0)
    ok = fread(idx_buf, 1, body_size, fp) == body_size;
  fclose(fp);
  if (!ok) {
    free(idx_buf);
    return 0;
  }

  remove(idxfile->index_path);

  fp = fopen(idxfile->index_path, "wb");
  if (!fp) {
    free(idx_buf);
    return 0;
  }

  ok = write_idx_header(fp, idxfile, dict) != 0;
  if (ok && body_size > 0)
    ok = fwrite(idx_buf, 1, body_size, fp) == body_size;
  if (fclose(fp) != 0)
    ok = 0;
  free(idx_buf);
  if (!ok)
    return 0;

  idxfile->dict_size = dict->size;
  idxfile->dict_mtime = dict->mtime;
  idxfile->dict_ctime = dict->ctime;

  free(idxfile->dict_md5_full);
  idxfile->dict_md5_full = NULL;
  if (dict->md5_full)
    idxfile->dict_md5_full = strdup(dict->md5_full);

  free(idxfile->dict_md5_part);
  idxfile->dict_md5_part = NULL;
  if (dict->md5_part)
    idxfile->dict_md5_part = strdup(dict->md5_part);

  idxfile->index_size = file_size(idxfile->index_path);

  return validate_index(idxfile);
}

/*
 * Drop everything that read_idx_header() fills in: both MD5 strings are
 * freed and the size, layout, dictionary and name/description fields are
 * zeroed.  index_path, idx_version, the language flags, arnum and the list
 * link are left as they are.  A NULL descriptor is ignored.
 */
void clean_index_struct(index_lt *index)
{
  if (index == NULL)
	return;

  /* free(NULL) is harmless, so the strings are released unconditionally */
  free(index->dict_md5_full);
  index->dict_md5_full = NULL;
  free(index->dict_md5_part);
  index->dict_md5_part = NULL;

  index->index_size = 0;
  index->index_body_start = 0;
  index->index_body_size = 0;
  index->ktag_index_len = 0;
  index->mask_len = 0;

  index->dict_size = 0;
  index->dict_mtime = 0;
  index->dict_ctime = 0;

  index->full_name_pos = 0;
  index->full_name_len = 0;
  index->descr_pos = 0;
  index->descr_len = 0;
}

/* Read exactly `size` bytes into `dst`; returns 1 on success, 0 on a short
 * read or error. */
static int idx_read_exact(FILE *fp, void *dst, size_t size)
{
  return fread(dst, 1, size, fp) == size;
}

/*
 * Parse the header of the index file named by idxfile->index_path.
 *
 * The header is a sequence of records (16-bit tag, 16-bit size, payload).
 * Known tags fill the matching fields of `idxfile`; unknown tags are skipped
 * using their size field.  Parsing stops at the HDIbody_start record, at end
 * of file, or at the first truncated record; a field is only assigned when
 * its payload was read completely.
 *
 * Returns 0 when `idxfile` or its path is NULL, the file is empty or cannot
 * be opened, or the format version is not 1 (the descriptor is cleaned in
 * that case).  Otherwise returns the result of validate_index().
 */
int read_idx_header(index_lt *idxfile)
{
  FILE *fp;
  uint8_t val8;
  uint16_t tag;
  uint16_t val16;
  uint32_t val32;
  md5_byte_t digest[16];
  uint16_t vsize;

  if (!idxfile)
    return 0;

  clean_index_struct(idxfile);

  if (! idxfile->index_path)
    return 0;

  if ((idxfile->index_size = file_size(idxfile->index_path)) == 0)
    return 0;

  /* the index is binary data, hence the "b" mode */
  fp = fopen(idxfile->index_path, "rb");
  if (!fp)
    return 0;

  /* testing the reads themselves (instead of feof) prevents a stale tag
   * from being processed once more at end of file */
  while (idx_read_exact(fp, &tag, sizeof(tag)) &&
         idx_read_exact(fp, &vsize, sizeof(vsize))) {
    switch(tag) {
      case HDIversion:
        if (!idx_read_exact(fp, &val16, sizeof(val16)))
          goto RIH_L1;
        idxfile->idx_version = val16;
        if (idxfile->idx_version != 1) {
          fclose(fp);
          clean_index_struct(idxfile);
          return 0;
        }
        break;
      case HDIsize:
        if (!idx_read_exact(fp, &val32, sizeof(val32)))
          goto RIH_L1;
        idxfile->dict_size = val32;
        break;
      case HDImtime:
        if (!idx_read_exact(fp, &val32, sizeof(val32)))
          goto RIH_L1;
        idxfile->dict_mtime = val32;
        break;
      case HDIctime:
        if (!idx_read_exact(fp, &val32, sizeof(val32)))
          goto RIH_L1;
        idxfile->dict_ctime = val32;
        break;
      case HDImd5full:
        if (!idx_read_exact(fp, digest, sizeof(digest)))
          goto RIH_L1;
        free(idxfile->dict_md5_full);   /* a repeated record must not leak */
        idxfile->dict_md5_full = md5_byte2string(digest);
        break;
      case HDImd5part:
        if (!idx_read_exact(fp, digest, sizeof(digest)))
          goto RIH_L1;
        free(idxfile->dict_md5_part);
        idxfile->dict_md5_part = md5_byte2string(digest);
        break;
      case HDIktag_index_len:
        if (!idx_read_exact(fp, &val8, sizeof(val8)))
          goto RIH_L1;
        idxfile->ktag_index_len = val8;
        break;
      case HDImask_len:
        if (!idx_read_exact(fp, &val8, sizeof(val8)))
          goto RIH_L1;
        idxfile->mask_len = val8;
        break;
      case HDIset_lang_from:
        if (!idx_read_exact(fp, &val8, sizeof(val8)))
          goto RIH_L1;
        idxfile->set_lang_from = val8;
        break;
      case HDIset_lang_to:
        if (!idx_read_exact(fp, &val8, sizeof(val8)))
          goto RIH_L1;
        idxfile->set_lang_to = val8;
        break;
      case HDIfull_name:
        if (!idx_read_exact(fp, &val32, sizeof(val32)) ||
            !idx_read_exact(fp, &val16, sizeof(val16)))
          goto RIH_L1;
        idxfile->full_name_pos = val32;
        idxfile->full_name_len = val16;
        break;
      case HDIdescription:
        if (!idx_read_exact(fp, &val32, sizeof(val32)) ||
            !idx_read_exact(fp, &val16, sizeof(val16)))
          goto RIH_L1;
        idxfile->descr_pos = val32;
        idxfile->descr_len = val16;
        break;
      case HDIarnum:
        if (!idx_read_exact(fp, &val32, sizeof(val32)))
          goto RIH_L1;
        idxfile->arnum = val32;
        break;
      case HDIbody_start:
        idxfile->index_body_start = ftell(fp);
        goto RIH_L1;
      default:
        /* unknown record: step over its payload */
        if (vsize > 0 && fseek(fp, (long) vsize, SEEK_CUR) != 0)
          goto RIH_L1;
        break;
    }
  }

RIH_L1:

  fclose(fp);

  return validate_index(idxfile);
}

/*
 * Recompute index_body_size (index_size - index_body_start) and check that
 * the body consists of whole records.  One record occupies
 * 8 + ktag_index_len + mask_len bytes.
 *
 * A body start that lies beyond the end of the file is rejected as well
 * (the body size is then set to 0 instead of a wrapped-around difference).
 *
 * Returns 1 when the layout is consistent; 0 for a NULL descriptor or an
 * inconsistent layout, in which case a diagnostic is written to stderr.
 * Diagnostic values are cast to the type named by their conversion so the
 * output does not depend on the integer types used in index_lt.
 */
int validate_index(index_lt *idxfile)
{
  int valid;

  if (!idxfile)
    return 0;

  if (idxfile->index_body_start > idxfile->index_size) {
    idxfile->index_body_size = 0;
    valid = 0;
  } else {
    idxfile->index_body_size = idxfile->index_size - idxfile->index_body_start;
    valid = idxfile->index_body_size % (8 + idxfile->ktag_index_len + idxfile->mask_len) == 0;
  }

  if (!valid) {
    fprintf(stderr, "xdc: error: invalid index block size\n");
    fprintf(stderr, "     file: %s\n", idxfile->index_path ? idxfile->index_path : "(null)");
    fprintf(stderr, "     index_size: %lu\n", (unsigned long) idxfile->index_size);
    fprintf(stderr, "     index_body_start: %lu\n", (unsigned long) idxfile->index_body_start);
    fprintf(stderr, "     index_body_size: %lu\n", (unsigned long) idxfile->index_body_size);
    fprintf(stderr, "     ktag_index_len: %u\n", (unsigned int) idxfile->ktag_index_len);
    fprintf(stderr, "     mask_len: %u\n", (unsigned int) idxfile->mask_len);
    return 0;
  }

  return 1;
}

/*
 * Remove every index file, and the "xdc.cache" file, from `cfg_dir`.
 *
 * Nothing is done unless cfg_dir is non-NULL and accepted by check_cfg()
 * and can_rw().  Each directory entry other than "." and ".." is tested with
 * is_idx(); matching files are listed on stdout and removed.  The number of
 * files actually removed is printed at the end.
 *
 * Entry names and paths that do not fit into the scfname/scpath scratch
 * buffers are skipped with a warning: a truncated path could name a
 * different file, which must never be removed by accident.
 *
 * Returns 0 when the directory cannot be opened, 1 otherwise (also when
 * cfg_dir was rejected and nothing was done).
 */
int idx_clean(char *cfg_dir)
{
  DIR *dirp;
  struct dirent *de;
  int nfiles;
  int len;

  PRDEBUG("enter")

  if (cfg_dir && check_cfg(cfg_dir) && can_rw(cfg_dir)) {
    dirp = opendir(cfg_dir);
    if (! dirp) {
      fprintf(stderr, "xdc: warning: could not open %s\n", cfg_dir);
      return 0;
    }

    fprintf(stdout, "cleaning:\n");
    nfiles = 0;
    while((de = readdir(dirp)) != NULL) {
      if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
        continue;

      len = snprintf(scfname, sizeof(scfname), "%s", de->d_name);
      if (len < 0 || (size_t) len >= sizeof(scfname)) {
        fprintf(stderr, "xdc: warning: name too long, skipped: %s\n", de->d_name);
        continue;
      }

      len = snprintf(scpath, sizeof(scpath), "%s/%s", cfg_dir, scfname);
      if (len < 0 || (size_t) len >= sizeof(scpath)) {
        fprintf(stderr, "xdc: warning: path too long, skipped: %s/%s\n", cfg_dir, scfname);
        continue;
      }

      if (is_idx(scpath)) {
        fprintf(stdout, "  %s\n", scfname);
        if (remove(scpath) == 0)
          nfiles++;
      }
    }
    closedir(dirp);

    len = snprintf(scpath, sizeof(scpath), "%s/xdc.cache", cfg_dir);
    if (len >= 0 && (size_t) len < sizeof(scpath)) {
      if (remove(scpath) == 0)
        nfiles++;
    }
    fprintf(stdout, "total removed: %d\n", nfiles);
  }
  PRDEBUG("exit")
  return 1;
}

int idx_file_create(fstat_lt *db, char *index_path, key_ll_t **keys, int klen, int arnum, DTag *tag_root)
{
  FILE *fp;
  int k;
  key_ll_t *_key;
  uint32_t val32;
  uint16_t val16;
  uint8_t val8;
  md5_byte_t *digest;
  DTag *tag;
  DAttr *attr;
  int set_lang_from, set_lang_to;
  int i;
  index_lt *index;

  PRDEBUG("enter")

  if (!db || !index_path || !keys || klen == 0)
	return 0;

  index = (index_lt*) malloc(sizeof(index_lt));
  init_index_lt(index);

  index->idx_version = 1;

  index->index_path = strdup(index_path);
  index->ktag_index_len = ktag_index_len();
  index->mask_len = opt_mask_len();

  tag = tag_seek(NULL, tag_root, L"xdxf", S_ALL);
  if (tag) {
	attr = tag->attributes;
	while (attr) {
	  if (! wcsncmp(attr->name, L"lang_from", attr->nlen)) {
		if (attr->value && attr->vlen > 0) {
		  for (i=0; i < attr->vlen && isspace(attr->value[i]); i++);
		  if (i<attr->vlen)
			index->set_lang_from = 1;
		}
	  } else if (! wcsncmp(attr->name, L"lang_to", attr->nlen)) {
		if (attr->value && attr->vlen > 0) {
		  for (i=0; i < attr->vlen && isspace(attr->value[i]); i++);
		  if (i<attr->vlen)
			index->set_lang_to = 1;
		}
	  }
	  attr = attr->next;
	}
  }

  tag = tag_seek(NULL, tag_root, L"full_name", S_ALL);
  if (tag && tag->body) {
	index->full_name_pos = tag->pos;
	index->full_name_len = tag->len;
  }

  tag = tag_seek(NULL, tag_root, L"description", S_ALL);
  if (tag && tag->body) {
	index->descr_pos = tag->pos;
	index->descr_len = tag->len;
  }

  index->arnum = arnum;

  fp = fopen(index_path, "w");
  if (!fp) {
	fwprintf(stderr, L"xdc: error: could not create index file: %s\n", index_path);
	return 0;
  }

  write_idx_header(fp, index, db);
  free_index_lt(index);

  for (k=0; k<klen; k++) {
	_key = keys[k];
	if (!_key) {
	  fwprintf(stderr, L"xdc: error: %s:%d: _key == NULL\n", __FUNCTION__, __LINE__);
	  fclose (fp);
	  return 0;
	}
	while (_key->prev)
	  _key = _key->prev;

	while(_key) {
	  fwrite(&_key->tag->pos, sizeof(uint32_t), 1, fp);
	  fwrite(&_key->tag->len, sizeof(uint32_t), 1, fp);
	  fwrite(&_key->ktag_index, ktag_index_len(), 1, fp);
	  fwrite(&_key->kmask, opt_mask_len(), 1, fp);
	  _key = _key->next;
	}
  }

  fclose(fp);
  PRDEBUG("exit")

  return 1;
}

/* scratch key-part array used by read_key_by_idx() */
keypart_t key_parts_p1[KPMAX];

/*
 * Rebuild the key string an index record refers to.
 *
 * dict - dictionary file; its path is passed to read_xdxf()
 * idx  - index record: article shift/size, the ordinal of the <k> tag inside
 *        the article and the option mask
 * dfp  - passed through to read_xdxf()
 *
 * The article is parsed, its idx->ktag_index-th <k> tag is located, the tag
 * is split into key parts and the parts are combined according to idx->mask
 * by get_key_by_mask().
 *
 * Returns the string produced by get_key_by_mask(), or NULL when an argument
 * is NULL, the fragment cannot be parsed, it is not an <ar> element, or the
 * requested <k> tag does not exist.
 */
wchar_t* read_key_by_idx(fstat_lt *dict, xdxf_idx_t *idx, FILE *dfp)
{
  DTag *tag_ar;
  DTag *tag_key;
  int ktag_index;
  wchar_t *key_str;

  if (!dict || !idx)
    return NULL;

  tag_ar = read_xdxf(dict->path, idx->shift, idx->size, dfp);
  if (!tag_ar)
    return NULL;

  if (! is_tag_name(tag_ar, L"ar")) {
    tag_free(tag_ar);
    return NULL;
  }

  key_str = NULL;
  ktag_index = 0;
  tag_key = tag_seek(tag_ar, tag_ar->body, L"k", S_ALL);

  /* advance to the <k> tag with the requested ordinal */
  while(ktag_index != idx->ktag_index && tag_key) {
    ktag_index++;
    tag_key = tag_seek(tag_ar, tag_key, L"k", S_ALL | S_SKIP);
  }

  if (tag_key) {
    if (keyparts_fetch(tag_key, key_parts_p1, KPMAX)) {
      key_str = get_key_by_mask(key_parts_p1, KPMAX, idx->mask);
      free_key_parts(key_parts_p1, KPMAX);
    }
  }

  tag_free(tag_ar);

  return key_str;
}

int gen_index(xdc_conf_t *xdc_conf)
{
  nameval_t *nv;
  mbs_ll_t *mbsl;
  fstat_lt *dict_flist_start;
  fstat_lt *idx_flist_start, *idx_flist_start2;
  fstat_lt *flist, *flist2, *flist_prev, *flist_next;
  char *output_dir;
  char *mbs, *mbs2;
  int valid_dict_num;
  int i;
  FILE *fp;
  uint8_t *index_data;
  wchar_t *dict_full_name;
  int lnum;
  int ret;

  ret = 0;

  if (!xdc_conf)
	return 0;

  output_dir = find_output_dir(xdc_conf);
  if (!output_dir) {
	fprintf(stderr, "xdclient: generate index: error: no index output directory available with read/write access\n");
	return 0;
  }

  fprintf(stdout, "xdclient: generate index: output directory: %s\n", output_dir);

  dict_flist_start = NULL;
  idx_flist_start = NULL;



  if (hlookup(xdc_conf->symtab, "cmd_arg_clean_output", 0))
	idx_clean(output_dir);

  dict_flist_start = search_dicts(xdc_conf, 0);
  if (dict_flist_start) {
	nv = hlookup(xdc_conf->symtab, "dict_flist", 1);
	nv->val = (void*) dict_flist_start;
  }

  idx_flist_start = search_indexes(xdc_conf, 0);

  /* check if we have at least one valid dictionary */
  flist = dict_flist_start;
  valid_dict_num = 0;
  while (flist) {
	if (file_exist(flist->path) && is_rfile(flist->path) && !is_dir(flist->path))
	  valid_dict_num++;
	flist = flist->next;
  }
  if (!valid_dict_num)
	goto GEN_INDEX_EXIT;

  fprintf(stdout, "xdclient: generate index: found %d dictionaries\n", valid_dict_num);

  /* read index files */
  lnum = load_indexes(idx_flist_start);

  /* remove invalid locations, locations of duplicated indexes, put indexes into hash */
  dup_index_remove(xdc_conf, idx_flist_start);

  hash_indexes(xdc_conf, idx_flist_start, H_IDX_MD5_FULL);

  flist = dict_flist_start;
  fprintf(stdout, "\n");
  while(flist) {
	fprintf(stdout, "xdclient: dict path: %s\n", flist->path);
	if (file_exist(flist->path) && is_rfile(flist->path)) {
	  get_full_name(xdc_conf, flist, &dict_full_name);
	  if (dict_full_name) {
		fwprintf(stdout, L"xdclient: dict name: %S\n", dict_full_name);
		free(dict_full_name);
	  }

	  fstat_update(flist, FSTAT_MD5_FULL | FSTAT_MD5_PART);
	  nv = hlookup(xdc_conf->symtab, pfmbs("dict_processed:md5_full:%s", flist->md5_full), 1);
	  if (!nv->ival) {
		nv->ival = 1;
		nv->val = (void*) flist;
		nv = hlookup(xdc_conf->symtab, pfmbs("index:dict_md5_full:%s", flist->md5_full), 0);
		if (nv && nv->ival == 1) {
		  flist2 = (fstat_lt*) nv->val;
		  fprintf(stdout, "xdclient: found index: %s\n", flist2->path);
		  index_data = (uint8_t*) malloc(sizeof(uint8_t) * flist2->size);
		  fp = fopen(flist2->path, "r");
		  fread(index_data, sizeof(uint8_t), flist2->size, fp);
		  fclose(fp);
		  i = 0;
		  do {
			mbs = pfmbs("%s/%d.idx", output_dir, i++);
		  } while (file_exist(mbs));
		  fp = fopen(mbs, "w");
		  fwrite(index_data, sizeof(uint8_t), flist2->size, fp);
		  fclose(fp);
		  free(index_data);
		} else if (nv && nv->ival == 2) {
		  /* index exist and is located in output directory */
		  flist2 = (fstat_lt*) nv->val;
		  fprintf(stdout, "xdclient: found index: %s\n", flist2->path);
		} else if (!nv) {
		  /* generate new index */
		  i = 0;
		  do {
			mbs = pfmbs("%s/%d.idx", output_dir, i++);
		  } while (file_exist(mbs));
		  mbs2 = strdup(mbs);

		  fprintf(stdout, "xdclient: generating index: %s\n", mbs2);
		  _gen_index(flist, mbs2);
		  free(mbs2);
		}
	  } else {
		flist2 = (fstat_lt*) nv->val;
		fprintf(stdout, "xdclient: duplicated dict for: %s\n", flist2->path);
	  }
	} else {
	  fprintf(stdout, "xdclient: invalid path\n");
	}
	fprintf(stdout, "\n");
	flist = flist->next;
  }

  mbs = pfmbs("%s/xdc.cache", output_dir);
  remove(mbs);
  fp = fopen(mbs, "w");
  flist = dict_flist_start;
  while (flist) {
	fprintf(fp, "%s\n", flist->path);
	flist = flist->next;
  }
  fclose(fp);

  ret = 1;

GEN_INDEX_EXIT:

  free(output_dir);
  flist_free(idx_flist_start, 0);
  flist_free(dict_flist_start, 0);
  htable_regex_free(xdc_conf->symtab, "dict_flist", 0);
  htable_regex_free(xdc_conf->symtab, "dict_processed:md5_full:.*", 0);
  unhash_indexes(xdc_conf);

  return ret;
}

/* scratch key-part array used by _gen_index() */
keypart_t key_parts_p2[KPMAX];

/*
 * Parse dictionary `dict` and write its index to `index_file`.
 *
 * dict       - dictionary to index; its path must name an existing,
 *              readable regular file
 * index_file - path of the index to create; must not exist yet
 *
 * Every <k> tag of every <ar> element yields one base key (its non-optional
 * parts) plus the optional variants collected by fill_opts().  Base and
 * optional keys are sorted, normalised (non-alphabetic cropping, space
 * normalisation, lower-casing), sorted again, merged, and handed to
 * idx_file_create().
 *
 * Only the keys actually collected are processed: the array is sized from
 * count_key_tags(), never written past that size, and its unused tail is
 * ignored, so no NULL entry reaches the sort/normalise/write steps.
 *
 * Returns 0 on invalid parameters, when the parsed document has no <xdxf>
 * element, or on allocation failure; 1 otherwise.
 * NOTE(review): a dictionary that cannot be parsed is reported but still
 * yields 1, as before - confirm whether callers rely on that.
 */
int _gen_index(fstat_lt *dict, char *index_file)
{
  DTag *tag_root;
  int j;
  key_ll_t **keys;
  key_ll_t *key;
  int nkeys;
  int arnum;
  DTag *tag_xdxf;
  DTag *tag_ar;
  DTag *tag_key;
  key_ll_t *optkeys;
  int ktag_index;
  char *dict_path;

  PRDEBUG("enter")

  if (!dict) {
    fprintf(stderr, "xdclient: %s: error: invalid parameter\n", __FUNCTION__);
    return 0;
  }

  dict_path = dict->path;

  if (!dict_path || !dict_path[0] || !file_exist(dict_path) || \
      !is_rfile(dict_path) ||  is_dir(dict_path) || !index_file || \
      !index_file[0] || file_exist(index_file)) {
    fprintf(stderr, "xdclient: %s: error: invalid parameter\n", __FUNCTION__);
    return 0;
  }

  tag_root = read_xdxf(dict_path, 0, 0, NULL);
  if (tag_root) {
    fprintf(stdout, "xdclient: dict parsed ok\n");
    tag_xdxf = tag_seek(NULL, tag_root, L"xdxf", S_ALL);
    if (!tag_xdxf) {
      fprintf(stderr, "xdclient: error: no xdxf element in dictionary %s\n", dict_path);
      tag_free(tag_root);
      return 0;
    }

    nkeys = count_key_tags(tag_xdxf);
    arnum = count_ar_tags(tag_xdxf);
    fwprintf(stdout, L"xdclient: number of articles: %d\n", arnum);

    tag_ar = tag_seek(tag_xdxf, tag_xdxf->body, L"ar", S_ALL);

    optkeys = NULL;
    if (tag_ar && nkeys > 0) {
      /* calloc leaves every slot NULL */
      keys = (key_ll_t**) calloc((size_t) nkeys, sizeof(key_ll_t*));
      if (!keys) {
        fwprintf(stderr, L"xdclient: %s: malloc error\n", __FUNCTION__);
        tag_free(tag_root);
        return 0;
      }
      j = 0;
      while (tag_ar) {
        tag_key = tag_seek(tag_ar, tag_ar->body, L"k", S_ALL);

        ktag_index = 0;
        while(tag_key) {
          if (j < nkeys && keyparts_fetch(tag_key, key_parts_p2, KPMAX)) {
            key = base_key_fetch(key_parts_p2, KPMAX);
            if (key) {
              key->ktag_index = ktag_index;
              key->tag = tag_ar;
              keys[j++] = key;

              optkeys = fill_opts(key_parts_p2, KPMAX, tag_ar, optkeys, ktag_index);
            }

            free_key_parts(key_parts_p2, KPMAX);
          }
          ktag_index++;
          tag_key = tag_seek(tag_ar, tag_key, L"k", S_ALL | S_SKIP);
        }
        tag_ar = tag_seek(tag_xdxf, tag_ar, L"ar", S_ALL | S_SKIP);
      }

      /* from here on only the filled part of the array counts */
      nkeys = j;

      if (nkeys > 0) {
        /* double sort operation to exclude case when two articles exist having same key phrases
         * with different cases, like Postal (game) and postal (man working in post office)
         */
        qsort(keys, nkeys, sizeof(key_ll_t*), cmpkeys);
        /* it's possible that dictionary may have multiple equal keys for
         * different articles that have different content
         */
        crop_nonalpha(keys, nkeys);
        kll_shrink_norm_spaces(keys, nkeys);
        keys_tolower(keys, nkeys);
        qsort(keys, nkeys, sizeof(key_ll_t*), cmpkeys);
        optkeys = sort_keys(optkeys);
        crop_nonalpha2(optkeys);
        kll_shrink_norm_spaces2(optkeys);
        keys_tolower2(optkeys);
        optkeys = sort_keys(optkeys);
        kl_base_opt_merge(keys, nkeys, optkeys);

        idx_file_create(dict, index_file, keys, nkeys, arnum, tag_root);

        free_keys(keys, nkeys);
      } else {
        /* nothing was collected; free_keys() would not release the array */
        free(keys);
      }
    }
    tag_free(tag_root);
  } else {
    fprintf(stdout, "xdclient: error: could not parse dictionary %s\n", dict_path);
  }

  PRDEBUG("exit")

  return 1;
}

/*
 * Count the <k> tags that tag_seek() finds below `tag_xdxf`.
 * Returns 0 for a NULL tag.
 */
int count_key_tags(DTag *tag_xdxf)
{
  DTag *cur;
  int total;

  if (tag_xdxf == NULL)
	return 0;

  total = 0;

  /* first match, searched from the start of the body */
  cur = tag_seek(tag_xdxf, tag_xdxf->body, L"k", S_ALL);
  if (cur != NULL)
	total = 1;

  /* following matches, each searched from the previous one */
  for (;;) {
	cur = tag_seek(tag_xdxf, cur, L"k", S_ALL | S_SKIP);
	if (cur == NULL)
	  break;
	total++;
  }

  return total;
}

/*
 * Count the <ar> (article) tags that tag_seek() finds below `tag_xdxf`.
 *
 * The continuation search looks for "ar" as well; it previously looked for
 * "k", so the result was one plus the number of <k> tags after the first
 * article rather than the number of articles.
 *
 * Returns 0 for a NULL tag or when no article is found.
 */
int count_ar_tags(DTag *tag_xdxf)
{
  DTag *tag_ar;
  int narticles;

  narticles = 0;
  if (tag_xdxf) {
    tag_ar = tag_seek(tag_xdxf, tag_xdxf->body, L"ar", S_ALL);
    while (tag_ar) {
      narticles++;
      tag_ar = tag_seek(tag_xdxf, tag_ar, L"ar", S_ALL | S_SKIP);
    }
  }

  return narticles;
}

/*
 * Free the strings held in a key-part array and reset them to NULL.
 * Only the first count_keys(key_parts, kp_len) entries are visited; the
 * array itself is not freed.
 */
void free_key_parts(keypart_t *key_parts, int kp_len)
{
  int used;
  int idx;
  keypart_t *part;

  PRDEBUG("enter")

  used = count_keys(key_parts, kp_len);

  idx = 0;
  while (idx < used) {
	part = &key_parts[idx++];
	if (!part->str)
	  continue;
	free(part->str);
	part->str = NULL;
  }
  PRDEBUG("exit")
}

/*
 * Free an array of key chains.  For every non-NULL entry the chain is
 * rewound to its head and each element (with its key string) is freed;
 * finally the array itself is freed.  Nothing happens when `keys` is NULL
 * or `nkeys` is 0.
 */
void free_keys(key_ll_t **keys, int nkeys)
{
  int slot;
  key_ll_t *node;
  key_ll_t *following;

  if (keys == NULL || nkeys == 0)
	return;

  for (slot = 0; slot < nkeys; slot++) {
	node = keys[slot];
	if (node == NULL)
	  continue;

	/* rewind to the first element of the chain */
	while (node->prev != NULL)
	  node = node->prev;

	for (; node != NULL; node = following) {
	  following = node->next;
	  if (node->key)
		free(node->key);
	  free(node);
	}
  }
  free(keys);
}

/*
 * Free an array of base keys: each non-NULL entry and its key string, then
 * the array.  Unlike free_keys() the prev/next links are not followed.
 * Nothing happens when `keys` is NULL or `nkeys` is 0.
 */
void free_base_keys(key_ll_t **keys, int nkeys)
{
  int slot;
  key_ll_t *entry;

  PRDEBUG("enter")
  if (keys == NULL || nkeys == 0)
	return;

  for (slot = 0; slot < nkeys; slot++) {
	entry = keys[slot];
	if (entry == NULL)
	  continue;
	if (entry->key)
	  free(entry->key);
	free(entry);
  }
  free(keys);
  PRDEBUG("exit")

}

/*
 * Free a whole chain of optional keys.  `optkeys` may point at any element:
 * the chain is rewound to its head first, then every element and its key
 * string are freed.  A NULL chain is accepted and ignored.
 */
void free_opt_keys(key_ll_t *optkeys)
{
  key_ll_t *_key;
  key_ll_t *_key_next;

  PRDEBUG("enter")

  _key = optkeys;
  /* the chain may be empty, so test the element before its link */
  while(_key && _key->prev)
    _key = _key->prev;

  while(_key) {
    _key_next = _key->next;
    if (_key->key)
      free(_key->key);
    free(_key);
    _key = _key_next;
  }

  PRDEBUG("exit")

}

/*
 * Store the first `len` characters collected in sbuf as the next key part.
 * The last slot of the array is reserved for the NULL terminator, so at
 * most kpsize - 1 parts are stored.  Returns 1 when the part was stored,
 * 0 when the array is full or memory is exhausted.
 */
static int keypart_store(keypart_t *key_parts, int kpsize, int *npart, int len, int is_opt)
{
  wchar_t *copy;

  if (*npart >= kpsize - 1) {
    fwprintf(stderr, L"xdc: warning: %s: too many key parts\n", __FUNCTION__);
    return 0;
  }

  sbuf[len] = L'\0';
  copy = (wchar_t*) malloc(sizeof(wchar_t) * (len + 1));
  if (!copy)
    return 0;
  wcscpy(copy, sbuf);

  key_parts[*npart].str = copy;
  key_parts[*npart].is_opt = is_opt;
  (*npart)++;

  return 1;
}

/*
 * Split the content of a <k> tag into key parts.
 *
 * tag_key   - the <k> tag
 * key_parts - output array; the entry after the last part gets str == NULL
 * kpsize    - capacity of key_parts, terminator slot included
 *
 * Text values are accumulated in the sbuf scratch buffer (at most BUFSZ - 1
 * characters per part) with the five predefined XML entities decoded; any
 * other '&' is copied as is after a warning.  An <nu> element is stepped
 * over without collecting its text.  An <opt> element closes the part
 * collected so far (is_opt = 0) and its own text becomes a part with
 * is_opt = 1; at most MAXOPTS such parts are produced, further <opt>
 * elements are stepped over with a warning.  Parts that no longer fit into
 * key_parts are dropped.
 * NOTE(review): the exact traversal selected by the S_* flags passed to
 * tag_rnext() is defined elsewhere; "stepped over" above presumes
 * S_DOWN | S_UP does not enter the element - confirm.
 *
 * Returns 1 on completion, 0 when an argument is NULL or kpsize is 0.
 * The strings must be released with free_key_parts().
 */
int keyparts_fetch(DTag *tag_key, keypart_t *key_parts, int kpsize)
{
  DTag *_tag;
  int i, h;
  DTag *tag_opt;
  int npart;
  int nopt;
  DTag *tag_vroot;

  if (!tag_key || !key_parts || kpsize == 0)
    return 0;

  tag_opt = NULL;
  _tag = tag_key->body;
  h=0;
  nopt = 0;
  npart = 0;
  tag_vroot = tag_key;
  while(_tag) {
    if (_tag->type == DT_Value) {
      for (i=0; i<_tag->vlen && h <BUFSZ-1; i++, h++) {
        if (_tag->value[i] == L'&') {
          if (wcsncmp(&_tag->value[i], L"&lt;", 4) == 0) {
            sbuf[h] = L'<';
            i+=3;
          } else if (wcsncmp(&_tag->value[i], L"&gt;", 4) == 0) {
            sbuf[h] = L'>';
            i+=3;
          } else if (wcsncmp(&_tag->value[i], L"&amp;", 5) == 0) {
            sbuf[h] = L'&';
            i+=4;
          } else if (wcsncmp(&_tag->value[i], L"&apos;", 6) == 0) {
            sbuf[h] = L'\'';
            i+=5;
          } else if (wcsncmp(&_tag->value[i], L"&quot;", 6) == 0) {
            sbuf[h] = L'"';
            i+=5;
          } else {
            fwprintf(stderr, L"xdc: warning: invalid xml: undefined escape code\n");
            sbuf[h] = _tag->value[i];
          }
        } else {
          sbuf[h] = _tag->value[i];
        }
      }
      _tag = tag_rnext(tag_vroot, _tag, S_ALL);
    } else if (_tag->type == DT_Tag) {
      if (wcsncmp(_tag->name, L"nu", _tag->nlen) == 0) {
        _tag = tag_rnext(tag_vroot, _tag, S_DOWN | S_UP);
      } else if (wcsncmp(_tag->name, L"opt", _tag->nlen) == 0) {
        if (nopt < MAXOPTS) {
          /* close the mandatory part collected before this <opt> */
          if (h>0) {
            keypart_store(key_parts, kpsize, &npart, h, 0);
            h=0;
          }
          tag_opt = _tag;
          tag_vroot = tag_opt;
          _tag = tag_rnext(tag_vroot, tag_opt, S_DEEP);
        } else {
          fwprintf(stderr, L"xdc: warning: %s:%d: nopt >= MAXOPTS\n", __FUNCTION__, __LINE__);
          _tag = tag_rnext(tag_vroot, _tag, S_DOWN | S_UP);
        }
      } else {
        _tag = tag_rnext(tag_vroot, _tag, S_ALL);
      }
    } else {
      _tag = tag_rnext(tag_vroot, _tag, S_DOWN | S_UP);
    }

    /* the <opt> subtree is exhausted: close the optional part and carry on
     * behind the <opt> element, relative to the <k> tag again */
    if (!_tag && tag_opt) {
      if (h>0) {
        if (keypart_store(key_parts, kpsize, &npart, h, 1))
          nopt++;
        h=0;
      }
      tag_vroot = tag_key;
      _tag = tag_rnext(tag_vroot, tag_opt, S_DOWN | S_UP);
      tag_opt = NULL;
    }
  }

  /* trailing mandatory part */
  if (h>0) {
    keypart_store(key_parts, kpsize, &npart, h, 0);
    h=0;
  }
  key_parts[npart].str = NULL;

  return 1;
}

/*
 * Build the base key of a key-part array: the concatenation of all parts
 * that are not optional.
 *
 * kp     - key parts as produced by keyparts_fetch()
 * kp_len - capacity of `kp`
 *
 * The text is assembled in the sbuf scratch buffer and truncated to
 * BUFSZ - 1 characters so that the terminator always fits.
 *
 * Returns a newly allocated list element holding a private copy of the key,
 * with ktag_index, kmask, tag and both links cleared; NULL when `kp` is
 * NULL, kp_len is 0 or memory is exhausted.
 */
key_ll_t* base_key_fetch(keypart_t *kp, int kp_len)
{
  int h, i, k;
  key_ll_t *kl;
  int npart;
  wchar_t *_buf;

  if (!kp || kp_len == 0)
    return NULL;

  npart = count_keys(kp, kp_len);

  for (h=0, k=0; h<npart; h++) {
    if (! kp[h].is_opt && kp[h].str) {
      /* stop one short of the buffer size: sbuf[k] below needs a slot */
      for (i=0; k < BUFSZ-1 && kp[h].str[i] != L'\0'; i++, k++)
        sbuf[k] = kp[h].str[i];
    }
  }
  sbuf[k] = L'\0';

  kl = (key_ll_t*) malloc(sizeof(key_ll_t));
  _buf = (wchar_t*) malloc(sizeof(wchar_t) * (k+1));
  if (!kl || !_buf) {
    free(kl);
    free(_buf);
    return NULL;
  }
  wcscpy(_buf, sbuf);

  kl->key = _buf;
  kl->ktag_index = 0;
  kl->kmask = 0;
  kl->tag = NULL;
  kl->next = NULL;
  kl->prev = NULL;

  return kl;
}

/*
 * Free a singly linked list of xdcidx_t elements, starting at `idx`,
 * together with the path string of each element.
 */
void idx_free_all(xdcidx_t *idx)
{
  xdcidx_t *following;

  for (; idx != NULL; idx = following) {
	following = idx->next;
	if (idx->path != NULL)
	  free(idx->path);
	free(idx);
  }
}

/*
 * Free a list of index descriptors linked through `next`, including the
 * path and MD5 strings owned by each element.  A NULL list is ignored.
 */
void index_list_free(index_lt *index)
{
  index_lt *following;

  while (index != NULL) {
	following = index->next;

	if (index->index_path)
	  free(index->index_path);
	if (index->dict_md5_full)
	  free(index->dict_md5_full);
	if (index->dict_md5_part)
	  free(index->dict_md5_part);
	free(index);

	index = following;
  }
}

/*
 * Allocate an index descriptor for the file `path` and fill it from the
 * file's header with read_idx_header().
 *
 * Returns the new descriptor (release it with free_index_lt()), or NULL
 * when `path` is NULL, memory is exhausted or the header is rejected.
 */
index_lt* read_index_file(char *path)
{
  index_lt *index;

  if (!path)
    return NULL;

  index = (index_lt*) malloc(sizeof(index_lt));
  if (!index)
    return NULL;
  init_index_lt(index);

  /* a failed strdup leaves index_path NULL, which read_idx_header rejects */
  index->index_path = strdup(path);

  if (!read_idx_header(index)) {
    free_index_lt(index);
    index = NULL;
  }

  return index;
}

/*
 * Drop duplicated index files from a file list.
 *
 * For every list element that carries a loaded index with a dictionary MD5,
 * the first readable regular file seen for that MD5 is kept and registered
 * in the symbol table under "dup_index_remove:dict_md5_full:<md5>" (ival 2
 * when it lies in the output directory, 1 otherwise).  Later elements for
 * the same MD5, and elements that are not readable regular files, are
 * unlinked from the list and freed.  The temporary symbol table entries are
 * removed before returning.
 *
 * Elements without a loaded index or without an MD5 are left in the list
 * and skipped; the loop always advances.  The first element is never
 * unlinked, because the caller keeps the pointer to it.
 *
 * Returns 1 when the list was processed, 0 when an argument is NULL.
 */
int dup_index_remove(xdc_conf_t *xdc_conf, fstat_lt *flist_start)
{
  fstat_lt *flist, *flist_prev, *flist_next;
  index_lt *index;
  nameval_t *nv;
  char *output_dir;
  char *dir_name;

  if (!xdc_conf || !flist_start)
    return 0;

  output_dir = find_output_dir(xdc_conf);

  flist = flist_start;
  flist_prev = NULL;
  while(flist) {
    index = (index_lt*) flist->udata;
    nv = NULL;
    if (index && index->dict_md5_full)
      nv = hlookup(xdc_conf->symtab, pfmbs("dup_index_remove:dict_md5_full:%s", index->dict_md5_full), 1);

    if (!nv) {
      /* nothing to compare: keep the element and move on */
      flist_prev = flist;
      flist = flist->next;
    } else if (! nv->ival && is_rfile(flist->path) && !is_dir(flist->path)) {
      /* first usable index for this dictionary */
      dir_name = mbs_dirname(flist->path);
      if (output_dir && dir_name && !strcmp(dir_name, output_dir))
        nv->ival = 2;
      else
        nv->ival = 1;
      free(dir_name);
      nv->val = (void*) flist;
      nv->val_type = Tflist;
      flist_prev = flist;
      flist = flist->next;
    } else if (nv->val != flist && flist_prev) {
      /* duplicate or unusable: unlink and free */
      flist_next = flist->next;
      flist->next = NULL;
      flist_free(flist, 0);
      flist = flist_next;
      flist_prev->next = flist;
    } else {
      flist_prev = flist;
      flist = flist->next;
    }
  }
  htable_regex_free(xdc_conf->symtab, "dup_index_remove:dict_md5_full:.*", 0);
  if (output_dir)
    free(output_dir);

  return 1;
}

/*
 * Call load_index() for every element of a file list.
 * Returns the number of elements for which it succeeded (0 for a NULL list).
 */
int load_indexes(fstat_lt *flist_start)
{
  fstat_lt *entry;
  int loaded;

  if (flist_start == NULL)
	return 0;

  loaded = 0;
  for (entry = flist_start; entry != NULL; entry = entry->next) {
	if (load_index(entry))
	  loaded++;
  }

  return loaded;
}

/*
 * Read the index file named by flist->path and attach the resulting
 * descriptor to the list element as user data.
 *
 * udata is always overwritten with the result of read_index_file(); the
 * release callback and type tag are set only when reading succeeded.
 * Returns 1 on success, 0 for a NULL element or a failed read.
 */
int load_index(fstat_lt *flist)
{
  if (flist == NULL)
	return 0;

  flist->udata = (void*) read_index_file(flist->path);
  if (!flist->udata)
	return 0;

  flist->udata_free = (void (*)(void*)) index_list_free;
  flist->udata_type = Tindex;
  return 1;
}

/*
 * Register the loaded indexes of a file list in the symbol table, keyed by
 * the MD5 of the dictionary they were built from.
 *
 * xdc_conf    - configuration holding the symbol table
 * flist_start - list of index files; elements without udata are skipped
 * flags       - H_IDX_MD5_FULL and/or H_IDX_MD5_PART select which MD5
 *               strings are used as keys
 *
 * For every selected MD5 the entry "index:dict_md5_full:<md5>" (or
 * "..._part:<md5>") is created if needed; the first list element seen for an
 * MD5 is stored in it, with ival 2 when the file lies in the output
 * directory and 1 otherwise.  A marker entry "index:dict_md5_full:set" (or
 * "..._part:set") is created once something was registered, which is what
 * unhash_indexes() looks for.
 *
 * Returns 1 when the list was processed, 0 when an argument is NULL.
 */
int hash_indexes(xdc_conf_t *xdc_conf, fstat_lt *flist_start, unsigned flags)
{
  fstat_lt *flist;
  index_lt *index;
  nameval_t *nv;
  char *output_dir;
  char *dir_name;
  int is_odir;
  int nfull_set;
  int npart_set;

  if (!xdc_conf || !flist_start)
    return 0;

  output_dir = find_output_dir(xdc_conf);

  nfull_set = npart_set = 0;
  for (flist = flist_start; flist; flist = flist->next) {
    if (!flist->udata)
      continue;

    is_odir = 0;
    if (output_dir) {
      dir_name = mbs_dirname(flist->path);
      if (dir_name) {
        if (!strcmp(dir_name, output_dir))
          is_odir = 1;
        free(dir_name);   /* the directory name is only needed for this test */
      }
    }

    index = (index_lt*) flist->udata;
    if ((flags & H_IDX_MD5_FULL) && index->dict_md5_full) {
      nv = hlookup(xdc_conf->symtab, pfmbs("index:dict_md5_full:%s", index->dict_md5_full), 1);
      if (! nv->ival) {
        nv->ival = is_odir ? 2 : 1;
        nv->val = (void*) flist;
        nv->val_type = Tflist;
      }
      if (!nfull_set)
        hlookup(xdc_conf->symtab, "index:dict_md5_full:set", 1);
      nfull_set++;
    }
    if ((flags & H_IDX_MD5_PART) && index->dict_md5_part) {
      nv = hlookup(xdc_conf->symtab, pfmbs("index:dict_md5_part:%s", index->dict_md5_part), 1);
      if (! nv->ival) {
        nv->ival = is_odir ? 2 : 1;
        nv->val = (void*) flist;
        nv->val_type = Tflist;
      }
      if (!npart_set)
        hlookup(xdc_conf->symtab, "index:dict_md5_part:set", 1);
      npart_set++;
    }
  }

  free(output_dir);

  return 1;
}

/*
 * Remove the symbol table entries created by hash_indexes().  Each family
 * of entries ("index:dict_md5_full:..." and "index:dict_md5_part:...") is
 * removed only when its ":set" marker entry is present.
 * A NULL configuration is ignored.
 */
void unhash_indexes(xdc_conf_t *xdc_conf)
{
  if (xdc_conf != NULL) {
	if (hlookup(xdc_conf->symtab, "index:dict_md5_full:set", 0) != NULL) {
	  htable_regex_free(xdc_conf->symtab, "index:dict_md5_full:.*", 0);
	}

	if (hlookup(xdc_conf->symtab, "index:dict_md5_part:set", 0) != NULL) {
	  htable_regex_free(xdc_conf->symtab, "index:dict_md5_part:.*", 0);
	}
  }
}

