
#include "conf.h"

#include <fstream>
#include <cassert>
#include <cstdlib>
#include <cctype>

#include "convert.hh"
#include "app_string.hh"
#include "error_impl.hh"
#include "error_messages.hh"
#include "getdata.hh"

// Code-unit types used by the Unicode converters in this file.
// new_pspell_convert() asserts at run time that Uni16 is 2 bytes and
// Uni32 is 4 bytes wide.
typedef unsigned int   Uni32;
typedef unsigned short Uni16;


//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//
// Lookups
//
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////
//
// PspellToUniLookup
//

// Table mapping each of the 256 possible byte values to a Uni32 value.
// A slot that holds npos has no mapping.  The class has no constructor,
// so the table contents are indeterminate until reset() is called.
class PspellToUniLookup
{
private:
  static const Uni32 npos = (Uni32)(-1);
  Uni32 data[256];

public:
  // Mark every slot as unmapped.
  void reset();

  // Value stored for key; npos when nothing has been inserted for it.
  Uni32 operator[] (char key) const
  {
    const unsigned char slot = (unsigned char)key;
    return data[slot];
  }

  // True when key already has a mapping.
  bool have(char key) const
  {
    return !(data[(unsigned char)key] == npos);
  }

  // Store value for key unless key is already mapped.
  bool insert(char key, Uni32 value);
};

void PspellToUniLookup::reset()
{
  Uni32 * const last = data + 256;
  for (Uni32 * slot = data; slot != last; ++slot)
    *slot = npos;
}

// Returns true when the mapping was stored, false when key already had
// one (the existing mapping is left untouched).
bool PspellToUniLookup::insert(char key, Uni32 value)
{
  Uni32 & slot = data[(unsigned char)key];
  if (slot == npos) {
    slot = value;
    return true;
  }
  return false;
}

//////////////////////////////////////////////////////////////////////
//
// PspellFromUniLookup
//

// Assumes that the maximum number of items in the table is 256.
// Also assumes (unsigned char)i == i % 256.

// Based on the iso8859-* character sets it is very fast, almost all
// lookups involving no more than 2 comparisons.
// No lookups involved more than 3 comparisons.
// Also, no division (or modulus) is done whatsoever.


// One entry of the reverse table: a Uni32 key and the byte it maps to.
struct PspellUniItem {
  Uni32 key;
  char  value;
};

// Maps Uni32 keys back to single bytes.
//
// The low byte of the key selects a bucket of four consecutive slots in
// data[].  Slots of a bucket are filled front to back; a slot whose key
// is npos is empty.  Keys that do not fit into their bucket are stored
// in overflow[], which is searched linearly.
//
// The constructor only records the replacement byte, so reset() must be
// called before insert() or operator[] is used.
class PspellFromUniLookup 
{
private:
  char unknown;                    // returned for keys without a mapping
  static const Uni32 npos = (Uni32)(-1);
  PspellUniItem * overflow_end;    // one past the last used overflow entry
  
  PspellUniItem data[256*4];

  PspellUniItem overflow[256]; // you can never be too careful;
  
public:
  PspellFromUniLookup(char u = '?') : unknown(u) {}
  void reset();
  char operator[] (Uni32 key) const;
  bool insert(Uni32 key, char value);
};

// Empty every bucket and the overflow area.
void PspellFromUniLookup::reset()
{
  for (unsigned int i = 0; i != 256*4; ++i) {
    data[i].key   = npos;
    // Give empty slots a defined value: looking up npos itself matches
    // an empty slot and must yield the replacement byte, not garbage.
    data[i].value = unknown;
  }
  overflow_end = overflow;
}

// Return the byte mapped to k, or the replacement byte when k is unknown.
char PspellFromUniLookup::operator[] (Uni32 k) const
{
  const PspellUniItem * i = data + (unsigned char)k * 4;

  // Unrolled probe of the four bucket slots.
  if (i->key == k) return i->value;
  ++i;
  if (i->key == k) return i->value;
  ++i;
  if (i->key == k) return i->value;
  ++i;
  if (i->key == k) return i->value;
  
  // The last slot is still empty, so nothing for this bucket can have
  // spilled into the overflow area.
  if (i->key == npos) return unknown;
  
  for(i = overflow; i != overflow_end; ++i)
    if (i->key == k) return i->value;

  return unknown;
}

// Add the mapping k -> v.
// Returns false, leaving the table unchanged, when k is already present,
// when k equals the empty-slot marker npos, or when the overflow area is
// full.  Returns true when the mapping was stored.
bool PspellFromUniLookup::insert(Uni32 k, char v) 
{
  // npos marks empty slots and therefore cannot be stored as a key.
  if (k == npos)
    return false;

  PspellUniItem * i = data + (unsigned char)k * 4;
  PspellUniItem * e = i + 4;
  while (i != e && i->key != npos) {
    if (i->key == k)
      return false;
    ++i;
  }
  if (i == e) {
    // Bucket is full: look for a duplicate in the overflow area, then
    // claim the next free overflow entry.
    for(i = overflow; i != overflow_end; ++i)
      if (i->key == k) return false;
    if (overflow_end == overflow + 256)
      return false;
    // Without advancing overflow_end the new entry would never be seen
    // by operator[] and would be overwritten by the next spill.
    ++overflow_end;
  }
  i->key = k;
  i->value = v;
  return true;
}

//////////////////////////////////////////////////////////////////////
//
// PspellCharLookup
//

// Byte-to-byte translation table.
//
// Each slot holds -1 when empty, otherwise the target byte as a value in
// the range 0..255.  The class has no constructor, so reset() must be
// called before the table is used.
class PspellCharLookup 
{
private:
  int data[256];
public:
  void reset();
  // Byte stored for key; an empty slot yields (char)-1.
  char operator[] (char key) const {return (char)data[(unsigned char)key];}
  bool insert(char key, char value);
};

// Mark every slot as empty.
void PspellCharLookup::reset() {
  for (int i = 0; i != 256; ++i) 
    data[i] = -1;
}

// Store value for key unless key already has a mapping.
// Returns true when stored, false when key was already mapped.
bool PspellCharLookup::insert(char key, char value) 
{
  if (data[(unsigned char)key] != -1)
    return false;
  // Store the byte as 0..255.  Where char is signed, storing the plain
  // char would turn byte 0xFF into -1, which is the empty marker, and
  // the slot could then be silently overwritten.
  data[(unsigned char)key] = (unsigned char)value;
  return true;
}


//////////////////////////////////////////////////////////////////////
//
//  Pspell_StraightThrough
//

// Converter whose input and output encoding are the same name: input
// bytes are handed to the output string without any translation.
class Pspell_StraightThrough : public PspellConvert
{
public:
  Pspell_StraightThrough(const char * e)
    : PspellConvert(e, e)
  {}

  void convert(const char * in,
               PspellAppendableString & out) const;

  const char * convert_until(const char * in, const char * stop,
                             PspellAppendableString & out) const;

  bool convert_next_char(const char * & in,
                         PspellAppendableString & out) const;
};

// Pass the whole input string to out.append() in one call.
void Pspell_StraightThrough::convert(const char * in,
                                     PspellAppendableString & out) const
{
  out.append(in);
}

// Append the stop - in bytes starting at in, and report stop as the
// position reached.
const char *
Pspell_StraightThrough::convert_until(const char * in,
                                      const char * stop,
                                      PspellAppendableString & out) const
{
  out.append(in, stop-in);
  return stop;
}

// Copy one byte and advance in past it.  Returns false, without touching
// in or out, when in points at the terminating NUL.
bool Pspell_StraightThrough::convert_next_char(const char * & in,
                                               PspellAppendableString & out) const
{
  if (*in == '\0')
    return false;

  out.append(in,1);
  ++in;
  return true;
}

//////////////////////////////////////////////////////////////////////
//
// read in char data
//

// Load the character map for an encoding and fill both lookup tables.
//
// The map is read from "<dir>/<encoding>.map", where <dir> is the
// "pspell-data-dir" config value or, when that is not set, "data-dir".
// Each pair delivered by getdata_pair() is parsed as two hexadecimal
// numbers: a byte value and the Uni32 value it stands for.
//
//   error     reset on entry; set when the file cannot be opened, an
//             entry cannot be parsed, or a byte / Uni32 value occurs twice
//   config    supplies the data directory
//   encoding  name of the encoding (also the base name of the map file)
//   to        byte -> Uni32 table; reset, then filled
//   from      Uni32 -> byte table; reset, then filled
//
// On any error the function returns immediately and the tables hold
// whatever had been inserted up to that point.  After a successful read,
// bytes 0..127 that the file did not mention map to themselves in both
// tables, and bytes 128..255 that it did not mention map to '?' in `to`.
//
// NOTE(review): the error prefixes below say ".dat" although the file
// that is opened ends in ".map" -- confirm which one is intended.
void read_in_char_data (PspellCanHaveErrorImpl & error,
			PspellConfig & config,
			const char * encoding,
			PspellToUniLookup & to,
			PspellFromUniLookup & from)
{
  error.reset_error();
  to.reset();
  from.reset();
  const char * temp_str = config.retrieve("pspell-data-dir");
  if (temp_str == 0)
    temp_str = config.retrieve("data-dir");
  assert(temp_str != 0);
  PspellString file_name = temp_str;
  file_name += '/';
  file_name += encoding;
  file_name += ".map";
  STD ifstream fdata(file_name.c_str());
  if (!fdata) {
    error.set_error(unknown_encoding, encoding);
    error.error_mesg_ += " This could also mean that the file \"";
    error.error_mesg_ += file_name;
    error.error_mesg_ += "\" could not be opened for reading "
      "or does not exist.";
    return;
  }
  PspellString chr_hex,uni_hex;
  char  chr;
  Uni32 uni;
  unsigned long t;
  PspellGetLineFromStream data(&fdata);
  while (getdata_pair(data, chr_hex, uni_hex)) {
    // Parse the byte value.  strtoul reports where it stopped through a
    // separate end pointer, so the string's buffer keeps its constness.
    const char * chr_begin = chr_hex.c_str();
    char * chr_stop = 0;
    t = strtoul(chr_begin, &chr_stop, 16);
    if (chr_stop != chr_begin + chr_hex.size() 
	|| t != (unsigned char)t /* check for overflow */) 
      {
	error.error_mesg_   = encoding;
	error.error_mesg_  += ".dat: ";
	error.set_error(bad_key, 
			chr_hex.c_str(),
			"two digit hex string");
	return;
      }
    chr = (char)t;
     
    // Parse the Uni32 value the same way.
    const char * uni_begin = uni_hex.c_str();
    char * uni_stop = 0;
    t = strtoul(uni_begin, &uni_stop, 16);
    if (uni_stop != uni_begin + uni_hex.size() 
	|| t != (Uni32)t /* check for overflow */) 
      {
	error.error_mesg_   = encoding;
	error.error_mesg_  += ".dat: ";
	error.set_error(bad_value,
			chr_hex.c_str(), uni_hex.c_str(),
			"four digit hex string");
	return;
      }
    uni = (Uni32)t;

    if (to.have(chr)) {
      error.error_mesg_   = encoding;
      error.error_mesg_  += ".dat: ";
      error.set_error(duplicate,
		      "Character",
		      chr_hex.c_str());
      return;
    }
    to.insert(chr, uni);
    if (!from.insert(uni, chr)) {
      error.error_mesg_   = encoding;
      error.error_mesg_  += ".dat: ";
      error.set_error(duplicate,
		      "Uni Character",
		      uni_hex.c_str());
      return;
    }
  }
  
  // insert the ascii characters if they are not already there
  unsigned int i; 
  for (i = 0; i != 128; ++i) {
    if (to.insert(i, i))
      from.insert(i,i);
  }
  // Give every remaining high byte a fallback.  The bound is 256 so that
  // byte 0xFF is covered as well; stopping at 255 left it unmapped.
  for (; i != 256; ++i) {
    to.insert(i, '?');
  }
  
}


//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
//
//  PspellConvert
//
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////


// Record the names of the input and output encodings.
PspellConvert::PspellConvert(const char * incode, const char * outcode)
  : in_code_(incode)
  , out_code_(outcode)
{
}

// Two converters compare equal when both their input encoding names and
// their output encoding names are identical strings.
bool operator== (const PspellConvert & rhs, const PspellConvert & lhs)
{
  if (strcmp(rhs.in_code(), lhs.in_code()) != 0)
    return false;
  return strcmp(rhs.out_code(), lhs.out_code()) == 0;
}

// Default whole-string conversion: keep calling convert_next_char()
// until it reports that there is nothing left to convert.
void PspellConvert::convert (const char *  in, 
			     PspellAppendableString & out) const
{
  for (;;) {
    if (!convert_next_char(in, out))
      break;
  }
}

// Default bounded conversion: convert one character at a time while the
// read position is still before stop and convert_next_char() succeeds.
// Returns the read position that was reached.
const char * PspellConvert::convert_until (const char *  in, 
					   const char * stop, 
					   PspellAppendableString & out) const
{
  while (in < stop) {
    if (!convert_next_char(in, out))
      break;
  }
  return in;
}

//////////////////////////////////////////////////////////////////////
//
// Pspell Char to Uni16
//

class Pspell_Char_Uni16 : public PspellConvert
{
public:
  PspellToUniLookup lookup;
  Pspell_Char_Uni16(PspellConfig & c, const char * e);
  bool convert_next_char (const char * & in, 
			  PspellAppendableString & out) const;
};

Pspell_Char_Uni16::Pspell_Char_Uni16(PspellConfig & c, const char * e) 
  : PspellConvert(e, "machine unsigned 16")
{
  PspellFromUniLookup unused;
  read_in_char_data(error_, c, e, lookup, unused);
}


bool Pspell_Char_Uni16
::convert_next_char (const char * & in, 
		     PspellAppendableString & out) const
{
  if (*in != '\0') {
    Uni16 d = lookup[*in];
    out.append((char *)&d, 2);
    ++in;
    return true;
  } else {
    return false;
  }
}

//////////////////////////////////////////////////////////////////////
//
// Pspell Char to Uni32
//

class Pspell_Char_Uni32 : public PspellConvert
{
public:
  PspellToUniLookup lookup;
  Pspell_Char_Uni32(PspellConfig & c, const char * e);
  bool convert_next_char (const char * & in, 
			  PspellAppendableString & out) const;
};

Pspell_Char_Uni32::Pspell_Char_Uni32(PspellConfig & c, const char * e) 
  : PspellConvert(e, "machine unsigned 32")
{
  PspellFromUniLookup unused;
  read_in_char_data(error_, c, e, lookup, unused);
}


bool Pspell_Char_Uni32
::convert_next_char (const char * & in, 
		     PspellAppendableString & out) const
{
  if (*in != '\0') {
    Uni32 d = lookup[*in];
    out.append((char *)&d, 4);
    ++in;
    return true;
  } else {
    return false;
  }
}

//////////////////////////////////////////////////////////////////////
//
// Pspell Char to UTF8
//

// Converter from an 8-bit encoding to UTF-8.
class Pspell_Char_UTF8 : public PspellConvert
{
public:
  PspellToUniLookup lookup;   // byte -> Uni32 table, filled by the constructor
  Pspell_Char_UTF8(PspellConfig & c, const char * e);
  bool convert_next_char (const char * & in, 
			  PspellAppendableString & out) const;
};

// Load the map for encoding e into lookup.  The reverse table that
// read_in_char_data() also fills is not needed and is thrown away.
Pspell_Char_UTF8::Pspell_Char_UTF8(PspellConfig & c, const char * e) 
  : PspellConvert(e, "UTF-8")
{
  PspellFromUniLookup unused;
  read_in_char_data(error_, c, e, lookup, unused);
}


// Encode the value mapped to the byte at in as a 1- to 4-byte UTF-8
// sequence, append it to out and advance in by one.
// A value of 0x200000 or above (which includes the table's "unmapped"
// marker) cannot be written in four bytes; a single '?' is emitted for
// it instead of silently dropping the character.
// Returns false, changing nothing, when in points at the terminating NUL.
bool Pspell_Char_UTF8
::convert_next_char (const char * & in, 
		     PspellAppendableString & out) const
{
  if (*in == '\0')
    return false;

  Uni32 c = lookup[*in];
  char str[4];
  int  len;

  if (c < 0x80) {
    str[0] = (char)c;
    len = 1;
  }
  else if (c < 0x800) {
    str[0] = (char)(0xC0 | (c>>6));
    str[1] = (char)(0x80 | (c & 0x3F));
    len = 2;
  }
  else if (c < 0x10000) {
    str[0] = (char)(0xE0 | (c>>12));
    str[1] = (char)(0x80 | ((c>>6) & 0x3F));
    str[2] = (char)(0x80 | (c & 0x3F));
    len = 3;
  }
  else if (c < 0x200000) {
    str[0] = (char)(0xF0 | (c>>18));
    str[1] = (char)(0x80 | ((c>>12) & 0x3F));
    str[2] = (char)(0x80 | ((c>>6) & 0x3F));
    str[3] = (char)(0x80 | (c & 0x3F));
    len = 4;
  }
  else {
    // Not representable: substitute, as done elsewhere in this file
    // for characters without a mapping.
    str[0] = '?';
    len = 1;
  }

  out.append(str, len);
  ++in;
  return true;
}

//////////////////////////////////////////////////////////////////////
//
// Pspell Uni16 to Char
//

class Pspell_Uni16_Char : public PspellConvert
{
public:
  PspellFromUniLookup lookup;
  Pspell_Uni16_Char(PspellConfig & c, const char * e);
  bool convert_next_char (const char * & in, 
			  PspellAppendableString & out) const;
};

Pspell_Uni16_Char::Pspell_Uni16_Char(PspellConfig & c, const char * e)
  : PspellConvert("machine unsigned 16", e) 
{
  PspellToUniLookup unused;
  read_in_char_data(error_, c, e, unused, lookup);
}


bool Pspell_Uni16_Char
::convert_next_char (const char * & in, 
		     PspellAppendableString & out) const
{
  Uni16 c = *(const Uni16 *)(in);
  if (c != 0) {
    char d = lookup[c];
    out.append(&d, 1);
    in += 2;
    return true;
  } else {
    return false;
  }
}

//////////////////////////////////////////////////////////////////////
//
// Pspell Uni32 to Char
//

class Pspell_Uni32_Char : public PspellConvert
{
public:
  PspellFromUniLookup lookup;
  Pspell_Uni32_Char(PspellConfig & c, const char * e);
  bool convert_next_char (const char * & in, 
			  PspellAppendableString & out) const;
};

Pspell_Uni32_Char::Pspell_Uni32_Char(PspellConfig & c, const char * e)
  : PspellConvert("machine unsigned 32", e) 
{
  PspellToUniLookup unused;
  read_in_char_data(error_, c, e, unused, lookup);
}


bool Pspell_Uni32_Char
::convert_next_char (const char * & in, 
		     PspellAppendableString & out) const
{
  Uni32 c = *(const Uni32 *)(in);
  if (c != 0) {
    char d = lookup[c];
    out.append(&d, 1);
    in += 4;
    return true;
  } else {
    return false;
  }
}

//////////////////////////////////////////////////////////////////////
//
// Pspell UTF8 to Char
//

// Converter from UTF-8 to an 8-bit encoding.
class Pspell_UTF8_Char : public PspellConvert
{
public:
  PspellFromUniLookup lookup;   // Uni32 -> byte table, filled by the constructor
  Pspell_UTF8_Char(PspellConfig & c, const char * e);
  bool convert_next_char (const char * & in, 
			  PspellAppendableString & out) const;
};

// Load the map for encoding e into lookup.  The forward table that
// read_in_char_data() also fills is not needed and is thrown away.
Pspell_UTF8_Char::Pspell_UTF8_Char(PspellConfig & c, const char * e)
  : PspellConvert("UTF-8", e) 
{
  PspellToUniLookup unused;
  read_in_char_data(error_, c, e, unused, lookup);
}

// Helper for convert_next_char(): consume one continuation byte
// (10xxxxxx) and shift its six payload bits into u.  If the next byte is
// not a continuation byte it is left unread, u becomes '?' and control
// jumps to the FINISH label.  Expects the names in, c, u and FINISH to
// exist at the point of use.
#define get_check_next \
  c = *in;                                        \
  if ((c & 0xC0) != 0x80) {u = '?'; goto FINISH;} \
  ++in;                                           \
  u <<= 6;                                        \
  u |= c & 0x3F


// Decode one UTF-8 sequence starting at in, translate it through lookup
// and append the resulting byte to out; in is advanced past the bytes
// that were consumed.
//
// Continuation bytes found where a sequence should start are skipped.
// A sequence that is cut short, or that starts with a byte that is not a
// valid 1- to 4-byte lead, is treated as '?'.
//
// Returns false when in points at the terminating NUL, or when only
// stray continuation bytes remain before it (in is then left on the
// NUL); otherwise returns true.
bool Pspell_UTF8_Char
::convert_next_char (const char * & in, 
		     PspellAppendableString & out) const
{
  if (*in == '\0') {
    return false;
  }

  Uni32 u = (Uni32)(-1);

  char c = *in++;
  while ((c & 0xC0) == 0x80) {
    // Never read past the terminator: if the stray bytes run up to the
    // end of the string there is nothing left to convert.
    if (*in == '\0') return false;
    c = *in++;
  }
  if ((c & 0x80) == 0x00) { // 1-byte wide
    u = c;
  } else if ((c & 0xE0) == 0xC0) { // 2-byte wide
    u  = c & 0x1F; 
    get_check_next;
  } else if ((c & 0xF0) == 0xE0) { // 3-byte wide
    u  = c & 0x0F; 
    get_check_next;
    get_check_next;
  } else if ((c & 0xF8) == 0xF0) { // 4-byte wide
    u  = c & 0x07;  // lead byte 11110xxx carries its payload in the low 3 bits
    get_check_next;
    get_check_next;
    get_check_next;
  } else { // 0xF8..0xFF can not start a sequence
    u = '?';
  }

 FINISH:

  assert (u != (Uni32)(-1));

  char d = lookup[u];
  out.append(&d, 1);
  return true;

}

//////////////////////////////////////////////////////////////////////
//
// Pspell Char to Char
//

// Converter between two 8-bit encodings, driven by a single
// byte-to-byte table that the constructor builds.
class Pspell_Char_Char : public PspellConvert
{
public:
  // input byte -> output byte
  PspellCharLookup lookup;

  Pspell_Char_Char(PspellConfig & c, const char * in, const char * out);

  bool convert_next_char(const char * & in,
                         PspellAppendableString & out) const;
};

// Build lookup by chaining two maps: in_code byte -> Uni32 value, then
// Uni32 value -> out_code byte.  If loading either map leaves an error
// set, the constructor returns at once and lookup is not filled.
Pspell_Char_Char::Pspell_Char_Char(PspellConfig & c,
                                   const char * in_code,
                                   const char * out_code)
  : PspellConvert(in_code, out_code)
{
  PspellToUniLookup   in_to_uni;
  PspellFromUniLookup uni_to_out;

  {
    // Only the forward half of the input encoding's map is wanted.
    PspellFromUniLookup discarded;
    read_in_char_data(error_, c, in_code, in_to_uni, discarded);
    if (error_number() != 0)
      return;
  }

  {
    // Only the reverse half of the output encoding's map is wanted.
    PspellToUniLookup discarded;
    read_in_char_data(error_, c, out_code, discarded, uni_to_out);
    if (error_number() != 0)
      return;
  }

  lookup.reset();
  for (unsigned int byte = 0; byte != 256; ++byte)
    lookup.insert(byte, uni_to_out[in_to_uni[byte]]);
}

// Translate the byte at in through lookup and advance in by one.
// Returns false, changing nothing, when in points at the terminating NUL.
bool Pspell_Char_Char::convert_next_char(const char * & in,
                                         PspellAppendableString & out) const
{
  if (*in == '\0')
    return false;

  char d = lookup[*in];
  out.append(&d, 1);
  in += 1;
  return true;
}


//////////////////////////////////////////////////////////////////////
//
// new_pspell_convert
//

// Create the converter that translates text from encoding `in` to
// encoding `out`.
//
// Both names are compared case-insensitively (they are lower-cased
// first) and "ascii" is treated as "iso8859-1".  Besides the names of
// 8-bit encodings, "machine unsigned 16", "machine unsigned 32" and
// "utf-8" are recognised on either side; identical names yield a
// pass-through converter.  When the input side is one of the three
// Unicode forms, that branch is taken before the output side is looked
// at.
//
//   c    configuration handed to the converters that load a map file
//   in   name of the input encoding
//   out  name of the output encoding
//
// Returns an object allocated with new.  The converters that load a map
// file record any loading error in the returned object.
PspellCanHaveError * new_pspell_convert(PspellConfig & c,
					const char * in, 
					const char * out) 
{
  assert(sizeof(Uni16) == 2);
  assert(sizeof(Uni32) == 4);

  PspellString in_s  = in;
  PspellString out_s = out;

  // tolower() is only defined for EOF and values representable as
  // unsigned char, so a plain (possibly negative) char must be
  // converted before it is passed in.
  unsigned int i;
  for (i = 0; i != in_s.size(); ++i)
    in_s[i] = tolower((unsigned char)in_s[i]);
  for (i = 0; i != out_s.size(); ++i)
    out_s[i] = tolower((unsigned char)out_s[i]);
  in  = in_s .c_str();
  out = out_s.c_str();

  if (strcmp(in ,"ascii") == 0) 
    in = "iso8859-1";
  if (strcmp(out,"ascii") == 0) 
    out = "iso8859-1";

  if (strcmp(in, out) == 0)
    return new Pspell_StraightThrough(in);

  else if (strcmp(in, "machine unsigned 16") == 0)
    return new Pspell_Uni16_Char(c,out);

  else if (strcmp(in, "machine unsigned 32") == 0)
    return new Pspell_Uni32_Char(c,out);

  else if (strcmp(in, "utf-8") == 0)
    return new Pspell_UTF8_Char(c,out);

  else if (strcmp(out, "machine unsigned 16") == 0)
    return new Pspell_Char_Uni16(c,in);

  else if (strcmp(out, "machine unsigned 32") == 0)
    return new Pspell_Char_Uni32(c,in);

  else if (strcmp(out, "utf-8") == 0)
    return new Pspell_Char_UTF8(c,in);

  else
    return new Pspell_Char_Char(c, in, out);
}
