#!/usr/bin/perl

# This file is part of The New Aspell
# Copyright (C) 2004 by Kevin Atkinson under the GNU LGPL
# license version 2.0 or 2.1.  You should have received a copy of the
# LGPL license along with this library if you did not you can find it
# at http://www.gnu.org/.

use strict;
use warnings;

use constant { 
  CHAR => 0, TYPE => 1, DISPLAY=> 2, UPPER => 3, LOWER => 4, TITLE => 5,
  PLAIN => 6, NAME => 7
};

sub uni_char($$$); # uni def what

if ($#ARGV < 1) 
  {die "Usage: $0 <unicode data file> <textual reference table(s)>\n"}

my (%unidata);

open IN, $ARGV[0] or die "Can't open \"$ARGV[0]\": $!\n";

while (<IN>) {
  chop;
  s/\s*#\s*(.*)$//;
  my $n = $1;
  next unless $_;
  my @data = split / /;
  $n =~ /#\s*(.+?)\s*$/;
  push @data, $1;
  $unidata{$data[0]} = \@data;
}

shift;

my %uni_char;
my %char_uni;

foreach my $file (@ARGV) {
  my ($base) = $file =~ /^(.+)\.txt/i or die "$file does not end in \".txt\"\n";
  $base = lc $base;

  open OUT,">$base\.cset" or die "Can't create \"$base\.cset\": $!\n";

  my $nl = 1 if $base =~ s/-nl$//;

  open IN, "$base\.txt"   or die "Can't open \"$base.txt\": $!\n";

  $base =~ s/^.+\///;

  my @ascii = (0,16,32..64,91..96,123..127);
  my @chardata;
  undef %char_uni;
  undef %uni_char;

  my $no_ascii = 0;

  my $i = 1;
  while (<IN>) {
    /^\s*(\=|0x)([a-fA-F0-9]{2})\s+(U\+|0x)([a-fA-F0-9]{4})/ or next;
    #print "$2 $4\n";
    my $char = hex($2);
    my $uni  = hex($4);
    next if $char == $uni && ($char < 0x20 || (0x80 <= $char && $char < 0xA0));
    printf("Warning remapping '%c' (0x%X) may cause problems with Aspell.\n",
           $char, $char)
        if $char != $uni && ($char == 0x00 || $char == 0x10
                             || (0x20 <= $char && $char <= 0x40)
                             || (0x5B <= $char && $char <= 0x60)
                             || (0x7B <= $char && $char <= 0x7F));
    $no_ascii = 2 if $char != $uni && (0x20 <= $char && $char <= 0x7F);
    $char_uni{uc $2} = uc $4;
    $uni_char{uc $4} = uc $2;
    $i++;
  }

  foreach my $i (@ascii) {
    my $char = sprintf("%02X",$i);
    my $unichar = "00".$char;
    next if defined $char_uni{$char};
    $char_uni{$char} = $unichar;
    $uni_char{$unichar} = $char;
  }

  foreach my $i (0x00..0xFF) {
    my $char = sprintf("%02X",$i);
    my $unichar = $char_uni{$char};
    my $info = $unidata{$unichar} if defined $unichar;
    my $letter = $info->[NAME] =~ /LATIN.+LETTER/ if defined $info;
    if (defined $info && (!$nl || !$letter)) {
      $chardata[$i] =
        [$char,
         $unichar,
         $info->[TYPE],                            # 2
         $info->[DISPLAY],                         # 3
         uni_char($info->[UPPER], $char, "upper"), # 4
         uni_char($info->[LOWER], $char, "lower"), # 5
         uni_char($info->[TITLE], $char, "title"), # 6
         uni_char($info->[PLAIN], $char, "plain"), # 7
         '00',                                     # 8
         '00',                                     # 9
         $info->[NAME],
        ];
    } else {
      my $u = (defined $unichar ? hex($unichar)
               : $no_ascii && 0x20 <= $i && $i < 0x80 ? $i + 0xE000
               : $i >= 0xA0 ? $i + 0xE000
               : $i);
      my $n = ($u >= 0xE000 ? '<unused>'
               : defined $unichar ?
               ($letter ? '<unused latin letter>' : '<unused special>')
               : $u < 0x20 ? '<unused control>'
               : $u < 0x80 ? '<unused latin letter>'
               : $u < 0xA0 ? '<unused control>'
               : die "$base $u");
      $chardata[$i] =
        [$char,
         sprintf("%04X", $u),
         '-', ($n =~ /letter/ ? 'Y': 'N'),
         $char, $char, $char, '00', '00', '00',
         $n];
    }
  }

  foreach my $char (sort keys %char_uni) {
    my $unichar = $char_uni{$char};
    my $info = $unidata{$unichar};
    next unless defined $info;
    my $inf = $chardata[hex $char];
    my $t = uc($inf->[2]);
    if ($t eq 'L' || $t eq 'V') {
      $inf->[2] =~ tr/vV/lL/;
      $inf->[8] = $t eq 'V' ? '2A' : $chardata[hex $inf->[5]][7];
      $inf->[9] = $t eq 'V' ? '00' : $chardata[hex $inf->[5]][7];
    } else {
      $inf->[7] = '00';
    }
  }

  foreach my $char (sort keys %char_uni) {
  }

  print OUT "# Aspell Character Data File.\n";
  print OUT "= $base\n";
  print OUT "> ascii\n" unless $no_ascii || $base eq 'ascii';
  print OUT "/\n";
  print OUT "# <char> <uni> <type> <display> <upper> <lower> <title> <plain>\n";
  print OUT "#                                                         <sl-first> <sl-rest>\n";
  foreach my $i (0..255) {
    my @d = @{$chardata[$i]};
    print OUT "@d[0..9] # $d[10]\n";
  }
}

sub uni_char($$$) {
  my ($uni, $def, $what) = @_;
  my $chr =  $uni_char{$uni};
  return $chr if defined $chr;
  print STDERR "Warning U+$uni mot mapped.  It is needed for the \"$what\" mapping of 0x$def.\n"
    if ($unidata{$uni} eq 'L' || $unidata{$uni} eq 'V') 
      && ($unidata{$char_uni{$def}} eq 'L' || $unidata{$char_uni{$def}} eq 'V');
  return $def;
}
