#!/usr/bin/env perl

#                 Copyright (C) 2002, 2003 Stijn van Dongen
#
# You can redistribute and/or modify this program under the terms of the GNU
# General Public License;  either version 2 of the License or (at your option)
# any later version.

# TODO
#  check values; check correct usage --key=val and --key
#  (by associating type with key).


use strict;

my $mode_sort = 'a';    # a lphabetical
                        # o ccurrence

my $mode_score = 'e';   # e value
                        # b it

my $bcut = 6;
my $ecut = 40;

my $blastfix = "";
my $addfix = "";

my $user_tabfile = "";
my $stdhandler = 0;

while ($ARGV[0] =~ /^--/) {
   my $arg = shift @ARGV;
   my ($val, $key) = ("", "");
   if ($arg =~ /^--(.*?)(=(.*))?$/) {
      $key = $1;
      $val = $2 ? $3 : "";
   }
   else {
      print "Arguments must be in <--key=val> or <--key> format\n";
      exit(1);
   }

   if ($key eq 'sort') {
      $mode_sort = $val;
      if ($mode_sort !~ /[ao]/) {
         die "unknown sort mode <$mode_sort>\n";
      }
   }
   elsif ($key eq 'ecut') {
      $ecut = $val;
   }
   elsif ($key eq 'tab') {
      $user_tabfile = $val;
   }
   elsif ($key eq 'bcut') {
      $bcut = $val;
   }
   elsif ($key eq 'score') {
      $mode_score = $val;
      if ($mode_score !~ /[eb]/) {
         die "unknown sort mode <$mode_score>\n";
      }
   }
   elsif ($key eq 'xo-dat') {
      $addfix = $val;
   }
   elsif ($key eq 'xi-dat') {
      $blastfix = $val;
   }
   elsif ($key eq 'help') {
      help();
   }
   elsif ($key eq 'stdhandler') {
     $stdhandler = 1;
   }
   else {
      die "unknown argument $arg\n";
   }
}

if (!@ARGV) {
   help();
   exit(0);
}

my $fname = shift || die "please submit name of blast file\n";
my $obase = $fname;

if ($blastfix) {
   $obase = $fname;
   if ($fname =~ /\Q.$blastfix\E$/) {
      $obase =~ s/\Q.$blastfix\E$//;
   }
   else {
      $fname .= ".$blastfix";
   }
}
if ($addfix) {
   $obase .=  ".$addfix";
}

my ($gix, $giy);
my $seenlft = {};
my $seenrgt = {};
my $tagTocc = {};
my $me = "[$0] ";
my $lc = 0;

open(F_BLAST, "<$fname") || die "cannot open $fname\n";

my $user_tab = {};

if ($user_tabfile) {
   read_tab($user_tabfile, $user_tab);
}
else {
   open(F_TAB, ">$obase.tab") || die "cannot open $obase.idx\n";    # indices
   print F_TAB "#<mapped index> <tag>\n";
   if ($mode_sort eq 'o') {
      print F_TAB "# sort mode is by occurrence\n";
   }
   elsif ($mode_sort eq 'a') {
      print F_TAB "# sort mode is alphabetical\n";
   }
   open(F_HDR, ">$obase.hdr") || die "cannot open $obase.hdr\n";    # header
   open(F_MAP, ">$obase.map") || die "cannot open $obase.idx\n";    # map file
   #  <raw ordinal>     <sort ordinal>
}

my ($f_raw, $f_err);

if ($stdhandler) {
   $f_raw = \*STDOUT;
   $f_err = \*STDERR;
} else {
   open(F_RAW, ">$obase.raw") || die "cannot open $obase.raw\n";    # raw lists.
   $f_raw = \*F_RAW;
   open(F_ERR, ">$obase.err") || die "cannot open $obase.err\n";
   $f_err = \*F_ERR;
}

my $need_query = 1;
my $need_hits = 2;
my $need_gi = 3;

my $ID = 0;

my $state = $need_query;

while (<F_BLAST>) {
   $lc++;
   chomp;
   if (/^Query=\s+gi\|(\d+(_\d+)?)/ || /^Query=\s+(\S+)\s+.*$/) {
      if ($state != $need_query) {
         print STDERR "unexpected 'Query=' line\n";
      }
      $gix = $1;
      my $id = getid($gix);

      if ($id >= 0) {
         print $f_raw "$id ";
      }
      $state = $need_hits;
      $seenlft->{$gix}++;
      $seenrgt->{$gix}++;
   }
   elsif
   ($state == $need_hits && /sequences producing significant alignments/i) {
      $state = $need_gi;
   }
   elsif ($state == $need_hits && /no hits found/i) {
      print STDERR "no hits found for gi $gix\n";
      print $f_raw "\$\n";
      $state = $need_query;
   }
   elsif
   (  $state == $need_gi
   && (/^gi\|(\d+(_\d+)?)/ || /^(\S+)\s+.*\.\.\..*$/)
   )
   {  $giy = $1;
      my $id = getid($giy);
      my ($s, $b, $e);

      if (/(\S+)\s+(\S+)\s*$/) {
         $b = +$1;
         $e = +$2;
         $seenrgt->{$giy}++;
      }
      else {
         print STDERR "no scores in line $lc [$_]!\n";
         next;
      }
      if ($mode_score eq 'e') {
         $s = $e > 0 ? -log($e)/log(10) : 200;
         if ($s > 200) {
            $s = 200;
         }
         if ($s <= $ecut) {
            next;
         }
      }
      elsif ($mode_score eq 'b' || 1) {
         $s = $b;
         if ($s <= $bcut) {
            next;
         }
      }
      if ($id >= 0) {
         print $f_raw "$id:$s ";
      }
   }
   elsif (/^\s*$/) {
      # paragraph skip does not change state, including the $need_gi case.
   }
   elsif ($state == $need_gi) {
      print $f_raw "\$\n";
      $state = $need_query;
   }
}

if ($state == $need_gi) {
   print $f_err "run ended while expecting more secondary scores\n";
   print STDERR "run ended while expecting more secondary scores\n";
   print $f_raw "\$\n";
}

my $alnum = 0;
my $occTmisc = {};

unless ($user_tabfile) {
   if ($mode_sort eq 'a') {
      for (sort {$::a cmp $::b; } keys %$tagTocc) {
         print F_TAB "$alnum $_\n";
         $occTmisc->{$tagTocc->{$_}} = [ $alnum, $_ ];
         $alnum++;
      }
      print STDERR "Index [$obase.tab] is sorted by alphabetic order\n";
   }
   elsif ($mode_sort eq 'o' || 1) {
      for (sort {$tagTocc->{$::a} <=> $tagTocc->{$::b}; } keys %$tagTocc) {
         print F_TAB "$alnum $_\n";
         $occTmisc->{$tagTocc->{$_}} = [ $alnum, $_ ];
         $alnum++;
      }
      print STDERR "Index [$obase.tab] is sorted by occurrence order\n";
      print STDERR "Primary and secondary occurrences are considered equal\n";
   }
   my $ct = keys %$occTmisc;
   print F_MAP "(mclheader\nmcltype matrix\ndimensions $ct", 'x',
               "$ct\n)\n(mclmatrix\nbegin\n";
   for (sort {$::a <=> $::b; } keys %$occTmisc) {
      # print F_MAP "$_ ", $occTmisc->{$_}[0], " ", $occTmisc->{$_}[1], "\n";
      print F_MAP  "$_ $occTmisc->{$_}[0] \$\n";
   }
   print F_MAP ")\n";

   print F_HDR "(mclheader\nmcltype matrix\ndimensions ";
   print F_HDR $ID . 'x' . $ID;
   print F_HDR "\n)\n";
}


my $n_err = 0;
for (sort keys %$seenrgt) {
   if (!$seenlft->{$_}) {
      print $f_err "secondary element $_ not seen as primary element\n";
      print $f_err "emergency measure: added the element to the primary list\n";
      $n_err++;
   }
}

if ($n_err) {
   print STDERR $me, "$n_err secondary elements not seen as primary element\n";
   print STDERR $me, "I added all of them\n";
   print STDERR $me, "There were $ID elements in all\n";
}
else {
   print STDERR $me,
   "all secondary elements were also seen as primary elements (check ok)\n";
}


sub read_tab {
   my $file = shift;
   my $tab = shift;
   open (U_TAB, "<$file") || "die cannot open $file\n";
   while (<U_TAB>) {
      if (/^\s*#/) {
         next;
      }
      else {
         if (/^(\d+)\s+(.*)/) {
            $tab->{$2} = $1;
         }
         else {
            print STDERR "___ cannot parse line: $_";
         }
      }
   }
}



sub getid {
   my $gi = $_[0];

   if ($user_tabfile) {
      if (defined($user_tab->{$gi})) {
         return $user_tab->{$gi};
      }
      else {
         print STDERR "___ no user tab entry for label <$gi>\n";
      }
   }
   else {
      if (!exists($tagTocc->{$gi})) {
         $tagTocc->{$gi} = $ID++;
      }
      return $tagTocc->{$gi};
   }
   return -1;
}


sub help {
   print <<_help_;
Usage: mcxdeblast <options> file-name
where file-name is in BLAST NCBI format.
mcxdeblast will create
   base.hdr    [to be read by mcxassemble]
   base.raw    [to be read by mcxassemble]
   base.map    [to be read by mcxassemble]
   base.tab    [to be read by clmformat]
   base.err    [error log]
where base is derived from or equal to file-name
Options:
   --score=<b|e>     Use bit scores or E-values.
   --mode=<a|o>      Use alphabetic sorting (default) or occurrence.
   --xi-dat=<suf>    Strip <suf> from file-name to create output base name.
   --xo-dat=<suf>    Add <suf> to base name.
   --bcut=<val>      Ignore hits below bit score <val>
   --ecut=<val>      Ignore hits below E-value <val>
_help_
}
