#!/usr/local/bin/perl -w

use strict;

use Getopt::Long;
use Carp;
use FileHandle;
use FindBin qw($Bin);
use File::Basename qw( dirname );
use Time::localtime;

# Load libraries needed for reading config -----------------------------------
use vars qw( $SERVERROOT );
BEGIN{
  $SERVERROOT = dirname( $Bin );
  warn $SERVERROOT;
  unshift @INC, "$SERVERROOT";
  unshift @INC, "$SERVERROOT/conf";
  eval{ require SiteDefs };
  if ($@){ die "Can't use SiteDefs.pm - $@\n"; }
  warn  @SiteDefs::ENSEMBL_LIB_DIRS;
  map{ unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS;
}

use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::SeqFeature;
use utils::Tool;

my $host;#   = 'ens-livemirror';
my $user;#   = 'ensro';
my $pass;#   = '';
my $port;#   = 3306;
my $dbname;# = 'homo_sapiens_core_42_36d';

my @chromosomes;
my @genetypes;
my $path;
my $gtf_file = undef;
my $include_codons = 1;

my $start = undef;
my $end = undef;
my $seqname = undef;
my $coordsystem = 'toplevel';

my @SPECIES;
my ($dumpdir, $no_log, $logfile, $email, $no_email, $no_compress, $start_with, $end_with);
my ($noncore_db,$core_dbname) = '',''; #can dump from non-core databases if you provide a core db for the dna sequence
$| = 1;

&GetOptions(
  'host:s'        => \$host,
  'user:s'        => \$user,
  'dbname:s'      => \$dbname,
  'pass:s'        => \$pass,
  'port:n'        => \$port,
  'species:s'     => \@SPECIES,
  'path:s'        => \$path,
  'start_with:s'  => \$start_with,
  'end_with:s'    => \$end_with,
  'codons'        => \$include_codons,
  'chromosomes:s' => \@chromosomes,
  'genetypes:s'   => \@genetypes,
  'gtffile:s'     => \$gtf_file,
  'coordsystem:s' => \$coordsystem,
  'no_compress'   => \$no_compress,
  'no_log'        => \$no_log,
  'logfile:s'     => \$logfile,
  'email:s'       => \$email,
  'no_email'      => \$no_email,
  'dumpdir:s'     => \$dumpdir,
  'noncore_db:s'  => \$noncore_db,
  'core_dbname:s' => \$core_dbname,
);

if (scalar(@chromosomes)) {
  @chromosomes = split (/,/, join (',', @chromosomes));
}

if (scalar(@genetypes)) {
  @genetypes = split (/,/, join (',', @genetypes));
}

# Load modules needed for reading config -------------------------------------
require EnsEMBL::Web::SpeciesDefs;      # Loaded at run time
require EnsEMBL::Web::DBSQL::DBConnection;

my $SPECIES_DEFS = EnsEMBL::Web::SpeciesDefs->new();
$SPECIES_DEFS || die "\n\n[DIE] $0: SpeciesDefs config not found";
my $release = $SPECIES_DEFS->ENSEMBL_VERSION;
utils::Tool::info(2, "SiteDefs configured for release $release");


if (@SPECIES) {
  @SPECIES = @{ utils::Tool::check_species(\@SPECIES) };
}
else {
  @SPECIES = @{ utils::Tool::all_species()};
}

if ($start_with) {
    @SPECIES  = @{ utils::Tool::start_with_species($start_with, \@SPECIES) } ;
}
elsif ($end_with) {
  @SPECIES  = @{ utils::Tool::end_with_species($end_with, \@SPECIES) };
}


# Times and log file
my $script_start_time = time();
unless ($no_log) {
  (my $time = gmtime(time)) =~ s/\s+/\./g;
  $logfile ||= "logs/gtf$release"."_$time.log";
  print STDERR "Using logfile $logfile\n";
  open(STDERR, "> $logfile") || die "Can't create file:$!\n";
}


# For each species ----------------------------------------------------------
for my $sp( sort @SPECIES ){  # users selected spp

  # Connect to db
  my $db;
  if ($host, $port, $dbname) {
    $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(
      -host   => $host,
      -user   => $user,
      -port   => $port,
      -pass   => $pass,
      -dbname => $dbname
    );
    if ($noncore_db) {
      unless ($core_dbname) {
        print STDERR "Please specify details of core DB using 'core_dbname='. Exiting\n";
        exit;
      }
      my $core_db  = new Bio::EnsEMBL::DBSQL::DBAdaptor(
        -host   => $host,
        -user   => $user,
        -pass   => $pass,
        -port   => $port,
        -dbname => $core_dbname,
      );
      $db->dnadb($core_db);
    }
  }
  else {
    my $dbConnection = EnsEMBL::Web::DBSQL::DBConnection->new($sp, $SPECIES_DEFS);
    $db = $dbConnection->get_DBAdaptor('core') || ( utils::Tool::warning( 1, "DB $db is not valid for $sp" ));
  }

  # Get assembly and check against ini file -------------
  my $assembly;
  unless ($path) {
    my $cs_adaptor   = $db->get_CoordSystemAdaptor;
    my ($highest_cs) = @{$cs_adaptor->fetch_all()};
    $assembly     = $highest_cs->version();
  }

  utils::Tool::info(2, "$sp: gtf dump $assembly");

  # Get slices ---------------------------------------------
  my $sa  = $db->get_SliceAdaptor();

  my @slices;
  if (scalar(@ARGV) == 0) {
    my @tmpslices = @{$sa->fetch_all('toplevel',undef,1)};

    foreach my $slice (@tmpslices) { 
      if ($slice->coord_system->version eq $assembly || !$slice->coord_system->version) {
	push @slices, $slice; 
      }
    }
  } 
  elsif (scalar(@ARGV) == 1) {
    my $region   = $ARGV[0];

    @slices = ( $sa->fetch_by_name($region) );
  } 
  elsif (scalar(@ARGV) == 3) {
    my $slice_name = "toplevel:$assembly:" . $ARGV[0] . ":" . $ARGV[1] . ":" . $ARGV[2] . ":1";

    @slices = ( $sa->fetch_by_name($slice_name) );
  } 
  else {
    croak "ERROR: Will dump all toplevel unless command line has slicename or chr start end\n";
  }

  # Output file-----------------------------------------------
  # Work out species folder name
  my $sp_release = $SPECIES_DEFS->get_config($sp,"SPECIES_RELEASE_VERSION");
  $sp_release =~ s/\.//g;
  $dumpdir   ||= "/dumps/release-$release";

  my $sp_folder = "$dumpdir/gtf/". lc($sp);
  utils::Tool::check_dir($sp_folder);

  my $outfile = $gtf_file || "$sp_folder/$sp.$assembly.$release.gtf";
  my $gtffp = new FileHandle;
  $gtffp->open(">$outfile") or croak "Unable to open $outfile for write";
  $gtffp->autoflush(1);

  foreach my $slice (@slices) {
    my $genes = $slice->get_all_Genes(undef,undef,1);
    foreach my $gene (@$genes) {
      foreach my $trans (@{$gene->get_all_Transcripts}) {
	write_transcript_gtf($gtffp,$slice,$gene,$trans,$include_codons,$seqname);
      }
    }
  }
  utils::Tool::info(1, "Dumped $sp");

  # Compress (gzip files)
  compress( $outfile ) unless $no_compress;

} # end foreach species


# Work out timings -----------------------------------------------------------
my $time_taken = time - $script_start_time;
my $hours      = localtime($time_taken)->hour -1;
utils::Tool::info (2, "Used $logfile.") if $logfile;
utils::Tool::info (2, " Time taken: $hours:". localtime($time_taken)->min."mins");
close STDERR;

unless ($no_email) {
  $email ||= 'ssg-ensembl@sanger.ac.uk';
  utils::Tool::mail_log( $logfile, $email ) if $logfile;
}
exit;


###########################################################################
sub make_start_codon_features {
  my ($trans,$id) = @_;


  if (!$trans->translation) {
    return (());
  }

  my @translateable = @{$trans->get_all_translateable_Exons};

  my @pepgencoords = $trans->pep2genomic(1,1);

  # cdna can come padded these days so allow gap at the start
  if($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Gap')){
    shift @pepgencoords;
  }

  if(scalar(@pepgencoords) > 3) {
    croak "pep start does not map cleanly\n";
  }

  unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    croak "pep start maps to gap\n";
  }
  unless($pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    croak "pep start (end of) maps to gap\n";
  }

  @translateable = @{$trans->get_all_translateable_Exons};
  my @startc_feat;
  my $phase = 0;
  foreach my $pepgencoord (@pepgencoords) {
    push @startc_feat, new Bio::EnsEMBL::SeqFeature(
                             -seqname => $id,
                             -source_tag => 'starttrans',
                             -primary_tag => 'similarity',
                             -start => $pepgencoord->start,
                             -end   => $pepgencoord->end,
                             -phase => $phase,
                             -strand => $translateable[0]->strand);
    $phase = 3 - ($pepgencoord->end - $pepgencoord->start + 1);
  }
  if ($translateable[0]->strand == 1) {
    @startc_feat = sort {$a->start <=> $b->start } @startc_feat;
  } else {
    @startc_feat = sort {$b->start <=> $a->start } @startc_feat;
  }
  return @startc_feat;

}

sub write_transcript_gtf {
  my ($fh,$slice,$gene,$transcript,$include_codons,$seqname) = @_;
  my $sliceoffset = $slice->start-1;

  my @startcs =  make_start_codon_features($transcript,$transcript->stable_id);
  my @endcs   =  make_stop_codon_features($transcript,$transcript->stable_id);


  my $chrname;
  $chrname = $slice->seq_region_name;

  my $idstr;

  if (defined($seqname)) {
    $idstr = $seqname;
  } else {
    $idstr = $chrname;
  }

  my ($hasstart,$hasend) = check_start_and_stop($slice,$transcript);

  if (!$include_codons) {
    $hasstart = $hasend = 0;
  }

  my @translateable_exons = @{$transcript->get_all_translateable_Exons} if $transcript->translation;

  my $count=1;
  my $intrans = 0;
  my $instop = 0;


  my $biotype_display;
  {
    no warnings 'uninitialized';
    $biotype_display = $noncore_db eq 'vega' ? $gene->status.'_'.$gene->biotype : $gene->biotype;
  }

  foreach my $exon (@{$transcript->get_all_Exons}) {
    my $strand = $exon->strand;

    if ($exon->strand == -1) {
        $strand = "-";
    } elsif ($exon->strand == 1) {
        $strand = "+";
    } elsif ($exon->strand == 0) {
        $strand = ".";
    }

    if ($transcript->translation && $exon == $transcript->translation->start_Exon) {
      $intrans = 1;
    }

    print $fh $idstr . "\t" . 
              $biotype_display . "\t" . 
              'exon' . "\t" . 
              ($exon->start+$sliceoffset) . "\t". 
              ($exon->end+$sliceoffset) . "\t". 
              "." . "\t". 
              $strand . "\t". 
              "." . "\t";
    print_attribs($fh,$gene,$transcript,$count,'exon');
    print $fh "\n";

    if ($intrans) {
      my $cdsexon = shift @translateable_exons;
      my $phase = $cdsexon->phase;
      if ($cdsexon->phase == 1) {
        $phase = 2;
      } elsif ($cdsexon->phase == 2) {
        $phase = 1;
      } elsif ($cdsexon->phase == -1) {
        $phase = 0;
      }

      my $exon_start = $cdsexon->start;
      my $exon_end   = $cdsexon->end;
      if ($transcript->translation && 
          $hasend && 
          ($exon->end >= $endcs[0]->start && $exon->start <= $endcs[0]->end)) {

        if ($cdsexon->strand == 1) {
          $exon_end = $cdsexon->end - $endcs[0]->length;
        } else {
          $exon_start = $cdsexon->start + $endcs[0]->length;
        }
      }

      if ($exon_start <= $cdsexon->end &&
          $exon_end >= $cdsexon->start &&
          !$instop) {
        print $fh $idstr . "\t" . 
                  $biotype_display . "\t" . 
                  'CDS' . "\t" . 
                  ($exon_start+$sliceoffset) . "\t". 
                  ($exon_end+$sliceoffset) . "\t". 
                  "." . "\t". 
                  $strand . "\t". 
                  $phase . "\t";
        print_attribs($fh,$gene,$transcript,$count,'CDS');
        print $fh "\n";
      }
    }
    if ($transcript->translation && 
        $exon == $transcript->translation->start_Exon && $hasstart) {
      my $tmpcnt = $count;
      foreach my $startc (@startcs) {
        print $fh $idstr . "\t" . 
                  $biotype_display . "\t" . 
                  'start_codon' . "\t" . 
                  ($startc->start+$sliceoffset) . "\t". 
                  ($startc->end+$sliceoffset) . "\t". 
                  "." . "\t". 
                  $strand . "\t". 
                  $startc->phase . "\t";
        print_attribs($fh,$gene,$transcript,$tmpcnt++,'start_codon');
        print $fh "\n";
      }
    }
    if ($transcript->translation && 
        ($exon == $transcript->translation->end_Exon)) {
      if ($hasend) {
        my $tmpcnt = $count - $#endcs;
        foreach my $endc (@endcs) {
          print $fh $idstr . "\t" . 
                    $biotype_display . "\t" . 
                    'stop_codon' . "\t" . 
                    ($endc->start+$sliceoffset) . "\t". 
                    ($endc->end+$sliceoffset) . "\t". 
                    "." . "\t". 
                    $strand . "\t". 
                    $endc->phase . "\t";
          print_attribs($fh,$gene,$transcript,$tmpcnt++,'stop_codon');
          print $fh "\n";
        }
      }
      $intrans = 0;
    }

    if (scalar(@endcs) && 
        ($exon->end >= $endcs[0]->start && $exon->start <= $endcs[0]->end)) {
      $instop = 1;
    }

    $count++;
  }
}

sub make_stop_codon_features {
  my ($trans,$id) = @_;

  if (!$trans->translation) {
    return (());
  }
  my @translateable = @{$trans->get_all_translateable_Exons};

  my $cdna_endpos = $trans->cdna_coding_end;

  my @pepgencoords = $trans->cdna2genomic($cdna_endpos-2,$cdna_endpos);

  if(scalar(@pepgencoords) > 3) {
    croak "pep end does not map cleanly\n";
  }

  unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    croak "pep end maps to gap\n";
  }
  unless($pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
    croak "pep end (end of) maps to gap\n";
  }

  my @stopc_feat;
  my $phase = 0;
  foreach my $pepgencoord (@pepgencoords) {
    push @stopc_feat, new Bio::EnsEMBL::SeqFeature(
                             -seqname => $id,
                             -source_tag => 'endtrans',
                             -primary_tag => 'similarity',
                             -start => $pepgencoord->start,
                             -end   => $pepgencoord->end,
                             -phase => $phase,
                             -strand => $translateable[0]->strand);
    $phase = 3 - ($pepgencoord->end-$pepgencoord->start+1);
  }

  if ($translateable[0]->strand == 1) {
    @stopc_feat = sort {$a->start <=> $b->start } @stopc_feat;
  } else {
    @stopc_feat = sort {$b->start <=> $a->start } @stopc_feat;
  }
  return @stopc_feat;
}

sub print_attribs {
  my ($fh,$gene,$transcript,$count,$type) = @_;


  my $gene_name;
  $gene_name = $gene->external_name;
  $gene_name =~ s/^[A-Z]{1,3}:// if ($noncore_db eq 'vega');

  my $trans_name;
  $trans_name = $transcript->external_name;
  $trans_name =~ s/^[A-Z]{1,3}:// if ($noncore_db eq 'vega');

  print $fh " gene_id \"" .  get_gene_id($gene) . "\";" .
            " transcript_id \"" . get_transcript_id($transcript) . "\";";
  print $fh " exon_number \"$count\";";
  print $fh " gene_name \"" . $gene_name . "\";" if ($gene_name);
  print $fh " transcript_name \"" . $trans_name . "\";" if ($trans_name);
  if ($type eq 'CDS') {
    print $fh ' protein_id "' . get_translation_id($transcript->translation) . '";';
  }
}


sub get_gene_id {
  my $gene = shift;

  if (defined($gene->stable_id)) {
    return $gene->stable_id;
  }
  return $gene->dbID;
}

sub get_transcript_id {
  my $transcript = shift;

  if (defined($transcript->stable_id)) {
    return $transcript->stable_id;
  }
  return $transcript->dbID;
}
sub get_translation_id {
  my $translation = shift;

  if (defined($translation->stable_id)) {
    return $translation->stable_id;
  }
  return $translation->dbID;
}

sub check_start_and_stop {
  my ($slice,$trans) = @_;

  return (0,0) if (!defined($trans->translation));

  my $tln = $trans->translation;

  my $coding_start = $trans->cdna_coding_start;
  my $coding_end   = $trans->cdna_coding_end;
  my $cdna_seq     = uc($trans->spliced_seq);

  my $startseq     = substr($cdna_seq,$coding_start-1,3);
  my $endseq       = substr($cdna_seq,$coding_end-3,3);

  my $has_start = 1;
  my $has_end = 1;

  $has_start = 0  if ($startseq ne "ATG");
  $has_end = 0 if ($endseq ne "TAG" && $endseq ne "TGA" && $endseq ne "TAA");

  return ($has_start, $has_end);
}


#----------------------------------------------------------------------
sub compress {
  my $file = shift;
  return  unless (-e "$file");

  utils::Tool::info( 1, "Gzipping files");
  #my $size = -s $file;
  if (my $size = -s $file > 3500000000 ){
    #split_data($file);
    utils::Tool::warning(1, "Large file generated $file $size");
  }
  else {
    system("gzip -9 $file") ==0 or utils::Tool::warning(1, "Can't gzip file $! $file");
  }
  return 1;
}


