#!/usr/local/bin/perl

use strict;
use warnings;

package do_fasta_dump_test;

### Script used to dump cDNA, peptide, RNA and DNA (masked and non-masked sequences)
### used on the FTP site, for blast databases and the blat servers
### See additional options/documentation at end of script

use Carp;
use FindBin qw($Bin);
use File::Basename qw( dirname );
use Time::localtime;
use Time::HiRes qw(time);
use Getopt::Long;
use Pod::Usage;
use Data::Dumper qw( Dumper );

# Load libraries needed for reading config -----------------------------------
use vars qw( $SERVERROOT );
BEGIN{
  $SERVERROOT = dirname( $Bin );
  unshift @INC, "$SERVERROOT/conf";
  unshift @INC, "$SERVERROOT";
  eval{ require SiteDefs };
  if ($@){ die "Can't use SiteDefs.pm - $@\n"; }
  map{ unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS;
}

use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DBLoader;
use Bio::SeqIO;
#use EnsEMBL::Web::Data::NewsItem;
use utils::Tool;

### This block defines our configuration variables
###
our $dump_machine = "ensdb-1-11";                     # database host used for dumping
our $basedir      = "/dumps/";                        # root of the on-disk dump tree
our $blastdir     = "/data_ensembl/blastdb/ensembl";  # remote blast db location
our $blatdir      = "/data/blat";                     # blat server data location
our $xdformat_command = '/dumps/shared/xdformat';     # blast index builder
our $ssaha_command  = '/dumps/shared/ssaha2Build';    # ssaha index builder
our $blat_command = '/nfs/WWW/bin/i686/faToTwoBit';   # blat 2bit converter
#our $key_file     = '/nfs/WWW/.ssh/blastsrv';
our $no_data;        # --no_data: skip writing sequence data
our $no_dumpdata;    # --no_dumpdata: skip the whole dump step
# Per-step skip flags (set from the corresponding --no_* options below)
my ($no_compress, $no_log, $no_indexes, $no_ssaha, $no_remotedirs, $no_blat);
our @TYPES;          # raw --type values as supplied by the user
our $release;        # Ensembl release number (obligatory)
my (@SPECIES, @DATABASES);
our $DUMPDIR;           # output directory (defaults to $basedir/release-N/)
our $PREVIOUS_DUMPDIR;  # previous release's dump dir, for DNA reuse
# Cache of previous-release DNA files/assembly/release, keyed by species
my %last_release_species_data;
my ($logfile, $email);
my $minimise_memory;    # trade speed for lower memory usage
my ($start_with, $end_with);   # restrict the alphabetic species range
my ($help, $info);
my $dump_dna;           # set when any requested type implies a DNA dump
# Parse command-line options.  --type and --release are obligatory; see the
# POD at the end of the script for full documentation.
# (Plain GetOptions(...) call - the old &GetOptions form used the legacy
# '&' sigil call syntax, which is discouraged.)
GetOptions(
  'type:s'          => \@TYPES,        # obligatory
  'release:s'       => \$release,

  'species:s'       => \@SPECIES,      # optional
  'database:s'      => \@DATABASES,
  'dumpdir:s'       => \$DUMPDIR,
  'past_release:s'  => \$PREVIOUS_DUMPDIR,
  'logfile:s'       => \$logfile,
  'email:s'         => \$email,
  'minimise_memory' => \$minimise_memory,

  'no_compress'     => \$no_compress,
  'no_log'          => \$no_log,
  'no_remotedirs'   => \$no_remotedirs,
  'no_ssaha'        => \$no_ssaha,
  'no_data'         => \$no_data,
  'no_indexes'      => \$no_indexes,
  'no_blat'         => \$no_blat,
  'no_dumpdata'     => \$no_dumpdata,  # optional
  'start_with:s'    => \$start_with,   # optional
  'end_with:s'      => \$end_with,     # optional
  'help'            => \$help,         # info
  'info'            => \$info,
) || pod2usage(2);

pod2usage(-verbose => 2) if $info;
pod2usage(-verbose => 1) if $help;

# Validate the requested dump types and expand compound types (dies on error)
my $types = check_types(\@TYPES);
die "\n\n[DIE] You must provide an ensembl release number e.g. --release 30" unless $release;

# Load modules needed for reading config -------------------------------------
require EnsEMBL::Web::SpeciesDefs;    # Loaded at run time
require EnsEMBL::Web::DBSQL::DBConnection;

my $SPECIES_DEFS = EnsEMBL::Web::SpeciesDefs->new();
$SPECIES_DEFS || die "\n\n[DIE] $0: SpeciesDefs config not found";

# Check species if user defined.  Else use all species
if( @SPECIES ) {
  @SPECIES = @{ utils::Tool::check_species(\@SPECIES) };
} else {
  @SPECIES = @{ utils::Tool::all_species()};
}

$email ||= 'ssg-ensembl@sanger.ac.uk';

# Check the ENSEMBL_VERSION is up to date and matches user's request.
# NB: assign the version first, THEN compare.  The previous one-liner
#   if( my $x = $SiteDefs::ENSEMBL_VERSION ne $release )
# stored the boolean result of `ne` in $x, so the die message reported
# "1" instead of the configured version.
my $sitedefs_release = $SiteDefs::ENSEMBL_VERSION;
if( $sitedefs_release ne $release ) {
  die "[*DIE] Ensembl release version requested is $release but site defs is configured to use $sitedefs_release";
}

my $blast_dir = $SPECIES_DEFS->get_config($SPECIES[0],"ENSEMBL_BLAST_DATA_PATH");

# The configured blast path should mention this release; fatal for full
# ('all') dumps, otherwise warn and carry on.
if( $blast_dir !~ /$release/ ) {
  if( $TYPES[0] eq 'all') {
    die "[*DIE] Ensembl release version requested is $release but blast dir is configured to be $blast_dir";
  }
  utils::Tool::warning (1, "WILL PUSH BLAST FILES TO $blast_dir.  Kill job if not correct.");
}

# Times and log file
our $script_start_time = time();
unless( $no_log) {
  # Timestamp with dots instead of whitespace for a filesystem-safe name
  (my $time = gmtime(time)) =~ s/\s+/\./g;
  $logfile ||= "logs/fasta$release"."_$time.log";
  print STDERR "Using logfile $logfile\n";
  # Redirect STDERR to the log file.  Three-arg open replaces the old
  # two-arg form, which would misparse a filename containing mode chars.
  open(STDERR, '>', $logfile) || die "Can't create file:$!\n";
}

info(2, "Will push blast files to $blast_dir") unless $no_indexes;

# Start with a species further down in the alphabet
@SPECIES  = @{ utils::Tool::start_with_species($start_with, \@SPECIES) } if $start_with;
@SPECIES  = @{ utils::Tool::end_with_species($end_with, \@SPECIES) }     if $end_with;


# Validate DUMPDIR
if ( $TYPES[0] eq 'pre' ){ 
  $basedir .= '/PRE/';
  utils::Tool::check_dir($basedir);
  $blastdir = "/data_ensembl/blastdb/ensembl/PRE";
}
if ($TYPES[0] eq 'vega'){
  $basedir  = "/dumps/vega_dumps/";
  utils::Tool::check_dir($basedir);
  $blastdir = "data_ensembl/blastdb/vega";
  $blatdir = "/ensemblweb/blat/vega";
} 

$DUMPDIR   ||= $basedir."release-$release/";
utils::Tool::check_dir($DUMPDIR);


my %db_names = qw(
  DATABASE_CORE      core
  ENSEMBL_ESTGENE estgene
  DATABASE_VEGA    vega
);

my @blast_servers = @{$SPECIES_DEFS->get_config($SPECIES[0],"ENSEMBL_BLAST_SERVERS")};

unless( $no_remotedirs ) {
  # Create the per-release directory on each remote blast server
  unless ($blastdir =~ /vega|PRE/) {
    $blastdir .= '/release-'.$release;
  }

  foreach my $blast_machine (@blast_servers ){  #warn $blast_machine;
    next if $no_indexes;
    info (1, "Creating dir for blast on $blast_machine");
    # system() does not die on failure, so the previous eval{}/$@ check
    # could never trigger; inspect the return value instead.  List-form
    # system also avoids invoking a local shell.
    my $rc = system('ssh', $blast_machine, 'mkdir', '-p', $blastdir);
    if ($rc != 0) {
      utils::Tool::warning(1, "Could not create $blastdir on $blast_machine (exit ".($rc >> 8).")");
    }
  }
}



# Do we need to dump DNA data?
# Each alternative is anchored: the old unanchored /^dna|pre|all|.../ also
# matched e.g. 'cdna_all' (via the bare 'all'), forcing a needless DNA dump.
foreach my $t ( @TYPES) {
  next if $t =~ /not_dna/;
  if ($t =~ /^(?:dna|pre|all|blast|vega)/) { $dump_dna = 1; }
}

# Work out what DNA data we have available from the previous release
if ($PREVIOUS_DUMPDIR && $dump_dna) {
  # NOTE: keys are lower-cased species names; the old loop lower-cased
  # @SPECIES in place via foreach aliasing, which we avoid here.
  my %species_to_dump = map { lc($_) => [] } @SPECIES;

  opendir(my $dump_dh, $PREVIOUS_DUMPDIR) or die $!;
  while (my $file = readdir($dump_dh)) {
    next if $file =~ /^\./;
    chomp $file;
    next unless exists $species_to_dump{$file};

    opendir(my $species_dh, "$PREVIOUS_DUMPDIR/$file") or die $!;
    while (my $species_file = readdir($species_dh)) {
      next unless $species_file eq 'dna';
      my $dna_dir = $PREVIOUS_DUMPDIR.'/'.$file.'/'.$species_file;
      opendir(my $dna_dh, $dna_dir) or die $!;
      $last_release_species_data{$file}{'data'}{'old_species_folder'} = $dna_dir;
      while (my $species_dna_file = readdir($dna_dh)) {
        next if $species_dna_file =~ /^\./;
        $last_release_species_data{$file}{'data'}{'old_dna_files'}{$species_dna_file} = 1;
        # Parse "<assembly>.<release>" out of a toplevel file name, e.g.
        # Homo_sapiens.GRCh37.60.dna.toplevel.fa.gz -> GRCh37 / 60
        if ($species_dna_file =~ /dna\.toplevel/) {
          $species_dna_file =~ s/\.dna\.toplevel\.fa\.gz//;
          my $species_name = ucfirst($file);
          $species_dna_file =~ s/$species_name//;
          my @temp = split /\./, $species_dna_file;
          my $old_release  = pop @temp;
          my $old_assembly = join '.', @temp;
          $old_assembly =~ s/^\.//;
          $last_release_species_data{$file}{'data'}{'old_assembly'} = $old_assembly;
          $last_release_species_data{$file}{'data'}{'old_release'}  = $old_release;
        }
      }
      closedir($dna_dh);
    }
    closedir($species_dh);
  }
  closedir($dump_dh);
}


## Work out what files we need to dump:
foreach my $sp (sort @SPECIES) {
  my $species_time = time ();

  # Work out species folder name and empty if you are dumping --type all
  my $sp_release = $SPECIES_DEFS->get_config(ucfirst($sp),"SPECIES_RELEASE_VERSION");
  $sp_release =~ s/\.//g;
  my $sp_folder = "$DUMPDIR"."fasta/". lc($sp);

  if ( -e $sp_folder && $TYPES[0] eq 'all' && !$no_dumpdata) {
    info( 1, "Removing existing copies of $sp_folder" );
    # system() returns 0 on success, hence && die on a non-zero status
    system("rm -rf $sp_folder") && die "Couldn't delete $sp_folder";
  }

  my $dbConnection = EnsEMBL::Web::DBSQL::DBConnection->new($sp, $SPECIES_DEFS);

  # NB: the old `@DATABASES || "DATABASE_CORE"` evaluated @DATABASES in
  # scalar context, so a user-supplied --database list degenerated to its
  # element count.  Use an explicit ternary to iterate the actual names.
  my @dbs = @DATABASES ? @DATABASES : ("DATABASE_CORE");
  foreach my $db ( @dbs ){
    my $db_adaptor = $dbConnection->get_DBAdaptor( $db_names{$db} ) ||
      ( utils::Tool::warning( 1, "DB $db is not valid for $sp" ) && next );
    info( 2, "Dumping @TYPES from \n$db $sp - $DUMPDIR" );

    # Create directories and filehandles for dumps
    my $created_files =
      create_dirs_for_dumps( $sp, $SPECIES_DEFS, $sp_folder, $types,  $db_adaptor);

    # Do dumps
    $created_files = get_data($db_adaptor, $sp, $created_files, $sp_folder) unless $no_dumpdata;

    # Make blast indexes and copy to blastsrv machine $BLASTSRV
    $created_files = blast_indexes( $created_files, $sp ) unless $no_indexes;

    # Make blat indexes
    $created_files = blat_indexes( $created_files, $sp, $sp_folder, $db_adaptor) unless $no_blat;

    # Compress (gzip files)
    compress( $created_files ) unless $no_compress;

    # Empty vars
    $created_files = undef;
  }
  my $sp_time_taken = time - $species_time;
  my $hours         = localtime(time -$species_time)->hour - 1;
  info(2, "Species time $sp $hours hours". localtime($sp_time_taken)->min."mins");
}
   
# Work out timings -----------------------------------------------------------
# NOTE(review): localtime() is (ab)used here to format a duration in seconds;
# the "-1" presumably corrects the epoch's hour offset - verify in local TZ.
my $time_taken = time - $script_start_time;
my $hours      = localtime($time_taken)->hour -1;
info (2, "Used $logfile.") if $logfile;
info (2, " Time taken: $hours:". localtime($time_taken)->min."mins");
close STDERR;

# Mail the log to the nominated address (defaults to ssg-ensembl)
utils::Tool::mail_log( $logfile, $email ) if $logfile;
exit;

######################### END OF PROGRAM ######################################

sub need_dna {

  ### Decides whether the DNA fasta files for a species must be re-dumped
  ### for this release or can be copied from the previous release's dump.
  ### Arg1 : species name (ucfirst form, e.g. 'Homo_sapiens')
  ### Arg2 : current assembly version string
  ### Returns a (flag, reason) pair: flag is 1 when a fresh dump is needed.

  my $sp = shift; 
  my $current_assembly = shift;
  my $needed = 0;
  my $reason;

  # NOTE(review): the =cut...=cut pair below abuses POD to comment out
  # code; perl skips it (with a warning), but =pod/=cut would be cleaner.
=cut
  ## Code for using the production db when it actullay stores what we need!
  my $hub = new EnsEMBL::Web::Hub;
  my $production_db_adaptor = EnsEMBL::Web::DBSQL::ProductionAdaptor->new($hub);
  my $sp_id = $production_db_adaptor->fetch_species_id(lc($sp) );
  my @changes = @{$production_db_adaptor->fetch_changelog({ release => $release, species => $sp_id}) };
=cut

  # Direct connection to the website database holding release news items
  my $web_db_connection = Bio::EnsEMBL::DBSQL::DBConnection->new(
    -host => 'ensdb-1-13',
    -port => 5307,
    -user => 'ensro',
    -dbname =>  'ensembl_website'
  );

  # News items for this release/species; n.data holds a stringified perl
  # hash like "{ 'assembly' => 'Y', 'repeat_masking' => 'N' }"
  my $sth = $web_db_connection->prepare(
    "SELECT n.data
      FROM news_item n, species s, item_species i
     WHERE n.release_id = ? and n.news_item_id = i.news_item_id and 
           i.species_id=s.species_id and s.name = ?"
  );

  ## Work out if we need to dump DNA for this species
  ## Currently we only dump DNA if we have a new species, a new assembly
  ## or if repeat masking has been re-run.

  # Is this a new species?
  if ( exists $last_release_species_data{lc($sp)}{'data'}{'old_dna_files'} ){ 
    # Check the info provided in DOI to see if there has been an assembly change
    # or re-run of repeat masking for this species

    my $species_name = ucfirst($sp);     
    $sth->execute($release,$species_name);
    # NOTE(review): fetchrow returns a single row here, so only the first
    # matching news item is examined - confirm that is intended.
    foreach my $row ( $sth->fetchrow){
      my $data = $row;  
      # Strip braces and whitespace from the stringified hash before parsing
      $data =~s/\{|\}|\s|//g;
      my @split_pairs = split (/\,/, $data);
      foreach my $pair (@split_pairs) { 
        # Each pair looks like 'key'=>'value'; strip the quotes
        my ($key, $value) = split(/=>/, $pair);
        $key =~s/'//g;
        $value =~s/'//g;
          
        if ($key eq 'repeat_masking' && $value eq 'Y'){ 
          $needed = 1;
          $reason = 'Repeat masking has been re-run';
        } elsif( $key eq 'assembly' && $value eq 'Y' ) {
          $needed = 1;
          $reason = 'A new assembly has been declared';
        } 
      } 
    }
    $sth->finish;
#    $web_db_connection->db_handle->disconnect_when_inactive(1);
    #Check that old assembly name matches current assembly
    my $old_assembly = $last_release_species_data{lc($sp)}{'data'}{'old_assembly'};
    info(1, "$current_assembly");
    info(1, "$old_assembly");
    unless ( $current_assembly eq $old_assembly){
      $needed = 1;
      $reason = 'The current assembly name does not match the previous assembly name';
    }

  } else { # We do not have any data for this species so we have to re-dump it!
      $needed = 1;
      $reason = 'We do not have a previous version of the DNA fasta files';
  } 
  return ($needed, $reason);
}

#-----------------------------------------------------------------------------

sub check_types {

  ### Description: Checks the types of dumps you want to do
  ### There is a list of recognised types (%valid_types)
  ### but these can be grouped into 'compound types' (%compound_types)
  ### This subroutine validates the types of dumps requested by user
  ### and expands the compound_types into valid types if necessary
  ### Arg1 : arrayref of user-requested type strings
  ### Returns hashref

  my $types = shift;

  # Atomic dump types (the original list repeated cdna_abinitio and
  # pep_abinitio; duplicates are harmless in a hash but removed here)
  my %valid_types = map { $_ => 1 } qw(
    pre
    blast
    ncrna
    cdna_all     cdna_known  cdna_novel  cdna_pseudo  cdna_abinitio
    pep_all      pep_known   pep_novel   pep_abinitio
    dna_toplevel dna_toplevel_masked
    vega
  );

  # Shorthand names that expand to one or more valid (or compound) types
  my %compound_types = (
    dna     => [qw( dna_toplevel dna_toplevel_masked )],
    cdna    => [qw( cdna_all cdna_abinitio )],
    pep     => [qw( pep_all pep_abinitio )],
    rna     => [qw( ncrna )],
    blast   => [qw( rna cdna pep dna_toplevel dna_toplevel_masked )],
    pre     => [qw( rna cdna pep dna_toplevel dna_toplevel_masked )],
    all     => [qw( cdna pep dna rna )],
    not_dna => [qw( cdna pep rna )],
    vega    => [qw( cdna pep dna rna )],
  );

  return utils::Tool::validate_types(\%valid_types, \%compound_types, $types);
}

#----------------------------------------------------------------------
sub info{

  ### Logs a timestamped progress message to STDERR.
  ### Arg1 (optional): verbosity level; messages with level > 1 are tagged
  ### [INFO_2], everything else [INFO].
  ### Arg2 : message text.  A single argument is treated as the message.
  ### Returns 1 on success.

  my ($level, $message) = @_;
  unless ( defined $message ) {
    # Called with a single argument: treat it as the message at level 0
    $message = $level;
    $level   = 0;
  }
  unless ( $message ) {
    carp("Need a warning message");
    return;
  }

  my $tag = $level > 1 ? 'INFO_2' : 'INFO';
  warn( sprintf "[%s] %s [%0.3fs]\n", $tag, $message, time() - $script_start_time );
  return 1;
}

#----------------------------------------------------------------------

sub create_dirs_for_dumps {

  ### Aim: Creates Bio::SeqIO objects for filehandles used for the dumping
  ### Arg1 : species name
  ### Arg2 : EnsEMBL::Web::SpeciesDefs object
  ### Arg3 : path of the species dump folder
  ### Arg4 : arrayref of validated dump types
  ### Arg5 : Bio::EnsEMBL DBAdaptor for this species/database
  ### Returns hashref for filehandles where key is the type of dump
  ### and value is the Bio::SeqIO

  my ($sp, $SPECIES_DEFS, $sp_folder, $types, $db_adaptor) = @_;

  # Get assembly and check against ini file -------------
  my $cs_adaptor   = $db_adaptor->get_CoordSystemAdaptor;
  my ($highest_cs) = @{$cs_adaptor->fetch_all()};   # first = highest ranked
  my $assembly   = $highest_cs->version();
  $sp = ucfirst($sp);
  my $file_details = "$sp.$assembly.$release";

  # Create dirs ------------------------------------------
  my $filehandles;
  my $dna = 0;

  foreach my $type ( @$types) {
    my( $master_type ) = split( '_', $type );
    my $thisdir = $sp_folder."/$master_type";
    if ($master_type eq "dna"){
      # need_dna() returns a (flag, reason) pair.  The old scalar-context
      # call (my $needed = need_dna(...)) collapsed the list, so $needed
      # actually received the reason string; capture both properly.
      my ($needed, $reason) = need_dna($sp, $assembly);
      if ($needed) {
        info(2, "DNA needs to be dumped : $reason");
        $dna = 1;
      } else {
        info(2, "We do not need to redump DNA files for this species, these will be copied from the previous release");
        copy_dna_files($sp, $thisdir, $release);
      }
    }

    # Skip creating dna filehandles when the files are copied from last release
    next if ($master_type eq "dna"  && $dna == 0);

    unless ( -e $thisdir ) {
      utils::Tool::check_dir( $thisdir );
      # Three-arg open with a lexical handle (was a bareword README handle)
      open(my $readme_fh, '>', "$thisdir/README")
        or die "Couldn't open file $thisdir/README: $!\n";
      print {$readme_fh} readme($master_type);
      close $readme_fh;
    }

    # Work out types of directories needed ------------------------
    my ($seqtype, $idtype);
    if ($type =~ /toplevel/) {
      $seqtype = "dna";
      $idtype  = "nonchromosomal";
      # chromosome directories are added later on per species basis
    } else {
      ($seqtype, $idtype) = split (/_/, $type);
    }
    if ($type =~ /masked/) {
      $seqtype .= "_rm";
    }

    # file name like dna.contig.fa cdna.abinitio.fa
    my $tmp_file = join (".", "$thisdir/$file_details", $seqtype, $idtype || (), "fa");
    $filehandles->{$type} = Bio::SeqIO->new(
      '-format' => 'Fasta',
      '-file'   => '>'.$tmp_file
    );
    if ( $type =~/toplevel/){
      # Toplevel dumps get a second *.toplevel.fa handle alongside the
      # nonchromosomal one (per-chromosome files are created during dumping)
      my $toplevel_file = join (".", "$thisdir/$file_details", $seqtype, $idtype || (), "fa");
      $toplevel_file =~s/nonchromosomal/toplevel/;
      $filehandles->{$type .'_top'} = Bio::SeqIO->new(
        '-format' => 'Fasta',
        '-file'   => '>'.$toplevel_file
      );
    }
  }

  return $filehandles;
}

#------------------------------------------------------------------------------
sub copy_dna_files {
  ### If there have been no changes to the DNA since the previous release
  ### and we have access to the old files, we can copy these and rename
  ### them rather than re-dumping them.
  ### Arg1 : species name (any case; looked up lower-cased)
  ### Arg2 : destination dna folder for this release
  ### Arg3 : release number used to rename the copied files
  ### Returns 1

  my $species = shift;
  my $folder  = shift;
  my $release = shift;

  # Create the destination directory if it does not yet exist
  utils::Tool::check_dir( $folder ) unless -e $folder;

  # Details collected earlier from the previous release's dump tree
  my $sp_data     = $last_release_species_data{lc($species)}{'data'};
  my $old_files   = $sp_data->{'old_dna_files'};
  my $old_folder  = $sp_data->{'old_species_folder'};
  my $old_release = $sp_data->{'old_release'};

  # Copy each previous-release file, substituting the release number in
  # its name (e.g. *.59.dna.* -> *.60.dna.*)
  foreach my $file (keys %$old_files ) {
    (my $file_name = $file) =~ s/$old_release/$release/;
    my $old_file = $old_folder ."/". $file;
    my $new_file = $folder ."/". $file_name;
    info (1, "copying \n $old_file to \n $new_file");
    # List-form system avoids the shell; warn (rather than silently
    # continue) when the copy fails
    if ( system('scp', $old_file, $new_file) != 0 ) {
      utils::Tool::warning(1, "Failed to copy $old_file to $new_file (exit ".($? >> 8).")");
    }
  }

  return 1;
}


#------------------------------------------------------------------------------
sub get_data {

  ### Loops through, creating a slice for each toplevel
  ### Starts with the shortest toplevel (uses less memory for longer)
  ### Passes the appropriate subroutine the correct filehandle and slice
  ### Arg1 : Bio::EnsEMBL DBAdaptor
  ### Arg2 : species name
  ### Arg3 : hashref of Bio::SeqIO filehandles keyed by dump type
  ### Arg4 : species dump folder path
  ### Returns hashref for filehandles where key is the type of dump
  ### and value is the Bio::SeqIO

  my $dbAdaptor  = shift || die( 'Need a DBAdaptor' );
  my $species    = shift || die( 'Need a species' );
  my $fhs      = shift || die( 'Need a hashref of types to dump' );
  my $sp_folder = shift;
  my $load_exons   = 1;
  my $cs_adaptor   = $dbAdaptor->get_CoordSystemAdaptor;
  # First coord system returned is the highest-ranked one (the assembly)
  my ($highest_cs) = @{$cs_adaptor->fetch_all()};
  my $assembly   = $highest_cs->version();
  # Filename prefix for per-chromosome DNA files,
  # e.g. <folder>/dna/Homo_sapiens.GRCh37.60.dna
  my $prefix = $sp_folder.'/dna/'. ucfirst($species) .'.'. $assembly .'.'. $release. '.dna';

  # Dump even non-reference region
  my $sliceAdaptor   = $dbAdaptor->get_SliceAdaptor;
  my $gene_adaptor   = $dbAdaptor->get_GeneAdaptor;
  my $meta_container = $dbAdaptor->get_MetaContainer();

  # Only use repeat mask features listed in the meta table
  # Could optimise so only gets the start and end of the repeat feature rathan than whole feature
  # NOTE(review): the meta key below contains a trailing space
  # ('repeat.analysis ') - verify this matches the key stored in the meta table
  my @analyses = @{$meta_container->list_value_by_key('repeat.analysis ')};

  if(!defined($minimise_memory)){
    # Pre-cache all toplevel seq mappings: faster but memory hungry
    $sliceAdaptor->cache_toplevel_seq_mappings();
  } else{
    info( 1, "Using minimise memory option. Will use less memory but will be slower" );
    # NOTE(review): the =cut...=cut pair below abuses POD to comment out an
    # older per-dbID dumping loop; perl skips it, but =pod/=cut is cleaner.
=cut    my @gene_ids = @{$gene_adaptor->list_dbIDs()};
    info( 1, "dumping the data for ".scalar(@gene_ids)." genes" );
    foreach my $gene_id ( @gene_ids ){
      my $gene = $gene_adaptor->fetch_by_dbID($gene_id);
      dump_data_for_gene($gene, $fhs);
    }
=cut
    # Only fetch genes when a non-DNA, non-abinitio dump type is requested.
    # NOTE(review): /^dna|abinitio|\/dna\// is unanchored for the last two
    # alternatives by design - it excludes dna* types, any *abinitio* type
    # and the per-chromosome '/dna/' file keys added below.
    if(scalar( grep{$_ !~ /^dna|abinitio|\/dna\// } keys %$fhs ))  {
      my $genes = $gene_adaptor->fetch_all;

      info( 1, "dumping the data for ".scalar(@$genes)." genes" );
      foreach my $gene ( @$genes ){    
        my $check = dump_data_for_gene($gene, $fhs);
        # Drop the adaptor cache after each gene to keep memory flat
        $gene_adaptor->clear_cache;
      }
    } 
  }

 # Sort these by incr slice length for more efficient memory usage
  foreach my $slice( sort {$a->seq_region_length <=> $b->seq_region_length} @{$sliceAdaptor->fetch_all('toplevel', undef, 1, undef, undef,)} ){
    info( 1, "Start toplevel ". $slice->name. " length:". $slice->seq_region_length );
    my $coord_system = $slice->coord_system->name;

    # Toplevel DNA dumps ------------------------------------------------------
    if ($fhs->{'dna_toplevel_masked'} or $fhs->{'dna_toplevel'}) {

      my $seq_name = $slice->seq_region_name;
      #info (1, "M1: $seq_name");
      # If not chr based toplevel, use predefined filehandle
      # (the alternation below matches names of random/unplaced/scaffold
      # style sequence regions that should not get per-chromosome files)
      if ( ($coord_system !~ /^chromosome$/i) or  ( $seq_name =~/random
                           |^Un\d{4}$
                           |^Un\.\d{3}\.\d*$
                           |E\d\d\w*$
                           |_NT_
                           |scaffold_
                           |cutchr
                           |unplaced  
                           /x)  ) {

        if ( my $seqio = $fhs->{'dna_toplevel'} ){
          # Write to both the nonchromosomal file and the combined toplevel file
          dump_dna( $seqio, $slice, 'dna',);
          my $seqio_top = $fhs->{"dna_toplevel_top"};
          dump_dna( $seqio_top, $slice, 'dna',);
        }

        if ( my $seqio_masked = $fhs->{'dna_toplevel_masked'} ){
          # Repeat-masked copies of the same sequence
          dump_dna( $seqio_masked, $slice->get_repeatmasked_seq(\@analyses), 'dna_rm' );
          my $seqio_top_rm = $fhs->{"dna_toplevel_masked_top"};
          dump_dna( $seqio_top_rm, $slice->get_repeatmasked_seq(\@analyses), 'dna_rm');
        }

      # chromosome based system, need to make new files for each chr
      } else { 
        my $file = ">$prefix.chromosome.$seq_name.fa";
        #info( 1, "M2: Dump into $file");

        if($fhs->{'dna_toplevel'} ){
          #info(1, "M3: DNA");
          # NOTE(review): $file contains no % directives, so sprintf returns
          # it unchanged; the "dna" argument appears to be vestigial
          my $seqio = Bio::SeqIO->new('-format' => 'Fasta',
                    '-file'   =>  sprintf ($file, "dna"));
          dump_dna( $seqio, $slice, 'dna' );
          my $seqio_top = $fhs->{"dna_toplevel_top"};  
          dump_dna( $seqio_top, $slice, 'dna' );
 
         # Need to add these so they are closed and gzipped
          $fhs->{"$prefix.chromosome.$seq_name.fa"} = $seqio;
        }

        if($fhs->{'dna_toplevel_masked'} ){
          $file = ">$prefix"."_rm.chromosome.$seq_name.fa";
          my $seqio_masked = Bio::SeqIO->new('-format' => 'Fasta',
                          '-file' => sprintf ($file,"dna_rm"));
          dump_dna_masked( $seqio_masked, $slice, mask_sequence($slice, \@analyses), 'dna_rm' );
          my $seqio_top_rm = $fhs->{"dna_toplevel_masked_top"};
          dump_dna_masked( $seqio_top_rm, $slice, mask_sequence($slice, \@analyses), 'dna_rm' );

          # Need to add these so they are closed and gzipped
          $fhs->{"$prefix"."_rm.chromosome.$seq_name.fa"} = $seqio_masked;
        }
      }
    }
    # End dna_toplevel (masked and not masked) ------------------------------
    # Gene-based dumps for this slice (skipped when minimise_memory already
    # dumped genes globally above)
    if(scalar( grep{$_ !~ /^dna|abinitio|\/dna\// } keys %$fhs ))  {
      unless ($minimise_memory) {
        foreach my $gene( @{$slice->get_all_Genes(undef,undef,$load_exons)} ){
          dump_data_for_gene($gene, $fhs);
        }
      }
    }
    # Abinitio dumps ----------------------------------------------------------
    #info (1, "Abinitio section");
    if ( scalar (grep {$_ =~ /_abinitio/ } keys %$fhs ))  {
      foreach my $transcript( @{$slice->get_all_PredictionTranscripts(undef,$load_exons)} ){
        # eval so one bad prediction transcript does not abort the dump
        eval {
          my $subtype = $transcript->analysis->logic_name;
          if( my $seqio = $fhs->{'cdna_abinitio'} ){
            dump_cdna_rna( $seqio, $transcript, "cdna:$subtype" );
          }
          if( my $seqio = $fhs->{'pep_abinitio'} ){
            utils::Tool::warning(1, "No translation for transcript:". $transcript->stable_id )
            && next unless $transcript->translation;
            dump_pep( $seqio, $transcript, "pep:$subtype" );
          }
        };
      }
    }
    #info( 1, "End toplevel ". $slice->name );
  }

  # Delete empty files ----------------------------------------------------
  foreach my $type (keys %$fhs) {
    my $seqio = $fhs->{$type};
    my $file = $seqio->file;
    $file =~ s/^>//;   # strip the open-mode prefix to get the path
    $file  || ( utils::Tool::warning( 1, "Dump type $type has no filename" ) && next );
    -e $file || ( info( 1, "File $file ($type) does not exist" ) && next );
    if( -z $file ){ #Empty file
      unlink $file || warn "Can't delete $file: $!";
      info(1, "Deleting empty file $file");
      undef( $fhs->{$type} );
      next;
    }
    $fhs->{$type}->close;
  }
  return ($fhs);
}

#------------------------------------------------------------------------------
sub dump_data_for_gene {

  ### Classifies each transcript of a gene (ncrna / pseudogene / known /
  ### novel) and writes its cDNA and/or peptide to the matching dump
  ### filehandle(s).
  ### Arg1 : Bio::Ensembl::Gene object
  ### Arg2 : hashref for filehandles where key is the type of dump and
  ### value is the Bio::SeqIO
  ### Returns 1
  my ($gene, $fhs) = @_;

  # The analysis logic name is a property of the gene; hoist it out of the
  # transcript loop (it was recomputed for every transcript before).
  my $logic_name = lc($gene->analysis->logic_name);

  foreach my $transcript( @{$gene->get_all_Transcripts} ){
    # Default to '' so the string comparisons below do not emit
    # uninitialised-value warnings when no category matches.
    my $subtype = '';
    if ( $logic_name =~/ncrna/ && $transcript->biotype =~ /RNA/i) {
      # This will also get all ncRNA PSEUDOGENES.
      # Question whether the ncRNA_pseudogene should be stored in the ncrna biotype
      # or in the pseudogene biotype ?
       $subtype = 'ncrna';
     } elsif ($logic_name =~/ensembl_havana_lincrna/i ) {
       # we don't care about the transcript's biotypes
       $subtype = 'ncrna';
    } elsif ( $logic_name =~/ncrna/ && $transcript->biotype !~ /RNA/i) {
      # this combination should not happen, so we need to know when it does
      utils::Tool::warning(2, "Transcript ".$transcript->stable_id." with logic_name $logic_name has biotype ".$transcript->biotype);
    } elsif ( $logic_name =~/MT_genbank_import/i && $transcript->biotype =~ /RNA/i) {
      # The genes on MT are both coding and noncoding. We only want
      # the noncoding ones here.
      $subtype = 'ncrna';
    } elsif( $transcript->biotype =~ /pseudogene/i && ! $transcript->translation ) {
      # This will get the transcript labelled as /pseudogene/
      # but it will not get the POLYMORPHIC_PSEUDOGENE biotype because they should all have
      # translations. Polymorphic pseudogenes are coding in some individuals 
      # but a pseudogene in the reference.
      $subtype = 'pseudogene';
    } elsif( $transcript->is_known)    {
      $subtype = 'known';
    } elsif ($transcript->status eq 'NOVEL' || 
             $transcript->status eq 'PUTATIVE' || 
             $transcript->status eq 'PREDICTED' ||
             $transcript->status eq 'KNOWN_BY_PROJECTION' ||
             $transcript->status eq 'UNKNOWN') {
      $subtype = 'novel';
    } else {
      # We should never get here because the novel subtype will catch
      # all status other than known. Problems will only occur if the database
      # enum changes to include new status types. 
      info(2, "Transcript ".$transcript->stable_id." status ".$transcript->status." logic ".$logic_name." biotype ".$transcript->biotype);
    }
    if ($logic_name eq 'pseudogene' && $subtype ne 'pseudogene') {
      utils::Tool::warning(1, "Logic name is set to pseudogene, but transcript has translation:".$transcript->translation);
    }

    # cDNA dumps: 'all' excludes ncRNA; the per-subtype handles only get
    # transcripts of their own subtype
    if ( my $seqio = $fhs->{'cdna_all'} and $subtype ne 'ncrna')  {
     dump_cdna_rna( $seqio, $transcript, "cdna:$subtype", $gene );
    }
    if( my $seqio = $fhs->{'cdna_known'} and $subtype eq 'known' ){
      dump_cdna_rna( $seqio, $transcript, 'cdna:known', $gene);
    }
    if( my $seqio = $fhs->{'cdna_pseudo'} and $subtype eq 'pseudogene' ){
      dump_cdna_rna( $seqio, $transcript, 'cdna:pseudogene', $gene);
    }
    if( my $seqio = $fhs->{'cdna_novel'} and $subtype eq 'novel' ){
      dump_cdna_rna( $seqio, $transcript, 'cdna:novel', $gene);
    }

    # Peptide dumps: pseudogenes and ncRNAs do not translate
    if( my $seqio = $fhs->{'pep_all'} and $subtype ne 'pseudogene'
      and $subtype ne 'ncrna'){
      dump_pep( $seqio, $transcript, "pep:$subtype", $gene);
    }
    if( my $seqio = $fhs->{'pep_known'} and $subtype eq 'known' ){
      dump_pep( $seqio, $transcript, 'pep:known', $gene);
    }
    if( my $seqio = $fhs->{'pep_novel'} and $subtype eq 'novel' ){
      dump_pep( $seqio, $transcript, 'pep:novel', $gene);
    }

    # ncRNA dump uses the gene biotype as the header subtype
    if( ( my $seqio = $fhs->{'ncrna'} and $subtype eq 'ncrna' ) ) {
      $subtype = $gene->biotype;
      dump_cdna_rna( $seqio, $transcript, "$logic_name:$subtype",$gene);
    }
  }
  return 1;
}

#---------------------------------------------------------------------
#FORMAT >STABLE_ID TYPE:SUBTYPE TOPLEVEL:SLICE_NAME gene:genename
#>ENST00000289823 cdna:known chromosome:NCBI34:8:21922367:21927699:1
#gene:ENSG00000158815

sub dump_cdna_rna {

  ### Writes one transcript (cDNA/ncRNA) sequence to the given Bio::SeqIO
  ### handle using the standard Ensembl FASTA header layout:
  ###   >STABLE_ID TYPE:SUBTYPE COORDSYS:VERSION:REGION:START:END:STRAND gene:GENE_ID
  ### Returns 1; writes nothing when the global $no_data flag is set.

  my ($fasta_out, $trans, $header_type, $gene_obj) = @_;
  return if $no_data;

  my $cs = $trans->slice->coord_system;
  my @loc_parts = (
    $cs->name,
    $cs->version,
    $trans->seq_region_name,
    $trans->seq_region_start,
    $trans->seq_region_end,
    $trans->seq_region_strand,
  );
  my $location = join ':', @loc_parts;

  # With no gene the label is a single space, preserving the header shape
  my $gene_label = $gene_obj ? 'gene:' . $gene_obj->stable_id : ' ';

  my $sequence = $trans->seq();
  $sequence->description( join ' ', $header_type, $location, $gene_label );
  $fasta_out->write_seq($sequence);
  return 1;
}

#------------------------------------------------------------------------------
#>ENSP00000328693 pep:novel chromosome:NCBI34:1:904515:910768:1
#>gene:ENSG00000158815:transcript:ENST00000328693

sub dump_pep {

  ### Writes the peptide translation of one transcript to the given
  ### Bio::SeqIO handle.  The FASTA id is the translation stable id
  ### (falling back to the transcript id); the description carries the
  ### type string, genomic location and gene/transcript labels.
  ### Returns 1; returns early (empty) when there is nothing to write.

  my ($fasta_out, $trans, $header_type, $gene_obj) = @_;
  return if $no_data;

  my $translation = $trans->translation;
  unless ($translation) {
    # Pseudogenes should have been filtered out upstream - reaching here
    # without a translation indicates faulty caller logic.
    utils::Tool::warning( 1, "Transcript ".$trans->stable_id." does not translate" );
    return;
  }

  my $cs = $trans->slice->coord_system;
  my $location = join ':',
                 $cs->name,
                 $cs->version,
                 $trans->seq_region_name,
                 $trans->seq_region_start,
                 $trans->seq_region_end,
                 $trans->seq_region_strand;

  # Prefer the translation's own stable id, fall back to the transcript's
  my $display_id = $translation->stable_id || $trans->stable_id;

  # Without a gene the label starts with a space, as in " transcript:ID"
  my $label = $gene_obj ? 'gene:' . $gene_obj->stable_id : '';
  $label .= ' transcript:' . $trans->stable_id;

  my $pep_seq = $trans->translate;
  unless ($pep_seq) {
    utils::Tool::warning (1, "No seq for Transcript ".$trans->stable_id);
    return 1;
  }
  $pep_seq->display_id($display_id);
  $pep_seq->description( join ' ', $header_type, $location, $label );
  $fasta_out->write_seq($pep_seq);
  return 1;
}

#----------------------------------------------------------------------
sub dump_dna{

  ### Writes one genomic slice to the given Bio::SeqIO handle, padded
  ### with Ns at both ends so the sequence spans the whole seq_region
  ### (keeps header coordinates valid for partial slices).
  ### Returns 1; writes nothing when the global $no_data flag is set.

  my ($seqio, $slice, $type_str, $location ) = @_;
  return if $no_data;

  # Force slice onto forward strand
  if( $slice->strand < 1 ){ $slice = $slice->invert }
  $location ||= $slice->name;

  # Only the part before the first ':' is used in the FASTA header
  my ($type) = split(':', $type_str);

  my $pad_start = 'N' x ( $slice->start - 1 );
  my $pad_end   = 'N' x ( $slice->seq_region_length - $slice->end );

  # Direct method call instead of indirect object syntax ("new Bio::Seq")
  my $seq = Bio::Seq->new(
    -seq        => $pad_start . $slice->seq() . $pad_end,
    -display_id => $slice->seq_region_name,
  );
  $seq->description( "$type:". $slice->coord_system->name(). " $location" );

  $seqio->write_seq($seq);
  return 1;
}

#------------------------------------------------------------------------
sub dump_dna_masked{

  ### Writes one repeat-masked sequence string (pre-masked by the caller,
  ### e.g. via mask_sequence) to the given Bio::SeqIO handle, padded with
  ### Ns at both ends so it spans the whole seq_region.
  ### Returns 1; writes nothing when the global $no_data flag is set.

  my ($seqio, $slice, $seqstr, $type_str, $location) = @_;
  return if $no_data;

  # Force slice onto forward strand
  if( $slice->strand < 1 ){ $slice = $slice->invert }

  $location ||= $slice->name;

  # Only the part before the first ':' is used in the FASTA header
  my ($type) = split(':', $type_str);

  my $pad_start = 'N' x ( $slice->start - 1 );
  my $pad_end   = 'N' x ( $slice->seq_region_length - $slice->end );

  # Direct method call instead of indirect object syntax ("new Bio::Seq")
  my $seq = Bio::Seq->new(
    -seq        => $pad_start . $seqstr . $pad_end,
    -display_id => $slice->seq_region_name,
  );
  $seq->description( "$type:". $slice->coord_system->name(). " $location" );

  $seqio->write_seq($seq);
  return 1;
}

#-----------------------------------------------------------------------
sub mask_sequence {

  ### Arg 1: slice
  ### Arg 2: optional arrayref of repeat.analysis logic names from meta table
  ### Returns the repeat-masked sequence string for the whole slice.
  ### The slice is fetched and masked in 10 Mb chunks to bound memory use.

  my ($slice, $logic_names) = @_;

  my $chunk_size  = 10000000;
  my $slice_end   = $slice->end;       # hoisted: loop invariant
  my $chunk_start = $slice->start;
  my $chunk_end   = $chunk_start + $chunk_size - 1;
  $chunk_end = $slice_end if $chunk_end > $slice_end;

  # Initialise to '' so the first .= does not concatenate onto undef
  my $slice_seq = '';

  while ($chunk_start <= $slice_end) {
    # Build a fully-qualified slice name for this chunk on the forward strand
    my $chunk_slice_name = join(':',
      $slice->coord_system->name,
      $slice->coord_system->version,
      $slice->seq_region_name,
      $chunk_start,
      $chunk_end,
      1,
    );

    my $chunk_slice = $slice->adaptor->fetch_by_name($chunk_slice_name);
    $slice_seq .= $chunk_slice->get_repeatmasked_seq($logic_names)->seq;

    $chunk_start += $chunk_size;
    $chunk_end   += $chunk_size;
    $chunk_end = $slice_end if $chunk_end > $slice_end;
  }

  return $slice_seq;
}

#------------------------------------------------------------------------------
sub blast_indexes {

  ### Builds WU-BLAST (xdformat) indexes for every dumped fasta file and
  ### copies the non-DNA index files out to the configured blast servers.
  ### Arg 1: hashref of dump type => Bio::SeqIO handle
  ### Arg 2: species name
  ### Returns the (unchanged) hashref of created files.

  my $created_files = shift;
  my $sp = shift;
  my $release_dir = "release-$release";
  $release_dir = "PRE" if $TYPES[0] eq 'pre';

  # Check the cdna/pep configuration is the same for each source
  my %configured;
  foreach my $blast_datasources (qw(BLASTN_DATASOURCES TBLASTX_DATASOURCES TBLASTN_DATASOURCES BLASTP_DATASOURCES BLASTX_DATASOURCES )) {
    my %ini_datasources = %{ $SPECIES_DEFS->get_config($sp, $blast_datasources)|| {} };
    map { $configured{lc($_)}++ unless $_ eq 'DATASOURCE_TYPE' } (keys %ini_datasources);
  }

  # Peptide sources should appear in 2 ini sections, nucleotide ones in 3
  foreach (keys %configured) {
    my $value = $_ =~ /^pep/ ?  2 : 3;
    utils::Tool::warning(1, "No configuration for $_ in $sp ini file") unless $configured{$_} == $value;
  }

  foreach my $types (sort keys %$created_files) {
    my $seqio = $created_files->{$types};
    next unless $seqio;
    my $fh = $seqio->file;
    $fh =~ s/^>//;            # strip the write-mode prefix from the path

    # Delete empty files.  NB: was "unlink $fh || warn ... if -z $fh",
    # which parses as unlink($fh || warn ...) so the warning could never fire.
    if (-z $fh) {
      unlink $fh or warn "Can't delete $fh: $!";
    }

    next unless $types =~ /^dna_top|^cdna|^pep|^ncrna/;
    if ($no_dumpdata) {
      if (-e "$fh.gz") {
        info(1, "Gunziping $fh.gz");
        # Qualified for consistency with the rest of the script (a bare
        # warning() is not defined at this scope)
        utils::Tool::warning(1, "unzip $fh already exists in file system") if -e $fh;
        system("gunzip $fh.gz");
      }
    }

    # Do blast xdformat indexes
    my $names = xdformat_blast_indexes($types, $fh, $release_dir, $sp);

    # Copy blast indexes across------------------------------
    foreach my $file (keys %$names ) {
      next if $file =~/\.dna/;
      my $file_name = $names->{$file};
      my $xd_file = "$basedir/xdformat/$release_dir/" . $file_name;
      $xd_file =~s/\/\//\//;
      # DNA indexes live in a dna/ subdirectory
      unless ( -s "$xd_file" ){ $xd_file = "$basedir/xdformat/$release_dir/dna/" .$file_name;}
      unless ( -s "$xd_file" ) {
        # Explicit block so we always skip, whatever warning() returns
        utils::Tool::warning(1, "Error producing file $xd_file");
        next;
      }

      next if $no_remotedirs;
      foreach my $blast_machine(@blast_servers){
        warn "scp $xd_file $blast_machine:/$blastdir/$file_name";
#        system("scp $xd_file $blast_machine:/$blastdir/$file_name");
      }
      unless ($file eq $fh) {
        unlink($xd_file) or warn "Can't delete $xd_file: $!";
      }
    }
  }
  return ($created_files);
}

#---------------------------------------------------------------------------
sub xdformat_blast_indexes {

  ### Runs xdformat on one dumped fasta file and moves the resulting index
  ### files into the central xdformat directory (DNA indexes go into a
  ### dna/ subdirectory).
  ### Returns a hashref of { index_file_path => index_file_name }.

  my ($types, $fh, $release_dir, $sp) = @_;

  # Was "next if ..." - 'next' at sub level exits via the CALLER's loop
  # (deprecated "Exiting subroutine via next").  Return an empty hashref
  # instead so the caller's "keys %$names" loop simply iterates nothing.
  return {} if $fh =~/nonchromosomal/;

  my $outfile = $fh;
  if ($types =~/dna_top/) {
    # Repeat-masked DNA dumps are versioned by mask date, not release number
    my @release = split('-', $release_dir);
    my $repeat_mask_date = $SPECIES_DEFS->get_config(ucfirst($sp), "REPEAT_MASK_DATE");
    $outfile =~s/$release[1]\./$repeat_mask_date\./;
  }

  info (1, "xdformat indexes");
  my @names;
  if ($types =~/^pep/) {
    @names = qw( .xpd .xps .xpt .xpi);
    system ("$xdformat_command -p -I $fh -o $outfile");
  } else {
    @names = qw( .xnd .xns .xnt .xni);
    unless($fh =~/seq/) { system ("$xdformat_command -n -I $fh -o $outfile"); }
  }

  my $xdformat_dir = "$basedir/xdformat/$release_dir";
  utils::Tool::check_dir($xdformat_dir);
  my $xdformat_dna_dir = "$basedir/xdformat/$release_dir/dna";
  utils::Tool::check_dir($xdformat_dna_dir);

  my $xdformat_files = {};
  foreach my $index (@names ) {
    my $file = "$outfile$index";
    (my $file_name = $file) =~ s#.*/##;
    $file_name =~s/\/\//\//;
    $xdformat_dir =~s/\/\//\//;
    unless (-s "$file") {
      # Explicit block so the skip happens whatever warning() returns
      utils::Tool::warning(1, "Error producing file $file");
      next;
    }
    if ($file_name =~/\.dn/){       # matches '.dna' in the dump file name
      system ("mv $file $xdformat_dna_dir/$file_name");
    } else {
      warn "mv $file $xdformat_dir/$file_name";
      system ("mv $file $xdformat_dir/$file_name");
    }
    $xdformat_files->{$file} = $file_name;
  }
  return $xdformat_files;
}

#-----------------------------------------------------------------------------
sub blat_indexes {

  ### Collects the unmasked toplevel DNA dumps for one species and hands
  ### them to create_blat_indexes() to build a blat 2bit index.
  ### Returns the (unchanged) hashref of created files.

  my ($created_files, $sp, $sp_folder, $db_adaptor) = @_;
  my $release_dir = "release-$release";
  my $cs_adaptor   = $db_adaptor->get_CoordSystemAdaptor;
  my ($highest_cs) = @{$cs_adaptor->fetch_all()};  # highest-ranked coord system first
  my $assembly   = $highest_cs->version();

  my $blat_dir;
  if( $DUMPDIR eq $basedir."release-$release/" ) {
    $blat_dir = $basedir."blat/release-$release";
  } else {
    # The original elsif (/^$basedir/) and else branches were identical
    $blat_dir = "$basedir$release_dir/blat/";
  }

  my $ini_datasources = $SPECIES_DEFS->get_config(ucfirst($sp), "BLAT_DATASOURCES");
  my @dna_files;
  foreach my $types (sort keys %$created_files) {
    my $seqio = $created_files->{$types};
    next unless $seqio;
    my $fh = $seqio->file;
    $fh =~ s/^>//;          # strip the write-mode prefix from the path

    # Delete empty files (was "unlink $fh || warn", whose precedence meant
    # the warning could never fire)
    if (-z $fh) {
      unlink $fh or warn "Can't delete $fh: $!";
    }

    next unless $types =~ /^dna_toplevel_top/;
    if ($no_dumpdata) {
      if (-e "$fh.gz") {
        info(1, "Gunziping $fh.gz");
        utils::Tool::warning(1, "unzip $fh already exists in file system") if -e $fh;
        system("gunzip $fh.gz");
      }
    }
    # Only unmasked dumps go into the blat index
    push(@dna_files, $fh) unless $fh =~ /_rm/;
  }

  # Was "scalar @dna_files >> 0" - a bit-shift by zero; '>' was intended
  return ($created_files) unless scalar @dna_files > 0;
  create_blat_indexes($ini_datasources, \@dna_files, $blat_dir, $sp, $assembly);
  return ($created_files);
}

#-----------------------------------------------------------------------------
sub create_blat_indexes {

  ### Builds a single faToTwoBit (.2bit) index from the given toplevel DNA
  ### dump files and moves it into the blat directory.  The index is named
  ### <port>.<Species>.<assembly>.2bit, with the port taken from the
  ### LATESTGP entry of the species' BLAT_DATASOURCES ini section.
  ### Returns 1.

  my ($ini_datasources, $dna_files, $blat_dir, $species, $assembly) = @_;
  $species = ucfirst($species);
  info( 1, "Creating blat index files");
  my $files = join(" ", @{$dna_files});

  my $port_blat = '';
  if ( $ini_datasources->{"LATESTGP"} ) {
     ($port_blat = $ini_datasources->{"LATESTGP"}) =~ s/\w.+:(\d+)/$1/;
  } else {
     utils::Tool::warning(1, "Add LATESTGP BLAT_DATASOURCES to $species ini file and correct file name in $blat_dir.");
  }

  # Keep only the leading field should the substitution leave extra ':' parts;
  # default to '' so the file name below never interpolates undef
  ($port_blat) = split(/:/, $port_blat);
  $port_blat = '' unless defined $port_blat;

  my $blat_index = "$port_blat.$species.$assembly.2bit";
  info (2, "Creating new blat server..");
  utils::Tool::check_dir($blat_dir);
  $files =~s/^\s*//;
  if ($files =~/\w+/) {
    system ("$blat_command $files  $blat_index");
    unless (-s "$blat_index") {
      # Was "warning(...) && next" - 'next' here is a runtime error since
      # this sub is not called from inside a loop; skip the move instead.
      utils::Tool::warning(1, "Error producing file $blat_index");
      return 1;
    }
    system ("mv $blat_index $blat_dir");
  }
  return 1;
}

#-----------------------------------------------------------------------------
sub compress {

  ### Gzips every fasta file created by the dump run.
  ### Arg 1: hashref of dump type => Bio::SeqIO handle
  ### Missing files (already compressed or never created) are skipped.

  my $created_files = shift;
  info( 1, "Gzipping fasta files");

  foreach my $types (keys %$created_files) {
    my $seqio = $created_files->{$types};
    next unless $seqio;
    my $fh = $seqio->file;    # removed leftover debug statement 'warn $fh'
    $fh =~ s/^>//;            # strip the write-mode prefix from the path
    next unless (-e "$fh");
    info(1, "zipping $fh");

    system("gzip -9 $fh") ==0 or utils::Tool::warning(1, "Can't gzip file $! $fh");
  }
}

#------------------------------------------------------------------------
sub readme {

  ### Returns the README text for one dump type.
  ### Arg 1: key into the text table - one of 'dna', 'pep', 'cdna', 'ncrna'.
  ### The BioMart notice ($warning) is prepended to every README.
  ### Fixed typos in the generated text: stray 'a' after 'chromosome' and
  ### "do not sequences" -> "do not have sequences" (twice).

  my $key = shift;

  # Text for readme files

my %text = (
dna => "
#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.biomart.org for more information.


#######################
Fasta DNA dumps
#######################

-----------
FILE NAMES
------------
The files are consistently named following this pattern:
   <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz

<species>:   The systematic name of the species.
<assembly>:  The assembly build name.
<release>:   The release number.
<sequence type>:
 * 'dna' - unmasked genomic DNA sequences.
  * 'dna_rm' - masked genomic DNA.  Interspersed repeats and low
     complexity regions are detected with the RepeatMasker tool and masked
     by replacing repeats with 'N's.
<id type> One of the following:
  * 'chromosome'     - The top-level coordinate system in most species in Ensembl
  * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
  * 'seqlevel'       - This is usually sequence scaffolds, chunks or clones.
     -- 'scaffold'  - Larger sequence contigs from the assembly of shorter
        sequencing reads (often from whole genome shotgun, WGS) which could
        not yet be assembled into chromosomes. Often more genome sequencing
        is needed to narrow gaps and establish a tiling path.
     -- 'chunk' -  While contig sequences can be assembled into large entities,
        they sometimes have to be artificially broken down into smaller entities
        called 'chunks'. This is due to limitations in the annotation
        pipeline and the finite record size imposed by MySQL which stores the
        sequence and annotation information.
     -- 'clone' - In general this is the smallest sequence entity.  It is often
        identical to the sequence of one BAC clone, or sequence region
        of one BAC clone which forms the tiling path.
<id>:     The actual sequence identifier. Depending on the <id type> the <id>
          could represent the name of a chromosome, a scaffold, a contig, a clone ..
          Field is empty for seqlevel files
fa: All files in these directories represent FASTA database files
gz: All files are compacted with GNU Zip for storage efficiency.

-----------
TOPLEVEL
----------
These files contain the full sequence of the assembly in fasta format.
They contain one chromosome per file.

EXAMPLES
   The genomic sequence of human chromosome 1:
     Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz

   The masked version of the genome sequence on human chromosome 1
   (contains '_rm' in the name):
     Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz

   Non-chromosomal assembly sequences:
   e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
     Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
     Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz


--------------
SPECIAL CASES
--------------
Some chromosomes have alternate haplotypes which are presented in files with 
the haplotype sequence only:
   Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
   Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz
   

Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
on the Y is annotated.  By definition the PAR region is identical on the 
X and Y chromosome.  We provide this sequence in the following way.
-- The Y chromosome file contains the complete sequence of the PAR:
    Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
-- The top level file includes only the unique portion of Y (i.e. the PAR 
   (region is N-masked):
      Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz\n",

pep => "
####################
Fasta Peptide dumps
####################
These files hold the protein translations of Ensembl gene predictions.

-----------
FILE NAMES
------------
The files are consistently named following this pattern:
   <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: pep for peptide sequences
<status>
  * 'pep.all' - the super-set of all translations resulting from Ensembl known
     or novel gene predictions.
  * 'pep.abinitio' translations resulting from 'ab initio' gene
     prediction algorithms such as SNAP and GENSCAN. In general, all
     'ab initio' predictions are based solely on the genomic sequence and
     not any other experimental evidence. Therefore, not all GENSCAN
     or SNAP predictions represent biologically real proteins.
fa : All files in these directories represent FASTA database files
gz : All files are compacted with GNU Zip for storage efficiency.

EXAMPLES (Note: Most species do not have sequences for each different <status>)
 for Human:
    Homo_sapiens.NCBI36.40.pep.all.fa.gz
      contains all known and novel peptides
    Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
      contains all abinitio predicted peptide

Difference between known and novel
----------------------------------
Protein models that can be mapped to species-specific entries in
Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as
known genes.  Those that cannot be mapped are called novel
(e.g. genes predicted on the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have 
an HGNC name are referred to as known and the remaining models are referred to
as novel.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences.  This gives enough information
for the sequence to be identified outside the context of the FASTA
database file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT

Example of Ensembl Peptide header:

>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693
 ^               ^   ^     ^                                   ^                    ^
 ID              |   |  LOCATION                          GENE:stable gene ID       |
                 | STATUS                                           TRANSCRIPT: stable transcript ID
               SEQTYPE
\n",


cdna => "
##################
Fasta cDNA dumps
#################

These files hold the cDNA sequences corresponding to Ensembl gene predictions.

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: cdna for cDNA sequences
<status>
  * 'cdna.all' - the super-set of all transcripts resulting from
     Ensembl known, novel and pseudo gene predictions (see more below).
  * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
     algorithms such as SNAP and GENSCAN. In general all 'ab initio'
     predictions are solely based on the genomic sequence and do not
     use other experimental evidence. Therefore, not all GENSCAN or SNAP
     cDNA predictions represent biologically real cDNAs.
     Consequently, these predictions should be used with care.

EXAMPLES  (Note: Most species do not have sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.cdna.all.fa.gz
      cDNA sequences for all transcripts: known, novel and pseudo
    Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
      cDNA sequences for 'ab-initio' prediction transcripts.

Difference between known and novel transcripts
-----------------------------------------------
Transcript or protein models that can be mapped to species-specific entries
in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
Those that cannot be mapped are called novel genes (e.g. genes predicted on
the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have 
an HGNC name are referred to as known and the remaining models are referred to
as novel.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences.  This gives enough information
for the sequence to be identified outside the context of the FASTA file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE

Example of an Ensembl cDNA header:

>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815
 ^               ^    ^     ^                                       ^
 ID              |    |  LOCATION                         GENE: gene stable ID
                 |  STATUS
              SEQTYPE

\n",

ncrna => "
##################
Fasta RNA dumps
#################

These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA).

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: ncrna for non-coding RNA sequences
<status>
  * 'ncrna' - all non-coding RNA genes, including ncRNA_pseudogenes

EXAMPLES
  for Human:
    Homo_sapiens.NCBI36.40.rna.nc.fa.gz
      Transcript sequences for all ncRNA gene types.


-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences.  This gives enough information
for the sequence to be identified outside the context of the FASTA file.

General format:

>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671
   ^             ^     ^     ^                                          ^
   ID            |     |  LOCATION                            GENE: gene stable ID
                 |   STATUS
              SEQTYPE

\n",  );

  # Prepended to every README regardless of dump type
my $warning = "#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.

";
  return ($warning .$text{$key});
}

