#!/usr/local/bin/perl

use strict;
package indexXMLDumper;

my $ENSEMBL_ROOT;

BEGIN {
  use FindBin qw($Bin);
  use File::Basename qw( dirname );
  $ENSEMBL_ROOT = dirname( $Bin );
  $ENSEMBL_ROOT =~ s/\/utils$//;
  unshift @INC, "$ENSEMBL_ROOT/conf";
  eval{ require SiteDefs };
  if ($@){ die "Can't use SiteDefs.pm - $@\n"; }
  map{ unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS;
}

use EnsEMBL::Web::SpeciesDefs;
use XML::Parser;
use Data::Dumper;
use Getopt::Long;
use POSIX qw(strftime);
use URI;
use HTML::Strip;

use constant INC_SPECIES => 0;
use CGI;

$| = 1;

my ($species_list,$ignore_species_list,$indexes_list,$ignore_indexes_list,$search_engine,$debug,$dry_run,$log,$dir,$help);
GetOptions('species=s' => \$species_list,
           'ignore_species=s' => \$ignore_species_list,
           'index=s'   => \$indexes_list,
           'ignore_index=s' => \$ignore_indexes_list,
           'engine=s'  => \$search_engine,
           'debug'     => \$debug,
           'dry_run|n' => \$dry_run,
           'log=s'     => \$log,
           'dir=s'     => \$dir,
	   'help'      => \$help,
         );

if ($help){
  &about;
  exit;
}

my @species        = split ',',$species_list;
my @ignore_species = split ',',$ignore_species_list;
my @indexes        = split ',',$indexes_list;
my @ignore_indexes = split ',',$ignore_indexes_list;

our $SD = EnsEMBL::Web::SpeciesDefs->new();
our $release = $SD->ENSEMBL_VERSION;
our $nogzip = 1;    #try setting to 0 for on the fly gzipping (not tested)?
our %cross_references;  #get a set of external_dbs so we can make sure we link to them all correctly if we ever need to
our $parser =  new XML::Parser( 'ErrorContext' => 3 );
my $sitetype = $SD->ENSEMBL_SITETYPE || 'Ensembl';
my $website_url = $sitetype eq 'Vega' ? 'http://vega.sanger.ac.uk/'
                : $sitetype eq 'Pre'  ? 'http://pre.ensembl.org/'
                : 'http://www.ensembl.org'; #what happens to this when we are on an archive ?

$search_engine ||= 'lucene';

#redirect STDOUT and maybe STDERR to a file
$log ||= $debug ? '>-' : "dumping_${release}.log";
open (LOG,    ">$log") or die "Can't open $log: $!";
LOG->autoflush(1); #normally file output is buffered but better to see output after every print
unless ($debug) {
  open (STDERR, ">&LOG") or die "Can't open $log: $!";
}

## HACK 1 - if the INDEX is set to ALL grab all dumper methods...
if ($indexes_list eq 'ALL'){
  @indexes = ();
  my %ignored;
  @ignored{@ignore_indexes} = ();
  foreach my $method (keys %indexXMLDumper::) {
    if ($method =~ s/dump(\w+)/$1/) {
      push @indexes, $method unless exists $ignored{$method};
    }
  }
}

## HACK 2 - if the SPECIES is set to ALL grab stuff from config...
my %X = %{$SD->ENSEMBL_SPECIES_ALIASES};
my $input_c = @species;
if( $species_list eq 'ALL' ) {
  @species = ();
  my %ignored;
  @ignored{@ignore_species} = ();
  foreach my $sp (@{$SD->ENSEMBL_DATASETS}) {
    push @species,$sp unless exists $ignored{$sp};
  }
}
else {
  @species = grep {$_} map { $X{lc($_)} } @species;
  if (@species ne $input_c) {
    print LOG "WARNING - one or more of your species names is not recognised, please check your spelling. Available species are:\n";
    print LOG join "\n", @{$SD->ENSEMBL_DATASETS};
    print LOG "\n";
    exit;
  }
}

# EnsEMBL::Web::IndexSupport takes path to conf, path to files, species
my $conf = {};
$dir ||= $debug ? 'input' 
       : ($search_engine eq 'solr') ? "/nfs/eureka/data/solr/".lc($sitetype)
       : "/nfs/eureka/data/".lc($sitetype)."_$release";
print LOG "Dumping xml files to $dir\n";
mkdir $dir, 0777 unless -e "$dir";

my $start_time = time;
my $total_c;
foreach my $index (@indexes) {

  my $start = time;
  if ($index eq 'Help') {
    foreach my $type (qw(View Glossary FAQ)) {
      $conf->{"directory_$type"} = "$dir/".lc($sitetype)."_$type";
      mkdir $conf->{"directory_$type"}, 0777 unless -e $conf->{"directory_$type"};
    }
  }
  else {
    $conf->{'directory'} = "$dir/".lc($sitetype)."_$index";
    mkdir $conf->{'directory'}, 0777 unless -e  $conf->{'directory'};
  }
  if ($index eq 'Variation') {
    $conf->{'directory_somatic_mutations'} = "$dir/".lc($sitetype)."_SomaticMutation";
    mkdir $conf->{'directory_somatic_mutations'}, 0777 unless -e $conf->{'directory_somatic_mutations'};
  }

  foreach my $species ( @species ) {
    $conf->{'speciesname_list'} = undef;
    my $single_species = 0;
    my %dbs = %{$SD->get_config($species,'databases')||{}};
    $conf->{'dbs'}                     = { map { ($_,$dbs{$_}{NAME}) } keys %dbs };
    $conf->{'dbs'}{'DATABASE_COMPARA'} = 'ensembl_compara_'.$SD->ENSEMBL_VERSION;
    my $core = $dbs{'DATABASE_CORE'};
    $conf->{'core_details'}            = $core;

    if ($index eq 'Help') {
      $single_species = 1;
      my $website = $SD->multidb->{DATABASE_WEBSITE};
      $conf->{'website'}{'name'} = $website->{'NAME'};
      $conf->{'website'}{'dbh'}  = DBI->connect("dbi:mysql:$website->{'NAME'};host=$website->{'HOST'};port=$website->{'PORT'}", $website->{'USER'}, $website->{'PASS'}, { PrintError => 0, RaiseError => 0} );
    }

    else {

#      next unless ($species =~ /^[L-Z]/);
#      next unless $species =~ /Callithrix_jacchus|Meleagris_gallopavo|Nomascus_leucogenys|Oryctolagus_cuniculus/;
      $conf->{'species'}   = $species;
      $conf->{'speciesname_list'}{$SD->get_config($species,'SPECIES_COMMON_NAME')}++;
      $conf->{'speciesname_list'}{$species}++;
      $conf->{'authority'} = $SD->get_config($species,'AUTHORITY');

    }

    my $no_success = 1;
    my $c;
    print LOG "Starting: $index $species\n";
    while ($no_success) {
      $c++;
      $conf->{'dbh'} = DBI->connect( "dbi:mysql:$core->{'NAME'};host=$core->{'HOST'};port=$core->{'PORT'}", $core->{'USER'}, $core->{'PASS'}, { PrintError => 0, RaiseError => 0} );
      if ($index eq 'Family') {
        my $compara = $SD->multidb->{DATABASE_COMPARA};
        $conf->{'dbh_compara'} = DBI->connect( "dbi:mysql:$compara->{'NAME'};host=$compara->{'HOST'};port=$compara->{'PORT'}", $compara->{'USER'}, $compara->{'PASS'} );
      }
      my $function = "dump$index";
      no strict "refs";
      eval {
        $total_c += &$function( $conf );
      };
      if ($@) {
        unless ($debug) {
	  if ( $conf->{'dbh'}->errstr()) {
	    if ($c < 6) {
	      print LOG "WARNING: Error when dumping, will retry in 5 minutes (mysql error is ".$conf->{'dbh'}->errstr().", other errors are $@)\n";
	      sleep (5*60);}
	    else {
	      print LOG "WARNING: Error when dumping. I've retried five times and am moving on, you need to retry this later (error is ".$conf->{'dbh'}->errstr().")\n";
	      $no_success = 0;
	    }
	  }
	  else {
	    print LOG "WARNING: Error when dumping, probably due to a syntax error, you need to retry this later (error is $@)\n";
	    $no_success = 0;
	  }
	}
        else {
           print LOG "WARNING: Error when dumping. Exiting since in debug mode (mysql error is ".$conf->{'dbh'}->errstr().", other errors are $@)\n";
           exit;
         }
      }
      else {
        $no_success = 0;
      }
    }
    eval { $conf->{'dbh'}->disconnect; };
    if ($@) { print LOG "WARNING: Problems disconnecting database habdle, don't understand this!"; }
    last if $single_species;
  }
  print LOG "  Time for $index is " . &elapsed_time($start) . "\n\n";
}

#print LOG "\nCross references are:\n",Dumper(\%cross_references) if %cross_references;
print LOG "Dumped $total_c records in total\n";
print LOG "\nTotal time for dumping " . &elapsed_time($start_time) . "\n";
exit;

## Now for all the dump functions...............................


########################################################################## Help

sub dumpHelp {
  my $conf = shift;
  my $start_time = time;
  my $total_c;
  foreach my $type (qw(View Glossary FAQ)) {
    my $counter   = make_counter(0);
    my $file = $conf->{'directory_'.$type}.'/'.$conf->{'website'}{'name'}.'_'.$type.'.xml';
    $file .= ".gz" unless $nogzip;
    my $dbname    = $conf->{'website'}{'name'};
    my $db_type   = $type;
    my $fh;
    unless ($nogzip) {
      $fh = new IO::Zlib;
      $fh->open( "$file", "wb9" )
        or die("Can't open compressed stream to $file: $!");
    }
    else {
      open( $fh, ">$file" ) or die "Can't open $file: $!";
    }
    print LOG "  Dumping $dbname $type to $file ... ", format_datetime($start_time), "\n";
    header( $dbname, 'Help', $db_type, $nogzip, $fh, $release );
    my $sth = $conf->{'website'}{'dbh'}->prepare(qq(
       SELECT help_record_id, keyword, data
         FROM help_record
        WHERE status = 'live'
          AND type = lower('$type')));
    $sth->execute;
    my ($id, $keywords, $content);
    while( ($id, $keywords, $content) = $sth->fetchrow_array()){
      my ($extra,$description,$name);
      my $data = eval($content);
      if ($type eq 'View') {
        $description = $data->{'content'};
        $name = $data->{'ensembl_object'} . ' ' . $data->{'ensembl_action'} . ' View';
      }
      elsif ($type eq 'Glossary') {
        $description = $data->{'meaning'};
        $name = $data->{'word'};
        push @$extra, $data->{'expanded'} if $data->{'expanded'};
      }
      elsif ($type eq 'FAQ') {
        $description = $data->{'answer'};
        $name = $data->{'question'};
      }
      &p( helpLine( $id, $name, $keywords, $description, $extra, $type, $counter ), $nogzip, $fh); #### need to parse it
    }
    $total_c += footer( $counter->(),$nogzip,$fh,$type);
  }
  return $total_c;
}

sub helpLine {
  my ($id, $name, $keywords, $desc, $extra,  $type, $counter) = @_;
  my @keywords = map { s/^ //; $_; } split ',', $keywords if $keywords;

  #remove all HTML markup, extra whitespace and extra lines. Escape any characters that might have slipped through
  my $hs = HTML::Strip->new(decode_entities => 0); #don't decode HTML entitites, they're fine as they are
  my $description = $hs->parse( $desc );
  $description =~ s/\s{1,}/ /g;
  $description = clean($description);
  $hs->eof;

  #remove markup from headers and escape just to be sure
  $name =~ s/<\/*.+>?//g;
  $name = &clean($name);

  my $xml = qq(
<entry id="$id">
  <description>$description</description>
  <additional_fields>
    <field name="featuretype">$type</field>
    <field name="displayname">$name</field>);
  foreach my $kw (@keywords) {
    $xml .= qq(
    <field name="keyword">$kw</field>);
  }
  foreach my $kw (@$extra) {
       $xml .= qq(
    <field name="keyword">$kw</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . qq(
</entry>);
}

########################################################################## Markers

sub dumpMarker {
  my $conf = shift;
  my $dbname    = $conf->{'dbs'}{'DATABASE_CORE'};
  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${dbname}_Marker.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'core';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
  header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );
  my $sth = $conf->{'dbh'}->prepare("
   SELECT m.marker_id, ms2.name, ms1.name
     FROM marker_synonym as ms1, marker as m
LEFT JOIN marker_synonym as ms2 on ms2.marker_synonym_id = m.display_marker_synonym_id
    WHERE ms1.marker_id = m.marker_id
 ORDER BY m.marker_id
  ");

  $sth->execute( );
  my $names  = [];
  my $old_ID = 0;
  my ($ID,$marker,$synonym);
  my $params = { contigviewbottom => 'marker_core_marker=normal'}; #used to switch on track by default
  while( ($ID,$marker,$synonym) = $sth->fetchrow_array()){
    $marker = $synonym unless $marker;
    if($ID == $old_ID) {
      push @$names, $synonym;
    } else {
      if ($old_ID) {
        &p( markerLine( $dbspecies, $names, $params, $counter ), $nogzip, $fh);
      }
      $names    = [ $synonym ];
      $old_ID  = $ID;
    }
  }
  &p( markerLine( $dbspecies, $names, $params, $counter ), $nogzip, $fh);
  return footer( $counter->(),$nogzip,$fh);
}

sub markerLine {
  my( $species, $IDS, $params, $counter ) = @_;
  $species =~ s/_/ /;
  my @synonyms = sort @$IDS;
  my $key = pop @synonyms;
  my $desc = 'A marker with '.@synonyms.' synonym';
  $desc .= 's' if (@synonyms > 1);
  $desc .= 's' unless (@synonyms);
  return '' unless $key;
  my $xml = qq(
<entry id="$key">
  <description>$desc</description>
  <additional_fields>);
  $xml .= &common_fields($species,'Marker');
  foreach my $syn (@synonyms) {
    $xml .= qq(
    <field name="synonym">$syn</field>);
  }
  while (my ($param,$value) = each %$params) {
    $xml .= qq(
    <field name="$param">$value</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . qq(
</entry>);
}


########################################################################## QTLs

sub dumpQTL {
  my $conf = shift;
  my $dbname    = $conf->{'dbs'}{'DATABASE_CORE'};
  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${dbname}_QTL.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'core';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
  header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );
  my $sth = $conf->{'dbh'}->prepare("
 select c.name as chr, qf.seq_region_start, qf.seq_region_end,
        a.logic_name as analysis, q.qtl_id,
        q.trait, qs.source_database, qs.source_primary_id,
        fms1.source as fm1_source, fms1.name as fm1_name,
        fms2.source as fm2_source, fms2.name as fm2_name,
        pms.source  as pm_source,  pms.name  as pm_name
   from ((((((seq_region as c, qtl_feature as qf, qtl_synonym as qs,
        analysis as a, qtl as q) left join marker as fm1 on
        fm1.marker_id = q.flank_marker_id_1) left join marker_synonym as fms1 on
        fm1.display_marker_synonym_id = fms1.marker_synonym_id) left join marker as fm2 on
        fm2.marker_id = q.flank_marker_id_2) left join marker_synonym as fms2 on
        fm2.display_marker_synonym_id = fms2.marker_synonym_id) left join marker as pm on
        pm.marker_id = q.peak_marker_id) left join marker_synonym as pms on
        pm.display_marker_synonym_id = pms.marker_synonym_id 
  where c.seq_region_id = qf.seq_region_id and qs.qtl_id = q.qtl_id and 
        qf.analysis_id = a.analysis_id and qf.qtl_id = q.qtl_id
  ");
  $sth->execute();
  my $desc    = '';
  my $old_qtl = 0;
  my $old_ID  = '';
  my $old_pos = '';
  my $old_action;
  my $IDS     = []; 
  my $scale           = $SD->ENSEMBL_GENOME_SIZE || 1;
  my $max_length = $scale *= 1e6;
  while( my $T = $sth->fetchrow_hashref() ){
    if($T->{qtl_id} eq $old_qtl) {
      push @$IDS, $T->{source_primary_id};
      $desc .= " $T->{source_database}:$T->{source_primary_id}";
    } else {
      if ($old_ID) {
        &p( QTLLine( $dbspecies, $old_ID, $old_pos, $old_action, $IDS, $desc, $counter ), $nogzip, $fh);
      }
      $IDS = [ "$T->{trait} $T->{source_primary_id}" ];
      push @$IDS, $T->{fm1_name} if $T->{fm1_name};
      push @$IDS, $T->{fm2_name} if $T->{fm2_name};
      push @$IDS, $T->{pm_name}  if $T->{pm_name};
      $old_ID = $T->{pm_name};
      $old_pos = "$T->{chr}:".($T->{seq_region_start}-1e4).'-'.($T->{seq_region_end}+1e4);
      $old_action = $T->{seq_region_end}-$T->{seq_region_start} > $max_length ? 'Overview' : 'View';
      $desc       = "QTL exhibiting '$T->{trait}' has ";
      my $f2 = $T->{pm_name} ? 1 : 0;
      if( $T->{fm1_name} || $T->{fm2_name} ) {
        my $f1= ($T->{fm1_name}) && ($T->{fm2_name}) ? 1 : 0;
        $desc.='flanking marker'.($f1?'s ':' ').$T->{fm1_name}.($f1?' and ':'').$T->{fm2_name}.($f2?'; ':'');
      }
      if($f2) {
        $desc.= "peak marker $T->{pm_name};";
      }
      $desc.=" and names: $T->{source_database}:$T->{source_primary_id}";
      $old_qtl = $T->{qtl_id};
    }
  }
  &p( QTLLine( $dbspecies, $old_ID, $old_pos, $IDS, $desc, $counter ), $nogzip, $fh);
  return footer( $counter->(),$nogzip,$fh);

}

sub QTLLine {
  my ($species, $key, $pos, $action, $IDS, $desc, $counter ) = @_;
  return '' unless $key;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$key">
  <description>$desc</description>
  <additional_fields>);
  $xml .= &common_fields($species,'QTL');
  foreach my $id (@$IDS) {
    $xml .= qq(
    <field name="synonym">$id</field>);
  }
  $xml .= qq(
    <field name="action">$action</field>
    <field name="location">$pos</field>
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}

########################################################################## Variations

sub dumpVariation {
  my $conf = shift;
  my $VariationDB = $conf->{'dbs'}->{'DATABASE_VARIATION'};
  return unless $VariationDB;

  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${VariationDB}_Variation.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'variation';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh,  ">$file"  ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $VariationDB Variations to $file... ", &date_and_mem , "\n";
  header( $VariationDB, $dbspecies, 'variation', $nogzip, $fh, $release );

  my ($counter2,$file2,$fh2);
  my $somatic = { map { @$_ } @{$conf->{'dbh'}->selectall_arrayref( "select source_id, somatic_status from $VariationDB.source" )} };
  my $somatic_present = ( grep { $somatic->{$_} } keys %$somatic ) ? 1 : 0;
  if ($somatic_present) {
    $counter2 = make_counter(0);
    $file2 = $conf->{'directory_somatic_mutations'}."/${VariationDB}_SomaticMutation.xml";
    $file2 .= ".gz" unless $nogzip;
    unless ($nogzip) {
      $fh2 = new IO::Zlib;
      $fh2->open( "$file", "wb9" )
	or die("Can't open compressed stream to $file2: $!");
    }
    else {
      open( $fh2, ">$file2" ) or die "Can't open $file2: $!";
    }
    print LOG "  Dumping $VariationDB SomaticMutations to $file2 ... ", &date_and_mem, "\n";
    header( $VariationDB, $dbspecies, 'somatic mutation', $nogzip, $fh2, $release );
  }

  my $sources = { map { @$_ } @{$conf->{'dbh'}->selectall_arrayref( "select source_id, name from $VariationDB.source" )} };

  my $sth;

  print LOG "    Preparing to get SNP location info ", &date_and_mem, "\n";
  my $query = qq(
    SELECT vf.variation_id, concat(sr.name, ':', vf.seq_region_start, '-',  vf.seq_region_end, ':', vf.seq_region_strand ) as location
      FROM $VariationDB.variation_feature vf, $VariationDB.seq_region sr
     WHERE vf.seq_region_id = sr.seq_region_id);
  $sth = $conf->{'dbh'}->prepare($query);
  $sth->execute;
  my $locations;
  while ( my $rows = $sth->fetchrow_arrayref() ) {
    push @{$locations->{$rows->[0]}},$rows->[1];
  }
  print LOG "     Retrieved SNP location info ", &date_and_mem, "\n";

  print LOG "    Preparing to get extra SNP info (phenotypes etc) ", &date_and_mem, "\n";

  my %snp_extra = map { ($_->[0] => $_) }
    @{$conf->{'dbh'}->selectall_arrayref(
       "SELECT va.variation_id,
             group_concat( distinct sta.name SEPARATOR '; ') AS lsi,
             group_concat( distinct st.external_reference SEPARATOR ';') AS st,
             group_concat( distinct va.associated_gene SEPARATOR ';') AS gn,
             group_concat( distinct va.variation_names SEPARATOR ';') AS vars,
             group_concat( distinct
               if(
                 isnull(p.name),
                 p.description,
                 concat( p.description,' (',p.name,')' )
               )
               SEPARATOR ';'
             ) AS phen
        FROM $VariationDB.variation_annotation AS va left join
             $VariationDB.phenotype AS p ON p.phenotype_id = va.phenotype_id left join
             $VariationDB.study AS st ON st.study_id=va.study_id left join
             $VariationDB.associate_study AS sa ON sa.study1_id=st.study_id left join
             $VariationDB.study AS sta ON sta.study_id=sa.study2_id
       GROUP BY va.variation_id
       ORDER BY va.variation_id"
    )};

#  warn Data::Dumper::Dumper(\%snp_extra);

  print LOG "     Retrieved extra SNP info ", &date_and_mem, "\n";

  #disconnect and then connect to free up memory
  my $core = $conf->{'core_details'};
  $conf->{'dbh'}->disconnect;
  $conf->{'dbh'} = DBI->connect( "dbi:mysql:$core->{'NAME'};host=$core->{'HOST'};port=$core->{'PORT'}", $core->{'USER'}, $core->{'PASS'}, { PrintError => 0, RaiseError => 0} );

  my ($ssid_sth, $stmt,$ssid_hash);
  if ($dbspecies ne 'Homo_sapiens') {
    print LOG "    Preparing to get extra ssID info ", &date_and_mem, "\n";
    $stmt = qq{SELECT variation_id,
                      GROUP_CONCAT('ss',subsnp_id) as ssid
                 FROM $VariationDB.subsnp_map
             GROUP BY variation_id};
    $ssid_sth = $conf->{'dbh'}->prepare($stmt);
    $ssid_sth->execute();
    print LOG "     ...executed SQL, now retrieving data ", &date_and_mem, "\n";
    # Get a hashref of the results with the variation_id as key
    $ssid_hash = $ssid_sth->fetchall_hashref('variation_id');
    print LOG "     Retrieved extra ssID info ", &date_and_mem, "\n";

    #disconnect and then connect to free up memory
    $conf->{'dbh'}->disconnect;
    $conf->{'dbh'} = DBI->connect( "dbi:mysql:$core->{'NAME'};host=$core->{'HOST'};port=$core->{'PORT'}", $core->{'USER'}, $core->{'PASS'}, { PrintError => 0, RaiseError => 0} );
  }
  else {
    #for human we need to get the extra allele info as we go along (note, when we have 48G memory it might be worth trying agains to do it in one go)
    print LOG "    Will get extra ssID info as we go along ", &date_and_mem, "\n";
    $stmt = qq{SELECT GROUP_CONCAT('ss',subsnp_id) as ssid
                 FROM $VariationDB.subsnp_map
                WHERE variation_id = ?};
    $ssid_sth = $conf->{'dbh'}->prepare($stmt);
  }

  # Statement to get all variations, both mapped, failed and the precious non-mapped, non-failed
  print LOG "    Preparing to get all SNPs ", &date_and_mem, "\n";
  $sth = $conf->{'dbh'}->prepare(qq{
    SELECT
        v.variation_id,
        v.name,
        v.source_id,
        GROUP_CONCAT(
            vs.source_id,
            ' ',
            vs.name
        ),
        fd.description,
        v.somatic
    FROM
        $VariationDB.variation v LEFT JOIN
        $VariationDB.variation_synonym vs USING (variation_id) LEFT JOIN (
            $VariationDB.failed_variation fv JOIN
            $VariationDB.failed_description fd USING (failed_description_id)
        ) USING (variation_id)
    GROUP BY
        v.variation_id
    ORDER BY
        NULL});

  $sth->execute() or die "Error:", $DBI::errstr;
  print LOG "     ...executed SQL, now retrieving data and generating xml ", &date_and_mem,  "\n";

  my $new_syn_c = 0;
  while ( my $rowcache = $sth->fetchall_arrayref( undef, 10_000 ) ) {
    while ( my $row = shift( @{$rowcache} ) ) {
      my $variation_id       = $row->[0];
      my $variation_name     = $row->[1];
      my $source_id          = $row->[2];
      my $synonyms           = $row->[3];
      my $failed_desc        = $row->[4];
      my $somatic_var        = $row->[5];
      my $ssids;
      if ($ssid_hash) {
        $ssids = $ssid_hash->{$variation_id}{'ssid'};
      }
      else {
        #comment out these three lines if time constraints mean you need to generate indices without extra allele info
        $ssid_sth->execute($variation_id);
        (my $res) = $ssid_sth->fetchrow_arrayref();
        $ssids = $res->[0];
      }
      my %synonyms;
      my $syn_c = 0;
      foreach my $syn (split /,/, $synonyms) {
        my ($source_id,$sname) = split / /,$syn;
        $synonyms{$sname} = $sources->{$source_id};
        $syn_c++;
      }
      foreach my $ssid_syn (split /,/,$ssids) {
        unless (exists $synonyms{$ssid_syn}) {
          $synonyms{$ssid_syn} = 'dbSNP';
          $syn_c++;
          $new_syn_c++;
        }
      }
      my (@syns, @genes, @phenotypes, @studies);
      foreach my $syn (keys %synonyms) {
        push @syns, $syn;
      }

      my $somatic_mutation = $somatic_var;
      my $type             = $somatic_mutation ? 'Somatic Mutation' : 'Variation';
      my $snp_source       = $sources->{ $source_id };
      my $desc;
      my $locs = $locations->{$variation_id};

      my $x = $snp_extra{$variation_id};
      if( $x ) {
        push @syns,       split ';', $x->[1];
        push @genes,      split ';', $x->[3];
        push @syns, split ';', $x->[4];
        push @phenotypes, $x->[5] if $x->[5];
        push @studies,    split ';', $x->[2];
      }
      $desc = sprintf( "A $snp_source $type.%s%s%s",
                       @phenotypes  ? ' Phenotypes: ' .         (join ', ', @phenotypes) . '.' : '',
                       @genes       ? ' Gene Association(s): ' .(join ', ', @genes)      . '.' : '',
                       $failed_desc ? " $failed_desc."                                         : '');
      if ($somatic_mutation){
        &p( VariationLine( $dbspecies, $variation_name, \@syns, \@genes, \@phenotypes, \@studies, $locs, $snp_source, $desc, $counter2 ), $nogzip, $fh2);
      }
      else {
        &p( VariationLine( $dbspecies, $variation_name, \@syns, \@genes, \@phenotypes, \@studies, $locs, $snp_source, $desc, $counter ), $nogzip, $fh);
      }
    }
  }
  print LOG "    All SNPs dumped to file ", &date_and_mem,  "\n";

  my $c = footer( $counter->(),$nogzip,$fh,'Variation');
  if ($somatic_present) {
    $c += footer( $counter2->(),$nogzip,$fh2,'Somatic Mutation');
  }
  print LOG "  $new_syn_c synonyms added from the subsnp_map table\n";

  %snp_extra    = undef;
  $ssid_hash = undef;

  return $c;
}

sub VariationLine {
  my ($species, $name, $synonyms, $genes, $phenotypes, $studys, $locs, $source, $desc, $counter ) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$name">
  <description>$desc</description>
  <additional_fields>);
  $xml .= &common_fields($species,'Variation');
  foreach (@$locs) {
    $xml .= qq(
    <field name="location">$_</field>);
  }
  foreach (@$synonyms) {
    $xml .= qq(
    <field name="synonym">$_</field>);
  }
  foreach (@$genes) {
    $xml .= qq(
    <field name="assoc_gene">$_</field>);
  }
  foreach (@$phenotypes) {
    $xml .= qq(
    <field name="phenotype">$_</field>);
  }
  foreach (@$studys ) {
    $xml .= qq(
    <field name="study">$_</field>);
  }
  $xml .= qq(
    <field name="source">$source</field>
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}

########################################################################## Variation Phenotypes

sub dumpVariationPhenotype {
  my $conf = shift;
  my $VariationDB = $conf->{'dbs'}->{'DATABASE_VARIATION'};
  return unless $VariationDB;
  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${VariationDB}_VariationPhenotype.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'variation';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open $fh, '>', $file  or die "Can't open $file: $!";
  }
  print LOG "  Dumping $VariationDB Phenotype info to $file ... ", format_datetime($start_time), "\n";
  header( $VariationDB, $dbspecies, 'variation', $nogzip, $fh, $release );

  #phenotype info
  my $phenotype_info = $conf->{'dbh'}->selectall_hashref("select description, phenotype_id from $VariationDB.phenotype",'description');
  foreach my $phen (keys %$phenotype_info) {
    &p( VariationPhenLine( $dbspecies, $phen, $phenotype_info->{$phen}{'phenotype_id'}, $counter ), $nogzip, $fh);
  }

  return footer( $counter->(),$nogzip,$fh);
}

sub VariationPhenLine {
  my ($species, $name, $id, $counter ) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$id">
  <description>$name</description>
  <additional_fields>);
  $xml .= &common_fields($species,'Phenotype');
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}

########################################################################## StructuralVariations

sub dumpStructuralVariations {
  my $conf = shift;
  my $VariationDB = $conf->{'dbs'}->{'DATABASE_VARIATION'};
  return unless $VariationDB;
  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${VariationDB}_StructuralVariations.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'variation';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $VariationDB to $file ... ", format_datetime($start_time), "\n";
  header( $VariationDB, $dbspecies, 'variation', $nogzip, $fh, $release );

  my $scale           = $SD->ENSEMBL_GENOME_SIZE || 1;
  my $max_length      = $scale *= 1e6;

  my $T = $conf->{'dbh'}->selectall_arrayref(qq(
    select v.structural_variation_id,
           v.variation_name,
           s.name,
           s.description,
           r.name,
           v.seq_region_start,
           v.seq_region_end,
           group_concat(ssv.name)
      from $VariationDB.structural_variation as v, $VariationDB.supporting_structural_variation as ssv, $VariationDB.source as s, $VariationDB.seq_region as r
     where s.source_id = v.source_id 
       and v.seq_region_id = r.seq_region_id
       and v.structural_variation_id = ssv.structural_variation_id
     group by v.structural_variation_id));
  foreach my $row ( @$T ) {
    my $id = $row->[1];
    my $location = $row->[4] .":" . $row->[5] ."-". $row->[6];
    my $length = $row->[6] - $row->[5] + 1;
    my $evidence =  $row->[7];
    my $action = ($length >> $max_length) ? 'Overview' : 'View';
    my $params = ($length >> $max_length) ? { 'cytoview' => 'variation_feature_structural=normal' }
                                          : { 'contigviewbottom' => 'variation_feature_structural=normal' };
    my $desc = qq(A structural variation from $row->[2], identified by $row->[3]. This variation has been mapped to chromosome $row->[4], from $row->[5] to $row->[6]. );
    &p( StrucVarLine( $dbspecies, $id, $evidence, $location, $desc, $action, $params, $counter ), $nogzip, $fh);
  }
  return footer( $counter->(),$nogzip,$fh);
}

sub StrucVarLine {
  my ($species, $id, $evidence, $location, $desc, $action, $params, $counter ) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$id">
  <description>$desc</description>
  <additional_fields>);
  $xml .= &common_fields($species,'StructuralVariation');
  foreach (split ',', $evidence) {
    $xml .= qq(
    <field name="evidence">$_</field>);
  }
  $xml .= qq(
    <field name="location">$location</field>
    <field name="action">$action</field>);
  while (my ($param,$value) = each %$params) {
    $xml .= qq(
    <field name="$param">$value</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}


########################################################################## Similarity features

sub dumpGenomicAlignment {
  my $conf    = shift;
  my $dbspecies = $conf->{'species'};
  my $COREDB  = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $ESTDB   = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};
  my $CDNADB  = $conf->{'dbs'}->{'DATABASE_CDNA'};
  my $RNASEQDB = $conf->{'dbs'}->{'DATABASE_RNASEQ'};
  my %dbs      = ( 'core' => $COREDB );
  $dbs{'otherfeatures'} = $ESTDB  if $ESTDB;
  $dbs{'cdna'}          = $CDNADB if $CDNADB;
  $dbs{'rnaseq'}        = $RNASEQDB if $RNASEQDB;
  my %tables = (
    'dna_align_feature'     => [ 'DnaAlignFeature',     'DNA alignment feature' ],
    'protein_align_feature' => [ 'ProteinAlignFeature', 'Protein alignment feature' ]
  );
  my $total_af_c;
  foreach my $db ( sort { $b cmp $a } keys %dbs) {

#    next unless ($db eq 'rnaseq');

    my $counter    = make_counter(0);
    my $dbname     = $dbs{$db};
    my $file       = $conf->{'directory'}."/${dbname}_GenomicAlignment.xml";
    $file         .= ".gz" unless $nogzip;
    my $start_time = time;
    my $fh;
    unless ($nogzip) {
      $fh = new IO::Zlib;
      $fh->open( "$file", "wb9" ) or die("Can't open compressed stream to $file: $!");
    }
    else {
      open( $fh, ">$file" ) or die "Can't open $file: $!";
    }
    print LOG "  Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
    header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

    foreach my $table ( keys %tables ) {
      my $source = $tables{ $table }[0];
      $source .= ";db=$db" unless $db eq 'core';
      my $type   = $tables{ $table }[1];
      my $sth    = $conf->{'dbh'}->prepare(
        "select ad.display_label, a.logic_name, ad.displayable, t.hit_name, ad.web_data, edb.db_name, count(*) as hits
                from (($dbname.analysis as a, $dbname.$table as t) left join
                     $dbname.analysis_description ad on a.analysis_id = ad.analysis_id) 
           left join $dbname.external_db edb on t.external_db_id = edb.external_db_id
               where a.analysis_id = t.analysis_id
                 and ad.displayable = 1
               group by ad.display_label, t.hit_name");
      $sth->execute();
      my $c = 0;
      while( my( $label, $logic_name, $displayable, $hid, $webdata, $db_name, $count  ) = $sth->fetchrow_array ) {
        my $wd = eval($webdata);
        next if $wd->{$table}{'do_not_display'};
        next unless $displayable;
        $c++;
        $label ||= '';
        my $desc = "$label $hid hits the genome in $count locations.";
        (my $track_name = $table) =~ s/feature//;
        my $renderer = $track_name . $db . '_' . lc($logic_name) . '=stack' ;
        my $params = { 'contigviewbottom' => $renderer };
        &p( GenomicAlignmentLine( $dbspecies, $hid, $desc, $db_name, $db, $tables{$table}->[0], $params, $counter ), $nogzip, $fh);
      }
      print LOG "  ...Dumped $c $table entries\n";
    }
    if ($db eq 'cdna') {
      my $sth    = $conf->{'dbh'}->prepare(
        "select ad.display_label, uo.identifier,  ur.summary_description, 'Unmapped feature'
           from $dbname.unmapped_object uo, $dbname.unmapped_reason ur, $dbname.analysis a 
                left join $dbname.analysis_description ad on a.analysis_id = ad.analysis_id
          where a.analysis_id = uo.analysis_id
            and uo.unmapped_reason_id = ur.unmapped_reason_id");
      $sth->execute();
      my $c++;
      while (my ($label, $hid, $reason, $type) = $sth->fetchrow_array ) {
        $c++;
        $label ||= '';
        my $desc = "$label $hid fails to map to the genome. Reason: $reason.";
        &p( GenomicAlignmentLine( $dbspecies, $hid, $desc, undef, $db, 'Unmapped feature', {}, $counter ), $nogzip, $fh);
      }
      print LOG "  ...Dumped $c Unmapped features\n";
    }
    $total_af_c += footer( $counter->(),$nogzip,$fh);
  }
  return $total_af_c;
}

sub GenomicAlignmentLine {
  my ($species, $id, $desc, $db_name, $db, $ftype, $params, $counter ) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$id">
  <description>$desc</description>);
  if ($db_name) {
    $xml .= qq(
  <cross_references>
    <ref dbname="$db_name" dbkey="$id"/>
  </cross_references>);
    $cross_references{$db_name}++;
  }
  $xml .= qq(
  <additional_fields>);
  $xml .= &common_fields($species,$ftype);
  unless ($db eq 'core') {
    $xml .= qq(
    <field name="db">$db</field>);
  }
  while (my ($param,$value) = each %$params) {
    $xml .= qq(
    <field name="$param">$value</field>);
  }
  if ( $id =~ /^(\w+)\.\d+$/) {
    $xml .= qq(
    <field name="non_versioned">$1</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}


########################################################################## OligoProbes

sub dumpOligoProbe {
  my $conf = shift;
  my $FUNCDB = $conf->{'dbs'}->{'DATABASE_FUNCGEN'};
  return unless $FUNCDB;
  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${FUNCDB}_OligoProbe.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'funcgen';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $FUNCDB to $file ... ", format_datetime($start_time), "\n";
  header( $FUNCDB, $dbspecies, $db, $nogzip, $fh, $release );

  my $sth    = $conf->{'dbh'}->prepare(
    "select ps.name, count(distinct pf.probe_feature_id) as hits, a.vendor
        from $FUNCDB.probe_set ps, $FUNCDB.probe p, $FUNCDB.probe_feature pf, $FUNCDB.array_chip ac, $FUNCDB.array a, $FUNCDB.status s, $FUNCDB.status_name sn
      where sn.name='MART_DISPLAYABLE'
        and sn.status_name_id=s.status_name_id
        and s.table_name='array'
        and s.table_id=a.array_id
        and ps.probe_set_id = p.probe_set_id
        and p.probe_id = pf.probe_id
        and p.array_chip_id = ac.array_chip_id
        and ac.array_id = a.array_id
      group by ps.name, a.vendor"
  );
  $sth->execute();
  while( my( $hid, $count, $type ) = $sth->fetchrow_array ) {
    next unless $hid;
    my $desc =  qq($type probeset $hid hits the genome in $count locations.);
    &p( OligoProbeLine($dbspecies,$hid,$desc,'pset',$db,$counter ), $nogzip, $fh);
  }

  my $sth    = $conf->{'dbh'}->prepare(
    "select group_concat(distinct p.name), count(distinct pf.probe_feature_id) as hits, a.vendor
        from $FUNCDB.probe p, $FUNCDB.probe_feature pf, $FUNCDB.array_chip ac, $FUNCDB.array a, $FUNCDB.status s, $FUNCDB.status_name sn
      where sn.name='MART_DISPLAYABLE'
        and sn.status_name_id=s.status_name_id
        and s.table_name='array'
        and s.table_id=a.array_id
        and p.probe_set_id is NULL
        and p.probe_id = pf.probe_id
        and p.array_chip_id = ac.array_chip_id
        and ac.array_id = a.array_id
      group by p.probe_id, a.vendor"
  );
  $sth->execute();
  while( my( $hid, $count, $type ) = $sth->fetchrow_array ) {
    next unless $hid;
    my $desc = qq($type probe $hid hits the genome in $count locations.);
    &p( OligoProbeLine($dbspecies,$hid,$desc,'probe',$db,$counter ), $nogzip, $fh);
  }
  return footer( $counter->(),$nogzip,$fh);
}

sub OligoProbeLine {
  my ($species,$hid,$desc,$pset,$db,$counter) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$hid">
  <description>$desc</description>);
  $xml .= qq(
  <additional_fields>);
  $xml .= &common_fields($species,'ProbeFeature');
  $xml .= qq(
    <field name="db">$db</field>
    <field name="ptype">$pset</field>
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}


########################################################################## Domains

sub dumpDomain {
  my $conf = shift;
  my $dbname    = $conf->{'dbs'}{'DATABASE_CORE'};
  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${dbname}_Domain.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'core';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
  header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

  my $sth = $conf->{'dbh'}->prepare(
    "select x.dbprimary_acc, i.id, x.description
       from xref as x, interpro as i
      where x.dbprimary_acc = i.interpro_ac
      order by x.dbprimary_acc");
  $sth->execute();
  my $old_acc     = '';
  my $IDS         = [];
  my $description = '';
  my $count       = 0;
  my ($acc, $id, $desc, $old_desc );
  while( ($acc, $id, $desc ) = $sth->fetchrow_array()){
    if($acc eq $old_acc) {
      push @$IDS, $id;
      $count++;
    }
    else {
      if ($old_acc) {
        $description = "Interpro domain $old_acc [$old_desc] has";
        &p( DomainLine($dbspecies,$old_acc,$description,$IDS,$count,$counter ), $nogzip, $fh);
      }
      $IDS      = [ $id ];
      $old_acc  = $acc;
      $old_desc = $desc;
      $count    = 1;
    }
  }
  $description = "Interpro domain $old_acc [$old_desc] has";
  &p( DomainLine($dbspecies,$old_acc,$description,$IDS,$count,$counter ), $nogzip, $fh);
  return footer( $counter->(),$nogzip,$fh);
}

sub DomainLine {
  my($species, $acc, $desc, $IDS, $count, $counter) = @_;
  $species =~ s/_/ /;
  my $ids = join ',', @$IDS;
  my $description = "$desc $count associated external database identifiers: $ids";
  my $xml = qq(
<entry id="$acc">
  <description>$description</description>
  <additional_fields>);
  $xml .= &common_fields($species,'Domain');
  foreach my $id (@$IDS) {
    $xml .= qq(
    <field name="synonym">$id</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}


########################################################################## Families

sub dumpFamily {
  my $conf   = shift;
  my $FAMDB  = $conf->{'dbs'}->{'DATABASE_COMPARA'};
  my $dbname = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $t_sth  = $conf->{'dbh'}->prepare("select meta_value from $dbname.meta where meta_key='species.taxonomy_id'");
  $t_sth->execute;
  my $taxon_id = ($t_sth->fetchrow);
  return unless $taxon_id;

  my $counter   = make_counter(0);
  my $dbspecies = $conf->{'species'};
  my $file = $conf->{'directory'}."/${dbname}_Family.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $db = 'core';
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
  header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

 ( my $species = $conf->{'species'} ) =~ s/_/ /g;

  my $sth = $conf->{'dbh_compara'}->prepare(qq(
    select f.stable_id,
           f.description,
           count(*) as N,
           sum( (source_name = 'ENSEMBLGENE') and (m.genome_db_id = gd.genome_db_id) ) as ensembl_genes_species,
           sum( (source_name = 'ENSEMBLPEP' ) and (m.genome_db_id = gd.genome_db_id) ) as ensembl_peptides_species,
           sum(  source_name = 'ENSEMBLGENE'                                         ) as ensembl_genes,
           sum(  source_name = 'ENSEMBLPEP'                                          ) as ensembl_peptides,
           sum( (source_name = 'Uniprot/SPTREMBL' ) and (m.taxon_id = gd.taxon_id)   ) as uniprot_sptrembl_species,
           sum( (source_name = 'Uniprot/SWISSPROT') and (m.taxon_id = gd.taxon_id)   ) as uniprot_swissprot_species,
           sum(  source_name = 'Uniprot/SPTREMBL'                                    ) as uniprot_sptrembl,
           sum(  source_name = 'Uniprot/SWISSPROT'                                   ) as uniprot_swissprot
      from family as f, family_member as fm, member as m, genome_db as gd
     where f.family_id=fm.family_id and fm.member_id=m.member_id and
           gd.name = '$dbspecies'
     group by f.family_id 
    having ensembl_genes_species > 0));
  $sth->execute();
  my $X = $conf->{'authority'} || 'Ensembl';
  while( my(
    $fid, $desc, $total,
    $ensembl_genes_species, $ensembl_peptides_species, $ensembl_genes, $ensembl_peptides,
    $uniprot_sptrembl_species, $uniprot_swissprot_species,
    $uniprot_sptrembl, $uniprot_swissprot
  ) = $sth->fetchrow_array()) {
    $desc =~ s/(\S+)\/(\S+)/$1 \/ $2/g;
    my $full_desc = qq(Ensembl protein family $fid [$desc] has $total members: $ensembl_genes $X genes ($ensembl_genes_species in $species); $ensembl_peptides $X proteins ($ensembl_peptides_species in $species); $uniprot_swissprot UniProtKB / Swiss-Prot proteins ($uniprot_swissprot_species in $species); $uniprot_sptrembl UniProtUK/TrEMBL proteins ($uniprot_sptrembl_species in $species).);
    &p( FamilyLine($dbspecies,$fid,$full_desc,$counter ), $nogzip, $fh);
  }
  return footer( $counter->(),$nogzip,$fh);
}

sub FamilyLine {
  my($species, $fid, $desc, $counter) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$fid">
  <description>$desc</description>);
  $xml .= qq(
  <additional_fields>);
  $xml .= &common_fields($species,'Family');
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}

########################################################################## Sequences

sub dumpSequence {
  my $conf = shift;
  my $sanger = sanger_project_names( $conf );
  my @misc_feat_disallowed = (); #features that we know we don't want to dump if we ever decide to do this (not implemented)
  my @name_order = (qw(name well_name clone_name sanger_project synonym embl_acc)); #defines the order of attribute_type we use to get the name for misc_features (comes from B::E::G::_clone.pm)
  my $COREDB   = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $OTFEATDB = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};
  my %dbs      = ( 'core' => $COREDB );
  $dbs{'otherfeatures'} =  $OTFEATDB if $OTFEATDB;

  my $total_c;
 DB:
  foreach my $db ( sort { $b cmp $a } keys %dbs) {
    my $dbname    = $dbs{$db};
    my $counter   = make_counter(0);
    my $dbspecies = $conf->{'species'};
    my $file = $conf->{'directory'}."/${dbname}_Sequence.xml";
    $file .= ".gz" unless $nogzip;
    my $start_time = time;
    my $fh;
    unless ($nogzip) {
      $fh = new IO::Zlib;
      $fh->open( "$file", "wb9" )
        or die("Can't open compressed stream to $file: $!");
    }
    else {
      open( $fh, ">$file" ) or die "Can't open $file: $!";
    }
    print LOG "  Dumping $dbname to $file ... ", format_datetime($start_time), "\n";
    header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

    if ($db eq 'core') {
      my $lrgs = &findLRGs($conf);
      if (@$lrgs) {
        &sort_lrgs($dbspecies,'Sequence',$fh,$lrgs);
      }
    }

    #get all types of misc features - used for 'type' label
    my $feat_types = $conf->{'dbh'}->selectall_hashref(qq(
                  SELECT ms.code, ms.misc_set_id, ma.value as type 
                    FROM $dbname.attrib_type at, $dbname.misc_attrib ma, $dbname.misc_feature_misc_set mfms, $dbname.misc_set ms
                   WHERE at.attrib_type_id = ma.attrib_type_id
                     AND ma.misc_feature_id = mfms.misc_feature_id 
                     AND mfms.misc_set_id = ms.misc_set_id 
                     AND at.code = 'type'
                   GROUP by ms.code, ma.value), 'misc_set_id');
    if (%$feat_types) {
      my $mapsets = join ',', keys %$feat_types;
      #get all misc_features
      my $sth = $conf->{'dbh'}->prepare(qq(
       SELECT mf.misc_feature_id, sr.name, cs.name, mf.seq_region_start, mf.seq_region_end,
              mf.seq_region_end-mf.seq_region_start+1 as len, ms.misc_set_id, at.code, ma.value
         FROM $dbname.misc_feature_misc_set as ms,
              $dbname.misc_feature as mf,
              seq_region as sr,
              coord_system as cs,
              $dbname.misc_attrib as ma,
              $dbname.attrib_type as at
        WHERE cs.coord_system_id = sr.coord_system_id
          AND sr.seq_region_id = mf.seq_region_id 
          AND mf.misc_feature_id = ms.misc_feature_id
          AND ms.misc_set_id in ($mapsets)
          AND mf.misc_feature_id = ma.misc_feature_id
          AND ma.attrib_type_id = at.attrib_type_id
        ORDER by mf.misc_feature_id, at.code));
      $sth->execute();

      my ($old_ID,$old_sr,$old_sr_type,$old_start,$old_end,$old_len,$NAMES,$ftype);
      while( my($ID,$sr,$sr_type,$start,$end,$len,$ms_id,$code,$val) = $sth->fetchrow_array() ) {
        if($ID == $old_ID) {
          if (! $ftype && $code eq 'type') {
            $ftype = $feat_types->{$ms_id}{'type'};
            #some hacks for the display of the type
            $ftype =~ s/_/ /;
            $ftype =~ s/arrayclone/clone/;
            $ftype = ucfirst($ftype);
          }
          if (grep {$code eq $_} @name_order) {
            push @$NAMES, [$code,$val];
          }
        }
        else {
          if ($old_ID) {
            my ($name, $synonyms) = &sort_mf_names($NAMES,\@name_order);
            &p( SeqLine($dbspecies,$old_sr,$old_sr_type,$old_start,$old_end,$old_len,$name,$synonyms,$ftype,$sanger,$counter), $nogzip, $fh);
            $NAMES = undef;
            $ftype = undef;
          }
          if (! $ftype && $code eq 'type') {
            $ftype = $feat_types->{$ms_id}{'type'};
            $ftype =~ s/_/ /;
            $ftype =~ s/arrayclone/clone/;
            $ftype = ucfirst($ftype);
          }
          if (grep {$code eq $_} @name_order) {
            push @$NAMES, [$code,$val];
          }
          ($old_ID,$old_sr,$old_sr_type,$old_start,$old_end,$old_len) = ($ID,$sr,$sr_type,$start,$end,$len);
        }
      }
      my ( $name, $synonyms) = sort_mf_names($NAMES,\@name_order);
      &p( SeqLine($dbspecies,$old_sr,$old_sr_type,$old_start,$old_end,$old_len,$name,$synonyms,$ftype,$sanger,$counter), $nogzip, $fh);

    }
    if ($db ne 'core') {
      $total_c += footer( $counter->(),$nogzip,$fh);
      next DB;
    };

    #identify current default top level
    (my $current_cs_id) = $conf->{'dbh'}->selectrow_array(qq(
       SELECT cs.coord_system_id
         FROM coord_system cs, meta m
        WHERE cs.version = m.meta_value
          AND cs.name = 'chromosome'
          AND m.meta_key = 'assembly.default'));

    #get mappings between toplevel assemblies - used to check there are no name changes on the different assemblies
    my $sth = $conf->{'dbh'}->prepare(qq(
       SELECT distinct sr1.name, sr1.length, sr2.name, cs2.coord_system_id
         FROM coord_system cs1, seq_region sr1, assembly a, seq_region sr2, coord_system cs2
        WHERE cs1.coord_system_id = sr1.coord_system_id
          AND sr1.seq_region_id = a.asm_seq_region_id
          AND a.cmp_seq_region_id = sr2.seq_region_id
          AND sr2.coord_system_id = cs2.coord_system_id
          AND cs1.name = 'chromosome'
          AND cs2.name = 'chromosome'));
    my $mapped_ids;
    $sth->execute;
    while (my ($new_name,$new_length,$prev_name,$prev_coord_system_id) =  $sth->fetchrow_array() ) {
      $mapped_ids->{$prev_coord_system_id}{$prev_name}{'name'}   = $new_name;
      $mapped_ids->{$prev_coord_system_id}{$prev_name}{'length'} = $new_length;
    }

    #get all seq_regions
    $sth = $conf->{'dbh'}->prepare(qq(
       SELECT sr.name, sr.length, cs.name, cs.coord_system_id, srs.synonym
         FROM coord_system as cs, seq_region as sr
         LEFT JOIN seq_region_synonym as srs on sr.seq_region_id = srs.seq_region_id
        WHERE sr.coord_system_id = cs.coord_system_id));
    $sth->execute();
    my ($old_name,$old_type,$old_length);
    my $synonyms = [];
    while( my($name,$length,$type,$cs_id,$syn) = $sth->fetchrow_array() ) {
      my ($prev_name,$prev_length,$mapping_text);
      next if $type eq 'lrg';
      if ($type eq 'chromosome' && $cs_id != $current_cs_id) {
        # if this is an old seq_region mapped to a new one of the same name then skip it
        if ($mapped_ids->{$cs_id}{$name}) {
          $prev_name   = $mapped_ids->{$cs_id}{$name}{'name'};
          if ($prev_name ne $name) {
            print LOG "WARNING: found assembly mapped seq_region $prev_name instead of $name - please check this is correct since this we haven't come across this before\n";
          }
          next;
        }
        else {
          print LOG "WARNING: not dumping $name on coord_system $cs_id since it's an old assembly with no mapping - investigate what this means!\n";
          next;
        }
      }
      if ($old_name eq $name) {
	push @$synonyms, $syn if $syn;
      }
      else {
	if ($old_name) {
	  &p( SeqLine($dbspecies,$old_name,$old_type,1,$old_length,$old_length,$old_name,$synonyms,$old_type,$sanger,$counter), $nogzip, $fh);
	}
	$old_name = $name;
	$old_type = $type;
	$old_length = $length;
	$synonyms = [];
	push @$synonyms, $syn if $syn;
      }
    }
    &p( SeqLine($dbspecies,$old_name,$old_type,1,$old_length,$old_length,$old_name,$synonyms,$old_type,$sanger,$counter), $nogzip, $fh);

    $total_c += footer( $counter->(),$nogzip,$fh);
  }
  return $total_c;
}

#decide which of the possible many name entries we should be using, the rest will be synonyms
sub sort_mf_names {
  my ($all_names, $name_order) = @_;
  my $name_to_use;
  foreach my $name_type ( @$name_order ) {
    unless ($name_to_use) {
      foreach my $name (@$all_names) {
        if ($name->[0] eq $name_type) {
          $name_to_use = $name->[1];
        }
      }
    }
  }
  my @synonyms = map {$_->[1]} grep { $_->[1] ne $name_to_use } @$all_names;
  return $name_to_use, \@synonyms ;
}

sub SeqLine {
  my($species,$sr,$sr_type,$start,$end,$len,$name,$synonyms,$type,$sanger,$counter) = @_;
  $species =~ s/_/ /;
  print LOG "WARNING: no type set for $name" unless $type;
  my $action = $len > 0.5e6 ? 'Overview' : 'View';
  my $r = "$sr:$start-$end";
  my $desc = ($name eq $sr) ? "$type $name has a length of $len bp." : "$type $name (length $len bp) is mapped to $sr_type $sr.";
  my $extra_desc = '';

  #deal with any synonyms
  if (@$synonyms) {
    $extra_desc = " It has EMBL accessions / synonyms of " . join(',',@$synonyms) . ".";
  }

 #deal with with Sanger Projects
  my %extra;
  foreach my $n ($name, @$synonyms) {
    foreach( keys %{$sanger->{$n}||{}} ) {
      $extra{$_}=1;
    }
  }
  if (%extra) {
    foreach my $k (keys %extra) {
      push @$synonyms, $k unless grep {$_ eq $k} @$synonyms;
    }
    $extra_desc .= " It is mapped to the following Sanger projects: ".join( ', ',sort keys %extra ) . ".";
  }

  $desc .= $extra_desc;

  my $xml = qq(
<entry id="$name">
  <description>$desc</description>);
  $xml .= qq(
  <additional_fields>);
  $xml .= &common_fields($species,'Sequence');
  $xml .= qq(
    <field name="location">$r</field>
    <field name="action">$action</field>);
  foreach my $syn (@$synonyms) {
    $xml .= qq(
    <field name="synonym">$syn</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . '</entry>';
}

#retrieve sanger project name details
sub sanger_project_names {
  my $conf = shift;
  my %SPECIES = qw(
    Homo_sapiens Human Mus_musculus Mouse Danio_rerio Zebrafish
    Drosophila_melanogaster Drosophila Rattus_norvegicus Rat
  );
  my $sanger_species_name = $SPECIES{$conf->{'species'}};
  return {} unless $sanger_species_name;
  my $clones = {};
  my $dbh = DBI->connect("DBI:mysql:host=otterlive;port=3301;database=submissions", 'ottro', undef, {RaiseError => 1});
  unless( $dbh ) {
    warn "Can't connect to submissions database as 'read_only' ", DBI::errstr();
    return $clones;
  }
  my $sth = $dbh->prepare(
    "SELECT distinct a.project_name, a.accession 
       FROM project_acc a, project_dump d,
            sequence s, species_chromosome c
      WHERE a.sanger_id = d.sanger_id AND d.seq_id = s.seq_id
        AND s.chromosome_id = c.chromosome_id 
        AND c.species_name = '$sanger_species_name' AND a.accession != 'UNKNOWN'"
  );
  $sth->execute(  );
  while( my ($proj, $acc) = $sth->fetchrow() ) {
    $clones->{$acc}{$proj}=1;
  }
  return $clones;
}


########################################################################## Genes

sub dumpGene {
#  print LOG "Initial memory usage = ".&date_and_mem()."\n";
  my $conf      = shift;
  my $dbspecies = $conf->{'species'};
  my $COREDB    = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $ESTGENEDB = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};
  my $RNASEQDB = $conf->{'dbs'}->{'DATABASE_RNASEQ'};
  my %dbs = ( 'core' => $COREDB );
  $dbs{'otherfeatures'} = $ESTGENEDB if $ESTGENEDB;
#  $dbs{'rnaseq'} = $RNASEQDB if $RNASEQDB; #don't add these until the IDs mean something and the gene pages are usefull!

  my $total_c;
 DB:
  foreach my $db ( sort { $b cmp $a } keys %dbs) {
    my $dbname    = $dbs{$db};
    my $counter   = make_counter(0);
    my $file = $conf->{'directory'}."/${dbname}_Gene.xml";
    $file .= ".gz" unless $nogzip;
    my $start_time = time;
    my $fh;
    unless ($nogzip) {
      $fh = new IO::Zlib;
      $fh->open( "$file", "wb9" )
        or die("Can't open compressed stream to $file: $!");
    }
    else {
      open( $fh, ">$file" ) or die "Can't open $file: $!";
    }
    print LOG "  Dumping $dbname to $file ... ".&date_and_mem()."\n";
    header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

    #commented out since removed from Gene in e60
#    if ($db eq 'core') {
#      my $lrgs = &dumpLRGs($conf);
#      if (@$lrgs) {
#        &sort_lrgs($dbspecies,'Gene',$fh,$lrgs); 
#        print LOG "    Printed LRGs to $file ... ".&date_and_mem()."\n";
#      }
#    }

    my $external_dbs = $conf->{'dbh'}->selectall_hashref(
      'select external_db_id, db_name, db_display_name
         from external_db', 'external_db_id'
       );
    my $external_synonyms;
    my $es = $conf->{'dbh'}->selectall_arrayref(qq(select xref_id, synonym from external_synonym));
    foreach (@$es) {
      $external_synonyms->{$_->[0]}{$_->[1]} = 1;
    }
    my %xrefs = ();
    foreach my $type( qw(Gene Transcript Translation) ) {
#      print LOG "starting memory usage for type $type = ".&date_and_mem()."\n";
      my $sql = qq(
         SELECT ox.ensembl_id, x.display_label, x.dbprimary_acc,
                x.description, x.external_db_id, x.xref_id
           FROM $dbname.object_xref as ox, $dbname.xref as x
          WHERE ox.ensembl_object_type = '$type'
            AND ox.xref_id = x.xref_id );
      my $T = $conf->{'dbh'}->selectall_arrayref($sql);
      foreach (@$T) {
        push @{$xrefs{$type}{$_->[0]}{$external_dbs->{$_->[4]}{'db_name'}}}, {
          'xref_description'=> $_->[3],
          'db_acc'          => $_->[2],
          'label'           => $_->[1],
          'external_db_id'  => $_->[4],
          'xref_id'         => $_->[5],
        };
      }
      print LOG "    $type xref query done ".&date_and_mem()."\n";
    }

    #disconnect and then connect to free up memory
    my $core = $conf->{'core_details'};
    $conf->{'dbh'}->disconnect;
    $conf->{'dbh'} = DBI->connect( "dbi:mysql:$core->{'NAME'};host=$core->{'HOST'};port=$core->{'PORT'}", $core->{'USER'}, $core->{'PASS'}, { PrintError => 0, RaiseError => 0} );

    my %exons = ();
    my $sql = qq(
       SELECT distinct t.gene_id, esi.stable_id
         FROM $dbname.transcript as t, $dbname.exon_transcript as et, $dbname.exon_stable_id as esi
        WHERE t.transcript_id = et.transcript_id
          AND et.exon_id = esi.exon_id);
    my $sth = $conf->{'dbh'}->prepare($sql);
    $sth->execute;
    while (my $r = $sth->fetchrow_arrayref()) {
      $exons{$r->[0]}{$r->[1]}=1;
    }
    $sth->finish;
    print LOG "    Exon query done ".&date_and_mem()."\n";

    my $gene_locations = $conf->{'dbh'}->selectall_hashref(qq(
      SELECT gsi.stable_id, sr.name as sr_name, g.seq_region_start, g.seq_region_end, g.seq_region_strand, cs.name as sr_type, cs.version
        FROM $dbname.gene_stable_id gsi, $dbname.gene g, $dbname.seq_region sr, $dbname.coord_system cs
       WHERE gsi.gene_id = g.gene_id
         AND g.seq_region_id = sr.seq_region_id
         AND sr.coord_system_id = cs.coord_system_id),
      'stable_id');

    $sql = qq(
      SELECT gsi.gene_id, tsi.transcript_id, trsi.translation_id,
             gsi.stable_id as gsid, tsi.stable_id as tsid, trsi.stable_id as trsid,
             g.description, x.display_label, ad.display_label, ad.web_data, ad.displayable, g.status, g.biotype, a.logic_name
        FROM ((( $dbname.gene_stable_id as gsi, $dbname.gene as g,
             $dbname.transcript_stable_id as tsi,
             $dbname.analysis_description as ad,
             $dbname.analysis as a,
             $dbname.transcript as t) left join
             $dbname.translation as tr on t.transcript_id = tr.transcript_id) left join
             $dbname.translation_stable_id as trsi on tr.translation_id = trsi.translation_id) left join
             $dbname.xref as x on g.display_xref_id = x.xref_id
       WHERE t.gene_id = gsi.gene_id 
         AND t.transcript_id = tsi.transcript_id
         AND t.gene_id = g.gene_id
         AND g.analysis_id = ad.analysis_id
         AND g.analysis_id = a.analysis_id
         AND a.logic_name != 'LRG_import'
       ORDER by gsi.stable_id, tsi.stable_id);
    $sth = $conf->{'dbh'}->prepare($sql);
    $sth->execute();

    my $old;
    my ($gid,$tid,$tlid,$gsid,$tsid,$tlsid,$desc,$disp_xref,$ad_dl,$webdata,$displayable,$status,$biotype,$ln);
    while (my $r = $sth->fetchrow_arrayref()) {
      ($gid,$tid,$tlid,$gsid,$tsid,$tlsid,$desc,$disp_xref,$ad_dl,$webdata,$displayable,$status,$biotype,$ln) = @$r;
      my $wd = eval($webdata);
      next if $wd->{'gene'}{'do_not_display'};
      next if ($ln =~ /rnaseq/);
      if( $old->{'gid'} != $gid ) {
        if( $old->{'gid'} ) {
          my $type = $old->{'biotype'}.' '.$old->{'ad_dl'};
          &p( GeneLine($dbspecies,
                       $old->{'gsid'},
                       $old->{'tsids'},
                       $old->{'tlsids'},
                       $old->{'exons'},
                       $old->{'biotype'},
                       $old->{'ad_dl'},
                       $old->{'disp_xref'},
                       $old->{'desc'},
                       $old->{'xrefs'},
                       $old->{'xref_descriptions'},
                       $gene_locations->{$old->{'gsid'}},
                       $db,
                       $counter),
              $nogzip, $fh);
        }

        #only use status for vega#

        $old = {
          'gid'       => $gid,
          'gsid'      => $gsid,
          'tlsids'    => {$tlsid?($tlsid=>1):()},
          'tsids'     => {$tsid ?($tsid=>1) :()}, 
          'desc'      => $desc,
          'exons'     => {},
          'biotype'   => $biotype,
          'ad_dl'     => $ad_dl,
          'xrefs'     => [],
          'xref_descriptions' => {},
          'disp_xref' => $disp_xref ? "$disp_xref" : "novel gene",
          'status'    => $status,
          'biotype'   => $biotype,
          'ad_dl'     => $ad_dl,
        };

        $old->{'exons'}  = $exons{$gid};

        #add xref details
        foreach my $type (qw(Gene Transcript Translation)) {
          my $id = ($type eq 'Gene')        ? $gid
                 : ($type eq 'Transcript')  ? $tid
                 : ($type eq 'Translation') ? $tlid
                 : '';
          while (my ($db_name,$xrefs) = each %{$xrefs{$type}{$id}}) {
            foreach my $dets (@$xrefs) {
              my $xref_id = $dets->{'xref_id'};
              $dets->{'db_name'} = $db_name;
              $old->{'xref_descriptions'}{$dets->{'xref_description'}}++ ;
              foreach my $syn (keys %{$external_synonyms->{$xref_id}}) {
                push @{$dets->{'synonyms'}},$syn;
              }
              push @{$old->{'xrefs'}}, $dets;
            }
          }
        }
      }
      else {
        $old->{'tsids' }{$tsid }=1;
        $old->{'tlsids'}{$tlsid}=1 if $tlsid;

        #add xref details
        foreach my $type (qw(Gene Transcript Translation)) {
          my $id = ($type eq 'Gene')        ? $gid
                 : ($type eq 'Transcript')  ? $tid
                 : ($type eq 'Translation') ? $tlid
                 : '';
          while (my ($db_name,$xrefs) = each %{$xrefs{$type}{$id}}) {
            foreach my $dets (@$xrefs) {
              my $xref_id = $dets->{'xref_id'};
              $dets->{'db_name'} = $db_name;
              $old->{'xref_descriptions'}{$dets->{'xref_description'}}++;
              foreach my $syn (keys %{$external_synonyms->{$xref_id}}) {
                push @{$dets->{'synonyms'}},$syn;
              }
              push @{$old->{'xrefs'}}, $dets;
            }
          }
        }
      }
    }
    &p( GeneLine($dbspecies,
                 $old->{'gsid'},
                 $old->{'tsids'},
                 $old->{'tlsids'},
                 $old->{'exons'},
                 $old->{'biotype'},
                 $old->{'ad_dl'},
                 $old->{'disp_xref'},
                 $old->{'desc'},
                 $old->{'xrefs'},
                 $old->{'xref_descriptions'},
                 $gene_locations->{$old->{'gsid'}},
                 $db,
                 $counter),
        $nogzip, $fh) if ($old->{'gsid'});
    print LOG "    Gene info query done ".&date_and_mem()."\n";
#    exit;

    my $other_count = 0;
    my %current_stable_ids =();
    foreach my $type (qw(gene transcript translation)) {
      $current_stable_ids{$type} = { map {@$_} @{$conf->{'dbh'}->selectall_arrayref( "select stable_id,1 from $COREDB.".$type."_stable_id" )}};
    }
    my $species = $conf->{'species'};
    my $sth = $conf->{'dbh'}->prepare( qq(
    SELECT sie.type, sie.old_stable_id, if(isnull(sie.new_stable_id),'NULL',sie.new_stable_id),
           ms.old_release*1.0 as X, ms.new_release*1.0 as Y
      FROM $dbname.mapping_session as ms, $dbname.stable_id_event as sie
     WHERE ms.mapping_session_id = sie.mapping_session_id and ( old_stable_id != new_stable_id or isnull(new_stable_id) )
     ORDER by Y desc, X desc
  ));

    $sth->execute();
    my %mapping = ();
    while( my($type,$osi,$nsi) = $sth->fetchrow_array() ) {
      next if $current_stable_ids{$type}{$osi}; ## Don't need to cope with current IDS already searchable...
      $mapping{$type}{$osi}{$nsi} = 1;
      if($mapping{$type}{$nsi}) {
        foreach( keys %{$mapping{$type}{$nsi}} ) {
          $mapping{$type}{$osi}{$_} = 1;
        }
      }
    }
    foreach my $type ( keys %mapping ) {
      foreach my $osi ( keys %{$mapping{$type}} ) {
        my @current_sis = ();
        my @deprecated_sis = ();
        foreach ( keys %{$mapping{$type}{$osi}} ) {
          next if $osi eq $_;
          if( $current_stable_ids{$_} ) {
            push @current_sis,$_;
          } elsif( $_ ne 'NULL' ) {
            push @deprecated_sis,$_;
          }
        }
        if( @current_sis ) {
          $other_count++;
          my $desc = qq(Ensembl $type $osi is no longer in the database but it has been mapped to the following current identifiers: @current_sis);
          $desc .= @deprecated_sis ? qq(; and the following deprecated identifiers: @deprecated_sis) : '';
          &p (&GeneStableIdMappingLine($dbspecies,$desc,$osi,\@current_sis,\@deprecated_sis,$type,$db,$counter),$nogzip, $fh);
        }
        elsif( @deprecated_sis ) {
          $other_count++;
          my $desc = qq(Ensembl $type $osi is no longer in the database but it has been mapped to the following identifiers: @deprecated_sis);
          &p (&GeneStableIdMappingLine($dbspecies,$desc,$osi,[],\@deprecated_sis,$type,$db,$counter),$nogzip, $fh);
        }
        else {
          $other_count++;
          my $desc = qq(Ensembl $type $osi is no longer in the database and has not been mapped to any newer identifiers);
          &p (&GeneStableIdMappingLine($dbspecies,$desc,$osi,[],[],$type,$db,$counter),$nogzip, $fh);
        }
      }
    }
    print LOG "    Stable_id mapping querying done ".&date_and_mem()."\n";
    $total_c += footer( $counter->(),$nogzip,$fh);

    while (0) {
      $other_count = 0;
      my %unmapped_queries = (
      'None' => qq(
      select a.logic_name, e.db_display_name,
             uo.identifier, ur.summary_description,
             'Not mapped'
        from $dbname.analysis as a, $dbname.external_db as e, $dbname.unmapped_object as uo,
             $dbname.unmapped_reason as ur
       where a.analysis_id = uo.analysis_id and
             uo.external_db_id = e.external_db_id and
             uo.unmapped_reason_id = ur.unmapped_reason_id and
               uo.ensembl_id = 0
    ),
      'Transcript' => qq(
      select a.logic_name, e.db_display_name,
             uo.identifier, ur.summary_description,
             concat( 'Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id )
        from $dbname.analysis as a, $dbname.external_db as e, $dbname.unmapped_object as uo,
             $dbname.unmapped_reason as ur, $dbname.transcript_stable_id as tsi,
             $dbname.transcript as t, $dbname.gene_stable_id as gsi
       where a.analysis_id = uo.analysis_id and
             uo.external_db_id = e.external_db_id and
             uo.unmapped_reason_id = ur.unmapped_reason_id and
             uo.ensembl_id = t.transcript_id and
             uo.ensembl_object_type = 'Transcript' and
             t.transcript_id = tsi.transcript_id and
             t.gene_id       = gsi.gene_id
    ),
      'Translation' => qq(
      select a.logic_name, e.db_display_name, uo.identifier, ur.summary_description,
             concat( 'Protein: ',trsi.stable_id,'; Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id )
        from $dbname.analysis as a, $dbname.external_db as e, $dbname.unmapped_object as uo,
             $dbname.unmapped_reason as ur, $dbname.transcript_stable_id as tsi,
             $dbname.translation as tr, $dbname.translation_stable_id as trsi,
             $dbname.transcript as t, $dbname.gene_stable_id as gsi
       where a.analysis_id = uo.analysis_id and 
             uo.external_db_id = e.external_db_id and
             uo.unmapped_reason_id = ur.unmapped_reason_id and
             uo.ensembl_id = tr.translation_id and 
             tr.transcript_id = t.transcript_id and
             trsi.translation_id = tr.translation_id and
             uo.ensembl_object_type = 'Translation' and
             t.transcript_id = tsi.transcript_id and
             t.gene_id       = gsi.gene_id
    )
    );

      foreach my $FLAG (keys %unmapped_queries) {
        my $SQL = $unmapped_queries{$FLAG};
        my $sth = $conf->{'dbh'}->prepare($SQL);
        $sth->execute;
        while( my $T = $sth->fetchrow_arrayref() ) {
          print O join "\t",
            (INC_SPECIES?"$conf->{'species'}\t":"").qq(Unmapped feature),
              "$T->[1] $T->[2]",
                "/$conf->{'species'}/Location/Genome?ftype=Gene;id=$T->[2]",
                  "$T->[2] $T->[4]",
                    "$T->[3]; $T->[4]\n";
        }
      }
    }
    close O;
  }
  return $total_c;
}

sub GeneLine {
  my ($species, $gsid, $tsids, $tlsids, $exons, $biotype, $ad_label, $disp_xref, $desc, $xrefs, $xref_descriptions, $gene_loc, $db, $counter) = @_;
  my $url = sprintf(qq(%s/Gene/Summary/g=%s&amp;db=%s),
                    $species,
                    $gsid,
                    $db);
  $species =~ s/_/ /;
  $biotype =~ s/_/ /;
  $desc = &clean($desc,'desc');
  $disp_xref = &clean($disp_xref,'name');
  my $description = $desc . " [Type: $biotype $ad_label]";
  my $xml;

  if ($search_engine eq 'solr') {
    $xml = qq(
<doc>
  <field name="id">$gsid"</field>
  <field name="name">$disp_xref</field>
  <field name="description">$description</field>);
    $xml .= &common_fields($species,'Gene');
    my (%seen_labels,%seen_syns);
    foreach my $xref (@$xrefs) {
      my $dbname   = $xref->{'db_name'};
      my $dbkey    = $xref->{'label'};
      next if $seen_labels{$dbkey};
      $seen_labels{$dbkey} = 1;
      $dbkey = &clean($dbkey,'dbkey');
      if (! $dbname) {
        print LOG "WARNING: $gsid has a xref ($dbkey) with no dbname - check db and refer to release coordinator\n";
      }
      else {
        $xml .= qq(
  <field name="_xr_$dbname">$dbkey"</field>);
        #non-versioned
        if ($dbkey =~ /^(\w+)\.\d+$/) {
          my $k = $1;
          next if $seen_labels{$k};
          $seen_labels{$dbkey} = $k;
          $xml .= qq(
  <field name="_xr_$dbname">$k</field>);
        }
        #xref synonyms
        foreach my $syn (@{$xref->{'synonyms'}}) {
          next if $seen_syns{$syn};
          $seen_syns{$syn} = 1;
          $syn = &clean($syn,'syn');
          $xml .= qq(
  <field name="_xr_$dbname">$syn</field>);
        }
      }
    }
    foreach my $xref_description (keys %{$xref_descriptions}) {
      if ($xref_description) {
        $xref_description = &clean($xref_description,'xref description');
        $xml .= qq(
  <field name="xref_description">$xref_description</field>);
      }
    }
    if (my $t_count = keys %$tsids) {
      $xml .= qq(
  <field name="transcript_count">$t_count</field>);
      foreach my $tsi (keys %$tsids) {
        $xml .= qq(
  <field name="transcript">$tsi</field>);
      }
    }
    if (my $tl_count = keys %$tlsids) {
      $xml .= qq(
  <field name="translation_count">$tl_count</field>);
      foreach my $tlsi (keys %$tlsids) {
        $xml .= qq(
  <field name="peptide">$tlsi</field>);
      }
    }
    if (my $e_count = keys %$exons) {
      $xml .= qq(
  <field name="exon_count">$e_count</field>);
      foreach my $esi (keys %$exons) {
        $xml .= qq(
  <field name="exon">$esi</field>);
      }
    }

    #these are not to be searchable
    if ($gene_loc) {
      my $loc = $gene_loc->{'sr_name'} .':'. $gene_loc->{'seq_region_start'} .'-'. $gene_loc->{'seq_region_end'} .':'. $gene_loc->{'seq_region_strand'};
      $xml .= qq(
  <field name="location">$loc</field>);
    }
    $xml .= qq(
  <field name="source">$ad_label</field>
  <field name= "domain_url">$url</field>);

    $xml .= qq(
</doc>);

    $counter->();
    return $xml;
  }

  elsif ($search_engine eq 'lucene') {
    $xml = qq(
<entry id="$gsid">
  <name>$disp_xref</name>
  <description>$description</description>);
    $xml .= qq(
  <cross_references>);
    my (%seen_labels,%seen_syns);
    my @non_versioned;
    foreach my $xref (@$xrefs) {
      my $dbname   = $xref->{'db_name'};
      my $dbkey    = $xref->{'label'};
      next if $seen_labels{$dbkey};
    $seen_labels{$dbkey} = 1;
      $dbkey = &clean($dbkey,'dbkey');
      if (! $dbname) {
        print LOG "WARNING: $gsid has a xref ($dbkey) with no dbname - check db and refer to release coordinator\n";
      }
      else {
        $xml .= qq(
    <ref dbname="$dbname" dbkey="$dbkey"/>);
        if ($dbkey =~ /^(\w+)\.\d+$/) {
          my $k = $1;
          next if $seen_labels{$k};
          $seen_labels{$dbkey} = $k;
          push @non_versioned, $k;
        }
        foreach my $syn (@{$xref->{'synonyms'}}) { 
          next if $seen_syns{$syn};
          $seen_syns{$syn} = 1;
          $syn = &clean($syn,'syn');
          $xml .= qq(
    <ref dbname="$dbname" dbkey="$syn"/>);
        }
      }
      $cross_references{$dbname}++;
    }
    $xml .= qq(
  </cross_references>);
    $xml .= qq(
  <additional_fields>);
    $xml .= &common_fields($species,'Gene');
    $xml .= qq(
    <field name="action">Summary</field>
    <field name="source">$ad_label</field>);
    if ($gene_loc) {
      my $loc = $gene_loc->{'sr_name'} .':'. $gene_loc->{'seq_region_start'} .'-'. $gene_loc->{'seq_region_end'} .':'. $gene_loc->{'seq_region_strand'};
      $xml .= qq(
    <field name="location">$loc</field>);
    }
    foreach my $xref_description (keys %{$xref_descriptions}) {
      if ($xref_description) {
        $xref_description = &clean($xref_description,'xref description');
        $xml .= qq(
    <field name="xref_description">$xref_description</field>);
      }
    }
    unless ($db eq 'core') {
      $xml .= qq(
    <field name="db">$db</field>);
    }
    foreach my $tsi (keys %$tsids) {
      $xml .= qq(
    <field name="transcript">$tsi</field>);
    }
    foreach my $tlsi (keys %$tlsids) {
      $xml .= qq(
    <field name="peptide">$tlsi</field>);
    }
    foreach my $esi (keys %$exons) {
      $xml .= qq(
    <field name="exon">$esi</field>);
    }
    foreach my $nv_xref (@non_versioned) {
      $xml .= qq(
    <field name="non_versioned">$nv_xref</field>);
    }
    $xml .= qq(
  </additional_fields>);
    $counter->();
    return $xml . qq(
</entry>);
  }
}

sub GeneStableIdMappingLine {
  my ($species,$desc,$osi,$current_sis,$deprecated_sis,$type,$db,$counter) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$osi">
  <description>$desc</description>
  <name>"Retired Ensembl $type stable ID"</name>
  <additional_fields>);
  $type = 'IDHistory_'.lc($type);

  $xml .= &common_fields($species,$type);
  foreach my $id (@$current_sis,@$deprecated_sis) {
    $xml .= qq(
    <field name="old_id">$id</field>);
  }
  $xml .= qq(
    <field name="action">Idhistory</field>
    <field name="db">$db</field>
  </additional_fields>);
  $counter->();
  return $xml . qq(
</entry>);
}

########################################################################## Transcripts

sub dumpTranscript {
  my $conf      = shift;
  my $dbspecies = $conf->{'species'};
  my $COREDB    = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $ESTGENEDB = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};
  my $RNASEQDB  = $conf->{'dbs'}->{'DATABASE_RNASEQ'};
  my %dbs = ( 'core' => $COREDB );
  $dbs{'otherfeatures'} = $ESTGENEDB if $ESTGENEDB;
#  $dbs{'rnaseq'} = $RNASEQDB if $RNASEQDB; #don't add these until the IDs mean something and the transcript pages are usefull!
  my $total_c;

 DB:
  foreach my $db ( sort { $b cmp $a } keys %dbs) {
    my $dbname    = $dbs{$db};
    my $counter   = make_counter(0);
    my $file = $conf->{'directory'}."/${dbname}_Transcript.xml";
    $file .= ".gz" unless $nogzip;
    my $start_time = time;
    my $fh;
    unless ($nogzip) {
      $fh = new IO::Zlib;
      $fh->open( "$file", "wb9" )
        or die("Can't open compressed stream to $file: $!");
    }
    else {
      open( $fh, ">$file" ) or die "Can't open $file: $!";
    }
    print LOG "  Dumping $dbname to $file ... ".&date_and_mem()."\n";
    header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

    my $external_dbs = $conf->{'dbh'}->selectall_hashref(
      'select external_db_id, db_name, db_display_name
         from external_db', 'external_db_id'
       );
    my $external_synonyms;
    my $es = $conf->{'dbh'}->selectall_arrayref(qq(select xref_id, synonym from external_synonym));
    foreach (@$es) {
      $external_synonyms->{$_->[0]}{$_->[1]} = 1;
    }
    my %xrefs = ();
    foreach my $type( qw(Transcript Translation) ) {
      my $sql = qq(
         SELECT ox.ensembl_id, x.display_label, x.dbprimary_acc,
                x.description, x.external_db_id, x.xref_id
           FROM $dbname.object_xref as ox, $dbname.xref as x
          WHERE ox.ensembl_object_type = '$type'
            AND ox.xref_id = x.xref_id );
      my $T = $conf->{'dbh'}->selectall_arrayref($sql);
      foreach (@$T) {
        push @{$xrefs{$type}{$_->[0]}{$external_dbs->{$_->[4]}{'db_name'}}}, {
          'xref_description'=> $_->[3],
          'db_acc'          => $_->[2],
          'label'           => $_->[1],
          'external_db_id'  => $_->[4],
          'xref_id'         => $_->[5],
        };
      }
      print LOG "    $type xref query done ".&date_and_mem()."\n";
    }

    #disconnect and then connect to free up memory
    my $core = $conf->{'core_details'};
    $conf->{'dbh'}->disconnect;
    $conf->{'dbh'} = DBI->connect( "dbi:mysql:$core->{'NAME'};host=$core->{'HOST'};port=$core->{'PORT'}", $core->{'USER'}, $core->{'PASS'}, { PrintError => 0, RaiseError => 0} );

    my %exons = ();
    my $sql = qq(
       SELECT t.transcript_id, esi.stable_id
         FROM $dbname.transcript as t, $dbname.exon_transcript as et, $dbname.exon_stable_id as esi
        WHERE t.transcript_id = et.transcript_id
          AND et.exon_id = esi.exon_id);
    my $sth = $conf->{'dbh'}->prepare($sql);
    $sth->execute;
    while (my $r = $sth->fetchrow_arrayref()) {
      $exons{$r->[0]}{$r->[1]}=1;
    }
    $sth->finish;
    print LOG "    Exon query done ".&date_and_mem()."\n";

    #get descriptions for the genes to use if none have been assigned to the transcript
    my $gene_desc = $conf->{'dbh'}->selectall_hashref(qq(
      SELECT t.transcript_id, g.description
        FROM transcript t, gene g
       WHERE t.gene_id = g.gene_id),
     'transcript_id');

    my $trans_locations = $conf->{'dbh'}->selectall_hashref(qq(
      SELECT tsi.stable_id, sr.name as sr_name, t.seq_region_start, t.seq_region_end, t.seq_region_strand, cs.name as sr_type, cs.version
        FROM $dbname.transcript_stable_id tsi, $dbname.transcript t, $dbname.seq_region sr, $dbname.coord_system cs
       WHERE tsi.transcript_id = t.transcript_id
         AND t.seq_region_id = sr.seq_region_id
         AND sr.coord_system_id = cs.coord_system_id),
      'stable_id');

    $sql = qq(
      SELECT tsi.transcript_id, trsi.translation_id,
             tsi.stable_id as tsid, trsi.stable_id as trsid,
             t.description, x.display_label, ad.display_label, ad.web_data, ad.displayable, t.status, t.biotype, a.logic_name
        FROM ((( $dbname.transcript_stable_id as tsi,
             $dbname.analysis_description as ad,
             $dbname.analysis as a,
             $dbname.transcript as t) left join
             $dbname.translation as tr on t.transcript_id = tr.transcript_id) left join
             $dbname.translation_stable_id as trsi on tr.translation_id = trsi.translation_id) left join
             $dbname.xref as x on t.display_xref_id = x.xref_id
       WHERE t.transcript_id = tsi.transcript_id
         AND t.analysis_id = ad.analysis_id
         AND t.analysis_id = a.analysis_id
         AND a.logic_name != 'LRG_import'
       ORDER by tsi.stable_id);
    $sth = $conf->{'dbh'}->prepare($sql);
    $sth->execute();

    my $old;
    my ($tid,$tlid,$tsid,$tlsid,$desc,$disp_xref,$ad_dl,$webdata,$displayable,$status,$biotype,$ln);
    while (my $r = $sth->fetchrow_arrayref()) {
      ($tid,$tlid,$tsid,$tlsid,$desc,$disp_xref,$ad_dl,$webdata,$displayable,$status,$biotype,$ln) = @$r;
      $desc = $desc ? $desc : $gene_desc->{$tid}{'description'} ? $gene_desc->{$tid}{'description'} : '';
      my $wd = eval($webdata);
      next if $wd->{'gene'}{'do_not_display'};
      next if ($ln =~ /rnaseq/);
      if( $old->{'tid'} != $tid ) {
        if( $old->{'tid'} ) {
          my $type = $old->{'biotype'}.' '.$old->{'ad_dl'};
          &p( TransLine($dbspecies,
                       $old->{'tsid'},
                       $old->{'tlsids'},
                       $old->{'exons'},
                       $old->{'biotype'},
                       $old->{'ad_dl'},
                       $old->{'disp_xref'},
                       $old->{'desc'},
                       $old->{'xrefs'},
                       $old->{'xref_descriptions'},
                       $trans_locations->{$old->{'tsid'}},
                       $db,
                       $counter),
              $nogzip, $fh);
        }

        #only use status for vega#

        $old = {
          'tid'       => $tid,
          'tsid'      => $tsid,
          'tlsids'    => {$tlsid?($tlsid=>1):()},
          'desc'      => $desc,
          'exons'     => {},
          'biotype'   => $biotype,
          'ad_dl'     => $ad_dl,
          'xrefs'     => [],
          'xref_descriptions' => {},
          'disp_xref' => $disp_xref ? "$disp_xref" : "novel transcript",
          'status'    => $status,
          'biotype'   => $biotype,
          'ad_dl'     => $ad_dl,
        };

        $old->{'exons'}  = $exons{$tid};

        #add xref details
        foreach my $type (qw(Transcript Translation)) {
          my $id = ($type eq 'Transcript')  ? $tid
                 : ($type eq 'Translation') ? $tlid
                 : '';
          while (my ($db_name,$xrefs) = each %{$xrefs{$type}{$id}}) {
            foreach my $dets (@$xrefs) {
              my $xref_id = $dets->{'xref_id'};
              $dets->{'db_name'} = $db_name;
              $old->{'xref_descriptions'}{$dets->{'xref_description'}}++ ;
              foreach my $syn (keys %{$external_synonyms->{$xref_id}}) {
                push @{$dets->{'synonyms'}},$syn;
              }
              push @{$old->{'xrefs'}}, $dets;
            }
          }
        }
      }
      else {
        $old->{'tlsids'}{$tlsid}=1 if $tlsid;

        #add xref details
        foreach my $type (qw(Transcript Translation)) {
          my $id = ($type eq 'Transcript')  ? $tid
                 : ($type eq 'Translation') ? $tlid
                 : '';
          while (my ($db_name,$xrefs) = each %{$xrefs{$type}{$id}}) {
            foreach my $dets (@$xrefs) {
              my $xref_id = $dets->{'xref_id'};
              $dets->{'db_name'} = $db_name;
              $old->{'xref_descriptions'}{$dets->{'xref_description'}}++;
              foreach my $syn (keys %{$external_synonyms->{$xref_id}}) {
                push @{$dets->{'synonyms'}},$syn;
              }
              push @{$old->{'xrefs'}}, $dets;
            }
          }
        }
      }
    }
    &p( TransLine($dbspecies,
                 $old->{'tsid'},
                 $old->{'tlsids'},
                 $old->{'exons'},
                 $old->{'biotype'},
                 $old->{'ad_dl'},
                 $old->{'disp_xref'},
                 $old->{'desc'},
                 $old->{'xrefs'},
                 $old->{'xref_descriptions'},
                 $trans_locations->{$old->{'tsid'}},
                 $db,
                 $counter),
        $nogzip, $fh) if ($old->{'gsid'});
    print LOG "    Transcript info query done ".&date_and_mem()."\n";
    $total_c += footer( $counter->(),$nogzip,$fh);
  }
  return $total_c;
}

sub TransLine {
  my ($species, $tsid, $tlsids, $exons, $biotype, $ad_label, $disp_xref, $desc, $xrefs, $xref_descriptions, $trans_loc, $db, $counter) = @_;
  $species =~ s/_/ /;
  $biotype =~ s/_/ /;
  $desc = &clean($desc,'desc');
  $disp_xref = &clean($disp_xref,'name');
  my $description = $desc . " [Type: $biotype $ad_label]";
  my $xml = qq(
<entry id="$tsid">
  <name>$disp_xref</name>
  <description>$description</description>);

  $xml .= qq(
  <cross_references>);
  my (%seen_labels,%seen_syns);
  my @non_versioned;
  foreach my $xref (@$xrefs) {
    my $dbname   = $xref->{'db_name'};
    my $dbkey    = $xref->{'label'};
    next if $seen_labels{$dbkey};
    $seen_labels{$dbkey} = 1;
    $dbkey = &clean($dbkey,'dbkey');
    if (! $dbname) {
      print LOG "WARNING: $tsid has a xref ($dbkey) with no dbname - check db and refer to release coordinator\n";
    }
    else {
      if ($dbkey =~ /^(\w+)\.\d+$/) {
        my $k = $1;
        next if $seen_labels{$k};
        $seen_labels{$dbkey} = $k;
        push @non_versioned, $k;
      }
      $xml .= qq(
    <ref dbname="$dbname" dbkey="$dbkey"/>);
      foreach my $syn (@{$xref->{'synonyms'}}) {
        next if $seen_syns{$syn};
        $seen_syns{$syn} = 1;
        $syn = &clean($syn,'syn');
        $xml .= qq(
    <ref dbname="$dbname" dbkey="$syn"/>);
      }
    }
    $cross_references{$dbname}++;
  }
  $xml .= qq(
  </cross_references>);

  $xml .= qq(
  <additional_fields>);
  $xml .= &common_fields($species,'Transcript');
  $xml .= qq(
    <field name="action">Summary</field>
    <field name="source">$ad_label</field>);
  if ($trans_loc) {
    my $loc = $trans_loc->{'sr_name'} .':'. $trans_loc->{'seq_region_start'} .'-'. $trans_loc->{'seq_region_end'} .':'. $trans_loc->{'seq_region_strand'};
    $xml .= qq(
    <field name="location">$loc</field>);
  }

  foreach my $xref_description (keys %{$xref_descriptions}) {
    if ($xref_description) {
      $xref_description = &clean($xref_description,'xref description');
      $xml .= qq(
    <field name="xref_description">$xref_description</field>);
    }
  }
  unless ($db eq 'core') {
    $xml .= qq(
    <field name="db">$db</field>);
  }
  foreach my $tlsi (keys %$tlsids) {
    $xml .= qq(
    <field name="peptide">$tlsi</field>);
  }
  foreach my $esi (keys %$exons) {
    $xml .= qq(
    <field name="exon">$esi</field>);
  }
  foreach my $nv_xref (@non_versioned) {
    $xml .= qq(
    <field name="non_versioned">$nv_xref</field>);
  }
  $xml .= qq(
  </additional_fields>);

  $counter->();

  return $xml . qq(
</entry>);
}


########################################################################## Regulatory Features

sub dumpRegulatoryFeature {
  my $conf = shift;
  my $dbname = $conf->{'dbs'}->{'DATABASE_FUNCGEN'};
  return unless $dbname;

  my $dbspecies = $conf->{'species'};
  my $db = 'regulatoryfeatures';
  my $counter   = make_counter(0);
  my $file = $conf->{'directory'}."/${dbname}_RegulatoryFeature.xml";
  $file .= ".gz" unless $nogzip;
  my $start_time = time;
  my $fh;
  unless ($nogzip) {
    $fh = new IO::Zlib;
    $fh->open( "$file", "wb9" )
      or die("Can't open compressed stream to $file: $!");
  }
  else {
    open( $fh, ">$file" ) or die "Can't open $file: $!";
  }
  print LOG "  Dumping $dbname to $file ... ".&date_and_mem()."\n";
  header( $dbname, $dbspecies, $db, $nogzip, $fh, $release );

  my $ftype = 'RegulatoryFeature';
  my $params;

  my $prefix = ($dbspecies =~/Mus/) ? 'ENSMUSR' : 'ENSR';
  my $d = $conf->{'dbh'}->selectall_arrayref( 
    "select concat('$prefix', lpad(rf.stable_id, 11, 0)),
            sr.name, rf.bound_seq_region_start, rf.bound_seq_region_end, ft.name
       from $dbname.regulatory_feature rf, $dbname.seq_region sr, $dbname.coord_system cs,
            $dbname.feature_type ft, $dbname.feature_set fs
      where fs.name='RegulatoryFeatures:MultiCell' and fs.feature_set_id=rf.feature_set_id and
            rf.feature_type_id=ft.feature_type_id and rf.seq_region_id=sr.seq_region_id and
            sr.coord_system_id=cs.coord_system_id and cs.is_current=1
   group by rf.regulatory_feature_id"
  );

  foreach my $row ( @$d ) {
    my $desc = qq($row->[4] regulatory feature);
    my $id = $row->[0];
    my $ids = [ ];
    my $r = $row->[1] .":".$row->[2] . "-" . $row->[3];
    $params = { 'r' => $r, 'subtype' => 'RegulatoryFeature' };
    &p (RegulatoryFeatureLine($dbspecies,$desc,$id,$ids,$ftype,$params,$db,$counter),$nogzip, $fh);
  }

## External Features...
  my $e = $conf->{'dbh'}->selectall_arrayref(
  "select ef.display_label, group_concat(distinct ft.name), count(distinct ef.external_feature_id),
          ft.description, ft.class, fs.name, sr.name, ef.seq_region_start, ef.seq_region_end,
          ef.seq_region_strand
    from  $dbname.feature_type ft, $dbname.external_feature ef, $dbname.feature_set fs, $dbname.seq_region sr,
          $dbname.coord_system cs,$dbname.status s, $dbname.status_name sn
    where ft.feature_type_id=ef.feature_type_id and fs.feature_set_id=ef.feature_set_id and
          fs.type='external' and ef.seq_region_id=sr.seq_region_id  and sr.coord_system_id=cs.coord_system_id
          and cs.is_current=1 and  s.table_name='feature_set' and s.table_id=fs.feature_set_id and
          s.status_name_id=sn.status_name_id and sn.name='MART_DISPLAYABLE'
 group by ef.display_label"
  );

  foreach my $f ( @$e) {
    my ($display_label, $f_name, $count, $f_desc, $f_class, $fs_name, $seq_region, $start, $end, $strand) = @$f;
    my @ids = split ' ', $f_name;
    $params = { 'fs_name' => $fs_name, 'subtype' => 'RegulatoryFactor' };
    my $desc = "$display_label is a $f_class from $fs_name which hits the genome in $count locations";
    &p(RegulatoryFeatureLine($dbspecies,$desc,$display_label,\@ids,$ftype,$params,$db,$counter),$nogzip, $fh);
  }
  return footer( $counter->(),$nogzip,$fh);
}

sub RegulatoryFeatureLine {
  my ($species,$desc,$id,$ids,$ftype,$params,$db,$counter) = @_;
  $species =~ s/_/ /;
  my $xml = qq(
<entry id="$id">
  <description>$desc</description>
  <additional_fields>);
  $xml .= &common_fields($species,$ftype);
  foreach my $syn (@$ids) {
    $xml .= qq(
    <field name="synonym">$syn</field>);
  }
  while (my ($param,$value) = each %$params) {
    $xml .= qq(
    <field name="$param">$value</field>);
  }
  $xml .= qq(
  </additional_fields>);
  $counter->();
  return $xml . qq(
</entry>);
}

########################################################################## LRGs

sub findLRGs {
  my $conf = shift;
  return $conf->{'dbh'}->selectall_arrayref(
        qq(SELECT gsi.stable_id, x.display_label, edb.db_name, tsi.stable_id, sr.length
             FROM gene_stable_id gsi, gene g, analysis a, object_xref ox, xref x, external_db edb, transcript t, transcript_stable_id tsi, seq_region sr
            WHERE gsi.gene_id = g.gene_id
              AND g.analysis_id = a.analysis_id
              AND g.gene_id = ox.ensembl_id
              AND ox.xref_id = x.xref_id
              AND x.external_db_id = edb.external_db_id
              AND g.gene_id = t.gene_id
              AND t.transcript_id = tsi.transcript_id
              AND gsi.stable_id = sr.name
              AND ox.ensembl_object_type = 'Gene'
              AND edb.db_name = 'HGNC'
              AND a.logic_name = 'LRG_import'
            ORDER by gsi.stable_id, tsi.stable_id)
      );
}

sub sort_lrgs {
  my ($dbspecies,$type,$fh,$lrgs) = @_;
  my ($prev_gsi,$prev_disp_label,$prev_db_name,$prev_length);
  my $query_terms = [];
  foreach my $rec (@$lrgs) {
    my $gsi = $rec->[0];
    if ($gsi eq $prev_gsi) {
      push @$query_terms, $rec->[3];
    }
    else {
      if ($prev_gsi) {
        &p(LRGLine($dbspecies,$type,$prev_gsi,$prev_disp_label,$prev_db_name,$query_terms,$prev_length),
           $nogzip, $fh);
      }
      $prev_gsi = $gsi;
      $prev_disp_label = $rec->[1];
      $prev_db_name = $rec->[2];
      $query_terms = [ $rec->[3] ];
      $prev_length = $rec->[4];
    }
  }
}

sub LRGLine {
  my ($species,$type,$gsi,$dbkey,$dbname,$tsids,$length) = @_;
  $species =~ s/_/ /;
  my $description = "$gsi is a fixed reference sequence of length $length with a fixed transcript(s) for reporting purposes. It was created for $dbname gene $dbkey";
  my $xml = qq(
<entry id="$gsi">
  <description>$description</description>);
  $xml .= qq(
  <cross_references>
    <ref dbname="$dbname" dbkey="$dbkey"/>
  </cross_references>);
  $xml .= qq(
  <additional_fields>);
  $xml .= &common_fields($species,$type);
  $xml .= qq(
    <field name="action">LRG_Summary</field>);
  foreach my $tsi (@$tsids) {
    $xml .= qq(
    <field name="transcript">$tsi</field>);
  }
  return $xml . qq(
  </additional_fields>
</entry>);
}

#######################################################################

#deal with those pesky non-standard characters in gene names and descriptions (order of regexps is important)
sub clean {
  my ($text,$field) = @_;
  $text =~ s/&/&amp;/g;
  $text =~ s/<i>//g;
  $text =~ s/<\/i>//g;
  $text =~ s/<sup>/-/g;
  $text =~ s/<\/sup>/-/g;
  $text =~ s/<em>//g;
  $text =~ s/<\/em>//g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/'/&#44;/g;
  $text =~ s/"/&quot;/g;
  $text =~ s/ & / &amp; /g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;


  if ($text =~ /[<>]/) {
    print LOG "WARNING: Unsupported character $text in field $field\n";
  }
  return $text;
}

sub make_counter {
  my $start = shift;
  return sub { $start++ }
}

sub header {
  my ( $dbname, $dbspecies, $dbtype, $nogzip, $fh, $release ) = @_;
  if ($search_engine eq 'lucene') {
    $dbspecies =~ s/_/ /;
    p("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>",$nogzip,$fh,1);
    p("<!DOCTYPE database [ <!ENTITY auml \"&#228;\">]>",$nogzip,$fh,1);
    p("<database>",$nogzip,$fh,1);
    p("<name>$dbname</name>",$nogzip,$fh,1);
    p("<description>$sitetype $dbspecies $dbtype database</description>",$nogzip,$fh,1);
    p("<release>$release</release>",$nogzip,$fh,1);
    p("",$nogzip,$fh,1);
    p("<entries>",$nogzip,$fh,1);
  }
  elsif ($search_engine eq 'solr') {
    p("<add>",$nogzip,$fh,1);
  }
  else {
    print LOG "Incorrect data format -$search_engine- specified. Please correct\n";
  }
}

sub footer {
  my ($ecount,$nogzip,$fh,$type) = @_;
  if ($search_engine eq 'lucene') {
    p("</entries>",$nogzip,$fh,1);
    p("<entry_count>$ecount</entry_count>",$nogzip,$fh,1);
    p("</database>",$nogzip,$fh,1);
  }
  elsif ($search_engine eq 'solr') {
    p("</add>",$nogzip,$fh,1);
  }
  my $text = $type ? "  ...Dumped $ecount $type entries\n" : "  ...Dumped $ecount entries\n";
  print LOG $text;
  if ($nogzip) {
    close($fh) or die $!;
  }
  else {
    $fh->close();
  }
  return $ecount;
}

sub common_fields {
  my ($species,$feature_type) = @_;
  my $xml;
  if ($search_engine eq 'lucene') {
    $xml = qq(
    <field name="species">$species</field>);
    foreach my $sp (keys %{$conf->{'speciesname_list'}}) {
      $xml .= qq(
    <field name="species_name">$sp</field>);
    }
    $xml .= qq(
    <field name="featuretype">$feature_type</field>);
  }
  elsif ($search_engine eq 'solr') {
    $xml = qq(
  <field name="website">$website_url</field>
  <field name="species">$species</field>);
    foreach my $sp (keys %{$conf->{'speciesname_list'}}) {
      $xml .= qq(
  <field name="species_alias">$sp</field>);
    }
    $xml .= qq(
  <field name="featuretype">$feature_type</field>);
  }
  return $xml;
}

sub p {
  my ($str,$nogzip,$fh,$no_parse) = @_;
  return unless $str;
  unless ($no_parse) {
    eval { $parser->parse($str); };
    if ($@) {
      print LOG "\n\nWARNING XML - Error in record, ignoring (although the record count is correct):\nError is $@\nRecord is:\n$str";
      return;
    }
  }

  # TODO - encoding
  $str .= "\n";
  if ($nogzip) {
    if (! $dry_run) {
      print $fh $str or die "Can't write to file ", $!;
    }
  }
  else {
    print $fh $str or die "Can't write string: $str";
  }
}

sub format_datetime {
  my $t = shift;
  my ( $y, $m, $d, $ss, $mm, $hh ) = ( localtime($t) )[ 5, 4, 3, 0, 1, 2 ];
  $y += 1900;
  $d = "0" . $d if ( $d < 10 );
  my $ms = text_month($m);
  return sprintf "$d-$ms-$y %02d:%02d:%02d", $hh, $mm, $ss;
}

sub text_month {
  my $m = shift;
  my @months = qw[JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC];
  return $months[$m];
}

sub date_and_mem {
  my $date = strftime "%Y-%m-%d %T", localtime;
  my $mem = `ps -p $$ -o vsz |tail -1`;
  chomp $mem;
  return "[$date, mem $mem]";
}

sub elapsed_time {
  my $start_time = shift;
  my $diff = time - $start_time;
  my $sec = $diff % 60;
  $diff = ($diff - $sec) / 60;
  my $min = $diff % 60;
  my $hours = ($diff - $min) / 60;
  return "${hours}h ${min}min ${sec}sec";
}

sub about {
  print STDERR <<ABOUT_END;

usage:
./indexXMLDumper --species=aaa,bbb|ALL --index=vvv,xxx|ALL (--ignore_species=ccc,ddd) (--ignore_index=yyy,zzz) (--debug) (--log=index.log) (--dir=dir) (--dry_run|n)

'ALL' can be used for both 'species' and 'index'

The script will retry five times if it encounters MySQL errors (for example loses database connections)
unless --debug is specified in which case it will exit on the first failure.

In the abscence of --debug option, STDOUT and STDERR are sent to a log file (dumping_$release.log unless specified otherwise). If you run with --debug then output is sent to STDOUT

The locations of the actual xml files can be specified with the dir option. If not specified then it is dumped locally with debug, or in the correct location for indexing without (/nfs/eureka/data/ensembl_$release - note you have to be a member of the www-search UNIX group to write here)

Examine the log file for 'WARNING' messages, particularly 'WARNING XML' - this will identify any entries identified by XML::Parser as not being well formed and which would crash index production. These are not added to the xml file but the record count in the footer *will* include them. Either manaully fix and add the entry(ies) to the xml file, or else fix the database / script to account for it and rerun.

--dry_run|n does everything apart from actually write the xml [note that this option will delete any preexisting file]

Examples:

./indexXMLDumper --species=ALL --index=ALL --debug --log=debug.log  #all log output to debug.log; xml to input; all indices for all species
./indexXMLDumper --species=ALL --index=ALL                          #all log output to dumping_$release.log; xml to /nfs/eureka/data/ensembl_$release; all indices for all species
./indexXMLDumper --species=ALL --index=ALL --dir=input              #all log output to dumping_$release.log; xml to input; all indices for all species
./indexXMLDumper --species=ALL --ignore_species=Homo_sapiens --index=Variation --log=no_hs_variations.log #log output to no_hs_variations; xml to /nfs/eureka/data/ensembl_$release"; Variations for all species but human
./indexXMLDumper --species=ALL --ignore_index=Variation --log=no_variations.log #log output to no_variations; xml to /nfs/eureka/data/ensembl_$release"; All indexes apart from Variations for for all species
./indexXMLDumper --species=ALL --index=Gene --engine solr  -log dumping.log #output in format for solr (for testing, only works for genes as yet)

ABOUT_END
}
