#!/localsw/bin/perl

use strict;

my $ENSEMBL_ROOT;

BEGIN {
  use FindBin qw($Bin);
  use File::Basename qw( dirname );
  $ENSEMBL_ROOT = dirname( $Bin );
  $ENSEMBL_ROOT =~ s/\/utils$//;
  unshift @INC, "$ENSEMBL_ROOT/conf";
  eval{ require SiteDefs };
  if ($@){ die "Can't use SiteDefs.pm - $@\n"; }
  map{ unshift @INC, $_ } @SiteDefs::ENSEMBL_LIB_DIRS;
}

use EnsEMBL::Web::SpeciesDefs;
use Data::Dumper;
package indexDumper;

use constant INC_SPECIES => 0;
use CGI;

our $SD = EnsEMBL::Web::SpeciesDefs->new();

my ( $SPECIES_STRING, @indexes) = @ARGV;

## HACK 1 - if the INDEX is set to all grab all dumper methods...
@indexes = map { /dump(\w+)/?$1:() } keys %indexDumper:: if $indexes[0] eq 'ALL';

## HACK 2 - if the SPECIES is set to ALL grab stuff from config...

my @species;
my %X = %{$SD->ENSEMBL_SPECIES_ALIASES};

if( $SPECIES_STRING eq 'ALL' ) {
  @species = @{$SD->ENSEMBL_DATASETS}
} else {
  @species = grep {$_} map { $X{lc($_)} } split /:/, $SPECIES_STRING;
}

# EnsEMBL::Web::IndexSupport takes path to conf, path to files, species
my $conf = {};
mkdir 'input', 0777 unless -e 'input';

foreach my $species ( @species ) {

  $conf->{'directory'} = "input/$species";
  mkdir $conf->{'directory'}, 0777 unless -e  $conf->{'directory'};
  $conf->{'species'}   = $species;
  $conf->{'authority'} = $SD->get_config($species,'AUTHORITY');
  my %dbs = %{$SD->get_config($species,'databases')||{}};
  $conf->{'dbs'}       = { map { ($_,$dbs{$_}{NAME}) } keys %dbs };
  $conf->{'dbs'}{'DATABASE_COMPARA'} = 'ensembl_compara_'.$SD->ENSEMBL_VERSION;
  my $core = $dbs{'DATABASE_CORE'};
  $conf->{'dbh_compara'} = DBI->connect( "dbi:mysql:ensembl_compara_".$SD->ENSEMBL_VERSION.";host=ensdb-1-13;port=5307", $core->{'USER'}, $core->{'PASS'} );
  $conf->{'dbh'}         = DBI->connect( "dbi:mysql:$core->{'NAME'};host=$core->{'HOST'};port=$core->{'PORT'}", $core->{'USER'}, $core->{'PASS'} );
#  $conf->{'dbh'}         = DBI->connect( "dbi:mysql:$core->{'NAME'};host=ens-staging;port=3306", $core->{'USER'}, $core->{'PASS'} );
  warn join ' ', $species, Data::Dumper::Dumper $conf;
  foreach my $index (@indexes) {
    warn "  Index: $species $index";
    my $function = "dump$index";
    no strict "refs";
    &$function( $conf );
  }
}

## Now for all the dump functions...............................

########################################################################## QTLs

sub dumpQTL {
  my $conf = shift;
  my $sth = $conf->{'dbh'}->prepare("
 select c.name as chr, qf.seq_region_start, qf.seq_region_end,
        a.logic_name as analysis, q.qtl_id,
        q.trait, qs.source_database, qs.source_primary_id,
        fms1.source as fm1_source, fms1.name as fm1_name,
        fms2.source as fm2_source, fms2.name as fm2_name,
        pms.source  as pm_source,  pms.name  as pm_name
   from ((((((seq_region as c, qtl_feature as qf, qtl_synonym as qs,
        analysis as a, qtl as q) left join marker as fm1 on
        fm1.marker_id = q.flank_marker_id_1) left join marker_synonym as fms1 on
        fm1.display_marker_synonym_id = fms1.marker_synonym_id) left join marker as fm2 on
        fm2.marker_id = q.flank_marker_id_2) left join marker_synonym as fms2 on
        fm2.display_marker_synonym_id = fms2.marker_synonym_id) left join marker as pm on
        pm.marker_id = q.peak_marker_id) left join marker_synonym as pms on
        pm.display_marker_synonym_id = pms.marker_synonym_id 
  where c.seq_region_id = qf.seq_region_id and qs.qtl_id = q.qtl_id and 
        qf.analysis_id = a.analysis_id and qf.qtl_id = q.qtl_id
  ");
  $sth->execute();
  open O, ">$conf->{'directory'}/QTL.txt";
  my $desc       = '';
  my $old_qtl = 0;
  my $old_ID  = '';
  my $old_pos = '';
  my $IDS;
  while( my $T = $sth->fetchrow_hashref() ){
    if($T->{qtl_id} eq $old_qtl) {
      $IDS  .= " $T->{source_primary_id}";
      $desc .= " $T->{source_database}:$T->{source_primary_id}";
    } else {
      print O &QTLLine( $conf->{'species'}, $old_ID, $old_pos, $IDS,$desc );
      $IDS = "$T->{trait} $T->{source_primary_id}";
      $IDS.= " $T->{fm1_name}" if $T->{fm1_name};
      $IDS.= " $T->{fm2_name}" if $T->{fm2_name};
      $IDS.= " $T->{pm_name}" if $T->{pm_name};
      $old_ID = $T->{pm_name};
      $old_pos = "$T->{chr}:".($T->{seq_region_start}-1e4).'-'.($T->{seq_region_end}+1e4);
      $desc       = "QTL exhibiting '$T->{trait}' has ";
      my $f2 = $T->{pm_name} ? 1 : 0;
      if( $T->{fm1_name} || $T->{fm2_name} ) {
        my $f1= ($T->{fm1_name}) && ($T->{fm2_name}) ? 1 : 0;
        $desc.='flanking marker'.($f1?'s ':' ').$T->{fm1_name}.($f1?' and ':'').$T->{fm2_name}.($f2?'; ':'');
      }
      if($f2) {
        $desc.= "peak marker $T->{pm_name};";
      }
      $desc.=" and names: $T->{source_database}:$T->{source_primary_id}";
      $old_qtl = $T->{qtl_id};
    }
  }
  print O &QTLLine( $conf->{'species'}, $old_ID,$old_pos, $IDS,$desc );
  close O;
}

sub QTLLine {
  my ($species, $trait, $pos, $IDS, $desc ) = @_;
  return if $trait eq '';
  return join "\t", (INC_SPECIES?"$conf->{'species'}\t":"").qq(QTL),
    $trait, "/$conf->{'species'}/Location/View?r=$pos", $IDS, "$desc\n";
}

########################################################################## Variations

sub dumpVariation {
  my $conf = shift;
  my $VarDB = $conf->{'dbs'}->{'DATABASE_VARIATION'};
  return unless $VarDB;
  open O,  ">$conf->{'directory'}/Variation.txt";
  open P,  ">$conf->{'directory'}/VariationPhenotype.txt";
  open S,  ">$conf->{'directory'}/SomaticMutation.txt";

  my $phenotype_info = $conf->{'dbh'}->selectall_hashref("select description, phenotype_id from $VarDB.phenotype",'description');
  foreach my $phen (keys %$phenotype_info) {
    print P join "\t",
      'VariationPhenotype',
      $phen,
      sprintf( "/$conf->{'species'}/Location/Genome?ftype=Phenotype;id=%s;phenotype_name=%s", $phenotype_info->{$phen}{'phenotype_id'}, $phen),
      $phen,
      "\n",
  }
  my $sources = { map { @$_ } @{$conf->{'dbh'}->selectall_arrayref( "select source_id, name from $VarDB.source" )} };
  my $somatic = { map { @$_ } @{$conf->{'dbh'}->selectall_arrayref( "select source_id, somatic from $VarDB.source" )} };

  my %snp_extra = map { ($_->[0] => $_) }
    @{$conf->{'dbh'}->selectall_arrayref(
      "select va.variation_id,
              group_concat( distinct va.local_stable_id SEPARATOR '; ') as lsi,
              group_concat( distinct va.study           SEPARATOR '; ') as st,
              group_concat( distinct va.associated_gene SEPARATOR '; ') as gn,
              group_concat( distinct va.variation_names SEPARATOR '; ') as vars,
              group_concat( distinct
                if(
                  isnull(p.name),
                  description,
                  concat( p.description,' (',p.name,')' )
                )
                SEPARATOR '; '
              ) as phen
         from $VarDB.variation_annotation as va left join
              $VarDB.phenotype as p on p.phenotype_id = va.phenotype_id
        group by va.variation_id
        order by va.variation_id"
    )};
 my $sth = $conf->{'dbh'}->prepare(qq(
     SELECT vf.variation_id, vf.variation_name, vf.source_id, group_concat(vs.source_id, ' ',vs.name), vf.consequence_type
       FROM $VarDB.variation_feature vf left join $VarDB.variation_synonym vs on vf.variation_id = vs.variation_id, $VarDB.source s
      WHERE vf.source_id = s.source_id
      GROUP BY vf.variation_id));
  $sth->execute() or die "Error:", $DBI::errstr;
  while (my $rowcache = $sth->fetchall_arrayref( undef, 10_000 ) ) {
    while ( my $row = shift( @{$rowcache} ) ) {
      my $v_id  = $row->[0];
      my $name  = $row->[1];
      my %synonyms;
      my $syn_c = 0;
      my $somatic_mutation = $somatic->{$row->[2]};
      foreach my $syn (split /,/, @$row->[3]) {
        my ($id,$sname) = split / /,$syn;
        $synonyms{$sname} = $sources->{$id};
        $syn_c++;
      }
      my (@K, $synonym_text);
      foreach my $syn (keys %synonyms) {
        push @K, $syn;
        $synonym_text .= $synonyms{$syn}.':'.$syn.' ';
      }

      my $snp_source = $sources->{ $row->[2] };
      my $extra      = '';
      my $x = $snp_extra{$v_id};
      if( $x ) {
        push @K,$x->[1],$x->[3],$x->[4];
        $extra = ' It is associated with following phenotypes:'. $x->[4].','.$x->[5];
        if( $x->[1] ) {
          $extra .= sprintf ', through %s', $x->[1];
        }
        if( $x->[2] ) {
          $extra .= sprintf ', through study(s): %s, and associated with following gene(s): %s', $x->[2], $x->[3];
        }
      }

      if ( $somatic_mutation == 1){
        print S join "\t",
	 "$snp_source Somatic Mutation",
	  $name,
         "/$conf->{'species'}/Variation/Summary?source=$snp_source;v=$name",
         "$name @K",
         sprintf( "A %s Somatic Mutation %s%s\n",
            $snp_source,
            $syn_c > 1 ? "with $syn_c synonyms: $synonym_text"
          : $syn_c     ? "with one synonym: $synonym_text"
          :              "with no synonyms",
          $extra
        );
      } else {
       print O join "\t",
        "$snp_source Variation",
        $name,
        "/$conf->{'species'}/Variation/Summary?source=$snp_source;v=$name",
        "$name @K",
        sprintf( "A %s Variation %s%s\n",
            $snp_source,
            $syn_c > 1 ? "with $syn_c synonyms: $synonym_text"
          : $syn_c     ? "with one synonym: $synonym_text"
          :              "with no synonyms",
          $extra
        );
      }
    }
  }

 $sth = $conf->{'dbh'}->prepare(qq(
    SELECT v.variation_id, v.name, v.source_id, fd.description, group_concat(vs.source_id, ' ',vs.name)
      FROM $VarDB.failed_description fd, $VarDB.failed_variation fv, $VarDB.variation v
          LEFT JOIN $VarDB.variation_synonym vs ON v.variation_id = vs.variation_id
     WHERE v.variation_id = fv.variation_id
       AND fv.failed_description_id = fd.failed_description_id
     GROUP BY v.variation_id));
  $sth->execute() or die "Error:", $DBI::errstr;
#  while ( my $rowcache = $sth->fetchall_arrayref( undef, 10_000 ) ) {
 while ( my $rowcache = $sth->fetchall_arrayref( undef, 100 ) ) {
    while ( my $row = shift( @{$rowcache} ) ) {
      my $v_id       = $row->[0];
      my $name       = $row->[1];
      my $snp_source = $sources->{ $row->[2] };
      my $extra      = '';
      my %synonyms;
      my @K;
      my $syn_c = 0;
      foreach my $syn (split /,/, @$row->[4]) {
  my ($id,$sname) = split / /,$syn;
  $synonyms{$sname} = $sources->{$id};
  $syn_c++;
      }
      my (@K, $synonym_text);
      foreach my $syn (keys %synonyms) {
  push @K, $syn;
  $synonym_text .= $synonyms{$syn}.':'.$syn.' ';
      }

    my $x = $snp_extra{$v_id};
      if( $x ) {
  push @K,$x->[1],$x->[3],$x->[4];
  $extra = '. It is associated with following phenotypes:'. $x->[4].','.$x->[5];
  if( $x->[1] ) {
    $extra .= sprintf ', through %s', $x->[1];
  }
  if( $x->[2] ) {
    $extra .= sprintf ', through study(s): %s, and associated with following gene(s): %s', $x->[2], $x->[3];
  }
      }
      $extra .= '. ' if $extra;
      $extra .= $row->[3] if $row->[3];
      print O join "\t",
  "$snp_source Variation",
  $name,
        "/$conf->{'species'}/Variation/Summary?source=$snp_source;v=$name",
  "$name @K",
  sprintf( "A %s Variation %s. %s\n",
          $snp_source,
          $syn_c > 1 ? "with $syn_c synonyms: $synonym_text"
        : $syn_c     ? "with one synonym: $synonym_text"
  :              "with no synonyms",
        $extra
      );
    }
  }

 $sth = $conf->{'dbh'}->prepare(qq(
    SELECT v.variation_id, v.name, v.source_id, GROUP_CONCAT(vs.source_id, ' ', vs.name)
      FROM $VarDB.variation v LEFT JOIN
           $VarDB.variation_feature vf USING (variation_id) LEFT JOIN
           $VarDB.failed_variation fv USING (variation_id) LEFT JOIN
           $VarDB.variation_synonym vs USING (variation_id)
     WHERE vf.variation_feature_id IS NULL
       AND fv.variation_id IS NULL
  GROUP BY v.variation_id
  ORDER BY v.variation_id ASC));

  $sth->execute() or die "Error:", $DBI::errstr;
  while ( my $rowcache = $sth->fetchall_arrayref( undef, 10_000 ) ) {
    while ( my $row = shift( @{$rowcache} ) ) {
      my $v_id       = $row->[0];
      my $name       = $row->[1];
      my $snp_source = $sources->{ $row->[2] };
      my $extra      = '';
      my %synonyms;
      my @K;
      my $syn_c = 0;
      foreach my $syn (split /,/, @$row->[3]) {
  my ($id,$sname) = split / /,$syn;
  $synonyms{$sname} = $sources->{$id};
  $syn_c++;
      }
      my (@K, $synonym_text);
      foreach my $syn (keys %synonyms) {
  push @K, $syn;
  $synonym_text .= $synonyms{$syn}.':'.$syn.' ';
      }

      my $x = $snp_extra{$v_id};
      if( $x ) {
  push @K,$x->[1],$x->[3],$x->[4];
  $extra = '; and is associated with following phenotypes:'. $x->[4].','.$x->[5];
  if( $x->[1] ) {
    $extra .= sprintf ', through %s', $x->[1];
  }
  if( $x->[2] ) {
    $extra .= sprintf ', through study(s): %s, and associated with following gene(s): %s', $x->[2], $x->[3];
  }
      }
      print O join "\t",
  "$snp_source Variation",
  $name,
        "/$conf->{'species'}/Variation/Summary?source=$snp_source;v=$name",
  "$name @K",
  sprintf( "A %s Variation %s. %s\n",
          $snp_source,
          $syn_c > 1 ? "with $syn_c synonyms: $synonym_text"
        : $syn_c     ? "with one synonym: $synonym_text"
  :              "with no synonyms",
        $extra
      );
    }
  }

  close O;
  close S;
  close P;
}

sub dumpStructuralVariations {
  my $conf = shift;
  my $VarDB = $conf->{'dbs'}->{'DATABASE_VARIATION'};
  return unless $VarDB;

  my $species = $conf->{'species'};
  my $scale           = $SD->ENSEMBL_GENOME_SIZE || 1;
  my $max_length      = $scale *= 1e6;
  my $T = $conf->{'dbh'}->selectall_arrayref("
    select v.structural_variation_id,
           v.variation_name,
           s.name,
           s.description,
           r.name,
           v.seq_region_start,
           v.seq_region_end
      from $VarDB.structural_variation as v, $VarDB.source as s, $VarDB.seq_region as r
     where s.source_id = v.source_id and r.seq_region_id = v.seq_region_id
     order by v.structural_variation_id"
  );

  open A, ">$conf->{'directory'}/StructuralVariations.txt" unless scalar @$T == 0;

  foreach my $row ( @$T ) {
    my $r = $row->[4] .":" . $row->[5] ."-". $row->[6];
    my $length = $row->[6] - $row->[7];
    my $action = 'View';
    my $view_config = 'contigviewbottom=variation_feature_structural=normal;';
    if ($length >> $max_length) {
      $action = 'Overview';
      $view_config = 'cytoview=variation_feature_structural=normal;';
    }
    print A join "\t", (INC_SPECIES?"$species\t":"")."Structural Variation",
    $row->[1], "/$species/Location/$action?r=$r;sv=$row->[1];$view_config",
    $row->[1], qq(A structural variation from $row->[2], identified by $row->[3].<br />This variation has been mapped to chromosome $row->[4], from $row->[5] to $row->[6].\n );
  }
  close A;
}


########################################################################## Markers

sub dumpMarker {
  my $conf = shift;
  my $sth = $conf->{'dbh'}->prepare("
   SELECT m.marker_id, ms2.name, ms1.name
     FROM marker_synonym as ms1, marker as m
LEFT JOIN marker_synonym as ms2 on ms2.marker_synonym_id = m.display_marker_synonym_id
    WHERE ms1.marker_id = m.marker_id
 ORDER BY m.marker_id
  ");

  $sth->execute( );

  open O, ">$conf->{'directory'}/Marker.txt";
  my $counter    = 0;
  my $desc       = '';
  my $old_ID = 0;
  my ($ID,$marker,$synonym);
  while( ($ID,$marker,$synonym) = $sth->fetchrow_array()){
    $marker = $synonym unless $marker;
    if($ID == $old_ID) {
      $desc .= " $synonym";
      $counter++;
    } else {
      print O &markerLine( $conf->{'species'},$desc,$counter);
      $desc    = "$synonym";
      $old_ID  = $ID;
      $counter = 1;
    }
  }
  print O &markerLine( $conf->{'species'},$desc,$counter);
  close O;
}

sub markerLine {
  my( $species, $IDS, $counter ) = @_;
  return if $counter == 0;
  my @synonyms = sort split /\s+/, $IDS;
  my $key = pop @synonyms;
  return join "\t",
    (INC_SPECIES?"$species\t":"").qq(Ensembl Marker),    ## Type
    qq($key),
    qq(/$species/Marker/Details?m=$key;contigviewbottom=marker_core_marker=normal),
    qq($IDS),
    qq(A marker with $counter synonyms ($IDS)\n);
}

########################################################################## Similarity features

sub dumpGenomicAlignment {
  my  $conf = shift;
  open D, ">$conf->{'directory'}/GenomicAlignment.txt";

  my $species = $conf->{'species'};
  my $COREDB = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $ESTDB  = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};
  my $CDNADB = $conf->{'dbs'}->{'DATABASE_CDNA'};
  my %dbs    = ( 'core' => $COREDB );
  $dbs{'otherfeatures'}  =  $ESTDB  if $ESTDB;
  $dbs{'cdna'} = $CDNADB if $CDNADB;
  my %tables = (
    'dna_align_feature'     => [ 'DnaAlignFeature',     'DNA alignment feature' ],
    'protein_align_feature' => [ 'ProteinAlignFeature', 'Protein alignment feature' ]
  );
  my %errors = ();
  my $core_features;
  foreach my $db ( sort { $b cmp $a } keys %dbs) {
    my $DB_NAME = $dbs{$db};
    foreach my $table ( keys %tables ) {
      my $source = $tables{ $table }[0];
      $source .= ";db=$db" unless $db eq 'core';
      my $type   = $tables{ $table }[1];
      my $sth    = $conf->{'dbh'}->prepare(
	"select a.logic_name, ad.display_label, t.hit_name, t.seq_region_start, t.seq_region_end, count(*) as hits
                from ($DB_NAME.analysis as a, $DB_NAME.$table as t) left join
                     $DB_NAME.analysis_description as ad on ad.analysis_id = a.analysis_id
               where a.analysis_id = t.analysis_id
               group by a.logic_name, t.hit_name");
      $sth->execute();
      my $c = 0;
      while( my( $logic_name, $label, $hid, $sr_start, $sr_end, $count  ) = $sth->fetchrow_array ) {
	$c++;
	print D join "\t",
	  (INC_SPECIES?"$species\t":"")."$type: ".($label||$logic_name),
	    $hid,
	    "/$species/Location/Genome?ftype=$source;id=$hid",
	    $hid,
	    qq($logic_name $hid hits the genome in $count locations.\n);
      }
#      warn "Dumped $c genomic alignments from $DB_NAME $table";
    }
    if ($db eq 'cdna') { #need to remove url when rendering the page since we have no where to show them!
      my $sth    = $conf->{'dbh'}->prepare(
	"select uo.identifier, a.logic_name, ur.summary_description, 'Unmapped feature'
                from $DB_NAME.analysis a, $DB_NAME.unmapped_object uo, $DB_NAME.unmapped_reason ur 
               where a.analysis_id = uo.analysis_id
                 and uo.unmapped_reason_id = ur.unmapped_reason_id");
      $sth->execute();
      while (my ($name, $ln, $desc, $type) = $sth->fetchrow_array ) {
	$ln =~ s/_/ /;
	print D join "\t",
	  (INC_SPECIES?"$species\t":"")."$type: ",
	    "$ln $name",
	    "/$species/Location/Genome?ftype=UnmappedObject;id=$name;db=$db",
	    $name,
	    "Reason: $desc\n";
      }
    }
  }
  close D;
}

sub dumpOligoProbe {
  my $conf = shift;
  open A, ">$conf->{'directory'}/OligoProbe.txt";
  my $species = $conf->{'species'};
  my $FUNCDB = $conf->{'dbs'}->{'DATABASE_FUNCGEN'};
  if ($FUNCDB) {
    my $sth    = $conf->{'dbh'}->prepare(
      "select ps.name, count(distinct pf.probe_feature_id) as hits, a.vendor
        from $FUNCDB.probe_set ps, $FUNCDB.probe p, $FUNCDB.probe_feature pf, $FUNCDB.array_chip ac, $FUNCDB.array a, $FUNCDB.status s, $FUNCDB.status_name sn
      where sn.name='MART_DISPLAYABLE'
        and sn.status_name_id=s.status_name_id
        and s.table_name='array'
        and s.table_id=a.array_id
        and ps.probe_set_id = p.probe_set_id
        and p.probe_id = pf.probe_id
        and p.array_chip_id = ac.array_chip_id
        and ac.array_id = a.array_id
      group by ps.name, a.vendor"
    );
    $sth->execute();
    while( my( $hid, $count, $type ) = $sth->fetchrow_array ) {
      next unless $hid;
      print A join "\t", (INC_SPECIES?"$species\t":"")."$type Probe set",
	$hid, "/$species/Location/Genome?ftype=ProbeFeature;fdb=funcgen;ptype=pset;id=$hid;",
	  $hid, qq($type probeset $hid hits the genome in $count locations.\n);
    }

    my $sth    = $conf->{'dbh'}->prepare(
      "select group_concat(distinct p.name), count(distinct pf.probe_feature_id) as hits, a.vendor
        from $FUNCDB.probe p, $FUNCDB.probe_feature pf, $FUNCDB.array_chip ac, $FUNCDB.array a, $FUNCDB.status s, $FUNCDB.status_name sn
      where sn.name='MART_DISPLAYABLE'
        and sn.status_name_id=s.status_name_id
        and s.table_name='array'
        and s.table_id=a.array_id
        and p.probe_set_id is NULL
        and p.probe_id = pf.probe_id
        and p.array_chip_id = ac.array_chip_id
        and ac.array_id = a.array_id
      group by p.probe_id, a.vendor"
    );
    $sth->execute();
    while( my( $hid, $count, $type ) = $sth->fetchrow_array ) {
      next unless $hid;
      print A join "\t", (INC_SPECIES?"$species\t":"")."$type Probe",
  $hid, "/$species/Location/Genome?ftype=ProbeFeature;fdb=funcgen;ptype=probe;id=$hid;",
    $hid, qq($type probe $hid hits the genome in $count locations.\n);
    }
  }
  close A;
}


########################################################################## Diseases

sub dumpDisease {
  my $conf = shift;
  my $DISDB   = $conf->{'dbs'}->{'ENSEMBL_DISEASE'};    
  return unless $DISDB;
  my $COREDB   = $conf->{'dbs'}->{'DATABASE_CORE'};    
  my $sth = $conf->{'dbh'}->prepare(
    "select distinct concat( g.gene_symbol,':',g.omim_id, ' (',gsi.stable_id,')') as name,
            d.disease, d.disease
       from $DISDB.disease as d,
            $DISDB.gene as g,
            $COREDB.xref as cx, 
            $COREDB.object_xref as cox, 
            $COREDB.translation as tr,
            $COREDB.transcript as t,
            $COREDB.gene_stable_id as gsi 
      where d.id = g.id and cx.display_label = g.omim_id and cx.external_db_id = 1500 and
            cox.xref_id = cx.xref_id and cox.ensembl_id = tr.translation_id and
            tr.transcript_id = t.transcript_id and gsi.gene_id = t.gene_id
      order by disease, g.omim_id, g.gene_symbol"
  );
  $sth->execute();
  open O, ">$conf->{'directory'}/Disease.txt";
  my $old_omim = '';
  my $IDS  = '';
  my $description = '';
  my $old_desc = '';
  my ($gene, $omim, $desc );
  while( ($gene, $omim, $desc ) = $sth->fetchrow_array()){
    if($omim eq $old_omim) {
      $IDS .= " $gene";
      $description .=", $gene";
    } else {
      print O &diseaseLine($conf->{'species'}, $old_omim, $IDS, $description );
      $description = "$desc Genes: $gene";
      $IDS  = "$gene $desc";
      $old_omim = $omim;
    }
  }
  print O &diseaseLine($conf->{'species'}, $old_omim, $IDS, $description );
  close O;
}

use CGI qw(escape escapeHTML);
sub diseaseLine {
  my($species, $did, $IDS, $description) = @_;
  return if $description eq '';
  return join "\t",
    (INC_SPECIES?"$species\t":"").qq(OMIM disease),    ## Type
    escapeHTML($did),               ## ID
    qq(/$species/featureview?type=Disease;id=).escape($did),
    qq($did $IDS),     ## Index locations,
    qq(OMIM - $description\n);
}

########################################################################## Domains

sub dumpDomain {
  my $conf = shift;
  my $COREDB = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $t_sth = $conf->{'dbh'}->prepare(
      "select i.interpro_ac
       from $COREDB.interpro i, $COREDB.protein_feature pf, $COREDB.translation tl, $COREDB.transcript_stable_id tsi
       where i.id = pf.hit_name
       and pf.translation_id = tl.translation_id
       and tl.transcript_id = tsi.transcript_id
       group by i.interpro_ac");
  $t_sth->execute;
  my $sth = $conf->{'dbh'}->prepare(
    "select x.dbprimary_acc, i.id, x.description
       from $COREDB.xref as x, $COREDB.interpro as i
      where x.dbprimary_acc = i.interpro_ac
      order by x.dbprimary_acc");
  $sth->execute();
  open O, ">$conf->{'directory'}/Domain.txt";
  my $old_acc     = '';
  my $IDS         = '';
  my $description = '';
  my $counter     = 0;
  my ($acc, $id, $desc, $old_desc );
  while( ($acc, $id, $desc ) = $sth->fetchrow_array()){
    if($acc eq $old_acc) {
      $IDS         .= " $id";
      $description .= ", $id";
      $counter++;
    }
    else {
      print O &domainLine($conf->{'species'}, $old_acc, $IDS, $old_desc, $counter, $description);
      $description = $id;
      $IDS         = $id;
      $old_acc     = $acc;
      $old_desc    = $desc;
      $counter     = 1;
    }
  }
  print O &domainLine( $conf->{'species'}, $old_acc, $IDS, $old_desc, $counter, $description );
  close O;
}

sub domainLine {
  my($species, $did, $IDS, $desc, $counter, $description ) = @_;
  return if $did eq '';
  return join "\t",
    (INC_SPECIES?"$species\t":"").qq(Interpro domain),    ## Type
    qq($did),               ## ID
    qq(/$species/Location/Genome?ftype=Domain;id=$did), ## URL
    qq($did $IDS $desc),     ## Index locations
    qq(InterPro domain $did [$desc] has $counter associated external database identifiers: $description.\n);   ## Description text

}
########################################################################## Families

sub dumpFamily {
  my $conf = shift;
  my $FAMDB = $conf->{'dbs'}->{'DATABASE_COMPARA'};
  my $CORE  = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $t_sth = $conf->{'dbh'}->prepare(
    "select meta_value from $CORE.meta where meta_key='species.taxonomy_id'"
  );
  $t_sth->execute;
  my $taxon_id = ($t_sth->fetchrow);
  return unless $taxon_id;

  my $db_species = $conf->{'species'};
  (my $species = $conf->{'species'} ) =~ s/_/ /g;

  my $sth = $conf->{'dbh_compara'}->prepare( "
select f.stable_id,
       f.description,
       count(*) as N,
       sum( (source_name = 'ENSEMBLGENE') and (m.genome_db_id = gd.genome_db_id) ) as ensembl_genes_species,
       sum( (source_name = 'ENSEMBLPEP' ) and (m.genome_db_id = gd.genome_db_id) ) as ensembl_peptides_species,
       sum(  source_name = 'ENSEMBLGENE'                                         ) as ensembl_genes,
       sum(  source_name = 'ENSEMBLPEP'                                          ) as ensembl_peptides,
       sum( (source_name = 'Uniprot/SPTREMBL' ) and (m.taxon_id = gd.taxon_id)   ) as uniprot_sptrembl_species,
       sum( (source_name = 'Uniprot/SWISSPROT') and (m.taxon_id = gd.taxon_id)   ) as uniprot_swissprot_species,
       sum(  source_name = 'Uniprot/SPTREMBL'                                    ) as uniprot_sptrembl,
       sum(  source_name = 'Uniprot/SWISSPROT'                                   ) as uniprot_swissprot
  from family as f, family_member as fm, member as m, genome_db as gd
 where f.family_id=fm.family_id and fm.member_id=m.member_id and
       gd.name = '$db_species'
 group by f.family_id 
having ensembl_genes_species > 0"
  );
  $sth->execute();
  open O, ">$conf->{'directory'}/Family.txt";
  my $X = $conf->{'authority'} || 'Ensembl';
  while( my(
    $fid, $desc, $total, 
    $ensembl_genes_species, $ensembl_peptides_species, $ensembl_genes, $ensembl_peptides,
    $uniprot_sptrembl_species, $uniprot_swissprot_species,
    $uniprot_sptrembl, $uniprot_swissprot
  ) = $sth->fetchrow_array()) {
    print O join "\t",
(INC_SPECIES?"$species\t":"").qq(Ensembl protein family),    ## Type
        qq($fid),               ## ID
        qq(/$conf->{'species'}/Gene/Family/Genes?family=$fid), ## URL,
        qq($fid $desc), ## keywords...
        qq(Ensembl protein family $fid [$desc] has $total members: $ensembl_genes $X genes ($ensembl_genes_species in $species); $ensembl_peptides $X proteins ($ensembl_peptides_species in $species); $uniprot_swissprot UniProtKB/Swiss-Prot proteins ($uniprot_swissprot_species in $species); $uniprot_sptrembl UniProtUK/TrEMBL proteins ($uniprot_sptrembl_species in $species).\n); ## Description text
  }
  close O;
}

sub familyLine {
    my($species, $fid, $IDS, $desc, $counter, $description) = @_;
    return if $fid eq '';
    return join "\t",
        (INC_SPECIES?"$species\t":"").qq(Ensembl protein family),    ## Type
        qq($fid),               ## ID
        qq(/$species/familyview?family=$fid), ## URL
        qq($fid $IDS $desc),     ## Index locations
        qq(Ensembl protein family $fid [$desc] has $counter members: $description.\n);   ## Description text
}
 
########################################################################## Genes

sub dumpGene {
  use Data::Dumper;
  my $conf = shift;
  my $COREDB = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $SANGDB = $conf->{'dbs'}->{'DATABASE_VEGA'};
  my $ESTGENEDB = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};
  my %DBS = ( 'core' => $COREDB );
#  $DBS{'vega'}          = $SANGDB    if $SANGDB;   #commented out since we don't need these in e60!
  $DBS{'otherfeatures'} = $ESTGENEDB if $ESTGENEDB;
  open O, ">$conf->{'directory'}/Gene.txt";
  my $core_xrefs;
 DB:
  foreach my $DB ( sort keys %DBS ) {
    my $gene_count = 0;
    warn "STARTING... $DB";
    my $DBNAME = $DBS{$DB};
    my $extra = $DB ne 'core' ? ";db=$DB" : '';
    my %xrefs = ();
    my %xrefs_desc = ();

    foreach my $type( qw(Gene Transcript Translation) ) {
      my $T = $conf->{'dbh'}->selectall_arrayref(
        "select ox.ensembl_id,
                x.display_label, x.dbprimary_acc, ed.db_display_name, es.synonym, x.description
           from ($DBNAME.object_xref as ox, $DBNAME.xref as x, $DBNAME.external_db as ed) left join $DBNAME.external_synonym as es on es.xref_id = x.xref_id
          where ox.ensembl_object_type = '$type' and ox.xref_id = x.xref_id and x.external_db_id = ed.external_db_id"
      );
      foreach ( @$T ) {
        $xrefs{$type}{$_->[0]}{$_->[3]}{$_->[1]} =1 if $_->[1];
        $xrefs{$type}{$_->[0]}{$_->[3]}{$_->[2]} =1 if $_->[2];
        $xrefs{$type}{$_->[0]}{$_->[3]}{$_->[4]} =1 if $_->[4];
        $xrefs_desc{$type}{$_->[0]}{$_->[5]}      =1 if $_->[5];
      }
      warn "XREF $type query done";
    }
    my %exons = ();
#    $conf->{'dbh'}->disconnect;
#    $conf->{'dbh'} = DBI->connect( "dbi:mysql:homo_sapiens_core_60_37e;host=ensdb-1-13;port=5307", 'ensadmin', 'ensembl' );
    my $T = $conf->{'dbh'}->selectall_arrayref(
      "select distinct t.gene_id, esi.stable_id
         from $DBNAME.transcript as t, $DBNAME.exon_transcript as et, $DBNAME.exon_stable_id as esi
        where t.transcript_id = et.transcript_id and et.exon_id = esi.exon_id"
    );
    foreach( @$T ) {
      $exons{$_->[0]}{$_->[1]}=1;
    }
    my $gene_info = $conf->{'dbh'}->selectall_arrayref("
      select gsi.gene_id, tsi.transcript_id, trsi.translation_id,
             gsi.stable_id as gsid, tsi.stable_id as tsid, trsi.stable_id as trsid,
             g.description, ed.db_display_name, x.dbprimary_acc,x.display_label, ad.display_label, ad.description, g.source, g.status, g.biotype
        from (((( $DBNAME.gene_stable_id as gsi, $DBNAME.gene as g,
             $DBNAME.transcript_stable_id as tsi,
             $DBNAME.analysis as a,
             $DBNAME.analysis_description as ad,
             $DBNAME.transcript as t) left join
             $DBNAME.translation as tr on t.transcript_id = tr.transcript_id) left join
             $DBNAME.translation_stable_id as trsi on tr.translation_id = trsi.translation_id) left join
             $DBNAME.xref as x on g.display_xref_id = x.xref_id) left join
             $DBNAME.external_db as ed on ed.external_db_id = x.external_db_id
       where t.gene_id = gsi.gene_id and t.transcript_id = tsi.transcript_id and t.gene_id = g.gene_id and g.analysis_id = a.analysis_id and a.analysis_id = ad.analysis_id AND a.logic_name != 'LRG_import'
       order by gsi.stable_id, tsi.stable_id
    ");
    warn "Gene info query done";
    my $old;
    foreach my $row (@$gene_info) {
      my($g,$t,$tr,$gs,$ts,$trs,$d,$ddb,$dpa,$dn,$a,$ad,$s,$st,$bt) = @$row;
      if ($DB eq 'core') {
	$core_xrefs->{$dn}{'display_xref'} = $ddb;
      }
      if( $old->{'g'} != $g ) {
        if( $old->{'g'} ) {
          print O &geneLine( $conf->{'species'},
			     $old->{'ex'},
			     $old->{'s'}.' '.$old->{'bt'},
			     $old->{'gs'},
			     $old->{'alt'},
			     $extra,
			     join( ' ', grep{$_}keys %{$old->{'i'}}),
			     $old->{'t'},
			     $old->{'tr'},
			     $old->{'e'},
			     $old->{'d'} );
	  $gene_count++;
        }
        $old = {
          'g'   => $g,
          'gs'  => $gs,
          'd'   => $d,
          'tr'  => {$trs?($trs=>1):()},
          't'   => {$ts ?($ts=>1) :()},
          'ex'  => {},
          'e'   => {},
          'i'   => {$gs=>1,$ts=>1,$trs=>1}, 
          'alt' => ($DB ne 'vega' && $dn) ? "($ddb: $dn)" : $dn ? "($dn)" : "(novel gene)",
          'a'   => $a,
          'ad'  => $ad,
          's'   => ucfirst($s),
          'st'  => $st,
          'bt'  => $bt,
	  'dn'  => $dn,
        };
	if (   ($DB ne 'vega')
	    || ($core_xrefs->{$dn}{'display_xref'})
            || ($core_xrefs->{$dn}{'display_xref'} !~ /'Clone_based'/) ) {
	  $old->{'i'}->{$d}=1;
	  $old->{'i'}->{$dpa}=1;
	  $old->{'i'}->{$dn}=1;
	}
        $old->{'ex'} = $exons{$g};
        foreach my $K (keys %{$exons{$g}}) { $old->{'i'}{$K} = 1; }
        foreach my $db( keys %{$xrefs{'Gene'}{$g}||{}} ) {
          foreach my $K( keys %{$xrefs{'Gene'}{$g}{$db}} ) {
	    $old->{'e'}{$db}{$K}=1;	
	    &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xrefs');
	  }
        }
        foreach my $db( keys %{$xrefs{'Transcript'}{$t}||{}} ) {
          foreach my $K( keys %{$xrefs{'Transcript'}{$t}{$db}} ) {
	    $old->{'e'}{$db}{$K}=1;	
	    &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xrefs');
	  }
        }
        foreach my $db( keys %{$xrefs{'Translation'}{$tr}||{}} ) {
          foreach my $K( keys %{$xrefs{'Translation'}{$tr}{$db}} ) {
	    $old->{'e'}{$db}{$K}=1;	
	    &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xrefs');
	  }
        }
	foreach my $K( keys %{$xrefs_desc{'Translation'}{$tr}} ) {
	  &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xref_desc');
	}
	foreach my $K( keys %{$xrefs_desc{'Transcript' }{$t }} ) {
	  &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xref_desc');
	}
	foreach my $K( keys %{$xrefs_desc{'Gene'       }{$g }} ) {
	  &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xref_desc');
	}
      } else {
        $old->{'t'}{$ts}=1;
        $old->{'tr'}{$trs}=1;
        $old->{'i'}{$ts}=1;
        $old->{'i'}{$trs}=1;
	foreach my $db( keys %{$xrefs{'Transcript'}{$t}||{}} ) {
          foreach my $K( keys %{$xrefs{'Transcript'}{$t}{$db}} ) {
	    $old->{'e'}{$db}{$K}=1;	
	    &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xrefs');
	  }
        }
        foreach my $db( keys %{$xrefs{'Translation'}{$tr}||{}} ) {
          foreach my $K( keys %{$xrefs{'Translation'}{$tr}{$db}} ) {
	    $old->{'e'}{$db}{$K}=1;	
	    &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xrefs');
	  }
        }
	foreach my $K( keys %{$xrefs_desc{'Translation'}{$tr}} ) {
	  &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xref_desc');
	}
	foreach my $K( keys %{$xrefs_desc{'Transcript' }{$t }} ) {
	  &check_and_store($DB,$dn,$K,$core_xrefs,$old,'xref_desc');
	}
      }
    }
    print O &geneLine( $conf->{'species'},
		       $old->{'ex'},
		       $old->{'s'}.' '.$old->{'bt'},
		       $old->{'gs'},
		       $old->{'alt'},
		       $extra,
		       join( ' ', grep{$_}keys %{$old->{'i'}}),
		       $old->{'t'},
		       $old->{'tr'},
		       $old->{'e'},
		       $old->{'d'} );
    $gene_count++;
    print STDOUT "$gene_count $DB genes indexed\n";
  }
#  warn Dumper($core_xrefs->{'ARSE'});
  my $other_count = 0;
  my %current_stable_ids =();
  foreach my $type (qw(gene transcript translation)) {
    $current_stable_ids{$type}  = {map {@$_} @{$conf->{'dbh'}->selectall_arrayref( "select stable_id,1 from $COREDB.".$type."_stable_id" )}};
  }
  my $species = $conf->{'species'};
  my $sth = $conf->{'dbh'}->prepare( qq(
    select sie.type, sie.old_stable_id, if(isnull(sie.new_stable_id),'NULL',sie.new_stable_id), ms.old_release*1.0 as X, ms.new_release*1.0 as Y
      from $COREDB.mapping_session as ms, $COREDB.stable_id_event as sie
     where ms.mapping_session_id = sie.mapping_session_id and ( old_stable_id != new_stable_id or isnull(new_stable_id) )
     order by Y desc, X desc
  ));

  $sth->execute();
  my %mapping = ();
  while( my($type,$osi,$nsi) = $sth->fetchrow_array() ) {
    next if $current_stable_ids{$type}{$osi}; ## Don't need to cope with current IDS already searchable...
    $mapping{$type}{$osi}{$nsi}=1;
    if($mapping{$type}{$nsi}) {
      foreach( keys %{$mapping{$type}{$nsi}} ) {
        $mapping{$type}{$osi}{$_}=1;
      }
    }
  }
  foreach my $type ( keys %mapping ) {
    my $url = $type eq 'gene' ? 'Gene/Idhistory?g'
	    : $type eq 'transcript' ? 'Transcript/Idhistory?t'
	    : 'Transcript/Idhistory/Protein?protein';
    foreach my $osi ( keys %{$mapping{$type}} ) {
      my @current_sis = ();
      my @deprecated_sis = ();
      foreach ( keys %{$mapping{$type}{$osi}} ) {
	next if $osi eq $_;
        if( $current_stable_ids{$_} ) {
          push @current_sis,$_;
        } elsif( $_ ne 'NULL' ) {
          push @deprecated_sis,$_;
        }
      }
      if( @current_sis ) {
	$other_count++;
        print O join( "\t",
          (INC_SPECIES?"$species\t":"").qq(Ensembl $type),
          qq($osi),
          qq(/$species/$url=$osi),
          join( ' ', $osi, @current_sis, @deprecated_sis ),
          qq($type $osi is no longer in the Ensembl database but it has been mapped to the following current identifiers: @current_sis).
          ( @deprecated_sis ? "; and the following deprecated identifiers: @deprecated_sis" : '').
          ".\n"
        );
      } elsif( @deprecated_sis ) {
	$other_count++;
        print O join( "\t",
          (INC_SPECIES?"$species\t":"").qq(Ensembl $type),
          qq($osi),
          qq(/$species/$url=$osi),
          join( ' ', $osi, @deprecated_sis ),
          qq($type $osi is no longer in the Ensembl database but it has been mapped to the following identifiers: @deprecated_sis\n)
        );
      } else {
	$other_count++;
        print O join( "\t",
          (INC_SPECIES?"$species\t":"").qq(Ensembl $type),
          qq($osi),
          qq(/$species/$url=$osi),
          qq($osi),
          qq($type $osi is no longer in the Ensembl database and it has not been mapped to any newer identifiers\n)
        );
      }
    }
  }
  print STDOUT "$other_count other stable IDS indexed\n";
  $other_count = 0;
  my %unmapped_queries = (
    'None' => qq(
      select a.logic_name, e.db_display_name,
             uo.identifier, ur.summary_description,
             'Not mapped'
        from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo,
             $COREDB.unmapped_reason as ur
       where a.analysis_id = uo.analysis_id and 
             uo.external_db_id = e.external_db_id and
             uo.unmapped_reason_id = ur.unmapped_reason_id and
               uo.ensembl_id = 0
    ),
    'Transcript' => qq(
      select a.logic_name, e.db_display_name,
             uo.identifier, ur.summary_description,
             concat( 'Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id )
        from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo,
             $COREDB.unmapped_reason as ur, $COREDB.transcript_stable_id as tsi,
             $COREDB.transcript as t, $COREDB.gene_stable_id as gsi
       where a.analysis_id = uo.analysis_id and 
             uo.external_db_id = e.external_db_id and
             uo.unmapped_reason_id = ur.unmapped_reason_id and
             uo.ensembl_id = t.transcript_id and
             uo.ensembl_object_type = 'Transcript' and
             t.transcript_id = tsi.transcript_id and
             t.gene_id       = gsi.gene_id
    ),
    'Translation' => qq(
      select a.logic_name, e.db_display_name, uo.identifier, ur.summary_description,
             concat( 'Protein: ',trsi.stable_id,'; Transcript: ', tsi.stable_id, '; Gene: ',gsi.stable_id )
        from $COREDB.analysis as a, $COREDB.external_db as e, $COREDB.unmapped_object as uo,
             $COREDB.unmapped_reason as ur, $COREDB.transcript_stable_id as tsi,
             $COREDB.translation as tr, $COREDB.translation_stable_id as trsi,
             $COREDB.transcript as t, $COREDB.gene_stable_id as gsi
       where a.analysis_id = uo.analysis_id and 
             uo.external_db_id = e.external_db_id and
             uo.unmapped_reason_id = ur.unmapped_reason_id and
             uo.ensembl_id = tr.translation_id and 
             tr.transcript_id = t.transcript_id and
             trsi.translation_id = tr.translation_id and
             uo.ensembl_object_type = 'Translation' and
             t.transcript_id = tsi.transcript_id and
             t.gene_id       = gsi.gene_id
    )
  );

  %unmapped_queries = (); #don't do these for now

  foreach my $FLAG (keys %unmapped_queries) {
    my $SQL = $unmapped_queries{$FLAG};
    my $sth = $conf->{'dbh'}->prepare($SQL);
    $sth->execute;
    while( my $T = $sth->fetchrow_arrayref() ) {
      print O join "\t",
        (INC_SPECIES?"$conf->{'species'}\t":"").qq(Unmapped feature),
        "$T->[1] $T->[2]",
        "/$conf->{'species'}/Location/Genome?ftype=Gene;id=$T->[2]",
        "$T->[2] $T->[4]",
        "$T->[3]; $T->[4]\n";
    }
  }
  close O;
}


sub check_and_store {
  my ($DB,$dn,$K,$core_xrefs,$old,$rec_type) = @_;
  if ($DB eq 'core') {
    $old->{'i'}{$K}=1;
    push @{$core_xrefs->{$dn}{$rec_type}}, $K;
  }

#enable filtering here
  elsif (0) {
#  elsif ($DB eq 'vega' ) {
    if (   ($core_xrefs->{$dn}{'display_xref'} !~ /'Clone_based'/)
        || (! grep {$_ eq $K} @{$core_xrefs->{$dn}{$rec_type}})
        || ($K =~ /OTT|ENS/)) {
      $old->{'i'}{$K}=1;
    }
  }
  else { $old->{'i'}{$K}=1; }
}

sub geneLine {
  my ($species, $exons, $X, $gid, $altid, $extra, $IDX, $transcripts, $peptides, $external_identifiers, $description )=@_;
  return if $gid eq '';
  my $T = scalar (keys %$transcripts);
  my $P = scalar (keys %$peptides);
  my $E = scalar (keys %$exons);
  my $EX = scalar (keys %$external_identifiers);
  my $desc = $description ? "Description: $description" : '';
  $IDX =~ s/\n/ /g;
#  if ($gid =~ /ENSG00000157399|OTTHUMG00000137358/) {
#    warn "indexing $gid with ",Data::Dumper::Dumper($IDX);
#  }
#  $EX =~ s/\n/ /g;
  return join( "\t",
    (INC_SPECIES?"$species\t":"").qq($X Gene),
    qq($gid $altid),
    qq(/$species/Gene/Summary?g=$gid$extra),
    qq($IDX),
    qq(<br />$desc\n)
    );
}

########################################################################## Sequences

sub dumpSequence {
  my $conf = shift;
  my $sanger = sanger_project_names( $conf );
  my %config = (
    "Homo_sapiens" => [
      [ 'Clone',      'tilepath,cloneset_1mb,cloneset_30k,cloneset_32k', 'name,well_name,clone_name,synonym,embl_acc,sanger_project,alt_well_name,bacend_well_name' ],
      [ 'NT Contig',  'ntctgs',                  'name' ],
      [ 'Encode region', 'encode', 'name,synonym,description' ],
    ],
    "Mus_musculus" => [
      [ 'BAC',        'cloneset_0_5mb,cloneset_1mb,bac_map,tilingpath_cloneset', 'embl_acc,name,clone_name,well_name,synonym,alt_embl_acc' ],
      [ 'Fosmid',     'fosmid_map', 'name,clone_name' ],
      [ 'Supercontig','superctgs', 'name' ],
    ],
    "Anopheles_gambiae" => [
      [ 'BAC' ,       'bacs',        'name,synonym,clone_name' ],
      [ 'BAC band' ,  'bacs_bands',  'name,synonym,clone_name' ],
    ],
    "Gallus_gallus" => [
      [ 'BAC', 'bac_map', 'name,synonym,clone_name' ],
      [ 'BAC ends', 'bacends', 'name,synonym,clone_name', 'otherfeatures' ]
    ]
  );
  
  my $COREDB   = $conf->{'dbs'}->{'DATABASE_CORE'};
  my $ESTDB    = $conf->{'dbs'}->{'DATABASE_OTHERFEATURES'};

  open O, ">$conf->{'directory'}/Sequence.txt";

  my $lrgs = &retrieveLRGs($conf);
  if (@$lrgs) {
    foreach (@{&sort_lrgs($lrgs)}) {
      print O $_;
    }
  }

  my @types = @{$config{$conf->{'species'}}||[]};
  foreach my $arrayref ( @types ) {
    my( $TYPE, $mapsets, $annotationtypes,$DB ) = @$arrayref;
    my $DB = $DB eq 'otherfeatures' ? $ESTDB : $COREDB;
    my @temp         = split (',',$mapsets);
    my @mapsets;
    foreach my $X ( @temp ) {
       my $ID = $conf->{'dbh'}->selectrow_array( "select misc_set_id from $DB.misc_set where code = ?", {}, $X );
      push @mapsets, $ID if($ID);
    }
    next unless @mapsets;
    @temp = split (',',$annotationtypes);
    my @mapannotationtypes;
    foreach my $X ( @temp ) {
      my $ID = $conf->{'dbh'}->selectrow_array( "select attrib_type_id from $DB.attrib_type where code = ?", {}, $X );
      push @mapannotationtypes, $ID if($ID);
    }
    next unless @mapannotationtypes;
    my $Z = " ma.value";
    my $MAPSETS = join ',',@mapsets;
    my $sth = $conf->{'dbh'}->prepare(
      "select mf.misc_feature_id, sr.name, mf.seq_region_start, mf.seq_region_end,
              ma.value, mf.seq_region_end-mf.seq_region_start+1 as len, 
              at.code
         from $DB.misc_feature_misc_set as ms, 
              $DB.misc_feature as mf,
              seq_region   as sr,
              $DB.misc_attrib  as ma,
              $DB.attrib_type  as at 
        where mf.seq_region_id = sr.seq_region_id and mf.misc_feature_id = ms.misc_feature_id and ms.misc_set_id in ($MAPSETS) and
              mf.misc_feature_id = ma.misc_feature_id and ma.attrib_type_id = at.attrib_type_id
        order by mf.misc_feature_id, at.code"
    );
    $sth->execute();
    my ($oldtype,$old_ID,$oldchr,$old_start,$old_end,$emblaccs,$oldlen,$synonyms, $NAME );
    while( my($ID,$chr,$start,$end,$val,$len,$type) = $sth->fetchrow_array() ) {
      if($ID == $old_ID) {
        $NAME = $val if $type eq 'well_name' || $type eq 'clone_name' || $type eq 'name' || $type eq 'non_ref';
        $NAME = $val if !$NAME && $type eq 'embl_acc';
        $NAME = $val if !$NAME && $type eq 'synonym';
        $NAME = $val if !$NAME && $type eq 'sanger_project';
        $emblaccs.=", $val" if $val;
      } else {
        print O &seqLine( $conf->{'species'},$TYPE,$NAME,$oldchr,$old_start,$old_end,$emblaccs,$oldlen,$sanger) if $old_ID;
        $NAME = undef;
        $NAME = $val if $type eq 'well_name' || $type eq 'clone_name' || $type eq 'name' || $type eq 'non_ref';
        $NAME = $val if !$NAME && $type eq 'embl_acc';
        $NAME = $val if !$NAME && $type eq 'synonym';
        $NAME = $val if !$NAME && $type eq 'sanger_project';
        ($old_ID,$oldchr,$old_start,$old_end,$emblaccs,$oldlen) = ($ID,$chr,$start,$end,$val,$len);
      }
    }
    print O &seqLine( $conf->{'species'},$TYPE,$NAME,$oldchr,$old_start,$old_end,$emblaccs,$oldlen,$sanger) if $old_ID;
  }

  #identify current default top level
  (my $current_cs_id) = $conf->{'dbh'}->selectrow_array(
     "select cs.coord_system_id
        from coord_system cs, meta m
       where cs.version = m.meta_value
         and cs.name = 'chromosome'
         and m.meta_key = 'assembly.default'");

  #get mappings between toplevel assemblies
  my $sth = $conf->{'dbh'}->prepare(
     "select distinct sr1.name, sr1.length, sr2.name, cs2.coord_system_id
        from coord_system cs1, seq_region sr1, assembly a, seq_region sr2, coord_system cs2
       where cs1.coord_system_id = sr1.coord_system_id
         and sr1.seq_region_id = a.asm_seq_region_id
         and a.cmp_seq_region_id = sr2.seq_region_id
         and sr2.coord_system_id = cs2.coord_system_id
         and cs1.name = 'chromosome'
         and cs2.name = 'chromosome'");
  my $mapped_ids;
  $sth->execute;
  while (my ($new_name, $new_length, $prev_name, $prev_coord_system_id) =  $sth->fetchrow_array() ) {
    $mapped_ids->{$prev_coord_system_id}{$prev_name}{'name'} = $new_name;
    $mapped_ids->{$prev_coord_system_id}{$prev_name}{'length'} = $new_length;
  }

  #get all seq_regions
  $sth = $conf->{'dbh'}->prepare(
    "select c.name, c.length, cs.name, cs.coord_system_id
       from seq_region as c, coord_system as cs
      where c.coord_system_id = cs.coord_system_id" );
  $sth->execute();
  while( my($name,$length,$type,$cs_id) = $sth->fetchrow_array() ) {
    my ($prev_name,$prev_length);
    if ($type eq 'chromosome' && $cs_id != $current_cs_id) {
      # if an old seq_region is mapped to a new one use it...
      if ($mapped_ids->{$cs_id}{$name}) {
	$prev_name   = $mapped_ids->{$cs_id}{$name}{'name'};
	$prev_length = $mapped_ids->{$cs_id}{$name}{'length'};
	if ($prev_name ne $name) { #don't dump identically named regions
	  warn "dumping assembly mapped seq_region $prev_name instead of $name, please check it's correct since this hasn't been checked yet";
	  $name = $prev_name;
	  $length = $prev_length;
	}
      }
      else {
	#...otherwise ignore
#	warn "not dumping $name on coord_system $cs_id since it's an old assembly with no mapping";
	next;
      }
    }
    my $extra_IDS = ''; my $extra_desc = '';
    if( %{$sanger->{$name}||{}} ) {
      $extra_IDS  = join ' ', '',sort keys %{$sanger->{$name}};
      $extra_desc = " and corresponds to the following Sanger projects: ".join( ', ',sort keys %{$sanger->{$name}});
    }
    my $r   = "$name:1-$length";
    my $ids = $prev_name ? "$prev_name$extra_IDS" : "$name$extra_IDS";
    unless ($type eq 'lrg') {
      print O join "\t",
	(INC_SPECIES?"$conf->{'species'}\t":"").ucfirst($type), 
        $prev_name ? $prev_name : $name,
        ($type eq 'chromosome' && length( $name ) < 5) ? "/$conf->{'species'}/Location/Chromosome?r=$r" 
         : ($length > 0.5e6) ? "/$conf->{'species'}/Location/Overview?r=$r" 
         : "/$conf->{'species'}/Location/View?r=$r",
        "$ids",
        "$name is a @{[ucfirst($type)]} (of length $length)$extra_desc\n";
    }
  }
  close O;
}

sub seqLine {
  my($species,$type,$name,$chr,$start,$end,$val,$len,$sanger) = @_;
  my $Q = $val; $Q=~s/,//g;
  my $SCRIPT = $len > 0.5e6 ? 'Location/Overview' : 'Location/View';
  my $r = "$chr:$start-$end";
  my $extra_IDS = '';
  my $extra_desc = '';
  my %HASH;
  foreach ( split / +/,"$name $val" ) {
    foreach( keys %{$sanger->{$name}||{}} ) {
      $HASH{$_}=1;
    }
  }
  if( %HASH ) {
    $extra_IDS  = join ' ', '',sort keys %HASH;
    $extra_desc = " and corresponds to the following Sanger projects: ".join( ', ',sort keys %HASH );
  }

  return join "\t",
    (INC_SPECIES?"$species\t":"").$type,
    $name, "/$species/$SCRIPT?r=$r",
    "$name $Q$extra_IDS",
    "$type $name is mapped to Chromosome $chr, and has". (($val &&  ($val ne "")) ? " EMBL accession(s)/synonyms $val and " : "").
    " length $len bps$extra_desc\n";
}



########################################################################## LRGs

sub retrieveLRGs {
  my $conf = shift;
  return $conf->{'dbh'}->selectall_arrayref(
        qq(SELECT gsi.stable_id, x.display_label, edb.db_name, tsi.stable_id, sr.length
             FROM gene_stable_id gsi, gene g, analysis a, object_xref ox, xref x, external_db edb, transcript t, transcript_stable_id tsi, seq_region sr
            WHERE gsi.gene_id = g.gene_id
              AND g.analysis_id = a.analysis_id
              AND g.gene_id = ox.ensembl_id
              AND ox.xref_id = x.xref_id
              AND x.external_db_id = edb.external_db_id
              AND g.gene_id = t.gene_id
              AND t.transcript_id = tsi.transcript_id
              AND gsi.stable_id = sr.name
              AND ox.ensembl_object_type = 'Gene'
              AND edb.db_name = 'HGNC'
              AND a.logic_name = 'LRG_import'
            ORDER by gsi.stable_id, tsi.stable_id)
      );
}

sub sort_lrgs {
  my ($lrgs, $fh) = @_;
  my ($prev_gsi,$prev_disp_label,$prev_db_name,$prev_length);
  my $query_terms = [];
  my $to_print;
  foreach my $rec (@$lrgs) {
    my $gsi = $rec->[0];
    if ($gsi eq $prev_gsi) {
      push @$query_terms, $rec->[3];
    }
    else {
      if ($prev_gsi) {
        push @$to_print, (LRGLine($prev_gsi,$prev_disp_label,$prev_db_name,$query_terms,$prev_length));
      }
      $prev_gsi = $gsi;
      $prev_disp_label = $rec->[1];
      $prev_db_name = $rec->[2];
      $query_terms = [ $gsi, $prev_disp_label, $rec->[3] ];
      $prev_length = $rec->[4];
    }
  }
  push @$to_print, &LRGLine($prev_gsi,$prev_disp_label,$prev_db_name,$query_terms,$prev_length);
  return $to_print;
}

sub LRGLine {
  my ($gsi,$dbkey,$dbname,$tsids,$length) = @_;
  my $ids = join ' ', @$tsids;
  return join "\t",
    'LRG Sequence',
    $gsi,
    "/Homo_sapiens/LRG/Summary?lrg=$gsi",
    $ids,
    "<br />$gsi is a fixed reference sequence of length $length with a fixed transcript(s) for reporting purposes. It was created for $dbname gene $dbkey\n";
}

########################################################################## Sanger Projects

sub sanger_project_names {
  my $conf = shift;
  my %SPECIES = qw(
    Homo_sapiens Human Mus_musculus Mouse Danio_rerio Zebrafish
    Drosophila_melanogaster Drosophila Rattus_norvegicus Rat
  );
  my $sanger_species_name = $SPECIES{$conf->{'species'}};
  return {} unless $sanger_species_name;
  my $clones = {};
  my $dbh = DBI->connect("DBI:mysql:host=otterlive;port=3301;database=submissions", 'ottro', undef, {RaiseError => 1});
  unless( $dbh ) {
    warn "Can't connect to submissions database as 'read_only' ", DBI::errstr();
    return $clones;
  }
  my $sth = $dbh->prepare(
    "SELECT distinct a.project_name, a.accession 
       FROM project_acc a, project_dump d,
            sequence s, species_chromosome c
      WHERE a.sanger_id = d.sanger_id AND d.seq_id = s.seq_id
        AND s.chromosome_id = c.chromosome_id 
        AND c.species_name = '$sanger_species_name' AND a.accession != 'UNKNOWN'"
  );
  $sth->execute(  );
  while( my ($proj, $acc) = $sth->fetchrow() ) {
    $clones->{$acc}{$proj}=1;
  }
  return $clones;
}

########################################################################## Regulatory Features

sub dumpRegulatoryFeature {
  my $conf = shift;
  my $FDB = $conf->{'dbs'}->{'DATABASE_FUNCGEN'};
  return unless $FDB;
  my $species = $conf->{'species'};

  my $T = $conf->{'dbh'}->selectall_arrayref(
    "select fs.feature_set_id, fs.name as f_set, fs.display_label as set_label,
            fts.name as set_type, fts.class as set_class, fts.description as set_desc,
            a.logic_name, ad.display_label as anal_name, ad.description as anal_desc
       from ( ($FDB.feature_set as fs left join $FDB.feature_type as fts on fs.feature_type_id = fts.feature_type_id)
  left join $FDB.analysis as a on a.analysis_id = fs.analysis_id)
  left join $FDB.analysis_description as ad on ad.analysis_id = a.analysis_id" );
  my $feature_sets = {};
  foreach( @$T ) {
    $feature_sets->{ $_->[0] } = {
      'name'  => $_->[1],
      'label' => $_->[2],
      'type'  => $_->[3],
      'class' => $_->[4],
      'desc'  => $_->[5],
      'logic' => $_->[6],
      'a_name'=> $_->[7],
      'a_desc'=> $_->[8]
    };
  }
  $T = $conf->{'dbh'}->selectall_arrayref( "select * from $FDB.feature_type" );
  my $feature_types = {};
  foreach( @$T ) {
    $feature_types->{ $_->[0] } = {
      'name'  => $_->[1],
      'class' => $_->[2],
      'desc'  => $_->[3]
    };
  }


  open A, ">$conf->{'directory'}/RegulatoryFeature.txt" unless scalar @$T == 0;


  my $v = sprintf '%s_%s',
    $SD->ENSEMBL_VERSION,
    $SD->get_config( $species, 'SPECIES_RELEASE_VERSION' )
  ;
  my $a = $SD->get_config( $species, 'ASSEMBLY_NAME' );


## Regulatory features...

  my $prefix = 'ENSR';
  if ($species =~/Mus/) {$prefix = 'ENSMUSR';}
  my $d = $conf->{'dbh'}->selectall_arrayref( 
    "select concat('$prefix', lpad(rf.stable_id, 11, 0)),
            sr.name, rf.bound_seq_region_start, rf.bound_seq_region_end, ft.name
       from $FDB.regulatory_feature rf, $FDB.seq_region sr, $FDB.coord_system cs,
            $FDB.feature_type ft, $FDB.feature_set fs
      where fs.name='RegulatoryFeatures' and fs.feature_set_id=rf.feature_set_id and
            rf.feature_type_id=ft.feature_type_id and rf.seq_region_id=sr.seq_region_id and
            sr.coord_system_id=cs.coord_system_id and cs.is_current=1
   group by rf.regulatory_feature_id"
  );

  foreach my $row ( @$d ) {
    my $r = $row->[1] .":".$row->[2] . "-" . $row->[3];
    print A join "\t", (INC_SPECIES?"$species\t":"")."Ensembl Regulatory Feature",
    $row->[0], "/$species/Regulation/Details?rf=$row->[0];r=$r;",
    $row->[0], qq($row->[4] regulatory feature.\n);
  }

## External Features...


  my $e = $conf->{'dbh'}->selectall_arrayref(
  "select ef.display_label, group_concat(distinct ft.name), count(distinct ef.external_feature_id),
          ft.description, ft.class, fs.name, sr.name, ef.seq_region_start, ef.seq_region_end,
          ef.seq_region_strand
    from  $FDB.feature_type ft, $FDB.external_feature ef, $FDB.feature_set fs, $FDB.seq_region sr,
          $FDB.coord_system cs,$FDB.status s, $FDB.status_name sn
    where ft.feature_type_id=ef.feature_type_id and fs.feature_set_id=ef.feature_set_id and
          fs.type='external' and ef.seq_region_id=sr.seq_region_id  and sr.coord_system_id=cs.coord_system_id
          and cs.is_current=1 and  s.table_name='feature_set' and s.table_id=fs.feature_set_id and
          s.status_name_id=sn.status_name_id and sn.name='MART_DISPLAYABLE'
 group by ef.display_label"
  );

  foreach my $f ( @$e) {
    my ($display_label, $f_name, $count, $f_desc, $f_class, $fs_name, $seq_region, $start, $end, $strand) = @$f;
    my ($description, $link);
    my $ids = $display_label;
    $ids .= ' '.$f_name unless $display_label eq $f_name;
    if ($count >> 1) {
     $link = "/$species/Location/Genome?ftype=RegulatoryFactor;id=$display_label;name=$f_name;";
    } else {
      # link miranda features to gene reg view other feats to region in detail
      if ( $f_class eq 'RNA'){
          my ($trans_id, $feat) = split(/:/,$display_label);
          $link = "/$species/Gene/Regulation?t=$trans_id;hid=$display_label";
      } else {
        $start = $start -=100;
        $end = $end +=100;
        my $region  =$seq_region .':' . $start . "-" . $end;
        my $renderer;
        if ($f_class =~/Search/){ $renderer = 'regulatory_regions_funcgen_search';}
        else {$renderer = 'regulatory_regions_funcgen_feature_set';}
        $link = "/$species/Location/View?r=$region;hid=$display_label;contigviewbottom=$renderer=normal";
      }
    }
     $description = "$display_label is a $f_class from $fs_name which hits the genome in $count locations\n";
    print A join "\t", (INC_SPECIES?"$species\t":""). $f_class,
    $display_label, $link, $ids, $description;
  }

  close A if scalar @$T >= 1;
}

