#!/usr/local/bin/perl

=head1 NAME

utils:Blastssaha - create list of blast and ssaha db in use

=head1 SYNOPSIS

This script outputs a list of the files that are in use as blast and ssaha database files according to the $SERVERROOT/conf/<species>.ini files.

The databases for the species given with the --nospecies argument are omitted
(as they are assumed to be those that are no longer in use)

./list_blast_ssaha [options]

Options:
  --nospecies, --source, --sort, --help, --info

Example:
  Compile a list of species that have a new gene build (need new blast databases
  only) and those that have assembly changes (new ssaha and blast db) from
  release notes/emails.

  Run this script with the species that need new blast/ssaha databases
  given in the --nospecies argument.

  The output is a list of all currently configured databases for each species
   
   ./list_blast_ssaha --nospecies Caenorhabditis_elegans --source ssaha
   ./list_blast_ssaha --nospecies Caenorhabditis_elegans --source blast

  Delete all sources that are not in use. Check the pre-site too.
  WARNING: bug  - for some reason it doesn't accept the first argument?!?!


=head1 OPTIONS

B<--nospecies>
   Optional: if no species is specified, all species will be done
   List the species THAT ARE DUE TO CHANGE IN THE RELEASE

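B<--source>
  Optional. Default is all sources.  Use this to narrow your search.

B<--sort>
  Optional. Default is "species".  Use "machine" to group the output by
  machine rather than by species.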

B<-h,--help>
  Prints a brief help message and exits.

B<-i,--info>
  Prints all the help information.

=head1 DESCRIPTION

B<This program:>

Produces a list for all species (excluding those listed in --nospecies) of 
the blast files and ssaha server configured in the ini file (SERVERROOT/conf/$species.ini)

More on --sources: Valid options are:

B<  all:> blast, ssaha

B<  blast:> Lists all the files in use for these sources:
  BLASTN_DATASOURCES
  TBLASTX_DATASOURCES
  TBLASTN_DATASOURCES
  BLASTP_DATASOURCES
  BLASTX_DATASOURCES

B<  ssaha:> Lists all the ssaha servers in use: SSAHA2_DATASOURCES


=head1 LICENCE

This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details

=head1 CONTACT

Fiona Cunningham <webmaster@sanger.ac.uk>

=cut

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use FindBin qw($Bin);
use File::Basename qw( dirname );
use vars qw( $SERVERROOT );
BEGIN {
  # The server root is the parent of the directory holding this script;
  # its conf/ directory is where SiteDefs.pm is looked up.
  $SERVERROOT = dirname($Bin);
  unshift @INC, "$SERVERROOT/conf";

  # Load the site configuration; nothing below can work without it.
  eval { require SiteDefs };
  die "Can't use SiteDefs.pm - $@\n" if $@;

  # Prepend every configured library directory to @INC.  Each entry is
  # unshifted in turn, so the last entry of the list ends up first in @INC.
  for my $lib_dir (@SiteDefs::ENSEMBL_LIB_DIRS) {
    unshift @INC, $lib_dir;
  }
}

use EnsEMBL::Web::BlastView::Tool;
# ---------------------------------------------------------------------------
# Main script body: parse the command line, work out which species and which
# configuration keys to inspect, then print the databases that are in use.
# ---------------------------------------------------------------------------

my @SPECIES;              # species given with --nospecies (due to be replaced)
my @sources;              # values given with --source / --sources
my $info;
my $help;
our $sort = "species";    # package variable: db_in_use() reads it to pick the grouping

# --sources is accepted as a synonym of --source because the POD examples
# spell the option that way.  Unknown options now stop the script with the
# usage text instead of being ignored.
GetOptions(
    'nospecies:s'      => \@SPECIES, ## species which can be deleted
    'help'             => \$help,
    'info'             => \$info,
    'sort:s'           => \$sort,
    'source|sources:s' => \@sources,
) or pod2usage(-verbose => 1, -exitval => 2);

pod2usage(-verbose => 2) if $info;
pod2usage(-verbose => 1) if $help;

# Configuration keys behind the short source names documented in the POD.
my @blast_sources = qw(BLASTN_DATASOURCES TBLASTX_DATASOURCES
                       TBLASTN_DATASOURCES BLASTP_DATASOURCES
                       BLASTX_DATASOURCES);
my @all_sources   = (@blast_sources, 'SSAHA2_DATASOURCES');
my %keys_for_alias = (
    all   => \@all_sources,
    blast => \@blast_sources,
    ssaha => ['SSAHA2_DATASOURCES'],
);

# Expand the short names (case-insensitively), pass any other value through
# untouched, drop empty values (a bare "--source") and remove duplicates.
my %seen_source;
@sources = grep { !$seen_source{$_}++ }
           map  { @{ $keys_for_alias{ lc $_ } || [$_] } }
           grep { length $_ } @sources;
@sources = @all_sources unless @sources;

my $search_db;
my $count_spp = 0;
if (@SPECIES) {
  my $input_spp = EnsEMBL::Web::BlastView::Tool::check_species(\@SPECIES);

  # Keep every configured species that is NOT in the checked --nospecies list
  my %bad_list    = map { $_ => 1 } @$input_spp;
  my @all_species = @{ EnsEMBL::Web::BlastView::Tool::all_species() };
  my @ok_species  = grep { !$bad_list{$_} } @all_species;

  $count_spp = scalar @all_species;
  $search_db = db_in_use( \@ok_species, \@sources );
}
else {
  # No exclusions requested: report on every species
  $search_db = db_in_use( EnsEMBL::Web::BlastView::Tool::all_species(), \@sources );
}
print_dbs($search_db);

# Summarise which species were left out of the listing
if (@SPECIES) {
  print scalar(@SPECIES),
        " species will be replaced with new data (total species: $count_spp):\n",
        join( "\n", @SPECIES, "\n" );
}


# db_in_use( \@species, \@sources )
#
# For every species (in sorted order) and every source key, fetches the
# configured entries via EnsEMBL::Web::BlastView::Tool::get_config and
# splits each value on ':' into a machine part and a port part.  Values
# that are exactly 'dna' or 'peptide' are skipped.
#
# Returns a reference to a two-level hash holding the port:
#   {species}{machine}   by default
#   {machine}{species}   when the package variable $sort is 'machine'
sub db_in_use {
  my ($species, $sources) = @_;
  my %in_use;

  for my $spp (sort @$species) {
    for my $source (@$sources) {
      my %setting_for = %{
        EnsEMBL::Web::BlastView::Tool::get_config(
          { species => $spp, values => $source }
        )
      };

      for my $entry (values %setting_for) {
        next if $entry eq 'dna' || $entry eq 'peptide';

        my ($machine, $port) = split /:/, $entry;

        # Pick which of the two names is the outer key of the result
        my ($outer, $inner) = $sort eq 'machine'
                            ? ($machine, $spp)
                            : ($spp, $machine);
        $in_use{$outer}{$inner} = $port;
      }
    }
  }

  return \%in_use;
}


# print_dbs( \%dbs )
#
# Prints the two-level hash built by db_in_use to the currently selected
# filehandle.  For each outer key (sorted) it prints the key on a line of
# its own, then for each inner key (sorted) the stored value (an empty
# line when the value is false) followed by the inner key, then a blank
# line.  Finishes with a reminder to keep the "pre" databases.
# Returns nothing.
sub print_dbs {
  my ($dbs_by_group) = @_;

  for my $group (sort keys %$dbs_by_group) {
    print "$group\n";

    my %port_for = %{ $dbs_by_group->{$group} };
    for my $name (sort keys %port_for) {
      my $port = $port_for{$name} ? $port_for{$name} : "";
      print "$port\n$name\n";
    }

    print "\n";
  }

  print "\nKEEP PRE DATABASES TOO\n\n";
  return;
}

1;
__END__
| ssaha07
| 70002
| Canis familiaris
| CanFam 2.0
| pre
|-
