#!/usr/local/bin/perl

use strict;
use warnings;

package do_mysql_dump;

use FindBin qw($Bin);
use Cwd;
use File::Basename;
use Time::localtime;
use Getopt::Long;
use Pod::Usage;
use DBI;

# --- load libraries needed for reading config ---
# $SERVERROOT is a package variable (declared with `our` rather than the
# obsolete `use vars` pragma) so that it can be assigned inside the BEGIN
# block below and still be read by the rest of the script.
our $SERVERROOT;
BEGIN {
  # The server root is the parent of the directory that holds this script.
  $SERVERROOT = dirname( $Bin );
  unshift @INC, "$SERVERROOT/conf";
  unshift @INC, $SERVERROOT;

  # Test the result of the eval itself rather than $@ alone; the trailing 1
  # guarantees a true value when the require succeeds.
  eval { require SiteDefs; 1 } or die "Can't use SiteDefs.pm - $@\n";

  # Each directory is unshifted in turn, so the last entry of
  # @SiteDefs::ENSEMBL_LIB_DIRS ends up first in @INC.
  unshift @INC, $_ for @SiteDefs::ENSEMBL_LIB_DIRS;
}

use utils::Tool;

our $MYSQL_BIN = "/usr/local/ensembl/mysql/bin";   # this should be a cluster-wide CDSL
our $basedir = "/dumps/";

# Command-line settings; every one of these is filled in by GetOptions below.
my ($dumpdir, $help, $info, $logfile, $email, $start_with, $end_with);
my ($no_data, $no_zip, $no_log);
my @SPECIES ;
my @dbs;

# --species and --database may be repeated; their values accumulate in the
# arrays.  An unparsable command line prints the usage message and exits 2.
GetOptions(
	    'species:s'      => \@SPECIES,
	    'database:s'     => \@dbs,
	    'logfile:s'      => \$logfile,
	    'no_log'         => \$no_log,
	    'start_with:s'   => \$start_with,
	    'end_with:s'     => \$end_with,
	    'email:s'        => \$email,
	    'dumpdir:s'      => \$dumpdir,
	    'help'           => \$help,
	    'info'           => \$info,
	    'no_data'        => \$no_data,
	    'no_zip'         => \$no_zip,
	   ) or pod2usage(2);

pod2usage(-verbose => 2) if $info;

# Asking for help is not an error, so exit with status 0.  The previous
# pod2usage(1) exited with status 1, which made the exit(0) that followed it
# unreachable.
pod2usage(-exitval => 0, -verbose => 1) if $help;


# Load modules needed for reading config -------------------------------------
require EnsEMBL::Web::SpeciesDefs; 
my $species_defs = EnsEMBL::Web::SpeciesDefs->new();
unless ( $species_defs ) {
  pod2usage("$0: SpeciesDefs config not found");
}


# Sort out dumpdir ----------------------------------------------------------
# When --dumpdir is not given, dump below $basedir in a directory named after
# the site release.  (should be cluster CDSL "/mysqld/current/var")
my $sitedefs_release =  $SiteDefs::ENSEMBL_VERSION;
unless ( $dumpdir ) {
  $dumpdir = $basedir."release-$sitedefs_release";
}
pod2usage("[*DIE] Must provide the full path to dumpdir: $dumpdir" )
  unless $dumpdir =~ /^\//;

# Check species -------------------------------------------------------------
# Species named on the command line are passed through check_species;
# otherwise every species known to utils::Tool is used.
my $chosen_species = @SPECIES
  ? utils::Tool::check_species(\@SPECIES)
  : utils::Tool::all_species();
@SPECIES = @{ $chosen_species };

# Optionally narrow the list with --start_with / --end_with.
if ( $start_with ) {
  @SPECIES = @{ utils::Tool::start_with_species($start_with, \@SPECIES) };
}
if ( $end_with ) {
  @SPECIES = @{ utils::Tool::end_with_species($end_with, \@SPECIES) };
}


# Log file -------------------------------------------------------------------
# Start of the run; used at the end of the script to report the time taken.
my $time1 = time;

unless ($no_log) {
  # Timestamp with every run of whitespace replaced by a dot, e.g.
  # "Thu.Sep..." - used to build the default log file name.
  (my $stamp = gmtime(time)) =~ s/\s+/\./g;
  $logfile ||= "logs/mysql"."_$stamp.log";
  $logfile = $SERVERROOT."/$logfile";
  print STDERR "Using logfile $logfile\n";

  # Open the log on a separate handle first: if the file cannot be created
  # the error is still reported on the original STDERR.  Three-argument open
  # keeps odd characters in the file name from being read as an open mode.
  open( my $log_fh, '>', $logfile )
    or die "Can't create file $logfile: $!\n";
  open( STDERR, '>&', $log_fh )
    or die "Can't redirect STDERR to $logfile: $!\n";
  close $log_fh;   # STDERR now holds its own duplicate of the descriptor
}

utils::Tool::info(1, "Dumping data to $dumpdir" );


# Web user db conf -----------------------------------------------------------
# Connection details for the web user database.  Each accessor is called in
# scalar context through a plain assignment: inside a hash constructor the
# calls would be in list context, and an accessor returning an empty list
# would silently shift every following key/value pair.
# NOTE(review): assumes the SpeciesDefs accessors return a single value.
my %web_userdb = ( DRIVER => 'mysql' );
$web_userdb{USER} = $species_defs->ENSEMBL_USERDB_USER;
$web_userdb{PASS} = $species_defs->ENSEMBL_USERDB_PASS;
$web_userdb{HOST} = $species_defs->DATABASE_USERDB_HOST;
$web_userdb{PORT} = $species_defs->DATABASE_USERDB_PORT;
$web_userdb{NAME} = $species_defs->ENSEMBL_USERDB_NAME;


# Hard-code all databases _never_ to dump, whatever the species.
my @never_dump = qw( ENSEMBL_BLAST
                     DATABASE_COMPARA_MULTIPLE
                     ENSEMBL_BLAST_LOG
                     DATABASE_FASTA
                     DATABASE_HEALTHCHECK
                     ENSEMBL_GLOVAR );

foreach my $sp ( @SPECIES ) {
  my %kill_list = map { $_ => 1 } @never_dump;

  # Work out dumpdir ---------------------------------------------------------
  # $sp_dumpdirs[0] is the species (or multi-species) directory; a mart
  # directory is appended as $sp_dumpdirs[1] once a mart database is seen.
  my @sp_dumpdirs;
  if ( $sp eq 'Multi' ) {
    utils::Tool::info(1, "Using dumpdir $dumpdir");
    @sp_dumpdirs = ( "$dumpdir/multi_species_$sitedefs_release/data/mysql" );
  }
  else {
    my $sp_release = utils::Tool::get_config( { species=>$sp, values => "SPECIES_RELEASE_VERSION" }) || "";
    $sp_release =~ s/\.//g;

    @sp_dumpdirs = ( join "_", "$dumpdir/".lc($sp), $sitedefs_release, "$sp_release/data/mysql" );

    # The website and web user databases are only dumped for 'Multi'.
    $kill_list{"DATABASE_WEBSITE"}    = 1;
    $kill_list{"ensembl_web_user_db"} = 1;
  }

  # Work out db ------------------------------------------------------------
  my %all_db_info = %{ $species_defs->get_config($sp,'databases') || {} };
  $all_db_info{"ensembl_web_user_db"} = {%web_userdb};

  # Use a per-species list.  The --database selection in @dbs must survive
  # for every species; it used to be emptied after the first one, so all
  # later species dumped every database.
  my @sp_dbs = @dbs ? @dbs : sort keys %all_db_info;
  if ( $sp eq 'Multi' ) {
    # Make sure the web user db is dumped once, not twice.
    push @sp_dbs, 'ensembl_web_user_db'
      unless grep { $_ eq 'ensembl_web_user_db' } @sp_dbs;
  }

  my $write_user = $species_defs->get_config($sp, 'DATABASE_WRITE_USER');
  my $write_pass = $species_defs->get_config($sp, 'DATABASE_WRITE_PASS');

  foreach my $db ( @sp_dbs ) {
    if ( $kill_list{$db} ) {
      utils::Tool::info( 1, "Skipping $db (kill list)" );
      next;
    }
    unless ( $all_db_info{$db} ) {
      utils::Tool::warning( 1, "$db is not a valid DB for $sp; select from:\n ".
	     join( "\n       ", sort keys %all_db_info ) );
      next;
    }

    # create_dumpdir only looks at the second entry for mart databases, so
    # the mart directory needs registering just once per species.
    if ( $db =~ /_MART_/ and @sp_dumpdirs < 2 ) {
      push @sp_dumpdirs, "$dumpdir/mart_$sitedefs_release/data/mysql";
    }

    unless ( $all_db_info{$db}{NAME} ) {
      utils::Tool::warning( 1, "DB $db has no NAME, skipping" );
      next;
    }

    # Work on a copy so the write credentials and dump settings do not leak
    # back into the configuration held by $species_defs.
    my %db_meta = %{ $all_db_info{$db} };
    $db_meta{USER} = $write_user if $write_user;
    $db_meta{PASS} = $write_pass if $write_pass;

    # Never hand dump_mysql an empty directory: it would fall back to the
    # current working directory.
    $db_meta{dumpdir} = create_dumpdir( $db_meta{NAME},
                                        \@sp_dumpdirs,
                                        $sitedefs_release );
    unless ( $db_meta{dumpdir} ) {
      utils::Tool::warning( 1, "No dump directory for $db, skipping" );
      next;
    }

    # The web user db is always dumped as schema only.
    if ( $db eq 'ensembl_web_user_db' or $no_data ) {
      $db_meta{NODATA} = " --nodata";
    }

    utils::Tool::info( 2, "Dumping $sp $db: ".$db_meta{NAME} );
    dump_mysql( \%db_meta, $no_zip );
  }
  utils::Tool::info( 1, "Dumps for $sp completed" );
}
utils::Tool::info( 1, "All dumps completed" );


# Work out timings ---------------------------------------------------------
# Derive hours and minutes arithmetically from the elapsed seconds.  Feeding
# a duration to localtime() made the result depend on the local timezone and
# wrapped round after 24 hours.
my $time_taken = time - $time1;
my $hours      = int( $time_taken / 3600 );
my $minutes    = int( ( $time_taken % 3600 ) / 60 );

# A log file only exists when logging was not switched off with --no_log,
# even if --logfile was given.
my $log_written = ( $logfile and not $no_log );

utils::Tool::info (2, "Used $logfile.") if $log_written;
utils::Tool::info (2, " Time taken: $hours:".$minutes."mins");

$email ||= 'ssg-ensembl@sanger.ac.uk';
utils::Tool::mail_log( $logfile, $email, "" ) if $log_written;
close STDERR;
exit;


##############################################################################
# Dump one database into its dump directory.
#
# Arguments:
#   $db_meta - hashref; NAME is required, and dumpdir, USER, PASS, HOST,
#              PORT and NODATA are used when set
#   $no_zip  - true to leave the dumped files uncompressed
#
# Steps: change into the dump directory, dump the tables with mysqldump -T,
# remove the per-table .sql files, dump the schema to NAME.sql (gzipped
# unless $no_zip), gzip or split the .txt data files, then write a gzipped
# CHECKSUMS file covering the *.gz files.
#
# Returns 1.  Dies if NAME is missing, if the directory cannot be entered or
# read, or if mysqldump fails; compression and checksum problems are only
# reported as warnings.
#
# Side effects: changes the working directory of the process, turns on
# autoflush for the selected output handle and sets $ENV{CMD_ENV}.
sub dump_mysql {
  my $db_meta = shift;
  my $no_zip  = shift;
  $| = 1;
  $ENV{'CMD_ENV'} = "xpg4";

  my $DB      = $db_meta->{NAME} or die ("Need a db to dump!");
  my $dumpdir = $db_meta->{dumpdir} || '.';
  chdir( $dumpdir ) or die( "Cannot chdir to $dumpdir: $!" );

  # Build the connection options as a list so mysqldump can be run without
  # a shell; a password with shell metacharacters is then harmless.
  # @loggable mirrors @options with the password masked, for error messages
  # that end up in the (emailed) log.
  my $mysqldump = "$MYSQL_BIN/mysqldump";
  my ( @options, @loggable );
  foreach my $spec ( [ USER => '-u' ], [ PASS => '-p' ],
                     [ HOST => '-h' ], [ PORT => '-P' ] ) {
    my ( $key, $switch ) = @{ $spec };
    my $value = $db_meta->{$key} or next;
    push @options,  $switch . $value;
    push @loggable, $switch . ( $key eq 'PASS' ? '********' : $value );
  }
  if ( $db_meta->{NODATA} ) {
    push @options,  '--no-data';
    push @loggable, '--no-data';
  }

  ### NOTE #################################################################
  # If you get a funny error message saying permission is denied etc.
  # "Got error: 1045: Access denied for user: 'ensadmin@%' (Using password: YES) when executing 'SELECT INTO OUTFILE'"
  # Note that you can only dump on the localhost machine (e.g. port 3307
  # if you are on ecs3d) if you use -T switch
  # The port number can't have a ";" after it so check errors for this
  ###########################################################################

  # Table dump: -T writes one .sql and one .txt file per table.
  my $failure = _command_failure( $mysqldump, @options, '-T', '.', $DB );
  die( "Cannot $mysqldump @loggable -T . $DB: $failure" ) if $failure;

  # The per-table .sql files are replaced by a single schema dump below.
  opendir( my $dir, '.' ) or die( "Cannot read directory $dumpdir: $!" );
  my @table_sql = grep { /\.sql$/ } readdir $dir;
  closedir $dir;
  foreach my $sql_file ( @table_sql ) {
    unlink $sql_file
      or utils::Tool::warning( 1, "Cannot remove $sql_file: $!" );
  }

  # Schema dump.  DON'T USE THE -T OPTION HERE COS IT STOPS IT WORKING.
  # Writing through --result-file instead of a "| gzip" pipeline means a
  # mysqldump failure is noticed.  The schema is written with --no_zip too;
  # previously that mode deleted the per-table .sql files and produced no
  # schema at all.
  my $schema_file = "$DB.sql";
  $failure = _command_failure( $mysqldump, @options, '-d',
                               "--result-file=$schema_file", $DB );
  die( "Cannot $mysqldump @loggable -d $DB: $failure" ) if $failure;

  unless ( $no_zip ) {
    $failure = _command_failure( 'gzip', '-9', '-n', '-f', $schema_file );
    die( "Cannot gzip $schema_file: $failure" ) if $failure;

    # Rename ensembl_web_user_db so they include the release number
    # (rename_files only knows about the gzipped schema).
    if ( $dumpdir =~ /ensembl_web_user_db|ensembl_website/ ){
      rename_files( $DB, $dumpdir );
    }
  }

  # Divide files up ------------------------------------------
  foreach my $f ( glob '*.txt' ) {
    my $file_size = ( -s $f ) || 0;

    # Split the file if it is larger than 3.5 GB (size before gzip)
    if ( $file_size > 3500000000 ) {
      split_data( $DB, $f );
      next;
    }

    # Otherwise just zip it
    next if $no_zip;
    utils::Tool::info( 1, "Starting gzip for $f" );

    # gzip replaces $f with $f.gz and keeps $f if it fails; the result is
    # then given the .table.gz suffix used for data files.
    $failure = _command_failure( 'gzip', '-9', '-n', '-f', $f );
    if ( $failure ) {
      utils::Tool::warning( 1, "gzip failed for $DB.$f: $failure" );
      next;
    }
    rename( "$f.gz", "$f.table.gz" )
      or utils::Tool::warning( 1, "Cannot rename $f.gz to $f.table.gz: $!" );
    utils::Tool::info( 1, "finished gzip for $DB.$f" );
  }

  # Checksums of every compressed file, best effort.
  my @archives = glob '*.gz';
  if ( @archives ) {
    if ( open( my $checksums, '>>', 'CHECKSUMS' ) ) {
      foreach my $archive ( @archives ) {
        if ( open( my $sum, '-|', '/bin/sum', $archive ) ) {
          print {$checksums} <$sum>;
          close $sum
            or utils::Tool::warning( 1, "/bin/sum failed for $archive" );
        }
        else {
          utils::Tool::warning( 1, "Cannot run /bin/sum on $archive: $!" );
        }
      }
      close $checksums
        or utils::Tool::warning( 1, "Cannot finish writing CHECKSUMS: $!" );
      $failure = _command_failure( 'gzip', '-9', 'CHECKSUMS' );
      utils::Tool::warning( 1, "Cannot gzip CHECKSUMS: $failure" ) if $failure;
    }
    else {
      utils::Tool::warning( 1, "Cannot write CHECKSUMS in $dumpdir: $!" );
    }
  }

  return 1;
}

# Run a command without going through a shell.  Returns '' on success, or a
# short description of the failure taken from $? (and $! when the command
# could not be started at all).
sub _command_failure {
  my @command = @_;
  return ''                                   if system( @command ) == 0;
  return "failed to start: $!"                if $? == -1;
  return 'killed by signal ' . ( $? & 127 )   if $? & 127;
  return 'exit status ' . ( $? >> 8 );
}

#-----------------------------------------------------------------------------
# Split a large table data file into gzipped chunks.
#
# Arguments:
#   $DB        - database name, used in log messages only
#   $file      - data file to split, read line by line
#   $max_bytes - optional chunk size threshold in bytes; defaults to
#                3500000000
#
# Chunks are written as <name>.<n>.txt.table (n counting from 0, <name> being
# $file without its trailing ".txt") and compressed with gzip -9.  A chunk is
# closed after the line that takes it over the threshold, so lines are never
# cut in half.  $file is deleted once every chunk has been written.
#
# Dies if a file cannot be opened, written or closed; a failing gzip is only
# warned about, since the uncompressed chunk is still intact.
sub split_data {
  my ($DB, $file, $max_bytes) = @_;
  $max_bytes ||= 3500000000;

  # Anchor the pattern so only the extension is stripped, not a ".txt"
  # somewhere in the middle of the name.
  ( my $name = $file ) =~ s/\.txt$//;

  open( my $in, '<', $file ) or die "Can't open infile $file: $!\n";

  # Core dna file compression ~66%, RefSNP table = 75%
  # Feature dna compression = 88 %
  my $chunk = 0;
  my $bytes = 0;
  my ( $out, $chunk_file );

  # Opens the next chunk file for writing.
  my $start_chunk = sub {
    $chunk_file = "$name.$chunk.txt.table";
    open( $out, '>', $chunk_file )
      or die "Can't create $chunk_file: $!\n";
    utils::Tool::info( 1, "Creating $DB $chunk_file" );
  };

  # Closes the chunk being written and compresses it.  The close comes
  # first: gzipping a file that still has buffered output pending would
  # compress a truncated copy.
  my $finish_chunk = sub {
    return unless $out;
    close( $out ) or die "Can't finish writing $chunk_file: $!\n";
    undef $out;
    system( 'gzip', '-9', $chunk_file ) == 0
      or utils::Tool::warning( 1, "gzip failed for $chunk_file (status $?)" );
    $chunk++;
    $bytes = 0;
  };

  while ( my $line = <$in> ) {
    # Chunks are opened on demand so no empty trailing chunk is produced
    # when the threshold is crossed by the last line of the input.
    $start_chunk->() unless $out;
    print {$out} $line or die "Can't write to $chunk_file: $!\n";
    $bytes += length $line;
    $finish_chunk->() if $bytes > $max_bytes;
  }

  # An empty input still produces chunk 0, as it always has.
  $start_chunk->() if $chunk == 0 and not $out;
  $finish_chunk->();

  close( $in );
  unlink $file;
}

#----------------------------------------------------------------------------
# Work out, empty and (re)create the dump directory for one database.
#
# Arguments:
#   $db_name     - database name
#   $sp_dumpdirs - arrayref: [0] is the species directory, [1] (optional)
#                  the mart directory
#   $release     - release number used to suffix the help, website and web
#                  user db directories; falls back to the file-level
#                  $sitedefs_release when not given
#
# Returns the path of the directory.  Dies if no species directory was given
# or if an existing directory cannot be removed.
#
# NOTE: when the directory turns out not to be usable this sub leaves via
# `next`, i.e. it moves the caller's per-database loop on to its next
# database.  It must therefore only be called from inside that loop.
sub create_dumpdir {

  my ( $db_name, $sp_dumpdirs, $release ) = @_;
  $release = $sitedefs_release unless defined $release;

  # Without a base directory the path built below would sit directly under
  # "/" and then be passed to rm -Rf.
  my $species_dir = $sp_dumpdirs->[0];
  ( defined($species_dir) and length($species_dir) )
    or die "No base dump directory given for $db_name";

  my $db_dumpdir;
  if ($db_name =~ /ensembl_help/) {
    $db_dumpdir = "$species_dir/ensembl_help"."_$release";
  }
  elsif ($db_name =~ /ensembl_website|ensembl_web_user_db/) {
    $db_dumpdir = "$species_dir/$db_name"."_$release";
  }
  else {
    # Mart databases go to the mart directory when one was registered;
    # otherwise fall back to the species directory rather than building a
    # path from an undefined value.
    my $mart_dir = $sp_dumpdirs->[1];
    my $parent   = ( $db_name =~ /_mart_/ and defined $mart_dir )
                 ? $mart_dir
                 : $species_dir;
    $db_dumpdir = "$parent/$db_name";
  }

  if( -e $db_dumpdir ){
    utils::Tool::info( 1, "Removing existing copy of $db_dumpdir" );
    # List form: the path is never interpreted by a shell.
    system( 'rm', '-Rf', $db_dumpdir ) == 0
      or die "Couldn't delete $db_dumpdir";
  }

  utils::Tool::check_dir( $db_dumpdir );

  # World-writable - presumably so the MySQL server can write the -T dump
  # files into it; TODO confirm.
  chmod 0777, $db_dumpdir;

  unless ( -d $db_dumpdir && -w $db_dumpdir ){
    utils::Tool::warning( 1, "$db_dumpdir is not a directory or is not writable" );
    # Skip this database whatever warning() returned (see NOTE above).
    no warnings 'exiting';
    next;
  }
  return $db_dumpdir;
}
#------------------------------------------------------------------------------
# Add the release number to the name of a gzipped schema dump:
# <dir>/<db>.sql.gz becomes <dir>/<db>_<release>.sql.gz, where <release> is
# the file-level $sitedefs_release.
#
# Arguments:
#   $db_name    - database name (base name of the dump file)
#   $sp_dumpdir - directory holding the dump file
#
# Returns nothing.  A failed rename is reported as a warning.
sub rename_files {
  my $db_name    = shift;
  my $sp_dumpdir = shift;

  my $new_db = "$db_name"."_$sitedefs_release";
  my $old_path = "$sp_dumpdir/$db_name.sql.gz";
  my $new_path = "$sp_dumpdir/$new_db.sql.gz";

  # Both paths are in the same directory, so the builtin rename is enough;
  # no shell is involved and a failure is no longer silent.
  rename( $old_path, $new_path )
    or utils::Tool::warning( 1, "Cannot rename $old_path to $new_path: $!" );
  return;
}


1;

__END__

=head1 NAME

do_mysql_dump - Dump Ensembl databases to flatfiles

=head1 SYNOPSIS

do_mysql_dump [options]

Options:
  --help, --info, --species, --database, --logfile --dumpdir
  --no_data  --no_zip --email --no_log --start_with --end_with

=head1 OPTIONS

B<-h,--help>
  Prints a brief help message and exits.

B<-i,--info>
  Prints man page and exits.

B<--species>
  One or more species to dump.

B<--database>
  One or more databases to dump (DATABASE_CORE, DATABASE_EST etc)
  All DBs in config (not in kill list) will be dumped if omitted.

B<--dumpdir>
  Specifies directory to dump into (default /dumps/release-<release_number>).
  The directory must be the full path (i.e. start with "/")

B<--no_log>
  Do not redirect standard error (STDERR) to a log file. Default is to use a log file.

B<--logfile>
  Choose the name of your logfile, relative to the server root. The default is "logs/mysql_<timestamp>.log".

B<--email>
   Sends an email to this address if you use a log file

B<--no_data>
   Only dump the sql files.  No data

B<--no_zip>
   Default is to gzip all files.  This option turns it off.

B<--start_with>
  Optional: give it a species name and it will skip all species before this in the alphabet.

B<--end_with>
  Optional: give it a species name and it will skip all species after this in the alphabet.


=head1 DESCRIPTION

B<This program:>

Dumps Ensembl databases to flatfiles.  

Maintained by Ensembl web team <ensembl-webteam@ensembl.org>

=cut

