#!/usr/local/ensembl/bin/perl
# KW   HTG; HTGS_DRAFT; HTGS_PHASE1.


# pfetch2ensembl
#
# Cared for by Simon Potter
# (C) GRL/EBI 2001
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code


=pod

=head1 NAME

pfetch2ensembl

=head1 SYNOPSIS

pfetch2ensembl -clone AB000381.1 -contigs

=head1 DESCRIPTION

Load clones into EnsEMBL database from raw sequence. Takes a clone 
accession and version and raw sequence (or retrieve using pfetch)
and splits into contigs.

=head1 OPTIONS

    -dbhost DB host
    -dbuser DB user
    -dbname DB name
    -dbpass DB pass
    -contigs file
    -clone  clone accession and version (separated by '.')

=head1 CONTACT

Simon Potter: scp@sanger.ac.uk

=head1 BUGS

Insert list of bugs here!

=cut


use strict;
use Getopt::Long;
use FileHandle;
use Bio::Root::RootI;
use Bio::Seq;
use Bio::SeqIO;
use Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Clone;
use Bio::EnsEMBL::RawContig;


my ($clone, $contig);
my $contigs;
my ($seqfile, $write);
my ($dbname, $dbhost, $dbpass, $dbuser, $dbport);
my ($help, $info);
my $verbose;
my $accver;
my $length;
my @split;
my $split_finished;
my $nosplit;


$Getopt::Long::autoabbrev = 0;   # personal preference :)
$dbuser = 'ensadmin';            # default

&GetOptions(
            "clone=s"   => \$accver,
            "dbname=s"  => \$dbname,
            "dbhost=s"  => \$dbhost,
            "dbuser=s"  => \$dbuser,
            "dbpass=s"  => \$dbpass,
            "dbport=i"  => \$dbport,
            "help"      => \$help,
            "split_fin!" => \$split_finished,
            "info"      => \$info,
            "v"         => \$verbose,
	    "contigs=s" => \$contigs,
            "write"     => \$write,
	    "nosplit!"  => \$nosplit
);

if ($help) {
    &usage;
    exit 0;
} elsif ($info) {
    exec("perldoc $0");
}

unless ($accver) {
    print STDERR "Must specify -clone\n";
    exit 1;
}

unless ($dbname && $dbuser && $dbhost) {
    print STDERR "Must specify all DB parameters\n";
    exit 1;
}


my $dbobj = Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor->new(
    '-host'   => $dbhost,
    '-user'   => $dbuser,
    '-dbname' => $dbname,
    '-pass'   => $dbpass,
    '-port'   => $dbport,
) or die "Can't connect to DB";

my $sic = $dbobj->get_StateInfoContainer;
my @analysis = $dbobj->get_AnalysisAdaptor->fetch_by_logic_name('SubmitContig');

if ($#analysis != 0) {
    print "More than one or none SubmitContig logic name. Eeek!\n";
    exit 1;
}

my ($seq, $phase);
($accver, $seq) = &pfetch($accver);

my ($acc) = $accver =~ /(\w+)\.?/;
my ($ver) = $accver =~ /\.(\d+)/;

unless ($seq) {
    print STDERR "Skip [$accver]\n";
    exit;
}

$phase = get_phase($accver);

unless (defined $phase && $phase >= 0 && $phase <= 3) {
    print "Unknown phase for $accver - setting to 0\n";
    $phase = 0;
#    exit;
}

my $dbclone;
eval {
    $dbclone = $dbobj->get_CloneAdaptor->fetch_by_accession_version($acc, $ver);
};
if ($dbclone && $write) {
    print "Have $accver already\n";
    exit;
}

my $clone = new Bio::EnsEMBL::Clone;
$clone->htg_phase($phase);
$clone->id($acc);
$clone->embl_id($acc);
$clone->version(1);
$clone->embl_version($ver);
my $now = time;
$clone->created($now);
$clone->modified($now);

print "\n\tembl_id     ", $clone->embl_id, "\n";
print "\tversion     ", $clone->version, "\n";
print "\temblversion ", $clone->embl_version, "\n";
print "\thtg_phase   ", $clone->htg_phase, "\n";

$write = 0 unless $phase;

if ($phase == 3 && ! $split_finished) {
    my $contig = new Bio::EnsEMBL::RawContig;
    $length = $seq->length;
    $contig->name("$acc.$ver.1.$length");
    $contig->embl_offset(1);
    $contig->length($length);
    $contig->seq($seq->seq);

    print "\n\tname:   ", $contig->name, "\n";
    print "\toffset: ", $contig->embl_offset, "\n";
    print "\tlength: ", $contig->length, "\n";
    print "\tend:    ", ($contig->embl_offset + $contig->length - 1), "\n";

    $clone->add_Contig($contig);
print "VAC: here1\n";
    $sic->store_input_id_analysis($contig->name,$analysis[0]) if $write;
}
else {

    if ($contigs) {
print "VAC: splitting contigs\n";
	@split = &getContigs($contigs, $accver);
    }
    elsif ($nosplit) {
	@split = [ 1, $seq->length ];
    }
    else {
	@split = &scanClone($seq->seq);
    }
    foreach my $startend (@split) {
	my $id = join '.', ($acc, $ver, $startend->[0], $startend->[1]);
	my $contig = new Bio::EnsEMBL::RawContig;
	my $offset = $startend->[0];
	my $end    = $startend->[1];
	my $length = $startend->[1] - $startend->[0] + 1;
	my $subseq = $seq->subseq($offset, $end);
	$contig->name($id);
	$contig->embl_offset($offset);
	$contig->length($length);
	# $contig->seq(new Bio::Seq(-id => $id, -seq => $subseq));
	$contig->seq($subseq);

	print "\n\tname:     ", $contig->name, "\n";
        print "\toffset: ", $contig->embl_offset, "\n";
        print "\tlength: ", $contig->length, "\n";
        print "\tend:    ", ($contig->embl_offset + $contig->length - 1), "\n";

        $clone->add_Contig($contig);
print "VAC: here 2\n";
	$sic->store_input_id_analysis($contig->name,$analysis[0]) if $write;
    }
}

print "VAC: here3\n";
$dbobj->get_CloneAdaptor->store($clone) if $write;



sub usage {
    print <<EOF
$0 [options]
Options:
  -clone  accession.version of clone to load
  -dbname )
  -dbhost ) obvious
  -dbuser )
EOF
}



=head2 scanClone

    Title   :   scanClone
    Usage   :   @contigs = $obj->scanClone($seq)
    Function:   Scans the clone sequence to find positions of contigs
		by assuming at least x n's  between contigs
    Returns :   contig positions: list of lists (start, end)
    Args    :   seq: string

=cut

sub scanClone {
  my($seq) = @_;
  my(@gaps, @contig, $start, $gap);

  # get a list of gaps - at least 70 bp
  my $pos = 0;
  while ($pos < length $seq) {
    my $unused = substr $seq, $pos;
    ($gap) = $unused =~ /(n{70,})/i;
    last unless $gap;
    $start = 1 + index $seq, $gap, $pos;
    push @gaps, [ $start, $start + length($gap) - 1 ];
    $pos = $start + length $gap;
  }

  # calc coords of contigs

  if (@gaps){
    # 1st contig before 1st gap unless the sequence starts off with a gap
    push @contig, [1, $gaps[0]->[0] - 1] unless $gaps[0]->[0] == 1;

    # contigs other than 1st and last are between gaps
    foreach my $i (0 .. $#gaps - 1) {
      push @contig, [$gaps[$i]->[1] + 1, $gaps[$i + 1]->[0] - 1];
    }

    # last contig after last gap unless the sequence ends with a gap
    push @contig, [$gaps[$#gaps]->[1] + 1, length($seq)]
     unless $gaps[$#gaps]->[1] == length($seq);
  }
  else {
    # no gaps
    push @contig, [1, length($seq)];
  }

  return sort {$a->[0] <=> $b->[0]} @contig;
}


sub pfetch {
    my ($sv) = @_;

#    open PFETCH, "pfetch -A $sv |" or die( "Can't pfetch with [$sv]\n");
    open PFETCH, "pfetch $sv |" or die( "Can't pfetch with [$sv]\n");
    my $seqio = Bio::SeqIO->new(
	-fh     => \*PFETCH,
	-format => 'fasta'
    );
    my $seq = $seqio->next_seq;
    my ($acc, $ver) = $seq->desc =~ /\b(\w+)\.(\d+)\b/;
    if ($sv !~ /\.\d+/) {
        if ($acc ne $sv) {
            print "Error input $sv not same as found $acc\n";
            return undef;
	}
	else {
	    $sv .= ".$ver";
	}
    }
    close PFETCH or die;
    if ($seq->length > 0) {
	return $sv, $seq;
    }
    else {
	return undef;
    }
}


sub get_phase {
    my ($sv) = @_;
    my ($phase1, $phase2, @contigs);

    open PFETCH, "pfetch -F $sv |" or die;

    while (<PFETCH>) {
	chomp;
	($phase1) = /^KW.+PHASE(\d)/ unless $phase1;
	$phase2 = 3 if /^ID.+HUM/;
	$phase2 = 3 if /^ID.+MUS/;
	$phase2 = 3 if /^ID.+ROD/;
	if (/CC.+?(\d+)\s+(\d+): contig/) {
            push @contigs, [$1, $2];
	}
	return undef if ($phase2 && $phase1 && $phase1 != 4);
	last if /^SQ/;
    }
    close PFETCH;
    $phase = $phase1 || $phase2 || undef;
}

sub getContigs {
    my($file, $sv) = @_;
    my(@contigs);

    open CONTIG, "< $file" or die "Can't open contigs file $file";
    while (<CONTIG>) {
        chomp;
        my ($clone, $start, $end) = split;
print "VAC: here now $clone $sv\n";
        next unless $clone eq $sv;
print "VAC: stillhere now\n";
        unless ($start && $end && $start <= $end) {
            die "Illegal line in contig file: $_\n";
        }
        push @contigs, [ $start, $end ];
    }
    close CONTIG;

    return @contigs;
}
