use strict;

# ncbi assembly -> ensembl static golden path
# author: simon potter, 09/02
# based loosely on the ucsc assembly parser
# (c) ebi/grl 2001

# takes 3 file names from cmd line:
#
#  1. loc of nt contigs on chromosome (e.g. seq_contig.md)
#  2. agp - clones in nt_contigs (e.g. build_xx.agp)
#  3. ensembl raw_contigs (e.g. contig.txt)
# 1. and 2. are given with the assembly; 3. must be dumped out
# It might be a good idea to sort 1. by chromosome name, so that
# the chromosome_id and name match for chr 1 to 22
#  Additionally requires golden_path type as last option
#
# [examples at end of file]
#
# outputs 3 files:
#
# static.txt - tab delimited assembly table
# chrom.txt  - tab delimited chromosome table
# idlist.txt - list of contig names and internal IDs used

# "strategy":
#
# read nt_contig/chromosome file and store start/end/orientation
# of nt_contigs on the chromosome;
# parse list of ensembl raw contig coords and store in uberhash;
# go through agp linewise and look for matching raw contigs
#
# does some checks, e.g. that contig_id is unique

# Note: the chromosome_id in assembly and chromosome tables starts
# from 1. Care is needed if loading these tables into a database
# where another assembly is already stored.


my($nt, $agp, $raw, $gptype);

($nt, $agp, $raw, $gptype) = @ARGV or die "horribly";

my %chr_len;
my $chr_id = 1;
my %nt_contig;
my %clone;
my %used_chr_ids;
my %used_ctg_ids;

my $default_ori = 1;
my $chr_offset = 0;	# chr coordinates may be off by 1
			# but depends on the assembly version...
my $allow_pipe = 0;	# convert '|' to '_'


open NT,  "< $nt"  or die "Can't open nt_contig file $nt";
open AGP, "< $agp" or die "Can't open agp file $agp";
open CTG, "< $raw" or die "Can't open raw contig file $raw";

open SGP, "> static.txt" or die "Can't open static.txt for output";
open INF, "> idlist.txt" or die "Can't open idlist.txt for output";


while (<NT>) {
    my ($chr, $start, $end, $ori, $nt_ctg) = (split)[1, 2, 3, 4, 5];

    $chr =~ s!\|!_! unless $allow_pipe;	# "we" don't want |'s in chr name ...

    unless (defined $chr_len{$chr}) {
        $chr_len{$chr}->{'id'}    = $chr_id;
        $chr_len{$chr}->{'start'} = undef;
        $chr_len{$chr}->{'end'}   = undef;
	$chr_id++;
    };

    $chr_len{$chr}->{'start'} ||= $start;
    $chr_len{$chr}->{'end'} = $end;

    if    ($ori eq '+') { $ori = 1;  }
    elsif ($ori eq '-') { $ori = -1; }
    elsif ($ori eq '?' || $ori eq '0') { $ori = $default_ori; }
    else  { die "Bad NT contig orientation $ori"; }

    $nt_contig{$nt_ctg}->{'chr'}   = $chr;
    $nt_contig{$nt_ctg}->{'start'} = $start + $chr_offset;  # nt_ctg loc on chromosome
    $nt_contig{$nt_ctg}->{'end'}   = $end + $chr_offset;
    $nt_contig{$nt_ctg}->{'ori'}   = $ori;
}
close NT;


while (<CTG>) {
    my ($contig_id, $internal_id, $start, $length) = split;
    $contig_id =~ /([^.]+\.[^.]+)\./ || die "Bad id $contig_id";
    my $clone_id = $1;
    if (!defined $clone{$clone_id}) {
        $clone{$clone_id} = [];
    }

    push @{$clone{$clone_id}}, {
	"id"    => $contig_id,
	"iid"   => $internal_id,
	"start" => $start,
	"end"   => $start + $length - 1
    };
}
close CTG;


while (<AGP>) {
    my ($nt_ctg, $nt_start, $nt_end, $sv, $raw_start, $raw_end, $raw_ori) =
     (split)[0, 1, 2, 5, 6, 7, 8];

    next if $raw_start eq 'fragment';
    next unless $sv =~ m{^\S+\.\d+$};

    unless (($raw_end - $raw_start) == ($nt_end - $nt_start)) {
	die "Raw contig and nt contig coords don't match";
    }
    if    ($raw_ori eq '+') { $raw_ori = 1;  }
    elsif ($raw_ori eq '-') { $raw_ori = -1; }
    else  { die "Bad raw contig orientation $raw_ori"; }

    my $nt_ori = $nt_contig{$nt_ctg}->{'ori'};
    my $chr    = $nt_contig{$nt_ctg}->{'chr'};
    my ($chr_start, $chr_end);
    $chr =~ s!\|!_! unless $allow_pipe;

    if ($nt_ori == 1) {
	# forward oriented nt contig: raw contigs forward from nt contig start
	$chr_start = $nt_contig{$nt_ctg}->{'start'} + $nt_start - 1;
	$chr_end   = $nt_contig{$nt_ctg}->{'start'} + $nt_end   - 1;
    }
    else {
	# reverse oriented nt contig: raw contigs back from nt contig end
	$raw_ori *= -1;  # flip raw contig if nt contig is in reverse ori
	$chr_start = $nt_contig{$nt_ctg}->{'end'} - $nt_end   + 1;
	$chr_end   = $nt_contig{$nt_ctg}->{'end'} - $nt_start + 1;
    }

    my $seen = 0;
    unless (exists $clone{$sv}) {
	print STDERR "No clone found: $sv\n";
	next;
    }
    foreach my $raw_ctg (@{$clone{$sv}}) {
	my $start = $raw_ctg->{'start'};
	my $end   = $raw_ctg->{'end'};
	if ($raw_start >= $start && $raw_start <= $end) {
	    if ($raw_end > $end) {
		print STDERR $raw_ctg->{'id'}, " doesn't fit into ",
		 "$raw_start:$raw_end\n";
	    }
	    else {
		$seen = 1;
		$raw_start = $raw_start - $start + 1;
		$raw_end   = $raw_end   - $start + 1;
		$used_chr_ids{$chr_len{$chr}->{'id'}} = 1;

                if (defined $used_ctg_ids{$raw_ctg->{'iid'}}) {
		    print STDERR "Warning: ", $raw_ctg->{'id'},
		     " used more than once\n";
		}
		else {
		    $used_ctg_ids{$raw_ctg->{'iid'}} = 1;
		}

		print SGP join("\t",
		    $chr_len{$chr}->{'id'},
		    $chr_start,
		    $chr_end,
		    $nt_ctg,
		    $nt_start,
		    $nt_end,
		    $nt_ori,
		    $raw_ctg->{'iid'},
		    $raw_start,
		    $raw_end,
		    $raw_ori,
		    $gptype
		), "\n";

		print INF $raw_ctg->{'id'}, "\t", $raw_ctg->{'iid'}, "\n";

		# print "$chr $nt_ctg:$nt_start:$nt_end ";
		# print $raw_ctg->{'id'}, ":$raw_start:$raw_end:$raw_ori ";
		# print "$chr_start:$chr_end\n";

		last;
	    }
	}
    }
    if ($seen == 0) {
	print STDERR "Can't fit clone $sv: $raw_start - $raw_end\n";
    }
}
close AGP;


open CHR, "> chrom.txt" or die "Can't open chrom.txt for output";

foreach my $chr (keys %chr_len) {
    print CHR join("\t",
	$chr_len{$chr}->{'id'},
        $chr,
	0,
	0,
	0,
	$chr_len{$chr}->{'end'}
    ), "\n" if $used_chr_ids{$chr_len{$chr}->{'id'}};
}

close CHR;


__END__

seq_contig.md:

9606    1       0       0       +       start   -1      contig  10
9606    1       0       644775  ?       NT_021903.5     21903   contig  1
9606    1       694776  1755501 -       NT_004350.5     4350    contig  1
9606    1       1805502 2002021 -       NT_028057.1     28057   contig  1

build_xx.agp:

NT_004615.5          1       4637   1  D  AL390868.5     144054   148690  -
NT_004615.5       4638       4737   2  N       100 fragment no
NT_004615.5       4738      15035   3  D  AC021696.3      87558    97855  +
NT_004615.5      15036      15135   4  N       100 fragment no
NT_004615.5      15136      20250   5  D  AC021696.3      19497    24611  -

contig.txt (id, internal_id, offset, length):

AB000381.1.1.35863      1       1       35863
AB012723.1.1.40850      11      1       40850
AB015355.1.1.43999      24      1       43999
AB015752.1.1.116160     25      1       116160
AB016897.1.1.331211     26      1       331211
