use strict;

# finished chromosome AGPs -> ensembl assembly/chromosome tables
# author: simon potter, 04/03
# (c) ebi/grl 2003

# takes inputs:
# dump of ensembl raw contig names/internal IDs ('e_ctg.dat')
# AGP files in $agp_dir/human_*.agp

# outputs 3 files:
#
# static.dat - tab delimited assembly table
# chrom.dat  - tab delimited chromosome table
# idlist.txt - list of contig names and internal IDs used

# "strategy":
#
# parse list of ensembl raw contig coords and store in uberhash;
# go through AGP linewise and look for matching raw contigs
#
# does _some_ checks:
#   each raw contig used no more than once
#   no gaps in AGP
#   no unfinished sequence

# Note: the chromosome_id in assembly and chromosome tables starts
# from 1. Care is needed if loading these tables into a database
# where another assembly is already stored.


my (@agps);

my $gptype = "NCBI35";
my $raw = "e_ctg.dat";
my $agp_dir = "./fin_agp";

my %clone;
my %used_ctg_ids;

my @chrs = qw/
     chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8  chr9 chr10
    chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20
     chrX  chrY DR52 DR53 PAR chrMT
/;
my @chr_info;
my $chr_id = 1;


open CTG, "< $raw"       or die "Can't open raw contig file $raw";
open ASS, "> static.dat" or die "Can't open static.dat for output";
open INF, "> idlist.txt" or die "Can't open idlist.txt for output";
open CHR, "> chr.dat"    or die "Can't open chr.dat for output";


while (<CTG>) {
    my ($name, $internal_id) = split;
    my ($sv, $start, $end) = $name =~ /(.+)\.(\d+)\.(\d+)/;

    die "Bad contig $name" unless $sv && $start && $end;

    $clone{$sv} ||= [];
    push @{$clone{$sv}}, {
	name  => $name,
	id    => $internal_id,
	start => $start,
	end   => $end
    };
}

close CTG;


foreach my $chr (@chrs) {

    my $agp = "$agp_dir/$chr.agp";
    my $global_end;
    my $prev_end;

    unless (-f $agp) {
	print "Not found AGP for chr $chr\n";
	next;
    }

    open AGP, "< $agp" or die "Canna open AGP file $agp";
    while (<AGP>) {
        next if /^#/;
	my ($chr_name, $chr_start, $chr_end, $status) = (split)[0, 1, 2, 4];

	die "Unexpected gap: $_" if ($prev_end && $prev_end + 1 != $chr_start);
	$prev_end = $chr_end;

	next if $status eq "N";

	next unless $status eq "F";
	if ($chr_name ne "$chr"){
	  print "Bad chr name in $agp: $chr_name\n";
	  next;
        }

	$global_end = $chr_end;

        my ($sv, $clone_start, $clone_end, $clone_ori) = (split)[5, 6, 7, 8];

        die "Invalid SV: $_" unless $sv =~ m{^\w+\.\d+$};
        unless (exists $clone{$sv}) {
	    print STDERR "No clone found: $sv\n";
	    next;
        }

	die "Raw contig and chr coords don't match: $_"
         unless ($clone_end - $clone_start) == ($chr_end - $chr_start);

        if    ( $clone_ori eq '+') { $clone_ori = 1;  }
        elsif ( $clone_ori eq '-') { $clone_ori = -1; }
        else  { die "Bad clone orientation $clone_ori: $_"; }

        my $matched = 0;

        foreach my $raw_ctg (@{$clone{$sv}}) {
	    my $start = $raw_ctg->{'start'};
	    my $end   = $raw_ctg->{'end'};

	    # skip if this raw contig outside clone
	    next if ($clone_start < $start) || ($clone_start > $end);

	    if ($end < $clone_end) {
		print STDERR $raw_ctg->{'name'}, " too short to fit into ",
		 "$clone_start:$clone_end\n";
		next;
	    }

	    $matched = 1;

	    my $ctg_start = $clone_start - $start + 1;
	    my $ctg_end   = $clone_end   - $start + 1;

            if (defined $used_ctg_ids{$raw_ctg->{'id'}}) {
		print STDERR "Warning: ", $raw_ctg->{'name'},
		 " used more than once\n";
	    }
	    else {
		$used_ctg_ids{$raw_ctg->{'id'}} = 1;
	    }

	    print ASS join("\t",
		$chr_id,
		$chr_start,
		$chr_end,
		$sv,
		$clone_start,
		$clone_end,
		1,
		# $nt_ctg,
		# $nt_start,
		# $nt_end,
		# $nt_ori,
		$raw_ctg->{'id'},
		$ctg_start,
		$ctg_end,
		$clone_ori,
		$gptype
	    ), "\n";

	    print INF $raw_ctg->{'name'}, "\t", $raw_ctg->{'id'}, "\n";
        }
        if ($matched == 0) {
	    print STDERR "Can't fit clone $sv: $clone_start - $clone_end\n";
        }
    }
    close AGP;

    push @chr_info, [ $chr_id, $chr, $global_end ];
    $chr_id++;
}


foreach my $c (@chr_info) {
    print CHR join("\t",
	$c->[0],
	$c->[1],
	0,
	0,
	0,
	$c->[2],
    ), "\n";
}

close CHR;


__END__

human-*.agp:

Human-chr1	1	50000	1	N	50000
Human-chr1	50001	50616	2	F	AP006221.1	36116	36731 -
Human-chr1	50617	217280	3	F	AL627309.15	241	166904 +


ctg.dat (name, contig_id):

AB000381.1.1.35863      1
AB012723.1.1.40850      11
AB015355.1.1.43999      24
AB015752.1.1.116160     25
AB016897.1.1.331211     26

