#!/usr/bin/perl
# Usage: abyss2afg reads.fa contigs.fa alignments.kalign >out.afg
# Written by Shaun Jackman <sjackman@bcgsc.ca>.
use strict;
use Getopt::Long;
use Pod::Usage;
use List::Util qw'min max';

my $args = join ' ', @ARGV;

sub version {
	print <<EOF;
abyss2afg (ABySS)
Written by Shaun Jackman.

Copyright 2009 Canada's Michael Smith Genome Science Centre
EOF
	exit;
}

my ($opt_mean, $opt_sd);
Getopt::Long::Configure(qw'bundling');
GetOptions(
	'mean|m=i' => \$opt_mean,
   	'sd|s=i' => \$opt_sd,
   	'help' => sub { pod2usage(-verbose => 1) },
   	'man' => sub { pod2usage(-verbose => 2) },
   	'version' => \&version);

for (@ARGV) { die "cannot read `$_'" unless $_ eq '-' || -r }

print <<EOF;
{UNV
eid:afg
com:
generated by abyss2afg $args
.
}
EOF

print <<EOF
{LIB
eid:1
iid:1
{DST
mea:$opt_mean
std:$opt_sd
}
}
EOF
if defined $opt_mean && defined $opt_sd;


sub find_mate($)
{
	my $id = shift;
	return $id =~ s%/1$%/2% || $id =~ s%/2$%/1%
		|| $id =~ s/_A$/_B/ || $id =~ s/_B$/_A/
		|| $id =~ s/_F$/_R/ || $id =~ s/_R$/_F/
		|| $id =~ s/_forward$/_reverse/
		|| $id =~ s/_reverse$/_forward/
		? $id : undef;
}

my ($red_iid, $frg_iid, %reds, %frgs, $id, %seq, %tle);

sub new_read($$)
{
	my ($id, $seq) = @_;
	die "error: duplicate sequence ID `$id'" if exists $seq{$id};
	$seq{$id} = $seq;

	my $mate_id = find_mate($id);
	return unless defined $mate_id;

	my $qlength = length $seq;
	my $qlt = 'K' x $qlength;
	$frgs{$id} = $frgs{$mate_id} = ++$frg_iid
		unless exists $frgs{$id};
	$reds{$id} = ++$red_iid;
	print "{RED\nclr:0,$qlength\niid:$red_iid\neid:$id\n",
		"frg:$frgs{$id}\n",
		"seq:\n$seq\n.\nqlt:\n$qlt\n.\n}\n";

	if(exists $seq{$mate_id} && exists $seq{$id}) {
		my ($iid0, $iid1) = ($reds{$mate_id}, $reds{$id});
		print "{FRG\nrds:$iid0,$iid1\nlib:1\n",
			"eid:$mate_id\niid:$frg_iid\ntyp:I\n}\n";
	}
}

while (<>) {
	chomp;
	next if /^#/ || /^@/;

	if (/^>/) {
		($id) = split ' ', substr $_, 1;
		next;
	}
	if (/^[acgtnACGTN]/) {
		new_read($id, $_);
		undef $id;
		next;
	}

	my ($query, @align) = split '\t';
	next if @align == 0;

	my ($qid, $qseq) = split ' ', $query;
	if (defined $qseq) {
		new_read($qid, $qseq);
	} else {
		die "no read `$qid'" unless exists $seq{$qid};
		$qseq = $seq{$qid};
	}

	@align = sort {
		# Sort the alignments first by position on the read and
		# then by the length of the alignment.
		my (undef, undef, $a_qstart, $a_alength) = split ' ', $a;
		my (undef, undef, $b_qstart, $b_alength) = split ' ', $b;
		$a_qstart <=> $b_qstart || $b_alength <=> $a_alength;
	} @align;

	my $best_start = 0;
	my $best_end = 0;
	@align = grep {
		my (undef, undef, $qstart, $alength) = split;
		my $qend = $qstart + $alength;
		if ($qstart > $best_start && $qend <= $best_end
				|| $qstart >= $best_start && $qend < $best_end) {
			# This alignment is entirely covered by another
			# alignment, so this alignment is inferior.
			0; # Discard this alignment.
		} else {
			die unless $qstart >= $best_start;
			die unless $qend >= $best_end;
			$best_start = $qstart;
			$best_end = $qend;
			1; # Keep this alignment.
		}
	} @align;
	my $nalign = @align;

	for (@align) {
		my ($tid, $tstart, $qstart, $alength, $qlength, $sense)
			= split;
		die if length $qseq != $qlength;

		die "no contig `$tid'" unless exists $seq{$tid};
		my $tlength = length $seq{$tid};

		my $qend = $qstart + $alength;
		if ($sense) {
			# Reverse coordinates.
			($qstart, $qend)
				= ($qlength - $qend, $qlength - $qstart);
		}

		my $astart = $tstart - $qstart;
		my $off = max $astart, 0;
		my $qastart = max 0, 0 - $astart;
		my $qaend = min $qlength, $tlength - $astart;

		if ($sense) {
			# Reverse coordinates.
			($qastart, $qaend)
				= ($qlength - $qaend, $qlength - $qastart);
		}
		my $clr = $sense ? "$qaend,$qastart" : "$qastart,$qaend";

		die unless exists $reds{$qid};
		my $red_iid = $reds{$qid};
		push @{$tle{$tid}},
			"{TLE\nclr:$clr\noff:$off\nsrc:$red_iid\n}\n";
	}
}

my $ctg_iid;
for my $ctg_eid (sort {$a<=>$b} keys %tle) {
	my $seq = $seq{$ctg_eid};

	# Split long lines.
	my $qlt = 'K' x (length $seq);
	$seq =~ s/.{70}/$&\n/sg;
	$qlt =~ s/.{70}/$&\n/sg;

	# Contig sequence.
	$ctg_iid++;
	print "{CTG\niid:$ctg_iid\n",
		"eid:$ctg_eid\n",
		"seq:\n", $seq, "\n.\n",
		"qlt:\n", $qlt, "\n.\n";

	print for @{$tle{$ctg_eid}};

	print "}\n";
}

=pod

=head1 NAME

abyss2afg - create an AMOS AFG assembly from an ABySS assembly

=head1 SYNOPSIS

B<KAligner> B<--seq> B<-mk> I<K> F<reads.fa> F<contigs.fa> |
 B<abyss2afg> F<contigs.fa> - >F<out.afg>

B<bank-transact> B<-cb> F<out.bnk> B<-m> F<out.afg>

B<hawkeye> F<out.bnk>

=head1 DESCRIPTION

Create an AMOS AFG assembly from an ABySS assembly. The reads are
first aligned against the contigs using KAligner. These alignments are
then converted to an AFG assembly.

=head1 AUTHOR

Written by Shaun Jackman.

=head1 REPORTING BUGS

Report bugs to <abyss@bcgsc.ca>.

=head1 COPYRIGHT

Copyright 2009 Canada's Michael Smith Genome Science Centre

=head1 SEE ALSO

http://www.bcgsc.ca/platform/bioinfo/software/abyss

http://amos.sourceforge.net/hawkeye
