#!/usr/bin/perl
# Usage: abyss2ace reads.fa contigs.fa alignments.kalign >out.ace
# Written by Shaun Jackman <sjackman@bcgsc.ca>.
use strict;
use Getopt::Long;
use Pod::Usage;

# Parse the command line. --help and --man render the POD at the end
# of this file through Pod::Usage; --version prints a short banner
# and exits.
my %opt;
# GetOptions returns false after reporting an invalid option. Stop
# with a usage message instead of silently carrying on with the
# conversion.
GetOptions(\%opt, qw'help man version') or pod2usage(2);
pod2usage(-verbose => 1) if $opt{help};
pod2usage(-verbose => 2) if $opt{man};
if ($opt{version}) {
	print <<EOF;
abyss2ace (ABySS)
Written by Shaun Jackman.

Copyright 2009 Canada's Michael Smith Genome Science Centre
EOF
	exit;
}

# Fail early when an input file is unreadable. A lone `-' is always
# accepted.
foreach my $path (@ARGV) {
	next if $path eq '-';
	die "cannot read `$path'" unless -r $path;
}

# State built while reading the input and consumed by the output stage:
#   $nreads  number of read alignments recorded in %af and %rd
#   $id      ID taken from the latest FASTA header, until its sequence
#            line has been stored
#   %seq     sequence ID => sequence (reads and contigs alike)
#   %af      contig ID => list of ACE "AF" lines
#   %rd      contig ID => list of ACE "RD"/"QA"/"DS" records
my ($nreads, $id, %seq, %af, %rd);

# Read all inputs line by line. Three kinds of line are recognized:
#   >ID ...                        FASTA header
#   ACGT...                        the sequence of the preceding header
#   QUERY <tab> ALIGN <tab> ...    alignments of one read, where QUERY
#       is "ID" or "ID SEQ" and each ALIGN is the whitespace-separated
#       list "tid tstart qstart alength qlength sense"
# A sequence must have been seen before an alignment refers to it.
while (<>) {
	chomp;
	next if /^#/ || /^@/;

	if (/^>/) {
		# Only the first word of the header is the sequence ID.
		($id) = split ' ', substr $_, 1;
		next;
	}

	# Sequence data. When no header is pending, a line containing
	# whitespace cannot be a bare sequence; it is an alignment line
	# whose read ID happens to begin with one of these letters, so
	# let it fall through to the alignment parser.
	if (/^[acgtnACGTN]/ && (defined $id || !/\s/)) {
		die "error: sequence line without a FASTA header"
			. " (multi-line FASTA is not supported)"
			unless defined $id;
		die "error: duplicate sequence ID `$id'" if exists $seq{$id};
		$seq{$id} = $_;
		undef $id;
		next;
	}

	my ($query, @align) = split '\t';
	next if @align == 0;

	# The read sequence is either given on the alignment line or was
	# loaded earlier from a FASTA file.
	my ($qid, $qseq) = split ' ', $query;
	if (!defined $qseq) {
		die "no read `$qid'" unless exists $seq{$qid};
		$qseq = $seq{$qid};
	}

	@align = sort {
		# Sort the alignments first by position on the read and
		# then by the length of the alignment, longest first.
		my (undef, undef, $a_qstart, $a_alength) = split ' ', $a;
		my (undef, undef, $b_qstart, $b_alength) = split ' ', $b;
		$a_qstart <=> $b_qstart || $b_alength <=> $a_alength;
	} @align;

	# Drop every alignment whose span on the read lies strictly
	# inside the span of an alignment that was kept before it.
	my $best_start = 0;
	my $best_end = 0;
	@align = grep {
		my (undef, undef, $qstart, $alength) = split;
		my $qend = $qstart + $alength;
		if ($qstart > $best_start && $qend <= $best_end
				|| $qstart >= $best_start && $qend < $best_end) {
			# This alignment is entirely covered by another
			# alignment, so this alignment is inferior.
			0; # Discard this alignment.
		} else {
			# Both conditions follow from the sort order above.
			die "internal error: alignments of `$qid' are out of order"
				unless $qstart >= $best_start;
			die "internal error: alignments of `$qid' are out of order"
				unless $qend >= $best_end;
			$best_start = $qstart;
			$best_end = $qend;
			1; # Keep this alignment.
		}
	} @align;
	my $nalign = @align;

	my $i = 0;
	for (@align) {
		my ($tid, $tstart, $qstart, $alength, $qlength, $sense)
			= split;
		die "error: read `$qid' has " . length($qseq)
			. " bases but its alignment reports $qlength"
			if length $qseq != $qlength;

		die "no contig `$tid'" if !exists $seq{$tid};
		my $tlength = length $seq{$tid};

		# Read IDs must be unique, so add a suffix if this read
		# has multiple alignments.
		$i++;
		my $suffix = $nalign == 1 ? '' : "_$i/$nalign";

		my $seq = $qseq;
		my $qend = $qstart + $alength;
		if ($sense) {
			# Reverse complement. Lower-case bases are accepted
			# as sequence data above, so complement them too.
			($seq = reverse $seq) =~ tr/ACGTacgt/TGCAtgca/;
			# Reverse coordinates.
			my $s = $qlength - $qend;
			my $e = $qlength - $qstart;
			$qstart = $s;
			$qend = $e;
		}
		$sense = $sense ? 'C' : 'U';

		# Zero-based contig position of the first base of the read;
		# negative when the read overhangs the start of the contig.
		my $astart = $tstart - $qstart;
		# First read base (one-based) that lies on the contig.
		my $qastart = $qstart <= $tstart ? 1
			: 1 + $qstart - $tstart;

		# Last read base (one-based) that lies on the contig.
		my $qaend = $tlength - $astart;
		$qaend = $qlength if $qaend > $qlength;

		my $afstart = 1 + $astart;
		push @{$af{$tid}},
			"AF $qid$suffix $sense $afstart\n";
		push @{$rd{$tid}},
			"RD $qid$suffix $qlength 0 0\n" .
			"$seq\n\n" .
			"QA 1 $qlength $qastart $qaend\n" .
			"DS PHD_FILE: x\n";

		$nreads++;
	}
}

# ACE header: the number of contigs followed by the number of reads.
# One synthetic CONTIG_<id> read is added to every contig below, so
# the read count includes one extra read per contig.
print 'AS ', scalar(keys %rd), ' ',
	($nreads || 0) + scalar(keys %rd), "\n";

# One full line of contig base qualities: twenty values of 30.
my $bq_line = '30 ' x 19 . "30\n";

# Numeric order for numeric contig IDs; the string comparison breaks
# ties so that non-numeric IDs also come out in a stable order.
for my $coid (sort { $a <=> $b || $a cmp $b } keys %rd) {
	my $seq = $seq{$coid};
	my $n = length $seq;

	# Split long lines. The lookahead keeps a newline from being
	# appended after the last chunk when the length is a multiple
	# of 60, which would add a surplus blank line to the output.
	$seq =~ s/(.{60})(?=.)/$1\n/sg;

	# Fake a base segment.
	push @{$af{$coid}},
		"AF CONTIG_$coid U 1\n";
	push @{$rd{$coid}},
		"RD CONTIG_$coid $n 0 0\n" .
		"$seq\n\n" .
		"QA 1 $n 1 $n\n" .
		"DS PHD_FILE: x\n";

	# Contig sequence. The read count includes the synthetic read
	# that was just added.
	print "CO $coid $n ",
		scalar @{$rd{$coid}}, " 1 U\n",
		$seq, "\n\n";

	# Contig quality: 30 for every base, twenty values per line,
	# followed by exactly one blank line.
	print "BQ\n";
	print $bq_line for 1..int($n / 20);
	print '30 ' x ($n % 20 - 1), "30\n" if $n % 20 > 0;
	print "\n";

	print for @{$af{$coid}};
	print "BS 1 $n CONTIG_$coid\n";
	print for @{$rd{$coid}};
}

=pod

=head1 NAME

abyss2ace - create a Consed ACE assembly from an ABySS assembly

=head1 SYNOPSIS

B<KAligner> B<--seq> B<-mk> I<K> F<reads.fa> F<contigs.fa> |
 B<abyss2ace> F<contigs.fa> - >F<out.ace>

B<consed> B<-nophd> B<-ace> F<out.ace>

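=head1 DESCRIPTION

Create a Consed ACE assembly from an ABySS assembly. The reads are
first aligned against the contigs using KAligner. These alignments are
then converted to an ACE assembly.

The input files are read in the order given, and F<-> stands for
standard input. The FASTA file of contigs must come before the
alignments. A FASTA file of reads is needed as well, also before the
alignments, unless the alignments already carry the read sequences
(B<KAligner> B<--seq>). Each FASTA sequence must be on a single line.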

=head1 AUTHOR

Written by Shaun Jackman.

=head1 REPORTING BUGS

Report bugs to <abyss@bcgsc.ca>.

=head1 COPYRIGHT

Copyright 2009 Canada's Michael Smith Genome Science Centre

=head1 SEE ALSO

L<ABYSS(1)>

http://www.bcgsc.ca/platform/bioinfo/software/abyss
