//******************************************************************************
//
// File:    DnaSequence.java
// Package: edu.rit.phyl.pars
// Unit:    Class edu.rit.phyl.pars.DnaSequence
//
// This Java source file is copyright (C) 2007 by Alan Kaminsky. All rights
// reserved. For further information, contact the author, Alan Kaminsky, at
// ark@cs.rit.edu.
//
// This Java source file is part of the Parallel Java Library ("PJ"). PJ is free
// software; you can redistribute it and/or modify it under the terms of the GNU
// General Public License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// PJ is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE. See the GNU General Public License for more details.
//
// A copy of the GNU General Public License is provided in the file gpl.txt. You
// may also obtain a copy of the GNU General Public License on the World Wide
// Web at http://www.gnu.org/licenses/gpl.html.
//
//******************************************************************************

package edu.rit.phyl.pars;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

import java.util.Arrays;

/**
 * Class DnaSequence encapsulates a DNA sequence. The DNA sequence consists of a
 * sequence of <B>sites</B>. Each site has a <B>state,</B> which is a set of
 * <B>bases</B>. The four bases are adenine, cytosine, guanine, and thymine. For
 * textual I/O, each state is represented by a single character as follows:
 * <P>
 * <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>
 * <TR><TD><I>Char.</I></TD><TD WIDTH=20> </TD>
 * <TD><I>Meaning</I></TD><TD WIDTH=20> </TD>
 * <TD><I>Set</I></TD></TR>
 * <TR><TD>A</TD><TD WIDTH=20> </TD>
 * <TD>Adenine</TD><TD WIDTH=20> </TD>
 * <TD>(A)</TD></TR>
 * <TR><TD>C</TD><TD WIDTH=20> </TD>
 * <TD>Cytosine</TD><TD WIDTH=20> </TD>
 * <TD>(C)</TD></TR>
 * <TR><TD>G</TD><TD WIDTH=20> </TD>
 * <TD>Guanine</TD><TD WIDTH=20> </TD>
 * <TD>(G)</TD></TR>
 * <TR><TD>T</TD><TD WIDTH=20> </TD>
 * <TD>Thymine</TD><TD WIDTH=20> </TD>
 * <TD>(T)</TD></TR>
 * <TR><TD>Y</TD><TD WIDTH=20> </TD>
 * <TD>pYrimidine</TD><TD WIDTH=20> </TD>
 * <TD>(C or T)</TD></TR>
 * <TR><TD>R</TD><TD WIDTH=20> </TD>
 * <TD>puRine</TD><TD WIDTH=20> </TD>
 * <TD>(A or G)</TD></TR>
 * <TR><TD>W</TD><TD WIDTH=20> </TD>
 * <TD>"Weak"</TD><TD WIDTH=20> </TD>
 * <TD>(A or T)</TD></TR>
 * <TR><TD>S</TD><TD WIDTH=20> </TD>
 * <TD>"Strong"</TD><TD WIDTH=20> </TD>
 * <TD>(C or G)</TD></TR>
 * <TR><TD>K</TD><TD WIDTH=20> </TD>
 * <TD>"Keto"</TD><TD WIDTH=20> </TD>
 * <TD>(G or T)</TD></TR>
 * <TR><TD>M</TD><TD WIDTH=20> </TD>
 * <TD>"aMino"</TD><TD WIDTH=20> </TD>
 * <TD>(A or C)</TD></TR>
 * <TR><TD>B</TD><TD WIDTH=20> </TD>
 * <TD>not A</TD><TD WIDTH=20> </TD>
 * <TD>(C or G or T)</TD></TR>
 * <TR><TD>D</TD><TD WIDTH=20> </TD>
 * <TD>not C</TD><TD WIDTH=20> </TD>
 * <TD>(A or G or T)</TD></TR>
 * <TR><TD>H</TD><TD WIDTH=20> </TD>
 * <TD>not G</TD><TD WIDTH=20> </TD>
 * <TD>(A or C or T)</TD></TR>
 * <TR><TD>V</TD><TD WIDTH=20> </TD>
 * <TD>not T</TD><TD WIDTH=20> </TD>
 * <TD>(A or C or G)</TD></TR>
 * <TR><TD>X</TD><TD WIDTH=20> </TD>
 * <TD>unknown</TD><TD WIDTH=20> </TD>
 * <TD>(A or C or G or T)</TD></TR>
 * <TR><TD>-</TD><TD WIDTH=20> </TD>
 * <TD>deletion</TD><TD WIDTH=20> </TD>
 * <TD>()</TD></TR>
 * </TABLE>
 * <P>
 * The DNA sequence also has an associated <B>score,</B> an integer. The score
 * can be set to anything and later retrieved.
 *
 * @author  Alan Kaminsky
 * @version 20-Dec-2007
 */
public class DnaSequence
	implements Externalizable
	{

// Hidden constants.

	private static final long serialVersionUID = 711922997806967374L;

	// Number of extra bytes appended to every site array. A site array of
	// length L therefore holds L-PAD sites.
	private static final int PAD = 128;

	// Mapping from the state of a site to the corresponding output character.
	// The index is the site's base bitmap: A=1, C=2, G=4, T=8.
	static final char[] state2char = new char[]
		{/*----*/ '-',
		 /*---A*/ 'A',
		 /*--C-*/ 'C',
		 /*--CA*/ 'M',
		 /*-G--*/ 'G',
		 /*-G-A*/ 'R',
		 /*-GC-*/ 'S',
		 /*-GCA*/ 'V',
		 /*T---*/ 'T',
		 /*T--A*/ 'W',
		 /*T-C-*/ 'Y',
		 /*T-CA*/ 'H',
		 /*TG--*/ 'K',
		 /*TG-A*/ 'D',
		 /*TGC-*/ 'B',
		 /*TGCA*/ 'X'};

// Hidden data members.

	// Sequence data. Each site's set of bases is stored as a bitmap in one
	// byte. A=1, C=2, G=4, T=8. Sequence data is shared between multiple DNA
	// sequence objects wherever possible (see copy()). PAD bytes of padding
	// are added at the end of the array to avert cache interference.
	byte[] mySites;

	// Score.
	int myScore;

	// 128 bytes of extra padding to avert cache interference. These fields are
	// never read or written.
	private transient long p0, p1, p2, p3, p4, p5, p6, p7;
	private transient long p8, p9, pa, pb, pc, pd, pe, pf;

// Exported constructors.

	/**
	 * Construct a new zero-length DNA sequence. The score is initially 0.
	 */
	public DnaSequence()
		{
		this (0);
		}

	/**
	 * Construct a new DNA sequence with the given length. Every site is
	 * initially the empty set of bases (a deletion). The score is initially 0.
	 *
	 * @param  N  Length (number of sites).
	 *
	 * @exception  NegativeArraySizeException
	 *     (unchecked exception) Thrown if <TT>N</TT> &lt; 0.
	 */
	public DnaSequence
		(int N)
		{
		this (N, 0);
		}

	/**
	 * Construct a new DNA sequence with the given length and score. Every site
	 * is initially the empty set of bases (a deletion).
	 *
	 * @param  N      Length (number of sites).
	 * @param  score  Score.
	 *
	 * @exception  NegativeArraySizeException
	 *     (unchecked exception) Thrown if <TT>N</TT> &lt; 0.
	 */
	public DnaSequence
		(int N,
		 int score)
		{
		// The padding would otherwise hide a small negative length: N+PAD is
		// still a legal array size for -PAD <= N < 0.
		if (N < 0)
			{
			throw new NegativeArraySizeException
				("DnaSequence(): N = " + N + " is negative");
			}
		this.mySites = new byte [N+PAD];
		this.myScore = score;
		}

	/**
	 * Construct a new DNA sequence that is a copy of the given DNA sequence.
	 * The new DNA sequence shares the given DNA sequence's site storage; see
	 * {@link #copy(DnaSequence) copy()}.
	 *
	 * @param  seq  DNA sequence to copy.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>seq</TT> is null.
	 */
	public DnaSequence
		(DnaSequence seq)
		{
		copy (seq);
		}

// Exported operations.

	/**
	 * Get this DNA sequence's length.
	 *
	 * @return  Length (number of sites).
	 */
	public int length()
		{
		return mySites.length - PAD;
		}

	/**
	 * Get this DNA sequence's score.
	 *
	 * @return  Score.
	 */
	public int score()
		{
		return myScore;
		}

	/**
	 * Set this DNA sequence's score.
	 *
	 * @param  score  Score.
	 */
	public void score
		(int score)
		{
		myScore = score;
		}

	/**
	 * Make this DNA sequence be a copy of the given DNA sequence. The sites
	 * and the score are both copied.
	 * <P>
	 * <I>Note:</I> The site storage is not duplicated; afterwards this DNA
	 * sequence and <TT>seq</TT> refer to the same underlying array of sites.
	 *
	 * @param  seq  DNA sequence to copy.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>seq</TT> is null.
	 */
	public void copy
		(DnaSequence seq)
		{
		this.mySites = seq.mySites;
		this.myScore = seq.myScore;
		}

	/**
	 * Make this DNA sequence be the ancestor of the two given DNA sequences in
	 * the Fitch parsimony score algorithm. At each site, the ancestor's state
	 * is the intersection of the two children's states if that intersection is
	 * not empty; otherwise it is the union of the two children's states, and
	 * one state change is counted. This DNA sequence's score is set to the
	 * number of state changes at the ancestor.
	 * <P>
	 * A fresh array of sites is allocated for the result, so neither child's
	 * sites are modified. The result's length is that of <TT>seq1</TT>; the
	 * two children are expected to have the same length, and this is not
	 * checked.
	 *
	 * @param  seq1  First child DNA sequence.
	 * @param  seq2  Second child DNA sequence.
	 *
	 * @exception  NullPointerException
	 *     (unchecked exception) Thrown if <TT>seq1</TT> is null or
	 *     <TT>seq2</TT> is null.
	 */
	public void setFitchAncestor
		(DnaSequence seq1,
		 DnaSequence seq2)
		{
		// Allocate storage for new sites.
		byte[] sites1 = seq1.mySites;
		byte[] sites2 = seq2.mySites;
		int N = sites1.length - PAD;
		byte[] newSites = new byte [N+PAD];

		// Process all sites. Count state changes.
		int nChanges = 0;
		for (int i = 0; i < N; ++ i)
			{
			// Compute intersection of states.
			int state1 = sites1[i];
			int state2 = sites2[i];
			int state3 = state1 & state2;

			// If intersection is not empty, record intersection, otherwise
			// record union and note one state change.
			if (state3 == 0)
				{
				state3 = state1 | state2;
				++ nChanges;
				}

			// Update site.
			newSites[i] = (byte) state3;
			}

		// Record new sites.
		mySites = newSites;

		// Record number of state changes.
		myScore = nChanges;
		}

	/**
	 * Determine if this DNA sequence is equal to the given object. The two are
	 * equal if the sequences of states are equal (the scores do not matter).
	 *
	 * @param  obj  Object to test.
	 *
	 * @return  True if this DNA sequence is equal to <TT>obj</TT>, false
	 *          otherwise.
	 */
	@Override
	public boolean equals
		(Object obj)
		{
		return
			obj instanceof DnaSequence &&
			Arrays.equals (this.mySites, ((DnaSequence) obj).mySites);
		}

	/**
	 * Returns a hash code for this DNA sequence. The hash code is computed
	 * from the sequence of states only (the score does not matter), consistent
	 * with {@link #equals(Object) equals()}.
	 *
	 * @return  Hash code.
	 */
	@Override
	public int hashCode()
		{
		return Arrays.hashCode (mySites);
		}

	/**
	 * Returns a string version of this DNA sequence. The string consists of
	 * just the sequence of states (the score is not included), one character
	 * per site, with a space inserted after every 10 sites.
	 *
	 * @return  String version.
	 */
	@Override
	public String toString()
		{
		StringBuilder buf = new StringBuilder();
		int N = mySites.length - PAD;
		for (int i = 0; i < N; ++ i)
			{
			// Separate groups of 10 sites with a space.
			if (i > 0 && i % 10 == 0) buf.append (' ');
			buf.append (state2char [mySites[i]]);
			}
		return buf.toString();
		}

	/**
	 * Write this DNA sequence to the given object output stream. The length is
	 * written as an int, followed by one byte per site, followed by the score
	 * as an int. The padding is not written.
	 *
	 * @param  out  Object output stream.
	 *
	 * @exception  IOException
	 *     Thrown if an I/O error occurred.
	 */
	public void writeExternal
		(ObjectOutput out)
		throws IOException
		{
		int N = mySites.length - PAD;
		out.writeInt (N);
		out.write (mySites, 0, N);
		out.writeInt (myScore);
		}

	/**
	 * Read this DNA sequence from the given object input stream. The stream
	 * must contain the data written by {@link #writeExternal(ObjectOutput)
	 * writeExternal()}. If an exception is thrown, this DNA sequence is left
	 * unchanged.
	 *
	 * @param  in  Object input stream.
	 *
	 * @exception  IOException
	 *     Thrown if an I/O error occurred, or if the length read from the
	 *     stream is invalid.
	 * @exception  ClassNotFoundException
	 *     Thrown if a class needed to deserialize this DNA sequence cannot be
	 *     found.
	 */
	public void readExternal
		(ObjectInput in)
		throws IOException, ClassNotFoundException
		{
		// Validate the length before using it: a negative value, or one for
		// which N+PAD overflows, can only come from a corrupt stream.
		int N = in.readInt();
		if (N < 0 || N > Integer.MAX_VALUE - PAD)
			{
			throw new IOException
				("DnaSequence.readExternal(): Invalid length " + N);
			}

		// Read into local variables first, so that a failure partway through
		// does not leave this object half updated.
		byte[] sites = new byte [N+PAD];
		in.readFully (sites, 0, N);
		int score = in.readInt();

		mySites = sites;
		myScore = score;
		}

	}
