package com.biotechvana.javabiotoolkit.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.Comparator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;

import com.biotechvana.javabiotoolkit.AminoAcid;
import com.biotechvana.javabiotoolkit.BioSequence;
import com.biotechvana.javabiotoolkit.DNABase;
import com.biotechvana.javabiotoolkit.DNASequence;
import com.biotechvana.javabiotoolkit.NucleotideSequenceDirectionality;
import com.biotechvana.javabiotoolkit.PeptideSequenceDirectionality;
import com.biotechvana.javabiotoolkit.ProteinSequence;
import com.biotechvana.javabiotoolkit.RNABase;
import com.biotechvana.javabiotoolkit.RNASequence;
import com.biotechvana.javabiotoolkit.exceptions.InvalidSequenceCharacterException;
import com.biotechvana.javabiotoolkit.exceptions.SequenceTooLongException;
import com.biotechvana.javabiotoolkit.text.LineSeparatorFormat;
import com.biotechvana.javabiotoolkit.text.UTF8BufferTrimmer;

/**
 * Objects of this class mark the position of a FASTA record (its header and its sequence block) in a plaintext 
 * file. They do not guarantee that the block of text following the header is a well-formed sequence (i.e. 
 * invalid characters may appear).
 *
 * @version	1.4
 * 
 * @see		FASTAReader
 * 	
 * @author	<a href="mailto:alfonso.munozpomer@biotechvana.com">Alfonso Muñoz-Pomer Fuentes</a>,
 * 			<a href="http://www.biotechvana.com">Biotechvana</a>.
 */
public class FASTAFileRecord
{
	
	/**
	 * Orders <code>FASTAFileRecord</code>s by the byte offset of their sequence block within a file, lowest 
	 * offset first.
	 * 
	 * @author	Alfonso Muñoz-Pomer Fuentes, <a href="http://www.biotechvana.com">Biotechvana</a>
	 */
	public static Comparator<FASTAFileRecord> sequenceByteOffsetComparator = new Comparator<FASTAFileRecord>()
	{
		@Override
		public int compare(FASTAFileRecord fr1, FASTAFileRecord fr2)
		{
			final long firstOffset = fr1.sequenceByteOffset;
			final long secondOffset = fr2.sequenceByteOffset;
			
			// Two records of the same file are not expected to share an offset, but handle it anyway
			if (firstOffset == secondOffset)
			{
				return 0;
			}
			return (firstOffset < secondOffset) ? -1 : 1;
		}
	};
	
	/**
	 * Orders <code>FASTAFileRecord</code>s by their cached header line, using the case-sensitive ordering of 
	 * <code>String.compareTo</code>. The header is read straight from <code>headerLineSB</code>; it is not 
	 * loaded from the file, so comparing a record whose header line is still <code>null</code> throws a 
	 * <code>NullPointerException</code>.
	 */
	public static Comparator<FASTAFileRecord> descriptionComparator = new Comparator<FASTAFileRecord>()
	{
		@Override
		public int compare(FASTAFileRecord fr1, FASTAFileRecord fr2)
		{
			final String firstHeader = fr1.headerLineSB.toString();
			final String secondHeader = fr2.headerLineSB.toString();
			return firstHeader.compareTo(secondHeader);
		}
	};
	
	/**
	 * Orders <code>FASTAFileRecord</code>s by their cached header line, ignoring case differences (the ordering 
	 * of <code>String.compareToIgnoreCase</code>). The header is read straight from <code>headerLineSB</code>; 
	 * it is not loaded from the file, so comparing a record whose header line is still <code>null</code> throws 
	 * a <code>NullPointerException</code>.
	 */
	public static Comparator<FASTAFileRecord> descriptionComparatorCaseInsensitive = new Comparator<FASTAFileRecord>()
	{
		@Override
		public int compare(FASTAFileRecord fr1, FASTAFileRecord fr2)
		{
			final String firstHeader = fr1.headerLineSB.toString();
			final String secondHeader = fr2.headerLineSB.toString();
			return firstHeader.compareToIgnoreCase(secondHeader);
		}
	};
	
	
	
	
	
	// The File can be obtained from the outer class. The reference is here just in case.
	// File holding this record; every retrieve* method opens it anew through a FileInputStream.
	private File filePath;
	// Charset used to decode the bytes read from the file (and to measure how many bytes each character spans).
	private Charset fileCharset;
	// Line separator format handed to the constructor; stored but not read anywhere in this class.
	@SuppressWarnings("unused")
	private LineSeparatorFormat fileLineSeparator;
	// Byte offset within the file at which the header (description) block starts.
	long descriptionByteOffset;
	// Number of bytes spanned by the header (description) block.
	long descriptionBytes;
	// Byte offset within the file at which the sequence block starts.
    long sequenceByteOffset;
	// Number of bytes spanned by the sequence block.
	long sequenceBytes;
	// Cached header line without the leading '>' and trailing line breaks; null until supplied or loaded.
	StringBuilder headerLineSB;
	// Length of the sequence in symbols, as supplied to the constructor.
	long sequenceLength;
	// Component boundaries: the values are byte offsets within the file; consecutive entries delimit one component.
	// NOTE(review): the meaning of the keys is not visible in this class -- confirm against FASTAReader.
	SortedMap<Long, Long> componentsByteOffsets;
	// Created by the constructor only when componentsByteOffsets has more than two entries; otherwise null.
	@SuppressWarnings("unused")
	private IContigProvider contigProvider;
	
	/**
	 * Creates a reference to a FASTA record stored in a file. FASTA records are identified by lines which start 
	 * with the &ldquo;&gt;&rdquo; (&ldquo;greater than&rdquo;) character, which may be followed by a description 
	 * (usually a sequence identifier) in the same line. The next line should contain a IUB/IUPAC coded sequence 
	 * (nucleotides or amino acids), possibly split along multiple lines. Invalid characters (that is, those which 
	 * do not correspond to an amino acid or nucleotide base <em>in the sequence block</em>) can be discarded when 
	 * the sequence is actually read.
	 * <p>
	 * Note: character offsets vary depending on the encoding used and may or may not be equal to the offset in 
	 * bytes.
	 * <p>
	 * When <code>componentsByteOffsets</code> holds more than two entries an internal contig provider is created 
	 * which reads individual components as DNA sequences; otherwise no provider is created.
	 * 
	 * @param	filePath				file in which this FASTA header is stored.
	 * @param	fileCharset				encoding <code>Charset</code> for the file.
	 * @param	fileLineSeparator		format of the line separator (<code>\n</code>, <code>\r</code> or
	 * 									<code>\r\n</code>).
	 * @param	descriptionByteOffset	file offset in bytes where the record starts.
	 * @param	descriptionBytes		number of bytes spanned by the description block.
	 * @param	sequenceByteOffset		file offset in bytes where the sequence block starts.
	 * @param	sequenceBytes			number of bytes spanned by the sequence block.
	 * @param	componentsByteOffsets	map whose values are the byte offsets delimiting the components of the 
	 * 									sequence; may be <code>null</code>.
	 * @param	sequenceLength			length in symbols of the sequence block.
	 * @param	descLineBS				header line of the record, or <code>null</code> if it is to be read from 
	 * 									the file on demand. A copy is stored, stripped of a leading 
	 * 									&ldquo;&gt;&rdquo; and of trailing line breaks.
	 * 
	 * @since	0.1
	 */
	public FASTAFileRecord(File filePath, Charset fileCharset, LineSeparatorFormat fileLineSeparator,
	 long descriptionByteOffset, long descriptionBytes,
	 long sequenceByteOffset, long sequenceBytes, SortedMap<Long, Long> componentsByteOffsets,
	 long sequenceLength,StringBuilder descLineBS)
	{
		this.filePath = filePath;
		this.fileCharset = fileCharset;
		this.fileLineSeparator = fileLineSeparator;
		
		this.descriptionByteOffset = descriptionByteOffset;
		this.descriptionBytes = descriptionBytes;
		this.sequenceByteOffset = sequenceByteOffset;
		this.sequenceBytes = sequenceBytes;
		
		this.headerLineSB = null;
		this.sequenceLength = sequenceLength;
		
		// If there are only the start and end offsets do not create a contigProvider, otherwise do...
		this.componentsByteOffsets = componentsByteOffsets;
		if (componentsByteOffsets == null || componentsByteOffsets.size() <= 2)
		{
			contigProvider = null;
		}
		else
		{
			// This implementation of IContigProvider contemplates DNA sequences only
			contigProvider = new IContigProvider()
			{
				@Override
				public DNASequence loadDnaComponent(boolean ignoreInvalidCharacters, int index)
				throws SequenceTooLongException, FileNotFoundException, InvalidSequenceCharacterException, 
					   IOException
				{
					if (FASTAFileRecord.this.sequenceLength > Integer.MAX_VALUE)
					{
						throw new SequenceTooLongException();
					}

					// Resolve the component boundaries once and BEFORE opening the file: an out of range index 
					// then fails without leaving an open channel behind, and the key array is not rebuilt for 
					// every character read.
					Object[] componentKeys = FASTAFileRecord.this.componentsByteOffsets.keySet().toArray();
					final long componentStartOffset = 
						FASTAFileRecord.this.componentsByteOffsets.get(componentKeys[index]);
					final long componentEndOffset = 
						FASTAFileRecord.this.componentsByteOffsets.get(componentKeys[index + 1]);
					
					final Charset charset = FASTAFileRecord.this.fileCharset;
					// Compare charsets by value, not by reference
					final boolean utf8Encoded = Charset.forName("UTF-8").equals(charset);
					
					ByteBuffer bBuffer = ByteBuffer.allocate(FASTAReader.I_byteBufferSize);
					DNASequence ds = new DNASequence("", NucleotideSequenceDirectionality.C5_C3, "");
					
					// To keep track of the offset within the file
					long fileByteOffset = componentStartOffset;
					
					FileChannel inFC =
						new FileInputStream(FASTAFileRecord.this.filePath).getChannel();	// FileNotFoundException
					try
					{
						inFC.position(componentStartOffset);	// IOException
						while (inFC.read(bBuffer) != -1 && fileByteOffset < componentEndOffset)	// IOException
						{
							bBuffer.flip();
							// If not EOF and encoding is UTF-8...
							if (utf8Encoded && inFC.size() - inFC.position() > 0)	// IOException
							{
								// ... maybe the buffer ends at an incomplete multiple byte character 
								inFC.position(inFC.position() - UTF8BufferTrimmer.endTrimUTF8Characters(bBuffer));
							}
							CharBuffer cBuffer = charset.decode(bBuffer);
							bBuffer.clear();
							
							// The component starts/is in this buffer
							while (cBuffer.hasRemaining() && fileByteOffset < componentEndOffset)
							{
								char c = cBuffer.get();
								long cBytes = charset.encode(String.valueOf(c)).limit();

								if (DNABase.valueOf(c) != null)
								{	// ... insert the base
									ds.add(DNABase.valueOf(c));
								}
								else if (!ignoreInvalidCharacters && c != '\n' && c != '\r')
								{
									throw new InvalidSequenceCharacterException(c + " is not a valid nucleotide IUPAC character.");
								}
								// Update current byte count
								fileByteOffset += cBytes;
							}
						}
					}
					finally
					{
						inFC.close();	// IOException
					}
					
					return ds;
				}

				@Override
				public DNASequence loadDnaComponent(int index)
				throws SequenceTooLongException, InvalidSequenceCharacterException, 
					   FileNotFoundException, IOException
				{
					return loadDnaComponent(true, index);
				}

				@Override
				public int getNumberOfComponents()
				{
					// n boundaries delimit n - 1 components
					return FASTAFileRecord.this.componentsByteOffsets.size() - 1;
				}

				@Override
				public int contigLength()
				{
					return (int)FASTAFileRecord.this.length();
				}
			};
		}
		
		
		if (descLineBS != null)
		{
			headerLineSB = new StringBuilder(descLineBS);
			// Guard against an empty header line: charAt(0) would throw on it
			if (headerLineSB.length() > 0 && headerLineSB.charAt(0) == '>')
			{
				headerLineSB.deleteCharAt(0);
			}
			while (headerLineSB.length() > 0 && (headerLineSB.charAt(headerLineSB.length() - 1) == '\n' ||
					headerLineSB.charAt(headerLineSB.length() - 1) == '\r'))
			{
				headerLineSB.deleteCharAt(headerLineSB.length() - 1);
			}
		}
	}
	
	
	/**
	 * Creates a reference to a FASTA record without a cached header line. Delegates to the full constructor 
	 * passing <code>null</code> as the header line, so the header is read from the file when it is requested.
	 * 
	 * @param	filePath				file in which this FASTA header is stored.
	 * @param	fileCharset				encoding <code>Charset</code> for the file.
	 * @param	fileLineSeparator		format of the line separator.
	 * @param	descriptionByteOffset	file offset in bytes where the record starts.
	 * @param	descriptionBytes		number of bytes spanned by the description block.
	 * @param	sequenceByteOffset		file offset in bytes where the sequence block starts.
	 * @param	sequenceBytes			number of bytes spanned by the sequence block.
	 * @param	componentsByteOffsets	map whose values are the byte offsets delimiting the components of the 
	 * 									sequence; may be <code>null</code>.
	 * @param	sequenceLength			length in symbols of the sequence block.
	 */
	public FASTAFileRecord(File filePath, Charset fileCharset, LineSeparatorFormat fileLineSeparator,
			 long descriptionByteOffset, long descriptionBytes,
			 long sequenceByteOffset, long sequenceBytes, SortedMap<Long, Long> componentsByteOffsets,
			 long sequenceLength) {
		this(filePath, fileCharset, fileLineSeparator,
			 descriptionByteOffset, descriptionBytes,
			 sequenceByteOffset, sequenceBytes,
			 componentsByteOffsets, sequenceLength,
			 (StringBuilder) null);
	}
	
	
	/**
	 * Reads the header of this record from the file and returns it without the initial &ldquo;&gt;&rdquo; 
	 * character and without trailing line breaks. The file is opened, read from 
	 * <code>descriptionByteOffset</code> for <code>descriptionBytes</code> bytes and closed on every call; the 
	 * result is not cached by this method.
	 * 
	 * @return	description of this FASTA record, removing the initial &ldquo;&gt;&rdquo; character. The 
	 * 			returned builder is empty if no characters could be read.
	 * 
	 * @throws FileNotFoundException	if the file referred by this record cannot be found.
	 * @throws IOException				if there is an I/O error when opening, reading or closing the file.
	 *
	 * @since	0.1
	 */
	public StringBuilder retrieveDescriptionSB() throws FileNotFoundException, IOException
	{
		// First byte past the description block
		final long descriptionEndOffset = descriptionByteOffset + descriptionBytes;
		// Compare charsets by value, not by reference; the answer does not change while reading
		final boolean utf8Encoded = Charset.forName("UTF-8").equals(fileCharset);
		
		ByteBuffer bBuffer = ByteBuffer.allocate(FASTAReader.I_byteBufferSize);
		StringBuilder descriptionSB = new StringBuilder();
		
		// To keep track of the offset within the file
		long fileByteOffset = descriptionByteOffset;
		
		// Get the channel from the File argument
		FileChannel inFC = new FileInputStream(this.filePath).getChannel();	// FileNotFoundException
		try
		{
			inFC.position(descriptionByteOffset);	// IOException
			while (inFC.read(bBuffer) != -1 && fileByteOffset < descriptionEndOffset)	// IOException
			{
				bBuffer.flip();
				// If not EOF and encoding is UTF-8...
				if (utf8Encoded && inFC.size() - inFC.position() > 0)	// IOException
				{
					// ... maybe the buffer ends at an incomplete multiple byte character 
					inFC.position(inFC.position() - UTF8BufferTrimmer.endTrimUTF8Characters(bBuffer));
				}
				CharBuffer cBuffer = fileCharset.decode(bBuffer);
				bBuffer.clear();
				
				// The description starts/is in this buffer
				while (cBuffer.hasRemaining() && fileByteOffset < descriptionEndOffset)
				{
					char c = cBuffer.get();
					descriptionSB.append(c);
					
					// Advance by the number of bytes this character spans in the file encoding
					fileByteOffset += fileCharset.encode(String.valueOf(c)).limit();
				}
			}
		}
		finally
		{
			inFC.close();	// IOException
		}
		
		// Nothing may have been read (empty description block or offset at EOF): charAt(0) would throw
		if (descriptionSB.length() > 0 && descriptionSB.charAt(0) == '>')
		{
			descriptionSB.deleteCharAt(0);
		}
		while (descriptionSB.length() > 0 && (descriptionSB.charAt(descriptionSB.length() - 1) == '\n' ||
											  descriptionSB.charAt(descriptionSB.length() - 1) == '\r'))
		{
			descriptionSB.deleteCharAt(descriptionSB.length() - 1);
		}
		return descriptionSB;
	}
	
	/**
	 * Reads the sequence block of this record (<code>sequenceBytes</code> bytes starting at 
	 * <code>sequenceByteOffset</code>), guessing the sequence type, and returns an instance of 
	 * <code>BioSequence</code>.
	 * <p>
	 * The block is first assumed to be DNA. On the first character that is not a DNA base the symbols read so 
	 * far are converted to RNA (only if no thymine has been read and the character is an RNA base) or to a 
	 * protein (if the character is an amino acid symbol). A sequence being read as RNA is likewise converted to 
	 * a protein on the first character that is not an RNA base but is an amino acid. Line breaks 
	 * (<code>\n</code>, <code>\r</code>) are skipped.
	 * <p>
	 * If a character fits none of the types the sequence type becomes unknown: any further sequence character 
	 * triggers an <code>InvalidSequenceCharacterException</code>, whereas if no further sequence character 
	 * follows <code>null</code> is returned.
	 * 
	 * @return	a <code>DnaSequence</code>, a <code>RnaSequence</code> or a <code>ProteinSequence</code>, or 
	 * 			<code>null</code> if the sequence type could not be determined at the very last character.
	 * 
	 * @throws	FileNotFoundException				if the file referred by this record cannot be found.
	 * @throws	IOException							if there is an I/O error when opening, reading or closing the 
	 * 												file.
	 * @throws	InvalidSequenceCharacterException	if the sequence type cannot be determined and more sequence 
	 * 												characters follow. 
	 * @throws	SequenceTooLongException			if the sequence requested to be loaded is longer than
	 * 												<code>Integer.MAX_VALUE</code>.
	 * 
	 * @since	1.2
	 */
	BioSequence retrieveBioSequence()
	throws FileNotFoundException, IOException, InvalidSequenceCharacterException, SequenceTooLongException
	{
		if (sequenceLength > Integer.MAX_VALUE)
		{
			throw new SequenceTooLongException();
		}
		
		// First byte past the sequence block
		final long sequenceEndOffset = sequenceByteOffset + sequenceBytes;
		// Compare charsets by value, not by reference; the answer does not change while reading
		final boolean utf8Encoded = Charset.forName("UTF-8").equals(fileCharset);
		
		ByteBuffer bBuffer = ByteBuffer.allocate(FASTAReader.I_byteBufferSize);
		
		// To keep track of the offset within the file
		long fileByteOffset = sequenceByteOffset;
		long cBytes = 0;
		char c;
		// Last sequence character (line breaks excluded) handled before the current one; used only to report 
		// the character that made the sequence type unknown
		char previousSequenceChar = '\n';
		
		boolean dnaSequence = true;
		boolean hasThymine = false;
		boolean rnaSequence = false;
		boolean proteinSequence = false;
		
		DNASequence ds = new DNASequence("", NucleotideSequenceDirectionality.C5_C3, "");
		RNASequence rs = new RNASequence("", NucleotideSequenceDirectionality.C5_C3, "");
		ProteinSequence ps = new ProteinSequence("", PeptideSequenceDirectionality.N_C, "");
		
		// Get the channel from the File argument
		FileChannel inFC = new FileInputStream(this.filePath).getChannel();	// FileNotFoundException
		try
		{
			if (headerLineSB == null)
			{
				// We shouldn't load as a side effect. Everything is clean and pure unless requested otherwise.
				ds.setDescription(retrieveDescriptionSB());
			}
			else
			{
				ds.setDescription(headerLineSB);
			}
			inFC.position(sequenceByteOffset);	// IOException
			while (inFC.read(bBuffer) != -1 && fileByteOffset < sequenceEndOffset)	// IOException
			{
				bBuffer.flip();
				// If not EOF and encoding is UTF-8...
				if (utf8Encoded && inFC.size() - inFC.position() > 0)	// IOException
				{
					// ... maybe the buffer ends at an incomplete multiple byte character 
					inFC.position(inFC.position() - UTF8BufferTrimmer.endTrimUTF8Characters(bBuffer));
				}
				CharBuffer cBuffer = fileCharset.decode(bBuffer);
				bBuffer.clear();
				
				// The sequence starts/is in this buffer
				while (cBuffer.hasRemaining() && fileByteOffset < sequenceEndOffset)
				{
					c = cBuffer.get();
					cBytes = fileCharset.encode(String.valueOf(c)).limit();

					if (c != '\n' && c != '\r')
					{
						// If we're getting a DNA sequence...
						if (dnaSequence)
						{
							if (DNABase.valueOf(c) != null)
							{	// ... insert the base
								ds.add(DNABase.valueOf(c));
								if (!hasThymine && ds.get(ds.getLength() - 1) == DNABase.T)
								{
									hasThymine = true;
								}
							}
							else
							{	// ... or relax conditions for an appropriate sequence
								dnaSequence = false;
								// If the DNA sequence didn't have T and c can be mapped to a RNA base... (it's U)
								if (!hasThymine && RNABase.valueOf(c) != null)
								{
									rnaSequence = true;
									// Copy what has been read so far into the RNA sequence, symbol by symbol
									rs.setDescription(ds.getDescription());
									for (int i = 0 ; i < ds.getLength() ; i++)
									{
										rs.add(RNABase.valueOf(ds.get(i).getUpperCaseChar()));
									}
									rs.add(RNABase.valueOf(c));
									ds = null;	// No longer needed, let it be collected
								}
								// ... otherwise convert it to a protein if c is an amino acid symbol
								else if (AminoAcid.valueOf(c) != null)
								{
									proteinSequence = true;
									ps.setDescription(ds.getDescription());
									for (int i = 0 ; i < ds.getLength() ; i++)
									{
										ps.add(AminoAcid.valueOf(ds.get(i).getUpperCaseChar()));
									}
									ps.add(AminoAcid.valueOf(c));
									ds = null;	// No longer needed, let it be collected
								}
							}
						}
						// If we're getting a RNA sequence...
						else if (rnaSequence)
						{
							if (RNABase.valueOf(c) != null)
							{	// ... insert the base
								rs.add(RNABase.valueOf(c));
							}
							else
							{	// ... or relax conditions for an appropriate sequence
								rnaSequence = false;
								// The only possibility left is a protein sequence. Is c an amino acid?
								if (AminoAcid.valueOf(c) != null)
								{
									proteinSequence = true;
									ps.setDescription(rs.getDescription());
									for (int i = 0 ; i < rs.getLength() ; i++)
									{
										ps.add(AminoAcid.valueOf(rs.get(i).getUpperCaseChar()));
									}
									ps.add(AminoAcid.valueOf(c));
									rs = null;	// No longer needed, let it be collected
								}
							}
						}
						// If we're getting a protein sequence...
						else if (proteinSequence)
						{
							if (AminoAcid.valueOf(c) != null)
							{
								ps.add(AminoAcid.valueOf(c));
							}
							else
							{
								proteinSequence = false;
							}
						}
						// Every flag is false: a previous character fitted no sequence type.
						else
						{
							throw new InvalidSequenceCharacterException(
									previousSequenceChar + " is not a nucleotide base or an amino acid. Sequence type " +
									"cannot be determined");
						}
						// Remember sequence characters only, so that the message above never reports a line 
						// break which happened to sit between the offending character and the next one
						previousSequenceChar = c;
					}
					// Update current byte count
					fileByteOffset += cBytes;
				}
			}
		}
		finally
		{
			inFC.close();	// IOException
		}
		
		// The whole sequence block has been read
		if (dnaSequence)
		{
			return ds;
		}
		else if (rnaSequence)
		{
			return rs;
		}
		else if (proteinSequence)
		{
			return ps;
		}
		// The type became unknown at the last sequence character
		else
		{
			return null;
		}
	}
	
	/**
	 * Returns the <code>DnaSequence</code> instance represented by this FASTA record. The sequence block 
	 * (<code>sequenceBytes</code> bytes starting at <code>sequenceByteOffset</code>) is read from the file; the 
	 * description of the result is the cached header line or, if none is cached, the header read from the file.
	 * 
	 * @param	ignoreInvalidCharacters	if <code>true</code> characters that do not match a DNA nucleotide IUPAC 
	 * 									symbol from the input file are ignored. Line breaks are always ignored.
	 * 
	 * @return	the <code>DnaSequence</code> represented by this record.
	 * 
	 * @throws	FileNotFoundException				if the file referred by this record cannot be found.
	 * @throws	IOException							if there is an I/O error when opening, reading or closing the 
	 * 												file.
	 * @throws	InvalidSequenceCharacterException	if <code>ignoreInvalidCharacters</code> is set to 
	 * 												<code>false</code> and an invalid character is read.
	 * @throws	SequenceTooLongException			if the requested sequence is longer than 
	 * 												<code>Integer.MAX_VALUE</code>.
	 * 
	 * @since	1.2
	 */
	DNASequence retrieveDnaSequence(boolean ignoreInvalidCharacters)
	throws FileNotFoundException, IOException, InvalidSequenceCharacterException, SequenceTooLongException
	{
		if (sequenceLength > Integer.MAX_VALUE)
		{
			throw new SequenceTooLongException();
		}

		// First byte past the sequence block
		final long sequenceEndOffset = sequenceByteOffset + sequenceBytes;
		// Compare charsets by value, not by reference; the answer does not change while reading
		final boolean utf8Encoded = Charset.forName("UTF-8").equals(fileCharset);
		
		// Allocate everything before the file is opened, so a failure here cannot leave an open channel
		ByteBuffer bBuffer = ByteBuffer.allocate(FASTAReader.I_byteBufferSize);
		DNASequence ds = new DNASequence("", NucleotideSequenceDirectionality.C5_C3, "");
		
		// To keep track of the offset within the file
		long fileByteOffset = sequenceByteOffset;
		
		// Get the channel from the File argument
		FileChannel inFC = new FileInputStream(this.filePath).getChannel();	// FileNotFoundException
		try
		{
			if (headerLineSB == null)
			{
				// We shouldn't load as a side effect. Everything is clean and pure unless requested otherwise.
				ds.setDescription(retrieveDescriptionSB());
			}
			else
			{
				ds.setDescription(headerLineSB);
			}
			inFC.position(sequenceByteOffset);	// IOException
			while (inFC.read(bBuffer) != -1 && fileByteOffset < sequenceEndOffset)	// IOException
			{
				bBuffer.flip();
				// If not EOF and encoding is UTF-8...
				if (utf8Encoded && inFC.size() - inFC.position() > 0)	// IOException
				{
					// ... maybe the buffer ends at an incomplete multiple byte character 
					inFC.position(inFC.position() - UTF8BufferTrimmer.endTrimUTF8Characters(bBuffer));
				}
				CharBuffer cBuffer = fileCharset.decode(bBuffer);
				bBuffer.clear();
				
				// The sequence starts/is in this buffer
				while (cBuffer.hasRemaining() && fileByteOffset < sequenceEndOffset)
				{
					char c = cBuffer.get();
					long cBytes = fileCharset.encode(String.valueOf(c)).limit();

					if (DNABase.valueOf(c) != null)
					{	// ... insert the base
						ds.add(DNABase.valueOf(c));
					}
					else if (!ignoreInvalidCharacters && c != '\n' && c != '\r')
					{
						throw new InvalidSequenceCharacterException(
								c + " is not a valid nucleotide IUPAC character.");
					}
					// Update current byte count
					fileByteOffset += cBytes;
				}
			}
		}
		finally
		{
			inFC.close();	// IOException
		}
		
		return ds;
	}
	
	/**
	 * Returns the <code>RnaSequence</code> instance represented by this FASTA record. The sequence block 
	 * (<code>sequenceBytes</code> bytes starting at <code>sequenceByteOffset</code>) is read from the file; the 
	 * description of the result is the cached header line or, if none is cached, the header read from the file.
	 * 
	 * @param	ignoreInvalidCharacters	if <code>true</code> characters that do not match a RNA nucleotide IUPAC 
	 * 									symbol from the input file are ignored. Line breaks are always ignored.
	 * 
	 * @return	the <code>RnaSequence</code> represented by this record.
	 * 
	 * @throws	FileNotFoundException				if the file referred by this record cannot be found.
	 * @throws	IOException							if there is an I/O error when opening, reading or closing the 
	 * 												file.
	 * @throws	InvalidSequenceCharacterException	if <code>ignoreInvalidCharacters</code> is set to 
	 * 												<code>false</code> and an invalid character is read.
	 * @throws	SequenceTooLongException			if the requested sequence is longer than 
	 * 												<code>Integer.MAX_VALUE</code>.
	 * 
	 * @since	1.2
	 */
	RNASequence retrieveRnaSequence(boolean ignoreInvalidCharacters)
	throws FileNotFoundException, IOException, InvalidSequenceCharacterException, SequenceTooLongException
	{
		if (sequenceLength > Integer.MAX_VALUE)
		{
			throw new SequenceTooLongException();
		}

		// First byte past the sequence block
		final long sequenceEndOffset = sequenceByteOffset + sequenceBytes;
		// Compare charsets by value, not by reference; the answer does not change while reading
		final boolean utf8Encoded = Charset.forName("UTF-8").equals(fileCharset);
		
		// Allocate everything before the file is opened, so a failure here cannot leave an open channel
		ByteBuffer bBuffer = ByteBuffer.allocate(FASTAReader.I_byteBufferSize);
		RNASequence rs = new RNASequence("", NucleotideSequenceDirectionality.C5_C3, "");
		
		// To keep track of the offset within the file
		long fileByteOffset = sequenceByteOffset;
		
		// Get the channel from the File argument
		FileChannel inFC = new FileInputStream(this.filePath).getChannel();	// FileNotFoundException
		try
		{
			if (headerLineSB == null)
			{
				// We shouldn't load as a side effect. Everything is clean and pure unless requested otherwise.
				rs.setDescription(retrieveDescriptionSB());
			}
			else
			{
				rs.setDescription(headerLineSB);
			}
			inFC.position(sequenceByteOffset);	// IOException
			while (inFC.read(bBuffer) != -1 && fileByteOffset < sequenceEndOffset)	// IOException
			{
				bBuffer.flip();
				// If not EOF and encoding is UTF-8...
				if (utf8Encoded && inFC.size() - inFC.position() > 0)	// IOException
				{
					// ... maybe the buffer ends at an incomplete multiple byte character 
					inFC.position(inFC.position() - UTF8BufferTrimmer.endTrimUTF8Characters(bBuffer));
				}
				CharBuffer cBuffer = fileCharset.decode(bBuffer);
				bBuffer.clear();
				
				// The sequence starts/is in this buffer
				while (cBuffer.hasRemaining() && fileByteOffset < sequenceEndOffset)
				{
					char c = cBuffer.get();
					long cBytes = fileCharset.encode(String.valueOf(c)).limit();

					if (RNABase.valueOf(c) != null)
					{	// ... insert the base
						rs.add(RNABase.valueOf(c));
					}
					else if (!ignoreInvalidCharacters && c != '\n' && c != '\r')
					{
						throw new InvalidSequenceCharacterException(
								c + " is not a valid nucleotide IUPAC character.");
					}
					// Update current byte count
					fileByteOffset += cBytes;
				}
			}
		}
		finally
		{
			inFC.close();	// IOException
		}
		
		return rs;
	}

	/**
	 * Returns the <code>ProteinSequence</code> instance represented by this FASTA record. The sequence block 
	 * (<code>sequenceBytes</code> bytes starting at <code>sequenceByteOffset</code>) is read from the file; the 
	 * description of the result is the cached header line or, if none is cached, the header read from the file.
	 * <p>
	 * Unlike the DNA and RNA counterparts this method performs no check of the sequence length against 
	 * <code>Integer.MAX_VALUE</code> and therefore never throws <code>SequenceTooLongException</code>.
	 * 
	 * @param	ignoreInvalidCharacters	if <code>true</code> characters that do not match an amino acid IUPAC 
	 * 									symbol from the input file are ignored. Line breaks are always ignored.
	 * 
	 * @return	the <code>ProteinSequence</code> represented by this record.
	 * 
	 * @throws	FileNotFoundException				if the file referred by this record cannot be found.
	 * @throws	IOException							if there is an I/O error when opening, reading or closing the 
	 * 												file.
	 * @throws	InvalidSequenceCharacterException	if <code>ignoreInvalidCharacters</code> is set to 
	 * 												<code>false</code> and an invalid character is read.
	 * 
	 * @since	1.2
	 */
	ProteinSequence retrieveProteinSequence(boolean ignoreInvalidCharacters)
	throws FileNotFoundException, IOException, InvalidSequenceCharacterException
	{
		// First byte past the sequence block
		final long sequenceEndOffset = sequenceByteOffset + sequenceBytes;
		// Compare charsets by value, not by reference; the answer does not change while reading
		final boolean utf8Encoded = Charset.forName("UTF-8").equals(fileCharset);
		
		// Allocate everything before the file is opened, so a failure here cannot leave an open channel
		ByteBuffer bBuffer = ByteBuffer.allocate(FASTAReader.I_byteBufferSize);
		ProteinSequence ps = new ProteinSequence("", PeptideSequenceDirectionality.N_C, "");
		
		// To keep track of the offset within the file
		long fileByteOffset = sequenceByteOffset;
		
		// Get the channel from the File argument
		FileChannel inFC = new FileInputStream(this.filePath).getChannel();	// FileNotFoundException
		try
		{
			if (headerLineSB == null)
			{
				// We shouldn't load as a side effect. Everything is clean and pure unless requested otherwise.
				ps.setDescription(retrieveDescriptionSB());
			}
			else
			{
				ps.setDescription(headerLineSB);
			}
			inFC.position(sequenceByteOffset);	// IOException
			while (inFC.read(bBuffer) != -1 && fileByteOffset < sequenceEndOffset)	// IOException
			{
				bBuffer.flip();
				// If not EOF and encoding is UTF-8...
				if (utf8Encoded && inFC.size() - inFC.position() > 0)	// IOException
				{
					// ... maybe the buffer ends at an incomplete multiple byte character 
					inFC.position(inFC.position() - UTF8BufferTrimmer.endTrimUTF8Characters(bBuffer));
				}
				CharBuffer cBuffer = fileCharset.decode(bBuffer);
				bBuffer.clear();
				
				// The sequence starts/is in this buffer
				while (cBuffer.hasRemaining() && fileByteOffset < sequenceEndOffset)
				{
					char c = cBuffer.get();
					long cBytes = fileCharset.encode(String.valueOf(c)).limit();

					if (AminoAcid.valueOf(c) != null)
					{	// ... insert the amino acid
						ps.add(AminoAcid.valueOf(c));
					}
					else if (!ignoreInvalidCharacters && c != '\n' && c != '\r')
					{
						throw new InvalidSequenceCharacterException(
								c + " is not a valid amino acid IUPAC character.");
					}
					// Update current byte count
					fileByteOffset += cBytes;
				}
			}
		}
		finally
		{
			inFC.close();	// IOException
		}
		
		return ps;
	}
	
	/**
	 * Reads the header of this record from the file and caches it in <code>headerLineSB</code>, replacing any 
	 * previously cached value.
	 * 
	 * @throws FileNotFoundException	if the file referred by this record cannot be found.
	 * @throws IOException				if there is an I/O error when opening, reading or closing the file.
	 */
	private void loadDescriptionSB() throws FileNotFoundException, IOException
	{
		final StringBuilder freshlyReadHeader = this.retrieveDescriptionSB();
		this.headerLineSB = freshlyReadHeader;
	}
		/**
		 * Reads the header of this record from the file, caches it and returns it as a <code>String</code>. 
		 * The file is read on every call, even if a header line is already cached.
		 * 
		 * @return	the header line, without the initial &ldquo;&gt;&rdquo; and trailing line breaks.
		 * 
		 * @throws FileNotFoundException	if the file referred by this record cannot be found.
		 * @throws IOException				if there is an I/O error when opening, reading or closing the file.
		 */
		public String loadDescription() throws FileNotFoundException, IOException
		{
			this.loadDescriptionSB();
			final StringBuilder cachedHeader = this.headerLineSB;
			return cachedHeader.toString();
		}

	/**
	 * Returns the cached header line of this record, loading it from the file first if it is not cached yet. 
	 * Loading is best-effort: any exception raised while reading the file is swallowed and reported as a 
	 * <code>null</code> result.
	 *
	 * @return	the header line (the internal, mutable <code>StringBuilder</code>), or <code>null</code> if it 
	 * 			had to be loaded and loading failed.
	 */
	public StringBuilder getDescriptionSB()
	{
		// Already cached: nothing to read
		if (headerLineSB != null)
		{
			return headerLineSB;
		}
		
		try
		{
			loadDescription();
		}
		catch (Exception exception)
		{
			return null;
		}
		return headerLineSB;
	}
	
	/**
	 * Reports where this record (its header) begins inside the file.
	 * 
	 * @return	the offset in bytes within the file at which the header of this record starts.
	 *
	 * @since	1.0
	 */
	public long offset()
	{
		return this.descriptionByteOffset;
	}
	
	/**
	 * Reports where the sequence block of this record begins inside the file.
	 * 
	 * @return	the offset in bytes within the file at which the sequence block starts.
	 *
	 * @since	1.0
	 */
	public long sequenceOffset()
	{
		return this.sequenceByteOffset;
	}
	
	/**
	 * Reports the length, in symbols, of the <code>BioSequence</code> that would result from this FASTA record. 
	 * The value is the one supplied when the record was created; the file is not read.
	 * 
	 * @return	length in symbols of the sequence of this record.
	 * 
	 * @since	1.0rc3
	 */
	public long length()
	{
		return this.sequenceLength;
	}
	
	/**
	 * Gives access to the <code>Charset</code> used to decode the file where this record is stored.
	 * 
	 * @return	the <code>Charset</code> with which this record&rsquo;s file is decoded.
	 *
	 * @since	1.3
	 */
	public Charset fileCharset()
	{
		return this.fileCharset;
	}
	
	/**
	 * Gives access to the <code>File</code> in which this record is stored.
	 * 
	 * @return	the <code>File</code> instance referring to this record&rsquo;s file.
	 *
	 * @since	1.3
	 */
	public File filePath()
	{
		return this.filePath;
	}
	
	/**
	 * Reports how many bytes the sequence block of this record spans. If this record is not the last one in the 
	 * file the sequence block extends to the byte before the first one of the next record; otherwise it extends 
	 * until the last byte in the file.
	 * 
	 * @return	the number of bytes spanned by this record&rsquo;s sequence block.
	 *
	 * @since	1.3
	 */
	public long sequenceBytes()
	{
		return this.sequenceBytes;
	}

	
	
	/**
	 * Returns the sequence name, i.e. the part of the trimmed header that precedes its first space character. 
	 * A header without spaces is returned whole. The header is obtained through {@link #getDescriptionSB()}; 
	 * if that call returns <code>null</code> this method throws a <code>NullPointerException</code>.
	 * 
	 * @return	the first space-delimited token of the header.
	 */
	public  String getSequenceName() {
		final String header = this.getDescriptionSB().toString().trim();
		final int firstSpace = header.indexOf(' ');
		
		// After trimming, a space can never be at index 0, so "not found" is the only non-positive case
		if (firstSpace <= 0)
		{
			return header;
		}
		return header.substring(0, firstSpace);
	}
	
	/**
	 * Returns the part of the trimmed header that follows the sequence name, starting <em>at</em> its first 
	 * space character (the returned text therefore begins with that space). A header without spaces yields an 
	 * empty string. The header is obtained through {@link #getDescriptionSB()}; if that call returns 
	 * <code>null</code> this method throws a <code>NullPointerException</code>.
	 * <p>
	 * NOTE(review): the static counterpart <code>getSeqDescFromDesc</code> trims its result while this method 
	 * does not -- confirm which behaviour callers expect.
	 * 
	 * @return	the header from its first space onwards, or <code>""</code> if it contains no space.
	 */
	public  String getSequenceDesc() {
		final String header = this.getDescriptionSB().toString().trim();
		final int firstSpace = header.indexOf(' ');
		
		if (firstSpace <= 0)
		{
			return "";
		}
		return header.substring(firstSpace);
	}
	
	
	
	/**
	 * Extracts the sequence name from a header line: the part of the trimmed header that precedes its first 
	 * space character. A header without spaces is returned whole (trimmed).
	 * 
	 * @param	retrieveDescriptionSB	header line of a FASTA record; must not be <code>null</code>.
	 * 
	 * @return	the first space-delimited token of the header.
	 */
	public static String getSeqNameFromDesc(String retrieveDescriptionSB) {
		final String header = retrieveDescriptionSB.trim();
		final int firstSpace = header.indexOf(' ');
		return (firstSpace > 0) ? header.substring(0, firstSpace) : header;
	}
	/**
	 * Extracts the free-text description from a header line: whatever follows the first space character of the 
	 * trimmed header, itself trimmed. A header without spaces yields an empty string.
	 * 
	 * @param	retrieveDescriptionSB	header line of a FASTA record; must not be <code>null</code>.
	 * 
	 * @return	the trimmed text after the sequence name, or <code>""</code> if there is none.
	 */
	public static String getSeqDescFromDesc(String retrieveDescriptionSB) {
		final String header = retrieveDescriptionSB.trim();
		final int firstSpace = header.indexOf(' ');
		if (firstSpace <= 0)
		{
			return "";
		}
		final String afterName = header.substring(firstSpace);
		return afterName.trim();
	}

	/**
	 * Shifts the location of this record within its file by the given number of bytes. The header offset, the 
	 * sequence offset and every byte offset stored as a value of <code>componentsByteOffsets</code> are moved 
	 * by the same amount; the keys of that map and the block sizes are left untouched.
	 * 
	 * @param shiftOffset	number of bytes to add to every offset; negative values move the record towards the 
	 * 						start of the file.
	 */
	public void shiftOffset(long shiftOffset) {
		descriptionByteOffset += shiftOffset;
		sequenceByteOffset += shiftOffset;
		
		// The constructor accepts a null map of components: there is nothing else to shift in that case
		if (componentsByteOffsets == null)
		{
			return;
		}
		for (Entry<Long, Long> ent : componentsByteOffsets.entrySet())
		{
			ent.setValue(ent.getValue() + shiftOffset);
		}
	}

	
	
}

