/*
 * @author		Alfonso Muñoz-Pomer Fuentes, 
 * 				<a href="mailto:alfonso.munozpomer@biotechvana.com">
 * 				alfonso.munozpomer@biotechvana.com</a>,  
 * 				<a href="http://www.biotechvana.com">Biotechvana</a>
 *
 * @date		2010-11-01
 * 
 * @license		See <a href="http://www.biotechvana.com">http://www.biotechvana.com</a>
 *
 * @copyright	Copyright Biotech Vana, S.L. 2006-2010
 */

package com.biotechvana.javabiotoolkit.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.eclipse.core.runtime.NullProgressMonitor;

import com.biotechvana.javabiotoolkit.AminoAcid;
import com.biotechvana.javabiotoolkit.DNABase;
import com.biotechvana.javabiotoolkit.DNASequence;
import com.biotechvana.javabiotoolkit.GeneticCode;
import com.biotechvana.javabiotoolkit.RNABase;
import com.biotechvana.javabiotoolkit.RNASequence;
import com.biotechvana.javabiotoolkit.exceptions.GeneticCodeIncorrectSyntaxException;

/**
 * Objects of this class can read files which store a genetic code (i.e. codon to amino acid translation rules, and  
 * which codons are start/termination codons) and then create an instance of <code>GeneticCode</code> representing 
 * the translation table saved in the file.
 * <p>
 * There are two supported formats: Biotechvana plaintext and GeneRunner TRT. Examples of a fragment of a file of each 
 * of these formats follow. Both examples are for the standard genetic code. Different codings can be used by 
 * altering the amino acid code after each codon and by specifying the start and stop codons. Start codons may encode 
 * for amino acids different to methionine; likewise, stop codons may not encode for the termination symbol 
 * &ldquo;*&rdquo;.
 * <p>
 * <strong>Biotechvana plaintext format</strong><br />
 * <code>
 * # Standard genetic code<br/>
 * GAG -> E<br/>
 * # Stop codon<br/>
 * UAA -> * stop<br/>
 * CUG -> L<br/>
 * AGU -> S<br/>
 * GCA -> A<br/>
 * AUG -> M start<br/>
 * ...
 * </code>
 * <p> 
 * <strong>GeneRunner TRT format</strong><br />
 * <code>
 * TRANSLATION<br />
 * Universal Translation Table<br />
 * "DEFAULT.AAT"<br />
 * "DEFAULT.SUT"<br />
 * ! Codon AA    Start  Stop  Sup<br />
 * "gag"  "Glu"   0     0    ""<br />
 * "taa"  "Och"   0     2    ""<br />
 * "ctg"  "Leu"   0     0    ""<br />
 * "agt"  "Ser"   0     0    ""<br />
 * "gca"  "Ala"   0     0    ""<br />
 * "atg"  "Met"   1     0    ""<br />
 * ...
 * </code>
 * 
 * @version	1.1, 2010-11-01
 * 
 * @author	<a href="mailto:alfonso.munozpomer@biotechvana.com">Alfonso Muñoz-Pomer Fuentes</a>,
 * 			<a href="http://www.biotechvana.com">Biotechvana</a>.
 * 
 * @see	GeneticCode
 * @see	DNASequence
 * @see	RNASequence
 * @see	AminoAcid
 */
public class GeneticCodeReader
{
	// File to which this object is associated and the code stored in it
	private File filePath;
	private GeneticCode code;
	
	// Regexes for Biotechvana format lines
	static final Pattern biotechvanaRnaRulePattern;
	static final Pattern biotechvanaDnaRulePattern;
	static final Pattern biotechvanaCommentPattern;
	// Regexes for GeneRunner format lines
	static final Pattern geneRunnerHeader1Pattern;
	static final Pattern geneRunnerHeader2Pattern;
	static final Pattern geneRunnerHeader3Pattern;
	static final Pattern geneRunnerHeader4Pattern;
	static final Pattern geneRunnerRulePattern;
	static final Pattern geneRunnerCommentPattern;
	// Blank line regex
	static final Pattern blankLinePattern;
	
	// The patterns are shared by every reader, so they are compiled exactly once when the class is initialised 
	// (not once per constructed instance).
	static
	{
		// Building blocks for rules regexes
		String rnaCodonRegex = "([" + RNABase.getRegexes() + "]{3})";
		String dnaCodonRegex = "([" + DNABase.getRegexes() + "]{3})";
		String aminoRegex = "([" + AminoAcid.getRegexes() + "]{1})";
		String separatorRegex = "\\-\\>";
		String startStopRegex = "(start|stop)?";
		
		// Alternation of every three-letter amino acid abbreviation plus the GeneRunner stop symbols
		StringBuilder aminoAbbreviationsRegex = new StringBuilder("(");
		for (AminoAcid amino : AminoAcid.values())
		{
			aminoAbbreviationsRegex.append(amino.getAbbreviation()).append("|");
		}
		aminoAbbreviationsRegex.append("Och|Amb|Umb|\\*\\*\\*)");
				
		// biotechvanaRnaRulePattern: UUG -> K | UUG->K | UUG->    K
		// biotechvanaDnaRulePattern: ATT -> O | ATT   -> O | ATT-> O
		// biotechvanaCommentPattern: #Bla bla bla |     #   Bla bla bla
		// Groups in both rule patterns: 1 = codon, 2 = amino acid letter, 3 = optional start/stop marker
		biotechvanaRnaRulePattern = Pattern.compile(
			"^\\s*" + rnaCodonRegex + 
			"\\s*" + separatorRegex + 
			"\\s*" + aminoRegex + 
			"\\s*" + startStopRegex +
			"\\s*$",
			Pattern.CASE_INSENSITIVE);
		biotechvanaDnaRulePattern = Pattern.compile(
			"^\\s*" + dnaCodonRegex + 
			"\\s*" + separatorRegex + 
			"\\s*" + aminoRegex + 
			"\\s*" + startStopRegex +
			"\\s*$",
			Pattern.CASE_INSENSITIVE);
		biotechvanaCommentPattern = Pattern.compile(
			"\\s*" +
			"#.*$",
			Pattern.CASE_INSENSITIVE);
		
		// Header 1: the literal word TRANSLATION
		geneRunnerHeader1Pattern = Pattern.compile(
			"^\\s*" + "TRANSLATION" +
			"\\s*$",
			Pattern.CASE_INSENSITIVE);
		// Header 2: free text, any non-empty line
		geneRunnerHeader2Pattern = Pattern.compile(
			"^.+$",
			Pattern.CASE_INSENSITIVE);
		// Headers 3 and 4: a double-quoted, non-empty string (e.g. "DEFAULT.AAT")
		geneRunnerHeader3Pattern = Pattern.compile(
			"^\\s*" + "\".+\"" +
			"\\s*$",
			Pattern.CASE_INSENSITIVE);
		geneRunnerHeader4Pattern = Pattern.compile(
			"^\\s*" + "\".+\"" +
			"\\s*$",
			Pattern.CASE_INSENSITIVE);
		// Ex. "aaa"  "Lys"   0     0    ""
		// Groups: 1 = codon, 2 = amino acid abbreviation, 3 = start flag (0|1), 4 = stop flag (0|2)
		geneRunnerRulePattern = Pattern.compile(
			"^\\s*" +
			"\"" + dnaCodonRegex + "\"" +
			"\\s*" +
			"\"" + aminoAbbreviationsRegex + "\"" +
			"\\s*" + "(0|1)" + "\\s*" + "(0|2)" +
			"\\s*" + "\".*\"" +
			"\\s*$",
			Pattern.CASE_INSENSITIVE);
		geneRunnerCommentPattern = Pattern.compile(
			"^\\s*" + "!.*$",
			Pattern.CASE_INSENSITIVE);
		
		// Empty line or line made only of whitespace
		blankLinePattern = Pattern.compile("^\\s*$");
	}
	
	/**
	 * Constructs a <code>GeneticCodeReader</code> and parses <code>filePath</code>. The code stored in the file can be 
	 * retrieved by calling <code>geneticCode()</code> on the receiver.
	 * <p>
	 * The format is guessed by scanning the file: first it is checked whether the file holds only comments and blank 
	 * lines, then Biotechvana RNA style, Biotechvana DNA style and finally GeneRunner TRT are tried, in that order. 
	 * 
	 * @param	filePath	file to parse in order to extract a genetic code.
	 * 
	 * @throws	IOException	if the file cannot be opened or read.
	 * @throws	GeneticCodeIncorrectSyntaxException	if the file matches none of the supported formats, or if a 
	 * 			GeneRunner rule marks a codon both as start and stop.
	 * 
	 * @since	0.1
	 */
	public GeneticCodeReader(File filePath)
	throws IOException, GeneticCodeIncorrectSyntaxException
	{
		// Get filename and guess file format
		this.filePath = filePath;
		
		if (isEmpty())
		{
			// Nothing but comments and blank lines: there is no code to build
			code = null;
		}
		else if (isBiotechvanaRnaFormat())
		{
			code = parseBiotechvanaRnaFormat();
		}
		else if (isBiotechvanaDnaFormat())
		{
			code = parseBiotechvanaDnaFormat();
		}
		else if (isGeneRunnerFormat())
		{
			code = parseGeneRunnerFormat();
		}
		else
		{
			throw new GeneticCodeIncorrectSyntaxException(filePath.getName() + ": file format unknown");
		}
	}
			
	/**
	 * Returns the genetic code stored in the file associated to this reader.
	 * 
	 * @return	instance of <code>GeneticCode</code>, or <code>null</code> if the file contained only comments and 
	 * 			blank lines.
	 * 
	 * @see		GeneticCode
	 * 
	 * @since	0.1
	 */
	public GeneticCode geneticCode()
	{
		return code;
	}

	/*
	 * Parses a file in Biotechvana format, RNA style. Lines which are not rules (comments, blanks) are skipped. The 
	 * description of the returned code is the file name.
	 * 
	 * @return	the genetic code built from the rules in the file.
	 * 
	 * @throws	FileNotFoundException 
	 * @throws	IOException
	 * @throws	GeneticCodeIncorrectSyntaxException
	 * 
	 * @since	0.1 
	 */
	private GeneticCode parseBiotechvanaRnaFormat()
	throws FileNotFoundException, IOException, GeneticCodeIncorrectSyntaxException
	{
		GeneticCode newCode = new GeneticCode();
		newCode.setDescription(filePath.getName());

		BufferedReader bufferedFileReader = new BufferedReader(new FileReader(filePath));
		String line;
		try
		{
			while ((line = bufferedFileReader.readLine()) != null)
			{
				Matcher ruleMatcher = biotechvanaRnaRulePattern.matcher(line.trim());
				if (!ruleMatcher.matches())
				{
					continue; // Comment or blank line
				}
				
				// Extract codon and amino acid from RNA rule
				RNASequence codon = new RNASequence(ruleMatcher.group(1));
				AminoAcid amino = AminoAcid.valueOf(ruleMatcher.group(2).charAt(0));
				newCode.addCodon(codon, amino);

				// The rule pattern is case insensitive, so the marker must be compared ignoring case as well; 
				// otherwise "START"/"Stop" would be accepted by the regex but silently ignored here
				String marker = ruleMatcher.group(3);
				if (marker != null)
				{
					if (marker.equalsIgnoreCase("start"))
					{
						newCode.addStartCodon(codon);
					}
					else if (marker.equalsIgnoreCase("stop"))
					{
						newCode.addStopCodon(codon);
					}
				}
			}
		}
		finally
		{
			bufferedFileReader.close();
		}
		return newCode;
	}
	
	/*
	 * Parses a file in Biotechvana format, DNA style. Codons are read as DNA and transcribed before being added to 
	 * the code. Lines which are not rules (comments, blanks) are skipped. The description of the returned code is 
	 * the file name.
	 * 
	 * @return	the genetic code built from the rules in the file.
	 * 
	 * @throws	FileNotFoundException 
	 * @throws	IOException
	 * @throws	GeneticCodeIncorrectSyntaxException 
	 * 
	 * @since	0.2
	 */
	private GeneticCode parseBiotechvanaDnaFormat()
	throws FileNotFoundException, IOException, GeneticCodeIncorrectSyntaxException
	{
		GeneticCode newCode = new GeneticCode();
		newCode.setDescription(filePath.getName());

		BufferedReader bufferedFileReader = new BufferedReader(new FileReader(filePath));
		String line;
		try
		{
			while ((line = bufferedFileReader.readLine()) != null)
			{
				Matcher ruleMatcher = biotechvanaDnaRulePattern.matcher(line.trim());
				if (!ruleMatcher.matches())
				{
					continue; // Comment or blank line
				}
				
				// Extract codon and amino acid from DNA rule
				DNASequence codon = new DNASequence(ruleMatcher.group(1));
				AminoAcid amino = AminoAcid.valueOf(ruleMatcher.group(2).charAt(0));
				newCode.addCodon(codon.transcribe(new NullProgressMonitor()), amino);
				
				// Case-insensitive comparison, consistent with the case-insensitive rule pattern
				String marker = ruleMatcher.group(3);
				if (marker != null)
				{
					if (marker.equalsIgnoreCase("start"))
					{
						newCode.addStartCodon(codon.transcribe(new NullProgressMonitor()));
					}
					else if (marker.equalsIgnoreCase("stop"))
					{
						newCode.addStopCodon(codon.transcribe(new NullProgressMonitor()));
					}
				}
			}
		}
		finally
		{
			bufferedFileReader.close();
		}
		return newCode;
	}
	
	/*
	 * Parses a file in GeneRunner TRT format. Only lines matching the rule pattern are used; headers, comments and 
	 * blank lines are skipped. Codons are read as DNA and transcribed before being added to the code. The 
	 * description of the returned code is the file name.
	 * 
	 * @return	the genetic code built from the rules in the file.
	 * 
	 * @throws	FileNotFoundException 
	 * @throws	IOException
	 * @throws	GeneticCodeIncorrectSyntaxException	if a rule has both the start flag (1) and the stop flag (2) set.
	 * 
	 * @since	0.3
	 */
	private GeneticCode parseGeneRunnerFormat()
	throws FileNotFoundException, IOException, GeneticCodeIncorrectSyntaxException
	{
		GeneticCode newCode = new GeneticCode();
		newCode.setDescription(filePath.getName());

		BufferedReader bufferedFileReader = new BufferedReader(new FileReader(filePath));
		String line;
		try
		{
			while ((line = bufferedFileReader.readLine()) != null)
			{
				Matcher ruleMatcher = geneRunnerRulePattern.matcher(line.trim());
				if (!ruleMatcher.matches())
				{
					continue; // Header, comment or blank line
				}
				
				// Extract codon and amino acid from GeneRunner (DNA) rule
				DNASequence codon = new DNASequence(ruleMatcher.group(1));
				AminoAcid amino = AminoAcid.valueOfAbbreviation(ruleMatcher.group(2));
				newCode.addCodon(codon.transcribe(new NullProgressMonitor()), amino);
				
				boolean isStart = ruleMatcher.group(3).equals("1");
				boolean isStop = ruleMatcher.group(4).equals("2");
				if (isStart && isStop)
				{
					throw new GeneticCodeIncorrectSyntaxException(
							codon.toString() + " is marked both as a start and a stop codon.");
				}
				else if (isStart)
				{
					newCode.addStartCodon(codon.transcribe(new NullProgressMonitor()));
				}
				else if (isStop)
				{
					newCode.addStopCodon(codon.transcribe(new NullProgressMonitor()));
				}
			}
		}
		finally
		{
			bufferedFileReader.close();
		}
		return newCode;
	}
	
	/*
	 * Determines if a file is empty, i.e. it has nothing but blank lines (tabs, spaces, etc.) and comments in either 
	 * of the supported comment syntaxes.
	 * 
	 * @return	true if every line is a comment or blank (also when the file has no lines at all).
	 * 
	 * @throws	FileNotFoundException
	 * @throws	IOException
	 * 
	 * @since	0.1
	 */
	private boolean isEmpty()
	throws FileNotFoundException, IOException
	{
		return fileConsistsOf(biotechvanaCommentPattern, geneRunnerCommentPattern, blankLinePattern);
	}
	
	/*
	 * Determines if the file is in Biotechvana plaintext file, RNA style
	 * 
	 * @return	true if every line is an RNA rule, a Biotechvana comment or blank.
	 * 
	 * @throws	FileNotFoundException
	 * @throws	IOException
	 * 
	 * @since	0.1
	 */
	private boolean isBiotechvanaRnaFormat()
	throws FileNotFoundException, IOException
	{
		return fileConsistsOf(biotechvanaRnaRulePattern, biotechvanaCommentPattern, blankLinePattern);
	}
	
	/*
	 * Determines if the file is in Biotechvana plaintext file, DNA style 
	 * 
	 * @return	true if every line is a DNA rule, a Biotechvana comment or blank.
	 * 
	 * @throws	FileNotFoundException 
	 * @throws	IOException
	 */
	private boolean isBiotechvanaDnaFormat()
	throws FileNotFoundException, IOException
	{
		return fileConsistsOf(biotechvanaDnaRulePattern, biotechvanaCommentPattern, blankLinePattern);
	}
	
	/*
	 * Determines if the file is in GeneRunner TRT format: four headers in order (comments and blank lines may 
	 * precede each of them), followed only by rules, comments and blank lines.
	 * 
	 * NOTE(review): a file which ends before all four headers have been seen is still accepted, exactly as in 
	 * previous versions. Confirm whether truncated files should be rejected instead.
	 * 
	 * @return	true if no line contradicting the TRT layout is found.
	 *
	 * @throws	FileNotFoundException
	 * @throws	IOException
	 * 
	 * @since	0.1
	 */
	private boolean isGeneRunnerFormat()
	throws FileNotFoundException, IOException
	{
		BufferedReader bufferedFileReader = new BufferedReader(new FileReader(filePath));
		try
		{
			// Short-circuit evaluation keeps the reads strictly sequential: each step resumes where the previous 
			// one stopped, and the first offending line ends the check
			return advanceToHeader(bufferedFileReader, geneRunnerHeader1Pattern)
				&& advanceToHeader(bufferedFileReader, geneRunnerHeader2Pattern)
				&& advanceToHeader(bufferedFileReader, geneRunnerHeader3Pattern)
				&& advanceToHeader(bufferedFileReader, geneRunnerHeader4Pattern)
				&& remainingLinesMatch(
					bufferedFileReader, geneRunnerRulePattern, geneRunnerCommentPattern, blankLinePattern);
		}
		finally
		{
			bufferedFileReader.close();
		}
	}
	
	/*
	 * Consumes lines from the reader up to and including the first one matching headerPattern. Header lines are 
	 * tested as read (not trimmed); GeneRunner comments and blank lines before the header are skipped.
	 * 
	 * @return	false if a line which is neither the header, a GeneRunner comment nor blank is found first; true if 
	 * 			the header is found or the end of the file is reached.
	 * 
	 * @throws	IOException
	 */
	private static boolean advanceToHeader(BufferedReader reader, Pattern headerPattern)
	throws IOException
	{
		String line;
		while ((line = reader.readLine()) != null)
		{
			if (headerPattern.matcher(line).matches())
			{
				return true; // Header found
			}
			if (!matchesAny(line, geneRunnerCommentPattern, blankLinePattern))
			{
				return false;
			}
		}
		return true; // End of file reached without offending lines
	}
	
	/*
	 * Opens the file of this reader and checks that each of its lines, once trimmed, fully matches at least one of 
	 * the accepted patterns. The file is closed before returning, also if reading fails.
	 * 
	 * @return	true if no line is rejected (also when the file has no lines at all).
	 * 
	 * @throws	FileNotFoundException
	 * @throws	IOException
	 */
	private boolean fileConsistsOf(Pattern... acceptedPatterns)
	throws FileNotFoundException, IOException
	{
		BufferedReader bufferedFileReader = new BufferedReader(new FileReader(filePath));
		try
		{
			return remainingLinesMatch(bufferedFileReader, acceptedPatterns);
		}
		finally
		{
			bufferedFileReader.close();
		}
	}
	
	/*
	 * Reads the reader to the end (or to the first rejected line) checking that each trimmed line fully matches at 
	 * least one of the accepted patterns. 
	 * 
	 * @return	false as soon as a line matching none of the patterns is read, true otherwise.
	 * 
	 * @throws	IOException
	 */
	private static boolean remainingLinesMatch(BufferedReader reader, Pattern... acceptedPatterns)
	throws IOException
	{
		String line;
		while ((line = reader.readLine()) != null)
		{
			if (!matchesAny(line.trim(), acceptedPatterns))
			{
				return false;
			}
		}
		return true;
	}
	
	/*
	 * Tells whether the whole line matches at least one of the given patterns.
	 */
	private static boolean matchesAny(String line, Pattern... acceptedPatterns)
	{
		for (Pattern acceptedPattern : acceptedPatterns)
		{
			if (acceptedPattern.matcher(line).matches())
			{
				return true;
			}
		}
		return false;
	}
}
