/*
 * Created on 9-Jun-2003
 * 
 * The terms for using this software are as follows:
 * 
 * USE AT YOUR OWN RISK - if this program goes insane and takes
 * out several bystanders, don't come knocking on my door with
 * lawyers.
 * 
 * If you want to extend or use this software for some sort of
 * commercial (read: money-making) software, tell me about it
 * first. I probably won't ask for a cut because the software
 * isn't that complicated, but I do want to know where my little
 * baby heads after it leaves my machine.
 * 
 * If you have any questions about this program, feel free to
 * email me at straxus@baynet.net. I'd love to hear how this
 * program worked for you, or any suggestions or bugfixes that
 * you believe this software should use. I believe that software
 * should evolve and become better, so there's an extremely good
 * chance your suggestion will make it into the next version.
 * 
 * Oh, and for those of you curious about the author's (my) name,
 * just email and ask. :)
 */
package mozilla_training_analyzer;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeSet;

/** A class which represents all of the information that is found
 *  within a Mozilla Bayesian Filter Training file: the number of
 *  good and bad messages processed, and the set of tokens with
 *  their good/bad occurrence counts.
 *
 *  The token set is expected to contain only MozillaSpamToken
 *  instances; every method that walks the set casts its elements
 *  to that type (toTextDocument() only calls toString()).
 *
 *  A TrainingData whose token set has not been assigned (or has
 *  been set to null) is treated as containing no tokens.
 *
 * @author Straxus
 */
public class TrainingData {

	/** The first expected byte of the training.dat header.
	 */
	public static final int firstHdrByte = 0xFE;

	/** The second expected byte of the training.dat header.
	 */
	public static final int secondHdrByte = 0xED;

	/** The third expected byte of the training.dat header.
	 */
	public static final int thirdHdrByte = 0xFA;

	/** The fourth expected byte of the training.dat header.
	 */
	public static final int fourthHdrByte = 0xCE;

	/** The default filename for the Mozilla training file.
	 */
	public static final String outputFileName = "training.dat";

	/** The number of good messages processed by this file.
	 *  Holds -1 until a value has been assigned.
	 */
	private int goodMessageCount = -1;

	/** The number of bad messages processed by this file.
	 *  Holds -1 until a value has been assigned.
	 */
	private int badMessageCount = -1;

	/** The number of tokens in this data structure which have a
	 * 	good count greater than zero. Holds -1 until
	 * 	validateTokenCount() has been called.
	 */
	private int numGoodTokens = -1;

	/** The number of tokens in this data structure which have a
	 * 	bad count greater than zero. Holds -1 until
	 * 	validateTokenCount() has been called.
	 */
	private int numBadTokens = -1;

	/** The list of tokens found within this data structure. May
	 *  be null, in which case it is treated as an empty set.
	 */
	private TreeSet tokenSet = null;

	/** Creates a new, empty TrainingData.
	 */
	public TrainingData() {
		// No need to do anything, this class will be populated later.

	}

	/** Creates a new TrainingData which is populated with the
	 *  specified values. The good/bad token counts are NOT
	 *  computed here; call validateTokenCount() to refresh them.
	 *
	 * @param goodMsgCount The number of good messages processed
	 * 						by this set of tokens.
	 * @param badMsgCount	The number of bad messages processed
	 * 						by this set of tokens.
	 * @param newTokenSet	The set of tokens found within the
	 * 						represented training file.
	 */
	public TrainingData(
		int goodMsgCount,
		int badMsgCount,
		TreeSet newTokenSet) {

		goodMessageCount = goodMsgCount;
		badMessageCount = badMsgCount;
		tokenSet = newTokenSet;
	}

	/** Returns a byte[] which contains all of this TrainingData's
	 * 	information in a format which is ready to be written out
	 * 	to disk. The layout produced is: the four header bytes,
	 * 	the good message count, the bad message count, the number
	 * 	of good tokens followed by one record per good token, then
	 * 	the number of bad tokens followed by one record per bad
	 * 	token. Every integer is written as the first four bytes
	 * 	returned by Globals.makeBytes(int).
	 *
	 * 	The token counts are re-validated before anything is
	 * 	written, so the counts in the output always match the
	 * 	records that follow them.
	 *
	 * @return A byte[] which represents a well-formatted Mozilla
	 * 			Bayesian Filter Training file.
	 *
	 * @throws IOException If an error was encountered while
	 * 						generating the output array.
	 */
	public byte[] outputTrainingDataFileContents() throws IOException {

		// This is a good idea so that we don't write bad data to
		// the training data file. It's not strictly necessary
		// if the class has been used right, however I prefer
		// to lose a few cycles to have added robustness.
		validateTokenCount();

		ByteArrayOutputStream dataBytes = new ByteArrayOutputStream();

		// Write the necessary file header to the output file.
		dataBytes.write(TrainingData.firstHdrByte);
		dataBytes.write(TrainingData.secondHdrByte);
		dataBytes.write(TrainingData.thirdHdrByte);
		dataBytes.write(TrainingData.fourthHdrByte);

		// Write out the number of good messages processed
		dataBytes.write(Globals.makeBytes(goodMessageCount), 0, 4);

		// Write out the number of bad messages processed
		dataBytes.write(Globals.makeBytes(badMessageCount), 0, 4);

		// Write out the number of good tokens, then every token
		// with a GoodTokenCount > 0
		dataBytes.write(Globals.makeBytes(numGoodTokens), 0, 4);
		writeTokenSection(dataBytes, true);

		// Write out the number of bad tokens, then every token
		// with a BadTokenCount > 0
		dataBytes.write(Globals.makeBytes(numBadTokens), 0, 4);
		writeTokenSection(dataBytes, false);

		return dataBytes.toByteArray();
	}

	/** Writes one record for every token whose good count (or bad
	 * 	count) is greater than zero, in token set order.
	 *
	 * @param out		The stream to append the records to.
	 * @param goodTokens	true to write the good counts, false
	 * 						to write the bad counts.
	 *
	 * @throws IOException If an error was encountered while
	 * 						generating the records.
	 */
	private void writeTokenSection(
		ByteArrayOutputStream out,
		boolean goodTokens)
		throws IOException {

		Iterator tokenIter = tokenIterator();

		while (tokenIter.hasNext()) {

			MozillaSpamToken token = (MozillaSpamToken) tokenIter.next();

			int count =
				goodTokens
					? token.getGoodTokenCount()
					: token.getBadTokenCount();

			if (count > 0) {
				writeTokenRecord(out, count, token.getTokenString());
			}
		}
	}

	/** Writes a single token record: the occurrence count, the
	 * 	length of the encoded token in bytes, and the encoded
	 * 	token itself.
	 *
	 * @param out		The stream to append the record to.
	 * @param count	The number of occurrences of the token.
	 * @param token	The token string.
	 *
	 * @throws IOException If an error was encountered while
	 * 						generating the record.
	 */
	private static void writeTokenRecord(
		ByteArrayOutputStream out,
		int count,
		String token)
		throws IOException {

		// Encode exactly once and use the BYTE count for both the
		// length field and the payload. The character count differs
		// from the byte count as soon as a token contains a
		// character that encodes to more than one byte, which
		// previously truncated the token and corrupted the record.
		// NOTE(review): the platform default charset is kept so that
		// the output stays consistent with whatever reads these
		// files back in - confirm against the reader.
		byte[] tokenBytes = token.getBytes();

		out.write(Globals.makeBytes(count), 0, 4);
		out.write(Globals.makeBytes(tokenBytes.length), 0, 4);
		out.write(tokenBytes, 0, tokenBytes.length);
	}

	/** Returns an iterator over the token set, or an iterator over
	 * 	nothing at all if no token set has been assigned.
	 *
	 * @return An Iterator over the tokens of this TrainingData.
	 */
	private Iterator tokenIterator() {

		if (tokenSet == null) {
			return new TreeSet().iterator();
		}

		return tokenSet.iterator();
	}

	/** Escapes the characters which have a special meaning in
	 * 	both HTML and XML so that a token can be placed safely
	 * 	inside element content.
	 *
	 * @param text	The raw text to escape.
	 *
	 * @return The text with the ampersand, less-than,
	 * 			greater-than, double quote and single quote
	 * 			characters replaced by character references.
	 */
	private static String escapeMarkup(String text) {

		// Mirrors StringBuffer.append(String), which appends the
		// four characters "null" for a null argument.
		if (text == null) {
			return "null";
		}

		StringBuffer escaped = new StringBuffer(text.length() + 16);

		for (int i = 0; i < text.length(); i++) {

			char c = text.charAt(i);

			switch (c) {
				case '&' :
					escaped.append("&amp;");
					break;
				case '<' :
					escaped.append("&lt;");
					break;
				case '>' :
					escaped.append("&gt;");
					break;
				case '"' :
					escaped.append("&quot;");
					break;
				case '\'' :
					// A numeric reference is used because HTML 4.01
					// does not define the named apos entity.
					escaped.append("&#39;");
					break;
				default :
					escaped.append(c);
					break;
			}
		}

		return escaped.toString();
	}

	/** This method searches through the set of tokens in this
	 * 	TrainingData and recomputes the good and bad token counts.
	 *  Please note that this method should be called after any
	 * 	changes to this TrainingData's token set, otherwise the
	 * 	token counts will be out-of-date.
	 */
	public void validateTokenCount() {

		int goodTokenCount = 0;
		int badTokenCount = 0;

		// Get an iterator for our set of tokens.
		Iterator tokenIter = tokenIterator();

		// Iterate through the entire TreeSet and count the tokens
		// which have appeared in good and in bad messages. A
		// single token may be counted in both totals.
		while (tokenIter.hasNext()) {

			// Get a reference to the next token.
			MozillaSpamToken tempToken = (MozillaSpamToken) tokenIter.next();

			if (tempToken.getGoodTokenCount() > 0) {
				goodTokenCount++;
			}

			if (tempToken.getBadTokenCount() > 0) {
				badTokenCount++;
			}
		}

		// Set the new, validated token counts.
		numGoodTokens = goodTokenCount;
		numBadTokens = badTokenCount;
	}

	/** Returns the number of bad messages processed by this set
	 * 	of tokens.
	 *
	 * @return The number of bad messages processed by this set
	 * 			of tokens.
	 */
	public int getBadMessageCount() {
		return badMessageCount;
	}

	/** Returns the number of good messages processed by this set
	 * 	of tokens.
	 *
	 * @return The number of good messages processed by this set
	 * 			of tokens.
	 */
	public int getGoodMessageCount() {
		return goodMessageCount;
	}

	/** Returns the number of bad tokens contained within this set
	 * 	of tokens, as of the last call to validateTokenCount().
	 *
	 * @return The number of bad tokens contained within this set
	 * 			of tokens, or -1 if the counts have never been
	 * 			validated.
	 */
	public int getNumBadTokens() {
		return numBadTokens;
	}

	/** Returns the number of good tokens contained within this
	 * 	set of tokens, as of the last call to validateTokenCount().
	 *
	 * @return The number of good tokens contained within this
	 * 			set of tokens, or -1 if the counts have never been
	 * 			validated.
	 */
	public int getNumGoodTokens() {
		return numGoodTokens;
	}

	/** Returns the set of tokens that this class represents.
	 *
	 * @return The token set itself (not a copy), or null if no
	 * 			token set has been assigned.
	 */
	public TreeSet getTokenSet() {
		return tokenSet;
	}

	/** Sets the number of bad messages processed by this set of
	 * 	tokens.
	 *
	 * @param i 	The number of bad messages processed by this
	 * 				set of tokens.
	 */
	public void setBadMessageCount(int i) {
		badMessageCount = i;
	}

	/** Sets the number of good messages processed by this set of
	 * 	tokens.
	 *
	 * @param i 	The number of good messages processed by this
	 * 				set of tokens.
	 */
	public void setGoodMessageCount(int i) {
		goodMessageCount = i;
	}

	/** Changes the set of tokens that this class represents.
	 * 	After calling this method, you should also call
	 * 	validateTokenCount().
	 *
	 * @param set	The new set of tokens that this class
	 * 				represents.
	 */
	public void setTokenSet(TreeSet set) {
		tokenSet = set;
	}

	/** Outputs the data represented by this TrainingData as a
	 * 	human-readable plaintext file: a short header with the
	 * 	message counts, followed by the toString() output of each
	 * 	token on its own line.
	 * 
	 * @return A String which contains all of the data which
	 * 			this TrainingData represents, in plaintext format.
	 */
	public String toTextDocument() {

		// Create a StringBuffer to hold our output as we
		// generate it.
		StringBuffer bufferOut = new StringBuffer();

		// Print a header to the text file.
		bufferOut
			.append("Good messages: ")
			.append(goodMessageCount)
			.append("\nBad messages: ")
			.append(badMessageCount)
			.append("\n\nList of tokens\n--------------\n");

		// Get an iterator for our set of tokens.
		Iterator tokenIter = tokenIterator();

		// Keep going while elements remain in the list.
		while (tokenIter.hasNext()) {

			// Write the next token's info to the output buffer.
			bufferOut.append(tokenIter.next().toString());
			bufferOut.append('\n');
		}

		return bufferOut.toString();
	}

	/** Outputs the data represented by this TrainingData as an
	 * 	HTML 4.01 Transitional page containing one table row per
	 * 	token. Token strings are escaped so that markup characters
	 * 	inside a token cannot break the page.
	 * 
	 * @return A String which contains all of the data which
	 * 			this TrainingData represents, in HTML 4.01
	 * 			Transitional format.
	 */
	public String toHTMLDocument() {

		// Create a StringBuffer to hold our output as we
		// generate it.
		StringBuffer bufferOut = new StringBuffer();

		// Print a header to the HTML file.
		// Please note - the original author reports that the
		// generated HTML was validated as HTML 4.01 transitional
		// at http://validator.w3.org
		bufferOut
			.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n")
			.append("<html lang=\"en\">\n")
			.append("	<head>\n")
			.append("		<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\">\n")
			.append("		<title>\n")
			.append("			Mozilla Bayesian Filter Tokens\n")
			.append("		</title>\n")
			.append("	</head>\n\n")
			.append("	<body>\n")
			.append("		<h1>\n")
			.append("			<div align=\"center\">\n")
			.append("				Mozilla Bayesian Filter Tokens\n")
			.append("			</div>\n")
			.append("		</h1>\n")
			.append("		<dl>\n")
			.append("			<dt>\n")
			.append("				Token\n")
			.append("			</dt>\n")
			.append("			<dd>\n")
			.append("				The string which has been detected\n")
			.append("				and tracked by the Bayesian filter.\n")
			.append("			</dd>\n")
			.append("			<dt>\n")
			.append("				Good\n")
			.append("			</dt>\n")
			.append("			<dd>\n")
			.append("				The number of occurences of this\n")
			.append("				token in non-junk (good) emails.\n")
			.append("			</dd>\n")
			.append("			<dt>\n")
			.append("				Bad\n")
			.append("			</dt>\n")
			.append("			<dd>\n")
			.append("				The number of occurences of this\n")
			.append("				token in junk (bad) emails.\n")
			.append("			</dd>\n")
			.append("		</dl>\n")
			.append("		<table align=\"center\" rules=\"all\">\n")
			.append("			<tr><th><strong>Token</strong></th>")
			.append("<th><strong>Good</strong></th>")
			.append("<th><strong>Bad</strong></th></tr>\n");

		// Get an iterator for our set of tokens.
		Iterator tokenIter = tokenIterator();

		// Keep going while elements remain in the list.
		while (tokenIter.hasNext()) {

			// Get a reference to the next token.
			MozillaSpamToken tempToken = (MozillaSpamToken) tokenIter.next();

			// Write that token's info to the output buffer. The
			// token text comes straight out of processed email, so
			// it is escaped before being placed in the table cell.
			bufferOut
				.append("			<tr><td>")
				.append(escapeMarkup(tempToken.getTokenString()))
				.append("</td><td>")
				.append(tempToken.getGoodTokenCount())
				.append("</td><td>")
				.append(tempToken.getBadTokenCount())
				.append("</td></tr>\n");
		}

		// Add a footer to the HTML file.
		bufferOut.append("		</table>\n").append("	</body>\n").append(
			"</html>\n");

		return bufferOut.toString();
	}

	/** Outputs the data represented by this TrainingData as an
	 * 	XML document. This XML document is intended to conform
	 *  to the DTD generated by the writeXMLDTD(File) method.
	 * 	Token strings are escaped so that markup characters inside
	 * 	a token do not break the document.
	 * 
	 * @return A String which contains all of the data which
	 * 			this TrainingData represents, in XML format.
	 */
	public String toXMLDocument() {

		// First, a quick discussion about my chosen method of
		// implementation. I'm doing this with Java 1.3.1, which
		// does not come with a built-in XML parser as 1.4 does.
		// In addition, even if I had an XML parser available,
		// I believe that this method is far faster than adding
		// all of these items to a DOM tree and writing out that
		// DOM tree to the file. So, in the interest of simplicity
		// and speed, I have elected to write out the XML file as
		// text rather than doing the whole DOM thing. As I will
		// not be doing any manipulation of the DOM tree, I see
		// no benefit to its usage here. If you should happen to
		// disagree, implementing this using DOM should be quite
		// straightforward.

		// Create a StringBuffer to hold our output as we
		// generate it.
		StringBuffer bufferOut = new StringBuffer();

		// Print a header to the XML file.
		bufferOut
			.append("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")
			.append("<!DOCTYPE tokenfile SYSTEM \"trainer_xml.dtd\">")
			.append("<tokenfile>\n")
			.append("	<good_msgs>")
			.append(goodMessageCount)
			.append("</good_msgs>\n")
			.append("	<bad_msgs>")
			.append(badMessageCount)
			.append("</bad_msgs>\n");

		// Get an iterator for our set of tokens.
		Iterator tokenIter = tokenIterator();

		// Keep going while elements remain in the list.
		while (tokenIter.hasNext()) {

			// Get a reference to the next token.
			MozillaSpamToken tempToken = (MozillaSpamToken) tokenIter.next();

			// Write that token's info to the output buffer. Because
			// the document is assembled as text, the token has to
			// be escaped by hand; an unescaped ampersand or
			// less-than sign would make the output unparseable.
			bufferOut
				.append("	<token>\n")
				.append("		<name>")
				.append(escapeMarkup(tempToken.getTokenString()))
				.append("</name>\n")
				.append("		<good>")
				.append(tempToken.getGoodTokenCount())
				.append("</good>\n")
				.append("		<bad>")
				.append(tempToken.getBadTokenCount())
				.append("</bad>\n")
				.append("	</token>\n");
		}

		// Add a footer to the XML file.
		bufferOut.append("</tokenfile>\n");

		return bufferOut.toString();
	}

	/** This method generates a DTD to which the XML output of the
	 * 	toXMLDocument() method is intended to conform, and writes
	 * 	it to the given file, replacing any existing file.
	 *
	 * @param fileToWrite The File to write the DTD to.
	 *
	 * @throws IOException If an error was encountered while
	 * 						writing to the output file.
	 */
	public void writeXMLDTD(File fileToWrite) throws IOException {

		// Create a StringBuffer to hold our output as we
		// generate it.
		StringBuffer bufferOut = new StringBuffer();

		bufferOut
			.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n")
			.append("<!-- The root element of this Mozilla training data XML file. -->\n")
			.append("<!ELEMENT tokenfile (good_msgs, bad_msgs, token*)>\n\n")
			.append("<!-- Represents the number of good (non-junk) messages processed by this token file. -->\n")
			.append("<!ELEMENT good_msgs (#PCDATA)>\n\n")
			.append("<!-- Represents the number of bad (junk) messages processed by this token file. -->\n")
			.append("<!ELEMENT bad_msgs (#PCDATA)>\n\n")
			.append("<!-- Represents a single token in the training file. -->\n")
			.append("<!ELEMENT token (name, good, bad)>\n\n")
			.append("<!-- Represents the string associated with this token. -->\n")
			.append("<!ELEMENT name (#PCDATA)>\n\n")
			.append("<!-- Represents the number of times this token has appeared in good (non-junk) emails. -->\n")
			.append("<!ELEMENT good (#PCDATA)>\n\n")
			.append("<!-- Represents the number of times this token has appeared in bad (junk) emails. -->\n")
			.append("<!ELEMENT bad (#PCDATA)>\n");

		// Encode with the charset that the DTD itself declares,
		// rather than whatever the platform default happens to be.
		byte[] dtdBytes = bufferOut.toString().getBytes("UTF-8");

		// Erase the output file if it exists. This is best-effort:
		// if the delete fails, opening the stream below will either
		// truncate the existing file or report the real problem.
		if (fileToWrite.exists()) {

			fileToWrite.delete();
		}

		// Create a new output file.
		fileToWrite.createNewFile();

		// Create a stream for that output file.
		FileOutputStream outStream = new FileOutputStream(fileToWrite);

		try {

			// Write out the DTD.
			outStream.write(dtdBytes);
			outStream.flush();

		} finally {

			// Always release the file handle, even if the write
			// above failed part-way through.
			outStream.close();
		}
	}
}
