/**
 * pdfXtk-Extras - PDF Extraction Toolkit Extras
 * Copyright (c) by the authors/contributors.  All rights reserved.
 * This project includes code from PDFBox and TouchGraph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://pdfxtk.sourceforge.net
 *
 */
package at.ac.tuwien.dbai.pdfwrap;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.imageio.ImageIO;

import org.apache.log4j.Logger;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;

import at.ac.tuwien.dbai.pdfwrap.analysis.PageProcessor;
import at.ac.tuwien.dbai.pdfwrap.exceptions.DocumentProcessingException;
import at.ac.tuwien.dbai.pdfwrap.model.document.*;
import at.ac.tuwien.dbai.pdfwrap.model.graph.*;
import at.ac.tuwien.dbai.pdfwrap.ocr.SegmentExtractor;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFObjectExtractor;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFPage;
import at.ac.tuwien.dbai.pdfwrap.util.ExtraUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;


/**
 * Command-line entry point and static helper methods that turn a PDF (or a
 * scanned page image) into a list of analysed {@link Page} objects and, from
 * there, into an XML/XHTML document.
 *
 * The methods taking a {@link CustomPageProcessor} mirror methods of
 * {@link ProcessFile}; they additionally hand a rendered page image to the
 * page processor when it reports that OCR should be performed.
 *
 * Based upon PDFBox code example from Ben Litchfield
 *
 * @author Tamir Hassan, pdfanalyser@tamirhassan.com
 * @version PDF Analyser 0.9
 * @author Ben Litchfield (ben@csh.rit.edu)
 */
public class ProcessFileExtras
{
    /** Command-line switch selecting {@code CustomPageProcessor.PP_TABLE}. */
    public static final String TABLE = "-table";
    /** Command-line switch selecting {@code CustomPageProcessor.PP_R_TABLE}. */
    public static final String R_TABLE = "-ruledtable";
    /** Command-line switch selecting {@code CustomPageProcessor.PP_NR_TABLE}. */
    public static final String NR_TABLE = "-nonruledtable";

    // OCR mode constants; not referenced by the code in this class.
    public static final int OCR_OFF = 0;
    public static final int OCR_ON = 1; // both
    public static final int OCR_AUTO_FILEEXT = 2;
    public static final int OCR_AUTO_IMAGEPDF = 3;

    private static final Logger LOG = Logger.getLogger( ProcessFileExtras.class );

    /**
     * Minimum fraction of the page width and height that a page's single
     * image must exceed before the page image is passed on for OCR.
     */
    private static final double FULL_PAGE_IMAGE_RATIO = 0.75;

    /** Image width (in pixels) that corresponds to a scale factor of 1. */
    private static final int IMAGE_REFERENCE_WIDTH = 600;

    /**
     * Parses the command line, processes the input PDF through
     * {@link ProcessFile#processPDFToXMLDocument} and serializes the resulting
     * XML document to the console or to the output file.
     *
     * @param args Command line arguments: options followed by the input file
     *             and, optionally, the output file.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main(String[] args) throws Exception
    {
        // logged rather than printed so that -console output stays clean
        LOG.debug("in main method");

        boolean toConsole = false;
        boolean toXHTML = true;
        boolean borders = true;
        boolean rulingLines = true;
        boolean processSpaces = false;
        int processType = PageProcessor.PP_BLOCK;
        String password = "";
        String encoding = ProcessFile.DEFAULT_ENCODING;
        String inFile = null;
        String outFile = null;
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;

        for( int i = 0; i < args.length; i++ )
        {
            String arg = args[i];
            if( arg.equals( ProcessFile.PASSWORD ) )
            {
                password = optionValue( args, ++i );
            }
            else if( arg.equals( ProcessFile.ENCODING ) )
            {
                encoding = optionValue( args, ++i );
            }
            else if( arg.equals( ProcessFile.START_PAGE ) )
            {
                startPage = parsePageNumber( optionValue( args, ++i ) );
            }
            else if( arg.equals( ProcessFile.END_PAGE ) )
            {
                endPage = parsePageNumber( optionValue( args, ++i ) );
            }
            else if( arg.equals( ProcessFile.CONSOLE ) )
            {
                toConsole = true;
            }
            else if( arg.equals( TABLE ) )
            {
                processType = CustomPageProcessor.PP_TABLE;
            }
            else if( arg.equals( R_TABLE ) )
            {
                processType = CustomPageProcessor.PP_R_TABLE;
            }
            else if( arg.equals( NR_TABLE ) )
            {
                processType = CustomPageProcessor.PP_NR_TABLE;
            }
            else if( arg.equals( ProcessFile.NOBORDERS ) )
            {
                borders = false;
            }
            else if( arg.equals( ProcessFile.XMILLUM ) )
            {
                toXHTML = false;
            }
            else if( arg.equals( ProcessFile.NORULINGLINES ) )
            {
                rulingLines = false;
            }
            else if( arg.equals( ProcessFile.PROCESS_SPACES ) )
            {
                // the flag used to assign the default (false) and so had no effect
                processSpaces = true;
            }
            else if( inFile == null )
            {
                inFile = arg;
            }
            else
            {
                outFile = arg;
            }
        }

        if( inFile == null )
        {
            usage();
        }

        if( outFile == null )
        {
            outFile = defaultOutputName( inFile );
        }

        // TODO: image input (png/tif/tiff/jpg/jpeg) is not yet routed to
        // processScannedImage; every input file is currently treated as a PDF
        // (a command-line override is also still pending).

        // load the input file
        byte[] inputDoc = ProcessFile.getBytesFromFile( new File( inFile ) );

        // set up page processor object
        // (no iterations should be automatically set to -1)
        PageProcessor pp = new CustomPageProcessor();
        pp.setProcessType( processType );
        pp.setRulingLines( rulingLines );
        pp.setProcessSpaces( processSpaces );

        // do the processing
        org.w3c.dom.Document resultDocument =
            ProcessFile.processPDFToXMLDocument( inputDoc, pp, toXHTML, borders,
                startPage, endPage, encoding, password );

        if( !toConsole )
        {
            System.out.println( "using output file: " + outFile );
        }

        // now output the XML Document by serializing it to output
        Writer output = openOutput( toConsole, outFile, encoding );
        try
        {
            ProcessFile.serializeXML( resultDocument, output );
        }
        finally
        {
            // also flushes; runs even when serialization fails
            output.close();
        }
    }

    /**
     * Returns the value following an option on the command line, or prints
     * the usage message and exits when the value is missing.
     *
     * @param args  the command line arguments
     * @param index index at which the option's value is expected
     * @return the argument at {@code index}
     */
    private static String optionValue( String[] args, int index )
    {
        if( index >= args.length )
        {
            usage();
        }
        return args[index];
    }

    /**
     * Parses a page number given on the command line; prints the usage
     * message and exits when the value is not an integer.
     *
     * @param value the textual page number
     * @return the parsed page number
     */
    private static int parsePageNumber( String value )
    {
        try
        {
            return Integer.parseInt( value );
        }
        catch( NumberFormatException e )
        {
            System.err.println( "Invalid page number: " + value );
            usage();
            throw e; // not reached: usage() exits
        }
    }

    /**
     * Derives the output file name used when none is given: the last four
     * characters of the input name (typically ".pdf") are replaced by ".txt".
     * Names of four characters or fewer simply get ".txt" appended, so the
     * result is never {@code null}.
     *
     * @param inFile the input file name
     * @return the output file name
     */
    private static String defaultOutputName( String inFile )
    {
        if( inFile.length() > 4 )
        {
            return inFile.substring( 0, inFile.length() - 4 ) + ".txt";
        }
        return inFile + ".txt";
    }

    /**
     * Opens the writer that receives the serialized XML.
     *
     * @param toConsole write to {@code System.out} instead of a file
     * @param outFile   the output file name (ignored when {@code toConsole})
     * @param encoding  the output encoding, or {@code null} for the platform default
     * @return the opened writer; the caller is responsible for closing it
     * @throws IOException if the file cannot be opened or the encoding is unsupported
     */
    private static Writer openOutput( boolean toConsole, String outFile, String encoding )
        throws IOException
    {
        if( toConsole )
        {
            return new OutputStreamWriter( System.out );
        }

        FileOutputStream fileStream = new FileOutputStream( outFile );
        try
        {
            if( encoding != null )
            {
                return new OutputStreamWriter( fileStream, encoding );
            }
            //use default encoding
            return new OutputStreamWriter( fileStream );
        }
        catch( IOException e )
        {
            // do not leak the file handle when the encoding is rejected
            fileStream.close();
            throw e;
        }
    }

    /*
     * possible conversions:
     * pdf -> xml, pdf -> xhtml,
     * gecko -> xml, gecko -> xhtml
     */

    /**
     * Loads a PDF from a byte array, extracts the objects of the requested
     * pages and runs the page processor over every extracted page.
     *
     * Modified from the method of the same name in {@link ProcessFile}
     * (2011-03-08: PageProcessor -> CustomPageProcessor) so that a rendered
     * page image is handed to the page processor when OCR is enabled; it is
     * unnecessary without OCR.
     *
     * @param theFile      the PDF file contents
     * @param pp           the page processor to run on every page
     * @param startPage    first page to extract; 0 is treated as 1
     * @param endPage      last page to extract; 0 is treated as "no limit"
     * @param encoding     currently not used by this method
     * @param password     password used to decrypt the document; {@code null}
     *                     is treated as the empty password
     * @param adjGraphList if not {@code null}, receives the adjacency graph
     *                     reported by the page processor after each page
     * @param GUI          if {@code false}, document-wide processing
     *                     ({@code processDocPages}) is applied to the result
     * @return the processed pages
     * @throws DocumentProcessingException if the document cannot be read or decrypted
     */
    public static List<Page> processPDF(byte[] theFile, CustomPageProcessor pp,
        int startPage, int endPage, String encoding, String password,
        List<AdjacencyGraph<GenericSegment>> adjGraphList, boolean GUI)
        throws DocumentProcessingException
    {
        if (password == null)
        {
            password = "";
        }
        if (startPage == 0)
        {
            startPage = 1;
        }
        if (endPage == 0)
        {
            endPage = Integer.MAX_VALUE;
        }

        PDDocument document = null;
        try
        {
            PDFObjectExtractor extractor = new PDFObjectExtractor();
            document = PDDocument.load( new ByteArrayInputStream(theFile) );
            if( document.isEncrypted() )
            {
                decryptDocument( document, password );
            }

            extractor.setStartPage( startPage );
            extractor.setEndPage( endPage );

            List<PDFPage> thePages = extractor.findObjects(document);
            List<Page> theResult = new ArrayList<Page>();

            // now the DU part
            int currentPage = -1;
            for (PDFPage thePage : thePages)
            {
                currentPage++;

                // addition for OCR 2011-03-08
                if (pp.isPerformOCR())
                {
                    // NOTE(review): currentPage is the index within the
                    // extracted pages, not offset by startPage -- confirm
                    // this is what extractor.pageImage expects.
                    attachPageImage(extractor, document, thePage, currentPage, pp);
                }

                Page resultPage = pp.processPage(thePage);
                theResult.add(resultPage);
                if (adjGraphList != null)
                {
                    adjGraphList.add(pp.getAdjGraph());
                }
            }

            // 17.11.10 document-wide processing for headers, footers, etc.
            if (!GUI)
            {
                theResult = pp.processDocPages(theResult, null);
            }

            return theResult;
        }
        catch (IOException e)
        {
            LOG.error("I/O error while processing PDF document", e);
            throw new DocumentProcessingException(e);
        }
        finally
        {
            // release the document on failure as well as on success
            closeQuietly( document );
        }
    }

    /**
     * Decrypts an encrypted document, translating decryption failures into
     * {@link DocumentProcessingException}s.
     *
     * @param document the encrypted document
     * @param password the (non-null) password to try
     * @throws DocumentProcessingException if the password is wrong or decryption fails
     * @throws IOException if reading the document fails during decryption
     */
    private static void decryptDocument( PDDocument document, String password )
        throws DocumentProcessingException, IOException
    {
        try
        {
            document.decrypt( password );
        }
        catch( InvalidPasswordException e )
        {
            // compare by content: == on strings only tests reference identity
            if( password.length() > 0 ) //they supplied the wrong password
            {
                throw new DocumentProcessingException
                    ("Error: The supplied password is incorrect.");
            }
            //they didn't supply a password and the default of "" was wrong.
            throw new DocumentProcessingException
                ( "Error: The document is encrypted." );
        }
        catch( CryptographyException e )
        {
            throw new DocumentProcessingException(e);
        }
    }

    /**
     * If the page contains exactly one image segment and that image is wider
     * and taller than {@link #FULL_PAGE_IMAGE_RATIO} of the page, obtains the
     * page image from the extractor (rotated back if the page is rotated by a
     * quarter turn) and hands it to the page processor.
     *
     * @param extractor the extractor used to obtain the page image
     * @param document  the loaded document
     * @param thePage   the extracted page
     * @param pageIndex zero-based index of the page among the extracted pages
     * @param pp        the page processor receiving the image
     * @throws IOException if obtaining the page image fails
     */
    private static void attachPageImage( PDFObjectExtractor extractor,
        PDDocument document, PDFPage thePage, int pageIndex,
        CustomPageProcessor pp ) throws IOException
    {
        List<ImageSegment> imageSegments =
            ListUtils.selectImageSegments(thePage.getItems());
        if (imageSegments.size() != 1)
        {
            return;
        }

        ImageSegment imgSeg = imageSegments.get(0);
        if (imgSeg.getWidth() > thePage.getWidth() * FULL_PAGE_IMAGE_RATIO &&
            imgSeg.getHeight() > thePage.getHeight() * FULL_PAGE_IMAGE_RATIO)
        {
            BufferedImage pageImage = extractor.pageImage(document, pageIndex);

            // 15.12.10 -- bugfixing for rotated pages:
            // rotate image back if necessary
            if (thePage.getRotation() == 90 || thePage.getRotation() == -270)
            {
                pageImage = ExtraUtils.rotate90ACW(pageImage);
            }
            else if (thePage.getRotation() == 270 || thePage.getRotation() == -90)
            {
                pageImage = ExtraUtils.rotate90CW(pageImage);
            }

            pp.setPageImage(pageImage);
        }
    }

    /**
     * Closes the document if it was opened; a failure to close is logged
     * and otherwise ignored so that it cannot mask the primary outcome.
     *
     * @param document the document to close, may be {@code null}
     */
    private static void closeQuietly( PDDocument document )
    {
        if( document == null )
        {
            return;
        }
        try
        {
            document.close();
        }
        catch( IOException e )
        {
            LOG.warn("Could not close PDF document", e);
        }
    }

    /**
     * Processes a single scanned page image: the image is preprocessed and
     * sliced by a {@link SegmentExtractor}, the resulting segments are scaled
     * and placed on a new {@link PDFPage}, and that page is run through the
     * page processor together with the original image.
     *
     * @param theFile the encoded image file contents
     * @param pp      the page processor to run on the page
     * @return a list containing the single processed page
     * @throws DocumentProcessingException if the input cannot be decoded as
     *         an image or an I/O error occurs
     */
    public static List<Page> processScannedImage(byte[] theFile, CustomPageProcessor pp)
        throws DocumentProcessingException
    {
        List<Page> theResult = new ArrayList<Page>();

        try
        {
            SegmentExtractor extractor = new SegmentExtractor();
            BufferedImage img = ImageIO.read( new ByteArrayInputStream(theFile) );
            if (img == null)
            {
                // ImageIO.read returns null when no registered reader
                // understands the data
                throw new DocumentProcessingException
                    ("Error: The input could not be decoded as an image.");
            }

            // integer division yields 0 for images narrower than the
            // reference width, which would collapse every coordinate to 0
            int imgScale = Math.max(1, img.getWidth() / IMAGE_REFERENCE_WIDTH);

            BufferedImage procPageImage = extractor.preprocessImage(img, imgScale);
            extractor.slice(procPageImage);

            float scaleFactor = Utils.PDF_POINT_RESOLUTION / Utils.XML_RESOLUTION;

            PDFPage thePage = new PDFPage();
            Iterator<?> iter = extractor.getCharList().iterator();
            while (iter.hasNext())
            {
                GenericSegment gs = (GenericSegment) iter.next();
                gs.scaleCoordinates(scaleFactor * imgScale);
                thePage.getItems().add(gs);
            }

            float[] bBox = {0, img.getWidth(), 0, img.getHeight()};
            thePage.setBoundingBox(bBox);
            thePage.scaleCoordinates(scaleFactor);
            thePage.reverseYCoordinatesPNG();

            // 10.11.10 -- initial line finding is done in PageProcessor now
            // (also for image-PDFs); not repeated here to avoid duplicate lines

            // now the DU part
            pp.setPageImage(img);
            Page resultPage = pp.processPage(thePage);
            theResult.add(resultPage);
        }
        catch (IOException e)
        {
            LOG.error("I/O error while processing scanned image", e);
            throw new DocumentProcessingException(e);
        }

        return theResult;
    }

    /**
     * Converts a PDF to XHTML with borders, using a page processor created
     * with {@code CustomPageProcessor.PP_TABLE} (instead of PP_BLOCK).
     *
     * @param theFile   the PDF file contents
     * @param startPage first page to process
     * @param endPage   last page to process
     * @param encoding  the encoding passed on to {@link ProcessFile}
     * @param password  password used to decrypt the document
     * @return the serialized result as returned by
     *         {@link ProcessFile#processPDFToByteArray}
     * @throws DocumentProcessingException if processing fails
     */
    public static byte[] TablePDFToXHTML(byte[] theFile,
        int startPage, int endPage, String encoding, String password)
        throws DocumentProcessingException
    {
        CustomPageProcessor pp = new CustomPageProcessor(CustomPageProcessor.PP_TABLE);

        return ProcessFile.processPDFToByteArray(theFile, pp, true, true,
            startPage, endPage, encoding, password);
    }

    /**
     * Processes a PDF with {@link #processPDF} (including document-wide
     * processing) and converts the resulting pages to an XML document.
     *
     * As in {@link ProcessFile} but with CustomPageProcessor to work with OCR
     * (unnecessary without OCR).
     *
     * @param theFile   the PDF file contents
     * @param pp        the page processor to run on every page
     * @param toXHTML   passed on to {@code ProcessFile.processResultToXMLDocument}
     * @param borders   passed on to {@code ProcessFile.processResultToXMLDocument}
     * @param startPage first page to process
     * @param endPage   last page to process
     * @param encoding  passed on to {@link #processPDF}
     * @param password  password used to decrypt the document
     * @return the resulting XML document
     * @throws DocumentProcessingException if processing fails
     */
    public static org.w3c.dom.Document processPDFToXMLDocument(byte[] theFile,
        CustomPageProcessor pp, boolean toXHTML, boolean borders,
        int startPage, int endPage, String encoding, String password)
        throws DocumentProcessingException
    {
        List<Page> theResult = processPDF(theFile, pp, startPage, endPage,
            encoding, password, null, false);

        return ProcessFile.processResultToXMLDocument(theResult, toXHTML, borders);
    }

    /**
     * Calls {@link #processPDFToXMLDocument} and returns the XML document
     * serialized to a byte array.
     *
     * As in {@link ProcessFile} but with CustomPageProcessor to work with OCR
     * (unnecessary without OCR).
     *
     * @param theFile   the PDF file contents
     * @param pp        the page processor to run on every page
     * @param toXHTML   passed on to {@link #processPDFToXMLDocument}
     * @param borders   passed on to {@link #processPDFToXMLDocument}
     * @param startPage first page to process
     * @param endPage   last page to process
     * @param encoding  passed on to {@link #processPDFToXMLDocument}
     * @param password  password used to decrypt the document
     * @return the serialized XML document
     * @throws DocumentProcessingException if processing fails
     */
    public static byte[] processPDFToByteArray(byte[] theFile,
        CustomPageProcessor pp, boolean toXHTML, boolean borders,
        int startPage, int endPage, String encoding, String password)
        throws DocumentProcessingException
    {
        org.w3c.dom.Document resultDocument =
            processPDFToXMLDocument(theFile, pp, toXHTML, borders,
            startPage, endPage, encoding, password);

        return ProcessFile.serializeXML(resultDocument);
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java at.ac.tuwien.dbai.pdfwrap.ProcessFileExtras [OPTIONS] <PDF file> [Text File]\n" +
            "  -password  <password>        Password to decrypt document\n" +
            "  -encoding  <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
            "  -xhtml                       output XHTML (instead of XMillum-XML)\n" +
            "  -console                     Send text to console instead of file\n" +
            "  -startPage <number>          The first page to start extraction(1 based)\n" +
            "  -endPage <number>            The last page to extract(inclusive)\n" +
            "  -table                       Use table processing\n" +
            "  -ruledtable                  Use ruled-table processing\n" +
            "  -nonruledtable               Use non-ruled-table processing\n" +
            "  <PDF file>                   The PDF document to use\n" +
            "  [Text File]                  The file to write the text to\n"
            );
        System.exit( 1 );
    }
}