drakkar.mast.retrieval.parser.PdfParser.java Source code

Java tutorial

Introduction

Here is the source code for drakkar.mast.retrieval.parser.PdfParser.java

Source

/*
 * DrakkarKeel - An Enterprise Collaborative Search Platform
 *
 * The contents of this file are subject under the terms described in the
 * DRAKKARKEEL_LICENSE file included in this distribution; you may not use this
 * file except in compliance with the License. 
 *
 * 2013-2014 DrakkarKeel Platform.
 */
package drakkar.mast.retrieval.parser;

import drakkar.oar.util.OutputMonitor;
import drakkar.mast.retrieval.DocumentLucene;
import drakkar.mast.retrieval.DocumentMinion;
import com.sun.labs.minion.SimpleIndexer;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.Calendar;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts data from a PDF document: text content, title, author, number of
 * pages, creation date and modification date.
 * <p>
 * Written against PDFBox 1.0.0 (note: Terrier uses PDFBox 0.6.7a).
 * <p>
 * An instance keeps the result of the last parsed document in its fields, so
 * it must not be shared between threads.
 */
public class PdfParser {

    /** Maximum number of pages whose text goes into a single indexed fragment. */
    private static final int PAGES_PER_FRAGMENT = 100;

    PDDocument pdoc = null;
    PDFTextStripper pdfText = null;
    StringWriter swriter = null;
    PDDocumentInformation pinf = null;
    private String title = null;
    private String author = null;
    private int numberPages = 0;
    private Calendar calcreation = null;
    private Calendar calmodification = null;
    private String allContent;

    /**
     * Default constructor
     */
    public PdfParser() {
    }

    /**
     * Loads a PDF file and extracts its metadata (title, author, page count,
     * creation and modification dates) and its full text, which are then
     * available through the getters of this class.
     * <p>
     * Values left over from a previously analyzed document are cleared first,
     * so the getters never report data belonging to another file. Any
     * exception raised while reading the file is logged through
     * {@link OutputMonitor} and reported as a {@code false} result.
     *
     * @param f the PDF file to analyze
     * @return {@code true} if the document was read and its content extracted;
     * {@code false} if it is encrypted, forbids content extraction, has no
     * pages, or could not be read
     */
    public boolean analyzePdfDocument(File f) {
        // Drop the results of any earlier run so a failure cannot be mistaken
        // for the data of the previous document.
        pdoc = null;
        pinf = null;
        title = null;
        author = null;
        numberPages = 0;
        calcreation = null;
        calmodification = null;
        allContent = null;

        try {
            pdoc = PDDocument.load(f);

            if (!isExtractable(pdoc)) {
                OutputMonitor.printLine("Encrypted document.", OutputMonitor.INFORMATION_MESSAGE);
                return false;
            }

            setNumberpages(pdoc.getNumberOfPages());

            pinf = pdoc.getDocumentInformation();
            if (pinf == null) {
                OutputMonitor.printLine("The document does not have available information.",
                        OutputMonitor.INFORMATION_MESSAGE);
            } else {
                setTitle(pinf.getTitle());
                setAuthor(pinf.getAuthor());
                setCalCreation(pinf.getCreationDate());
                setCalModification(pinf.getModificationDate());
            }

            // The text does not depend on the metadata, so it is extracted
            // even when the document carries no information dictionary.
            allContent = extractAllPages();

            return true;

        } catch (Exception ex) {
            OutputMonitor.printStream("", ex);
        } finally {
            closeDocument("IO");
        }

        return false;
    }

    /**
     * Indexes the text of a PDF file with the Minion search engine. Documents
     * with more than 100 pages are split into fragments of at most 100 pages
     * (pages 1-100, 101-200, ...), each indexed as its own Minion document
     * whose key is the file path followed by the fragment number (starting at
     * 1). Shorter documents are indexed as a single document keyed by the file
     * path.
     * <p>
     * The PDF document is always closed before returning, including when an
     * exception is thrown; a failure while closing is logged, not rethrown.
     *
     * @param f the PDF file to index
     * @param indexer Minion indexer that receives the documents
     * @throws IOException if the file cannot be loaded or its text cannot be
     * extracted
     */
    public void divideTextforMinion(File f, SimpleIndexer indexer) throws IOException {
        pdoc = PDDocument.load(f);
        try {
            this.numberPages = pdoc.getNumberOfPages();

            if (!isExtractable(pdoc)) {
                OutputMonitor.printLine("Encrypted book.", OutputMonitor.INFORMATION_MESSAGE);
                return;
            }

            if (this.numberPages > PAGES_PER_FRAGMENT) {
                int count = 0; // fragment number, used as suffix of the document key

                // PDFTextStripper page numbers are 1-based and the end page is
                // inclusive, so the ranges are [1,100], [101,200], ...
                for (int first = 1; first <= numberPages; first += PAGES_PER_FRAGMENT) {
                    count++;
                    int last = Math.min(first + PAGES_PER_FRAGMENT - 1, numberPages);
                    addMinionDocument(indexer, f, f.getPath() + count, extractPages(first, last));
                }
            } else {
                // Short document: take the whole text as it is.
                addMinionDocument(indexer, f, f.getPath(), extractAllPages());
            }
        } finally {
            closeDocument("");
        }
    }

    /**
     * Adds the text of a PDF file to the given Lucene documents. Documents
     * with more than 100 pages are processed in fragments of at most 100 pages
     * (pages 1-100, 101-200, ...); for every fragment the file path, the file
     * name and the fragment text are added as fields. Shorter documents
     * contribute a single set of fields holding the whole text.
     * <p>
     * The PDF document is always closed before returning, including when an
     * exception is thrown; a failure while closing is logged, not rethrown.
     *
     * @param f the PDF file to index
     * @param doccs Lucene document that receives the {@code filepathcs},
     * {@code namecs} and {@code bookcs} fields
     * @param doc Lucene document that receives the {@code filepath},
     * {@code name} and {@code book} fields
     * @param doclsi optional Lucene document that receives only the
     * {@code book} field; may be {@code null}
     * @throws IOException if the file cannot be loaded, its canonical path
     * cannot be resolved or its text cannot be extracted
     */
    public void divideTextforLucene(File f, DocumentLucene doccs, DocumentLucene doc, DocumentLucene doclsi)
            throws IOException {
        pdoc = PDDocument.load(f);
        try {
            this.numberPages = pdoc.getNumberOfPages();

            if (!isExtractable(pdoc)) {
                OutputMonitor.printLine("Encrypted book.", OutputMonitor.INFORMATION_MESSAGE);
                return;
            }

            if (this.numberPages > PAGES_PER_FRAGMENT) {
                // PDFTextStripper page numbers are 1-based and the end page is
                // inclusive, so the ranges are [1,100], [101,200], ...
                for (int first = 1; first <= numberPages; first += PAGES_PER_FRAGMENT) {
                    int last = Math.min(first + PAGES_PER_FRAGMENT - 1, numberPages);
                    addLuceneFields(f, doccs, doc, doclsi, extractPages(first, last));
                }
            } else {
                // Short document: take the whole text as it is.
                addLuceneFields(f, doccs, doc, doclsi, extractAllPages());
            }
        } finally {
            closeDocument("");
        }
    }

    /**
     * Tells whether text can be extracted from the document: it must not be
     * encrypted, must grant the content-extraction permission and must have at
     * least one page.
     */
    private static boolean isExtractable(PDDocument document) {
        return !document.isEncrypted()
                && document.getCurrentAccessPermission().canExtractContent()
                && document.getNumberOfPages() != 0;
    }

    /**
     * Extracts the text of every page of the currently loaded document.
     */
    private String extractAllPages() throws IOException {
        pdfText = new PDFTextStripper();
        return writeText();
    }

    /**
     * Extracts the text of the pages {@code firstPage..lastPage} (1-based,
     * both inclusive) of the currently loaded document.
     */
    private String extractPages(int firstPage, int lastPage) throws IOException {
        pdfText = new PDFTextStripper();
        pdfText.setStartPage(firstPage);
        pdfText.setEndPage(lastPage);
        return writeText();
    }

    /**
     * Runs the current stripper over the current document and returns the
     * text it produced.
     */
    private String writeText() throws IOException {
        swriter = new StringWriter();
        try {
            pdfText.writeText(pdoc, swriter);
            return swriter.getBuffer().toString();
        } finally {
            swriter.close();
        }
    }

    /**
     * Creates one Minion document with the given key holding the file path,
     * the file name and a text fragment, and closes it.
     */
    private void addMinionDocument(SimpleIndexer indexer, File f, String key, String fragment)
            throws IOException {
        DocumentMinion docm = new DocumentMinion(indexer, key);
        docm.addField("filepath", f.getAbsolutePath());
        docm.addField("name", f.getName());
        docm.addField("book", fragment);
        docm.closeDocument();
    }

    /**
     * Adds the file path, the file name and a text fragment to the Lucene
     * documents; {@code doclsi} is skipped when it is {@code null}.
     */
    private void addLuceneFields(File f, DocumentLucene doccs, DocumentLucene doc, DocumentLucene doclsi,
            String fragment) throws IOException {
        doc.addField("filepath", f.getCanonicalPath());
        doccs.addField("filepathcs", f.getCanonicalPath());
        doc.addField("name", f.getName());
        doccs.addField("namecs", f.getName());
        doc.addField("book", fragment);
        doccs.addField("bookcs", fragment);

        if (doclsi != null) {
            doclsi.addField("book", fragment);
        }
    }

    /**
     * Closes the currently loaded document, if any. A failure to close is
     * logged with the given label instead of being propagated, so it cannot
     * hide an exception raised by the actual processing.
     */
    private void closeDocument(String logLabel) {
        if (pdoc != null) {
            try {
                pdoc.close();
            } catch (IOException ex) {
                OutputMonitor.printStream(logLabel, ex);
            }
        }
    }

    // ------------------------------------------------ getters and setters
    /**
     * @return the title of the last analyzed document, or {@code null} if
     * none is available
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the title to set
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the author of the last analyzed document, or {@code null} if
     * none is available
     */
    public String getAuthor() {
        return author;
    }

    /**
     * @param author the author to set
     */
    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * @return the number of pages of the last processed document
     */
    public int getNumberpages() {
        return numberPages;
    }

    /**
     * @param numberpages the number of pages to set
     */
    public void setNumberpages(int numberpages) {
        this.numberPages = numberpages;
    }

    /**
     * @return the creation date of the last analyzed document, or
     * {@code null} if none is available
     */
    public Calendar getCalCreation() {
        return calcreation;
    }

    /**
     * @param cal the creation date to set
     */
    public void setCalCreation(Calendar cal) {
        this.calcreation = cal;
    }

    /**
     * @return the modification date of the last analyzed document, or
     * {@code null} if none is available
     */
    public Calendar getCalModification() {
        return calmodification;
    }

    /**
     * @param calmod the modification date to set
     */
    public void setCalModification(Calendar calmod) {
        this.calmodification = calmod;
    }

    /**
     * @return the full text of the last analyzed document, or {@code null}
     * if none is available
     */
    public String getAllContent() {
        return allContent;
    }

    /**
     * @param allContent the full text to set
     */
    public void setAllContent(String allContent) {
        this.allContent = allContent;
    }
}