Example usage for com.itextpdf.text.pdf.parser TextExtractionStrategy getResultantText

List of usage examples for com.itextpdf.text.pdf.parser TextExtractionStrategy getResultantText

Introduction

On this page you can find usage examples for com.itextpdf.text.pdf.parser TextExtractionStrategy getResultantText.

Prototype

public String getResultantText();

Source Link

Document

Returns the result so far.

Usage

From source file:br.com.smarttaco.util.HelenaBarbosa.java

/**
 * pdf2txt/* ww  w .  j a va  2s  .com*/
 *
 * @param pdf
 * @param paginas se for <code>null</code> realiza leitura completa.
 * @param txt
 * @throws FileNotFoundException
 * @throws IOException
 */
private static void pdf2txt(final String pdf, List<Integer> paginas, final String txt)
        throws FileNotFoundException, IOException {
    PdfReader reader = new PdfReader(pdf);
    //System.out.println(reader.getInfo().toString());
    if (paginas != null) {
        reader.selectPages(paginas);
    }
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    PrintWriter out = new PrintWriter(txt, "UTF-8");
    TextExtractionStrategy strategy;
    for (int i = 1; i <= reader.getNumberOfPages(); i++) {
        strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
        out.println(strategy.getResultantText());
    }
    out.flush();
    out.close();
    reader.close();
}

From source file:com.cloudhub.util.PDFToText.java

License:Apache License

/**
 * Parses a PDF to a plain text file./*from   w w w .  j ava  2 s.  c  o m*/
 *
 * @param source the original PDF
 * @param destination the resulting text
 * @throws IOException
 */
public static void parsePdf(String source, String destination) throws IOException {
    PdfReader reader = new PdfReader(source);
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    PrintWriter out = new PrintWriter(new FileOutputStream(destination));
    TextExtractionStrategy strategy;
    for (int i = 1; i <= reader.getNumberOfPages(); i++) {
        strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
        out.println(strategy.getResultantText());
    }
    out.flush();
    out.close();
}

From source file:com.erikHolz.vertretungsplan.Converter.java

License:Open Source License

/**
 * Extracts the text of {@code fileDest + ".pdf"} into {@code fileDest + "__.txt"}
 * and, once extraction has succeeded, deletes the source PDF.
 *
 * @throws IOException if the PDF cannot be read or the text file cannot be opened
 */
public void parsePDF() throws IOException {

    PdfReader reader = new PdfReader(fileDest + ".pdf");
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(fileDest + "__.txt"));
        try {
            for (int intI = 1; intI <= reader.getNumberOfPages(); intI++) {
                TextExtractionStrategy strategy = parser.processContent(intI, new LocationTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            out.flush();
        } finally {
            // Close the writer even when a page fails to parse.
            out.close();
        }
    } finally {
        // Release the reader before the source file is deleted below.
        reader.close();
    }

    // Delete the original PDF; only reached when extraction did not throw.
    File f = new File(fileDest + ".pdf");
    if (f.exists()) {
        f.delete();
    }
}

From source file:com.joanzapata.PDFViewActivity.java

License:Open Source License

/**
 * Extracts the text of pages 1..{@code pageNumber} of a PDF into the file
 * {@code test.txt}, then builds and displays a summary of text statistics.
 *
 * @param pdf2 path of the PDF to read
 * @param txt unused; the output always goes to {@code test.txt}
 * @throws IOException if the PDF cannot be read or the output file cannot be written
 */
public void parsePdf(String pdf2, String txt) throws IOException {
    PdfReader reader = new PdfReader(pdf2);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);

        File file = getFileStreamPath("test.txt");

        if (!file.exists()) {
            file.createNewFile();
        }

        FileOutputStream writer = openFileOutput(file.getName(), Context.MODE_PRIVATE);
        try {
            TextExtractionStrategy strategy;
            // NOTE(review): the upper bound is the pageNumber field, not
            // reader.getNumberOfPages() - confirm it cannot exceed the document length.
            for (int i = 1; i <= pageNumber; i++) {
                strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                writer.write(strategy.getResultantText().getBytes());
                writer.flush();
            }
        } finally {
            // Close the stream even when a page fails to parse.
            writer.close();
        }
    } finally {
        reader.close();
    }

    // Presumably reads back the file written above - verify against readFromFile().
    String totalString = readFromFile();
    System.out.println(totalString);
    totalContent = totalString;

    String word = "Number of words  " + wordCount(totalString) + "\n";
    String averageWords = "Average length of words  " + " " + averageWords(totalString) + "\n";
    String sentenseCount = "Number of sentences  " + sentenseCount(totalString) + "\n";
    String averageSentenses = "Average length of sentences  " + averageSentense(totalString) + "\n";
    String readability = " Readability Index  " + getReadability(totalString) + "\n";

    String subject = "Subject Area  " + etSubject.getText().toString() + "\n";
    String article = "Type of the Article : " + etArticle.getText().toString() + "\n";
    String pages = " Pages  " + startPage + " to " + pageNumber + "\n";
    String book = "book name  " + etBookName.getText().toString() + "\n";
    String readername = "reader name  " + etReaderName.getText().toString() + "\n";
    totalInfo = readername + book + subject + "\n" + article + pages + word + averageWords + sentenseCount
            + averageSentenses + readability;
    displayMessage(totalInfo);

}

From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java

License:Apache License

/**
 * Extracts the text of every page of a PDF and returns it as one string.
 *
 * @param pdf path of the PDF to read
 * @return the concatenated page text; an empty string if the PDF could not be
 *         opened. Pages that fail to parse are skipped.
 */
public static String extractITextText(String pdf) {
    PdfReader reader;
    try {
        reader = new PdfReader(pdf);
    } catch (IOException e) {
        e.printStackTrace();
        // Without a reader there is nothing to extract; previously execution
        // continued and hit a NullPointerException on the null reader.
        return "";
    }

    StringBuilder text = new StringBuilder();
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            try {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            } catch (IOException e) {
                // Best effort: report the failing page and continue with the next one.
                e.printStackTrace();
            }
        }
    } finally {
        reader.close();
    }

    return text.toString();
}

From source file:com.sustainalytics.crawlerfilter.PDFtoText.java

License:Apache License

/**
 * Extracts the text of every page of a PDF and returns it as one string.
 *
 * @param pdf path of the PDF to read
 * @return the concatenated page text; an empty string if the PDF could not be
 *         opened. Pages that fail to parse are skipped.
 */
public static String extractITextText(String pdf) {
    PdfReader reader;
    try {
        reader = new PdfReader(pdf);
    } catch (IOException e) {
        logger.info("Error in reading file with iText parser\n");
        // Without a reader there is nothing to extract; previously execution
        // continued and hit a NullPointerException on the null reader.
        return "";
    }

    StringBuilder text = new StringBuilder();
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            try {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            } catch (IOException e) {
                logger.info("Error in parsing with iText parser\n");
            }
            // NOTE(review): logged once per page, even when that page failed to
            // parse - confirm whether a single message after the loop was intended.
            logger.info("PDF text extracted from " + pdf + "\n");
        }
    } finally {
        reader.close();
    }

    return text.toString();
}

From source file:conversorpdf.Conversor.Conversor.java

/**
 * Parses a PDF to a plain text file, writing one block of text per page.
 *
 * @param pdf the original PDF
 * @param txt the resulting text
 * @param removerAcento if {@code true}, each page's text is passed through
 *        {@code removeAcentos} before it is written
 * @return always {@code true}; failures are reported via the exception
 * @throws IOException if the PDF cannot be read or the text file cannot be opened
 */
public boolean parsePdf(String pdf, String txt, boolean removerAcento) throws IOException {
    PdfReader reader = new PdfReader(pdf);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        PrintWriter out = new PrintWriter(new FileOutputStream(txt));
        try {
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                String pageText = strategy.getResultantText();
                out.println(removerAcento ? this.removeAcentos(pageText) : pageText);
            }
            out.flush();
        } finally {
            // Close the writer even when a page fails to parse.
            out.close();
        }
    } finally {
        // The reader was previously never closed.
        reader.close();
    }

    return true;
}

From source file:coviam.pdf.PdfParser.java

/**
 * Extracts the text of a fixed PDF, joins words hyphenated across line breaks,
 * prints the result to standard output and writes it line by line (each line
 * prefixed with {@code line-->}) to a fixed text file.
 * An I/O failure is reported on standard error and otherwise ignored.
 */
public void getText() {

    String pdf = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/ejd1.pdf";
    String text = "/home/amit/NetBeansProjects/ResumeParser/data/resumes/edj1.txt";
    try {
        StringBuilder textBuffer = new StringBuilder();
        PdfReader reader = new PdfReader(pdf);
        try {
            PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy = contentParser.processContent(i,
                        new SimpleTextExtractionStrategy());
                textBuffer.append(strategy.getResultantText());
            }
        } finally {
            // The reader was previously never closed.
            reader.close();
        }

        // Drop a hyphen followed by a line break so split words are rejoined.
        String resultText = textBuffer.toString().replace("-\n", "");
        System.out.println("-->" + resultText);

        PrintWriter lineWriter = new PrintWriter(new FileOutputStream(text));
        try {
            StringTokenizer stringTokenizer = new StringTokenizer(resultText, "\n");
            while (stringTokenizer.hasMoreTokens()) {
                String curToken = stringTokenizer.nextToken();
                lineWriter.println("line-->" + curToken);
            }
            lineWriter.flush();
        } finally {
            lineWriter.close();
        }
        // Flush only: closing System.out would silence all later output of the JVM.
        System.out.flush();
    } catch (IOException ioe) {
        // Previously swallowed silently; report the failure without changing the signature.
        System.err.println("Could not extract text from " + pdf + ": " + ioe);
    }

}

From source file:de.offis.health.icardea.cied.pdf.extractor.PDFiText5Extractor.java

License:GNU General Public License

/**
 * Returns the text of a single page, prefixed with {@code PAGE_START_MARKER}.
 *
 * @param pageNumber 1-based number of the page to extract
 * @return the marker followed by the page text, or {@code null} if the page
 *         yields no text other than whitespace
 * @throws IOException if the page content cannot be processed
 * @throws Exception if no PDF is open or the page number is out of range
 */
public String getText(int pageNumber) throws IOException, Exception {
    if (pdfReader == null) {
        // TODO: Add own exception.
        throw new Exception("There is no open PDF to work with.");
    }

    final int pageCount = getNumberOfPages();
    if (pageNumber <= 0 || pageNumber > pageCount) {
        // TODO: Add own exception.
        throw new Exception("The given page number (" + pageNumber + ") "
                + "is not in the range of valid pages (1.." + pageCount + ").");
    }

    PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);
    TextExtractionStrategy extracted = contentParser.processContent(pageNumber,
            new SimpleTextExtractionStrategy());
    if (extracted == null) {
        return null;
    }

    String pageText = extracted.getResultantText();
    if (pageText.trim().length() > 0) {
        return PAGE_START_MARKER + pageText;
    }
    return null;
}

From source file:helper.PdfText.java

License:Apache License

/**
 * @param pdfFile this file will be extracted.
 * @return the plain text of the pdf/*from  ww  w.  j a v  a 2s  .c  o m*/
 */
public String itext(File pdfFile) {

    PdfReader reader;
    try {
        reader = new PdfReader(pdfFile.getAbsolutePath());
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        StringBuffer buf = new StringBuffer();
        TextExtractionStrategy strategy;
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            buf.append(strategy.getResultantText());
        }

        return buf.toString();
    } catch (IOException e) {
        throw new HttpArchiveException(500, e);
    }

}