Example usage for org.apache.pdfbox.text PDFTextStripper writeText

Introduction

This page collects example usages of org.apache.pdfbox.text.PDFTextStripper.writeText.

Prototype

public void writeText(PDDocument doc, Writer outputStream) throws IOException

Source Link

Document

This will take a PDDocument and write the text of that document to the given writer.

Usage

From source file:com.plumblarrick.andrew.cityrecordtextextractor.IssueExtractorPositional.java

/**
 * Extracts the text of the PDF at {@code inFileName} and writes it, encoded as
 * UTF-8, to {@code outFileName}. The output starts with a
 * {@code "Source file: <inFileName>"} header line followed by the text
 * produced by a {@code CRTStripper}.
 *
 * <p>Both the document and the output writer are closed before this method
 * returns, whether or not extraction succeeded.
 *
 * @param inFileName  path of the PDF file to read
 * @param outFileName path of the text file to create or overwrite
 * @throws IOException if the PDF cannot be loaded, the text cannot be
 *                     extracted, or the output file cannot be written
 */
public void extractToFile(String inFileName, String outFileName) throws IOException {

    this.inFileName = inFileName;
    this.outFileName = outFileName;
    // Forget resources left over from a previous call so the cleanup below
    // only ever touches what this invocation actually opened.
    document = null;
    fileOut = null;
    try {
        document = PDDocument.load(new File(inFileName));

        PDFTextStripper stripper = new CRTStripper();
        //stripper.setSortByPosition(true);
        // NOTE(review): PDFBox documents page numbers as 1-based; 0 is kept
        // here to preserve the existing behaviour - confirm whether 1 was meant.
        stripper.setStartPage(0);
        stripper.setEndPage(document.getNumberOfPages());

        fileOut = (new BufferedWriter(new PrintWriter(outFileName, "UTF-8")));

        fileOut.write("Source file: " + inFileName + "\n");
        stripper.writeText(document, fileOut);

    } finally {
        // Close the two resources independently: the writer may never have
        // been opened (null), and a failure closing one must not leak the other.
        try {
            if (fileOut != null) {
                fileOut.close(); // close() flushes buffered text first
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }
}

From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text of a PDF read from {@code in}.
 *
 * <p>The document is opened with the password obtained from
 * {@code getPassword(params)}. Text extraction runs on a separate daemon
 * thread so that it can be abandoned when it does not finish within
 * {@code timeout}; the whole operation is serialized on {@code pdfBoxLockObj}.
 *
 * @param in     stream containing the PDF; must not be {@code null}
 * @param params parameters passed to {@code getPassword} to look up the
 *               document password
 * @return the extracted text wrapped in an {@code ExtractData}, after
 *         {@code extractMetadata} has been applied to it
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException       if loading, extraction or metadata handling
 *                                fails, or extraction does not finish in time
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    // Only one extraction runs at a time: everything below holds pdfBoxLockObj.
    synchronized (pdfBoxLockObj) {
        final String password = getPassword(params);
        // try-with-resources closes the document on every exit path,
        // including the timeout path below.
        try (PDDocument document = PDDocument.load(in, password)) {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper();
            // Set by the worker when it ends, normally or with an exception.
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            // Carries an exception thrown on the worker back to this thread.
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(doc, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // Thread.join(long) waits at most the given number of milliseconds.
            // NOTE(review): the error message below reports the same value as
            // "sec." - confirm the unit of `timeout`.
            task.join(timeout);
            if (!done.get()) {
                // Worker is still running: interrupt it up to 100 times, 50 ms
                // apart, stopping early as soon as it reports completion.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                // NOTE(review): this is thrown inside the try, so (assuming
                // ExtractException is an Exception subtype) the catch below
                // wraps it in a second ExtractException - confirm intended.
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                // Re-throw the worker's failure on the calling thread.
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final Exception e) {
            // Catch-all: every failure above, including an InterruptedException
            // from join/sleep, is reported as an ExtractException.
            throw new ExtractException(e);
        }
    }
}

From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java

License:BSD License

/**
 * Parses the given bytes as a PDF document and returns its text, extracted
 * with position-based sorting enabled.
 *
 * @param content raw bytes of the PDF
 * @return the text written by the stripper for the whole document
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
private String extractPDFText(byte[] content) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setSortByPosition(true);

    try (ByteArrayInputStream pdfBytes = new ByteArrayInputStream(content);
            Writer extracted = new StringWriter();
            PDDocument parsed = PDDocument.load(pdfBytes)) {
        textStripper.writeText(parsed, extracted);
        String text = extracted.toString();
        return text;
    }
}

From source file:org.haplo.component.pdfbox.ConvertPDFToText.java

License:Mozilla Public License

/**
 * Loads the PDF at {@code inputPathname}, extracts its text and writes it,
 * encoded as UTF-8, to the file at {@code outputPathname}.
 *
 * @throws Exception if the PDF cannot be loaded or the text cannot be written
 */
protected void performOperation() throws Exception {
    File source = new File(this.inputPathname);
    try (PDDocument pdf = PDDocument.load(source)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        File target = new File(this.outputPathname);
        // Resources close in reverse order: the writer first, then the raw stream.
        try (FileOutputStream rawOut = new FileOutputStream(target);
                OutputStreamWriter utf8Out = new OutputStreamWriter(rawOut, "UTF-8")) {
            textStripper.writeText(pdf, utf8Out);
        }
    }
}

From source file:org.haplo.component.pdfbox.TextExtractPDF.java

License:Mozilla Public License

/**
 * Loads the PDF located at {@code getInputPathname()} and returns its text.
 *
 * @return the text written by the stripper for the whole document
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
protected String extract() throws IOException {
    File source = new File(getInputPathname());
    try (PDDocument pdf = PDDocument.load(source)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        StringWriter buffer = new StringWriter();
        textStripper.writeText(pdf, buffer);
        return buffer.toString();
    }
}

From source file:org.quelea.services.importexport.SurvivorSongbookParser.java

License:Open Source License

/**
 * Get the text on a page in the PDF document, with typographic apostrophes
 * and backticks normalized to a plain ASCII apostrophe.
 *
 * <p>Note that this narrows the stripper's start and end page to
 * {@code page} and leaves them set that way.
 *
 * @param document the document.
 * @param stripper the PDF stripper used to get the text.
 * @param page     the page number.
 * @return the text on the given page.
 * @throws IOException if something went wrong.
 */
private String getPageText(PDDocument document, PDFTextStripper stripper, int page) throws IOException {
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    StringWriter textWriter = new StringWriter();
    stripper.writeText(document, textWriter);
    // The search string must never be empty: String.replace("", "'") would
    // insert an apostrophe between every character. The right single
    // quotation mark is written as a Unicode escape so the literal survives
    // any source-file encoding.
    return textWriter.toString().replace("\u2019", "'").replace("`", "'");
}

From source file:org.wildfly.camel.test.fop.FopIntegrationTest.java

License:Apache License

/**
 * Returns the text of the given document with leading and trailing
 * whitespace trimmed.
 *
 * @param document the PDF document to read
 * @return the trimmed text written by the stripper
 * @throws IOException if the text cannot be extracted
 */
private String extractTextFromDocument(PDDocument document) throws IOException {
    StringWriter buffer = new StringWriter();
    PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.writeText(document, buffer);
    String raw = buffer.toString();
    return raw.trim();
}

From source file:uk.org.openeyes.PDFFunctions.java

/**
 *
 * @param PDFDoc/*from   ww  w. ja  va  2  s  .com*/
 * @throws IOException
 */
public void dumpPDFStructure(PDDocument PDFDoc) throws IOException {
    PDFTextStripper stripper = new PDFFunctions();
    stripper.setSortByPosition(true);
    stripper.setStartPage(0);
    stripper.setEndPage(PDFDoc.getNumberOfPages());
    Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
    stripper.writeText(PDFDoc, dummy);

}