Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

On this page you can find example usages of the no-argument constructor PDFTextStripper() of the class org.apache.pdfbox.text.PDFTextStripper.

Prototype

public PDFTextStripper() throws IOException

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java

/**
 * Command-line entry point: extracts the text of each PDF named on the command line,
 * parses it into a {@code Bill}, and optionally stores the result in the database.
 *
 * <p>Arguments that do not start with {@code "-"} are collected as file paths; all other
 * arguments are converted with {@code Flags.fromString}. Unrecognised options are reported
 * on standard error and skipped. Files are processed in sorted path order.
 *
 * <p>Any exception aborts the remaining files and its stack trace is printed to standard
 * output.
 *
 * @param args file paths and {@code -}-prefixed option flags
 */
public static void main(String[] args) {
    try {
        for (String arg : args) {
            if (!arg.startsWith("-")) {
                filePaths.add(arg);
            } else {
                try {
                    options.add(Flags.fromString(arg));
                } catch (IllegalArgumentException ex) {
                    System.err.println("Illegal options: " + arg);
                }
            }
        }
        Collections.sort(filePaths);
        for (String filePath : filePaths) {
            System.out.println("Loading: " + filePath);
            // try-with-resources releases the document on every path. Previously it was
            // closed only inside the INSERT_INTO_DB branch, so it leaked whenever that
            // flag was absent or an exception was thrown while processing the file.
            try (PDDocument doc = PDDocument.load(new File(filePath))) {
                PDFTextStripper stripper = new PDFTextStripper();
                String rawText = stripper.getText(doc);
                // One array element per non-empty line of extracted text.
                String[] textArray = rawText.split("[\\r\\n]+");
                Bill bill = parsePdf(textArray);
                if (options.contains(Flags.INSERT_INTO_DB)) {
                    DatabaseConnector db = new DatabaseConnector();
                    DbWriter writer = new DbWriter(db.getConnection());
                    boolean isInserted = writer.insertDetail(bill.getBillSummary(), bill.getPhoneSummaryData(),
                            bill.getPhoneDetail());
                    writer.commit();
                    if (!isInserted) {
                        System.out.println(filePath + " was not inserted into database.");
                    }
                }
            }
        }

    } catch (Exception ex) {
        ex.printStackTrace(System.out);
    }
}

From source file:neuralclassification.Classificator.java

/**
 * Extracts the text of a PDF file.
 *
 * @param filepath directory part of the path
 * @param name file name; joined to {@code filepath} with a {@code "/"} separator
 * @return the text returned by {@code PDFTextStripper.getText} for the loaded document
 * @throws RuntimeException wrapping any {@link IOException} raised while loading, reading
 *         or closing the document
 */
String readText(String filepath, String name) {
    // try-with-resources closes the document on every path. Unlike a close inside
    // finally, a failure while closing is attached to the primary exception as a
    // suppressed exception instead of replacing it.
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:neuralclassification.Trainer.java

/**
 * Extracts the text of a PDF file located under this object's {@code filepath}.
 *
 * @param name file name; joined to {@code filepath} with a {@code "/"} separator
 * @return the text returned by {@code PDFTextStripper.getText} for the loaded document
 * @throws RuntimeException wrapping any {@link IOException} raised while loading, reading
 *         or closing the document
 */
String readText(String name) {
    // try-with-resources closes the document on every path. Unlike a close inside
    // finally, a failure while closing is attached to the primary exception as a
    // suppressed exception instead of replacing it.
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:opennlp.PDFTools.java

/**
 * Extracts the text of all pages of the PDF at the given path.
 *
 * <p>Failures are logged rather than propagated: on any exception the method returns
 * {@code null}, or whatever text had been extracted before the failure.
 *
 * <p>NOTE(review): the {@code parser}, {@code cosDoc}, {@code pdfStripper} and {@code pdDoc}
 * fields are still assigned as before, but the document they refer to is now closed before
 * this method returns. Confirm that no other method reads those fields afterwards.
 *
 * @param filePath path of the PDF file to read
 * @return the extracted text, or {@code null} if extraction failed
 */
public String getStringFromPDF(String filePath) {

    String text = null;

    // The random-access source is a resource of the try statement, so it is released even
    // when parsing fails before a PDDocument exists.
    try (RandomAccessFile source = new RandomAccessFile(new File(filePath), "r")) {

        // Created before parsing so that a constructor failure cannot leave a parsed
        // document open.
        pdfStripper = new PDFTextStripper();

        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);
        try {
            // Extract every page, first to last.
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());

            text = pdfStripper.getText(pdDoc);
        } finally {
            // Previously the document was never closed, leaking it on every call.
            pdDoc.close();
        }

    } catch (IOException e) {
        logger.error("IO ERROR", e);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    }

    return text;
}

From source file:org.apache.james.mailbox.store.search.PDFTextExtractor.java

License:Apache License

/**
 * Extracts the text of the PDF read from the given stream.
 *
 * @param inputStream stream positioned at the start of a PDF document
 * @return parsed content holding the extracted text (empty if the stripper returned
 *         {@code null}) and an empty map
 * @throws IOException if the document cannot be loaded, read or closed
 */
private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
    // The loaded document must be closed to release its resources; previously it was
    // created inline and never closed.
    try (PDDocument document = PDDocument.load(inputStream)) {
        String text = new PDFTextStripper().getText(document);
        return new ParsedContent(Optional.ofNullable(text), ImmutableMap.of());
    }
}

From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts text and metadata from a PDF read from the given stream.
 *
 * <p>The extraction runs on a separate daemon thread so that it can be abandoned when it
 * does not finish within {@code timeout}. The whole operation is serialised on
 * {@code pdfBoxLockObj}.
 *
 * @param in stream containing the PDF; must not be {@code null}
 * @param params parameters from which the document password is obtained via
 *        {@code getPassword}
 * @return the extracted text, with metadata added by {@code extractMetadata}
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException if extraction fails, times out, or the calling thread is
 *         interrupted while waiting
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    synchronized (pdfBoxLockObj) {
        final String password = getPassword(params);
        try (PDDocument document = PDDocument.load(in, password)) {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper();
            // Set by the worker when it finishes, successfully or not.
            final AtomicBoolean done = new AtomicBoolean(false);
            // Carries a worker failure back to this thread; it is only read after
            // done has been observed as true.
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(document, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join(long) takes milliseconds, while the message below
            // reports the value as "sec." - confirm the unit of timeout.
            task.join(timeout);
            if (!done.get()) {
                // Keep interrupting the worker, for up to 100 * 50 ms, until it stops.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final InterruptedException e) {
            // Restore the interrupt status so callers can still see that this thread was
            // interrupted; previously the generic handler below silently cleared it.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        }
    }
}

From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java

License:BSD License

/**
 * Extracts the text of an in-memory PDF, with the stripper's sort-by-position option
 * enabled.
 *
 * @param content the raw bytes of the PDF document
 * @return the text written by the stripper
 * @throws IOException if the document cannot be loaded or its text cannot be written
 */
private String extractPDFText(byte[] content) throws IOException {
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setSortByPosition(true);

    try (ByteArrayInputStream pdfBytes = new ByteArrayInputStream(content);
            Writer extracted = new StringWriter();
            PDDocument document = PDDocument.load(pdfBytes)) {
        textStripper.writeText(document, extracted);
        final String text = extracted.toString();
        return text;
    }
}

From source file:org.grouplens.samantha.modeler.dao.PdfFileDAO.java

License:Open Source License

private PdfFileDAO(String filePath) {
    try {/*from  w w  w . ja v  a  2s  . com*/
        stripper = new PDFTextStripper();
        pdfDoc = PDDocument.load(new File(filePath));
    } catch (IOException e) {
        throw new BadRequestException(e);
    }
    numPages = pdfDoc.getNumberOfPages();
}

From source file:org.haplo.component.pdfbox.ConvertPDFToText.java

License:Mozilla Public License

/**
 * Writes the text of the PDF at {@code inputPathname} to {@code outputPathname},
 * encoded as UTF-8.
 *
 * @throws Exception if the PDF cannot be loaded or the text cannot be written
 */
protected void performOperation() throws Exception {
    final File source = new File(this.inputPathname);
    try (PDDocument document = PDDocument.load(source)) {
        final PDFTextStripper textStripper = new PDFTextStripper();
        // The output file is opened only after the document has loaded and the stripper
        // exists; the writer is closed first, then the stream, then the document.
        try (FileOutputStream rawOut = new FileOutputStream(new File(this.outputPathname));
                OutputStreamWriter textOut = new OutputStreamWriter(rawOut, "UTF-8")) {
            textStripper.writeText(document, textOut);
        }
    }
}

From source file:org.haplo.component.pdfbox.TextExtractPDF.java

License:Mozilla Public License

/**
 * Extracts the text of the PDF at the path returned by {@code getInputPathname()}.
 *
 * @return the text written by the stripper
 * @throws IOException if the PDF cannot be loaded or its text cannot be written
 */
protected String extract() throws IOException {
    final File input = new File(getInputPathname());
    try (PDDocument document = PDDocument.load(input)) {
        final PDFTextStripper textStripper = new PDFTextStripper();
        final StringWriter buffer = new StringWriter();
        textStripper.writeText(document, buffer);
        return buffer.toString();
    }
}