Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

On this page you can find example usage of the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper().

Prototype

public PDFTextStripper() throws IOException 

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java

/**
 * Command-line entry point. Arguments that start with "-" are parsed as option
 * flags; every other argument is treated as the path of a PDF file. The files
 * are processed in sorted order: the text of each one is extracted, split into
 * lines, parsed into a {@code Bill}, and, when {@code Flags.INSERT_INTO_DB} was
 * given, written to the database.
 *
 * <p>Any exception aborts the remaining work and its stack trace is printed to
 * standard output.
 *
 * @param args option flags and PDF file paths, in any order
 */
public static void main(String[] args) {
    try {
        for (String arg : args) {
            if (!arg.startsWith("-")) {
                filePaths.add(arg);
            } else {
                try {
                    options.add(Flags.fromString(arg));
                } catch (IllegalArgumentException ex) {
                    // An unknown flag is reported and skipped rather than aborting the run.
                    System.err.println("Illegal options: " + arg);
                }
            }
        }
        Collections.sort(filePaths);
        for (String filePath : filePaths) {
            System.out.println("Loading: " + filePath);
            String rawText;
            // The document is only needed for text extraction, so it is closed right
            // afterwards on every path -- previously it was closed only when the
            // INSERT_INTO_DB flag was set and nothing had thrown.
            try (PDDocument doc = PDDocument.load(new File(filePath))) {
                PDFTextStripper stripper = new PDFTextStripper();
                rawText = stripper.getText(doc);
            }
            // Split on runs of CR/LF so blank lines do not produce empty entries.
            String[] textArray = rawText.split("[\\r\\n]+");
            Bill bill = parsePdf(textArray);
            if (options.contains(Flags.INSERT_INTO_DB)) {
                DatabaseConnector db = new DatabaseConnector();
                DbWriter writer = new DbWriter(db.getConnection());
                boolean isInserted = writer.insertDetail(bill.getBillSummary(), bill.getPhoneSummaryData(),
                        bill.getPhoneDetail());
                writer.commit();
                if (!isInserted) {
                    System.out.println(filePath + " was not inserted into database.");
                }
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace(System.out);
    }
}

From source file:neuralclassification.Classificator.java

/**
 * Loads the PDF at {@code filepath + "/" + name} and returns its extracted text.
 *
 * @param filepath directory part of the path; joined to {@code name} with "/"
 * @param name     file name of the PDF
 * @return the text produced by {@code PDFTextStripper.getText} for the document
 * @throws RuntimeException wrapping any {@link IOException} raised while loading,
 *                          reading or closing the document
 */
String readText(String filepath, String name) {
    // try-with-resources closes the document on every path. If extraction fails and
    // close() fails too, the close failure is attached as a suppressed exception
    // instead of replacing the original error (which the old finally block did).
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:neuralclassification.Trainer.java

/**
 * Loads the PDF at {@code filepath + "/" + name} and returns its extracted text.
 * {@code filepath} is not a parameter; it is resolved from the enclosing scope.
 *
 * @param name file name of the PDF, appended to {@code filepath} with "/"
 * @return the text produced by {@code PDFTextStripper.getText} for the document
 * @throws RuntimeException wrapping any {@link IOException} raised while loading,
 *                          reading or closing the document
 */
String readText(String name) {
    // try-with-resources closes the document on every path. If extraction fails and
    // close() fails too, the close failure is attached as a suppressed exception
    // instead of replacing the original error (which the old finally block did).
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:opennlp.PDFTools.java

/**
 * Parses the PDF at {@code filePath} and extracts the text of pages 1 through
 * the last page.
 *
 * <p>The parser, COS document, stripper and PD document are stored in
 * {@code parser}, {@code cosDoc}, {@code pdfStripper} and {@code pdDoc}, which
 * are declared outside this method.
 *
 * <p>NOTE(review): nothing in this method closes the document or the
 * {@code RandomAccessFile} it opens -- confirm their lifetime is managed elsewhere.
 *
 * @param filePath path of the PDF file to read
 * @return the extracted text, or {@code null} if any exception occurred (the
 *         exception is logged, not rethrown)
 */
public String getStringFromPDF(String filePath) {
    try {
        parser = new PDFParser(new RandomAccessFile(new File(filePath), "r"));
        parser.parse();

        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);

        // Extract every page: from the first up to the document's page count.
        final int lastPage = pdDoc.getNumberOfPages();
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(lastPage);

        return pdfStripper.getText(pdDoc);
    } catch (IOException e) {
        logger.error("IO ERROR", e);
    } catch (Exception ex) {
        logger.error("ERROR", ex);
    }

    // Reached only after a logged failure.
    return null;
}

From source file:org.apache.james.mailbox.store.search.PDFTextExtractor.java

License:Apache License

/**
 * Extracts the text of the PDF read from {@code inputStream}.
 *
 * @param inputStream stream positioned at the start of a PDF document
 * @return a {@code ParsedContent} holding the extracted text (empty Optional if
 *         the stripper returned {@code null}) and an empty map
 * @throws IOException if the document cannot be loaded, read or closed
 */
private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException {
    // The loaded document was previously never closed; try-with-resources releases
    // it once the text has been extracted, including when extraction throws.
    try (PDDocument document = PDDocument.load(inputStream)) {
        return new ParsedContent(Optional.ofNullable(new PDFTextStripper().getText(document)),
                ImmutableMap.of());
    }
}

From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text of the PDF read from {@code in}, writing it with
 * {@code PDFTextStripper.writeText} on a separate daemon thread so that the
 * caller waits at most {@code timeout} for it to finish.
 *
 * @param in     stream containing the PDF; must not be {@code null}
 * @param params parameters passed to {@code getPassword} to obtain the document password
 * @return the extracted text wrapped in an {@code ExtractData}, after
 *         {@code extractMetadata} has been applied to it
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException       if extraction does not finish in time, or wrapping any
 *                                exception raised while loading or extracting
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    // All extraction work is serialized on pdfBoxLockObj.
    synchronized (pdfBoxLockObj) {
        final String password = getPassword(params);
        try (PDDocument document = PDDocument.load(in, password)) {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper();
            // Set by the worker in its finally block, i.e. on success and on failure alike.
            final AtomicBoolean done = new AtomicBoolean(false);
            // Separate final local so the lambda below can capture the document.
            final PDDocument doc = document;
            // Carries an exception raised on the worker thread back to this thread.
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(doc, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join(long) takes milliseconds, while the message
            // below reports the value as "sec" -- confirm the unit of `timeout`.
            task.join(timeout);
            if (!done.get()) {
                // Worker is still running: interrupt it repeatedly, up to 100 times at
                // 50 ms intervals, stopping early once it reports done; then fail.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                // Rethrow the worker's failure on the calling thread.
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final Exception e) {
            // Catches everything thrown above, so the timeout ExtractException is
            // presumably wrapped in a second ExtractException here.
            // NOTE(review): an InterruptedException from join()/sleep() is also wrapped
            // here without restoring the thread's interrupt status -- confirm intended.
            throw new ExtractException(e);
        }
    }
}

From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java

License:BSD License

/**
 * Extracts the text of an in-memory PDF using a stripper configured with
 * {@code setSortByPosition(true)}.
 *
 * @param content the raw bytes of the PDF document
 * @return the text written by the stripper
 * @throws IOException if the stripper cannot be created or the document cannot
 *                     be loaded or read
 */
private String extractPDFText(byte[] content) throws IOException {
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setSortByPosition(true);

    try (ByteArrayInputStream pdfBytes = new ByteArrayInputStream(content);
            Writer extracted = new StringWriter();
            PDDocument document = PDDocument.load(pdfBytes)) {
        textStripper.writeText(document, extracted);
        return extracted.toString();
    }
}

From source file:org.grouplens.samantha.modeler.dao.PdfFileDAO.java

License:Open Source License

/**
 * Creates a DAO over the PDF at {@code filePath}: instantiates the text
 * stripper, loads the document and records its page count.
 *
 * @param filePath path of the PDF file to load
 * @throws BadRequestException wrapping the {@link IOException} if the stripper
 *                             cannot be created or the file cannot be loaded
 */
private PdfFileDAO(String filePath) {
    try {
        this.stripper = new PDFTextStripper();
        this.pdfDoc = PDDocument.load(new File(filePath));
        // The page count is read once, immediately after the document is loaded.
        this.numPages = this.pdfDoc.getNumberOfPages();
    } catch (IOException e) {
        throw new BadRequestException(e);
    }
}

From source file:org.haplo.component.pdfbox.ConvertPDFToText.java

License:Mozilla Public License

/**
 * Converts the PDF at {@code inputPathname} to text and writes the result,
 * encoded as UTF-8, to the file at {@code outputPathname}.
 *
 * @throws Exception if the PDF cannot be loaded or the text cannot be written
 */
protected void performOperation() throws Exception {
    try (PDDocument document = PDDocument.load(new File(this.inputPathname))) {
        PDFTextStripper textStripper = new PDFTextStripper();
        // The output file is opened only after the stripper exists; the writer is
        // closed before the underlying stream, then the document last.
        try (FileOutputStream fileOut = new FileOutputStream(new File(this.outputPathname));
                OutputStreamWriter utf8Writer = new OutputStreamWriter(fileOut, "UTF-8")) {
            textStripper.writeText(document, utf8Writer);
        }
    }
}

From source file:org.haplo.component.pdfbox.TextExtractPDF.java

License:Mozilla Public License

/**
 * Extracts the text of the PDF located at {@code getInputPathname()}.
 *
 * @return the text written by the stripper for the whole document
 * @throws IOException if the PDF cannot be loaded, read or closed
 */
protected String extract() throws IOException {
    try (PDDocument document = PDDocument.load(new File(getInputPathname()))) {
        PDFTextStripper textStripper = new PDFTextStripper();
        StringWriter buffer = new StringWriter();
        textStripper.writeText(document, buffer);
        return buffer.toString();
    }
}