Example usage for org.apache.pdfbox.text PDFTextStripper getText

List of usage examples for org.apache.pdfbox.text PDFTextStripper getText

Introduction

On this page you can find example usages of org.apache.pdfbox.text.PDFTextStripper.getText.

Prototype

public String getText(PDDocument doc) throws IOException 

Source Link

Document

This will return the text of a document.

Usage

From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java

License:Open Source License

/**
 * Extracts the plain text of the PDF file at the given path.
 *
 * @param fileName path of the PDF file to read
 * @return the text produced by {@code PDFTextStripper.getText} for the parsed document
 * @throws ActionException if the path is not a regular file, the parser cannot be
 *         created, or parsing / text extraction fails
 */
private String pdftoText(String fileName) throws ActionException {

    File pdfFile = new File(fileName);

    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }

    PDFParser parser;
    try {
        parser = new PDFParser(new RandomAccessBufferedFileInputStream(new FileInputStream(pdfFile)));
    } catch (Exception e) {
        // Hand the throwable to the logger so the stack trace lands in the log, not on stderr.
        LOGGER.error("Unable to open PDF Parser.: " + e, e);
        throw new ActionException("Unable to parse the pdf file.");
    }

    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        return pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.", e);
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        // Close each document in its own try block so that a failure closing one
        // never prevents the attempt to close the other.
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (Exception e) {
                LOGGER.error("Unable to close the COS document.", e);
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception e) {
                LOGGER.error("Unable to close the PD document.", e);
            }
        }
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts text from an already-loaded PDF document.
 *
 * <p>The document is not closed by this method.
 *
 * @param pdDoc   the document to extract text from
 * @param pageNum page number as a decimal string; {@code "-1"} leaves the stripper's
 *                page range untouched, any other value restricts extraction to that page
 * @return a buffer holding the extracted text
 * @throws Exception wrapping any {@code IOException} raised while stripping; a
 *         {@code NumberFormatException} from an unparsable {@code pageNum} propagates as is
 */
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception {
    try {
        PDFTextStripper textStripper = new PDFTextStripper();
        int requestedPage = Integer.parseInt(pageNum);
        boolean singlePageOnly = requestedPage != -1;
        if (singlePageOnly) {
            // Start and end on the same page to extract exactly that page.
            textStripper.setStartPage(requestedPage);
            textStripper.setEndPage(requestedPage);
        }
        String extracted = textStripper.getText(pdDoc);
        return new StringBuffer(extracted);
    } catch (IOException cause) {
        throw new Exception("Cannot parse PDF document", cause);
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * /*from w ww . j av  a2  s .co  m*/
 *
 * @throws Exception.
 */
private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception {
    StringBuffer docText = new StringBuffer();
    PDDocument pdDoc = null;
    String password = "";

    // extract PDF document's textual content
    try {
        PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/);
        int page = Integer.parseInt(pageNum);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }
        //password
        pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password); // new PDDocument(cosDoc);
        docText = new StringBuffer(stripper.getText(pdDoc));
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    } finally {
        closePDDocument(pdDoc);
    }
    return docText;
}

From source file:edu.umsl.runPDF.java

/**
 * Prompts on standard output for a PDF location, reads one token from {@code sc},
 * and stores the extracted text of that file in {@code content}.
 *
 * <p>The loaded document is closed before this method returns, whether or not
 * extraction succeeds.
 *
 * @throws IOException if the file cannot be loaded, its text cannot be extracted,
 *         or the document cannot be closed
 */
public void readPDF() throws IOException {
    System.out.println("Please enter PDF file location, omit extension: ");
    String input = sc.next();
    // NOTE(review): the prompt asks the user to omit the extension, yet the path is
    // used exactly as typed — confirm whether ".pdf" should be appended.
    pdfFile = new File(input);
    try (PDDocument pdDocument = PDDocument.load(pdfFile)) {
        PDFTextStripper strip = new PDFTextStripper();
        content = strip.getText(pdDocument);
        System.out.println("PDF Read");
    }
}

From source file:extractor.pdftotext.PdfToText.java

/**
 * Extracts the text of a PDF file, marking page boundaries and normalising
 * non-printable characters.
 *
 * <p>Each page's text is wrapped in the markers {@code "PAGE START"} and
 * {@code "PAGE END"}. Every character in the Unicode categories Cc (control),
 * Cf (format), Co (private use) or Cn (unassigned) is replaced by a line feed.
 *
 * @param file the PDF file to read
 * @return the processed text, or an empty string if an {@code IOException} occurred
 *         (the exception is logged at {@code SEVERE})
 */
private String getPdfBoxRaw(File file) {
    // try-with-resources closes the document even when extraction throws.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setPageStart("PAGE START");
        stripper.setPageEnd("PAGE END");
        // Get the text from the document and replace unknown characters with \n.
        return stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n");
    } catch (IOException ex) {
        Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex);
        return "";
    }
}

From source file:it.myideas.bancamarcheextractor.Distinta.java

/**
 * Builds a {@code Distinta} from the text of a PDF file.
 *
 * <p>The extracted text is split on the stripper's line separator. A line starting
 * with {@code "Tipo disposizione"} supplies {@code tipoDisposizione}; a line starting
 * with {@code "1 Esecuzione"} is split on single spaces, its third token is parsed as
 * a {@code dd/MM/yyyy} date and its tokens from the fifth onward are lower-cased and
 * joined with underscores to form {@code beneficiario}.
 *
 * @param file path of the PDF to parse
 * @return the populated instance, or {@code null} if an {@code IOException} occurred
 *         or any of the three fields could not be filled (the error is logged)
 */
public static Distinta parse(Path file) {

    try (PDDocument pdf = PDDocument.load(file.toFile())) {

        Distinta result = new Distinta();

        PDFTextStripper textStripper = new PDFTextStripper();
        String text = textStripper.getText(pdf);
        String[] rows = text.split(textStripper.getLineSeparator());

        log.debug("FILE:" + file.toString());
        log.debug(text);

        for (String row : rows) {
            if (row.startsWith("Tipo disposizione")) {
                String remainder = row.replace("Tipo disposizione", "");
                result.tipoDisposizione = remainder.trim().toLowerCase();
                continue;
            }
            if (!row.startsWith("1 Esecuzione")) {
                continue;
            }

            String[] tokens = row.split(" ");
            String[] beneficiaryTokens = Arrays.copyOfRange(tokens, 4, tokens.length);

            result.beneficiario = Arrays.stream(beneficiaryTokens)
                    .map(String::toLowerCase)
                    .collect(Collectors.joining("_"));

            result.data = LocalDate.parse(tokens[2], DateTimeFormatter.ofPattern("dd/MM/yyyy"));
        }

        boolean complete = isOk(result.beneficiario)
                && isOk(result.tipoDisposizione)
                && result.data != null;
        if (!complete) {
            // Reported as an IOException so the handler below logs it and returns null.
            throw new IOException("Parser failure for file " + file.toString());
        }

        return result;
    } catch (IOException e) {
        log.error("Error parsing PDF", e);
        return null;
    }
}

From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java

/**
 * Reads the given PDF, splits its text into non-empty lines and parses them
 * into {@code billObj}.
 *
 * <p>The document is closed as soon as its text has been extracted, including
 * when extraction fails.
 *
 * @param pdfFile the PDF bill to read
 * @throws IOException if the file cannot be loaded, its text cannot be extracted,
 *         or the document cannot be closed
 */
public JavaBillScrapper(File pdfFile) throws IOException {
    String rawText;
    try (PDDocument doc = PDDocument.load(pdfFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        rawText = stripper.getText(doc);
    }
    // Split on runs of CR/LF so blank lines do not produce empty entries between lines.
    String[] textArray = rawText.split("[\\r\\n]+");
    this.billObj = parsePdf(textArray);
}

From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java

/**
 * Command-line entry point: parses each PDF named on the command line and,
 * when the {@code INSERT_INTO_DB} flag is present, writes the parsed bill to
 * the database.
 *
 * <p>Arguments starting with {@code "-"} are treated as flags; all others are
 * file paths, which are processed in sorted order. Unrecognised flags are
 * reported on standard error and skipped. Any other exception stops processing
 * and its stack trace is printed to standard output.
 *
 * @param args flags and PDF file paths
 */
public static void main(String[] args) {
    try {
        for (String arg : args) {
            if (!arg.startsWith("-")) {
                filePaths.add(arg);
            } else {
                try {
                    options.add(Flags.fromString(arg));
                } catch (IllegalArgumentException ex) {
                    System.err.println("Illegal options: " + arg);
                }
            }
        }
        Collections.sort(filePaths);
        for (String filePath : filePaths) {
            System.out.println("Loading: " + filePath);
            String rawText;
            // Close the document as soon as the text is extracted, regardless of
            // which flags are set or whether later steps fail.
            try (PDDocument doc = PDDocument.load(new File(filePath))) {
                PDFTextStripper stripper = new PDFTextStripper();
                rawText = stripper.getText(doc);
            }
            String[] textArray = rawText.split("[\\r\\n]+");
            Bill bill = parsePdf(textArray);
            if (options.contains(Flags.INSERT_INTO_DB)) {
                // NOTE(review): neither the connector nor the writer is closed here —
                // confirm whether they need explicit cleanup.
                DatabaseConnector db = new DatabaseConnector();
                DbWriter writer = new DbWriter(db.getConnection());
                boolean isInserted = writer.insertDetail(bill.getBillSummary(), bill.getPhoneSummaryData(),
                        bill.getPhoneDetail());
                writer.commit();
                if (!isInserted) {
                    System.out.println(filePath + " was not inserted into database.");
                }
            }
        }

    } catch (Exception ex) {
        ex.printStackTrace(System.out);
    }
}

From source file:neuralclassification.Classificator.java

/**
 * Reads the full text of the PDF at {@code filepath + "/" + name}.
 *
 * <p>The document is closed via try-with-resources, so a failure while closing
 * is attached as a suppressed exception and does not replace an earlier
 * extraction failure.
 *
 * @param filepath directory containing the file
 * @param name     file name within that directory
 * @return the extracted text
 * @throws RuntimeException wrapping any {@code IOException} raised while loading,
 *         extracting or closing
 */
String readText(String filepath, String name) {
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:neuralclassification.Trainer.java

/**
 * Reads the full text of the PDF named {@code name} inside the directory held
 * in {@code filepath}.
 *
 * <p>The document is closed via try-with-resources, so a failure while closing
 * is attached as a suppressed exception and does not replace an earlier
 * extraction failure.
 *
 * @param name file name within the {@code filepath} directory
 * @return the extracted text
 * @throws RuntimeException wrapping any {@code IOException} raised while loading,
 *         extracting or closing
 */
String readText(String name) {
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}