Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

This page collects example usages of the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper().

Prototype

public PDFTextStripper() throws IOException

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:edu.esprit.filereader.PdfReader.java

/**
 * Extracts the text of pages 1 to 10 of the PDF at {@code filePath}.
 *
 * <p>The parser, COS document, PD document and stripper are stored in the
 * corresponding instance fields, as before. The underlying file handle and
 * the document are now closed before this method returns, so the
 * {@code pdDoc} / {@code cosDoc} fields refer to closed objects afterwards.
 *
 * @return the extracted text (also stored in the {@code Text} field)
 * @throws IOException if the file cannot be opened, parsed or read
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    // The file handle is closed explicitly: the PDDocument below is built from the
    // COSDocument alone, so closing the document is not relied on to release it.
    try (RandomAccessFile source = new RandomAccessFile(file, "r")) {
        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);
        try {
            pdfStripper = new PDFTextStripper();
            // Reading text from page 1 to 10. To extract the whole file instead, use
            // pdfStripper.setEndPage(pdDoc.getNumberOfPages()).
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(10);
            Text = pdfStripper.getText(pdDoc);
        } finally {
            // Release the document even when text extraction fails.
            pdDoc.close();
        }
    }
    return Text;
}

From source file:edu.umsl.runPDF.java

/**
 * Prompts on standard output for a PDF location, reads one token from
 * {@code sc}, loads that file and stores its full text in {@code content}.
 *
 * <p>The document is closed as soon as the text has been extracted.
 * NOTE(review): the prompt asks the user to omit the extension, but the
 * input is used as the file path unchanged - confirm this is intended.
 *
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public void readPDF() throws IOException {
    System.out.println("Please enter PDF file location, omit extension: ");
    String input = sc.next();
    pdfFile = new File(input);
    // try-with-resources guarantees the document is closed, even if getText fails.
    try (PDDocument pdDocument = PDDocument.load(pdfFile)) {
        PDFTextStripper strip = new PDFTextStripper();
        content = strip.getText(pdDocument);
    }
    System.out.println("PDF Read");
}

From source file:extractor.Extractor.java

/**
 * Loads each named PDF under {@code pathBase}, extracts its text and wraps it
 * in a {@code Document}.
 *
 * <p>Files that cannot be found or read are reported and skipped; they do not
 * abort the batch. Each PDF is closed after processing, including when text
 * extraction fails.
 *
 * @param pathBase prefix concatenated directly with each file name
 * @param files    file names to load, relative to {@code pathBase}
 * @return one {@code Document} per successfully read file, in input order
 */
public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {

    ArrayList<Document> documents = new ArrayList<>();

    for (String file : files) {
        // try-with-resources closes the PDF on both the success and the failure path.
        try (PDDocument pdDocument = PDDocument.load(new File(pathBase + file))) {
            String paperString = new PDFTextStripper().getText(pdDocument);
            Document document = new Document(paperString);
            documents.add(document);
        } catch (FileNotFoundException ex) {
            System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
        } catch (IOException ex) {
            Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    return documents;
}

From source file:extractor.pdftotext.PdfToText.java

/**
 * Extracts the raw text of a PDF, marking page boundaries with the literal
 * strings "PAGE START" and "PAGE END".
 *
 * <p>Every character in the Unicode categories Cc (control), Cf (format),
 * Co (private use) and Cn (unassigned) is replaced by a newline.
 *
 * @param file the PDF file to read
 * @return the processed text, or an empty string if the file could not be read
 *         (the failure is logged at SEVERE level)
 */
private String getPdfBoxRaw(File file) {
    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setPageStart("PAGE START");
        stripper.setPageEnd("PAGE END");
        // Get the text from the document and replace unknown characters with \n.
        return stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n");
    } catch (IOException ex) {
        Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex);
        return "";
    }
}

From source file:indexer.PDFTextExtractor.java

License:Open Source License

/**
 * Creates the extractor and tries to set up its {@code PDFTextStripper}.
 *
 * <p>If the stripper cannot be created, the error is logged and not rethrown,
 * so construction still succeeds and {@code stripper} is not assigned here.
 */
public PDFTextExtractor() {
    try {
        // The PDFTextStripper constructor declares IOException.
        stripper = new PDFTextStripper();
    } catch (IOException e) {
        // Logged only; the exception is not propagated to the caller.
        log.error("Could not create PDF Text Stripper", e);
    }

}

From source file:io.cloudslang.content.utilities.services.PdfParseService.java

License:Open Source License

/**
 * Returns the text content of the PDF at {@code path}.
 *
 * <p>The document is obtained through {@code getPdfDocument(path, password)}
 * and is closed before this method returns.
 *
 * @param path     location of the PDF file
 * @param password password passed on to {@code getPdfDocument}
 * @return the text extracted from the document
 * @throws IOException if the document cannot be opened or read
 */
public static String getPdfContent(final Path path, final String password) throws IOException {
    try (final PDDocument document = getPdfDocument(path, password)) {
        final PDFTextStripper textStripper = new PDFTextStripper();
        final String extractedText = textStripper.getText(document);
        return extractedText;
    }
}

From source file:it.myideas.bancamarcheextractor.Distinta.java

/**
 * Parses a PDF into a {@code Distinta}.
 *
 * <p>Two kinds of text lines are recognised:
 * <ul>
 *   <li>a line starting with "Tipo disposizione": the remainder, trimmed and
 *       lower-cased, becomes {@code tipoDisposizione};</li>
 *   <li>a line starting with "1 Esecuzione": it is split on single spaces,
 *       token 2 is parsed as a {@code dd/MM/yyyy} date and tokens from index 4
 *       onward are lower-cased and joined with "_" to form {@code beneficiario}.</li>
 * </ul>
 *
 * <p>Every parse failure, including a malformed "1 Esecuzione" line or an
 * unparseable date, is logged and reported by returning {@code null}.
 *
 * @param file path of the PDF to parse
 * @return the populated {@code Distinta}, or {@code null} if the file could
 *         not be read or did not contain all required fields
 */
public static Distinta parse(Path file) {

    try (PDDocument doc = PDDocument.load(file.toFile())) {

        Distinta distinta = new Distinta();

        PDFTextStripper stripper = new PDFTextStripper();
        String contents = stripper.getText(doc);

        log.debug("FILE:" + file.toString());
        log.debug(contents);

        // Built once instead of once per matching line.
        DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("dd/MM/yyyy");

        // A plain loop (rather than a lambda) lets malformed lines raise IOException,
        // which is handled by the catch block below like any other parse failure.
        for (String line : contents.split(stripper.getLineSeparator())) {

            if (line.startsWith("Tipo disposizione")) {
                distinta.tipoDisposizione = line.replace("Tipo disposizione", "").trim().toLowerCase();
            } else if (line.startsWith("1 Esecuzione")) {
                String[] p = line.split(" ");

                // Indices 2 and 4 are used below; shorter lines used to fail with an
                // unchecked ArrayIndexOutOfBoundsException.
                if (p.length < 4) {
                    throw new IOException("Malformed line in file " + file.toString() + ": " + line);
                }

                distinta.beneficiario = Arrays.stream(Arrays.copyOfRange(p, 4, p.length))
                        .map(String::toLowerCase).collect(Collectors.joining("_"));

                try {
                    distinta.data = LocalDate.parse(p[2], dateFormat);
                } catch (java.time.format.DateTimeParseException e) {
                    throw new IOException("Invalid date '" + p[2] + "' in file " + file.toString(), e);
                }
            }
        }

        if (!isOk(distinta.beneficiario) || !isOk(distinta.tipoDisposizione) || distinta.data == null) {
            throw new IOException("Parser failure for file " + file.toString());
        }

        return distinta;
    } catch (IOException e) {
        log.error("Error parsing PDF", e);
        return null;
    }
}

From source file:javaapplication1.PDFManager.java

/**
 * Extracts the text of page 1 of the PDF at {@code filePath}, prints it to
 * standard output and collects the labelled fields it contains.
 *
 * <p>For each text line, the first label (in the order listed below) that the
 * line contains is used as the map key; the value is the line with the first
 * {@code label.length() + 1} characters removed. This assumes the label sits
 * at the start of the line followed by one separator character. If a matching
 * line is too short for that, an error dialog is shown and the fields
 * collected so far are returned.
 *
 * <p>The underlying file handle and the document are closed before this
 * method returns, so the {@code pdDoc} / {@code cosDoc} fields refer to
 * closed objects afterwards.
 *
 * @return map from field label to field value; possibly empty or partial
 * @throws IOException if the file cannot be opened, parsed or read
 */
public Map<String, String> ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    // The file handle is closed explicitly: the PDDocument below is built from the
    // COSDocument alone, so closing the document is not relied on to release it.
    try (RandomAccessFile source = new RandomAccessFile(file, "r")) { // update for PDFBox V 2.0
        parser = new PDFParser(source);
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);
        try {
            pdfStripper = new PDFTextStripper();
            // Only the first page is read. To extract the whole file instead, use
            // pdfStripper.setEndPage(pdDoc.getNumberOfPages()).
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            Text = pdfStripper.getText(pdDoc);
        } finally {
            // Release the document even when text extraction fails.
            pdDoc.close();
        }
    }
    System.out.println(Text);

    // Labels in match priority order; the first one contained in a line wins.
    // "Discretionary 2" is intentionally not extracted.
    String[] labels = { "Type", "Document Number", "Date of Birth", "Date of Expiry", "Issuer", "Nationality",
            "First Names", "Last Names", "Discretionary 1", "Gender" };

    // Accept both \n and \r\n so values do not keep a trailing carriage return.
    String[] result = Text.split("\\r?\\n");
    Map<String, String> map = new HashMap<String, String>();

    try {
        for (String line : result) {
            for (String label : labels) {
                if (line.contains(label)) {
                    // Skip the label plus the single separator character after it.
                    map.put(label, line.substring(label.length() + 1));
                    break;
                }
            }
        }
    } catch (StringIndexOutOfBoundsException e) {
        // A matching line was shorter than label + separator.
        JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE);
    }
    return map;
}

From source file:main.PdfReader.java

License:Apache License

/**
 * Opens a page that serves an example PDF, downloads the document from the
 * driver's current URL and prints its extracted text to standard output.
 *
 * <p>Both the network stream and the PDF document are closed when the test
 * finishes, whether or not extraction succeeds.
 *
 * @throws Exception if the page cannot be opened or the PDF cannot be read
 */
@Test
public void testPDFReader() throws Exception {
    // page with example pdf document
    driver.get("http://www.vandevenbv.nl/dynamics/modules/SFIL0200/view.php?fil_Id=5515");

    URL url = new URL(driver.getCurrentUrl());

    // Resources are closed in reverse order: the document first, then the stream.
    try (BufferedInputStream fileToParse = new BufferedInputStream(url.openStream());
            PDDocument document = PDDocument.load(fileToParse)) {
        String output = new PDFTextStripper().getText(document);
        System.out.println(output);
    }
}

From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java

/**
 * Reads the given PDF bill, splits its text into non-empty lines and stores
 * the result of {@code parsePdf} in {@code billObj}.
 *
 * <p>The document is closed before the constructor returns, including when
 * text extraction or parsing throws.
 *
 * @param pdfFile the PDF file to read
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public JavaBillScrapper(File pdfFile) throws IOException {
    try (PDDocument doc = PDDocument.load(pdfFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        String rawText = stripper.getText(doc);
        // Split on runs of CR/LF so blank lines never produce empty entries in between.
        String[] textArray = rawText.split("[\\r\\n]+");
        this.billObj = parsePdf(textArray);
    }
}