Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

On this page you can find example usages of the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper().

Prototype

public PDFTextStripper() throws IOException 

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:edu.esprit.filereader.PdfReader.java

/**
 * Parses the PDF located at {@code filePath} and extracts the text of
 * pages 1 through 10.
 *
 * <p>The parser, COS document, PD document, stripper and extracted text are
 * all stored in fields of this object; the document is not closed here.
 *
 * @return the extracted text (also stored in the {@code Text} field)
 * @throws IOException if the file cannot be opened, parsed or read
 */
public String ToText() throws IOException {
    // Discard whatever a previous call left behind.
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    // Open and parse the file.
    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r"));
    parser.parse();

    // Build the document model and the stripper that will read it.
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();

    // Page window to extract. To read the whole file, use
    // pdDoc.getNumberOfPages() as the last page instead.
    final int firstPage = 1;
    final int lastPage = 10;
    pdfStripper.setStartPage(firstPage);
    pdfStripper.setEndPage(lastPage);

    Text = pdfStripper.getText(pdDoc);
    return Text;
}

From source file:edu.umsl.runPDF.java

/**
 * Prompts on standard output for a PDF location, reads the next token from
 * {@code sc}, loads that file and stores its full text in {@code content}.
 *
 * <p>The loaded document is closed before this method returns, including
 * when text extraction fails.
 *
 * @throws IOException if the file cannot be loaded or its text cannot be read
 */
public void readPDF() throws IOException {
    System.out.println("Please enter PDF file location, omit extension: ");
    // NOTE(review): the prompt asks the user to omit the extension, but the
    // token is used verbatim as the path — confirm this is intended.
    String input = sc.next();
    pdfFile = new File(input);

    // try-with-resources releases the document even if getText() throws.
    try (PDDocument pdDocument = PDDocument.load(pdfFile)) {
        PDFTextStripper strip = new PDFTextStripper();
        content = strip.getText(pdDocument);
    }
    System.out.println("PDF Read");
}

From source file:extractor.Extractor.java

/**
 * Loads each named PDF below {@code pathBase}, extracts its text and wraps
 * the text in a {@link Document}.
 *
 * <p>Files that cannot be found are reported on standard output; other I/O
 * failures are logged at SEVERE level. In both cases the file is skipped and
 * processing continues with the next one.
 *
 * @param pathBase prefix prepended verbatim to every entry of {@code files}
 * @param files    file names to load, each appended to {@code pathBase}
 * @return one {@link Document} per file that was read successfully
 */
public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {

    ArrayList<Document> documents = new ArrayList<>();

    for (String file : files) {
        // try-with-resources guarantees the PDF is closed even when text
        // extraction throws; the original only closed it on the happy path.
        try (PDDocument pdDocument = PDDocument.load(new File(pathBase + file))) {
            String paperString = new PDFTextStripper().getText(pdDocument);
            documents.add(new Document(paperString));
        } catch (FileNotFoundException ex) {
            System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
        } catch (IOException ex) {
            Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    return documents;
}

From source file:extractor.pdftotext.PdfToText.java

/**
 * Extracts the raw text of a PDF, marking page boundaries with the literal
 * strings {@code "PAGE START"} and {@code "PAGE END"}.
 *
 * <p>Every character in the Unicode categories Cc (control), Cf (format),
 * Co (private use) and Cn (unassigned) is replaced by a newline.
 *
 * @param file the PDF file to read
 * @return the processed text, or an empty string if an {@link IOException}
 *         occurred (the exception is logged at SEVERE level)
 */
private String getPdfBoxRaw(File file) {
    // try-with-resources closes the document even if extraction fails;
    // the original leaked it whenever getText() threw.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();

        stripper.setPageStart("PAGE START");
        stripper.setPageEnd("PAGE END");
        // Get the text from the document and replace unknown signs with \n.
        return stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n");
    } catch (IOException ex) {
        Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex);
    }
    return "";
}

From source file:indexer.PDFTextExtractor.java

License:Open Source License

/**
 * Creates the extractor and builds its {@link PDFTextStripper} up front.
 *
 * <p>If the stripper cannot be constructed the failure is logged and this
 * constructor leaves the {@code stripper} field unassigned.
 */
public PDFTextExtractor() {
    try {
        stripper = new PDFTextStripper();
    } catch (IOException creationFailure) {
        log.error("Could not create PDF Text Stripper", creationFailure);
    }
}

From source file:io.cloudslang.content.utilities.services.PdfParseService.java

License:Open Source License

/**
 * Returns the text of the PDF obtained from {@code getPdfDocument(path, password)}.
 *
 * <p>The document is closed before this method returns.
 *
 * @param path     passed through to {@code getPdfDocument}
 * @param password passed through to {@code getPdfDocument}
 * @return the text extracted from the document
 * @throws IOException if the document cannot be obtained or read
 */
public static String getPdfContent(final Path path, final String password) throws IOException {
    try (final PDDocument pdf = getPdfDocument(path, password)) {
        final PDFTextStripper textStripper = new PDFTextStripper();
        final String extracted = textStripper.getText(pdf);
        return extracted;
    }
}

From source file:it.myideas.bancamarcheextractor.Distinta.java

/**
 * Builds a {@link Distinta} from the text of a PDF file.
 *
 * <p>Two kinds of line are recognised:
 * <ul>
 *   <li>a line starting with {@code "Tipo disposizione"}: the rest of the
 *       line, trimmed and lower-cased, becomes {@code tipoDisposizione};</li>
 *   <li>a line starting with {@code "1 Esecuzione"}: split on single spaces,
 *       token 2 is parsed as a {@code dd/MM/yyyy} date into {@code data} and
 *       tokens 4 onward are lower-cased and joined with {@code "_"} into
 *       {@code beneficiario}.</li>
 * </ul>
 *
 * @param file path of the PDF to parse
 * @return the populated {@link Distinta}, or {@code null} if an
 *         {@link IOException} occurred or a required field was not found
 */
public static Distinta parse(Path file) {

    try (PDDocument doc = PDDocument.load(file.toFile())) {

        Distinta result = new Distinta();

        PDFTextStripper textStripper = new PDFTextStripper();
        String contents = textStripper.getText(doc);
        String[] rows = contents.split(textStripper.getLineSeparator());

        log.debug("FILE:" + file.toString());
        log.debug(contents);

        for (String row : rows) {
            if (row.startsWith("Tipo disposizione")) {
                result.tipoDisposizione = row.replace("Tipo disposizione", "").trim().toLowerCase();
                continue;
            }
            if (!row.startsWith("1 Esecuzione")) {
                continue;
            }

            String[] tokens = row.split(" ");

            // Everything from the fifth token on is the beneficiary name.
            String[] nameTokens = Arrays.copyOfRange(tokens, 4, tokens.length);
            result.beneficiario = Arrays.stream(nameTokens)
                    .map(String::toLowerCase)
                    .collect(Collectors.joining("_"));

            DateTimeFormatter dayMonthYear = DateTimeFormatter.ofPattern("dd/MM/yyyy");
            result.data = LocalDate.parse(tokens[2], dayMonthYear);
        }

        // All three fields are required; a missing one is treated as a parse failure.
        boolean complete = isOk(result.beneficiario)
                && isOk(result.tipoDisposizione)
                && result.data != null;
        if (!complete) {
            throw new IOException("Parser failure for file " + file.toString());
        }

        return result;
    } catch (IOException e) {
        log.error("Error parsing PDF", e);
        return null;
    }
}

From source file:javaapplication1.PDFManager.java

/**
 * Parses the PDF at {@code filePath}, extracts the text of page 1, prints it
 * to standard output and collects labelled values from it.
 *
 * <p>The text is split on {@code "\n"}. For each line, the first label (in
 * the order listed in the method body) that the line contains is used as the
 * map key; the value is the line with its first {@code label.length() + 1}
 * characters removed. If any exception is thrown while scanning the lines, an
 * error dialog is shown and the entries collected so far are returned.
 *
 * @return map from label to the value found on the matching line
 * @throws IOException if the file cannot be opened, parsed or read
 */
public Map<String, String> ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0
    parser.parse();

    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();

    // Only the first page is read. For the whole file use
    // pdfStripper.setEndPage(pdDoc.getNumberOfPages()) instead.
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(1);
    Text = pdfStripper.getText(pdDoc);
    System.out.println(Text);

    String[] rows = Text.split("\n");
    Map<String, String> fields = new HashMap<String, String>();

    // Order matters: only the first label contained in a line is used.
    final String[] labels = {
        "Type",
        "Document Number",
        "Date of Birth",
        "Date of Expiry",
        "Issuer",
        "Nationality",
        "First Names",
        "Last Names",
        "Discretionary 1",
        "Gender"
    };

    try {
        for (String row : rows) {
            for (String label : labels) {
                if (row.contains(label)) {
                    // Skip the label plus the one character that follows it.
                    String value = row.substring(label.length() + 1);
                    fields.put(label, value);
                    break;
                }
            }
        }
    } catch (Exception e) {
        JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE);
    }
    return fields;
}

From source file:main.PdfReader.java

License:Apache License

/**
 * Navigates the driver to a page that serves an example PDF, downloads the
 * document from the driver's current URL and prints its text to standard
 * output.
 *
 * <p>Both the download stream and the loaded document are closed when the
 * test finishes, whether or not extraction succeeds.
 *
 * @throws Exception if navigation, download or text extraction fails
 */
@Test
public void testPDFReader() throws Exception {
    // page with example pdf document
    driver.get("http://www.vandevenbv.nl/dynamics/modules/SFIL0200/view.php?fil_Id=5515");

    URL url = new URL(driver.getCurrentUrl());

    // Resources are closed in reverse order: the document first, then the
    // stream. The original never closed the stream.
    try (BufferedInputStream fileToParse = new BufferedInputStream(url.openStream());
            PDDocument document = PDDocument.load(fileToParse)) {
        String output = new PDFTextStripper().getText(document);
        System.out.println(output);
    }
}

From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java

/**
 * Reads the text of the given PDF, splits it into lines and stores the result
 * of {@code parsePdf} in {@code billObj}.
 *
 * <p>Lines are split on runs of carriage-return and line-feed characters, so
 * the split itself does not produce empty strings between lines.
 *
 * @param pdfFile the PDF bill to read
 * @throws IOException if the file cannot be loaded or its text cannot be read
 */
public JavaBillScrapper(File pdfFile) throws IOException {
    String rawText;
    // try-with-resources closes the document even if extraction throws;
    // the original leaked it when getText() or parsePdf() failed.
    try (PDDocument doc = PDDocument.load(pdfFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        rawText = stripper.getText(doc);
    }
    // Parsing only needs the extracted text, so the document is already released.
    String[] textArray = rawText.split("[\\r\\n]+");
    this.billObj = parsePdf(textArray);
}