List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()
public PDFTextStripper() throws IOException
From source file:edu.esprit.filereader.PdfReader.java
public String ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse();//from www. j a va 2 s. co m cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:edu.umsl.runPDF.java
public void readPDF() throws IOException { System.out.println("Please enter PDF file location, omit extension: "); String input = sc.next();//from w w w . j a v a 2s. c o m pdfFile = new File(input); PDDocument pdDocument = PDDocument.load(pdfFile); PDFTextStripper strip = new PDFTextStripper(); // strip.setStartPage(1); // strip.setEndPage(1); content = strip.getText(pdDocument); System.out.println("PDF Read"); // System.out.println(content); // FileOutputStream outStream; // strip.writeText(txtFile, outStream); }
From source file:extractor.Extractor.java
/**
 * Loads each named PDF under {@code pathBase}, extracts its text and wraps it in a
 * {@link Document}.
 *
 * <p>Files that fail are skipped: a missing file is reported on standard output, any other
 * I/O failure is logged at SEVERE level. The returned list therefore may contain fewer
 * entries than {@code files}.
 *
 * @param pathBase prefix concatenated directly with each file name (no separator is inserted)
 * @param files names of the PDF files to read
 * @return the documents that were read successfully, in input order
 */
public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {
    ArrayList<Document> documents = new ArrayList<>();
    for (String file : files) {
        try {
            String paperString;
            // try-with-resources closes the PDF even when text extraction throws; the
            // original leaked the document on that path.
            try (PDDocument pdDocument = PDDocument.load(new File(pathBase + file))) {
                paperString = new PDFTextStripper().getText(pdDocument);
            }
            // Added only after the PDF has been closed successfully, as before.
            documents.add(new Document(paperString));
        } catch (FileNotFoundException ex) {
            System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
        } catch (IOException ex) {
            Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    return documents;
}
From source file:extractor.pdftotext.PdfToText.java
private String getPdfBoxRaw(File file) { try {/*from w w w. j a v a 2s. com*/ PDDocument doc = PDDocument.load(file); PDFTextStripper stripper = new PDFTextStripper(); stripper.setPageStart("PAGE START"); stripper.setPageEnd("PAGE END"); //gets the text form the doc and replaces unknown signs with \n String rawText = stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n"); doc.close(); return rawText; } catch (IOException ex) { Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex); } return ""; }
From source file:indexer.PDFTextExtractor.java
License:Open Source License
/**
 * Creates the extractor and tries to set up its {@code PDFTextStripper}.
 *
 * <p>If the stripper cannot be created, the failure is logged as an error and this constructor
 * does not assign the {@code stripper} field.
 */
public PDFTextExtractor() {
    try {
        this.stripper = new PDFTextStripper();
    } catch (IOException creationFailure) {
        log.error("Could not create PDF Text Stripper", creationFailure);
    }
}
From source file:io.cloudslang.content.utilities.services.PdfParseService.java
License:Open Source License
/**
 * Returns the text of the PDF obtained from {@code getPdfDocument(path, password)}.
 *
 * <p>The document is closed before this method returns.
 *
 * @param path passed through to {@code getPdfDocument}
 * @param password passed through to {@code getPdfDocument}
 * @return the text extracted by a fresh {@code PDFTextStripper}
 * @throws IOException if the document cannot be obtained or its text cannot be extracted
 */
public static String getPdfContent(final Path path, final String password) throws IOException {
    try (final PDDocument document = getPdfDocument(path, password)) {
        final PDFTextStripper textStripper = new PDFTextStripper();
        final String content = textStripper.getText(document);
        return content;
    }
}
From source file:it.myideas.bancamarcheextractor.Distinta.java
/**
 * Parses a PDF into a {@link Distinta}.
 *
 * <p>Two kinds of text lines are recognised:
 * <ul>
 *   <li>a line starting with "Tipo disposizione": the remainder, trimmed and lower-cased,
 *       becomes {@code tipoDisposizione};</li>
 *   <li>a line starting with "1 Esecuzione": the line is split on single spaces, token 2 is
 *       read as a {@code dd/MM/yyyy} date into {@code data}, and tokens from index 4 onward
 *       are lower-cased and joined with "_" into {@code beneficiario}.</li>
 * </ul>
 *
 * <p>Every parse failure - including a malformed "1 Esecuzione" line, which previously escaped
 * as an unchecked exception - is logged and reported by returning {@code null}.
 *
 * @param file the PDF file to parse
 * @return the parsed value, or {@code null} if the file could not be read or parsed
 */
public static Distinta parse(Path file) {
    try (PDDocument doc = PDDocument.load(file.toFile())) {
        Distinta distinta = new Distinta();
        PDFTextStripper stripper = new PDFTextStripper();
        String contents = stripper.getText(doc);

        log.debug("FILE:" + file.toString());
        log.debug(contents);

        DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("dd/MM/yyyy");
        for (String line : contents.split(stripper.getLineSeparator())) {
            if (line.startsWith("Tipo disposizione")) {
                distinta.tipoDisposizione = line.replace("Tipo disposizione", "").trim().toLowerCase();
            } else if (line.startsWith("1 Esecuzione")) {
                String[] p = line.split(" ");
                // copyOfRange(p, 4, ...) and p[2] both need at least four tokens; fail through
                // the regular IOException path instead of an unchecked exception.
                if (p.length < 4) {
                    throw new IOException(
                            "Malformed '1 Esecuzione' line in file " + file.toString() + ": " + line);
                }
                distinta.beneficiario = Arrays.stream(Arrays.copyOfRange(p, 4, p.length))
                        .map(String::toLowerCase)
                        .collect(Collectors.joining("_"));
                try {
                    distinta.data = LocalDate.parse(p[2], dateFormat);
                } catch (java.time.format.DateTimeParseException e) {
                    throw new IOException(
                            "Unparseable date '" + p[2] + "' in file " + file.toString(), e);
                }
            }
        }

        if (!isOk(distinta.beneficiario) || !isOk(distinta.tipoDisposizione) || distinta.data == null) {
            throw new IOException("Parser failure for file " + file.toString());
        }
        return distinta;
    } catch (IOException e) {
        log.error("Error parsing PDF", e);
        return null;
    }
}
From source file:javaapplication1.PDFManager.java
public Map<String, String> ToText() throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); // update for PDFBox V 2.0 parser.parse();//from w w w . j a v a 2s. c o m cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); // if you want specific number of pages pdfStripper.setStartPage(1); pdfStripper.setEndPage(1); Text = pdfStripper.getText(pdDoc); System.out.println(Text); // spilt String[] result = Text.split("\n"); Map<String, String> map = new HashMap<String, String>(); try { for (int j = 0; j < result.length; j++) { if (result[j].contains("Type")) { String x = result[j].substring(5); map.put("Type", x); } else if (result[j].contains("Document Number")) { String x = result[j].substring(16); map.put("Document Number", x); } else if (result[j].contains("Date of Birth")) { String x = result[j].substring(14); map.put("Date of Birth", x); } else if (result[j].contains("Date of Expiry")) { String x = result[j].substring(15); map.put("Date of Expiry", x); } else if (result[j].contains("Issuer")) { String x = result[j].substring(7); map.put("Issuer", x); } else if (result[j].contains("Nationality")) { String x = result[j].substring(12); map.put("Nationality", x); } else if (result[j].contains("First Names")) { String x = result[j].substring(12); map.put("First Names", x); } else if (result[j].contains("Last Names")) { String x = result[j].substring(11); map.put("Last Names", x); } else if (result[j].contains("Discretionary 1")) { String x = result[j].substring(16); map.put("Discretionary 1", x); } // else if (result[j].contains("Discretionary 2")) // { // // String x = result[j].substring(16); // map.put("Discretionary 2", x); // // } else if (result[j].contains("Gender")) { 
String x = result[j].substring(7); map.put("Gender", x); } } } catch (Exception e) { JOptionPane.showMessageDialog(null, "please selecet OCR PDF", "worng pass", JOptionPane.ERROR_MESSAGE); } return map; }
From source file:main.PdfReader.java
License:Apache License
@Test public void testPDFReader() throws Exception { // page with example pdf document driver.get("http://www.vandevenbv.nl/dynamics/modules/SFIL0200/view.php?fil_Id=5515"); URL url = new URL(driver.getCurrentUrl()); BufferedInputStream fileToParse = new BufferedInputStream(url.openStream()); PDDocument document = null;/*from w w w . j av a2 s . co m*/ try { document = PDDocument.load(fileToParse); String output = new PDFTextStripper().getText(document); System.out.println(output); } finally { if (document != null) { document.close(); } } }
From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java
/**
 * Reads the given PDF, splits its text into non-empty lines and stores the result of
 * {@code parsePdf} in {@code billObj}.
 *
 * @param pdfFile the PDF file to read
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
public JavaBillScrapper(File pdfFile) throws IOException {
    // try-with-resources closes the document even when extraction or parsing throws; the
    // original only closed it on the success path.
    try (PDDocument doc = PDDocument.load(pdfFile)) {
        PDFTextStripper stripper = new PDFTextStripper();
        String rawText = stripper.getText(doc);
        // Runs of CR/LF are treated as one separator, so blank lines are dropped.
        String[] textArray = rawText.split("[\\r\\n]+");
        this.billObj = parsePdf(textArray);
    }
}