List of usage examples for org.apache.pdfbox.text.PDFTextStripper#getText
public String getText(PDDocument doc) throws IOException
From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java
License:Open Source License
private String pdftoText(String fileName) throws ActionException { File pdfFile = new File(fileName); if (!pdfFile.isFile()) { LOGGER.error("The file: " + fileName + " does not exist."); throw new ActionException("Unable to parse the pdf file."); }//from w w w . j a v a2 s . c om PDFParser parser = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; PDDocument pdDoc = null; String parsedText; try { parser = new PDFParser(new RandomAccessBufferedFileInputStream(new FileInputStream(pdfFile))); } catch (Exception e) { LOGGER.error("Unable to open PDF Parser.: " + e); e.printStackTrace(); throw new ActionException("Unable to parse the pdf file."); } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { LOGGER.error("An exception occured in parsing the PDF Document."); e.printStackTrace(); throw new ActionException("Unable to parse the pdf file. " + e); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); String password = ""; // extract PDF document's textual content try {/* w w w.j a va 2 s . c o m*/ PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } return docText; }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
/** * /*from w ww . j av a2 s .co m*/ * * @throws Exception. */ private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception { StringBuffer docText = new StringBuffer(); PDDocument pdDoc = null; String password = ""; // extract PDF document's textual content try { PDFTextStripper stripper = new PDFTextStripper(/*"UTF-8"*/); int page = Integer.parseInt(pageNum); if (page != -1) { stripper.setStartPage(page); stripper.setEndPage(page); } //password pdDoc = PDDocument.load(new ByteArrayInputStream(doc), password); // new PDDocument(cosDoc); docText = new StringBuffer(stripper.getText(pdDoc)); } catch (IOException e) { throw new Exception("Cannot parse PDF document", e); } finally { closePDDocument(pdDoc); } return docText; }
From source file:edu.umsl.runPDF.java
public void readPDF() throws IOException { System.out.println("Please enter PDF file location, omit extension: "); String input = sc.next();// ww w . j a va 2s . c om pdfFile = new File(input); PDDocument pdDocument = PDDocument.load(pdfFile); PDFTextStripper strip = new PDFTextStripper(); // strip.setStartPage(1); // strip.setEndPage(1); content = strip.getText(pdDocument); System.out.println("PDF Read"); // System.out.println(content); // FileOutputStream outStream; // strip.writeText(txtFile, outStream); }
From source file:extractor.pdftotext.PdfToText.java
private String getPdfBoxRaw(File file) { try {/* ww w.ja va 2s .co m*/ PDDocument doc = PDDocument.load(file); PDFTextStripper stripper = new PDFTextStripper(); stripper.setPageStart("PAGE START"); stripper.setPageEnd("PAGE END"); //gets the text form the doc and replaces unknown signs with \n String rawText = stripper.getText(doc).replaceAll("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cn}]", "\n"); doc.close(); return rawText; } catch (IOException ex) { Logger.getLogger(PdfToText.class.getName()).log(Level.SEVERE, null, ex); } return ""; }
From source file:it.myideas.bancamarcheextractor.Distinta.java
public static Distinta parse(Path file) { try (PDDocument doc = PDDocument.load(file.toFile())) { Distinta distinta = new Distinta(); PDFTextStripper stripper = new PDFTextStripper(); String contents = stripper.getText(doc); Stream<String> lines = Arrays.stream(contents.split(stripper.getLineSeparator())); log.debug("FILE:" + file.toString()); log.debug(contents);// w w w .j a v a 2 s .c om lines.forEach(line -> { if (line.startsWith("Tipo disposizione")) { distinta.tipoDisposizione = line.replace("Tipo disposizione", "").trim().toLowerCase(); } else if (line.startsWith("1 Esecuzione")) { String[] p = line.split(" "); distinta.beneficiario = Arrays.stream(Arrays.copyOfRange(p, 4, p.length)) .map(String::toLowerCase).collect(Collectors.joining("_")); distinta.data = LocalDate.parse(p[2], DateTimeFormatter.ofPattern("dd/MM/yyyy")); } }); if (!isOk(distinta.beneficiario) || !isOk(distinta.tipoDisposizione) || distinta.data == null) { throw new IOException("Parser failure for file " + file.toString()); } return distinta; } catch (IOException e) { log.error("Error parsing PDF", e); return null; } }
From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java
/**
 * Loads the given PDF, splits its text into non-empty lines and stores the result of
 * {@code parsePdf} in {@code billObj}.
 *
 * <p>The document is closed before the constructor returns, including when extraction or
 * parsing fails.
 *
 * @param pdfFile the bill PDF to read
 * @throws IOException if the file cannot be loaded, stripped or closed
 */
public JavaBillScrapper(File pdfFile) throws IOException {
    PDDocument doc = PDDocument.load(pdfFile);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        String rawText = stripper.getText(doc);
        // Split on any run of CR/LF so blank lines do not produce empty entries.
        String[] textArray = rawText.split("[\\r\\n]+");
        this.billObj = parsePdf(textArray);
    } finally {
        doc.close();
    }
}
From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java
public static void main(String[] args) { // TODO code application logic here try {/* ww w . j a v a 2s . c o m*/ for (String arg : args) { if (!arg.startsWith("-")) { filePaths.add(arg); } else { try { options.add(Flags.fromString(arg)); } catch (IllegalArgumentException ex) { System.err.println("Illegal options: " + arg); } } } Collections.sort(filePaths); for (String filePath : filePaths) { System.out.println("Loading: " + filePath); PDDocument doc = PDDocument.load(new File(filePath)); PDFTextStripper stripper = new PDFTextStripper(); String rawText = stripper.getText(doc); String[] textArray = rawText.split("[\\r\\n]+"); Bill bill = parsePdf(textArray); if (options.contains(Flags.INSERT_INTO_DB)) { DatabaseConnector db = new DatabaseConnector(); DbWriter writer = new DbWriter(db.getConnection()); boolean isInserted = writer.insertDetail(bill.getBillSummary(), bill.getPhoneSummaryData(), bill.getPhoneDetail()); writer.commit(); doc.close(); if (!isInserted) { System.out.println(filePath + " was not inserted into database."); } } } } catch (Exception ex) { ex.printStackTrace(System.out); } }
From source file:neuralclassification.Classificator.java
String readText(String filepath, String name) { PDDocument pdfDocument = null;//from w w w. jav a2s . com String paper = null; try { pdfDocument = PDDocument.load(new File(filepath + "/" + name)); PDFTextStripper stripper = new PDFTextStripper(); paper = stripper.getText(pdfDocument); } catch (IOException e) { throw new RuntimeException(e); } finally { if (pdfDocument != null) try { pdfDocument.close(); } catch (IOException e) { throw new RuntimeException(e); } } return paper; }
From source file:neuralclassification.Trainer.java
/**
 * Reads the full text of the PDF named {@code name} inside the directory held in the
 * {@code filepath} field.
 *
 * @param name file name inside {@code filepath}
 * @return the text extracted by a default {@code PDFTextStripper}
 * @throws RuntimeException wrapping any {@code IOException} raised while loading, stripping
 *         or closing the document
 */
String readText(String name) {
    final File source = new File(filepath + "/" + name);
    PDDocument document = null;
    try {
        document = PDDocument.load(source);
        return new PDFTextStripper().getText(document);
    } catch (IOException loadOrStripFailure) {
        throw new RuntimeException(loadOrStripFailure);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                throw new RuntimeException(closeFailure);
            }
        }
    }
}