List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()
public PDFTextStripper() throws IOException
From source file:net.anthonypoon.billscrapper.JavaBillScrapper.java
public static void main(String[] args) { // TODO code application logic here try {/*from w w w.ja va 2 s . com*/ for (String arg : args) { if (!arg.startsWith("-")) { filePaths.add(arg); } else { try { options.add(Flags.fromString(arg)); } catch (IllegalArgumentException ex) { System.err.println("Illegal options: " + arg); } } } Collections.sort(filePaths); for (String filePath : filePaths) { System.out.println("Loading: " + filePath); PDDocument doc = PDDocument.load(new File(filePath)); PDFTextStripper stripper = new PDFTextStripper(); String rawText = stripper.getText(doc); String[] textArray = rawText.split("[\\r\\n]+"); Bill bill = parsePdf(textArray); if (options.contains(Flags.INSERT_INTO_DB)) { DatabaseConnector db = new DatabaseConnector(); DbWriter writer = new DbWriter(db.getConnection()); boolean isInserted = writer.insertDetail(bill.getBillSummary(), bill.getPhoneSummaryData(), bill.getPhoneDetail()); writer.commit(); doc.close(); if (!isInserted) { System.out.println(filePath + " was not inserted into database."); } } } } catch (Exception ex) { ex.printStackTrace(System.out); } }
From source file:neuralclassification.Classificator.java
String readText(String filepath, String name) { PDDocument pdfDocument = null;//from www . ja va 2s . c o m String paper = null; try { pdfDocument = PDDocument.load(new File(filepath + "/" + name)); PDFTextStripper stripper = new PDFTextStripper(); paper = stripper.getText(pdfDocument); } catch (IOException e) { throw new RuntimeException(e); } finally { if (pdfDocument != null) try { pdfDocument.close(); } catch (IOException e) { throw new RuntimeException(e); } } return paper; }
From source file:neuralclassification.Trainer.java
/**
 * Extracts the text of the PDF file {@code name} found under the
 * {@code filepath} field of this object.
 *
 * @param name file name of the PDF; joined to {@code filepath} with {@code "/"}
 * @return the text returned by {@code PDFTextStripper.getText}
 * @throws RuntimeException wrapping the {@link IOException} raised while
 *         loading, reading or closing the document
 */
String readText(String name) {
    // try-with-resources always closes the document and, unlike a throwing
    // finally block, keeps the load/extract failure as the primary exception
    // (a close() failure is attached as suppressed instead of replacing it).
    try (PDDocument pdfDocument = PDDocument.load(new File(filepath + "/" + name))) {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(pdfDocument);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:opennlp.PDFTools.java
public String getStringFromPDF(String filePath) { String text = null;//from ww w . ja v a 2s . c o m try { File file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file, "r")); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); //pdfStripper.setEndPage(10); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); text = pdfStripper.getText(pdDoc); } catch (IOException e) { logger.error("IO ERROR", e); } catch (Exception ex) { logger.error("ERROR", ex); } return text; }
From source file:org.apache.james.mailbox.store.search.PDFTextExtractor.java
License:Apache License
private ParsedContent extractTextFromPDF(InputStream inputStream) throws IOException { return new ParsedContent(Optional.ofNullable(new PDFTextStripper().getText(PDDocument.load(inputStream))), ImmutableMap.of());//from w w w .j a v a 2 s . c o m }
From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java
License:Apache License
/**
 * Extracts the text of a PDF read from {@code in}, giving up if PDFBox does
 * not finish within {@code timeout}.
 *
 * <p>The extraction itself runs on a separate daemon thread while this method
 * waits for it; the whole operation is performed while holding
 * {@code pdfBoxLockObj}.
 *
 * @param in     stream providing the PDF; must not be {@code null}
 * @param params parameters passed to {@code getPassword} to obtain the document password
 * @return the extracted text, after {@code extractMetadata} has been applied to it
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException if the extraction does not finish in time or any
 *         exception is raised while loading or processing the document
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    // Only one thread at a time may run the section guarded by pdfBoxLockObj.
    synchronized (pdfBoxLockObj) {
        final String password = getPassword(params);
        try (PDDocument document = PDDocument.load(in, password)) {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper();
            // Set by the worker when it ends, whether it succeeded or failed.
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            // Carries an exception raised on the worker thread back to this thread.
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(doc, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // Thread.join(long) waits at most the given number of milliseconds.
            // NOTE(review): the timeout message below says "sec." - confirm the unit of `timeout`.
            task.join(timeout);
            if (!done.get()) {
                // Worker is still running: interrupt it repeatedly, up to 100 times
                // at 50 ms intervals, stopping early once it reports completion.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                // Thrown inside the try block, so the catch below wraps it in a second
                // ExtractException (assuming ExtractException is an Exception subclass).
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                // Rethrow the worker's failure on the calling thread.
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final Exception e) {
            // NOTE(review): this also catches InterruptedException from join()/sleep()
            // without restoring the thread's interrupt status - confirm that is intended.
            throw new ExtractException(e);
        }
    }
}
From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java
License:BSD License
private String extractPDFText(byte[] content) throws IOException { PDFTextStripper pts = new PDFTextStripper(); pts.setSortByPosition(true);//from ww w . j av a2 s .c om try (ByteArrayInputStream source = new ByteArrayInputStream(content); Writer writer = new StringWriter(); PDDocument pdfDoc = PDDocument.load(source)) { pts.writeText(pdfDoc, writer); return writer.toString(); } }
From source file:org.grouplens.samantha.modeler.dao.PdfFileDAO.java
License:Open Source License
/**
 * Creates a DAO over the PDF at {@code filePath}: creates the text stripper,
 * loads the document and records its number of pages.
 *
 * @param filePath path of the PDF file to load
 * @throws BadRequestException wrapping the {@link IOException} raised when the
 *         stripper cannot be created or the document cannot be loaded
 */
private PdfFileDAO(String filePath) {
    try {
        this.stripper = new PDFTextStripper();
        this.pdfDoc = PDDocument.load(new File(filePath));
        // Only reached when loading succeeded; the catch below always throws.
        this.numPages = this.pdfDoc.getNumberOfPages();
    } catch (IOException loadFailure) {
        throw new BadRequestException(loadFailure);
    }
}
From source file:org.haplo.component.pdfbox.ConvertPDFToText.java
License:Mozilla Public License
protected void performOperation() throws Exception { try (PDDocument pdf = PDDocument.load(new File(this.inputPathname))) { PDFTextStripper stripper = new PDFTextStripper(); try (FileOutputStream out = new FileOutputStream(new File(this.outputPathname))) { try (OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8")) { stripper.writeText(pdf, writer); }//from w w w . j av a 2 s .co m } } }
From source file:org.haplo.component.pdfbox.TextExtractPDF.java
License:Mozilla Public License
/**
 * Extracts the text of the PDF located at {@code getInputPathname()}.
 *
 * @return the text the stripper wrote for the document
 * @throws IOException if the document cannot be loaded, processed or closed
 */
protected String extract() throws IOException {
    final File input = new File(getInputPathname());
    try (PDDocument pdf = PDDocument.load(input)) {
        final PDFTextStripper stripper = new PDFTextStripper();
        final StringWriter buffer = new StringWriter();
        stripper.writeText(pdf, buffer);
        return buffer.toString();
    }
}