List of usage examples for org.apache.pdfbox.text.PDFTextStripper#writeText
public void writeText(PDDocument doc, Writer outputStream) throws IOException
From source file:com.plumblarrick.andrew.cityrecordtextextractor.IssueExtractorPositional.java
public void extractToFile(String inFileName, String outFileName) throws IOException { this.inFileName = inFileName; this.outFileName = outFileName; try {/*from www . ja va2 s . c om*/ document = PDDocument.load(new File(inFileName)); PDFTextStripper stripper = new CRTStripper(); //stripper.setSortByPosition(true); stripper.setStartPage(0); stripper.setEndPage(document.getNumberOfPages()); fileOut = (new BufferedWriter(new PrintWriter(outFileName, "UTF-8"))); fileOut.write("Source file: " + inFileName + "\n"); stripper.writeText(document, fileOut); } finally { if (document != null) { document.close(); fileOut.flush(); fileOut.close(); } } }
From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java
License:Apache License
/**
 * Extracts the text of a PDF supplied as a stream.
 *
 * <p>The PDFBox text stripping runs on a daemon worker thread so it can be
 * abandoned if it does not finish within {@code timeout}. All calls are serialized
 * on {@code pdfBoxLockObj}.
 *
 * @param in     the PDF content; must not be {@code null}
 * @param params parameters from which the document password is obtained
 * @return the extracted text, passed through {@code extractMetadata}
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException       if extraction fails, times out, or is interrupted
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        final String password = getPassword(params);
        try (PDDocument document = PDDocument.load(in, password)) {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper();
            final AtomicBoolean done = new AtomicBoolean(false);
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(document, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    // Written last so the caller only reads exceptionSet after this is visible.
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join takes milliseconds, but the message below says "sec." - confirm the unit of timeout.
            task.join(timeout);
            if (!done.get()) {
                // Keep nudging the worker for up to ~5 seconds before giving up on it.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final InterruptedException e) {
            // Restore the interrupt flag so callers up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        }
    }
}
From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java
License:BSD License
/**
 * Parses the given bytes as a PDF and returns its text, with the stripper
 * configured to sort text by position.
 *
 * @param content raw bytes of the PDF
 * @return the text written by the stripper
 * @throws IOException if the PDF cannot be loaded or stripped
 */
private String extractPDFText(byte[] content) throws IOException {
    PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setSortByPosition(true);

    try (ByteArrayInputStream pdfBytes = new ByteArrayInputStream(content);
         Writer extracted = new StringWriter();
         PDDocument parsedPdf = PDDocument.load(pdfBytes)) {
        textStripper.writeText(parsedPdf, extracted);
        final String text = extracted.toString();
        return text;
    }
}
From source file:org.haplo.component.pdfbox.ConvertPDFToText.java
License:Mozilla Public License
/**
 * Loads the PDF at {@code inputPathname} and writes its text, encoded as UTF-8,
 * to the file at {@code outputPathname}.
 *
 * @throws Exception if the PDF cannot be loaded or the output cannot be written
 */
protected void performOperation() throws Exception {
    final File sourcePdf = new File(this.inputPathname);
    try (PDDocument pdf = PDDocument.load(sourcePdf)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        // The writer is declared after the stream, so it is closed (and flushed) first.
        try (FileOutputStream fileStream = new FileOutputStream(new File(this.outputPathname));
             OutputStreamWriter utf8Writer = new OutputStreamWriter(fileStream, "UTF-8")) {
            textStripper.writeText(pdf, utf8Writer);
        }
    }
}
From source file:org.haplo.component.pdfbox.TextExtractPDF.java
License:Mozilla Public License
/**
 * Loads the PDF at {@code getInputPathname()} and returns its text.
 *
 * @return the text written by the stripper for the whole document
 * @throws IOException if the PDF cannot be loaded or stripped
 */
protected String extract() throws IOException {
    final File sourcePdf = new File(getInputPathname());
    try (PDDocument pdf = PDDocument.load(sourcePdf)) {
        final PDFTextStripper textStripper = new PDFTextStripper();
        final StringWriter buffer = new StringWriter();
        textStripper.writeText(pdf, buffer);
        return buffer.toString();
    }
}
From source file:org.quelea.services.importexport.SurvivorSongbookParser.java
License:Open Source License
/**
 * Get the text on a page in the PDF document.
 * <p>
 * Apostrophe-like characters (the typographic right single quote and the
 * backtick) are normalised to a plain ASCII apostrophe.
 *
 * @param document the document.
 * @param stripper the PDF stripper used to get the text.
 * @param page the page number.
 * @return the text on the given page.
 * @throws IOException if something went wrong.
 */
private String getPageText(PDDocument document, PDFTextStripper stripper, int page) throws IOException {
    // Restrict the stripper to exactly one page.
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    StringWriter textWriter = new StringWriter();
    stripper.writeText(document, textWriter);
    // The first search string had been reduced to "", which makes String.replace insert
    // an apostrophe between every character. NOTE(review): restored as the right single
    // quotation mark (U+2019), presumably the character lost from the literal - confirm.
    return textWriter.toString().replace('\u2019', '\'').replace('`', '\'');
}
From source file:org.wildfly.camel.test.fop.FopIntegrationTest.java
License:Apache License
/**
 * Returns the text of the given document with leading and trailing
 * whitespace trimmed.
 *
 * @param document the already-loaded PDF document
 * @return the trimmed text written by the stripper
 * @throws IOException if stripping fails
 */
private String extractTextFromDocument(PDDocument document) throws IOException {
    final StringWriter buffer = new StringWriter();
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.writeText(document, buffer);
    final String rawText = buffer.toString();
    return rawText.trim();
}
From source file:uk.org.openeyes.PDFFunctions.java
/** * * @param PDFDoc/*from ww w. ja va 2 s .com*/ * @throws IOException */ public void dumpPDFStructure(PDDocument PDFDoc) throws IOException { PDFTextStripper stripper = new PDFFunctions(); stripper.setSortByPosition(true); stripper.setStartPage(0); stripper.setEndPage(PDFDoc.getNumberOfPages()); Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); stripper.writeText(PDFDoc, dummy); }