List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()
public PDFTextStripper() throws IOException
From source file:ExtractTextFromPdf.java
public static void main(String[] args) { PDFParser parser = null;//from ww w . j a v a2 s. c om PDDocument pdDoc = null; COSDocument cosDoc = null; PDFTextStripper pdfStripper; String parsedText; String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf"; File file = new File(fileName); try { byte data[] = new byte[1024]; ((RandomAccessRead) file).read(data, 0, 1024); pdDoc = PDDocument.load(new File(fileName)); pdfStripper = new PDFTextStripper(); parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", "")); } catch (Exception e) { e.printStackTrace(); try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e1) { e.printStackTrace(); } } }
From source file:com.enginkutuk.pdfboxsample.PDFBoxSample.java
/**
 * Loads the PDF at {@code path} and, unless it is encrypted, prints its full text
 * to standard output and shows it in a message dialog.
 *
 * <p>Any exception is caught and its stack trace printed; nothing is rethrown.
 *
 * @param path file system path of the PDF to read
 */
public void readPdfFile(String path) {
    // try-with-resources: the document was previously never closed.
    try (PDDocument document = PDDocument.load(new File(path))) {
        if (!document.isEncrypted()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            String st = textStripper.getText(document);
            System.out.println(st);
            JOptionPane.showMessageDialog(null, st);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.formkiq.core.service.conversion.PdfToHtmlFormatConverter.java
License:Apache License
@Override public ConversionResult convert(final Object data, final WorkflowOutputDocumentType inputType, final WorkflowOutputDocumentType outputType) throws IOException { PDDocument doc = (PDDocument) data;//from w w w . jav a 2 s . c om String s = new PDFTextStripper().getText(doc); return new ConversionResult(Strings.getBytes(s)); }
From source file:com.jt.tool.pdf.CreateBookmarks.java
License:Apache License
/**
 * Extracts the text of a range of pages from a document.
 *
 * @param document the document to read
 * @param start    page number handed to {@code setStartPage}
 * @param offset   added to {@code start} to form the page number handed to
 *                 {@code setEndPage}
 * @return the text the stripper produces for that page range
 * @throws Exception if the stripper cannot be created or extraction fails
 */
public static String getPageText(PDDocument document, int start, int offset) throws Exception {
    final int lastPage = start + offset;
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setStartPage(start);
    textStripper.setEndPage(lastPage);
    return textStripper.getText(document);
}
From source file:com.jubination.backend.service.thyrocare.report.parallel.worker.PDFParserBox.java
/**
 * Downloads the PDF at {@code url} and returns the text of all its pages.
 *
 * <p>The result is also stored in the {@code Text} field, and the {@code pdfStripper}
 * and {@code pdDoc} fields are overwritten. The document is closed before this method
 * returns; a failure while closing it is printed and otherwise ignored.
 *
 * @param url location of the PDF to read
 * @return the extracted text
 * @throws IOException if the URL cannot be opened or the PDF cannot be loaded or read
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    // The URL stream was previously never closed; try-with-resources releases it
    // after the document below has been closed.
    try (java.io.InputStream pdfStream = new URL(url).openStream()) {
        try {
            pdDoc = PDDocument.load(pdfStream);
            pdfStripper = new PDFTextStripper();
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(pdDoc.getNumberOfPages());
            Text = pdfStripper.getText(pdDoc);
            return Text;
        } finally {
            // Best-effort close, as before: report but do not propagate.
            if (pdDoc != null) {
                try {
                    pdDoc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
From source file:com.jubinationre.controller.PDFReportAPIController.java
public String ToText(String url) throws IOException { this.pdfStripper = null; this.pdDoc = null; pdDoc = PDDocument.load(new URL(url).openStream()); pdDoc.getClass();//from ww w. ja v a2s . c o m pdfStripper = new PDFTextStripper() { @Override protected void processTextPosition(TextPosition text) { // if(text.getFont().getName().endsWith("Bold")){ super.processTextPosition(text); // } } }; // pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); //pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); return Text; }
From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java
License:Open Source License
private String getPDFText(InputStream inputStream) throws IOException { String text = ""; PDDocument pdDocument = null;// w w w . j a va 2 s .com try { pdDocument = PDDocument.load(inputStream); PDFTextStripper pdfTextStripper = new PDFTextStripper(); text = pdfTextStripper.getText(pdDocument); } finally { ClosableUtil.close(pdDocument); ClosableUtil.close(inputStream); } return text; }
From source file:com.ning.billing.recurly.TestRecurlyClient.java
License:Apache License
@Test(groups = "integration") public void testCreateInvoiceAndRetrieveInvoicePdf() throws Exception { final Account accountData = TestUtils.createRandomAccount(); PDDocument pdDocument = null;/* w w w . j a v a 2 s . c om*/ try { // Create a user final Account account = recurlyClient.createAccount(accountData); // Create an Adjustment final Adjustment a = new Adjustment(); a.setUnitAmountInCents(150); a.setCurrency(CURRENCY); final Adjustment createdA = recurlyClient.createAccountAdjustment(accountData.getAccountCode(), a); // Post an invoice/invoice the adjustment final Invoice invoiceData = new Invoice(); invoiceData.setCollectionMethod("manual"); invoiceData.setLineItems(null); final Invoice invoice = recurlyClient.postAccountInvoice(accountData.getAccountCode(), invoiceData) .getChargeInvoice(); Assert.assertNotNull(invoice); InputStream pdfInputStream = recurlyClient.getInvoicePdf(invoice.getId()); Assert.assertNotNull(pdfInputStream); pdDocument = PDDocument.load(pdfInputStream); String pdfString = new PDFTextStripper().getText(pdDocument); Assert.assertNotNull(pdfString); Assert.assertTrue(pdfString.contains("Invoice # " + invoice.getId())); Assert.assertTrue(pdfString.contains("Subtotal $" + 1.5)); // Attempt to close the invoice final Invoice closedInvoice = recurlyClient.markInvoiceSuccessful(invoice.getId()); Assert.assertEquals(closedInvoice.getState(), "paid", "Invoice not closed successfully"); } finally { if (pdDocument != null) { pdDocument.close(); } // Close the account recurlyClient.closeAccount(accountData.getAccountCode()); } }
From source file:com.pluszero.rostertogo.PdfManager.java
private void ToText(File file) throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; parser = new PDFParser(file); // for pfdBox 1.8, as 2.0 not yet supported in Android parser.parse();/*from ww w . j a va2 s. c o m*/ cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); text = pdfStripper.getText(pdDoc); pdDoc.close(); }
From source file:com.pluszero.rostertogo.PdfManager.java
private void ToText(InputStream is) throws IOException { this.pdfStripper = null; this.pdDoc = null; this.cosDoc = null; parser = new PDFParser(is); // for PdfBox 1.8 as 2.0 not yet supported in Android parser.parse();//from w ww .j a v a 2s. c om cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); text = pdfStripper.getText(pdDoc); pdDoc.close(); }