Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

On this page you can find example usages of the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper().

Prototype

public PDFTextStripper() throws IOException 

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:ExtractTextFromPdf.java

/**
 * Extracts the text of a hard-coded PDF file and prints it to standard output,
 * keeping only ASCII letters, digits, dots and spaces.
 *
 * <p>Any failure while loading or reading the document is reported by printing
 * the stack trace; the document is always closed.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf";

    // try-with-resources closes the document on both the success and the failure path.
    // The former (RandomAccessRead) cast of a java.io.File always failed with a
    // ClassCastException before any text was read, so it has been removed.
    try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        String parsedText = pdfStripper.getText(pdDoc);
        System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.enginkutuk.pdfboxsample.PDFBoxSample.java

/**
 * Loads the PDF at {@code path} and, if it is not encrypted, prints its text to
 * standard output and shows it in a message dialog.
 *
 * <p>Any exception is caught and its stack trace printed. The document is closed
 * when the method finishes, whether or not extraction succeeded.
 *
 * @param path file-system path of the PDF to read
 */
public void readPdfFile(String path) {
    // try-with-resources releases the document; it was previously never closed.
    try (PDDocument document = PDDocument.load(new File(path))) {
        if (!document.isEncrypted()) {
            PDFTextStripper stripper = new PDFTextStripper();
            String st = stripper.getText(document);
            System.out.println(st);
            JOptionPane.showMessageDialog(null, st);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.formkiq.core.service.conversion.PdfToHtmlFormatConverter.java

License:Apache License

/**
 * Converts a PDF document into a {@link ConversionResult} built from its extracted text.
 *
 * @param data the document to convert; cast to {@link PDDocument} without a prior type check
 * @param inputType not read by this implementation
 * @param outputType not read by this implementation
 * @return a result created from {@code Strings.getBytes} of the extracted text
 * @throws IOException if the text stripper cannot be created or extraction fails
 */
@Override
public ConversionResult convert(final Object data, final WorkflowOutputDocumentType inputType,
        final WorkflowOutputDocumentType outputType) throws IOException {

    final PDDocument pdf = (PDDocument) data;
    final PDFTextStripper stripper = new PDFTextStripper();
    final String extracted = stripper.getText(pdf);
    return new ConversionResult(Strings.getBytes(extracted));
}

From source file:com.jt.tool.pdf.CreateBookmarks.java

License:Apache License

/**
 * Extracts the text of a range of pages from the given document.
 *
 * @param document the document to read
 * @param start    the page number passed to {@code setStartPage}
 * @param offset   added to {@code start} to obtain the page number passed to {@code setEndPage}
 * @return the text the stripper returns for that page range
 * @throws Exception if the stripper cannot be created or extraction fails
 */
public static String getPageText(PDDocument document, int start, int offset) throws Exception {
    final int lastPage = start + offset;
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setStartPage(start);
    textStripper.setEndPage(lastPage);
    final String pageText = textStripper.getText(document);
    return pageText;
}

From source file:com.jubination.backend.service.thyrocare.report.parallel.worker.PDFParserBox.java

/**
 * Downloads the PDF at {@code url} and returns the text of all of its pages.
 *
 * <p>The {@code pdfStripper}, {@code pdDoc} and {@code Text} fields are overwritten
 * as a side effect. The document is closed before returning, so {@code pdDoc}
 * refers to a closed document afterwards.
 *
 * @param url location of the PDF, in a form accepted by {@link URL}
 * @return the extracted text
 * @throws IOException if the URL cannot be opened or the PDF cannot be read
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    // The stream is opened here, so it is closed here; it was previously leaked.
    try (java.io.InputStream in = new URL(url).openStream()) {
        pdDoc = PDDocument.load(in);
        pdfStripper = new PDFTextStripper();
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        Text = pdfStripper.getText(pdDoc);

        return Text;
    } finally {
        // Best-effort close: a failure here must not hide the result or the original error.
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

From source file:com.jubinationre.controller.PDFReportAPIController.java

/**
 * Downloads the PDF at {@code url} and returns the text of all of its pages.
 *
 * <p>The {@code pdfStripper}, {@code pdDoc} and {@code Text} fields are overwritten
 * as a side effect. Both the download stream and the document are closed before
 * returning, so {@code pdDoc} refers to a closed document afterwards.
 *
 * @param url location of the PDF, in a form accepted by {@link URL}
 * @return the extracted text
 * @throws IOException if the URL cannot be opened or the PDF cannot be read
 */
public String ToText(String url) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;

    // Resources are closed in reverse order (document, then stream) on every exit path;
    // previously neither was ever closed.
    try (java.io.InputStream in = new URL(url).openStream();
            PDDocument document = PDDocument.load(in)) {
        pdDoc = document;

        pdfStripper = new PDFTextStripper();
        // Extract every page, from the first to the last.
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());

        Text = pdfStripper.getText(pdDoc);
        return Text;
    }
}

From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java

License:Open Source License

/**
 * Reads a PDF from the given stream and returns its extracted text.
 *
 * <p>Both the loaded document and {@code inputStream} are handed to
 * {@code ClosableUtil.close} before the method exits, whether or not extraction succeeded.
 *
 * @param inputStream stream containing the PDF
 * @return the text extracted from the document
 * @throws IOException if the document cannot be loaded or read
 */
private String getPDFText(InputStream inputStream) throws IOException {

    PDDocument document = null;

    try {

        document = PDDocument.load(inputStream);

        return new PDFTextStripper().getText(document);
    } finally {

        ClosableUtil.close(document);
        ClosableUtil.close(inputStream);
    }
}

From source file:com.ning.billing.recurly.TestRecurlyClient.java

License:Apache License

/**
 * Creates an account with one adjustment, invoices it, downloads the invoice PDF and
 * checks that the extracted text contains the invoice number and subtotal; then marks
 * the invoice as paid.
 *
 * <p>The account is closed in {@code finally} regardless of the outcome. The PDF stream
 * and document are closed by try-with-resources, so a failure while closing them can
 * no longer prevent the account cleanup.
 *
 * @throws Exception if any client call or PDF operation fails
 */
@Test(groups = "integration")
public void testCreateInvoiceAndRetrieveInvoicePdf() throws Exception {
    final Account accountData = TestUtils.createRandomAccount();

    try {
        // Create a user
        recurlyClient.createAccount(accountData);

        // Create an Adjustment
        final Adjustment a = new Adjustment();
        a.setUnitAmountInCents(150);
        a.setCurrency(CURRENCY);

        recurlyClient.createAccountAdjustment(accountData.getAccountCode(), a);

        // Post an invoice/invoice the adjustment
        final Invoice invoiceData = new Invoice();
        invoiceData.setCollectionMethod("manual");
        invoiceData.setLineItems(null);
        final Invoice invoice = recurlyClient.postAccountInvoice(accountData.getAccountCode(), invoiceData)
                .getChargeInvoice();
        Assert.assertNotNull(invoice);

        // A null resource is skipped by try-with-resources, so the assertion below
        // still reports a missing stream.
        try (InputStream pdfInputStream = recurlyClient.getInvoicePdf(invoice.getId())) {
            Assert.assertNotNull(pdfInputStream);

            try (PDDocument pdDocument = PDDocument.load(pdfInputStream)) {
                String pdfString = new PDFTextStripper().getText(pdDocument);

                Assert.assertNotNull(pdfString);
                Assert.assertTrue(pdfString.contains("Invoice # " + invoice.getId()));
                Assert.assertTrue(pdfString.contains("Subtotal $" + 1.5));
            }
        }

        // Attempt to close the invoice
        final Invoice closedInvoice = recurlyClient.markInvoiceSuccessful(invoice.getId());
        Assert.assertEquals(closedInvoice.getState(), "paid", "Invoice not closed successfully");

    } finally {
        // Close the account
        recurlyClient.closeAccount(accountData.getAccountCode());
    }
}

From source file:com.pluszero.rostertogo.PdfManager.java

/**
 * Parses the given PDF file and stores the text of all of its pages in the
 * {@code text} field.
 *
 * <p>The {@code parser}, {@code cosDoc}, {@code pdDoc} and {@code pdfStripper} fields are
 * overwritten as a side effect. The document is closed before returning, also when
 * text extraction fails.
 *
 * @param file the PDF file to read
 * @throws IOException if the file cannot be parsed or its text cannot be extracted
 */
private void ToText(File file) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    parser = new PDFParser(file); // for PdfBox 1.8, as 2.0 not yet supported in Android

    parser.parse();
    cosDoc = parser.getDocument();
    // Wrap the parsed document right away so the finally block below can release it.
    pdDoc = new PDDocument(cosDoc);
    try {
        pdfStripper = new PDFTextStripper();
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        text = pdfStripper.getText(pdDoc);
    } finally {
        pdDoc.close();
    }
}

From source file:com.pluszero.rostertogo.PdfManager.java

/**
 * Parses a PDF from the given stream and stores the text of all of its pages in the
 * {@code text} field.
 *
 * <p>The {@code parser}, {@code cosDoc}, {@code pdDoc} and {@code pdfStripper} fields are
 * overwritten as a side effect. The document is closed before returning, also when
 * text extraction fails. The stream {@code is} is not closed by this method.
 *
 * @param is stream containing the PDF
 * @throws IOException if the stream cannot be parsed or its text cannot be extracted
 */
private void ToText(InputStream is) throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    parser = new PDFParser(is); // for PdfBox 1.8 as 2.0 not yet supported in Android

    parser.parse();
    cosDoc = parser.getDocument();
    // Wrap the parsed document right away so the finally block below can release it.
    pdDoc = new PDDocument(cosDoc);
    try {
        pdfStripper = new PDFTextStripper();
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(pdDoc.getNumberOfPages());
        text = pdfStripper.getText(pdDoc);
    } finally {
        pdDoc.close();
    }
}