Example usage for org.apache.pdfbox.text PDFTextStripper getText

List of usage examples for org.apache.pdfbox.text PDFTextStripper getText

Introduction

On this page you can find example usages of org.apache.pdfbox.text PDFTextStripper getText.

Prototype

public String getText(PDDocument doc) throws IOException 

Source Link

Document

This will return the text of a document.

Usage

From source file:ExtractTextFromPdf.java

/**
 * Extracts the text of a hard-coded PDF file and prints it to standard output
 * after removing every character that is not an ASCII letter, a digit, a dot
 * or a space.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String fileName = "C:/Users/Kavya Gupta/Desktop/Texas_Title.pdf";

    // try-with-resources closes the document on success as well as on failure.
    try (PDDocument pdDoc = PDDocument.load(new File(fileName))) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        String parsedText = pdfStripper.getText(pdDoc);
        System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.enginkutuk.pdfboxsample.PDFBoxSample.java

/**
 * Loads the PDF at {@code path} and, unless it is encrypted, prints its text to
 * standard output and shows it in a message dialog.
 * Any failure is reported by printing the stack trace.
 *
 * @param path file-system path of the PDF to read
 */
public void readPdfFile(String path) {
    // try-with-resources guarantees the document is closed once we are done with it.
    try (PDDocument document = PDDocument.load(new File(path))) {
        if (!document.isEncrypted()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            String text = textStripper.getText(document);
            System.out.println(text);
            JOptionPane.showMessageDialog(null, text);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.jt.tool.pdf.CreateBookmarks.java

License:Apache License

/**
 * Extracts the text of a range of pages from a document.
 *
 * @param document the PDF document to read
 * @param start    value used as the stripper's start page
 * @param offset   added to {@code start} to obtain the stripper's end page
 * @return the text the stripper produces for that page range
 * @throws Exception if creating the stripper or extracting the text fails
 */
public static String getPageText(PDDocument document, int start, int offset) throws Exception {
    final int lastPage = start + offset;

    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setStartPage(start);
    textStripper.setEndPage(lastPage);

    final String pageText = textStripper.getText(document);
    return pageText;
}

From source file:com.liferay.faces.bridge.test.integration.demo.JSFExportPDFPortletTester.java

License:Open Source License

/**
 * Loads a PDF from the given stream and returns its text.
 * Both the loaded document and the stream are handed to {@code ClosableUtil.close}
 * afterwards, whether or not extraction succeeded.
 *
 * @param inputStream stream containing the PDF
 * @return the text extracted from the PDF
 * @throws IOException if loading the document or extracting its text fails
 */
private String getPDFText(InputStream inputStream) throws IOException {

    PDDocument loadedDocument = null;

    try {
        loadedDocument = PDDocument.load(inputStream);

        return new PDFTextStripper().getText(loadedDocument);
    } finally {
        ClosableUtil.close(loadedDocument);
        ClosableUtil.close(inputStream);
    }
}

From source file:com.proquest.demo.allinone.PDFLBase.java

/**
 * Extracts the text of a PDF using PDFBox (instead of APDFL), replaces every match
 * of the configured clean-up regex with a single space, and returns the result
 * encoded as UTF-8.
 *
 * @param fileNamePath path of the PDF file to read
 * @return the cleaned text as UTF-8 bytes
 * @throws Exception if the file cannot be loaded or its text cannot be extracted;
 *                   the underlying {@link IOException} is attached as the cause
 */
public byte[] extractTextPDFBox(final String fileNamePath) throws Exception {
    final String BLANK_SPACE = " ";
    final String UTF_8 = "UTF-8";
    final PropertyReaderLib libPropertyReaderLib = new PropertyReaderLib(PropertyFileNames.PDFLIBRARY);
    final String regex = libPropertyReaderLib
            .getPropertyValue(PdfLibraryKeys.REGEX_TO_REMOVE_FROM_EXTRACTEDTEXT.toString());

    // try-with-resources closes the document even when text extraction fails.
    try (PDDocument pdfDoc = PDDocument.load(new File(fileNamePath))) {
        final String textFromPDF = new PDFTextStripper().getText(pdfDoc);

        // Clean the String directly and encode exactly once with an explicit charset;
        // a round trip through the platform default charset can corrupt non-ASCII text.
        return textFromPDF.replaceAll(regex, BLANK_SPACE).getBytes(UTF_8);
    } catch (IOException e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new Exception(e.getMessage(), e);
    }
}

From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java

License:Apache License

/**
 * Builds a PDF from an HTML string that contains a non-ASCII character and checks
 * that the text extracted from the generated PDF still contains it.
 *
 * @throws Exception if the PDF cannot be generated, saved, parsed or read
 */
@Test
public void testPdfFromStringTo() throws Exception {

    // GIVEN an html template containing special characters that java stores in utf-16 internally
    Pdf pdf = pdfBuilder.build();
    // \u00fc is the u-umlaut; written as an escape so the source file encoding cannot drop it.
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>M\u00fcller</h1></html>", PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder + "/output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();

    PDFParser parser = new PDFParser(
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes)));

    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // Close the document after reading so the test does not leak it.
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        String pdfText = new PDFTextStripper().getText(document);

        assertThat("document should contain the creditorName", pdfText, containsString("M\u00fcller"));
    }
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
 * Returns the text content of a PDF file for a page range.
 * <p>
 * The requested range is passed to {@code updateStartAndEndPages}, and the resulting
 * {@code this.startPage} / {@code this.endPage} values are applied to the stripper.
 * When {@code bTrimWhiteSpace} is set, the text is trimmed and every run of
 * whitespace is collapsed to a single space.
 *
 * @param file      path of the PDF file
 * @param startPage requested first page
 * @param endPage   requested last page
 * @return the extracted (and optionally whitespace-normalized) text
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    // try-with-resources closes the document even if extraction throws.
    try (PDDocument doc = PDDocument.load(new File(file))) {

        // Prefer the configured stripper; only create a default one when none is set.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            txt = txt.trim().replaceAll("\\s+", " ").trim();
            logger.info("PDF Text after  trimming : " + txt);
        }

        return txt;
    }
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 * <p>
 * Stores a two-line text file as an attachment, retrieves it and compares it line by
 * line with the original content. The retrieved file is then converted to a PDF,
 * stored and retrieved again, and the text of its first page is extracted and printed.
 */
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, Charset.forName("UTF-8"));
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        int count = 0;
        // Close the reader once the comparison is done so the file handle is not leaked.
        try (BufferedReader br = new BufferedReader(new FileReader(loadedFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                assertEquals(lines.get(count), line);
                System.out.println(line);
                count++;
            }
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        PDFTextStripper pdfStripper;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try {
            PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile));
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            // Only the first page is extracted.
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        } finally {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:converter.PDFPac.java

/**
 * Returns the text of a PDF file.
 * <p>
 * An {@link IOException} is logged as a warning rather than propagated; in that case
 * the text read so far (the empty string if extraction did not complete) is returned.
 *
 * @param url file-system path of the PDF to read
 * @return the extracted text, or an empty string if extraction failed
 */
private String getTextFromPDF(String url) {
    String text = "";
    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument pdDoc = PDDocument.load(new File(url))) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(pdDoc);
    } catch (IOException ex) {
        logger.warning("PDFPac soubor nebyl nalezen " + url + "chyba " + ex);
    }
    return text;
}

From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java

/**
 * Returns the XML-escaped text of one page of the {@code pdDoc} document.
 * <p>
 * When {@code page} is {@code -1} no page range is set on the stripper. Extraction
 * is best-effort: a failure is logged as a warning and an empty string is returned.
 *
 * @param page page number to extract, or {@code -1} to leave the stripper's page range unset
 * @return the XML-escaped text, or an empty string if extraction failed
 * @throws Exception declared by the signature; failures inside extraction are caught and logged
 */
public String getPage(int page) throws Exception {
    logger.log(Level.INFO, "Getting page {0}", page);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }

        return StringEscapeUtils.escapeXml(stripper.getText(pdDoc));
    } catch (Exception ex) {
        // Keep the best-effort contract (empty result) but do not hide the failure.
        logger.log(Level.WARNING, "Could not extract text of page " + page, ex);
        return "";
    }
}