Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

On this page you can find example usages of the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper().

Prototype

public PDFTextStripper() throws IOException

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:com.proquest.demo.allinone.PDFLBase.java

/**
 * Extract Text using PDF BOX, instead APDFL
 *
 * @param fileNamePath/*from www.  ja  v  a2 s  .  c om*/
 * @return
 * @throws Exception
 */
public byte[] extractTextPDFBox(final String fileNamePath) throws Exception {
    final String BLANK_SPACE = " ";
    final String UTF_8 = "UTF-8";
    final PropertyReaderLib libPropertyReaderLib = new PropertyReaderLib(PropertyFileNames.PDFLIBRARY);
    final String regex = libPropertyReaderLib
            .getPropertyValue(PdfLibraryKeys.REGEX_TO_REMOVE_FROM_EXTRACTEDTEXT.toString());

    byte[] bytesToReturn = null;
    try {
        final File file = new File(fileNamePath);
        final PDDocument pdfDoc = PDDocument.load(file);
        final PDFTextStripper pdfStripper = new PDFTextStripper();
        final String textFromPDF = pdfStripper.getText(pdfDoc);
        pdfDoc.close();

        bytesToReturn = textFromPDF.getBytes(UTF_8);
        final String textStr = new String(bytesToReturn).replaceAll(regex, BLANK_SPACE);
        bytesToReturn = textStr.getBytes();
    } catch (IOException e) {
        throw new Exception(e.getMessage());
    }
    return bytesToReturn;
}

From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java

License:Apache License

/**
 * Builds a PDF from an HTML string and verifies that the heading text can be read back
 * from the generated bytes with PDFBox.
 */
@Test
public void testPdfFromStringTo() throws Exception {

    // GIVEN an html template containing special characters that java stores in utf-16 internally
    // NOTE(review): the literal "Mller" contains no non-ASCII character; it may have lost one
    // (for example a u-umlaut) on the way here - confirm against the intended fixture.
    Pdf pdf = pdfBuilder.build();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder + "/output.pdf");

    // WHEN the generated PDF is fetched as bytes
    byte[] pdfBytes = pdf.getPDF();

    PDFParser parser = new PDFParser(
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes)));

    // THEN that is a valid PDF (otherwise an IOException occurs)
    parser.parse();
    String pdfText;
    // Close the document once the text has been read instead of leaving it open.
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        pdfText = new PDFTextStripper().getText(document);
    }

    assertThat("document should contain the creditorName", pdfText, containsString("Mller"));
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
 * Returns the text content of the given PDF file.
 *
 * <p>The requested range is handed to {@code updateStartAndEndPages}; the resulting
 * {@code startPage} and {@code endPage} fields are then applied to the stripper. The
 * instance-level {@code stripper} is used when one is set, otherwise a fresh
 * {@code PDFTextStripper} is created for this call.
 *
 * @param file      path of the PDF file to read
 * @param startPage requested first page, forwarded to {@code updateStartAndEndPages}
 * @param endPage   requested last page, forwarded to {@code updateStartAndEndPages}
 * @return the extracted text; when {@code bTrimWhiteSpace} is set, the text is trimmed and
 *         every run of whitespace is collapsed to a single blank
 * @throws IOException if the document cannot be loaded or its text cannot be extracted
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    // try-with-resources guarantees the document is closed even if extraction fails.
    try (PDDocument doc = PDDocument.load(new File(file))) {
        // Only build a stripper when no shared one has been configured.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            // After trim() the text neither starts nor ends with whitespace, so collapsing
            // inner whitespace cannot create new leading or trailing blanks.
            txt = txt.trim().replaceAll("\\s+", " ");
            logger.info("PDF Text after  trimming : " + txt);
        }
        return txt;
    }
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 *
 * <p>Stores a text file, reads it back and compares it line by line; then converts the
 * loaded file to PDF, stores and reloads that PDF, and extracts the text of its first page.
 */
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Charset utf8 = Charset.forName("UTF-8");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, utf8);
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        int count = 0;
        // Read with the same charset the file was written in, and close the reader when done.
        try (BufferedReader br = Files.newBufferedReader(loadedFile.toPath(), utf8)) {
            String line;
            while ((line = br.readLine()) != null) {
                assertEquals(lines.get(count), line);
                System.out.println(line);
                count++;
            }
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        PDFTextStripper pdfStripper;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try {
            PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile));
            parser.parse();
            cosDoc = parser.getDocument();
            pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            // Only the first page is extracted.
            pdfStripper.setStartPage(1);
            pdfStripper.setEndPage(1);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        } finally {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:converter.PDFPac.java

/**
 * vrati text z pdf/*from   www . jav  a 2 s. c om*/
 *
 * @param url
 * @return
 */
private String getTextFromPDF(String url) {
    String text = "";
    try {
        PDDocument pdDoc = PDDocument.load(new File(url));
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(pdDoc);
        pdDoc.close();

    } catch (IOException ex) {
        logger.warning("PDFPac soubor nebyl nalezen " + url + "chyba " + ex);
    }
    return text;
}

From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java

/**
 * Returns the XML-escaped text of one page of the {@code pdDoc} document.
 *
 * @param page page number to extract; {@code -1} leaves the stripper's default page range
 *             untouched
 * @return the escaped page text, or an empty string if extraction fails for any reason
 * @throws Exception declared for callers; failures inside the extraction are logged and
 *                   turned into an empty result instead of being propagated
 */
public String getPage(int page) throws Exception {
    logger.log(Level.INFO, "Getting page {0}", page);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }

        return StringEscapeUtils.escapeXml(stripper.getText(pdDoc));
    } catch (Exception ex) {
        // Still best-effort, but leave a trace so an empty page can be diagnosed.
        logger.log(Level.WARNING, "Unable to extract text from page " + page, ex);
        return "";
    }
}

From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java

License:Open Source License

/**
 * Extracts the text of the PDF file at the given path.
 *
 * @param fileName path of the PDF file
 * @return the text extracted from the document
 * @throws ActionException if the path is not a regular file, or the PDF cannot be opened or
 *                         parsed
 */
private String pdftoText(String fileName) throws ActionException {

    File pdfFile = new File(fileName);

    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }

    PDFParser parser;
    try {
        // Hand the File to PDFBox directly rather than opening a FileInputStream that this
        // method would never close.
        parser = new PDFParser(new RandomAccessBufferedFileInputStream(pdfFile));
    } catch (Exception e) {
        LOGGER.error("Unable to open PDF Parser.: " + e, e);
        throw new ActionException("Unable to parse the pdf file.");
    }

    COSDocument cosDoc = null;
    PDDocument pdDoc = null;
    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);
        return new PDFTextStripper().getText(pdDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.", e);
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        // Close each document on its own so a failure on the first cannot skip the second.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
        } catch (Exception e) {
            LOGGER.error("Unable to close the COS document.", e);
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            LOGGER.error("Unable to close the PDF document.", e);
        }
    }
}

From source file:data.PDFManager.java

/**
 * /* w  ww . j  ava2 s.  c  o m*/
 * @return String do conteudo do pdf
 * @throws IOException 
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r"));

    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(pdDoc.getNumberOfPages());

    Text = pdfStripper.getText(pdDoc);
    return Text;
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the textual content of an already opened PDF document.
 *
 * @param pdDoc   document to read; it is not closed by this method
 * @param pageNum decimal page number as text; {@code "-1"} leaves the stripper's default
 *                page range untouched. A non-numeric value makes {@link Integer#parseInt}
 *                throw a {@link NumberFormatException}.
 * @return the extracted text
 * @throws Exception if text extraction fails with an {@link IOException}, which is attached
 *                   as the cause
 */
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception {
    // extract PDF document's textual content
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        int page = Integer.parseInt(pageNum);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }
        return new StringBuffer(stripper.getText(pdDoc));
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the textual content of a PDF supplied as raw bytes.
 *
 * <p>The document is loaded with an empty password and is always handed to
 * {@code closePDDocument} afterwards, whether or not extraction succeeded.
 *
 * @param doc     the PDF file content
 * @param pageNum decimal page number as text; {@code "-1"} leaves the stripper's default
 *                page range untouched
 * @return the extracted text
 * @throws Exception if loading or text extraction fails with an {@link IOException}, which
 *                   is attached as the cause
 */
private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception {
    final String emptyPassword = "";
    PDDocument document = null;
    StringBuffer extracted = new StringBuffer();

    try {
        final PDFTextStripper textStripper = new PDFTextStripper();
        final int requestedPage = Integer.parseInt(pageNum);
        final boolean singlePageOnly = requestedPage != -1;
        if (singlePageOnly) {
            // Restrict the stripper to exactly the requested page.
            textStripper.setStartPage(requestedPage);
            textStripper.setEndPage(requestedPage);
        }

        document = PDDocument.load(new ByteArrayInputStream(doc), emptyPassword);
        final String rawText = textStripper.getText(document);
        extracted = new StringBuffer(rawText);
    } catch (IOException ioFailure) {
        throw new Exception("Cannot parse PDF document", ioFailure);
    } finally {
        closePDDocument(document);
    }

    return extracted;
}