Example usage for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

List of usage examples for the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper()

Introduction

On this page you can find example usages of the org.apache.pdfbox.text.PDFTextStripper constructor PDFTextStripper().

Prototype

public PDFTextStripper() throws IOException 

Source Link

Document

Instantiate a new PDFTextStripper object.

Usage

From source file:com.proquest.demo.allinone.PDFLBase.java

/**
 * Extract Text using PDF BOX, instead APDFL
 *
 * @param fileNamePath/*from www.  ja  v  a2 s  .  c om*/
 * @return
 * @throws Exception
 */
public byte[] extractTextPDFBox(final String fileNamePath) throws Exception {
    final String BLANK_SPACE = " ";
    final String UTF_8 = "UTF-8";
    final PropertyReaderLib libPropertyReaderLib = new PropertyReaderLib(PropertyFileNames.PDFLIBRARY);
    final String regex = libPropertyReaderLib
            .getPropertyValue(PdfLibraryKeys.REGEX_TO_REMOVE_FROM_EXTRACTEDTEXT.toString());

    byte[] bytesToReturn = null;
    try {
        final File file = new File(fileNamePath);
        final PDDocument pdfDoc = PDDocument.load(file);
        final PDFTextStripper pdfStripper = new PDFTextStripper();
        final String textFromPDF = pdfStripper.getText(pdfDoc);
        pdfDoc.close();

        bytesToReturn = textFromPDF.getBytes(UTF_8);
        final String textStr = new String(bytesToReturn).replaceAll(regex, BLANK_SPACE);
        bytesToReturn = textStr.getBytes();
    } catch (IOException e) {
        throw new Exception(e.getMessage());
    }
    return bytesToReturn;
}

From source file:com.sastix.cms.common.services.htmltopdf.PdfTest.java

License:Apache License

/**
 * Builds a PDF from an HTML string and checks that the heading text can be extracted from the
 * generated bytes.
 */
@Test
public void testPdfFromStringTo() throws Exception {

    // GIVEN an html template containing special characters that java stores in utf-16 internally
    // NOTE(review): the literal "Mller" below contains no special character; it may have lost
    // one somewhere along the way - confirm against the upstream test.
    Pdf pdf = pdfBuilder.build();
    pdf.addPage("<html><head><meta charset=\"utf-8\"></head><h1>Mller</h1></html>", PageType.htmlAsString);

    String tempFolder = temporaryFolder.newFolder().getPath();
    pdf.saveAs(tempFolder + "/output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();

    PDFParser parser = new PDFParser(
            new RandomAccessBufferedFileInputStream(new ByteArrayInputStream(pdfBytes)));

    // that is a valid PDF (otherwise an IOException occurs)
    parser.parse();

    // Close the parsed document once the text has been read so the test does not leak it.
    String pdfText;
    try (PDDocument document = new PDDocument(parser.getDocument())) {
        pdfText = new PDFTextStripper().getText(document);
    }

    assertThat("document should contain the creditorName", pdfText, containsString("Mller"));
}

From source file:com.testautomationguru.utility.PDFUtil.java

License:Apache License

/**
 * Returns the text content of a PDF file.
 *
 * <p>The page range applied to the stripper is read from {@code this.startPage} and
 * {@code this.endPage} after {@code updateStartAndEndPages} has been called with the
 * requested values. When {@code this.bTrimWhiteSpace} is set, the text is trimmed and every
 * run of whitespace is collapsed into a single blank.
 *
 * @param file path of the PDF file
 * @param startPage requested first page, forwarded to {@code updateStartAndEndPages}
 * @param endPage requested last page, forwarded to {@code updateStartAndEndPages}
 * @return the extracted (and optionally whitespace-normalised) text
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {

    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    // try-with-resources closes the document on every path, including failures below.
    try (PDDocument doc = PDDocument.load(new File(file))) {
        // Prefer the configured stripper; only build a default one when none is set.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            // After trim() the text has no leading/trailing whitespace, so collapsing inner
            // runs cannot introduce any; a second trim() is unnecessary.
            txt = txt.trim().replaceAll("\\s+", " ");
            logger.info("PDF Text after  trimming : " + txt);
        }
        return txt;
    }
}

From source file:com.validation.manager.core.server.core.AttachmentServerTest.java

License:Apache License

/**
 * Test of addFile method, of class AttachmentServer.
 *
 * <p>Stores a text file as an attachment, retrieves it and compares its lines with what was
 * written; then converts the retrieved file to PDF, stores and retrieves that as well and
 * extracts the text of its first page.
 */
@Test
public void testAddRetrieveTextFile() {
    try {
        System.out.println("add text File");
        File f = new File("target/Test.txt");
        f.deleteOnExit();
        List<String> lines = Arrays.asList("The first line", "The second line");
        Path file = Paths.get(f.getAbsolutePath());
        Files.write(file, lines, Charset.forName("UTF-8"));
        AttachmentServer instance = new AttachmentServer();
        instance.addFile(f, f.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(f.getAbsolutePath());
        assertEquals(1, (int) instance.getAttachmentType().getId());//Text file
        System.out.println("retrieveFile");
        AttachmentServer temp = new AttachmentServer(instance.getAttachmentPK());
        File loadedFile = temp.getAttachedFile("target/loaded/");
        int count = 0;
        // Read back with the charset used for writing; the reader is closed automatically so
        // the file handle is released before the file is converted below.
        try (BufferedReader br = Files.newBufferedReader(loadedFile.toPath(), Charset.forName("UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                assertEquals(lines.get(count), line);
                System.out.println(line);
                count++;
            }
        }
        assertEquals(lines.size(), count);
        //Create pdf file
        System.out.println("add pdf File");
        File pdf = Tool.convertToPDF(loadedFile, "target/Text.pdf");
        pdf.deleteOnExit();
        instance = new AttachmentServer();
        instance.addFile(pdf, pdf.getName());
        instance.write2DB();
        //Delete the file
        FileUtils.delete(pdf.getAbsolutePath());
        assertEquals(2, (int) instance.getAttachmentType().getId());//PDF file
        System.out.println("retrieveFile");
        temp = new AttachmentServer(instance.getAttachmentPK());
        loadedFile = temp.getAttachedFile("target/loaded/");
        try {
            PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(loadedFile));
            parser.parse();
            // Closing the PDDocument releases the parsed document it wraps.
            try (PDDocument pdDoc = new PDDocument(parser.getDocument())) {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                pdfStripper.setStartPage(1);
                pdfStripper.setEndPage(1);
                String parsedText = pdfStripper.getText(pdDoc);
                System.out.println(parsedText);
            }
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            fail();
        }
    } catch (IOException | VMException ex) {
        Exceptions.printStackTrace(ex);
        fail();
    }
}

From source file:converter.PDFPac.java

/**
 * vrati text z pdf/*from   www . jav  a 2 s. c om*/
 *
 * @param url
 * @return
 */
private String getTextFromPDF(String url) {
    String text = "";
    try {
        PDDocument pdDoc = PDDocument.load(new File(url));
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(pdDoc);
        pdDoc.close();

    } catch (IOException ex) {
        logger.warning("PDFPac soubor nebyl nalezen " + url + "chyba " + ex);
    }
    return text;
}

From source file:cz.incad.kramerius.k5indexer.KrameriusPDFDocument.java

/**
 * Extracts the XML-escaped text of one page of {@code pdDoc}.
 *
 * @param page page number to extract; {@code -1} leaves the stripper's default page range
 *        untouched
 * @return the escaped text, or an empty string when extraction fails (the failure is logged)
 * @throws Exception declared by the signature; failures inside the extraction are caught and
 *         reported as an empty result instead
 */
public String getPage(int page) throws Exception {
    logger.log(Level.INFO, "Getting page {0}", page);
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }

        return StringEscapeUtils.escapeXml(stripper.getText(pdDoc));
    } catch (Exception ex) {
        // Best effort: callers still get an empty page, but the failure is no longer silent.
        logger.log(Level.WARNING, "Unable to extract text of page " + page, ex);
        return "";
    }
}

From source file:cz.mzk.editor.server.handler.GetOcrFromPdfHandler.java

License:Open Source License

/**
 * Extracts the text of the PDF file with the given name.
 *
 * @param fileName path of the PDF file
 * @return the text extracted from the document
 * @throws ActionException if {@code fileName} is not an existing regular file, if the parser
 *         cannot be opened, or if parsing or text extraction fails
 */
private String pdftoText(String fileName) throws ActionException {

    File pdfFile = new File(fileName);

    if (!pdfFile.isFile()) {
        LOGGER.error("The file: " + fileName + " does not exist.");
        throw new ActionException("Unable to parse the pdf file.");
    }

    PDFParser parser = null;
    COSDocument cosDoc = null;
    PDFTextStripper pdfStripper;
    PDDocument pdDoc = null;
    String parsedText;
    try {
        parser = new PDFParser(new RandomAccessBufferedFileInputStream(new FileInputStream(pdfFile)));
    } catch (Exception e) {
        // Hand the throwable to the logger so the stack trace lands in the log, not on stderr.
        LOGGER.error("Unable to open PDF Parser.: " + e, e);
        throw new ActionException("Unable to parse the pdf file.");
    }

    try {
        parser.parse();
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        LOGGER.error("An exception occured in parsing the PDF Document.", e);
        throw new ActionException("Unable to parse the pdf file. " + e);
    } finally {
        // Closing is best effort: a failure here must not mask the result or the real error.
        try {
            if (cosDoc != null) {
                cosDoc.close();
            }
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            LOGGER.error("Unable to close the PDF document.", e);
        }
    }

    return parsedText;
}

From source file:data.PDFManager.java

/**
 * /* w  ww . j  ava2 s.  c  o m*/
 * @return String do conteudo do pdf
 * @throws IOException 
 */
public String ToText() throws IOException {
    this.pdfStripper = null;
    this.pdDoc = null;
    this.cosDoc = null;

    file = new File(filePath);
    parser = new PDFParser(new RandomAccessFile(file, "r"));

    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    pdDoc.getNumberOfPages();
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(pdDoc.getNumberOfPages());

    Text = pdfStripper.getText(pdDoc);
    return Text;
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the textual content of an already loaded PDF document.
 *
 * @param pdDoc the document to read; it is not closed by this method
 * @param pageNum decimal page number to extract; {@code "-1"} leaves the stripper's default
 *        page range untouched
 * @return the extracted text
 * @throws Exception wrapping the {@link IOException} raised while creating the stripper or
 *         extracting the text; a {@link NumberFormatException} propagates unchanged when
 *         {@code pageNum} is not a valid integer
 */
public static StringBuffer getTextFromPDF(PDDocument pdDoc, String pageNum) throws Exception {
    // extract PDF document's textual content
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        int page = Integer.parseInt(pageNum);
        if (page != -1) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
        }
        return new StringBuffer(stripper.getText(pdDoc));
    } catch (IOException e) {
        throw new Exception("Cannot parse PDF document", e);
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Loads a PDF from its raw bytes (using an empty password) and extracts its text.
 *
 * @param doc the bytes of the PDF document
 * @param pageNum decimal page number to extract; {@code "-1"} leaves the stripper's default
 *        page range untouched
 * @return the extracted text
 * @throws Exception wrapping the {@link IOException} raised while loading the document or
 *         extracting its text
 */
private StringBuffer getTextFromPDF(byte[] doc, String pageNum) throws Exception {
    final String emptyPassword = "";
    PDDocument loaded = null;

    try {
        // Configure the stripper first, then load the document and read its text.
        final PDFTextStripper textStripper = new PDFTextStripper();
        final int requestedPage = Integer.parseInt(pageNum);
        final boolean singlePage = requestedPage != -1;
        if (singlePage) {
            textStripper.setStartPage(requestedPage);
            textStripper.setEndPage(requestedPage);
        }
        loaded = PDDocument.load(new ByteArrayInputStream(doc), emptyPassword);
        final String extracted = textStripper.getText(loaded);
        return new StringBuffer(extracted);
    } catch (IOException ioe) {
        throw new Exception("Cannot parse PDF document", ioe);
    } finally {
        // Runs on success and failure alike; receives null when loading never happened.
        closePDDocument(loaded);
    }
}