Example usage for org.apache.pdfbox.text PDFTextStripper getText

List of usage examples for org.apache.pdfbox.text PDFTextStripper getText

Introduction

This page collects usage examples for the getText method of org.apache.pdfbox.text.PDFTextStripper.

Prototype

public String getText(PDDocument doc) throws IOException 

Source Link

Document

This will return the text of a document.

Usage

From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java

License:Open Source License

/**
 * Builds a {@code Document} from the PDF read from the given stream.
 *
 * <p>The text of the whole PDF is extracted (sorted by position) and added as a single section
 * annotated {@code SectionAnnotation.UNKNOWN}. The title starts as the document path when
 * {@code m_filenameAsTitle} is set; while {@code checkTitle} rejects the current title, the
 * following fallbacks are tried in order: the title from the PDF document information, the
 * first sentence of the extracted text, and finally the document path. The author string from
 * the PDF document information, if present, is parsed with {@code AuthorUtil.parseAuthors} and
 * every resulting author is added to the document.
 *
 * @param is stream the PDF is loaded from
 * @return the document created by the builder
 * @throws Exception if loading the PDF or extracting its content fails
 */
private Document parseInternal(final InputStream is) throws Exception {
    m_currentDoc = new DocumentBuilder(m_tokenizerName);
    m_currentDoc.setDocumentFile(new File(m_docPath));
    m_currentDoc.setDocumentType(m_type);
    m_currentDoc.addDocumentCategory(m_category);
    m_currentDoc.addDocumentSource(m_source);

    // Fall back to the platform charset when none was configured.
    if (m_charset == null) {
        m_charset = Charset.defaultCharset();
    }

    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // Body text: the complete PDF becomes one section.
        final PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setSortByPosition(true);
        m_currentDoc.addSection(textStripper.getText(pdf), SectionAnnotation.UNKNOWN);

        // Meta data: start from the file name if requested, then consult the PDF info dictionary.
        String title = m_filenameAsTitle ? m_docPath.toString().trim() : null;
        String authorField = null;

        final PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            if (!checkTitle(title)) {
                title = info.getTitle();
            }
            authorField = info.getAuthor();
        }

        // Still no usable title: try the first sentence of the parsed text.
        if (!checkTitle(title)) {
            final List<Section> parsedSections = m_currentDoc.getSections();
            if (!parsedSections.isEmpty()) {
                try {
                    title = parsedSections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim();
                } catch (IndexOutOfBoundsException e) {
                    LOGGER.debug("Parsed PDF document " + m_docPath + " is empty.");
                    title = "";
                }
            }
        }

        // Last resort: the document path.
        if (!checkTitle(title)) {
            title = m_docPath.toString().trim();
        }
        m_currentDoc.addTitle(title);

        if (authorField != null) {
            for (final Author author : AuthorUtil.parseAuthors(authorField)) {
                m_currentDoc.addAuthor(author);
            }
        }

        return m_currentDoc.createDocument();
    } finally {
        // Always release the PDF, whether or not parsing succeeded.
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:org.titans.fyp.webcrawler.PageCollector.java

License:Open Source License

/**
 * Downloads the PDF at the given address over HTTPS and prints its text to standard output.
 *
 * <p>Whatever scheme the address carries is replaced by {@code https}; an address without a
 * scheme is treated as a host/path and prefixed with {@code https://}. Failures are reported
 * on standard error instead of being propagated.
 *
 * @param pdfURL address of the PDF, with or without a scheme
 */
private static void pdfToText(String pdfURL) {
    // Cut at the FIRST "://" only, so a "://" inside the query string is preserved.
    int schemeEnd = pdfURL.indexOf("://");
    String httpsUrl = "https://" + (schemeEnd >= 0 ? pdfURL.substring(schemeEnd + 3) : pdfURL);

    // Both the network stream and the document are released even when extraction fails.
    try (java.io.InputStream in = new URL(httpsUrl).openStream();
            PDDocument pddDocument = PDDocument.load(in)) {
        PDFTextStripper textStripper = new PDFTextStripper();
        System.out.println(textStripper.getText(pddDocument));
    } catch (Exception e) {
        // Best effort: keep going, but do not hide the failure.
        System.err.println("Failed to extract text from " + httpsUrl + ": " + e);
    }
}

From source file:org.vesalainen.ham.pdf.RfaxTest.java

License:Open Source License

/**
 * Extracts the text of {@code rfax.pdf} and writes it to
 * {@code src/main/resources/rfax.txt}; an encrypted document is skipped.
 *
 * @throws IOException if the PDF cannot be loaded or the text file cannot be written
 */
public void test() throws IOException {
    // try-with-resources closes the document even if extraction or writing throws.
    try (PDDocument document = PDDocument.load(new File("rfax.pdf"))) {
        if (!document.isEncrypted()) {
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(document);
            try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) {
                bw.write(text);
            }
        }
    }
}

From source file:PDF.PDFTest.java

License:Apache License

/**
 * Prints the part of a PDF's text that lies between a start marker and an end marker.
 *
 * <p>The output is {@code start} itself followed by the text from the first match of
 * {@code start} up to (not including) the next match of {@code end}. If {@code end} does not
 * match, everything after the start match is printed. Both markers are interpreted as regular
 * expressions by {@link String#split(String, int)}.
 *
 * @param start regular expression marking the beginning of the region
 * @param end regular expression marking the end of the region
 * @param file PDF file to read
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 * @throws IllegalArgumentException if {@code start} does not match anywhere in the text
 */
public static void constrainText(String start, String end, File file) throws IOException {
    String text;
    // The document is only needed for extraction; release it even if getText throws.
    try (PDDocument doc = PDDocument.load(file)) {
        text = new PDFTextStripper().getText(doc);
    }

    // Limit 2: split at the first start match only, so a later repeat of the start marker
    // does not cut the region short, and trailing empty pieces are kept.
    String[] aroundStart = text.split(start, 2);
    if (aroundStart.length < 2) {
        throw new IllegalArgumentException("Start marker '" + start + "' not found in " + file);
    }

    // Limit 2 also guarantees at least one element, even when the remainder is just the end marker.
    String between = aroundStart[1].split(end, 2)[0];

    System.out.print(start + between);
}

From source file:pdf.to.info.PDF.java

/**
 * Extracts the text of a PDF file.
 *
 * <p>NOTE(review): the document obtained from {@code ReadPDDoc} is not closed here. If
 * {@code ReadPDDoc} opens a new document on every call, this leaks it; confirm against that
 * method and close the document after extraction if so.
 *
 * @param filePath path handed to {@code ReadPDDoc} to obtain the document
 * @return the text of the document
 * @throws java.io.IOException if the document cannot be read or its text cannot be extracted
 */
public String ReadText(String filePath) throws IOException {
    return new PDFTextStripper().getText(ReadPDDoc(filePath));
}

From source file:PDSL.PDFProcessor.java

/**
 * Converts every file in a directory from PDF to plain text.
 *
 * <p>Each regular file in {@code dirFrom} is loaded as a PDF and its text is written to
 * {@code dirTo} under the same name with {@code ".pdf"} replaced by {@code ".txt"}.
 * Subdirectories are skipped.
 *
 * @param dirFrom directory containing the PDF files
 * @param dirTo directory the text files are written to
 * @throws IOException if {@code dirFrom} cannot be listed, a file cannot be parsed as PDF, or
 *         a text file cannot be written
 */
public void pdfToText(String dirFrom, String dirTo) throws IOException {
    File pdfFolder = new File(dirFrom);
    File[] listOfPDF = pdfFolder.listFiles();
    // listFiles() returns null when the path is not a directory or cannot be read.
    if (listOfPDF == null) {
        throw new IOException("Cannot list directory: " + dirFrom);
    }
    for (File thePDF : listOfPDF) {
        if (thePDF.isDirectory()) {
            continue;
        }
        String parsedText;
        // Close each document as soon as its text is extracted.
        try (PDDocument pdDoc = PDDocument.load(thePDF)) {
            parsedText = new PDFTextStripper().getText(pdDoc);
        }
        try (PrintWriter out = new PrintWriter(dirTo + "/" + thePDF.getName().replace(".pdf", ".txt"))) {
            out.write(parsedText);
        }
    }
}

From source file:src.controller.DocumentController.java

/**
 * Extracts the text of a PDF, prints it to standard output, and stores it as the single
 * paragraph of a new Word document.
 *
 * <p>The output path is the string concatenation of {@code filepath}, {@code filename} and
 * {@code ".docx"}. NOTE(review): this appends the name directly to the PDF's own path without
 * a separator; confirm that this is the intended location.
 *
 * <p>I/O failures are logged at {@code SEVERE} level and not propagated.
 *
 * @param filepath the PDF file to read
 * @param filename name fragment appended to {@code filepath} to form the output path
 */
public void convertToDocX(File filepath, File filename) {

    try {
        String parsedText;
        // Release the PDF as soon as its text has been extracted.
        try (PDDocument documentpdf = PDDocument.load(filepath)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdfStripper.setStartPage(1);
            parsedText = pdfStripper.getText(documentpdf);
        }
        System.out.println(parsedText);

        // Save the text to a .docx file; the stream and the document are closed even on failure.
        try (FileOutputStream out = new FileOutputStream(filepath + filename.toString() + ".docx");
                XWPFDocument document = new XWPFDocument()) {
            XWPFRun run = document.createParagraph().createRun();
            run.setText(parsedText);
            document.write(out);
        }

    } catch (IOException ex) {
        Logger.getLogger(DocumentController.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:uk.org.openeyes.DICOMCommonFunctions.java

/**
 * Extracts the text of a PDF held in memory.
 *
 * <p>When {@code debug} is set, the PDF content is additionally dumped through
 * {@code PDFFunctions.dumpPDFContent}. Failures are printed to standard error and result in
 * whatever text was extracted so far (the empty string if nothing was).
 *
 * @param binData the raw bytes of the PDF
 * @return the extracted text, or the empty string if parsing failed
 */
protected String parsePDFData(byte[] binData) {

    PDDocument pdDoc = null;
    String parsedText = "";

    RandomAccessRead pdfData = new RandomAccessBuffer(binData);
    try {
        PDFParser pdfParser = new PDFParser(pdfData);
        pdfParser.parse();
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdDoc = pdfParser.getPDDocument();
        if (debug) {
            PDFFunctions PDFHelper = new PDFFunctions();
            debugMessage("<<<<<<< PDFDATA >>>>>>>>\n");
            PDFHelper.dumpPDFContent(pdDoc);
            debugMessage("<<<<<<< /PDFDATA >>>>>>>>\n");
        }
        parsedText = pdfStripper.getText(pdDoc);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on success as well as on failure; previously the document was only
        // closed when an exception had been thrown.
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        }
    }
    return parsedText;
}

From source file:uk.org.openeyes.DICOMHFAVF.java

/**
 * Dumps the encapsulated PDF and the HFA private groups of a DICOM attribute set.
 *
 * <p>If tag {@code 00420011} is present, its bytes are written to a {@code .pdf} file named
 * after tag {@code 00420010}, and the text of that PDF is printed and passed to
 * {@code debugMessage}. Afterwards either the {@code 0301} group or, failing that, the
 * {@code 7717} group is reported through {@code debugMessage}. For the {@code 0301} group the
 * raw data of tag {@code 03011008} is filtered, written to an XML file, parsed, and the text of
 * its first {@code xio:hfa_II_serial_binhex} element is handed to
 * {@code extractEncapsulatedBinData}.
 *
 * <p>NOTE(review): both output files go to a hard-coded absolute Windows directory; this looks
 * like developer test scaffolding and should be made configurable.
 *
 * @param Attrs the DICOM attributes to inspect
 * @throws IOException if the encapsulated PDF cannot be written to disk
 */
public void collectData(Attributes Attrs) throws IOException {

    if (Attrs.contains(parser.getTagInteger("00420011"))) {
        byte[] pdfbytes = Attrs.getBytes(parser.getTagInteger("00420011"));

        // The stream is closed even if the write fails.
        try (FileOutputStream pdffos = new FileOutputStream("d:\\work\\wombex\\WombexUK\\AcrossHealth\\"
                + Attrs.getString(parser.getTagInteger("00420010")) + ".pdf")) {
            pdffos.write(pdfbytes);
        }

        PDDocument pdDoc = null;
        COSDocument cosDoc = null;

        RandomAccessRead pdfData = new RandomAccessBuffer(pdfbytes);
        try {
            // Named pdfParser so it does not shadow the DICOM tag parser used everywhere else.
            PDFParser pdfParser = new PDFParser(pdfData);
            pdfParser.parse();
            cosDoc = pdfParser.getDocument();
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdDoc = new PDDocument(cosDoc);
            String parsedText = pdfStripper.getText(pdDoc);
            System.out.println(parsedText);
            debugMessage(parsedText);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the PDF on success as well as on failure; previously it was only
            // closed when an exception had been thrown.
            try {
                if (pdDoc != null) {
                    pdDoc.close();
                } else if (cosDoc != null) {
                    cosDoc.close();
                }
            } catch (Exception e1) {
                e1.printStackTrace();
            }
        }

    }

    // this is just a test function for HFA files
    // 2 main groups: 7717 and 0301 can be extracted
    if (Attrs.contains(parser.getTagInteger("03010010"))) {
        debugMessage("Extracting 0301 group...");
        debugMessage("Test Type: " + Attrs.getString(parser.getTagInteger("03011000")));
        debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("03011001")));
        debugMessage("Test Pattern: " + Attrs.getString(parser.getTagInteger("03011002")));
        debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("03011003")));

        debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("03011004")));
        debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("03011005")));
        debugMessage("Blue Yellow: " + Attrs.getString(parser.getTagInteger("03011006")));
        debugMessage("PDB Version: " + Attrs.getString(parser.getTagInteger("03011007")));

        debugMessage("HFA Raw Data: ");

        try {
            byte[] rawbytes = Attrs.getBytes(parser.getTagInteger("03011008"));
            // TODO: move this part into a function, and this is a hack now!!!
            // Copies every byte except the last one, dropping all 0x04 bytes.
            // NOTE(review): the target is sized length - 2, so this presumably relies on the
            // data containing at least one 0x04 byte and being at least two bytes long;
            // otherwise it throws. Confirm against the device format.
            byte[] correctedBytes = new byte[rawbytes.length - 2];
            int j = 0;
            for (int i = 0; i < rawbytes.length - 1; i++) {
                if (rawbytes[i] != 0x04) {
                    correctedBytes[j] = rawbytes[i];
                    j++;
                }
            }

            try (FileOutputStream fos = new FileOutputStream(
                    "d:\\work\\wombex\\WombexUK\\AcrossHealth\\byte_test_new.xml")) {
                fos.write(correctedBytes);
            }

            // NOTE(review): the factory is used with default settings, so DTDs and external
            // entities in the embedded XML are processed; consider hardening it if the
            // DICOM input is not trusted.
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder;
            Document doc = null;
            String encapsulatedBinaryData = "";
            try {
                dBuilder = dbFactory.newDocumentBuilder();
                try {
                    doc = dBuilder.parse(new ByteArrayInputStream(correctedBytes));
                    NodeList binaryNodes;
                    binaryNodes = doc.getElementsByTagName("xio:hfa_II_serial_binhex");
                    if (binaryNodes.getLength() > 0) {
                        encapsulatedBinaryData = binaryNodes.item(0).getTextContent();
                    }

                } catch (SAXException ex) {
                    Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex);
                }
            } catch (ParserConfigurationException ex) {
                Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex);
            }

            debugMessage(encapsulatedBinaryData);

            extractEncapsulatedBinData(encapsulatedBinaryData);

        } catch (IOException ex) {
            Logger.getLogger(DICOMParser.class.getName()).log(Level.SEVERE, null, ex);
        }

        debugMessage(Attrs.getString(parser.getTagInteger("03011008")));

    } else if (Attrs.contains(parser.getTagInteger("77170010"))) {
        debugMessage("Extracting 7717 group...");
        debugMessage("Test name: " + Attrs.getString(parser.getTagInteger("77171001")));
        debugMessage("Test strategy: " + Attrs.getString(parser.getTagInteger("77171002")));
        debugMessage("Stimulus Size: " + Attrs.getString(parser.getTagInteger("77171003")));
        debugMessage("Stimulus Color: " + Attrs.getString(parser.getTagInteger("77171004")));

        debugMessage("Background State: " + Attrs.getString(parser.getTagInteger("77171005")));
        debugMessage("Foveal Result: " + Attrs.getString(parser.getTagInteger("77171006")));
        debugMessage("Screening Mode: " + Attrs.getString(parser.getTagInteger("77171007")));
        debugMessage("Fixation Trials: " + Attrs.getString(parser.getTagInteger("77171008")));
        debugMessage("Fixation Errors: " + Attrs.getString(parser.getTagInteger("77171009")));

        debugMessage("False Positive Percent: " + Attrs.getString(parser.getTagInteger("77171010")));
        debugMessage("False Positive Trials : " + Attrs.getString(parser.getTagInteger("77171011")));
        debugMessage("False Positive Errors: " + Attrs.getString(parser.getTagInteger("77171012")));
        debugMessage("False Negative Percent: " + Attrs.getString(parser.getTagInteger("77171013")));
        debugMessage("False Negative Trials : " + Attrs.getString(parser.getTagInteger("77171014")));
        debugMessage("False Negative Errors: " + Attrs.getString(parser.getTagInteger("77171015")));
        debugMessage("Mean Deviation: " + Attrs.getString(parser.getTagInteger("77171016")));
        debugMessage("Mean Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171017")));
        debugMessage("Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171018")));
        debugMessage(
                "Pattern Standard Deviation Probability: " + Attrs.getString(parser.getTagInteger("77171019")));
        debugMessage("Short Term Fluctuation: " + Attrs.getString(parser.getTagInteger("77171020")));
        debugMessage(
                "Corrected Pattern Standard Deviation: " + Attrs.getString(parser.getTagInteger("77171021")));
        debugMessage("Corrected Pattern Standard Deviation Probability: "
                + Attrs.getString(parser.getTagInteger("77171022")));
        debugMessage("Glaucoma Hemifield Test: " + Attrs.getString(parser.getTagInteger("77171023")));

        debugMessage("Fixation Monitor: " + Attrs.getString(parser.getTagInteger("77171024")));
        debugMessage("Fixation Target: " + Attrs.getString(parser.getTagInteger("77171025")));
        debugMessage("Pupil Diameter (in mm): " + Attrs.getString(parser.getTagInteger("77171026")));
        debugMessage("Sphere: " + Attrs.getString(parser.getTagInteger("77171027")));
        debugMessage("Cylinder: " + Attrs.getString(parser.getTagInteger("77171028")));
        debugMessage("Axis: " + Attrs.getString(parser.getTagInteger("77171029")));
        debugMessage("Visual Acuity: " + Attrs.getString(parser.getTagInteger("77171030")));
        debugMessage("Short Term Fluctuation Probabilit: " + Attrs.getString(parser.getTagInteger("77171031")));
        debugMessage("Visual Field Index: " + Attrs.getString(parser.getTagInteger("77171034")));
        debugMessage("VFM Sequence:");
        if (Attrs.contains(parser.getTagInteger("77171040"))) {
            Sequence Seq = Attrs.getSequence(parser.getTagInteger("77171040"));
            for (int sq = 0; sq < Seq.size(); sq++) {
                Attributes AttrData = (Attributes) Seq.get(sq);
                debugMessage("> Private creator: " + AttrData.getString(parser.getTagInteger("77170010")));
                debugMessage(">> Section Number: " + AttrData.getString(parser.getTagInteger("77171041")));
                debugMessage(">> Section Value: " + AttrData.getString(parser.getTagInteger("77171042")));
            }
        }
    }
}