Example usage for org.apache.pdfbox.text PDFTextStripper setSortByPosition

Introduction

On this page you can find example usages of org.apache.pdfbox.text PDFTextStripper setSortByPosition.

Prototype

public void setSortByPosition(boolean newSortByPosition)

Source Link

Document

The order of the text tokens in a PDF file may not be the same as the order in which they appear visually on the screen.

Usage

From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java

License:BSD License

/**
 * Extracts the text of a PDF held in memory, with position sorting enabled on the stripper.
 *
 * @param content raw bytes of the PDF
 * @return the text written by the stripper
 * @throws IOException propagated from stripper creation, document loading or text writing
 */
private String extractPDFText(byte[] content) throws IOException {
    final PDFTextStripper textStripper = new PDFTextStripper();
    textStripper.setSortByPosition(true);

    // Resources are declared in this order so that they close in reverse: document, writer, stream.
    try (ByteArrayInputStream pdfBytes = new ByteArrayInputStream(content);
            Writer extracted = new StringWriter();
            PDDocument parsedPdf = PDDocument.load(pdfBytes)) {
        textStripper.writeText(parsedPdf, extracted);
        final String text = extracted.toString();
        return text;
    }
}

From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java

License:Open Source License

/**
 * Parses a PDF read from the given stream into a {@code Document}.
 *
 * <p>The position-sorted text of the whole PDF is added as one section annotated
 * {@code SectionAnnotation.UNKNOWN}. Title candidates are tried in this order, each one being
 * kept only if {@code checkTitle} accepts it: the document path (only when
 * {@code m_filenameAsTitle} is set), the title from the PDF document information, the first
 * sentence of the parsed sections, and finally the document path. The author string from the
 * PDF document information, when present, is parsed with {@code AuthorUtil.parseAuthors} and
 * each resulting author is added to the document.
 *
 * @param is stream to load the PDF from
 * @return the document created by the document builder
 * @throws Exception propagated from PDF loading, text extraction or document building
 */
private Document parseInternal(final InputStream is) throws Exception {
    m_currentDoc = new DocumentBuilder(m_tokenizerName);
    m_currentDoc.setDocumentFile(new File(m_docPath));
    m_currentDoc.setDocumentType(m_type);
    m_currentDoc.addDocumentCategory(m_category);
    m_currentDoc.addDocumentSource(m_source);

    if (m_charset == null) {
        m_charset = Charset.defaultCharset();
    }

    // try-with-resources closes the PDF on every path and, unlike a close() in a finally
    // block, does not let a failure during close() replace an exception thrown while parsing.
    try (PDDocument document = PDDocument.load(is)) {

        // extract text from pdf
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        String text = stripper.getText(document);
        m_currentDoc.addSection(text, SectionAnnotation.UNKNOWN);

        // extract meta data from pdf
        String title = null;
        String authors = null;

        if (m_filenameAsTitle) {
            title = m_docPath.toString().trim();
        }

        PDDocumentInformation information = document.getDocumentInformation();
        if (information != null) {
            if (!checkTitle(title)) {
                title = information.getTitle();
            }
            authors = information.getAuthor();
        }

        // if title meta data does not exist use first sentence
        if (!checkTitle(title)) {
            List<Section> sections = m_currentDoc.getSections();
            if (!sections.isEmpty()) {
                try {
                    title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim();
                } catch (IndexOutOfBoundsException e) {
                    // first section has no paragraph or no sentence: treat the PDF as empty
                    LOGGER.debug("Parsed PDF document " + m_docPath + " is empty.");
                    title = "";
                }
            }
        }
        // if no useful first sentence exist use filename
        if (!checkTitle(title)) {
            title = m_docPath.toString().trim();
        }
        m_currentDoc.addTitle(title);

        // use author meta data
        if (authors != null) {
            Set<Author> authSet = AuthorUtil.parseAuthors(authors);
            for (Author a : authSet) {
                m_currentDoc.addAuthor(a);
            }
        }

        // add document to list
        return m_currentDoc.createDocument();
    }
}

From source file:uk.org.openeyes.PDFFunctions.java

/**
 *
 * @param PDFDoc/*w  w w .  jav a2 s  .c om*/
 * @throws IOException
 */
public void dumpPDFStructure(PDDocument PDFDoc) throws IOException {
    PDFTextStripper stripper = new PDFFunctions();
    stripper.setSortByPosition(true);
    stripper.setStartPage(0);
    stripper.setEndPage(PDFDoc.getNumberOfPages());
    Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
    stripper.writeText(PDFDoc, dummy);

}