Usage examples for org.apache.pdfbox.text.PDFTextStripper#setSortByPosition
public void setSortByPosition(boolean newSortByPosition)
From source file:org.dspace.app.rest.BitstreamContentRestControllerIT.java
License:BSD License
private String extractPDFText(byte[] content) throws IOException { PDFTextStripper pts = new PDFTextStripper(); pts.setSortByPosition(true); try (ByteArrayInputStream source = new ByteArrayInputStream(content); Writer writer = new StringWriter(); PDDocument pdfDoc = PDDocument.load(source)) { pts.writeText(pdfDoc, writer);//ww w. jav a 2 s . com return writer.toString(); } }
From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java
License:Open Source License
private Document parseInternal(final InputStream is) throws Exception { m_currentDoc = new DocumentBuilder(m_tokenizerName); m_currentDoc.setDocumentFile(new File(m_docPath)); m_currentDoc.setDocumentType(m_type); m_currentDoc.addDocumentCategory(m_category); m_currentDoc.addDocumentSource(m_source); if (m_charset == null) { m_charset = Charset.defaultCharset(); }//from w w w .j av a2 s . c o m PDDocument document = null; try { document = PDDocument.load(is); // extract text from pdf PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(true); String text = stripper.getText(document); m_currentDoc.addSection(text, SectionAnnotation.UNKNOWN); // extract meta data from pdf String title = null; String authors = null; if (m_filenameAsTitle) { title = m_docPath.toString().trim(); } PDDocumentInformation information = document.getDocumentInformation(); if (information != null) { if (!checkTitle(title)) { title = information.getTitle(); } authors = information.getAuthor(); } // if title meta data does not exist use first sentence if (!checkTitle(title)) { List<Section> sections = m_currentDoc.getSections(); if (sections.size() > 0) { try { title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim(); } catch (IndexOutOfBoundsException e) { LOGGER.debug("Parsed PDF document " + m_docPath + " is empty."); title = ""; } } } // if no useful first sentence exist use filename if (!checkTitle(title)) { title = m_docPath.toString().trim(); } m_currentDoc.addTitle(title); // use author meta data if (authors != null) { Set<Author> authSet = AuthorUtil.parseAuthors(authors); for (Author a : authSet) { m_currentDoc.addAuthor(a); } } // add document to list return m_currentDoc.createDocument(); } finally { if (document != null) { document.close(); } } }
From source file:uk.org.openeyes.PDFFunctions.java
/** * * @param PDFDoc/*w w w . jav a2 s .c om*/ * @throws IOException */ public void dumpPDFStructure(PDDocument PDFDoc) throws IOException { PDFTextStripper stripper = new PDFFunctions(); stripper.setSortByPosition(true); stripper.setStartPage(0); stripper.setEndPage(PDFDoc.getNumberOfPages()); Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()); stripper.writeText(PDFDoc, dummy); }