List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation
public PDDocumentInformation getDocumentInformation()
From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java
License:Open Source License
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException { try {/*w w w. j a v a 2 s. c o m*/ return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() { public Properties run() throws Exception { if (is == null) { throw new IllegalArgumentException("InputStream is null."); } PDDocument pdDocument = PDDocument.load(is); Properties props = new Properties(); try { if (pdDocument.isEncrypted()) { try { pdDocument.decrypt(""); } catch (InvalidPasswordException e) { throw new DocumentReadException("The pdf document is encrypted.", e); } catch (org.apache.pdfbox.exceptions.CryptographyException e) { throw new DocumentReadException(e.getMessage(), e); } } PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDMetadata meta = catalog.getMetadata(); if (meta != null) { XMPMetadata metadata = meta.exportXMPMetadata(); XMPSchemaDublinCore dc = metadata.getDublinCoreSchema(); if (dc != null) { try { if (dc.getTitle() != null) props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle())); } catch (Exception e) { LOG.warn("getTitle failed: " + e.getMessage()); } try { if (dc.getDescription() != null) props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription())); } catch (Exception e) { LOG.warn("getSubject failed: " + e.getMessage()); } try { if (dc.getCreators() != null) { for (String creator : dc.getCreators()) { props.put(DCMetaData.CREATOR, fixEncoding(creator)); } } } catch (Exception e) { LOG.warn("getCreator failed: " + e.getMessage()); } try { if (dc.getDates() != null) { for (Calendar date : dc.getDates()) { props.put(DCMetaData.DATE, date); } } } catch (Exception e) { LOG.warn("getDate failed: " + e.getMessage()); } } XMPSchemaPDF pdf = metadata.getPDFSchema(); if (pdf != null) { try { if (pdf.getKeywords() != null) props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords())); } catch (Exception e) { LOG.warn("getKeywords failed: " + e.getMessage()); } try { if (pdf.getProducer() != 
null) props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer())); } catch (Exception e) { LOG.warn("getProducer failed: " + e.getMessage()); } } XMPSchemaBasic basic = metadata.getBasicSchema(); if (basic != null) { try { if (basic.getCreateDate() != null) props.put(DCMetaData.DATE, basic.getCreateDate()); } catch (Exception e) { LOG.warn("getCreationDate failed: " + e.getMessage()); } try { if (basic.getModifyDate() != null) props.put(DCMetaData.DATE, basic.getModifyDate()); } catch (Exception e) { LOG.warn("getModificationDate failed: " + e.getMessage()); } // DCMetaData.PUBLISHER - basic.getCreatorTool() } } if (props.isEmpty()) { // The pdf doesn't contain any metadata, try to use the document // information instead PDDocumentInformation docInfo = pdDocument.getDocumentInformation(); if (docInfo != null) { try { if (docInfo.getAuthor() != null) props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor()); } catch (Exception e) { LOG.warn("getAuthor failed: " + e.getMessage()); } try { if (docInfo.getCreationDate() != null) props.put(DCMetaData.DATE, docInfo.getCreationDate()); } catch (Exception e) { LOG.warn("getCreationDate failed: " + e.getMessage()); } try { if (docInfo.getCreator() != null) props.put(DCMetaData.CREATOR, docInfo.getCreator()); } catch (Exception e) { LOG.warn("getCreator failed: " + e.getMessage()); } try { if (docInfo.getKeywords() != null) props.put(DCMetaData.SUBJECT, docInfo.getKeywords()); } catch (Exception e) { LOG.warn("getKeywords failed: " + e.getMessage()); } try { if (docInfo.getModificationDate() != null) props.put(DCMetaData.DATE, docInfo.getModificationDate()); } catch (Exception e) { LOG.warn("getModificationDate failed: " + e.getMessage()); } try { if (docInfo.getProducer() != null) props.put(DCMetaData.PUBLISHER, docInfo.getProducer()); } catch (Exception e) { LOG.warn("getProducer failed: " + e.getMessage()); } try { if (docInfo.getSubject() != null) props.put(DCMetaData.DESCRIPTION, docInfo.getSubject()); } catch 
(Exception e) { LOG.warn("getSubject failed: " + e.getMessage()); } try { if (docInfo.getTitle() != null) props.put(DCMetaData.TITLE, docInfo.getTitle()); } catch (Exception e) { LOG.warn("getTitle failed: " + e.getMessage()); } // docInfo.getTrapped(); } } } finally { if (pdDocument != null) { pdDocument.close(); } if (is != null) { try { is.close(); } catch (IOException e) { if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } } } return props; } }); } catch (PrivilegedActionException pae) { Throwable cause = pae.getCause(); if (cause instanceof IOException) { throw (IOException) cause; } else if (cause instanceof RuntimeException) { throw (RuntimeException) cause; } else { throw new RuntimeException(cause); } } }
From source file:org.fit.pdfdom.PDFDomTree.java
License:Open Source License
@Override protected void endDocument(PDDocument document) throws IOException { //use the PDF title String doctitle = document.getDocumentInformation().getTitle(); if (doctitle != null && doctitle.trim().length() > 0) title.setTextContent(doctitle);/*w w w .jav a 2 s . com*/ //set the main style globalStyle.setTextContent(createGlobalStyle()); }
From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java
License:Open Source License
private Document parseInternal(final InputStream is) throws Exception { m_currentDoc = new DocumentBuilder(m_tokenizerName); m_currentDoc.setDocumentFile(new File(m_docPath)); m_currentDoc.setDocumentType(m_type); m_currentDoc.addDocumentCategory(m_category); m_currentDoc.addDocumentSource(m_source); if (m_charset == null) { m_charset = Charset.defaultCharset(); }/* w w w.j a v a 2 s . c o m*/ PDDocument document = null; try { document = PDDocument.load(is); // extract text from pdf PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(true); String text = stripper.getText(document); m_currentDoc.addSection(text, SectionAnnotation.UNKNOWN); // extract meta data from pdf String title = null; String authors = null; if (m_filenameAsTitle) { title = m_docPath.toString().trim(); } PDDocumentInformation information = document.getDocumentInformation(); if (information != null) { if (!checkTitle(title)) { title = information.getTitle(); } authors = information.getAuthor(); } // if title meta data does not exist use first sentence if (!checkTitle(title)) { List<Section> sections = m_currentDoc.getSections(); if (sections.size() > 0) { try { title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim(); } catch (IndexOutOfBoundsException e) { LOGGER.debug("Parsed PDF document " + m_docPath + " is empty."); title = ""; } } } // if no useful first sentence exist use filename if (!checkTitle(title)) { title = m_docPath.toString().trim(); } m_currentDoc.addTitle(title); // use author meta data if (authors != null) { Set<Author> authSet = AuthorUtil.parseAuthors(authors); for (Author a : authSet) { m_currentDoc.addAuthor(a); } } // add document to list return m_currentDoc.createDocument(); } finally { if (document != null) { document.close(); } } }
From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. *///from ww w .ja v a2 s .c om private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } else { stripper.resetEngine(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
addField(document, "contents", contents); addField(document, "stemmedcontents", contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addField(document, "Author", info.getAuthor()); try { addField(document, "CreationDate", info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Creator", info.getCreator()); addField(document, "Keywords", info.getKeywords()); try { addField(document, "ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Producer", info.getProducer()); addField(document, "Subject", info.getSubject()); addField(document, "Title", info.getTitle()); addField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addField(document, "summary", summary); addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages())); } catch (CryptographyException e) { throw new IOException("Error decrypting document(" + documentLocation + "): " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. throw new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed."); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.mitre.xtext.converters.PDFConverter.java
License:Apache License
/** Implementation is informed by PDFBox authors. *//* w w w . ja v a 2 s . c o m*/ @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes with * the lucene project. This class will populate the following fields. 
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the Lucene * document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Creator</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Keywords</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>ModificationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Producer</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Subject</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Trapped</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Encrypted</td> <td>From PDF * meta-data if available</td> </tr> </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on /** * * Exception in thread "main" java.lang.NoClassDefFoundError: * org/bouncycastle/jce/provider/BouncyCastleProvider at * org.apache.pdfbox.pdmodel.PDDocument.openProtection(PDDocument.java:1090) * at * org.apache.pdfbox.pdmodel.PDDocument.decrypt(PDDocument.java:594) * * CRYPTO stuff -- load BouncyCastle crypto JAR files. 
try { * pdfDocument.decrypt(""); } catch (CryptographyException e) { * throw new IOException("Error decrypting document(" + pdf_file * + "): " + e); } catch (InvalidPasswordException e) { //they * didn't suppply a password and the default of "" was wrong. * throw new IOException( "Error: The document(" + pdf_file + ") * is encrypted "); } finally { if (pdfDocument != null) { * pdfDocument.close();} } */ textdoc.addProperty("encrypted", "YES"); } else { //create a writer where to append the text content. StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setPayload(writer.getBuffer().toString()); } return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.mycore.media.MCRMediaPDFParser.java
License:Open Source License
/**
 * Parse file and store metadata in related Object.
 *
 * <p>Recorded are the file name, file size, containing folder, page count, the media box
 * size of the first page (rounded) and, when document information is present, the
 * author, creator, producer, title, subject and keywords tags.
 *
 * @param file the PDF file to parse
 * @return MCRMediaObject
 *              can be held any MCRMediaObject
 * @throws IOException if the file does not exist
 * @throws Exception if reading the document fails; the original failure is attached
 *              as cause
 * @see MCRMediaObject#clone()
 */
@SuppressWarnings("unchecked")
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists()) {
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");
    }

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        media.fileName = file.getName();
        media.fileSize = file.length();

        // Cut only the trailing file name off the absolute path. A plain replace() would
        // also remove the name wherever it occurs inside a parent directory name.
        String absolutePath = file.getAbsolutePath();
        media.folderName = absolutePath.substring(0, absolutePath.length() - file.getName().length());

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        // Width and height are taken from the first page only.
        PDPage page = (PDPage) pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        LOGGER.error(e.getMessage());
        // Keep the original exception as cause so its type and stack trace survive.
        throw new Exception(e.getMessage(), e);
    } finally {
        pdf.close();
    }

    return media;
}
From source file:org.nuxeo.pdf.PDFUtils.java
License:Open Source License
/**
 * Convenience method to set title, subject and author in one call. A value that is
 * {@code null} or {@code ""} leaves the corresponding field unmodified. When all three
 * values are null or empty, the document is not touched at all.
 *
 * @param inPdfDoc the document whose information is updated
 * @param inTitle the new title, or null/empty to keep the current one
 * @param inSubject the new subject, or null/empty to keep the current one
 * @param inAuthor the new author, or null/empty to keep the current one
 */
public static void setInfos(PDDocument inPdfDoc, String inTitle, String inSubject, String inAuthor) {
    final boolean titleGiven = inTitle != null && !inTitle.isEmpty();
    final boolean subjectGiven = inSubject != null && !inSubject.isEmpty();
    final boolean authorGiven = inAuthor != null && !inAuthor.isEmpty();

    // Nothing to change: do not even fetch the document information.
    if (!titleGiven && !subjectGiven && !authorGiven) {
        return;
    }

    final PDDocumentInformation info = inPdfDoc.getDocumentInformation();
    if (titleGiven) {
        info.setTitle(inTitle);
    }
    if (subjectGiven) {
        info.setSubject(inSubject);
    }
    if (authorGiven) {
        info.setAuthor(inAuthor);
    }
    inPdfDoc.setDocumentInformation(info);
}
From source file:org.nuxeo.pdf.test.PDFPageExtractorTest.java
License:Open Source License
@Test public void testExtractPages_WithSetInfo() throws Exception { Blob extracted;//from ww w . j a v a 2 s. c o m String originalName = pdfFileBlob.getFilename().replace(".pdf", ""); PDFPageExtractor pe = new PDFPageExtractor(pdfFileBlob); extracted = pe.extract(5, 9, null, "One Upon a Time", "Fairyland", "Cool Author"); assertTrue(extracted instanceof FileBlob); assertEquals(originalName + "-5-9.pdf", extracted.getFilename()); PDDocument doc = PDDocument.load(extracted.getStream()); utils.track(doc); PDDocumentInformation docInfo = doc.getDocumentInformation(); assertEquals("One Upon a Time", docInfo.getTitle()); assertEquals("Fairyland", docInfo.getSubject()); assertEquals("Cool Author", docInfo.getAuthor()); doc.close(); utils.untrack(doc); }
From source file:org.nuxeo.pdf.test.PDFUtilsTest.java
License:Open Source License
@Test public void test_setInfos() throws Exception { PDDocument doc = PDDocument.load(pdfFile); utils.track(doc);// www.j a v a 2 s.c o m PDDocumentInformation docInfoOriginal = doc.getDocumentInformation(); // Check original document has the expected values assertEquals("Untitled 3", docInfoOriginal.getTitle()); assertNull(docInfoOriginal.getSubject()); assertNull(docInfoOriginal.getAuthor()); // Now, modify // First, actually, don't modify PDFUtils.setInfos(doc, null, "", null); PDDocumentInformation newDocInfo = doc.getDocumentInformation(); assertEquals(docInfoOriginal.getTitle(), newDocInfo.getTitle()); assertEquals(docInfoOriginal.getSubject(), newDocInfo.getSubject()); assertEquals(docInfoOriginal.getAuthor(), newDocInfo.getAuthor()); // Now, modify PDFUtils.setInfos(doc, "The Title", "The Subject", "The Author"); newDocInfo = doc.getDocumentInformation(); assertEquals("The Title", newDocInfo.getTitle()); assertEquals("The Subject", newDocInfo.getSubject()); assertEquals("The Author", newDocInfo.getAuthor()); doc.close(); utils.untrack(doc); }
From source file:org.olat.search.service.document.file.pdf.PdfBoxExtractor.java
License:Apache License
/**
 * Returns the title stored in the document information.
 *
 * @param document the PDF document, may be {@code null}
 * @return the title, or {@code null} when the document or its document information
 *         is {@code null}
 */
private String getTitle(PDDocument document) {
    if (document == null || document.getDocumentInformation() == null) {
        return null;
    }
    return document.getDocumentInformation().getTitle();
}