Example usage for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation

List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.getDocumentInformation().

Prototype

public PDDocumentInformation getDocumentInformation() 

Source Link

Document

This will get the document info dictionary.

Usage

From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java

License:Open Source License

/**
 * Reads metadata properties from a PDF supplied as a stream.
 *
 * <p>The XMP metadata of the document catalog is consulted first (Dublin Core,
 * PDF and Basic schemas). Only when that yields no property at all is the
 * document information dictionary used instead. Encrypted documents are
 * decrypted with the empty password. Several values are stored under the same
 * key (for example {@code DCMetaData.DATE}); a later {@code put} replaces an
 * earlier one.
 *
 * <p>The stream is always closed before this method returns, including when
 * loading the document fails.
 *
 * @param is the PDF content; must not be {@code null}
 * @return the collected properties, possibly empty
 * @throws IOException if the privileged action failed with an {@code IOException}
 * @throws DocumentReadException declared by the signature; NOTE(review): the
 *         catch block below only rethrows {@code IOException} and
 *         {@code RuntimeException} causes as-is, every other cause is wrapped in a
 *         {@code RuntimeException} - confirm how {@code DocumentReadException}
 *         is expected to reach callers
 * @throws IllegalArgumentException if {@code is} is {@code null}
 */
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }

                PDDocument pdDocument = null;
                Properties props = new Properties();
                try {
                    // Load inside the try so that the finally block still closes the
                    // stream when the document cannot be parsed.
                    pdDocument = PDDocument.load(is);

                    if (pdDocument.isEncrypted()) {
                        try {
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }

                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();

                        // Each getter is isolated in its own try/catch so that one
                        // unreadable field does not prevent reading the others.
                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null)
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null)
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getCreators() != null) {
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getDates() != null) {
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }

                            try {
                                if (pdf.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            try {
                                if (basic.getCreateDate() != null)
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (basic.getModifyDate() != null)
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }

                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }

                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();

                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null)
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null)
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {

                                if (docInfo.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getModificationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null)
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null)
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }

                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    try {
                        if (pdDocument != null) {
                            pdDocument.close();
                        }
                    } finally {
                        // Nested finally: the stream is closed even when closing the
                        // document itself throws. A failure to close the stream is
                        // only traced, never propagated.
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });

    } catch (PrivilegedActionException pae) {
        // Unwrap the exception thrown inside the privileged action.
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}

From source file:org.fit.pdfdom.PDFDomTree.java

License:Open Source License

/**
 * Finishes the document: copies a non-blank PDF title into the title element
 * and fills in the global style element.
 *
 * @param document the PDF document that has just been processed
 * @throws IOException declared by the overridden method
 */
@Override
protected void endDocument(PDDocument document) throws IOException {
    // Prefer the title stored in the PDF itself, when it has visible content.
    final String pdfTitle = document.getDocumentInformation().getTitle();
    final boolean hasVisibleTitle = pdfTitle != null && !pdfTitle.trim().isEmpty();
    if (hasVisibleTitle) {
        title.setTextContent(pdfTitle);
    }

    // Install the main style sheet content.
    globalStyle.setTextContent(createGlobalStyle());
}

From source file:org.knime.ext.textprocessing.nodes.source.parser.pdf.PDFDocumentParser.java

License:Open Source License

/**
 * Parses a PDF stream into a {@code Document}: the extracted text becomes a
 * single section, and title and authors are taken from the PDF document
 * information when available.
 *
 * <p>Title resolution order: the file path (only if {@code m_filenameAsTitle}
 * is set), the PDF title, the first sentence of the extracted text, and
 * finally the file path again as a last resort. A candidate is accepted only
 * if {@code checkTitle} approves it.
 *
 * @param is the stream to read the PDF from
 * @return the document created by the current builder
 * @throws Exception if loading or text extraction fails
 */
private Document parseInternal(final InputStream is) throws Exception {
    m_currentDoc = new DocumentBuilder(m_tokenizerName);
    m_currentDoc.setDocumentFile(new File(m_docPath));
    m_currentDoc.setDocumentType(m_type);
    m_currentDoc.addDocumentCategory(m_category);
    m_currentDoc.addDocumentSource(m_source);

    if (m_charset == null) {
        m_charset = Charset.defaultCharset();
    }

    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // Text extraction, sorted by position on the page.
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setSortByPosition(true);
        m_currentDoc.addSection(textStripper.getText(pdf), SectionAnnotation.UNKNOWN);

        // Metadata: start with the file name if configured to do so.
        String title = m_filenameAsTitle ? m_docPath.toString().trim() : null;
        String authorNames = null;

        PDDocumentInformation pdfInfo = pdf.getDocumentInformation();
        if (pdfInfo != null) {
            if (!checkTitle(title)) {
                title = pdfInfo.getTitle();
            }
            authorNames = pdfInfo.getAuthor();
        }

        // No usable title yet: fall back to the first sentence of the text.
        if (!checkTitle(title)) {
            List<Section> parsedSections = m_currentDoc.getSections();
            if (!parsedSections.isEmpty()) {
                try {
                    title = parsedSections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim();
                } catch (IndexOutOfBoundsException e) {
                    LOGGER.debug("Parsed PDF document " + m_docPath + " is empty.");
                    title = "";
                }
            }
        }

        // Still nothing usable: fall back to the file name.
        if (!checkTitle(title)) {
            title = m_docPath.toString().trim();
        }
        m_currentDoc.addTitle(title);

        // Register every author parsed from the author metadata string.
        if (authorNames != null) {
            for (Author parsedAuthor : AuthorUtil.parseAuthors(authorNames)) {
                m_currentDoc.addAuthor(parsedAuthor);
            }
        }

        return m_currentDoc.createDocument();
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java

License:Apache License

/**
 * This will add the contents to the lucene document.
 *
 * <p>Besides the extracted text ("contents", "stemmedcontents", "summary") the
 * fields of the PDF document information and the page count are added. A bad
 * creation or modification date is skipped; indexing continues.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 * @param stripper The text stripper to reuse. If it is null a new
 *            PDFTextStripper is created, otherwise it is reset before use.
 *
 * @throws IOException If there is an error parsing the document, or if the
 *             document is encrypted and cannot be opened with the empty password.
 */
private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper)
        throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            pdfDocument.decrypt("");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        } else {
            stripper.resetEngine();
        }
        stripper.writeText(pdfDocument, writer);

        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        String contents = writer.getBuffer().toString();
        // Add the tag-stripped contents as a Reader-valued Text field so it will
        // get tokenized and indexed.
        addField(document, "contents", contents);

        addField(document, "stemmedcontents", contents);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            addField(document, "Author", info.getAuthor());
            try {
                addField(document, "CreationDate", info.getCreationDate());
            } catch (IOException ignored) {
                //ignore, bad date but continue with indexing
            }
            addField(document, "Creator", info.getCreator());
            addField(document, "Keywords", info.getKeywords());
            try {
                addField(document, "ModificationDate", info.getModificationDate());
            } catch (IOException ignored) {
                //ignore, bad date but continue with indexing
            }
            addField(document, "Producer", info.getProducer());
            addField(document, "Subject", info.getSubject());
            addField(document, "Title", info.getTitle());
            addField(document, "Trapped", info.getTrapped());
        }
        int summarySize = Math.min(contents.length(), 500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        addField(document, "summary", summary);
        addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages()));
    } catch (CryptographyException e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new IOException("Error decrypting document(" + documentLocation + "): " + e, e);
    } catch (InvalidPasswordException e) {
        //they didn't supply a password and the default of "" was wrong.
        throw new IOException(
                "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:org.mitre.xtext.converters.PDFConverter.java

License:Apache License

/**
 * Converts a PDF file into a {@code ConvertedDocument}. Implementation is
 * informed by PDFBox authors.
 *
 * <p>Encrypted files are not decrypted: they only get the property
 * {@code encrypted=YES} and no payload. For all other files the extracted
 * text becomes the payload and author, creation date, creator tool, keywords,
 * subject and title are copied from the PDF document information.
 *
 * @param doc the PDF file to convert
 * @return the converted document
 * @throws IOException if the file cannot be loaded or its text cannot be extracted
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /*
     * Adapted from LucenePDFDocument.java from the PDFBox lucene project
     * (author Ben Litchfield, ben@benlitchfield.com, revision 1.23).
     */
    PDDocument pdf = null;
    ConvertedDocument converted = new ConvertedDocument(doc);

    try {
        pdf = PDDocument.load(doc);

        if (pdf.isEncrypted()) {
            // Decryption with the default password is deliberately not attempted:
            // the original author noted that PDDocument.decrypt failed with a
            // NoClassDefFoundError for BouncyCastleProvider unless the BouncyCastle
            // crypto JARs are loaded. The file is only flagged as encrypted.
            converted.addProperty("encrypted", "YES");
            return converted;
        }

        // Extract the text first, then read the metadata.
        StringWriter extractedText = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdf, extractedText);

        PDDocumentInformation meta = pdf.getDocumentInformation();
        if (meta != null) {
            converted.addAuthor(meta.getAuthor());
            try {
                converted.addCreateDate(meta.getCreationDate());
            } catch (IOException io) {
                // A bad date is ignored; conversion continues.
            }
            converted.addProperty("creator_tool", meta.getCreator());
            converted.addProperty("keywords", meta.getKeywords());
            // Modification date, producer and trapped flag are not copied.
            converted.addProperty("subject", meta.getSubject());

            // A missing or placeholder title is replaced by the file name.
            String pdfTitle = meta.getTitle();
            boolean usableTitle = pdfTitle != null && !"untitled".equalsIgnoreCase(pdfTitle);
            converted.addTitle(usableTitle ? pdfTitle : converted.filename);

            // TODO: Character set is what?
            converted.setEncoding("UTF-8");
        }

        converted.setPayload(extractedText.getBuffer().toString());
        return converted;

    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:org.mycore.media.MCRMediaPDFParser.java

License:Open Source License

/**
 * Parse file and store metadata in related Object.
 *
 * <p>Stored values: file name, file size, containing folder, number of pages,
 * the rounded media box size of the first page and, if available, the tags
 * from the PDF document information.
 *
 * @param file
 *              the PDF file to parse
 * @return MCRMediaObject
 *              the populated media object (an MCRPDFObject)
 * @throws Exception
 *              if the file does not exist, cannot be loaded, or reading its
 *              metadata fails; the original failure is attached as the cause
 * @see MCRMediaObject#clone()
 */
@SuppressWarnings("unchecked")
public synchronized MCRMediaObject parse(File file) throws Exception {
    if (!file.exists())
        throw new IOException("File \"" + file.getName() + "\" doesn't exists!");

    MCRPDFObject media = new MCRPDFObject();

    LOGGER.info("parse " + file.getName() + "...");

    PDDocument pdf = PDDocument.load(file);
    try {
        String fileName = file.getName();
        String absolutePath = file.getAbsolutePath();

        media.fileName = fileName;
        media.fileSize = file.length();
        // Cut off only the trailing file name. String.replace would also remove
        // the name wherever it occurs inside a parent directory name.
        media.folderName = absolutePath.endsWith(fileName)
                ? absolutePath.substring(0, absolutePath.length() - fileName.length())
                : absolutePath.replace(fileName, "");

        PDPageTree pages = pdf.getDocumentCatalog().getPages();

        media.numPages = pdf.getNumberOfPages();

        // The dimensions are taken from the first page only.
        PDPage page = (PDPage) pages.get(0);
        PDRectangle rect = page.getMediaBox();

        media.width = Math.round(rect.getWidth());
        media.height = Math.round(rect.getHeight());

        PDDocumentInformation info = pdf.getDocumentInformation();
        if (info != null) {
            media.tags = new MCRMediaTagObject();
            media.tags.author = info.getAuthor();
            media.tags.creator = info.getCreator();
            media.tags.producer = info.getProducer();
            media.tags.title = info.getTitle();
            media.tags.subject = info.getSubject();
            media.tags.keywords = info.getKeywords();
        }
    } catch (Exception e) {
        LOGGER.error(e.getMessage());
        // Chain the original exception so its type and stack trace are preserved.
        throw new Exception(e.getMessage(), e);
    } finally {
        pdf.close();
    }

    return media;
}

From source file:org.nuxeo.pdf.PDFUtils.java

License:Open Source License

/**
 * Convenience method that updates title, subject and author of a PDF in one
 * call. A value that is null or "" leaves the corresponding entry untouched;
 * when all three are null or "", the document is not touched at all.
 *
 * @param inPdfDoc the document whose information is updated
 * @param inTitle the new title, or null/"" to keep the current one
 * @param inSubject the new subject, or null/"" to keep the current one
 * @param inAuthor the new author, or null/"" to keep the current one
 *
 */
public static void setInfos(PDDocument inPdfDoc, String inTitle, String inSubject, String inAuthor) {

    final boolean titleGiven = inTitle != null && !inTitle.isEmpty();
    final boolean subjectGiven = inSubject != null && !inSubject.isEmpty();
    final boolean authorGiven = inAuthor != null && !inAuthor.isEmpty();

    // Nothing to change: do not even fetch the document information.
    if (!titleGiven && !subjectGiven && !authorGiven) {
        return;
    }

    PDDocumentInformation information = inPdfDoc.getDocumentInformation();
    if (titleGiven) {
        information.setTitle(inTitle);
    }
    if (subjectGiven) {
        information.setSubject(inSubject);
    }
    if (authorGiven) {
        information.setAuthor(inAuthor);
    }
    inPdfDoc.setDocumentInformation(information);
}

From source file:org.nuxeo.pdf.test.PDFPageExtractorTest.java

License:Open Source License

/**
 * Extracts pages 5 to 9 with explicit title, subject and author, then checks
 * the resulting blob's type and file name and that the three values are
 * stored in the extracted PDF's document information.
 */
@Test
public void testExtractPages_WithSetInfo() throws Exception {

    final String baseName = pdfFileBlob.getFilename().replace(".pdf", "");
    final PDFPageExtractor extractor = new PDFPageExtractor(pdfFileBlob);

    final Blob result = extractor.extract(5, 9, null, "One Upon a Time", "Fairyland", "Cool Author");
    assertTrue(result instanceof FileBlob);
    assertEquals(baseName + "-5-9.pdf", result.getFilename());

    // Re-open the extracted PDF and verify its metadata.
    final PDDocument extractedPdf = PDDocument.load(result.getStream());
    utils.track(extractedPdf);
    final PDDocumentInformation extractedInfo = extractedPdf.getDocumentInformation();
    assertEquals("One Upon a Time", extractedInfo.getTitle());
    assertEquals("Fairyland", extractedInfo.getSubject());
    assertEquals("Cool Author", extractedInfo.getAuthor());
    extractedPdf.close();
    utils.untrack(extractedPdf);
}

From source file:org.nuxeo.pdf.test.PDFUtilsTest.java

License:Open Source License

/**
 * Checks PDFUtils.setInfos in two steps: a call with only null/"" values must
 * leave title, subject and author unchanged, and a call with real values must
 * store all three.
 */
@Test
public void test_setInfos() throws Exception {

    final PDDocument pdf = PDDocument.load(pdfFile);
    utils.track(pdf);

    // The test file is expected to start with a title only.
    final PDDocumentInformation before = pdf.getDocumentInformation();
    assertEquals("Untitled 3", before.getTitle());
    assertNull(before.getSubject());
    assertNull(before.getAuthor());

    // Step 1: null and "" arguments must not modify anything.
    PDFUtils.setInfos(pdf, null, "", null);
    PDDocumentInformation after = pdf.getDocumentInformation();
    assertEquals(before.getTitle(), after.getTitle());
    assertEquals(before.getSubject(), after.getSubject());
    assertEquals(before.getAuthor(), after.getAuthor());

    // Step 2: real values must be written.
    PDFUtils.setInfos(pdf, "The Title", "The Subject", "The Author");
    after = pdf.getDocumentInformation();
    assertEquals("The Title", after.getTitle());
    assertEquals("The Subject", after.getSubject());
    assertEquals("The Author", after.getAuthor());

    pdf.close();
    utils.untrack(pdf);
}

From source file:org.olat.search.service.document.file.pdf.PdfBoxExtractor.java

License:Apache License

/**
 * Returns the title stored in the document information.
 *
 * @param document the PDF document, may be null
 * @return the title, or null if the document or its document information is null
 */
private String getTitle(PDDocument document) {
    if (document == null) {
        return null;
    }
    if (document.getDocumentInformation() == null) {
        return null;
    }
    return document.getDocumentInformation().getTitle();
}