Example usage for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation

Introduction

This page collects example usages of the getDocumentInformation method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public PDDocumentInformation getDocumentInformation()

Source Link

Document

This will get the document info dictionary.

Usage

From source file:PDFConverter.java

License:Apache License

/**
 * Implementation is informed by PDFBox authors.
 *
 * @param doc/*from  www  .  j a va2 s .  c o m*/
 * @return
 * @throws IOException
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    /**
     * Adapted from LucenePDFDocument.java from PDFBox lucene project
     *
     * This class is used to create a document for the lucene search engine.
     * This should easily plug into the IndexHTML or IndexFiles that comes
     * with the lucene project. This class will populate the following
     * fields.
     * <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
     * <tr>
     * <td>path</td> <td>File system path if loaded from a file</td> </tr>
     * <tr>
     * <td>url</td> <td>URL to PDF document</td> </tr> <tr>
     * <td>contents</td>
     * <td>Entire contents of PDF document, indexed but not stored</td>
     * </tr>
     * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
     * <tr>
     * <td>modified</td> <td>The modified date/time according to the url or
     * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
     * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
     * meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
     * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
     * <td>From PDF meta-data if available</td> </tr> <tr>
     * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
     * </table>
     *
     * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
     * @version $Revision: 1.23 $
     *
     * @throws IOException If there is an error parsing the document.
     */
    PDDocument pdfDocument = null;
    ConvertedDocument textdoc = new ConvertedDocument(doc);

    try {
        pdfDocument = PDDocument.load(doc);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            // Even if the doc is encrypted, apparently you can try. Throw exception if it fails.
            textdoc.addProperty("encrypted", "YES");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdfDocument, writer);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            textdoc.addAuthor(info.getAuthor());
            try {
                textdoc.addCreateDate(info.getCreationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            textdoc.addProperty("creator_tool", info.getCreator());
            textdoc.addProperty("keywords", info.getKeywords());
            /* try {
             metadata.add("ModificationDate", info.getModificationDate());
             } catch (IOException io) {
             //ignore, bad date but continue with indexing
             } */
            //metadata.add("Producer", info.getProducer());
            textdoc.addProperty("subject", info.getSubject());
            String ttl = info.getTitle();
            if (ttl == null || "untitled".equalsIgnoreCase(ttl)) {
                ttl = textdoc.filename;
            }
            textdoc.addTitle(ttl);
            // metadata.add("Trapped", info.getTrapped());

            // TODO: Character set is what?
            textdoc.setEncoding("UTF-8");
        }

        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        textdoc.setText(writer.getBuffer().toString());

        return textdoc;

    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}

From source file:PDFExtractMetadata.java

License:Apache License

/**
 * This is the main method./*from   w ww.  j a v  a 2s .c o  m*/
 *
 * @param args The command line arguments.
 *
 * @throws IOException If there is an error parsing the document.
 * @throws XmpParsingException
 */
public static void main(String[] args) throws IOException, XmpParsingException {
    if (args.length != 1) {
        usage();
        System.exit(1);
    } else {
        PDDocument document = null;
        try {
            document = PDDocument.load(new File(args[0]));
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDMetadata meta = catalog.getMetadata();
            if (meta != null) {
                DomXmpParser xmpParser = new DomXmpParser();
                try {
                    XMPMetadata metadata = xmpParser.parse(meta.createInputStream());

                    DublinCoreSchema dc = metadata.getDublinCoreSchema();
                    if (dc != null) {
                        display("Title:", dc.getTitle());
                        display("Description:", dc.getDescription());
                        listString("Creators: ", dc.getCreators());
                        listCalendar("Dates:", dc.getDates());
                        listString("Subjects:", dc.getSubjects());
                    }

                    AdobePDFSchema pdf = metadata.getAdobePDFSchema();
                    if (pdf != null) {
                        display("Keywords:", pdf.getKeywords());
                        display("PDF Version:", pdf.getPDFVersion());
                        display("PDF Producer:", pdf.getProducer());
                    }

                    XMPBasicSchema basic = metadata.getXMPBasicSchema();
                    if (basic != null) {
                        display("Create Date:", basic.getCreateDate());
                        display("Modify Date:", basic.getModifyDate());
                        display("Creator Tool:", basic.getCreatorTool());
                    }
                } catch (XmpParsingException e) {
                    System.err.println("An error ouccred when parsing the meta data: " + e.getMessage());
                }
            } else {
                // The pdf doesn't contain any metadata, try to use the
                // document information instead
                PDDocumentInformation information = document.getDocumentInformation();
                if (information != null) {
                    showDocumentInformation(information);
                }
            }

        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

}

From source file:adams.flow.transformer.PDFMetaData.java

License:Open Source License

/**
 * Executes the flow item./*from   w w  w. j  a v  a 2 s . co m*/
 *
 * @return      null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
    String result;
    File file;
    SpreadSheet sheet;
    PDDocument document;
    PDDocumentInformation info;
    Row row;
    Set<String> keys;

    result = null;

    // get file
    if (m_InputToken.getPayload() instanceof File)
        file = (File) m_InputToken.getPayload();
    else
        file = new PlaceholderFile((String) m_InputToken.getPayload());

    sheet = new DefaultSpreadSheet();
    sheet.setDataRowClass(SparseDataRow.class);
    sheet.setName("Meta-Data: " + file.getAbsolutePath());

    try {
        row = sheet.addRow();
        document = PDDocument.load(file.getAbsoluteFile());
        info = document.getDocumentInformation();

        addCell(row, "Title", info.getTitle());
        addCell(row, "Subject", info.getSubject());
        addCell(row, "Author", info.getAuthor());
        addCell(row, "Keywords", info.getKeywords());
        addCell(row, "Producer", info.getProducer());
        addCell(row, "Creation Date", info.getCreationDate());
        addCell(row, "Modification Date", info.getModificationDate());
        addCell(row, "Creator", info.getCreator());
        addCell(row, "Trapped", info.getTrapped());
        keys = info.getMetadataKeys();
        for (String key : keys)
            addCell(row, "Meta-" + key, info.getCustomMetadataValue(key));
    } catch (Exception e) {
        result = handleException("Failed to extract meta-data: ", e);
    }

    if (result == null)
        m_OutputToken = new Token(sheet);

    return result;
}

From source file:au.org.alfred.icu.pdf.services.factories.tests.PDFBoxPrintingFactoryTest.java

/**
 * Verifies that {@code ICUDischargeSummaryFactory.addDocumentInformation}
 * stores the given author in the document information of the PDF.
 *
 * @throws IOException if closing the in-memory document fails
 */
@Test
public void canSetDocumentInformation() throws IOException {
    PDDocument pdf = new PDDocument();
    try {
        String title = "PMIID";
        String subject = "VISIT_NUMBER";
        String keywords = "VISIT_NUMBER";
        String creator = "USER_NAME";
        String author = "USER_NAME";

        ICUDischargeSummaryFactory.addDocumentInformation(pdf, title, subject, keywords, creator, author);

        // Compare from the known non-null side so a missing author fails the
        // assertion instead of raising a NullPointerException.
        assertTrue(author.equals(pdf.getDocumentInformation().getAuthor()));
    } finally {
        // The document was previously never closed.
        pdf.close();
    }

}

From source file:com.fangxin365.core.utils.PDFMerger.java

License:Apache License

/**
 * Append all pages from source to destination, together with the
 * document-level structures handled below: document information, open
 * action, shared page resources, AcroForm, article threads, names, outline,
 * page mode, page labels and XMP metadata.
 *
 * @param destination
 *            the document to receive the pages
 * @param source
 *            the document originating the new pages
 *
 * @throws IOException
 *             If there is an error accessing data from either document, or
 *             if merging the AcroForms fails while
 *             {@code ignoreAcroFormErrors} is false.
 */
public void appendDocument(PDDocument destination, PDDocument source) throws IOException {
    // Encrypted input is only reported on stdout; processing continues.
    if (destination.isEncrypted()) {
        System.out.println("Error: destination PDF is encrypted, can't append encrypted PDF documents.");
    }
    if (source.isEncrypted()) {
        System.out.println("Error: source PDF is encrypted, can't append encrypted PDF documents.");
    }
    PDDocumentInformation destInfo = destination.getDocumentInformation();
    PDDocumentInformation srcInfo = source.getDocumentInformation();
    destInfo.getDictionary().mergeInto(srcInfo.getDictionary());

    PDDocumentCatalog destCatalog = destination.getDocumentCatalog();
    PDDocumentCatalog srcCatalog = source.getDocumentCatalog();

    // use the highest version number for the resulting pdf
    float destVersion = destination.getDocument().getVersion();
    float srcVersion = source.getDocument().getVersion();

    if (destVersion < srcVersion) {
        destination.getDocument().setVersion(srcVersion);
    }

    // an existing open action of the destination wins
    if (destCatalog.getOpenAction() == null) {
        destCatalog.setOpenAction(srcCatalog.getOpenAction());
    }

    // maybe there are some shared resources for all pages
    COSDictionary srcPages = (COSDictionary) srcCatalog.getCOSDictionary().getDictionaryObject(COSName.PAGES);
    COSDictionary srcResources = (COSDictionary) srcPages.getDictionaryObject(COSName.RESOURCES);
    COSDictionary destPages = (COSDictionary) destCatalog.getCOSDictionary().getDictionaryObject(COSName.PAGES);
    COSDictionary destResources = (COSDictionary) destPages.getDictionaryObject(COSName.RESOURCES);
    if (srcResources != null) {
        if (destResources != null) {
            destResources.mergeInto(srcResources);
        } else {
            destPages.setItem(COSName.RESOURCES, srcResources);
        }
    }

    PDFCloneUtility cloner = new PDFCloneUtility(destination);

    try {
        PDAcroForm destAcroForm = destCatalog.getAcroForm();
        PDAcroForm srcAcroForm = srcCatalog.getAcroForm();
        if (destAcroForm == null) {
            // NOTE(review): the result of cloneForNewDocument is discarded and
            // the source form object itself is attached to the destination -
            // confirm whether the clone should be used instead.
            cloner.cloneForNewDocument(srcAcroForm);
            destCatalog.setAcroForm(srcAcroForm);
        } else {
            if (srcAcroForm != null) {
                mergeAcroForm(cloner, destAcroForm, srcAcroForm);
            }
        }
    } catch (Exception e) {
        // if we are not ignoring exceptions, we'll re-throw this
        if (!ignoreAcroFormErrors) {
            // A blind cast to IOException would turn any other exception type
            // into a ClassCastException and lose the real cause.
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            throw new IOException(e);
        }
    }

    // Article threads: clone the SOURCE threads (the destination's own entry
    // was read here before, so source threads were never merged).
    COSArray destThreads = (COSArray) destCatalog.getCOSDictionary().getDictionaryObject(COSName.THREADS);
    COSArray srcThreads = (COSArray) cloner
            .cloneForNewDocument(srcCatalog.getCOSDictionary().getDictionaryObject(COSName.THREADS));
    if (srcThreads != null) {
        if (destThreads == null) {
            destCatalog.getCOSDictionary().setItem(COSName.THREADS, srcThreads);
        } else {
            destThreads.addAll(srcThreads);
        }
    }

    PDDocumentNameDictionary destNames = destCatalog.getNames();
    PDDocumentNameDictionary srcNames = srcCatalog.getNames();
    if (srcNames != null) {
        if (destNames == null) {
            destCatalog.getCOSDictionary().setItem(COSName.NAMES, cloner.cloneForNewDocument(srcNames));
        } else {
            cloner.cloneMerge(srcNames, destNames);
        }

    }

    PDDocumentOutline destOutline = destCatalog.getDocumentOutline();
    PDDocumentOutline srcOutline = srcCatalog.getDocumentOutline();
    if (srcOutline != null) {
        if (destOutline == null) {
            PDDocumentOutline cloned = new PDDocumentOutline(
                    (COSDictionary) cloner.cloneForNewDocument(srcOutline));
            destCatalog.setDocumentOutline(cloned);
        } else {
            // append the cloned first child of the source outline to the
            // existing destination outline
            PDOutlineItem first = srcOutline.getFirstChild();
            if (first != null) {
                PDOutlineItem clonedFirst = new PDOutlineItem(
                        (COSDictionary) cloner.cloneForNewDocument(first));
                destOutline.appendChild(clonedFirst);
            }
        }
    }

    // an existing page mode of the destination wins
    String destPageMode = destCatalog.getPageMode();
    String srcPageMode = srcCatalog.getPageMode();
    if (destPageMode == null) {
        destCatalog.setPageMode(srcPageMode);
    }

    COSDictionary destLabels = (COSDictionary) destCatalog.getCOSDictionary()
            .getDictionaryObject(COSName.PAGE_LABELS);
    COSDictionary srcLabels = (COSDictionary) srcCatalog.getCOSDictionary()
            .getDictionaryObject(COSName.PAGE_LABELS);
    if (srcLabels != null) {
        // read before any page is appended: source label indices are shifted
        // by the number of pages already in the destination
        int destPageCount = destination.getNumberOfPages();
        COSArray destNums = null;
        if (destLabels == null) {
            destLabels = new COSDictionary();
            destNums = new COSArray();
            destLabels.setItem(COSName.NUMS, destNums);
            destCatalog.getCOSDictionary().setItem(COSName.PAGE_LABELS, destLabels);
        } else {
            destNums = (COSArray) destLabels.getDictionaryObject(COSName.NUMS);
        }
        COSArray srcNums = (COSArray) srcLabels.getDictionaryObject(COSName.NUMS);
        if (srcNums != null) {
            // the Nums array is walked in (index, label) pairs
            for (int i = 0; i < srcNums.size(); i += 2) {
                COSNumber labelIndex = (COSNumber) srcNums.getObject(i);
                long labelIndexValue = labelIndex.intValue();
                destNums.add(COSInteger.get(labelIndexValue + destPageCount));
                destNums.add(cloner.cloneForNewDocument(srcNums.getObject(i + 1)));
            }
        }
    }

    // copy the source XMP metadata only when the destination has none
    COSStream destMetadata = (COSStream) destCatalog.getCOSDictionary().getDictionaryObject(COSName.METADATA);
    COSStream srcMetadata = (COSStream) srcCatalog.getCOSDictionary().getDictionaryObject(COSName.METADATA);
    if (destMetadata == null && srcMetadata != null) {
        PDStream newStream = new PDStream(destination, srcMetadata.getUnfilteredStream(), false);
        newStream.getStream().mergeInto(srcMetadata);
        newStream.addCompression();
        destCatalog.getCOSDictionary().setItem(COSName.METADATA, newStream);
    }

    // finally append the pages
    @SuppressWarnings("unchecked")
    List<PDPage> pages = srcCatalog.getAllPages();
    Iterator<PDPage> pageIter = pages.iterator();
    while (pageIter.hasNext()) {
        PDPage page = pageIter.next();
        PDPage newPage = new PDPage((COSDictionary) cloner.cloneForNewDocument(page.getCOSDictionary()));
        // set crop box, media box and rotation explicitly on the clone from
        // the values the source page resolves via its find* methods
        newPage.setCropBox(page.findCropBox());
        newPage.setMediaBox(page.findMediaBox());
        newPage.setRotation(page.findRotation());
        destination.addPage(newPage);
    }
}

From source file:com.formkiq.core.service.generator.pdfbox.PdfEditorServiceImpl.java

License:Apache License

/**
 * Build {@link FormJSON} from {@link PDDocument}.
 * The form title is the document title; when that is empty, the texts that
 * share the largest (integer-truncated) font size are concatenated instead,
 * and "Untitled" is used when no title can be derived.
 * @param doc {@link PDDocument}
 * @param texts {@link List} of {@link PdfTextField}
 * @return {@link FormJSON}
 */
private FormJSON buildFormJSON(final PDDocument doc, final List<PdfTextField> texts) {

    String title = doc.getDocumentInformation().getTitle();

    // With no text fields there is no maximum font size; skipping the
    // derivation avoids a NoSuchElementException from Optional.get().
    if (isEmpty(title) && !texts.isEmpty()) {

        float maxFont = texts.stream().map(s -> Float.valueOf(s.getFontSize())).max(Float::compare).get()
                .floatValue();

        StringBuilder sb = new StringBuilder();

        for (PdfTextField text : texts) {

            // font sizes are compared after truncation to whole numbers
            if ((int) text.getFontSize() == (int) maxFont) {
                sb.append(text.getText() + " ");
            }
        }

        title = sb.toString();
    }

    title = isEmpty(title) ? "Untitled" : title.trim();

    return ObjectBuilder.buildFormJSON(title);
}

From source file:com.github.joemcintyre.pdffinish.PDFFinish.java

License:Open Source License

/**
 * Show metadata from PDF document./*from   w  ww. j a  v  a 2s  .com*/
 * 
 * @param document Loaded PDF document.
 */
private static void showMetadata(PDDocument document) throws IOException {
    PDDocumentInformation info = document.getDocumentInformation();
    System.out.println("Title: " + info.getTitle());
    System.out.println("Author: " + info.getAuthor());
    System.out.println("Subject: " + info.getSubject());
    System.out.println("Keywords: " + info.getKeywords());
    System.out.println("Creator: " + info.getCreator());
    System.out.println("Producer: " + info.getProducer());
    System.out.println("Creation Date: " + info.getCreationDate());
    System.out.println("Modification Date: " + info.getModificationDate());
}

From source file:com.github.joemcintyre.pdffinish.PDFFinish.java

License:Open Source License

/**
 * Writes the configured title, author, subject and keywords into the document
 * information dictionary. A value that is {@code null} leaves the
 * corresponding entry of the document untouched.
 *
 * @param document Loaded PDF document.
 * @throws IOException declared by this method's signature
 */
private void updateMetadata(PDDocument document) throws IOException {
    final PDDocumentInformation docInfo = document.getDocumentInformation();

    if (null != title) {
        docInfo.setTitle(title);
    }

    if (null != author) {
        docInfo.setAuthor(author);
    }

    if (null != subject) {
        docInfo.setSubject(subject);
    }

    if (null != keywords) {
        docInfo.setKeywords(keywords);
    }
}

From source file:com.jaeksoft.searchlib.parser.PdfParser.java

License:Open Source License

/**
 * Copies PDF metadata into the parser result: the document-information fields
 * (when present), the number of pages and, if a catalog exists, the document
 * language.
 *
 * @param result the result item receiving the fields
 * @param pdf    the loaded PDF document
 * @throws IOException declared by this method's signature
 */
private void extractMetaData(ParserResultItem result, PDDocument pdf) throws IOException {
    final PDDocumentInformation docInfo = pdf.getDocumentInformation();
    if (docInfo != null) {
        result.addField(ParserFieldEnum.title, docInfo.getTitle());
        result.addField(ParserFieldEnum.subject, docInfo.getSubject());
        result.addField(ParserFieldEnum.author, docInfo.getAuthor());
        result.addField(ParserFieldEnum.producer, docInfo.getProducer());
        result.addField(ParserFieldEnum.keywords, docInfo.getKeywords());

        // Date fields are only added when getDate yields a value.
        final String created = getDate(getCreationDate(docInfo));
        if (created != null) {
            result.addField(ParserFieldEnum.creation_date, created);
        }

        final String modified = getDate(getModificationDate(docInfo));
        if (modified != null) {
            result.addField(ParserFieldEnum.modification_date, modified);
        }
    }

    final int pageCount = pdf.getNumberOfPages();
    result.addField(ParserFieldEnum.number_of_pages, pageCount);

    final PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
    if (docCatalog != null) {
        result.addField(ParserFieldEnum.language, docCatalog.getLanguage());
    }
}

From source file:com.openkm.util.metadata.MetadataExtractor.java

License:Open Source License

/**
 * Extract metadata from PDF: the page count and the entries of the document
 * information dictionary.
 *
 * @param is stream providing the PDF content; it is not closed by this method
 * @return the metadata read from the document
 * @throws IOException if the PDF cannot be loaded, read or closed
 */
public static PdfMetadata pdfExtractor(InputStream is) throws IOException {
    PDDocument doc = PDDocument.load(is);
    try {
        PDDocumentInformation info = doc.getDocumentInformation();
        PdfMetadata md = new PdfMetadata();

        md.setNumberOfPages(doc.getNumberOfPages());
        md.setTitle(info.getTitle());
        md.setAuthor(info.getAuthor());
        md.setSubject(info.getSubject());
        md.setKeywords(info.getKeywords());
        md.setCreator(info.getCreator());
        md.setProducer(info.getProducer());
        md.setTrapped(info.getTrapped());
        md.setCreationDate(info.getCreationDate());
        md.setModificationDate(info.getModificationDate());

        log.info("pdfExtractor: {}", md);
        return md;
    } finally {
        // The loaded document was previously never closed.
        doc.close();
    }
}