Example usage for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation

List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel PDDocument getDocumentInformation.

Prototype

public PDDocumentInformation getDocumentInformation() 

Source Link

Document

This will get the document info dictionary.

Usage

From source file:org.apache.camel.component.fop.FopHelper.java

License:Apache License

public static String getDocumentMetadataValue(PDDocument document, COSName name) {
    PDDocumentInformation info = document.getDocumentInformation();
    return info.getDictionary().getString(name);
}

From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java

License:Apache License

@SuppressWarnings("deprecation")
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {

    XMPMetadata xmp = null;/*w ww . j  a v a  2 s . c  o m*/
    XMPSchemaDublinCore dcSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
        }
        if (xmp != null) {
            dcSchema = xmp.getDublinCoreSchema();
        }
    } catch (IOException e) {
        //swallow
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?          
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.PDFParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {//from   w  ww  .  j  a  v a  2 s. co m
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }

    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }

        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?          
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
        throws TikaException {

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);

    XMPMetadata xmp = null;// www .  j a  v  a 2 s. c o m
    if (dom != null) {
        xmp = new XMPMetadata(dom);
    }
    XMPSchemaDublinCore dcSchema = null;

    /*if (xmp != null) {
    try {
        dcSchema = xmp.getDublinCoreSchema();
    } catch (IOException e) {}
            
    JempboxExtractor.extractXMPMM(xmp, metadata);
    }*/

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getCOSObject().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                    info.getCOSObject().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set(PDF.PDFA_VERSION, version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?          
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf18.PDFParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {// w  ww  .j ava  2 s .c o m
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }

    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }

        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}

From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java

License:Apache License

private void extractMetadata(final PDDocument document, final ExtractData extractData) {
    final PDDocumentInformation info = document.getDocumentInformation();
    if (info == null) {
        return;//from www.  j av a2s.c  o  m
    }

    for (final String key : info.getMetadataKeys()) {
        final String value = info.getCustomMetadataValue(key);
        addMetadata(extractData, key, value);
    }
}

From source file:org.crossref.pdfmark.Main.java

License:Open Source License

public static void writeInfoDictionary(FileInputStream in, String outputFile, byte[] xmp)
        throws IOException, COSVisitorException {

    PDFParser parser = new PDFParser(in);
    parser.parse();//from  w w  w  .  jav  a 2s . com

    PDDocument document = parser.getPDDocument();
    PDDocumentInformation info = document.getDocumentInformation();

    for (Entry<String, String> entry : XmpUtils.toInfo(xmp).entrySet()) {
        info.setCustomMetadataValue(entry.getKey(), entry.getValue());
    }

    document.setDocumentInformation(info);
    document.save(outputFile);
    document.close();
}

From source file:org.deepfs.fsml.xdcr.PDFTransducer.java

@Override
public String read(final String path) {
    PDDocument pd = null;
    final StringWriter sw = new StringWriter();
    final StringBuilder sb = new StringBuilder(128);
    try {/*from w w  w .  ja  v a  2 s  .c o  m*/
        pd = PDDocument.load(path);
        PDDocumentInformation info = pd.getDocumentInformation();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setEndPage(NO_PAGES);
        stripper.writeText(pd, sw);
        sb.append(keyValue(NS + "title", info.getTitle()));
        sb.append(keyValue(NS + "subject", info.getSubject()));
        sb.append(keyValue(NS + "creator", info.getCreator()));
        sb.append(keyValue(NS + "author", info.getAuthor()));
        sb.append(keyValue(NS + "producer", info.getProducer()));
        sb.append(keyValue(NS + "date",
                info.getCreationDate() != null ? new SimpleDateFormat().format(info.getCreationDate().getTime())
                        : null));
        sb.append(keyValue(NS + "content", sw.getBuffer().toString()));

        sb.append(keyValue(NS + "keywords", info.getKeywords()));
    } catch (IOException e) {
        e.printStackTrace();
    } catch (IndexOutOfBoundsException oe) {
        // [MS] thanks apache pdfbox :-)
        System.err.println(oe.getMessage());
    }
    return sb.toString();
}

From source file:org.dspace.content.packager.PDFPackager.java

License:BSD License

private void crosswalkPDF(Context context, Item item, InputStream metadata)
        throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;//from  www.j a  va2s.  com

    try {
        PDFParser parser = new PDFParser(metadata);
        parser.parse();
        cos = parser.getDocument();

        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         *   Title -> title.null
         *   Author -> contributor.author
         *   CreationDate -> date.created
         *   ModDate -> date.created
         *   Creator -> description.provenance (application that created orig)
         *   Producer -> description.provenance (convertor to pdf)
         *   Subject -> description.abstract
         *   Keywords -> subject.other
         *    date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();

        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException(
                    "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);
        String value = docinfo.getAuthor();
        if (value != null) {
            item.addDC("contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }

        value = docinfo.getCreator();
        if (value != null) {
            item.addDC("description", "provenance", "en",
                    "Application that created the original document: " + value);
        }

        value = docinfo.getProducer();
        if (value != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value);
        }

        value = docinfo.getSubject();
        if (value != null) {
            item.addDC("description", "abstract", null, value);
        }

        value = docinfo.getKeywords();
        if (value != null) {
            item.addDC("subject", "other", null, value);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }

        if (calValue != null) {
            item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        item.update();
    } finally {
        if (cos != null) {
            cos.close();
        }
    }
}

From source file:org.encuestame.business.search.DataFile.java

License:Apache License

/**
 * Extract Metadata in PDF Documents./*from  w  ww .j a  v a2 s  . c o m*/
 * @param pdDoc
 * @return
 */
public static AttachmentIndex extractMetadataPDFDocument(final PDDocument pdDoc) {
    PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
    author = docInfo.getAuthor();
    title = docInfo.getTitle();
    producer = docInfo.getProducer();
    subject = docInfo.getSubject();
    AttachmentIndex attachmentMetadata = IndexerFile.addMetadatatoBean(author, title, producer, subject);
    return attachmentMetadata;
}