List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#getDocumentCatalog()
public PDDocumentCatalog getDocumentCatalog()
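For orientation, here is a minimal sketch of calling getDocumentCatalog() on a loaded document. It assumes the PDFBox 2.x API; the class name and the "example.pdf" path are placeholders for illustration only and are not part of the Apache Tika sources listed below.

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;

public class CatalogExample {
    public static void main(String[] args) throws IOException {
        // "example.pdf" is a placeholder path.
        try (PDDocument document = PDDocument.load(new File("example.pdf"))) {
            // The catalog is the root of the document's object hierarchy;
            // from it you can reach the AcroForm, names dictionary, metadata, etc.
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDAcroForm form = catalog.getAcroForm();
            System.out.println("Has AcroForm: " + (form != null));
        }
    }
}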
From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java
License:Apache License
void extractAcroForm(PDDocument pdf) throws IOException, SAXException, TikaException {
    //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
    //this code derives from Ben's code
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null)
        return;

    PDAcroForm form = catalog.getAcroForm();
    if (form == null)
        return;

    //if it has xfa, try that.
    //if it doesn't exist or there's an exception,
    //go with traditional AcroForm
    PDXFAResource pdxfa = form.getXFA();

    if (pdxfa != null) {
        //if successful, return
        XFAExtractor xfaExtractor = new XFAExtractor();
        InputStream is = null;
        try {
            is = new BufferedInputStream(new ByteArrayInputStream(pdxfa.getBytes()));
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
        if (is != null) {
            try {
                xfaExtractor.extract(is, xhtml, metadata, context);
                return;
            } catch (XMLStreamException e) {
                //if there was an xml parse exception in xfa, try the AcroForm
                EmbeddedDocumentUtil.recordException(e, metadata);
            } finally {
                IOUtils.closeQuietly(is);
            }
        }
    }

    @SuppressWarnings("rawtypes")
    List fields = form.getFields();

    if (fields == null)
        return;

    @SuppressWarnings("rawtypes")
    ListIterator itr = fields.listIterator();

    if (itr == null)
        return;

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");
    while (itr.hasNext()) {
        Object obj = itr.next();
        if (obj != null && obj instanceof PDField) {
            processAcroField((PDField) obj, 0);
        }
    }
    xhtml.endElement("ol");
    xhtml.endElement("div");
}
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
        throws IOException, SAXException, TikaException {
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    PDDocumentNameDictionary names = catalog.getNames();
    if (names == null) {
        return;
    }
    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
    if (embeddedFiles == null) {
        return;
    }

    Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
    //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
    //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
    //If there is a need we could add a fully recursive search to find a non-null
    //Map<String, COSObjectable> that contains the doc info.
    if (embeddedFileNames != null) {
        processEmbeddedDocNames(embeddedFileNames);
    } else {
        List<PDNameTreeNode> kids = embeddedFiles.getKids();
        if (kids == null) {
            return;
        }
        for (PDNameTreeNode n : kids) {
            Map<String, COSObjectable> childNames = n.getNames();
            if (childNames != null) {
                processEmbeddedDocNames(childNames);
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java
License:Apache License
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, SAXException {
    //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
    //this code derives from Ben's code
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null)
        return;

    PDAcroForm form = catalog.getAcroForm();
    if (form == null)
        return;

    @SuppressWarnings("rawtypes")
    List fields = form.getFields();

    if (fields == null)
        return;

    @SuppressWarnings("rawtypes")
    ListIterator itr = fields.listIterator();

    if (itr == null)
        return;

    handler.startElement("div", "class", "acroform");
    handler.startElement("ol");
    while (itr.hasNext()) {
        Object obj = itr.next();
        if (obj != null && obj instanceof PDField) {
            processAcroField((PDField) obj, handler, 0);
        }
    }
    handler.endElement("ol");
    handler.endElement("div");
}
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
private String extractXFAText(PDDocument pdfDocument) throws IOException {
    PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
    String xfaXml = null;
    if (catalog != null) {
        PDAcroForm acroForm = catalog.getAcroForm();
        if (acroForm != null) {
            PDXFAResource xfa = acroForm.getXFA();
            if (xfa != null) {
                //TODO consider streaming and writing as we read along
                //to preserve memory.
                //See and replicate how xfa getBytes() does it.
                xfaXml = new String(xfa.getBytes());
            }
        }
    }

    // No XFA, do nothing
    if (xfaXml == null) {
        return null;
    }

    // Extract text from XFA
    StringBuilder b = new StringBuilder();
    Matcher m = PATTERN_TEXT.matcher(xfaXml);
    while (m.find()) {
        String tag = getMatchGroup(m, 1);
        boolean isText = "text".equals(tag);
        String attribs = getMatchGroup(m, 2);
        String value = getMatchGroup(m, 3);

        // Reject href text
        if (isText && attribs.contains("name=\"embeddedHref\"")) {
            continue;
        }

        // Get text from free-form exData
        if ("exData".equals(tag)) {
            if (attribs.contains("contentType=\"application/xml\"")
                    || attribs.contains("contentType=\"text/html\"")
                    || attribs.contains("contentType=\"text/xml\"")) {
                value = PATTERN_STRIP_MARKUP.matcher(value).replaceAll(" ");
            }
        }

        b.append(value);
        b.append("\n");
    }
    return b.toString();
}
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
@SuppressWarnings("deprecation") private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { XMPMetadata xmp = null;//from w w w . java 2 s . co m XMPSchemaDublinCore dcSchema = null; try { if (document.getDocumentCatalog().getMetadata() != null) { xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata()); } if (xmp != null) { dcSchema = xmp.getDublinCoreSchema(); } } catch (IOException e) { //swallow } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDF2XHTML.java
License:Apache License
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, SAXException {
    //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
    //this code derives from Ben's code
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null)
        return;

    PDAcroForm form = catalog.getAcroForm();
    if (form == null)
        return;

    //if it has xfa, try that.
    //if it doesn't exist or there's an exception,
    //go with traditional AcroForm
    PDXFA pdxfa = form.getXFA();

    if (pdxfa != null) {
        XFAExtractor xfaExtractor = new XFAExtractor();
        try {
            xfaExtractor.extract(new BufferedInputStream(new ByteArrayInputStream(pdxfa.getBytes())),
                    handler, metadata);
            return;
        } catch (XMLStreamException | IOException e) {
            //if there was an xml parse exception in xfa, try the AcroForm
        }
    }

    @SuppressWarnings("rawtypes")
    List fields = form.getFields();

    if (fields == null)
        return;

    @SuppressWarnings("rawtypes")
    ListIterator itr = fields.listIterator();

    if (itr == null)
        return;

    handler.startElement("div", "class", "acroform");
    handler.startElement("ol");
    while (itr.hasNext()) {
        Object obj = itr.next();
        if (obj != null && obj instanceof PDField) {
            processAcroField((PDField) obj, handler, 0);
        }
    }
    handler.endElement("ol");
    handler.endElement("div");
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }

    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }

        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
    if (config.getIfXFAExtractOnlyXFA() && pdDocument.getDocumentCatalog() != null
            && pdDocument.getDocumentCatalog().getAcroForm() != null
            && pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
        return true;
    }
    return false;
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata)
        throws SAXException, IOException, TikaException {
    XFAExtractor ex = new XFAExtractor();
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        ex.extract(new ByteArrayInputStream(pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes()),
                xhtml, metadata);
    } catch (XMLStreamException e) {
        throw new TikaException("XML error in XFA", e);
    }
    xhtml.endDocument();
}
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);

    XMPMetadata xmp = null;
    if (dom != null) {
        xmp = new XMPMetadata(dom);
    }
    XMPSchemaDublinCore dcSchema = null;
    /*if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {}

        JempboxExtractor.extractXMPMM(xmp, metadata);
    }*/

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getCOSObject().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                    info.getCOSObject().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set(PDF.PDFA_VERSION, version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set(PDF.PDF_EXTENSION_VERSION,
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}