Example usage for org.apache.pdfbox.cos COSDictionary getNameAsString

Introduction

In this page you can find the example usage for org.apache.pdfbox.cos COSDictionary getNameAsString.

Prototype

public String getNameAsString(COSName key)

Source Link

Document

This is a convenience method that will get the dictionary object that is expected to be a name and convert it to a string.

Usage

From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    addMetadata(metadata, Metadata.TITLE, info.getTitle());
    addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
    addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "pdf:creator", info.getCreator());
    addMetadata(metadata, "pdf:producer", info.getProducer());
    addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped" });
    if (info.getCOSObject() != null && info.getCOSObject().keySet() != null) {
        for (COSName key : info.getCOSObject().keySet()) {
            String name = key.getName();
            if (!handledMetadata.contains(name)) {
                addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            }/*  w w w  .ja v a  2 s .co m*/
        }
    }
    // ANJ Extensions:
    //
    //
    // Add other data of interest:
    metadata.set("pdf:version", "" + document.getDocument().getVersion());
    metadata.set("pdf:numPages", "" + document.getNumberOfPages());
    //metadata.set("pdf:cryptoMode", ""+getCryptoModeAsString(reader));
    //metadata.set("pdf:openedWithFullPermissions", ""+reader.isOpenedWithFullPermissions());
    metadata.set("pdf:encrypted", "" + document.isEncrypted());
    //metadata.set("pdf:metadataEncrypted", ""+document.isMetadataEncrypted());
    //metadata.set("pdf:128key", ""+reader.is128Key());
    //metadata.set("pdf:tampered", ""+reader.isTampered());
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            XMPMetadata xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
            // There is a special class for grabbing data in the PDF schema - not sure it will add much here:
            // Could parse xmp:CreatorTool and pdf:Producer etc. etc. out of here.
            XMPSchemaPDF pdfxmp = xmp.getPDFSchema();
            // Added a PDF/A schema class:
            xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class);
            XMPSchemaPDFA pdfaxmp = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class);
            if (pdfaxmp != null) {
                metadata.set("pdfaid:part", pdfaxmp.getPart());
                metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase();
                //metadata.set("pdfa:version", version );                    
                metadata.set("pdf:version", version);
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        log.error("XMP Parsing failed: " + e);
        metadata.set("pdf:metadata-xmp-parse-failed", "" + e);
    }

    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    metadata.set("pdf:version", baseVersion + " Adobe Extension Level " + el);
                }
                // TODO WARN if this embedded version is inconsistent with document header version?
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
    // End Of ANJ Extensions.
}