Example usage for org.apache.pdfbox.cos COSDictionary getInt

Introduction

On this page you can find example usages of the org.apache.pdfbox.cos.COSDictionary.getInt method.

Prototype

public int getInt(COSName key)

Source Link

Document

This is a convenience method that will get the dictionary object that is expected to be an int.

Usage

From source file:uk.ac.liverpool.thumbnails.PDFService.java

License:Open Source License

/**
 * Lists the fonts declared in a PDF document, skipping the standard 14 fonts.
 *
 * <p>Every COS object of type {@code Font} in the document is inspected, so fonts that are
 * declared but never used on a page are included as well. For each font the subtype, base
 * name (with any subset tag stripped), encoding, embedding/subset status and selected font
 * descriptor entries are recorded. Finally, an entry is dropped when the entry sorted
 * immediately before it has the same font name and a {@code CIDFontType*} subtype.
 *
 * @param u   location of the document, passed through to {@code getPages}
 * @param fff local file for the document, passed through to {@code getPages}
 * @return the collected fonts, in the natural ordering of {@code FontInformation}
 * @throws MalformedURLException declared for callers; presumably raised by {@code getPages}
 * @throws IOException if the document cannot be loaded or read
 */
@Override
public FontInformation[] extractFontList(URI u, File fff) throws MalformedURLException, IOException {

    SortedSet<FontInformation> ret = new TreeSet<FontInformation>();
    // NOTE(review): the document is not closed here; presumably getPages owns/caches it — confirm.
    PDDocument document = getPages(u, fff);

    for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) {
        if (c == null) {
            continue;
        }
        COSBase fontObject = c.getObject();
        if (!(fontObject instanceof COSDictionary)) {
            continue;
        }
        COSDictionary fontDictionary = (COSDictionary) fontObject;

        // Type must be Font; a missing /Type entry is skipped instead of throwing.
        if (!"Font".equals(fontDictionary.getNameAsString(COSName.TYPE))) {
            continue;
        }

        FontInformation fi = new FontInformation();
        fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE);

        String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT);
        if (baseFont == null) {
            continue;
        }
        // assumes standard14 is sorted, as binarySearch requires — TODO confirm
        if (Arrays.binarySearch(standard14, baseFont) >= 0) {
            continue;
        }

        // A malformed file may store something other than a dictionary here.
        COSBase descriptorObject = fontDictionary.getDictionaryObject(COSName.FONT_DESC);
        COSDictionary fontDescriptor = descriptorObject instanceof COSDictionary
                ? (COSDictionary) descriptorObject
                : null;

        COSBase enc = fontDictionary.getItem(COSName.ENCODING);
        COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE);

        // NOTE(review): an /Encoding given as a name (not a string) is reported as "table" — confirm intent.
        String encoding;
        if (enc == null) {
            encoding = "standard14";
        } else if (enc instanceof COSString) {
            encoding = ((COSString) enc).getString();
        } else {
            encoding = "table";
        }

        // Type1 and TrueType subsets prefix the base name with a tag, see 5.5.3 of PDF Reference 1.6.
        fi.isSubset = hasSubsetTag(baseFont);
        if (fi.isSubset) {
            baseFont = baseFont.substring(7);
        }

        fi.fontFlags = 0;
        // Compare name strings; a String never equals a COSName instance.
        if (COSName.TYPE0.getName().equals(fi.fontType) || COSName.TYPE3.getName().equals(fi.fontType)) {
            fi.isEmbedded = true;
        }

        if (fontDescriptor != null) {
            // In Type1 fonts a CharSet entry indicates the font is subsetted.
            if (fontDescriptor.getItem(COSName.CHAR_SET) != null) {
                fi.isSubset = true;
            }
            if (fontDescriptor.getItem(COSName.FONT_FILE) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE3) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE2) != null) {
                fi.isEmbedded = true;
            }
            fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags"));
            fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY);
            fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH);
        }
        fi.charset = encoding;
        fi.fontName = baseFont;
        fi.isToUnicode = uni != null;

        ret.add(fi);
    } // for all fonts

    // Drop an entry when its sorted predecessor is a CIDFontType* font with the same name.
    // The predecessor is tracked even if it was itself removed, matching the original pass.
    FontInformation prev = null;
    for (Iterator<FontInformation> it = ret.iterator(); it.hasNext();) {
        FontInformation current = it.next();
        if (prev != null && prev.fontType != null && prev.fontName.equals(current.fontName)
                && prev.fontType.startsWith("CIDFontType")) {
            it.remove();
        }
        prev = current;
    }

    return ret.toArray(new FontInformation[0]);
}

/** Returns true when the name starts with six upper-case characters followed by '+'. */
private static boolean hasSubsetTag(String baseFont) {
    if (baseFont.length() <= 6 || baseFont.charAt(6) != '+') {
        return false;
    }
    for (int k = 0; k < 6; k++) {
        if (!Character.isUpperCase(baseFont.charAt(k))) {
            return false;
        }
    }
    return true;
}

From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser.java

License:Apache License

/**
 * Copies document-level information from a PDF into the given metadata object.
 *
 * <p>Three sources are read in order: the document information dictionary (standard entries
 * first, then every remaining custom entry under its own key), the XMP packet (PDF/A part and
 * conformance, when present), and the catalog's {@code Extensions} dictionary (Adobe extension
 * level). The {@code pdf:version} key is first set from the document header and is overwritten
 * by the PDF/A identification and then by the Adobe extension level when those are found.
 *
 * @param document the parsed PDF to read from
 * @param metadata the metadata object that receives the extracted values
 * @throws TikaException declared for callers; nothing in this method throws it directly
 */
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    addMetadata(metadata, Metadata.TITLE, info.getTitle());
    addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
    addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "pdf:creator", info.getCreator());
    addMetadata(metadata, "pdf:producer", info.getProducer());
    addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    Calendar created = info.getCreationDate();
    addMetadata(metadata, "created", created);
    addMetadata(metadata, Metadata.CREATION_DATE, created);
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);

    // All remaining metadata is custom; copy it over as-is.
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    COSDictionary infoDictionary = info.getCOSObject();
    if (infoDictionary != null && infoDictionary.keySet() != null) {
        for (COSName key : infoDictionary.keySet()) {
            String name = key.getName();
            if (!handledMetadata.contains(name)) {
                addMetadata(metadata, name, infoDictionary.getDictionaryObject(key));
            }
        }
    }

    // ANJ Extensions:
    // Add other data of interest:
    metadata.set("pdf:version", "" + document.getDocument().getVersion());
    metadata.set("pdf:numPages", "" + document.getNumberOfPages());
    //metadata.set("pdf:cryptoMode", ""+getCryptoModeAsString(reader));
    //metadata.set("pdf:openedWithFullPermissions", ""+reader.isOpenedWithFullPermissions());
    metadata.set("pdf:encrypted", "" + document.isEncrypted());
    //metadata.set("pdf:metadataEncrypted", ""+document.isMetadataEncrypted());
    //metadata.set("pdf:128key", ""+reader.is128Key());
    //metadata.set("pdf:tampered", ""+reader.isTampered());
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            XMPMetadata xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
            // Register the PDF/A schema class so the lookup below can find it.
            // Could also parse xmp:CreatorTool, pdf:Producer etc. out of here.
            xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class);
            XMPSchemaPDFA pdfaxmp = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class);
            if (pdfaxmp != null) {
                String conformance = pdfaxmp.getConformance();
                metadata.set("pdfaid:part", pdfaxmp.getPart());
                metadata.set("pdfaid:conformance", conformance);
                // Only override the version when the conformance level is actually present;
                // lower-casing is locale-independent so e.g. "I" never becomes a dotless i.
                if (conformance != null) {
                    String version = "A-" + pdfaxmp.getPart()
                            + conformance.toLowerCase(java.util.Locale.ROOT);
                    //metadata.set("pdfa:version", version );
                    metadata.set("pdf:version", version);
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        // Pass the exception along so the stack trace is not lost.
        log.error("XMP Parsing failed: " + e, e);
        metadata.set("pdf:metadata-xmp-parse-failed", "" + e);
    }

    // Attempt to determine Adobe extension level, if present.
    // A malformed file may store a non-dictionary under these keys, so check before casting.
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    Object extensionsEntry = root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensionsEntry instanceof COSDictionary) {
        COSDictionary extensions = (COSDictionary) extensionsEntry;
        COSName adobeName = COSName.getPDFName("ADBE");
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(adobeName)) {
                Object adobeEntry = extensions.getDictionaryObject(extName);
                if (adobeEntry instanceof COSDictionary) {
                    COSDictionary adobeExt = (COSDictionary) adobeEntry;
                    // NOTE(review): missing entries are not checked; presumably yields "null"/-1 — confirm.
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    metadata.set("pdf:version", baseVersion + " Adobe Extension Level " + el);
                }
                // TODO WARN if this embedded version is inconsistent with document header version?
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
    // End Of ANJ Extensions.
}