List of usage examples for org.apache.pdfbox.cos.COSDictionary#getInt
public int getInt(COSName key)
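The examples below call getInt both with COSName constants and with the String-keyed convenience overload (e.g. getInt("MCID"), getInt("P")). For orientation, here is a minimal, self-contained sketch (assuming PDFBox 2.x; the /Flags and /FirstChar keys are chosen only for illustration): when the key is absent, getInt(COSName) returns -1 as a sentinel, which several examples below rely on, and a second overload accepts an explicit default.

import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;

public class GetIntDemo {
    public static void main(String[] args) {
        COSDictionary dict = new COSDictionary();
        dict.setInt(COSName.FLAGS, 4);                      // store an integer entry under /Flags
        int flags = dict.getInt(COSName.FLAGS);             // 4
        int missing = dict.getInt(COSName.FIRST_CHAR);      // key absent: returns the -1 sentinel
        int fallback = dict.getInt(COSName.FIRST_CHAR, 0);  // overload with an explicit default: 0
        System.out.println(flags + " " + missing + " " + fallback);
    }
}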
From source file:com.esri.geoportal.commons.pdf.PdfUtils.java
License:Apache License
/**
 * Extracts the geospatial metadata from a GeoPDF.
 *
 * @param page the PDF page to read geospatial metadata from
 * @param geometryServiceUrl url of a <a href="https://developers.arcgis.com/rest/services-reference/geometry-service.htm">geometry service</a> for reprojecting coordinates.
 *
 * @see <a href="https://www.loc.gov/preservation/digital/formats/fdd/fdd000312.shtml">Library of Congress information on GeoPDF</a>
 * @see <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">The PDF specification</a>, section 8, for instructions for translating coordinates.
 *
 * @return the bounding box of the GeoPDF as "yMin xMin, yMax xMax"
 */
private static String extractGeoPDFProps(PDPage page, String geometryServiceUrl) {
    // The LGI dictionary is an array; loop through all entries and pull the first one for a bounding box
    COSArray lgi = (COSArray) page.getCOSObject().getDictionaryObject("LGIDict");
    List<String> bBoxes = new ArrayList<>();
    lgi.iterator().forEachRemaining(item -> {
        String currentBbox = null;

        // Set up the Coordinate Transformation Matrix (used to translate PDF coords to geo coords)
        Double[][] ctmValues = null;
        COSDictionary dictionary = (COSDictionary) item;
        if (dictionary.containsKey("CTM")) {
            ctmValues = new Double[3][3];
            // The last column in the matrix is always constant
            ctmValues[0][2] = 0.0;
            ctmValues[1][2] = 0.0;
            ctmValues[2][2] = 1.0;
            COSArray ctm = (COSArray) dictionary.getDictionaryObject("CTM");
            for (int i = 0; i < ctm.toList().size(); i += 2) {
                int ctmRow = i / 2;
                ctmValues[ctmRow][0] = Double.parseDouble(((COSString) ctm.get(i)).getString());
                ctmValues[ctmRow][1] = Double.parseDouble(((COSString) ctm.get(i + 1)).getString());
            }
        }

        // Get the neatline (i.e. the bounding box in *PDF* coordinates)
        Double[][] neatLineValues = null;
        int neatLineLength = 0;
        if (dictionary.containsKey("Neatline")) {
            COSArray neatline = (COSArray) dictionary.getDictionaryObject("Neatline");
            neatLineLength = neatline.toList().size();
            neatLineValues = new Double[neatLineLength / 2][3];
            for (int i = 0; i < neatline.toList().size(); i += 2) {
                int neatLineRow = i / 2;
                neatLineValues[neatLineRow][0] = Double.parseDouble(((COSString) neatline.get(i)).getString());
                neatLineValues[neatLineRow][1] = Double.parseDouble(((COSString) neatline.get(i + 1)).getString());
                neatLineValues[neatLineRow][2] = 1.0;
            }
        }

        // Translate the PDF coordinates to geospatial coordinates by multiplying the two matrices
        MultiPoint mp = new MultiPoint();
        if (ctmValues != null && neatLineValues != null) {
            Double[][] resultCoords = new Double[neatLineLength / 2][3];
            for (int z = 0; z < neatLineLength / 2; z++) {
                for (int i = 0; i < 3; i++) {
                    resultCoords[z][i] = neatLineValues[z][0] * ctmValues[0][i]
                            + neatLineValues[z][1] * ctmValues[1][i]
                            + neatLineValues[z][2] * ctmValues[2][i];
                }
                mp.add(resultCoords[z][0], resultCoords[z][1]);
            }
        }

        // Project the geospatial coordinates to WGS84 for the Dublin-Core metadata
        if (dictionary.containsKey("Projection")) {
            COSDictionary projectionDictionary = (COSDictionary) dictionary.getDictionaryObject("Projection");
            String projectionType = projectionDictionary.getString("ProjectionType");
            try (GeometryService svc = new GeometryService(
                    HttpClients.custom().useSystemProperties().build(), new URL(geometryServiceUrl))) {
                // UTM projections require slightly different processing
                if ("UT".equals(projectionType)) {
                    String zone = Integer.toString(projectionDictionary.getInt("Zone"));
                    String hemisphere = projectionDictionary.getString("Hemisphere");
                    // Get the WKT for the geospatial coordinate system
                    String wkt = datumTranslation(projectionDictionary.getItem("Datum"));
                    if (zone != null && hemisphere != null && wkt != null) {
                        // Generate a list of UTM strings
                        List<String> utmCoords = new ArrayList<>();
                        for (Point2D pt : mp.getCoordinates2D()) {
                            String coord = String.format("%s%s %s %s", zone, hemisphere,
                                    Math.round(pt.x), Math.round(pt.y));
                            utmCoords.add(coord);
                        }
                        MultiPoint reproj = svc.fromGeoCoordinateString(utmCoords, WGS84_WKID);
                        currentBbox = generateBbox(reproj);
                    } else {
                        LOG.warn("Missing UTM argument: zone: {}, hemisphere: {}, datum: {}", zone, hemisphere, wkt);
                        LOG.debug("Projection dictionary {}", projectionDictionary);
                    }
                } else {
                    // Generate Well-Known Text for the projection and reproject the points to WGS 84
                    String wkt = getProjectionWKT(projectionDictionary, projectionType);
                    if (wkt != null) {
                        MultiPoint reproj = svc.project(mp, wkt, WGS84_WKID);
                        currentBbox = generateBbox(reproj);
                    } else if (LOG.isDebugEnabled()) {
                        // Print out translated coordinates for debugging purposes
                        LOG.debug("Translated Coordinates");
                        for (Point2D pt : mp.getCoordinates2D()) {
                            LOG.debug(String.format("\t%s, %s", pt.x, pt.y));
                        }
                    }
                }
            } catch (Exception e) {
                // If something goes wrong, just try the next set of coordinates
                LOG.error("Exception reprojecting geometry, skipping this GeoPDF dictionary instance...", e);
            }
        }

        if (currentBbox != null) {
            bBoxes.add(currentBbox);
        }
    });
    return bBoxes.get(0);
}
From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java
License:Apache License
/**
 * Check for encryption with Apache PDFBox by querying the encryption
 * dictionary (might allow more granular checks of protection).
 * @param pPDF pdf file to check
 * @return whether or not the file has DRM
 */
public static boolean hasDRMGranular(File pPDF) {
    boolean ret = false;
    try {
        PDFParser parser = new PDFParser(new FileInputStream(pPDF));
        parser.parse();
        COSDictionary dict = parser.getDocument().getEncryptionDictionary();
        if (dict != null) {
            // Print the encryption dictionary:
            // for (COSName key : dict.keySet()) {
            //     System.out.print(key.getName());
            //     String value = dict.getString(key);
            //     if (value != null) {
            //         System.out.println(": " + value);
            //     } else {
            //         System.out.println(": " + dict.getLong(key));
            //     }
            // }

            // This feature in PDFBox is currently broken, see:
            // https://issues.apache.org/jira/browse/PDFBOX-1651
            // AccessPermission perms = parser.getPDDocument().getCurrentAccessPermission();
            // As a workaround, build a new AccessPermission from the raw /P permission bits
            AccessPermission perms = new AccessPermission(dict.getInt("P"));
            boolean debug = false;
            if (debug) {
                System.out.println("canAssembleDocument()        : " + perms.canAssembleDocument());
                System.out.println("canExtractContent()          : " + perms.canExtractContent());
                System.out.println("canExtractForAccessibility() : " + perms.canExtractForAccessibility());
                System.out.println("canFillInForm()              : " + perms.canFillInForm());
                System.out.println("canModify()                  : " + perms.canModify());
                System.out.println("canModifyAnnotations()       : " + perms.canModifyAnnotations());
                System.out.println("canPrint()                   : " + perms.canPrint());
                System.out.println("canPrintDegraded()           : " + perms.canPrintDegraded());
                System.out.println("isOwnerPermission()          : " + perms.isOwnerPermission());
                System.out.println("isReadOnly()                 : " + perms.isReadOnly());
            }
            // Note: ret is never updated from perms, so as written this method always returns false.
        }
        parser.getDocument().close();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:modules.PDFFontDependencyExtractorModule.java
License:Apache License
public PDFFontResults extractFontList(File f) throws IOException, InvalidParameterException {
    PDDocument document;
    try {
        document = PDDocument.load(f);
    } catch (IOException x) {
        throw new InvalidParameterException("Not a PDF file");
    }
    SortedSet<FontInformation> ret = new TreeSet<FontInformation>(new Comparator<FontInformation>() {
        @Override
        public int compare(FontInformation o1, FontInformation o2) {
            int a = o1.fontName.compareTo(o2.fontName);
            if (a != 0)
                return a;
            else
                return o1.fontType.compareTo(o2.fontType);
        }
    });
    document.getDocumentCatalog().getAllPages();

    // The code below is easier as it gets all the fonts used in the document.
    // Still, this would include unused fonts, so we get the fonts page by page
    // and add them to a hash table.
    for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) {
        if (c == null || !(c.getObject() instanceof COSDictionary)) {
            continue;
        }
        COSDictionary fontDictionary = (COSDictionary) c.getObject();
        // Alternative per-page approach, kept for reference:
        // int pagen = document.getNumberOfPages();
        // i = 0;
        // for (int p = 0; p < pagen; p++) {
        //     PDPage page = (PDPage) pages.get(p);
        //     PDResources res = page.findResources();
        //     // for each page's resources
        //     if (res == null) continue;
        //     // get the font dictionary
        //     COSDictionary fonts = (COSDictionary) res.getCOSDictionary().getDictionaryObject(COSName.FONT);
        //     for (COSName fontName : fonts.keySet()) {
        //         COSObject font = (COSObject) fonts.getItem(fontName);
        //         // if the font has already been visited we ignore it
        //         long objectId = font.getObjectNumber().longValue();
        //         if (ret.get(objectId) != null)
        //             continue;
        //         if (font == null || !(font.getObject() instanceof COSDictionary))
        //             continue;
        //         COSDictionary fontDictionary = (COSDictionary) font.getObject();

        // Type must be Font
        if (!fontDictionary.getNameAsString(COSName.TYPE).equals("Font")) {
            continue;
        }
        // get the variables
        FontInformation fi = new FontInformation();
        fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE);
        String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT);
        if (baseFont == null) {
            continue;
        }
        if (Arrays.binarySearch(standard14, baseFont) >= 0) {
            continue;
        }
        COSDictionary fontDescriptor = (COSDictionary) fontDictionary.getDictionaryObject(COSName.FONT_DESC);
        COSBase enc = fontDictionary.getItem(COSName.ENCODING);
        COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE);
        fontDictionary.getInt(COSName.FIRST_CHAR);
        fontDictionary.getInt(COSName.LAST_CHAR);
        String encoding;
        boolean toUnicode = uni != null;
        // else-if chain, so the "standard14" case is not overwritten by the fallback
        if (enc == null) {
            encoding = "standard14";
        } else if (enc instanceof COSString) {
            encoding = ((COSString) enc).getString();
        } else {
            encoding = "table";
        }
        fi.isSubset = false;
        boolean t = true;
        // Type1 and TrueType fonts can have subsets defining the base name,
        // see section 5.5.3 of the PDF Reference 1.6.
        // if (fi.fontType.lastIndexOf(COSName.TYPE1.getName()) != -1
        //         || fi.fontType.equals(COSName.TRUE_TYPE.getName()))
        if (baseFont != null) {
            if (baseFont.length() > 6) {
                for (int k = 0; k < 6; k++)
                    if (!Character.isUpperCase(baseFont.charAt(k))) {
                        t = false;
                    }
                if (baseFont.charAt(6) != '+') {
                    t = false;
                }
            } else {
                t = false;
            }
            fi.isSubset = t;
            if (fi.isSubset) {
                fi.baseName = baseFont.substring(0, 6);
                baseFont = baseFont.substring(7);
            }
        }
        fi.fontFlags = 0;
        if (fi.fontType.equals(COSName.TYPE0.getName()) || fi.fontType.equals(COSName.TYPE3.getName())) {
            fi.isEmbedded = true;
        }
        if (fontDescriptor != null) {
            // in Type1, a CharSet entry indicates the font is subsetted
            if (fontDescriptor.getItem(COSName.CHAR_SET) != null) {
                fi.isSubset = true;
            }
            if (fontDescriptor.getItem(COSName.FONT_FILE) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE3) != null
                    || fontDescriptor.getItem(COSName.FONT_FILE2) != null) {
                fi.isEmbedded = true;
            }
            fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags"));
            fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY);
            fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH);
        }
        fi.charset = encoding;
        fi.fontName = baseFont;
        fi.isToUnicode = toUnicode;
        fi.encoding = fontDictionary.getNameAsString(COSName.CID_TO_GID_MAP);
        ret.add(fi);
    } // for all fonts

    HashMultimap<String, FontInformation> m = HashMultimap.create();
    for (FontInformation ff : ret) {
        m.put(ff.fontName, ff);
    }
    LinkedList<FontInformation> missing = new LinkedList<FontInformation>();
    Set<String> k = m.keySet();
    for (String kk : k) {
        Set<FontInformation> s = m.get(kk);
        if (s.size() < 1) {
            continue;
        }
        if (s.size() > 1) {
            boolean found = false;
            FontInformation ff = null;
            for (FontInformation fonti : s) {
                if (!fonti.isEmbedded) {
                    ff = fonti;
                } else {
                    found = true;
                }
            }
            if (!found) {
                missing.add(ff);
            }
        } else {
            FontInformation ff = s.iterator().next();
            if (!ff.isEmbedded) {
                missing.add(ff);
            }
        }
    } // for all pages

    // De-duplication across pages, kept for reference:
    // Iterator<FontInformation> it = ret.iterator();
    // FontInformation prev = null;
    // LinkedList<FontInformation> toDelete = new LinkedList<FontInformation>();
    // while (it.hasNext()) {
    //     FontInformation current = it.next();
    //     if (prev != null && prev.fontName.equals(current.fontName)
    //             && (prev.fontType.startsWith("CIDFontType") || current.fontType.startsWith("CIDFontType")))
    //         toDelete.add(current);
    //     prev = current;
    // }
    // ret.removeAll(toDelete);
    // FontInformation[] retArray = toDelete.toArray(new FontInformation[0]);

    if (missing.size() == 0) {
        missing = null;
    } else {
        System.out.println("Found missing fonts: " + f);
        System.out.println(missing);
    }
    return new PDFFontResults(new LinkedList<FontInformation>(ret), missing);
}
From source file:net.padaf.preflight.helpers.GraphicsValidationHelper.java
License:Apache License
public List<ValidationError> validatePattern(DocumentHandler handler, COSObject cObj) throws ValidationException {
    COSDictionary cosPattern = (COSDictionary) cObj.getObject();
    int ptype = cosPattern.getInt(DICTIONARY_KEY_PATTERN_TYPE);
    XObjectValidator validator = null;
    switch (ptype) {
    case DICTIONARY_PATTERN_TILING:
        validator = new TilingPattern(handler, (COSStream) cosPattern);
        break;
    case DICTIONARY_PATTERN_SHADING:
        validator = new ShadingPattern(handler, cosPattern);
        break;
    default:
        throw new ValidationException("Unknown pattern type: " + ptype);
    }
    return validator.validate();
}
From source file:org.apache.fop.render.pdf.PageParentTreeFinderTestCase.java
License:Apache License
@Test
public void testGetPageParentTreeArray() throws IOException {
    File resource = new File(getClass().getResource(LINK).getFile());
    PDDocument doc = PDDocument.load(resource);
    PDPage srcPage = doc.getPage(0);
    PageParentTreeFinder finder = new PageParentTreeFinder(srcPage);
    COSArray markedContentParents = finder.getPageParentTreeArray(doc);
    Assert.assertEquals(markedContentParents.size(), 3);
    COSObject firstObj = (COSObject) markedContentParents.get(0);
    COSObject secObj = (COSObject) markedContentParents.get(1);
    COSArray firstKids = (COSArray) firstObj.getDictionaryObject(COSName.K);
    COSDictionary firstKid = (COSDictionary) firstKids.get(0);
    int test = firstKid.getInt("MCID");
    int expected = 0;
    Assert.assertEquals(test, expected);
    COSDictionary firstKidBrother = (COSDictionary) firstKids.get(2);
    test = firstKidBrother.getInt("MCID");
    expected = 2;
    Assert.assertEquals(test, expected);
    COSArray secKidsArray = (COSArray) secObj.getDictionaryObject(COSName.K);
    COSDictionary secondKid = (COSDictionary) secKidsArray.get(0);
    test = secondKid.getInt("MCID");
    expected = 1;
    Assert.assertEquals(test, expected);
}
From source file:org.apache.fop.render.pdf.pdfbox.PageParentTreeFinder.java
License:Apache License
private int findXObjectStructParent() throws IOException {
    int position = -1;
    Iterable<COSName> mapXObject = srcPage.getResources().getXObjectNames();
    for (COSName n : mapXObject) {
        PDXObject t = srcPage.getResources().getXObject(n);
        COSDictionary xObjectDict = (COSDictionary) t.getCOSObject();
        position = xObjectDict.getInt(COSName.STRUCT_PARENTS);
        if (position != -1) {
            return position;
        }
    }
    return position;
}
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
@SuppressWarnings("deprecation")
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
        }
        if (xmp != null) {
            dcSchema = xmp.getDublinCoreSchema();
        }
    } catch (IOException e) {
        // swallow
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);

    // All remaining metadata is custom; copy it over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    // Try to get the various versions.
    // Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    there are also three potential PDF-specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    // TODO: Let's try to move this into PDFBox.
    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    // -1 is the sentinel value getInt returns when something went wrong
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + baseVersion
                                        + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    // first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    // now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom; copy it over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    // Try to get the various versions.
    // Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    there are also three potential PDF-specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    // TODO: Let's try to move this into PDFBox.
    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    // -1 is the sentinel value getInt returns when something went wrong
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + baseVersion
                                        + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
        throws TikaException {
    // first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    // now go for the XMP
    Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
    XMPMetadata xmp = null;
    if (dom != null) {
        xmp = new XMPMetadata(dom);
    }
    XMPSchemaDublinCore dcSchema = null;
    /*
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }
    */

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());

    // All remaining metadata is custom; copy it over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getCOSObject().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                    info.getCOSObject().getDictionaryObject(key));
        }
    }

    // Try to get the various versions.
    // Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    there are also three potential PDF-specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set(PDF.PDFA_VERSION, version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    // TODO: Let's try to move this into PDFBox.
    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    // -1 is the sentinel value getInt returns when something went wrong
                    if (el != -1) {
                        metadata.set(PDF.PDF_EXTENSION_VERSION,
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + baseVersion
                                        + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf18.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    // first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    // now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom; copy it over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    // Try to get the various versions.
    // Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    there are also three potential PDF-specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    // TODO: Let's try to move this into PDFBox.
    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    // -1 is the sentinel value getInt returns when something went wrong
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + baseVersion
                                        + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}