List of usage examples for org.apache.pdfbox.cos COSDictionary getNameAsString
public String getNameAsString(COSName key)
From source file:net.padaf.preflight.helpers.FontValidationHelper.java
License:Apache License
@Override public List<ValidationError> innerValidate(DocumentHandler handler) throws ValidationException { List<ValidationError> result = new ArrayList<ValidationError>(0); PDDocument pdfDoc = handler.getDocument(); COSDocument cDoc = pdfDoc.getDocument(); List<?> lCOSObj = cDoc.getObjects(); List<FontValidator> lType3 = new ArrayList<FontValidator>(); for (Object o : lCOSObj) { COSObject cObj = (COSObject) o;/*w w w .java 2 s .c om*/ // If this object represents a Stream, the Dictionary must contain the // Length key COSBase cBase = cObj.getObject(); if (cBase instanceof COSDictionary) { COSDictionary dic = (COSDictionary) cBase; String type = dic.getNameAsString(COSName.getPDFName(DICTIONARY_KEY_TYPE)); if (type != null && FONT_DICTIONARY_VALUE_FONT.equals(type)) { FontValidator fontVal = fontValidationFactory.getFontValidator(cObj, handler); if (fontVal instanceof Type3FontValidator) { lType3.add(fontVal); } else { validateFont(handler, fontVal, result); } } } } // ---- Type 3 can contain other font, so type 3 are validated at the end. for (FontValidator t3FontVal : lType3) { validateFont(handler, t3FontVal, result); } return result; }
From source file:net.padaf.preflight.helpers.GraphicsValidationHelper.java
License:Apache License
@Override public List<ValidationError> innerValidate(DocumentHandler handler) throws ValidationException { List<ValidationError> result = new ArrayList<ValidationError>(0); PDDocument pdfDoc = handler.getDocument(); // ---- Checks all XObjects COSDocument cDoc = pdfDoc.getDocument(); List<?> lCOSObj = cDoc.getObjects(); for (Object o : lCOSObj) { COSObject cObj = (COSObject) o;/*from www .j av a2s . c om*/ COSBase cBase = cObj.getObject(); if (cBase instanceof COSDictionary) { COSDictionary dic = (COSDictionary) cBase; String type = dic.getNameAsString(COSName.getPDFName(DICTIONARY_KEY_TYPE)); if (type != null && DICTIONARY_KEY_XOBJECT.equals(type)) { result.addAll(validateXObject(handler, cObj)); } else if (type != null && DICTIONARY_KEY_PATTERN.equals(type)) { result.addAll(validatePattern(handler, cObj)); } } } return result; }
From source file:net.padaf.preflight.utils.ContentStreamEngine.java
License:Apache License
/** * Throw a ContentStreamException if the LZW filter is used in a InlinedImage. * //from w w w.j a v a 2 s. c om * @param operator * the InlinedImage object (BI to EI) * @throws ContentStreamException */ protected void validImageFilter(PDFOperator operator) throws ContentStreamException { COSDictionary dict = operator.getImageParameters().getDictionary(); // ---- Search a Filter declaration in the InlinedImage dictionary. // ---- The LZWDecode Filter is forbidden. String filter = dict.getNameAsString(STREAM_DICTIONARY_KEY_F); if (filter == null) { filter = dict.getNameAsString(STREAM_DICTIONARY_KEY_FILTER); } String errorCode = FilterHelper.isAuthorizedFilter(filter); if (errorCode != null) { // --- LZW is forbidden. if (ERROR_SYNTAX_STREAM_INVALID_FILTER.equals(errorCode)) { throwContentStreamException("LZW filter can't be used in a PDF/A File", ERROR_SYNTAX_STREAM_INVALID_FILTER); } else { throwContentStreamException("This filter isn't defined in the PDF Reference Third Edition.", ERROR_SYNTAX_STREAM_UNDEFINED_FILTER); } } }
From source file:org.apache.padaf.preflight.helpers.CatalogValidationHelper.java
License:Apache License
/** * This method checks the content of each OutputIntent. The S entry must * contain GTS_PDFA1. The DestOuputProfile must contain a valid ICC Profile * Stream.// www . j av a 2 s . c o m * * If there are more than one OutputIntent, they have to use the same ICC * Profile. * * This method returns a list of ValidationError. It is empty if no errors * have been found. * * @param handler * @return * @throws ValidationException */ public List<ValidationError> validateOutputIntent(DocumentHandler handler) throws ValidationException { List<ValidationError> result = new ArrayList<ValidationError>(0); PDDocument pdDocument = handler.getDocument(); PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); COSDocument cDoc = pdDocument.getDocument(); COSBase cBase = catalog.getCOSDictionary() .getItem(COSName.getPDFName(DOCUMENT_DICTIONARY_KEY_OUTPUT_INTENTS)); COSArray outputIntents = COSUtils.getAsArray(cBase, cDoc); Map<COSObjectKey, Boolean> tmpDestOutputProfile = new HashMap<COSObjectKey, Boolean>(); for (int i = 0; outputIntents != null && i < outputIntents.size(); ++i) { COSDictionary dictionary = COSUtils.getAsDictionary(outputIntents.get(i), cDoc); if (dictionary == null) { result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_INVALID_ENTRY, "OutputIntent object is null or isn't a dictionary")); } else { // ---- S entry is mandatory and must be equals to GTS_PDFA1 String sValue = dictionary.getNameAsString(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_S)); if (!OUTPUT_INTENT_DICTIONARY_VALUE_GTS_PDFA1.equals(sValue)) { result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_S_VALUE_INVALID, "The S entry of the OutputIntent isn't GTS_PDFA1")); continue; } // ---- OutputConditionIdentifier is a mandatory field String outputConditionIdentifier = dictionary .getString(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_OUTPUT_CONDITION_IDENTIFIER)); if (outputConditionIdentifier == null) {// empty string is autorized (it may be an application specific value) result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_INVALID_ENTRY, "The OutputIntentCondition is missing")); continue; } // ---- If OutputConditionIdentifier is "Custom" or a non Standard ICC Characterization : // ---- DestOutputProfile and Info are mandatory // ---- DestOutputProfile must be a ICC Profile // ---- Because of PDF/A conforming file needs to specify the color characteristics, the DestOutputProfile // ---- is checked even if the OutputConditionIdentifier isn't "Custom" COSBase dop = dictionary .getItem(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_DEST_OUTPUT_PROFILE)); ValidationError valer = validateICCProfile(dop, cDoc, tmpDestOutputProfile, handler); if (valer != null) { result.add(valer); continue; } // TODO [LAZY] When Lazy mode will be added, this block should be uncommented to set result as warning. // if (!isStandardICCCharacterization(outputConditionIdentifier)) { // String info = dictionary.getString(COSName.getPDFName(OUTPUT_INTENT_DICTIONARY_KEY_INFO)); // if (info == null || "".equals(info)) { // result.add(new ValidationError(ERROR_GRAPHIC_OUTPUT_INTENT_INVALID_ENTRY, // "The Info entry of a OutputIntent dictionary is missing")); // continue; // } // } } } return result; }
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
@SuppressWarnings("deprecation") private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { XMPMetadata xmp = null;/*from w w w .ja va 2 s .c om*/ XMPSchemaDublinCore dcSchema = null; try { if (document.getDocumentCatalog().getMetadata() != null) { xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata()); } if (xmp != null) { dcSchema = xmp.getDublinCoreSchema(); } } catch (IOException e) { //swallow } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {/*from w w w. j a va 2 s .c o m*/ if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); XMPMetadata xmp = null;/*from ww w . j a v a 2 s . c o m*/ if (dom != null) { xmp = new XMPMetadata(dom); } XMPSchemaDublinCore dcSchema = null; /*if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) {} JempboxExtractor.extractXMPMM(xmp, metadata); }*/ PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle()); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor()); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate()); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getCOSObject().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name, info.getCOSObject().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set(PDF.PDFA_VERSION, version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf18.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {/*from w w w .ja va 2s . c om*/ if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:test.be.fedict.eid.applet.PdfSpikeTest.java
License:Open Source License
@Test public void testSignPDF() throws Exception { // create a sample PDF file Document document = new Document(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); PdfWriter.getInstance(document, baos); document.open();//from w w w. j av a 2 s . c om Paragraph titleParagraph = new Paragraph("This is a test."); titleParagraph.setAlignment(Paragraph.ALIGN_CENTER); document.add(titleParagraph); document.newPage(); Paragraph textParagraph = new Paragraph("Hello world."); document.add(textParagraph); document.close(); File tmpFile = File.createTempFile("test-", ".pdf"); LOG.debug("tmp file: " + tmpFile.getAbsolutePath()); FileUtils.writeByteArrayToFile(tmpFile, baos.toByteArray()); // eID PcscEid pcscEid = new PcscEid(new TestView(), new Messages(Locale.getDefault())); if (false == pcscEid.isEidPresent()) { LOG.debug("insert eID card"); pcscEid.waitForEidPresent(); } List<X509Certificate> signCertificateChain = pcscEid.getSignCertificateChain(); Certificate[] certs = new Certificate[signCertificateChain.size()]; for (int idx = 0; idx < certs.length; idx++) { certs[idx] = signCertificateChain.get(idx); } // open the pdf FileInputStream pdfInputStream = new FileInputStream(tmpFile); File signedTmpFile = File.createTempFile("test-signed-", ".pdf"); PdfReader reader = new PdfReader(pdfInputStream); FileOutputStream pdfOutputStream = new FileOutputStream(signedTmpFile); PdfStamper stamper = PdfStamper.createSignature(reader, pdfOutputStream, '\0', null, true); // add extra page Rectangle pageSize = reader.getPageSize(1); int pageCount = reader.getNumberOfPages(); int extraPageIndex = pageCount + 1; stamper.insertPage(extraPageIndex, pageSize); // calculate unique signature field name int signatureNameIndex = 1; String signatureName; AcroFields existingAcroFields = reader.getAcroFields(); List<String> existingSignatureNames = existingAcroFields.getSignatureNames(); do { signatureName = "Signature" + signatureNameIndex; signatureNameIndex++; } while (existingSignatureNames.contains(signatureName)); LOG.debug("new unique signature name: " + signatureName); PdfSignatureAppearance signatureAppearance = stamper.getSignatureAppearance(); signatureAppearance.setCrypto(null, certs, null, PdfSignatureAppearance.SELF_SIGNED); signatureAppearance.setCertificationLevel(PdfSignatureAppearance.CERTIFIED_NO_CHANGES_ALLOWED); signatureAppearance.setReason("PDF Signature Test"); signatureAppearance.setLocation("Belgium"); signatureAppearance.setVisibleSignature(new Rectangle(54, 440, 234, 566), extraPageIndex, signatureName); signatureAppearance.setExternalDigest(new byte[128], new byte[20], "RSA"); signatureAppearance.preClose(); byte[] content = IOUtils.toByteArray(signatureAppearance.getRangeStream()); byte[] hash = MessageDigest.getInstance("SHA-1").digest(content); byte[] signatureBytes = pcscEid.sign(hash, "SHA-1"); pcscEid.close(); PdfSigGenericPKCS sigStandard = signatureAppearance.getSigStandard(); PdfPKCS7 signature = sigStandard.getSigner(); signature.setExternalDigest(signatureBytes, hash, "RSA"); PdfDictionary dictionary = new PdfDictionary(); dictionary.put(PdfName.CONTENTS, new PdfString(signature.getEncodedPKCS1()).setHexWriting(true)); signatureAppearance.close(dictionary); LOG.debug("signed tmp file: " + signedTmpFile.getAbsolutePath()); // verify the signature reader = new PdfReader(new FileInputStream(signedTmpFile)); AcroFields acroFields = reader.getAcroFields(); ArrayList<String> signatureNames = acroFields.getSignatureNames(); for (String signName : signatureNames) { LOG.debug("signature name: " + signName); LOG.debug("signature covers whole document: " + acroFields.signatureCoversWholeDocument(signName)); LOG.debug("document revision " + acroFields.getRevision(signName) + " of " + acroFields.getTotalRevisions()); PdfPKCS7 pkcs7 = acroFields.verifySignature(signName); Calendar signDate = pkcs7.getSignDate(); LOG.debug("signing date: " + signDate.getTime()); LOG.debug("Subject: " + PdfPKCS7.getSubjectFields(pkcs7.getSigningCertificate())); LOG.debug("Document modified: " + !pkcs7.verify()); Certificate[] verifyCerts = pkcs7.getCertificates(); for (Certificate certificate : verifyCerts) { X509Certificate x509Certificate = (X509Certificate) certificate; LOG.debug("cert subject: " + x509Certificate.getSubjectX500Principal()); } } /* * Reading the signature using Apache PDFBox. */ PDDocument pdDocument = PDDocument.load(signedTmpFile); COSDictionary trailer = pdDocument.getDocument().getTrailer(); /* * PDF Reference - third edition - Adobe Portable Document Format - * Version 1.4 - 3.6.1 Document Catalog */ COSDictionary documentCatalog = (COSDictionary) trailer.getDictionaryObject(COSName.ROOT); /* * 8.6.1 Interactive Form Dictionary */ COSDictionary acroForm = (COSDictionary) documentCatalog.getDictionaryObject(COSName.ACRO_FORM); COSArray fields = (COSArray) acroForm.getDictionaryObject(COSName.FIELDS); for (int fieldIdx = 0; fieldIdx < fields.size(); fieldIdx++) { COSDictionary field = (COSDictionary) fields.getObject(fieldIdx); String fieldType = field.getNameAsString("FT"); if ("Sig".equals(fieldType)) { COSDictionary signatureDictionary = (COSDictionary) field.getDictionaryObject(COSName.V); /* * TABLE 8.60 Entries in a signature dictionary */ COSString signatoryName = (COSString) signatureDictionary.getDictionaryObject(COSName.NAME); if (null != signatoryName) { LOG.debug("signatory name: " + signatoryName.getString()); } COSString reason = (COSString) signatureDictionary.getDictionaryObject(COSName.REASON); if (null != reason) { LOG.debug("reason: " + reason.getString()); } COSString location = (COSString) signatureDictionary.getDictionaryObject(COSName.LOCATION); if (null != location) { LOG.debug("location: " + location.getString()); } Calendar signingTime = signatureDictionary.getDate(COSName.M); if (null != signingTime) { LOG.debug("signing time: " + signingTime.getTime()); } String signatureHandler = signatureDictionary.getNameAsString(COSName.FILTER); LOG.debug("signature handler: " + signatureHandler); } } }
From source file:uk.ac.liverpool.thumbnails.PDFService.java
License:Open Source License
@Override public FontInformation[] extractFontList(URI u, File fff) throws MalformedURLException, IOException { SortedSet<FontInformation> ret = new TreeSet<FontInformation>(); PDDocument document = getPages(u, fff); List pages = document.getDocumentCatalog().getAllPages(); int i = 0;//from ww w . j a va2s .c o m // The code down here is easier as it gets all the fonts used in the document. Still, this would inlcude unused fonts, so we get the fonts page by page and add them to a Hash table. for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) { if (c == null || !(c.getObject() instanceof COSDictionary)) continue; //System.out.println(c.getObject()); COSDictionary fontDictionary = (COSDictionary) c.getObject(); // System.out.println(dic.getNameAsString(COSName.BASE_FONT)); // } // } // int pagen = document.getNumberOfPages(); // i=0; // for (int p=0;p<pagen;p++){ // PDPage page = (PDPage)pages.get(p); // PDResources res = page.findResources(); // //for each page resources // if (res==null) continue; // // get the font dictionary // COSDictionary fonts = (COSDictionary) res.getCOSDictionary().getDictionaryObject( COSName.FONT ); // for( COSName fontName : fonts.keySet() ) { // COSObject font = (COSObject) fonts.getItem( fontName ); // // if the font has already been visited we ingore it // long objectId = font.getObjectNumber().longValue(); // if (ret.get(objectId)!=null) // continue; // if( font==null || ! (font.getObject() instanceof COSDictionary) ) // continue; // COSDictionary fontDictionary = (COSDictionary)font.getObject(); // Type MUSt be font if (!fontDictionary.getNameAsString(COSName.TYPE).equals("Font")) continue; // get the variables FontInformation fi = new FontInformation(); fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE); String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT); if (baseFont == null) continue; if (Arrays.binarySearch(standard14, baseFont) >= 0) continue; COSDictionary fontDescriptor = (COSDictionary) fontDictionary.getDictionaryObject(COSName.FONT_DESC); COSBase enc = fontDictionary.getItem(COSName.ENCODING); COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE); int firstChar = fontDictionary.getInt(COSName.FIRST_CHAR); int lastChar = fontDictionary.getInt(COSName.LAST_CHAR); String encoding; boolean toUnicode = uni != null; if (enc == null) { encoding = "standard14"; } if (enc instanceof COSString) { encoding = ((COSString) enc).getString(); } else { encoding = "table"; } fi.isSubset = false; boolean t = true; // Type one and TT can have subsets defineing the basename see 5.5.3 pdfref 1.6 // if (fi.fontType.lastIndexOf(COSName.TYPE1.getName())!=-1 || fi.fontType.equals(COSName.TRUE_TYPE.getName()) ) if (baseFont != null) { if (baseFont.length() > 6) { for (int k = 0; k < 6; k++) if (!Character.isUpperCase(baseFont.charAt(k))) t = false; if (baseFont.charAt(6) != '+') t = false; } else t = false; fi.isSubset = t; if (fi.isSubset) baseFont = baseFont.substring(7); } fi.fontFlags = 0; if (fi.fontType.equals(COSName.TYPE0) || fi.fontType.equals(COSName.TYPE3)) fi.isEmbedded = true; if (fontDescriptor != null) { // in Type1 charset indicates font is subsetted if (fontDescriptor.getItem(COSName.CHAR_SET) != null) fi.isSubset = true; if (fontDescriptor.getItem(COSName.FONT_FILE) != null || fontDescriptor.getItem(COSName.FONT_FILE3) != null || fontDescriptor.getItem(COSName.FONT_FILE2) != null) fi.isEmbedded = true; fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags")); fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY); fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH); } fi.charset = encoding; fi.fontName = baseFont; fi.isToUnicode = toUnicode; ret.add(fi); } // for all fonts // } // for all pages Iterator<FontInformation> it = ret.iterator(); FontInformation prev = null; LinkedList<FontInformation> toDelete = new LinkedList<FontInformation>(); while (it.hasNext()) { FontInformation current = it.next(); if (prev != null && prev.fontName.equals(current.fontName) && prev.fontType.startsWith("CIDFontType")) toDelete.add(current); prev = current; } ret.removeAll(toDelete); FontInformation[] retArray = ret.toArray(new FontInformation[0]); return retArray; }