List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocument
public COSDocument getDocument()
From source file:org.apache.padaf.preflight.helpers.TrailerValidationHelper.java
License:Apache License
/** * Extracts and compares first and last trailers for PDF version between 1.1 and 1.4 * @param handler/*from w w w. j a v a 2s. c om*/ * @param result */ protected void checkTrailersForLinearizedPDF14(DocumentHandler handler, List<ValidationError> result) { PDDocument pdfDoc = handler.getDocument(); List<String> lTrailers = handler.getPdfExtractor().getAllTrailers(); if (lTrailers.isEmpty()) { result.add(new ValidationResult.ValidationError(ValidationConstants.ERROR_SYNTAX_TRAILER, "There are no trailer in the PDF file")); } else { String firstTrailer = lTrailers.get(0); String lastTrailer = lTrailers.get(lTrailers.size() - 1); COSDictionary first = null; COSDictionary last = null; COSDocument cd = null; try { cd = new COSDocument(); PdfElementParser parser1 = new PdfElementParser(cd, firstTrailer.getBytes()); first = parser1.parseAsDictionary(); PdfElementParser parser2 = new PdfElementParser(cd, lastTrailer.getBytes()); last = parser2.parseAsDictionary(); checkMainTrailer(pdfDoc.getDocument(), first, result); if (!compareIds(first, last, pdfDoc.getDocument())) { result.add(new ValidationResult.ValidationError( ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY, "ID is different in the first and the last trailer")); } } catch (IOException e) { result.add(new ValidationResult.ValidationError(ValidationConstants.ERROR_SYNTAX_TRAILER, "Unable to parse trailers of the linearized PDF")); } finally { COSUtils.closeDocumentQuietly(cd); } } }
From source file:org.apache.padaf.preflight.helpers.TrailerValidationHelper.java
License:Apache License
/** * Accesses and compares First and Last trailers for a PDF version higher than 1.4. * //w w w . j a va2 s. com * @param handler * @param result */ protected void checkTrailersForLinearizedPDF15(DocumentHandler handler, List<ValidationError> result) { PDDocument pdfDoc = handler.getDocument(); try { COSDocument cosDocument = pdfDoc.getDocument(); List<COSObject> xrefs = cosDocument.getObjectsByType(COSName.XREF); if (xrefs.isEmpty()) { // no XRef CosObject, may by this pdf file used the PDF 1.4 syntaxe checkTrailersForLinearizedPDF14(handler, result); } else { long min = Long.MAX_VALUE; long max = Long.MIN_VALUE; COSDictionary firstTrailer = null; COSDictionary lastTrailer = null; // Search First and Last trailers according to offset position. for (COSObject co : xrefs) { long offset = cosDocument.getXrefTable().get(new COSObjectKey(co)); if (offset < min) { min = offset; firstTrailer = (COSDictionary) co.getObject(); } if (offset > max) { max = offset; lastTrailer = (COSDictionary) co.getObject(); } } checkMainTrailer(pdfDoc.getDocument(), firstTrailer, result); if (!compareIds(firstTrailer, lastTrailer, pdfDoc.getDocument())) { result.add(new ValidationResult.ValidationError( ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY, "ID is different in the first and the last trailer")); } } } catch (IOException e) { result.add(new ValidationResult.ValidationError(ValidationConstants.ERROR_SYNTAX_TRAILER, "Unable to check PDF Trailers due to : " + e.getMessage())); } }
From source file:org.apache.padaf.preflight.RetrieveMissingStream.java
License:Apache License
/**
 * Command-line entry point: loads the PDF named by the single argument, collects the keys of
 * all COS objects that wrap a stream, and then walks every page of the document.
 *
 * <p>NOTE(review): the collected keys and the per-page values are never used and only an
 * empty line is printed per page, so this looks like unfinished diagnostic code. The calls
 * are kept as they were in case they are relied on to surface parsing errors.
 *
 * @param args exactly one element, the path of the PDF file to inspect
 * @throws Exception if the file cannot be opened, parsed or closed
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage : RetrieveMissingStream file");
        System.exit(233);
    }

    HashSet<COSObjectKey> listOfKeys = new HashSet<COSObjectKey>();

    FileInputStream input = new FileInputStream(args[0]);
    try {
        PDDocument document = PDDocument.load(input);
        try {
            // Remember the key of every indirect object whose payload is a stream.
            List<COSObject> lCosObj = document.getDocument().getObjects();
            for (COSObject cosObject : lCosObj) {
                if (cosObject.getObject() instanceof COSStream) {
                    listOfKeys.add(new COSObjectKey(cosObject.getObjectNumber().intValue(),
                            cosObject.getGenerationNumber().intValue()));
                }
            }

            PDDocumentCatalog catalog = document.getDocumentCatalog();
            List<?> pages = catalog.getAllPages();
            for (int i = 0; i < pages.size(); ++i) {
                PDPage pdp = (PDPage) pages.get(i);
                PDStream pdStream = pdp.getContents();
                COSBase b = pdp.getCOSDictionary().getItem(COSName.getPDFName("Contents"));
                System.out.println();
            }
        } finally {
            // Release the resources held by the parsed document.
            document.close();
        }
    } finally {
        // Also covers the case where PDDocument.load itself fails.
        input.close();
    }
}
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
/**
 * Copies document-level metadata of a PDF into the supplied {@code Metadata} object.
 *
 * <p>In order, this reads the XMP packet (best effort), the page count, the standard document
 * information entries, any remaining custom document information entries, the PDF header
 * version, the PDF/A identification schema and the {@code Extensions} dictionary of the
 * document catalog.
 *
 * @param document the PDF document to read from
 * @param metadata the metadata object to populate
 * @throws TikaException declared by the signature; presumably raised by the helper methods
 *                       called here (TODO confirm)
 */
@SuppressWarnings("deprecation")
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
        }
        if (xmp != null) {
            dcSchema = xmp.getDublinCoreSchema();
        }
    } catch (IOException ignored) {
        // Best effort: an unreadable XMP packet must not prevent extraction of the
        // document information dictionary below.
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);

    // All remaining metadata is custom.
    // Copy this over as-is.
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    // Try to get the various versions.
    // Caveats:
    //   there is currently a fair amount of redundancy
    //   TikaCoreProperties.FORMAT can be multivalued
    //   There are also three potential pdf specific version keys:
    //   pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    String headerVersion = Float.toString(document.getDocument().getVersion());
    metadata.set("pdf:PDFVersion", headerVersion);
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + headerVersion);

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                // The part may be absent from the schema; converting a null part would throw.
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    // TODO: Let's try to move this into PDFBox.
    // Attempt to determine Adobe extension level, if present.
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    // A malformed file may store something other than a dictionary under /Extensions, so the
    // type is checked before casting.
    Object extensionsEntry = root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensionsEntry instanceof COSDictionary) {
        COSDictionary extensions = (COSDictionary) extensionsEntry;
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                Object adobeEntry = extensions.getDictionaryObject(extName);
                if (adobeEntry instanceof COSDictionary) {
                    COSDictionary adobeExt = (COSDictionary) adobeEntry;
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    // -1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {//w w w .j av a2s .com if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); 
extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { 
metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); XMPMetadata xmp = null;//from w w w . j a v a 2s. 
c o m if (dom != null) { xmp = new XMPMetadata(dom); } XMPSchemaDublinCore dcSchema = null; /*if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) {} JempboxExtractor.extractXMPMM(xmp, metadata); }*/ PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle()); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor()); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); addMetadata(metadata, 
PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate()); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getCOSObject().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name, info.getCOSObject().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set(PDF.PDFA_VERSION, version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf18.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {// ww w . j ava 2 s . 
c o m if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, 
name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:test.be.fedict.eid.applet.PdfSpikeTest.java
License:Open Source License
@Test public void testSignPDF() throws Exception { // create a sample PDF file Document document = new Document(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); PdfWriter.getInstance(document, baos); document.open();// ww w .j a v a 2 s . c o m Paragraph titleParagraph = new Paragraph("This is a test."); titleParagraph.setAlignment(Paragraph.ALIGN_CENTER); document.add(titleParagraph); document.newPage(); Paragraph textParagraph = new Paragraph("Hello world."); document.add(textParagraph); document.close(); File tmpFile = File.createTempFile("test-", ".pdf"); LOG.debug("tmp file: " + tmpFile.getAbsolutePath()); FileUtils.writeByteArrayToFile(tmpFile, baos.toByteArray()); // eID PcscEid pcscEid = new PcscEid(new TestView(), new Messages(Locale.getDefault())); if (false == pcscEid.isEidPresent()) { LOG.debug("insert eID card"); pcscEid.waitForEidPresent(); } List<X509Certificate> signCertificateChain = pcscEid.getSignCertificateChain(); Certificate[] certs = new Certificate[signCertificateChain.size()]; for (int idx = 0; idx < certs.length; idx++) { certs[idx] = signCertificateChain.get(idx); } // open the pdf FileInputStream pdfInputStream = new FileInputStream(tmpFile); File signedTmpFile = File.createTempFile("test-signed-", ".pdf"); PdfReader reader = new PdfReader(pdfInputStream); FileOutputStream pdfOutputStream = new FileOutputStream(signedTmpFile); PdfStamper stamper = PdfStamper.createSignature(reader, pdfOutputStream, '\0', null, true); // add extra page Rectangle pageSize = reader.getPageSize(1); int pageCount = reader.getNumberOfPages(); int extraPageIndex = pageCount + 1; stamper.insertPage(extraPageIndex, pageSize); // calculate unique signature field name int signatureNameIndex = 1; String signatureName; AcroFields existingAcroFields = reader.getAcroFields(); List<String> existingSignatureNames = existingAcroFields.getSignatureNames(); do { signatureName = "Signature" + signatureNameIndex; signatureNameIndex++; } while 
(existingSignatureNames.contains(signatureName)); LOG.debug("new unique signature name: " + signatureName); PdfSignatureAppearance signatureAppearance = stamper.getSignatureAppearance(); signatureAppearance.setCrypto(null, certs, null, PdfSignatureAppearance.SELF_SIGNED); signatureAppearance.setCertificationLevel(PdfSignatureAppearance.CERTIFIED_NO_CHANGES_ALLOWED); signatureAppearance.setReason("PDF Signature Test"); signatureAppearance.setLocation("Belgium"); signatureAppearance.setVisibleSignature(new Rectangle(54, 440, 234, 566), extraPageIndex, signatureName); signatureAppearance.setExternalDigest(new byte[128], new byte[20], "RSA"); signatureAppearance.preClose(); byte[] content = IOUtils.toByteArray(signatureAppearance.getRangeStream()); byte[] hash = MessageDigest.getInstance("SHA-1").digest(content); byte[] signatureBytes = pcscEid.sign(hash, "SHA-1"); pcscEid.close(); PdfSigGenericPKCS sigStandard = signatureAppearance.getSigStandard(); PdfPKCS7 signature = sigStandard.getSigner(); signature.setExternalDigest(signatureBytes, hash, "RSA"); PdfDictionary dictionary = new PdfDictionary(); dictionary.put(PdfName.CONTENTS, new PdfString(signature.getEncodedPKCS1()).setHexWriting(true)); signatureAppearance.close(dictionary); LOG.debug("signed tmp file: " + signedTmpFile.getAbsolutePath()); // verify the signature reader = new PdfReader(new FileInputStream(signedTmpFile)); AcroFields acroFields = reader.getAcroFields(); ArrayList<String> signatureNames = acroFields.getSignatureNames(); for (String signName : signatureNames) { LOG.debug("signature name: " + signName); LOG.debug("signature covers whole document: " + acroFields.signatureCoversWholeDocument(signName)); LOG.debug("document revision " + acroFields.getRevision(signName) + " of " + acroFields.getTotalRevisions()); PdfPKCS7 pkcs7 = acroFields.verifySignature(signName); Calendar signDate = pkcs7.getSignDate(); LOG.debug("signing date: " + signDate.getTime()); LOG.debug("Subject: " + 
PdfPKCS7.getSubjectFields(pkcs7.getSigningCertificate())); LOG.debug("Document modified: " + !pkcs7.verify()); Certificate[] verifyCerts = pkcs7.getCertificates(); for (Certificate certificate : verifyCerts) { X509Certificate x509Certificate = (X509Certificate) certificate; LOG.debug("cert subject: " + x509Certificate.getSubjectX500Principal()); } } /* * Reading the signature using Apache PDFBox. */ PDDocument pdDocument = PDDocument.load(signedTmpFile); COSDictionary trailer = pdDocument.getDocument().getTrailer(); /* * PDF Reference - third edition - Adobe Portable Document Format - * Version 1.4 - 3.6.1 Document Catalog */ COSDictionary documentCatalog = (COSDictionary) trailer.getDictionaryObject(COSName.ROOT); /* * 8.6.1 Interactive Form Dictionary */ COSDictionary acroForm = (COSDictionary) documentCatalog.getDictionaryObject(COSName.ACRO_FORM); COSArray fields = (COSArray) acroForm.getDictionaryObject(COSName.FIELDS); for (int fieldIdx = 0; fieldIdx < fields.size(); fieldIdx++) { COSDictionary field = (COSDictionary) fields.getObject(fieldIdx); String fieldType = field.getNameAsString("FT"); if ("Sig".equals(fieldType)) { COSDictionary signatureDictionary = (COSDictionary) field.getDictionaryObject(COSName.V); /* * TABLE 8.60 Entries in a signature dictionary */ COSString signatoryName = (COSString) signatureDictionary.getDictionaryObject(COSName.NAME); if (null != signatoryName) { LOG.debug("signatory name: " + signatoryName.getString()); } COSString reason = (COSString) signatureDictionary.getDictionaryObject(COSName.REASON); if (null != reason) { LOG.debug("reason: " + reason.getString()); } COSString location = (COSString) signatureDictionary.getDictionaryObject(COSName.LOCATION); if (null != location) { LOG.debug("location: " + location.getString()); } Calendar signingTime = signatureDictionary.getDate(COSName.M); if (null != signingTime) { LOG.debug("signing time: " + signingTime.getTime()); } String signatureHandler = 
signatureDictionary.getNameAsString(COSName.FILTER); LOG.debug("signature handler: " + signatureHandler); } } }
From source file:uk.ac.liverpool.thumbnails.PDFService.java
License:Open Source License
@Override public FontInformation[] extractFontList(URI u, File fff) throws MalformedURLException, IOException { SortedSet<FontInformation> ret = new TreeSet<FontInformation>(); PDDocument document = getPages(u, fff); List pages = document.getDocumentCatalog().getAllPages(); int i = 0;//from w w w.ja v a 2 s. c o m // The code down here is easier as it gets all the fonts used in the document. Still, this would inlcude unused fonts, so we get the fonts page by page and add them to a Hash table. for (COSObject c : document.getDocument().getObjectsByType(COSName.FONT)) { if (c == null || !(c.getObject() instanceof COSDictionary)) continue; //System.out.println(c.getObject()); COSDictionary fontDictionary = (COSDictionary) c.getObject(); // System.out.println(dic.getNameAsString(COSName.BASE_FONT)); // } // } // int pagen = document.getNumberOfPages(); // i=0; // for (int p=0;p<pagen;p++){ // PDPage page = (PDPage)pages.get(p); // PDResources res = page.findResources(); // //for each page resources // if (res==null) continue; // // get the font dictionary // COSDictionary fonts = (COSDictionary) res.getCOSDictionary().getDictionaryObject( COSName.FONT ); // for( COSName fontName : fonts.keySet() ) { // COSObject font = (COSObject) fonts.getItem( fontName ); // // if the font has already been visited we ingore it // long objectId = font.getObjectNumber().longValue(); // if (ret.get(objectId)!=null) // continue; // if( font==null || ! 
(font.getObject() instanceof COSDictionary) ) // continue; // COSDictionary fontDictionary = (COSDictionary)font.getObject(); // Type MUSt be font if (!fontDictionary.getNameAsString(COSName.TYPE).equals("Font")) continue; // get the variables FontInformation fi = new FontInformation(); fi.fontType = fontDictionary.getNameAsString(COSName.SUBTYPE); String baseFont = fontDictionary.getNameAsString(COSName.BASE_FONT); if (baseFont == null) continue; if (Arrays.binarySearch(standard14, baseFont) >= 0) continue; COSDictionary fontDescriptor = (COSDictionary) fontDictionary.getDictionaryObject(COSName.FONT_DESC); COSBase enc = fontDictionary.getItem(COSName.ENCODING); COSBase uni = fontDictionary.getItem(COSName.TO_UNICODE); int firstChar = fontDictionary.getInt(COSName.FIRST_CHAR); int lastChar = fontDictionary.getInt(COSName.LAST_CHAR); String encoding; boolean toUnicode = uni != null; if (enc == null) { encoding = "standard14"; } if (enc instanceof COSString) { encoding = ((COSString) enc).getString(); } else { encoding = "table"; } fi.isSubset = false; boolean t = true; // Type one and TT can have subsets defineing the basename see 5.5.3 pdfref 1.6 // if (fi.fontType.lastIndexOf(COSName.TYPE1.getName())!=-1 || fi.fontType.equals(COSName.TRUE_TYPE.getName()) ) if (baseFont != null) { if (baseFont.length() > 6) { for (int k = 0; k < 6; k++) if (!Character.isUpperCase(baseFont.charAt(k))) t = false; if (baseFont.charAt(6) != '+') t = false; } else t = false; fi.isSubset = t; if (fi.isSubset) baseFont = baseFont.substring(7); } fi.fontFlags = 0; if (fi.fontType.equals(COSName.TYPE0) || fi.fontType.equals(COSName.TYPE3)) fi.isEmbedded = true; if (fontDescriptor != null) { // in Type1 charset indicates font is subsetted if (fontDescriptor.getItem(COSName.CHAR_SET) != null) fi.isSubset = true; if (fontDescriptor.getItem(COSName.FONT_FILE) != null || fontDescriptor.getItem(COSName.FONT_FILE3) != null || fontDescriptor.getItem(COSName.FONT_FILE2) != null) fi.isEmbedded = 
true; fi.fontFlags = fontDescriptor.getInt(COSName.getPDFName("Flags")); fi.fontFamily = fontDescriptor.getString(COSName.FONT_FAMILY); fi.fontStretch = fontDescriptor.getString(COSName.FONT_STRETCH); } fi.charset = encoding; fi.fontName = baseFont; fi.isToUnicode = toUnicode; ret.add(fi); } // for all fonts // } // for all pages Iterator<FontInformation> it = ret.iterator(); FontInformation prev = null; LinkedList<FontInformation> toDelete = new LinkedList<FontInformation>(); while (it.hasNext()) { FontInformation current = it.next(); if (prev != null && prev.fontName.equals(current.fontName) && prev.fontType.startsWith("CIDFontType")) toDelete.add(current); prev = current; } ret.removeAll(toDelete); FontInformation[] retArray = ret.toArray(new FontInformation[0]); return retArray; }
From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, Metadata.TITLE, info.getTitle()); addMetadata(metadata, Metadata.AUTHOR, info.getAuthor()); addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords()); addMetadata(metadata, "pdf:creator", info.getCreator()); addMetadata(metadata, "pdf:producer", info.getProducer()); addMetadata(metadata, Metadata.SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); if (info.getCOSObject() != null && info.getCOSObject().keySet() != null) { for (COSName key : info.getCOSObject().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); }// ww w .j ava 2s.co m } } // ANJ Extensions: // // // Add other data of interest: metadata.set("pdf:version", "" + document.getDocument().getVersion()); metadata.set("pdf:numPages", "" + document.getNumberOfPages()); //metadata.set("pdf:cryptoMode", ""+getCryptoModeAsString(reader)); //metadata.set("pdf:openedWithFullPermissions", ""+reader.isOpenedWithFullPermissions()); metadata.set("pdf:encrypted", "" + document.isEncrypted()); //metadata.set("pdf:metadataEncrypted", ""+document.isMetadataEncrypted()); //metadata.set("pdf:128key", ""+reader.is128Key()); //metadata.set("pdf:tampered", ""+reader.isTampered()); try { if 
(document.getDocumentCatalog().getMetadata() != null) { XMPMetadata xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata()); // There is a special class for grabbing data in the PDF schema - not sure it will add much here: // Could parse xmp:CreatorTool and pdf:Producer etc. etc. out of here. XMPSchemaPDF pdfxmp = xmp.getPDFSchema(); // Added a PDF/A schema class: xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class); XMPSchemaPDFA pdfaxmp = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class); if (pdfaxmp != null) { metadata.set("pdfaid:part", pdfaxmp.getPart()); metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(); //metadata.set("pdfa:version", version ); metadata.set("pdf:version", version); } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { log.error("XMP Parsing failed: " + e); metadata.set("pdf:metadata-xmp-parse-failed", "" + e); } // Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); metadata.set("pdf:version", baseVersion + " Adobe Extension Level " + el); } // TODO WARN if this embedded version is inconsistent with document header version? } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. 
metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } // End Of ANJ Extensions. }