List of usage examples for org.apache.pdfbox.pdmodel.PDDocumentInformation.getModificationDate()
public Calendar getModificationDate()
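Before the project examples, here is a minimal sketch of the basic pattern they all build on: load a document, fetch its PDDocumentInformation, and read the modification date from the Info dictionary. This sketch is not from any project listed below; the file name is a placeholder, and it assumes the PDFBox 2.x API, where getModificationDate() returns null for a missing or unparseable /ModDate. In PDFBox 1.x the getter instead declares throws IOException, which is why several examples below wrap the call in try/catch.

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

public class ModificationDateExample {
    public static void main(String[] args) throws Exception {
        // "sample.pdf" is a placeholder path
        try (PDDocument document = PDDocument.load(new File("sample.pdf"))) {
            PDDocumentInformation info = document.getDocumentInformation();
            // null when the Info dictionary has no usable /ModDate entry (2.x behavior)
            Calendar modified = info.getModificationDate();
            if (modified != null) {
                System.out.println("Last modified: "
                        + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX").format(modified.getTime()));
            } else {
                System.out.println("No modification date recorded.");
            }
        }
    }
}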
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/**
 * Analyze if the ModifyDate embedded in the Document Information dictionary and
 * in the XMP properties are synchronized.
 *
 * @param dico Document Information Dictionary
 * @param xmp XMP Basic Schema
 * @param ve The list of validation errors
 * @throws ValidationException
 */
protected void analyzeModifyDateProperty(PDDocumentInformation dico, XMPBasicSchema xmp,
        List<ValidationError> ve) throws ValidationException {
    Calendar modifyDate;
    try {
        modifyDate = dico.getModificationDate();
        if (modifyDate != null) {
            if (xmp != null) {
                Calendar xmpModifyDate = xmp.getModifyDateValue();
                if (xmpModifyDate == null) {
                    ve.add(AbsentXMPPropertyError("ModifyDate", "Property is not defined"));
                } else {
                    if (!DateConverter.toISO8601(xmpModifyDate).equals(DateConverter.toISO8601(modifyDate))) {
                        ve.add(unsynchronizedMetaDataError("ModificationDate"));
                    }
                }
            } else {
                ve.add(AbsentSchemaMetaDataError("ModifyDate", "Basic XMP"));
            }
        }
    } catch (IOException e) {
        // If there is an error while converting this property to a date
        throw formatAccessException("Document Information", "ModifyDate", e);
    }
}
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#'
                                                     // as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default

                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive()) t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try { pdfDoc.close(); } catch (final Throwable e) { }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and doesn't delete them.
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM.
    // we try to get this out of memory here by forced clear calls and hope the rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
    return result;
}
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
@SuppressWarnings("deprecation")
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
        }
        if (xmp != null) {
            dcSchema = xmp.getDublinCoreSchema();
        }
    } catch (IOException e) {
        //swallow
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
        throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
    XMPMetadata xmp = null;
    if (dom != null) {
        xmp = new XMPMetadata(dom);
    }
    XMPSchemaDublinCore dcSchema = null;
    /*if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {}
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }*/
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());
    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getCOSObject().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                    info.getCOSObject().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set(PDF.PDFA_VERSION, version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set(PDF.PDF_EXTENSION_VERSION,
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.apache.tika.parser.pdf18.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString()
                                + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file:org.dspace.content.packager.PDFPackager.java
License:BSD License
private void crosswalkPDF(Context context, Item item, InputStream metadata)
        throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    try {
        PDFParser parser = new PDFParser(metadata);
        parser.parse();
        cos = parser.getDocument();

        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         *   Title -> title.null
         *   Author -> contributor.author
         *   CreationDate -> date.created
         *   ModDate -> date.created
         *   Creator -> description.provenance (application that created orig)
         *   Producer -> description.provenance (convertor to pdf)
         *   Subject -> description.abstract
         *   Keywords -> subject.other
         * date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();

        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException(
                    "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        item.addDC("title", null, "en", title);
        String value = docinfo.getAuthor();
        if (value != null) {
            item.addDC("contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }
        value = docinfo.getCreator();
        if (value != null) {
            item.addDC("description", "provenance", "en",
                    "Application that created the original document: " + value);
        }
        value = docinfo.getProducer();
        if (value != null) {
            item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value);
        }
        value = docinfo.getSubject();
        if (value != null) {
            item.addDC("description", "abstract", null, value);
        }
        value = docinfo.getKeywords();
        if (value != null) {
            item.addDC("subject", "other", null, value);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }
        if (calValue != null) {
            item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString());
        }
        item.update();
    } finally {
        if (cos != null) {
            cos.close();
        }
    }
}
From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java
License:Open Source License
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }
                PDDocument pdDocument = PDDocument.load(is);
                Properties props = new Properties();
                try {
                    if (pdDocument.isEncrypted()) {
                        try {
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }
                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();
                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null)
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null)
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getCreators() != null) {
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDates() != null) {
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }
                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (pdf.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }
                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            try {
                                if (basic.getCreateDate() != null)
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (basic.getModifyDate() != null)
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }
                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();
                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null)
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null)
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getModificationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null)
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null)
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    if (pdDocument != null) {
                        pdDocument.close();
                    }
                    if (is != null) {
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });
    } catch (PrivilegedActionException pae) {
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}
From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java
License:Apache License
/**
 * This will add the contents to the lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper)
        throws IOException {
    PDDocument pdfDocument = null;
    try {
        pdfDocument = PDDocument.load(is);

        if (pdfDocument.isEncrypted()) {
            //Just try using the default password and move on
            pdfDocument.decrypt("");
        }

        //create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        if (stripper == null) {
            stripper = new PDFTextStripper();
        } else {
            stripper.resetEngine();
        }
        stripper.writeText(pdfDocument, writer);

        // Note: the buffer to string operation is costless;
        // the char array value of the writer buffer and the content string
        // is shared as long as the buffer content is not modified, which will
        // not occur here.
        String contents = writer.getBuffer().toString();

        // Add the tag-stripped contents as a Reader-valued Text field so it will
        // get tokenized and indexed.
        addField(document, "contents", contents);
        addField(document, "stemmedcontents", contents);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            addField(document, "Author", info.getAuthor());
            try {
                addField(document, "CreationDate", info.getCreationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            addField(document, "Creator", info.getCreator());
            addField(document, "Keywords", info.getKeywords());
            try {
                addField(document, "ModificationDate", info.getModificationDate());
            } catch (IOException io) {
                //ignore, bad date but continue with indexing
            }
            addField(document, "Producer", info.getProducer());
            addField(document, "Subject", info.getSubject());
            addField(document, "Title", info.getTitle());
            addField(document, "Trapped", info.getTrapped());
        }

        int summarySize = Math.min(contents.length(), 500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        addField(document, "summary", summary);
        addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages()));
    } catch (CryptographyException e) {
        throw new IOException("Error decrypting document(" + documentLocation + "): " + e);
    } catch (InvalidPasswordException e) {
        //they didn't supply a password and the default of "" was wrong.
        throw new IOException(
                "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}
From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java
License:Apache License
public boolean check() throws Exception {
    try (PDDocument document = PDDocument.load(in)) {
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDPageable pageable = new PDPageable(document);
        PageFormat firstPage = pageable.getPageFormat(0);

        encrypted = document.isEncrypted();
        pageCount = document.getNumberOfPages();
        orientation = ORIENTATION_STRINGS[firstPage.getOrientation()];
        version = String.valueOf(document.getDocument().getVersion());
        String catalogVersion = catalog.getVersion();
        if (catalogVersion != null && !catalogVersion.isEmpty()) {
            // According to the specs, the version saved here should be determining, instead of
            // the version in the header. It is barely used, though.
            version = catalogVersion;
        }

        if (!encrypted) {
            PDDocumentInformation metadata = document.getDocumentInformation();
            author = metadata.getAuthor();
            creationDate = metadata.getCreationDate();
            creator = metadata.getCreator();
            keywords = metadata.getKeywords();
            modificationDate = metadata.getModificationDate();
            producer = metadata.getProducer();
            subject = metadata.getSubject();
            title = metadata.getTitle();
        }

        // extract all attached files from all pages
        int pageNumber = 0;
        for (Object page : catalog.getAllPages()) {
            pageNumber += 1;
            PdfPageMetadata pageMetadata = new PdfPageMetadata();
            pageMetadata.setPageNumber(pageNumber);
            for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) {
                if (annotation instanceof PDAnnotationFileAttachment) {
                    PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata();
                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                    PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();

                    attachmentMetadata.setSubject(fann.getSubject());
                    attachmentMetadata.setName(fileSpec.getFilename());
                    attachmentMetadata.setCreationDate(embeddedFile.getCreationDate());
                    attachmentMetadata.setModificationDate(embeddedFile.getModDate());
                    attachmentMetadata.setMimeType(embeddedFile.getSubtype());
                    attachmentMetadata.setData(embeddedFile.getByteArray());
                    pageMetadata.addAttachment(attachmentMetadata);
                }
            }
            pages.add(pageMetadata);
        }
        return true;
    }
}