List of usage examples for the method org.apache.pdfbox.pdmodel.PDDocumentInformation#getProducer()
public String getProducer()
From source file:fr.univ_tours.etu.searcher.LucenePDFDocument.java
License:Apache License
/**
 * Extracts the text and the document-information entries of a PDF and adds
 * them as fields to the given Lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document; not referenced inside this method body.
 *
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(is);

        // Lazily create the shared text stripper on first use.
        if (stripper == null) {
            stripper = new PDFTextStripper();
        }

        // Collect the extracted text in memory.
        final StringWriter textSink = new StringWriter();
        stripper.writeText(pdf, textSink);
        final String text = textSink.toString();

        // Reader-valued field: the text is handed over as a Reader so that it
        // gets tokenized and indexed.
        addTextField(document, "contents", new StringReader(text));

        final PDDocumentInformation docInfo = pdf.getDocumentInformation();
        if (docInfo != null) {
            addTextField(document, "Author", docInfo.getAuthor());
            addTextField(document, "CreationDate", docInfo.getCreationDate());
            addTextField(document, "Creator", docInfo.getCreator());
            addTextField(document, "Keywords", docInfo.getKeywords());
            addTextField(document, "ModificationDate", docInfo.getModificationDate());
            addTextField(document, "Producer", docInfo.getProducer());
            addTextField(document, "Subject", docInfo.getSubject());
            addTextField(document, "Title", docInfo.getTitle());
            addTextField(document, "Trapped", docInfo.getTrapped());
        }

        // The summary is the first 1500 characters of the text (or all of it
        // when shorter), added through addUnindexedField so it can be shown
        // with search hits.
        final String summary = text.substring(0, Math.min(text.length(), 1500));
        addUnindexedField(document, "summary", summary);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle()); addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor()); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try {/*from w w w. j a v a 2 s .c o m*/ // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped" }); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } }
From source file:net.padaf.preflight.xmp.SynchronizedMetaDataValidation.java
License:Apache License
/**
 * Checks that the Producer entry of the Document Information dictionary and
 * the Producer property of the XMP PDF schema are synchronized. Nothing is
 * reported when the dictionary has no Producer entry.
 *
 * @param dico
 *          Document Information Dictionary
 * @param pdf
 *          PDF Schema
 * @param ve
 *          The list of validation errors
 */
protected void analyzeProducerProperty(PDDocumentInformation dico, AdobePDFSchema pdf,
        List<ValidationError> ve) {
    final String infoProducer = dico.getProducer();
    if (infoProducer == null) {
        // No Producer in the information dictionary: nothing to compare.
        return;
    }

    if (pdf == null) {
        // The dictionary has a Producer but there is no PDF schema at all.
        ve.add(AbsentSchemaMetaDataError("Producer", "PDF"));
        return;
    }

    if (pdf.getProducer() == null) {
        // The schema exists but does not define the property.
        ve.add(AbsentXMPPropertyError("Producer", "Property is not defined"));
        return;
    }

    if (!pdf.getProducerValue().equals(infoProducer)) {
        ve.add(unsynchronizedMetaDataError("Producer"));
    }
}
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
/**
 * Parses a PDF from the given stream into one or more index documents.
 *
 * The method loads the PDF, tries to open it with an empty password if it is
 * encrypted, reads title/subject/author/publisher/keywords/date from the
 * document information, extracts links and text, and finally closes the PDF
 * and triggers a font cache clean-up.
 *
 * @param location the URL of the PDF; used for error reporting, as title fallback (file name)
 *                 and as base URL of the produced documents
 * @param mimeType passed through to the produced documents
 * @param charset not referenced in this method body ("UTF-8" is used for the produced documents)
 * @param scraper not referenced in this method body
 * @param timezoneOffset not referenced in this method body
 * @param source the stream the PDF is loaded from
 * @return one document per page when {@code individualPages} is set, otherwise a single document;
 *         NOTE(review): remains {@code null} when link/text extraction throws, because that
 *         Throwable is caught and ignored below
 * @throws Parser.Failure if the memory request is denied, the PDF cannot be loaded, or it is
 *                        encrypted and cannot be opened for content extraction
 * @throws InterruptedException declared by the signature; NOTE(review): the only interruptible call
 *                              visible here (t.join) sits inside a catch-all block, so it looks
 *                              like this is never actually propagated - confirm
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser (requests 200 * 1024 * 1024, presumably bytes)
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // load the pdf with lowered thread priority
    PDDocument pdfDoc;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        // note: this sets NORM_PRIORITY, not whatever priority the thread had before
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    // encrypted documents: try to open them with an empty password;
    // on any failure the document is closed before the Failure is thrown
    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        // even if opening succeeded, content extraction must be permitted
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null,
            docKeywordStr = null;
    Date docDate = new Date(); // default; replaced by the modification date when one is available
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        // the producer is used as publisher, falling back to the creator
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
            // unreadable modification date: keep the default date
        }
        // unused: info.getTrapped()
    }
    info = null;

    // title fallbacks: first the unescaped file name of the URL, then the subject
    // (the second fallback only triggers on null, not on an empty string)
    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    // keywords are split at single spaces or commas
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text, one stripper run per page (stripper pages are 1-based)
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                // NOTE(review): the text guard uses 'page > pages.length' while the link guard uses
                // 'page >= pdflinks.length'; with the loop bound above neither guard can trigger,
                // and 'pages' is never null at this point.
                result[page] = new Document(
                        // these are virtual new pages; we cannot combine them with '#' as that would be
                        // removed when computing the urlhash
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '='
                                + (page + 1)),
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                            // best effort: whatever was appended so far is kept
                        }
                    }
                };
                t.start();
                t.join(3000); // wait at most 3000 ms; pdfbox likes to forget to terminate ... (quite often)
                // NOTE(review): interrupt() only requests termination; the document is closed in the
                // finally block below without waiting for the worker to actually stop - confirm this is safe.
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            // merge the per-entry link collections into one set, skipping null entries
            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        // Deliberately ignored (a 'throw new Parser.Failure(e.getMessage(), location)' was disabled here);
        // the document is closed in the finally block.
        // NOTE(review): this also swallows an InterruptedException from t.join without restoring the
        // interrupt flag, and leaves 'result' at whatever value it had (possibly null).
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // according to the original author, pdfbox still allocates an enormous number of objects
    // (COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary, COSStream, COSString,
    // COSName, COSDocument, COSInteger[], COSNull) that are stored statically and never flushed;
    // this can be observed in Java VisualVM. The reference is dropped and a forced clean-up is
    // called here in the hope of getting them out of memory.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}
From source file:org.apache.tika.parser.pdf.EnhancedPDFParser.java
License:Apache License
@SuppressWarnings("deprecation") private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { XMPMetadata xmp = null;/*from w w w. j av a2 s . co m*/ XMPSchemaDublinCore dcSchema = null; try { if (document.getDocumentCatalog().getMetadata() != null) { xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata()); } if (xmp != null) { dcSchema = xmp.getDublinCoreSchema(); } } catch (IOException e) { //swallow } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if 
(!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {/* w ww . 
j ava2 s.co m*/ if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { 
addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); XMPMetadata xmp = null;//from w ww . j av a 2 s . 
c o m if (dom != null) { xmp = new XMPMetadata(dom); } XMPSchemaDublinCore dcSchema = null; /*if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) {} JempboxExtractor.extractXMPMM(xmp, metadata); }*/ PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle()); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor()); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); addMetadata(metadata, 
PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate()); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getCOSObject().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name, info.getCOSObject().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set(PDF.PDFA_VERSION, version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf18.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {/*from w ww. 
ja v a 2s.com*/ if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { 
addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.deepfs.fsml.xdcr.PDFTransducer.java
@Override public String read(final String path) { PDDocument pd = null;//from w w w. j a va 2s . c o m final StringWriter sw = new StringWriter(); final StringBuilder sb = new StringBuilder(128); try { pd = PDDocument.load(path); PDDocumentInformation info = pd.getDocumentInformation(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setEndPage(NO_PAGES); stripper.writeText(pd, sw); sb.append(keyValue(NS + "title", info.getTitle())); sb.append(keyValue(NS + "subject", info.getSubject())); sb.append(keyValue(NS + "creator", info.getCreator())); sb.append(keyValue(NS + "author", info.getAuthor())); sb.append(keyValue(NS + "producer", info.getProducer())); sb.append(keyValue(NS + "date", info.getCreationDate() != null ? new SimpleDateFormat().format(info.getCreationDate().getTime()) : null)); sb.append(keyValue(NS + "content", sw.getBuffer().toString())); sb.append(keyValue(NS + "keywords", info.getKeywords())); } catch (IOException e) { e.printStackTrace(); } catch (IndexOutOfBoundsException oe) { // [MS] thanks apache pdfbox :-) System.err.println(oe.getMessage()); } return sb.toString(); }
From source file:org.dspace.content.packager.PDFPackager.java
License:BSD License
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException { COSDocument cos = null;/* w w w.j a v a2 s .c o m*/ try { PDFParser parser = new PDFParser(metadata); parser.parse(); cos = parser.getDocument(); // sanity check: PDFBox breaks on encrypted documents, so give up. if (cos.getEncryptionDictionary() != null) { throw new MetadataValidationException("This packager cannot accept an encrypted PDF document."); } /* PDF to DC "crosswalk": * * NOTE: This is not in a crosswalk plugin because (a) it isn't * useful anywhere else, and more importantly, (b) the source * data is not XML so it doesn't fit the plugin's interface. * * pattern of crosswalk -- PDF dict entries to DC: * Title -> title.null * Author -> contributor.author * CreationDate -> date.created * ModDate -> date.created * Creator -> description.provenance (application that created orig) * Producer -> description.provenance (convertor to pdf) * Subject -> description.abstract * Keywords -> subject.other * date is java.util.Calendar */ PDDocument pd = new PDDocument(cos); PDDocumentInformation docinfo = pd.getDocumentInformation(); String title = docinfo.getTitle(); // sanity check: item must have a title. 
if (title == null) { throw new MetadataValidationException( "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary."); } if (log.isDebugEnabled()) { log.debug("PDF Info dict title=\"" + title + "\""); } item.addDC("title", null, "en", title); String value = docinfo.getAuthor(); if (value != null) { item.addDC("contributor", "author", null, value); if (log.isDebugEnabled()) { log.debug("PDF Info dict author=\"" + value + "\""); } } value = docinfo.getCreator(); if (value != null) { item.addDC("description", "provenance", "en", "Application that created the original document: " + value); } value = docinfo.getProducer(); if (value != null) { item.addDC("description", "provenance", "en", "Original document converted to PDF by: " + value); } value = docinfo.getSubject(); if (value != null) { item.addDC("description", "abstract", null, value); } value = docinfo.getKeywords(); if (value != null) { item.addDC("subject", "other", null, value); } // Take either CreationDate or ModDate as "date.created", // Too bad there's no place to put "last modified" in the DC. Calendar calValue = docinfo.getCreationDate(); if (calValue == null) { calValue = docinfo.getModificationDate(); } if (calValue != null) { item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString()); } item.update(); } finally { if (cos != null) { cos.close(); } } }