List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#getCurrentAccessPermission()
public AccessPermission getCurrentAccessPermission()
From source file:com.ackpdfbox.app.Decrypt.java
License:Apache License
/**
 * Loads {@code infile}, removes its security settings and saves the result to {@code outfile}.
 *
 * <p>The document is loaded with {@code password}, an optional stream opened on
 * {@code keyStore} and {@code alias}. Security is only removed when the current access
 * permission is the owner permission. A document that is not encrypted is reported on
 * {@code System.err} and nothing is written.
 *
 * @throws IOException if loading or saving fails, or if the document was not opened with
 *         owner permission
 */
private void decrypt() throws IOException {
    PDDocument document = null;
    InputStream keyStoreStream = null;
    try {
        if (keyStore != null) {
            keyStoreStream = new FileInputStream(keyStore);
        }
        document = PDDocument.load(new File(infile), password, keyStoreStream, alias);
        if (document.isEncrypted()) {
            AccessPermission ap = document.getCurrentAccessPermission();
            if (ap.isOwnerPermission()) {
                document.setAllSecurityToBeRemoved(true);
                document.save(outfile);
            } else {
                throw new IOException(
                        "Error: You are only allowed to decrypt a document with the owner password.");
            }
        } else {
            System.err.println("Error: Document is not encrypted.");
        }
    } finally {
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            // The key store stream used to be left open, leaking one file handle per call.
            if (keyStoreStream != null) {
                keyStoreStream.close();
            }
        }
    }
}
From source file:cz.muni.pdfjbim.PdfImageExtractor.java
License:Apache License
/** * @deprecated -- do not use doesn't work properly yet * This method extracts images by going through PDF tree structure * @param pdfFile name of input PDF file * @param prefix /* w w w .ja va 2s . c o m*/ * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet // * @param silent -- if true error messages are not written to output otherwise they are * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password, Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException { if (binarize == null) { binarize = false; } // checking arguments and setting appropriate variables if (pdfFile == null) { throw new IllegalArgumentException("pdfFile must be defined"); } InputStream inputStream = null; if (password != null) { try { log.debug("PDF probably encrypted, trying to decrypt using given password {}", password); ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream(); PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8)); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { try { inputStream = new FileInputStream(pdfFile); } catch (FileNotFoundException ex) { throw new PdfRecompressionException("File wasn't found", ex); } } // if prefix is not set then prefix set to name of pdf without .pdf // if pdfFile has unconsistent name (without suffix .pdf) and 
name longer than 4 chars then last for chars are removed // and this string set as prefix if ((prefix == null) && (pdfFile.length() > 4)) { prefix = pdfFile.substring(0, pdfFile.length() - 4); } PDFParser parser = null; PDDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getPDDocument(); AccessPermission accessPermissions = doc.getCurrentAccessPermission(); if (!accessPermissions.canExtractContent()) { throw new PdfRecompressionException("Error: You do not have permission to extract images."); } // going page by page List pages = doc.getDocumentCatalog().getAllPages(); for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) { if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) { continue; } PDPage page = (PDPage) pages.get(pageNumber); PDResources resources = page.getResources(); Map xobjs = resources.getXObjects(); if (xobjs != null) { Iterator xobjIter = xobjs.entrySet().iterator(); while (xobjIter.hasNext()) { Map.Entry entry = (Map.Entry) xobjIter.next(); String key = (String) entry.getKey(); PDXObject xobj = (PDXObject) entry.getValue(); Map images; if (xobj instanceof PDXObjectForm) { PDXObjectForm xform = (PDXObjectForm) xobj; images = xform.getResources().getImages(); } else { images = resources.getImages(); } // reading images from each page and saving them to file if (images != null) { Iterator imageIter = images.entrySet().iterator(); while (imageIter.hasNext()) { Map.Entry imEntry = (Map.Entry) imageIter.next(); String imKey = (String) imEntry.getKey(); PDXObjectImage image = (PDXObjectImage) imEntry.getValue(); PDStream pdStr = new PDStream(image.getCOSStream()); List<COSName> filters = pdStr.getFilters(); if (image.getBitsPerComponent() > 1 && !binarize) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE)) { log.info("This is LZWDecoded => 
skipping"); continue; } if (filters.contains(COSName.JBIG2_DECODE)) { if (skipJBig2Images) { log.warn("Allready compressed according to JBIG2 standard => skipping"); continue; } else { log.debug("JBIG2 image detected"); } } // detection of unsupported filters by pdfBox library if (filters.contains(COSName.JPX_DECODE)) { log.info("Unsupported filter JPXDecode => skipping"); continue; } COSObject cosObj = new COSObject(image.getCOSObject()); int objectNum = cosObj.getObjectNumber().intValue(); int genNum = cosObj.getGenerationNumber().intValue(); log.debug(objectNum + " " + genNum + " obj"); String name = getUniqueFileName(prefix + imKey, image.getSuffix()); log.debug("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); log.debug(pdfImageInfo.toString()); namesOfImages.add(name + "." + image.getSuffix()); } } } } } } catch (IOException ex) { Tools.deleteFilesFromList(namesOfImages); throw new PdfRecompressionException("Unable to parse PDF document", ex); } catch (RuntimeException ex) { Tools.deleteFilesFromList(namesOfImages); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:cz.muni.pdfjbim.PdfImageProcessor.java
License:Apache License
/** * @deprecated -- do not use doesn't work properly yet * This method extracts images by going through PDF tree structure * @param pdfFile name of input PDF file * @param password password for access to PDF if needed * @param pagesToProcess list of pages which should be processed if null given => processed all pages * -- not working yet//w ww. j a v a 2s . com * @param silent -- if true error messages are not written to output otherwise they are * @param binarize -- enables processing of nonbitonal images as well (LZW is still not * processed because of output with inverted colors) * @throws PdfRecompressionException if problem to extract images from PDF */ public void extractImagesUsingPdfObjectAccess(String pdfFile, String password, Set<Integer> pagesToProcess, Boolean silent, Boolean binarize) throws PdfRecompressionException { if (binarize == null) { binarize = false; } // checking arguments and setting appropriate variables if (pdfFile == null) { throw new IllegalArgumentException(pdfFile); } String prefix = null; InputStream inputStream = null; if (password != null) { try { ByteArrayOutputStream decryptedOutputStream = null; PdfReader reader = new PdfReader(pdfFile, password.getBytes()); PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream); stamper.close(); inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray()); } catch (DocumentException ex) { throw new PdfRecompressionException(ex); } catch (IOException ex) { throw new PdfRecompressionException("Reading file caused exception", ex); } } else { try { inputStream = new FileInputStream(pdfFile); } catch (FileNotFoundException ex) { throw new PdfRecompressionException("File wasn't found", ex); } } // if prefix is not set then prefix set to name of pdf without .pdf // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed // and this string set as prefix if ((prefix == null) && (pdfFile.length() > 4)) { prefix = 
pdfFile.substring(0, pdfFile.length() - 4); } PDFParser parser = null; PDDocument doc = null; try { parser = new PDFParser(inputStream); parser.parse(); doc = parser.getPDDocument(); AccessPermission accessPermissions = doc.getCurrentAccessPermission(); if (!accessPermissions.canExtractContent()) { throw new PdfRecompressionException("Error: You do not have permission to extract images."); } // going page by page List pages = doc.getDocumentCatalog().getAllPages(); for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) { if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) { continue; } PDPage page = (PDPage) pages.get(pageNumber); PDResources resources = page.getResources(); Map xobjs = resources.getXObjects(); if (xobjs != null) { Iterator xobjIter = xobjs.keySet().iterator(); while (xobjIter.hasNext()) { String key = (String) xobjIter.next(); PDXObject xobj = (PDXObject) xobjs.get(key); Map images; if (xobj instanceof PDXObjectForm) { PDXObjectForm xform = (PDXObjectForm) xobj; images = xform.getResources().getImages(); } else { images = resources.getImages(); } // reading images from each page and saving them to file if (images != null) { Iterator imageIter = images.keySet().iterator(); while (imageIter.hasNext()) { String imKey = (String) imageIter.next(); PDXObjectImage image = (PDXObjectImage) images.get(imKey); PDStream pdStr = new PDStream(image.getCOSStream()); List filters = pdStr.getFilters(); if (image.getBitsPerComponent() > 1) { log.info("It is not a bitonal image => skipping"); continue; } // at this moment for preventing bad output (bad coloring) from LZWDecode filter if (filters.contains(COSName.LZW_DECODE.getName())) { log.info("This is LZWDecoded => skipping"); continue; } // detection of unsupported filters by pdfBox library if (filters.contains("JBIG2Decode")) { log.info("Allready compressed according to JBIG2 standard => skipping"); continue; } if (filters.contains("JPXDecode")) { log.info("Unsupported filter 
JPXDecode => skipping"); continue; } COSObject cosObj = new COSObject(image.getCOSObject()); int objectNum = cosObj.getObjectNumber().intValue(); int genNum = cosObj.getGenerationNumber().intValue(); log.debug(objectNum + " " + genNum + " obj"); String name = getUniqueFileName(prefix + imKey, image.getSuffix()); log.debug("Writing image:" + name); image.write2file(name); PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(), image.getHeight(), objectNum, genNum); originalImageInformations.add(pdfImageInfo); log.debug(pdfImageInfo.toString()); namesOfImages.add(name + "." + image.getSuffix()); } } } } } } catch (IOException ex) { throw new PdfRecompressionException("Unable to parse PDF document", ex); } finally { if (doc != null) { try { doc.close(); } catch (IOException ex) { throw new PdfRecompressionException(ex); } } } }
From source file:de.catma.document.source.contenthandler.PDFContentHandler.java
License:Open Source License
public void load(InputStream is) throws IOException { PDDocument document = null; try {// ww w . j a va 2s . co m document = PDDocument.load(is, false); if (document.isEncrypted()) { throw new IOException("can not open pdf document because it is encrypted"); } AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text"); } PDFTextStripper stripper = new PDFTextStripper("UTF-8"); stripper.setForceParsing(false); stripper.setSortByPosition(false); stripper.setShouldSeparateByBeads(true); stripper.setStartPage(1); stripper.setEndPage(Integer.MAX_VALUE); ByteArrayOutputStream os = new ByteArrayOutputStream(); Writer w = new OutputStreamWriter(os); try { stripper.writeText(document, w); } finally { w.close(); } // some pdfs seem to include non valid unicode characters // and this causes problems when converting text to HTML // for GUI delivery and during indexing setContent(os.toString().replaceAll("[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]", "?")); } finally { if (document != null) { document.close(); } } }
From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.ExtractImages.java
License:Apache License
private void extract(String pdfFile, String password) throws IOException { PDDocument document = null; try {//from w w w. ja v a 2 s. com document = PDDocument.load(new File(pdfFile), password); AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract images"); } for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better { PDPage page = document.getPage(i); ImageGraphicsEngine extractor = new ImageGraphicsEngine(page); extractor.run(); } } finally { if (document != null) { document.close(); } } }
From source file:net.yacy.cider.parser.idiom.pdfIdiom.java
License:Open Source License
@Override public Model parse(DataSource source) throws ParserException { // create an empty Model Model model = ModelFactory.createDefaultModel(); Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource();/*from w w w.ja v a2 s.com*/ // open pdf document final PDDocument theDocument; final PDFParser parser; try { parser = new PDFParser(source.getStream()); parser.parse(); theDocument = parser.getPDDocument(); } catch (IOException e) { log.error(e.getMessage(), e); throw new ParserException(e.getMessage(), source.getURI()); } if (theDocument.isEncrypted()) { try { theDocument.openProtection(new StandardDecryptionMaterial("")); } catch (BadSecurityHandlerException e) { throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e); } catch (IOException e) { throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e); } catch (CryptographyException e) { throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e); } final AccessPermission perm = theDocument.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) throw new ParserException("PDF cannot be decrypted", source.getURI()); } // get metadata final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null; if (theDocInfo != null) { docTitle = theDocInfo.getTitle(); docSubject = theDocInfo.getSubject(); docAuthor = theDocInfo.getAuthor(); docKeywordStr = theDocInfo.getKeywords(); } if (docAuthor != null && docAuthor.length() > 0) { resource.addProperty(VCARD.FN, docAuthor); resource.addProperty(DC.creator, docAuthor); } if (docSubject != null && docSubject.length() > 0) { resource.addProperty(DC.subject, docSubject); } if (docTitle != null && docTitle.length() > 0) { resource.addProperty(DC.title, 
docTitle); } String[] docKeywords = null; if (docKeywordStr != null && docKeywordStr.length() > 0) { docKeywords = docKeywordStr.split(" |,"); resource.addProperty(DC.coverage, concat(docKeywords)); } // get the content ByteArrayOutputStream baos = new ByteArrayOutputStream(); Writer writer; try { writer = new OutputStreamWriter(baos, "UTF-8"); } catch (UnsupportedEncodingException e1) { writer = new OutputStreamWriter(baos); } try { final PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(theDocument, writer); theDocument.close(); writer.close(); } catch (IOException e) { if (writer != null) try { writer.close(); } catch (final Exception ex) { } throw new ParserException("PDF content reader", source.getURI(), e); } String content; try { content = new String(baos.toByteArray(), "UTF-8"); } catch (UnsupportedEncodingException e) { content = new String(baos.toByteArray()); } if (content != null && content.length() > 0) { resource.addProperty(CIDER.data_content_text, content); } return model; }
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
@Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);/*ww w . j a va2 s. co m*/ // create a pdf parser PDDocument pdfDoc; //final PDFParser pdfParser; try { Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain pdfDoc = PDDocument.load(source); //PDFParser pdfParser = new PDFParser(source); //pdfParser.parse(); //pdfDoc = pdfParser.getPDDocument(); } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } finally { Thread.currentThread().setPriority(Thread.NORM_PRIORITY); } if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (final BadSecurityHandlerException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location); } catch (final IOException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location); } catch (final CryptographyException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location); } final AccessPermission perm = pdfDoc.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted and cannot be decrypted", location); } } // extracting some metadata PDDocumentInformation info = pdfDoc.getDocumentInformation(); String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, 
docKeywordStr = null; Date docDate = new Date(); if (info != null) { docTitle = info.getTitle(); docSubject = info.getSubject(); docAuthor = info.getAuthor(); docPublisher = info.getProducer(); if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator(); docKeywordStr = info.getKeywords(); try { if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime(); } catch (IOException e) { } // unused: // info.getTrapped()); } info = null; if (docTitle == null || docTitle.isEmpty()) { docTitle = MultiProtocolURL.unescape(location.getFileName()); } if (docTitle == null) { docTitle = docSubject; } String[] docKeywords = null; if (docKeywordStr != null) { docKeywords = docKeywordStr.split(" |,"); } Collection<AnchorURL>[] pdflinks = null; Document[] result = null; try { // get the links pdflinks = extractPdfLinks(pdfDoc); // get the fulltext (either per document or for each page) final PDFTextStripper stripper = new PDFTextStripper("UTF-8"); if (individualPages) { // this is a hack which stores individual pages of the source pdf into individual index documents // the new documents will get a virtual link with a post argument page=X appended to the original url // collect text int pagecount = pdfDoc.getNumberOfPages(); String[] pages = new String[pagecount]; for (int page = 1; page <= pagecount; page++) { stripper.setStartPage(page); stripper.setEndPage(page); pages[page - 1] = stripper.getText(pdfDoc); //System.out.println("PAGE " + page + ": " + pages[page - 1]); } // create individual documents for each page assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length; result = new Document[Math.min(pages.length, pdflinks.length)]; String loc = location.toNormalform(true); for (int page = 0; page < result.length; page++) { result[page] = new Document( new AnchorURL(loc + (loc.indexOf('?') > 0 ? 
'&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false, docDate); } } else { // collect the whole text at once final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); byte[] contentBytes = new byte[0]; stripper.setEndPage(3); // get first 3 pages (always) writer.append(stripper.getText(pdfDoc)); contentBytes = writer.getBytes(); // remember text in case of interrupting thread if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) stripper.setEndPage(Integer.MAX_VALUE); // set to default // we start the pdf parsing in a separate thread to ensure that it can be terminated final PDDocument pdfDocC = pdfDoc; final Thread t = new Thread() { @Override public void run() { Thread.currentThread().setName("pdfParser.getText:" + location); try { writer.append(stripper.getText(pdfDocC)); } catch (final Throwable e) { } } }; t.start(); t.join(3000); // pdfbox likes to forget to terminate ... 
(quite often) if (t.isAlive()) t.interrupt(); } contentBytes = writer.getBytes(); // get final text before closing writer Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>(); for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes, pdflinksCombined, null, null, false, docDate) }; } } catch (final Throwable e) { //close the writer (in finally) //throw new Parser.Failure(e.getMessage(), location); } finally { try { pdfDoc.close(); } catch (final Throwable e) { } } // clear resources in pdfbox. they say that is resolved but it's not. see: // https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-351 // https://issues.apache.org/jira/browse/PDFBOX-441 // the pdfbox still generates enormeous number of object allocations and don't delete these // the following Object are statically stored and never flushed: // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary, // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull // the great number of these objects can easily be seen in Java Visual VM // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out. pdfDoc = null; clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); return result; }
From source file:org.apache.camel.component.pdf.PdfCreationTest.java
License:Apache License
@Test public void testPdfCreationWithEncryption() throws Exception { final String ownerPass = "ownerPass"; final String userPass = "userPass"; final String expectedText = "expectedText"; AccessPermission accessPermission = new AccessPermission(); accessPermission.setCanPrint(false); StandardProtectionPolicy protectionPolicy = new StandardProtectionPolicy(ownerPass, userPass, accessPermission);//from w w w. ja va2s .c om protectionPolicy.setEncryptionKeyLength(128); template.sendBodyAndHeader("direct:start", expectedText, PdfHeaderConstants.PROTECTION_POLICY_HEADER_NAME, protectionPolicy); resultEndpoint.setExpectedMessageCount(1); resultEndpoint.expectedMessagesMatches(new Predicate() { @Override public boolean matches(Exchange exchange) { Object body = exchange.getIn().getBody(); assertThat(body, instanceOf(ByteArrayOutputStream.class)); try { PDDocument doc = PDDocument .load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray())); assertTrue("Expected encrypted document", doc.isEncrypted()); doc.decrypt(userPass); assertFalse("Printing should not be permitted", doc.getCurrentAccessPermission().canPrint()); PDFTextStripper pdfTextStripper = new PDFTextStripper(); String text = pdfTextStripper.getText(doc); assertEquals(1, doc.getNumberOfPages()); assertThat(text, containsString(expectedText)); } catch (Exception e) { throw new RuntimeException(e); } return true; } }); resultEndpoint.assertIsSatisfied(); }
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {/* w w w.j ava2 s . 
c om*/ if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, 
name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context); XMPMetadata xmp = null;/*w w w. 
ja v a 2 s .c o m*/ if (dom != null) { xmp = new XMPMetadata(dom); } XMPSchemaDublinCore dcSchema = null; /*if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) {} JempboxExtractor.extractXMPMM(xmp, metadata); }*/ PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle()); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor()); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject()); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped()); // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); addMetadata(metadata, 
PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate()); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getCOSObject().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name, info.getCOSObject().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set(PDF.PDFA_VERSION, version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }