Usage examples for org.apache.pdfbox.pdmodel.PDDocument#getCurrentAccessPermission()
public AccessPermission getCurrentAccessPermission()
From source file:org.apache.tika.parser.pdf18.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException { //first extract AccessPermissions AccessPermission ap = document.getCurrentAccessPermission(); metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility())); metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent())); metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument())); metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm())); metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify())); metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations())); metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint())); metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded())); //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; XMPSchemaMediaManagement mmSchema = null; try {/*from w w w . j a v a 2s . 
c o m*/ if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } } catch (IOException e) { } if (xmp != null) { try { dcSchema = xmp.getDublinCoreSchema(); } catch (IOException e) { } JempboxExtractor.extractXMPMM(xmp, metadata); } PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); addMetadata(metadata, "producer", info.getProducer()); extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); try { // TODO Remove these in Tika 2.0 addMetadata(metadata, "created", info.getCreationDate()); addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); } catch (IOException e) { // Invalid date format, just ignore } try { Calendar modified = info.getModificationDate(); addMetadata(metadata, Metadata.LAST_MODIFIED, modified); addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); } catch (IOException e) { // Invalid date format, just ignore } // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { 
addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } //try to get the various versions //Caveats: // there is currently a fair amount of redundancy // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion())); try { if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); metadata.set("pdfa:PDFVersion", version); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. 
//Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); } } } }
From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }//from w w w. j a v a2 s . c o m synchronized (pdfBoxLockObj) { PDDocument document = null; try { document = PDDocument.load(in, null, force); if (document.isEncrypted() && params != null) { String password = params.get(ExtractData.PDF_PASSWORD); if (password == null) { password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); } if (password != null) { final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password); document.openProtection(sdm); final AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text."); } } } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(encoding); stripper.setForceParsing(force); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); final Thread task = new Thread(() -> { try { stripper.writeText(doc, output); } catch (final Exception e) { exceptionSet.add(e); } finally { done.set(true); } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } finally { if (document != null) { try { document.close(); } catch (final 
IOException e) { // NOP } } } } }
From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java
License:Apache License
@Override public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { PDDocument document = null; File f = null;//from w w w . j a va 2 s . c o m OutputStream fas = null; try { document = PDDocument.load(blobHolder.getBlob().getStream()); // NXP-1556: if document is protected an IOException will be raised // Instead of catching the exception based on its message string // lets avoid sending messages that will generate this error // code taken from PDFTextStripper.writeText source. // only care about standard encryption and if it was decrypted with // the user password AccessPermission permission = document.getCurrentAccessPermission(); if (permission.canExtractContent()) { PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper(); // use the position information to heuristically organize the // extracted paragraphs. This is also important for // right-to-left languages. textStripper.setSortByPosition(true); String text = textStripper.getText(document); // replace non breaking space by regular spaces (why?) // text = text.replace("\u00a0", " "); f = Framework.createTempFile("pdfboplugin", ".txt"); fas = new FileOutputStream(f); fas.write(text.getBytes("UTF-8")); try (FileInputStream is = new FileInputStream(f)) { Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8"); return new SimpleCachableBlobHolder(blob); } } else { return new SimpleCachableBlobHolder(Blobs.createBlob("")); } } catch (IOException e) { throw new ConversionException("Error during text extraction with PDFBox", e); } finally { if (document != null) { try { document.close(); } catch (IOException e) { log.error("Error while closing PDFBox document", e); } } if (fas != null) { try { fas.close(); } catch (IOException e) { log.error(e); } } if (f != null) { f.delete(); } } }
From source file:org.nuxeo.pdf.test.PDFEncryptionTest.java
License:Open Source License
protected void checkIsReadOnly(Blob inBlob, String ownerPwd, String userPwd) throws Exception { assertNotNull(inBlob);// w w w .ja v a2 s . c o m PDDocument pdfDoc = utils.loadAndTrack(inBlob); assertTrue(pdfDoc.isEncrypted()); // Decrypt as user pdfDoc.openProtection(new StandardDecryptionMaterial(userPwd)); assertFalse(pdfDoc.isEncrypted()); AccessPermission ap = pdfDoc.getCurrentAccessPermission(); assertTrue(ap.canExtractContent()); assertTrue(ap.canExtractForAccessibility()); assertTrue(ap.canPrint()); assertTrue(ap.canPrintDegraded()); assertFalse(ap.canAssembleDocument()); assertFalse(ap.canFillInForm()); assertFalse(ap.canModifyAnnotations()); // Decrypt as owner utils.closeAndUntrack(pdfDoc); pdfDoc = utils.loadAndTrack(inBlob); pdfDoc.openProtection(new StandardDecryptionMaterial(ownerPwd)); assertFalse(pdfDoc.isEncrypted()); ap = pdfDoc.getCurrentAccessPermission(); assertTrue(ap.isOwnerPermission()); utils.closeAndUntrack(pdfDoc); }
From source file:org.paxle.parser.pdf.impl.PdfParser.java
License:Open Source License
public IParserDocument parse(URI location, String charset, InputStream fileIn) throws ParserException, UnsupportedEncodingException, IOException { IParserDocument parserDoc = null;// ww w. java 2 s.c o m PDDocument pddDoc = null; try { final IParserContext pc = this.contextLocal.getCurrentContext(); final ICommandProfile cmdProfile = pc.getCommandProfile(); // create an empty document parserDoc = pc.createDocument(); // parse it final PDFParser parser = new PDFParser(fileIn); parser.parse(); pddDoc = parser.getPDDocument(); // check document encryption if (pddDoc.isEncrypted()) { if (this.logger.isDebugEnabled()) { this.logger.debug(String.format("Document '%s' is encrypted.", location)); } // determine the decryption password String pwd = ""; if (cmdProfile != null) { String tmp = (String) cmdProfile.getProperty("org.paxle.parser.pdf.impl.decryptionPassword"); if (tmp != null) pwd = tmp; } // try to open document with the given password try { final StandardDecryptionMaterial dm = new StandardDecryptionMaterial(pwd); pddDoc.openProtection(dm); final AccessPermission accessPermission = pddDoc.getCurrentAccessPermission(); if (accessPermission == null || !accessPermission.canExtractContent()) { if (this.logger.isInfoEnabled()) { this.logger.debug( String.format("No permission to extract content of document '%s'.", location)); } parserDoc.setStatus(IParserDocument.Status.FAILURE, "PDF Document is encrypted."); return parserDoc; } } catch (Throwable e) { this.logger.error(String.format("Unable to decrypt document '%s'.", location), e); parserDoc.setStatus(IParserDocument.Status.FAILURE, String .format("Unable to decrypt document. %s: %s", e.getClass().getName(), e.getMessage())); return parserDoc; } } // extract metadata this.extractMetaData(parserDoc, pddDoc); // extract text final PDFTextStripper stripper = new PDFTextStripper(); // XXX: we could limit the amount of parsed pages via crawling-profile properties? 
// stripper.setStartPage(startPageValue); // stripper.setEndPage(endPageValue); final Writer pdocWriter = parserDoc.getTextWriter(); stripper.writeText(pddDoc, pdocWriter); pdocWriter.flush(); // extracting URIs this.extractURLs(parserDoc, pddDoc); // extracting embedded files this.extractEmbeddedFiles(location, parserDoc, pddDoc); parserDoc.setStatus(IParserDocument.Status.OK); return parserDoc; } catch (Throwable e) { throw new ParserException("Error parsing pdf document. " + e.getMessage(), e); } finally { if (pddDoc != null) try { pddDoc.close(); } catch (Exception e) { this.logger.error(e); } } }
From source file:org.pdfsam.pdfbox.component.PDDocumentAccessPermission.java
License:Open Source License
/**
 * Creates the component from the current access permission of the given document.
 *
 * @param document the document whose current access permission is stored
 */
PDDocumentAccessPermission(PDDocument document) {
    permissions = document.getCurrentAccessPermission();
}
From source file:org.pdfsam.pdfbox.component.PDDocumentAccessPermissionTest.java
License:Open Source License
/**
 * Builds the object under test around a mocked document that hands out a mocked
 * access permission.
 */
@Before
public void setUp() {
    permission = mock(AccessPermission.class);
    PDDocument mockedDocument = mock(PDDocument.class);
    when(mockedDocument.getCurrentAccessPermission()).thenReturn(permission);
    victim = new PDDocumentAccessPermission(mockedDocument);
}
From source file:org.pennyledger.docstore.parser.impl.PDFImageExtractor.java
License:Apache License
/** * Entry point for the application./*from w w w . jav a 2 s . com*/ * * @param args * The command-line arguments. * @throws IOException * if there is an error reading the file or extracting the images. */ // public static void main(String[] args) throws IOException { // // suppress the Dock icon on OS X // System.setProperty("apple.awt.UIElement", "true"); // // IImageParser imageParser = new TesseractImageOCR(); // PDFImageExtractor extractor = new PDFImageExtractor(imageParser, 150); // extractor.run(args); // } // private void run(String[] args) throws IOException { // //String pdfFile = args[0]; // String pdfFile = "c:/PennyLedger/1d51-9fe1b211e8039458b2ac4dbbfbf1.pdf"; // if (pdfFile.length() <= 4) { // throw new IllegalArgumentException("Invalid file name: not PDF"); // } // String password = ""; // Path pdfPath = Paths.get(pdfFile); // PDDocument document = PDDocument.load(pdfPath.toFile(), password); // IDocumentContents docContents = new DocumentContents(); // extract(document, id, pdfPath, docContents); // } IDocumentContents extract(PDDocument document, String id, IDocumentContents docContents) throws IOException { AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract images"); } for (int i = 0; i < document.getNumberOfPages(); i++) { PDPage page = document.getPage(i); ImageGraphicsEngine extractor = new ImageGraphicsEngine(page, i, id); extractor.run(); IDocumentContents pageContents = extractor.getPageContents(); docContents = docContents.merge(pageContents); } return docContents; }
From source file:org.seasar.robot.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }/*from www . java 2 s . c om*/ synchronized (pdfBoxLockObj) { PDDocument document = null; try { document = PDDocument.load(in, null, force); if (document.isEncrypted() && params != null) { String password = params.get(ExtractData.PDF_PASSWORD); if (password == null) { password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); } if (password != null) { final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password); document.openProtection(sdm); final AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text."); } } } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(encoding); stripper.setForceParsing(force); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); Thread task = new Thread(new Runnable() { @Override public void run() { try { stripper.writeText(doc, output); } catch (Exception e) { exceptionSet.add(e); } finally { done.set(true); } } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } finally { if (document != null) { try { 
document.close(); } catch (final IOException e) { // NOP } } } } }
From source file:org.sejda.impl.pdfbox.component.PDDocumentAccessPermission.java
License:Apache License
/**
 * Creates the component from the current access permission of the given document.
 *
 * @param document the document whose current access permission is stored; must not be {@code null}
 * @throws IllegalArgumentException if {@code document} is {@code null}
 */
PDDocumentAccessPermission(PDDocument document) {
    if (null == document) {
        throw new IllegalArgumentException("Unable to get permissions from null instance.");
    }
    permissions = document.getCurrentAccessPermission();
}