Example usage for org.apache.pdfbox.pdmodel PDDocument getCurrentAccessPermission

List of usage examples for org.apache.pdfbox.pdmodel PDDocument getCurrentAccessPermission

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.PDDocument.getCurrentAccessPermission().

Prototype

public AccessPermission getCurrentAccessPermission() 

Source Link

Document

Returns the access permissions granted when the document was decrypted.

Usage

From source file:org.apache.tika.parser.pdf18.PDFParser.java

License:Apache License

/**
 * Populates {@code metadata} with the document-level information of a PDF.
 * <p>
 * In order, this records the current access permissions, the Dublin Core
 * schema of the XMP packet (the packet is also handed to
 * {@code JempboxExtractor.extractXMPMM}), the document information dictionary
 * (standard and custom entries), the PDF header version, the PDF/A
 * identification found in the XMP, and the {@code /Extensions} entries of the
 * document catalog.
 *
 * @param document the PDF to read from
 * @param metadata the metadata object that receives the extracted values
 * @throws TikaException declared by the signature; presumably raised by the
 *         helper methods called here, it is not thrown directly by this body
 */
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException ignored) {
        // Best effort: an unreadable XMP packet is skipped; xmp stays null and
        // the document information dictionary below is still processed.
    }

    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException ignored) {
            // Best effort: without a Dublin Core schema the helpers below
            // receive a null dcSchema.
        }

        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException ignored) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException ignored) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    String headerVersion = Float.toString(document.getDocument().getVersion());
    metadata.set("pdf:PDFVersion", headerVersion);
    metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + headerVersion);

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                Integer part = pdfaxmp.getPart();
                if (part != null) {
                    metadata.set("pdfaid:part", Integer.toString(part));
                }
                String conformance = pdfaxmp.getConformance();
                if (conformance != null) {
                    metadata.set("pdfaid:conformance", conformance);
                    // Only build the combined PDF/A version when the part is known;
                    // otherwise the value would read "A-null<conformance>".
                    if (part != null) {
                        String version = "A-" + part + conformance.toLowerCase(Locale.ROOT);
                        metadata.set("pdfa:PDFVersion", version);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                    }
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    // A malformed catalog may store something other than a dictionary under
    // /Extensions; check the type instead of casting blindly.
    Object extensionsEntry = root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensionsEntry instanceof COSDictionary) {
        COSDictionary extensions = (COSDictionary) extensionsEntry;
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                Object adobeEntry = extensions.getDictionaryObject(extName);
                if (adobeEntry instanceof COSDictionary) {
                    COSDictionary adobeExt = (COSDictionary) adobeEntry;
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}

From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text of a PDF read from {@code in}.
 * <p>
 * If the document is encrypted and {@code params} is given, a password is
 * taken from {@code ExtractData.PDF_PASSWORD} or looked up through
 * {@code getPassword}; when one is found the document is decrypted and the
 * extract-content permission is checked. Text stripping runs on a daemon
 * thread that is joined for at most {@code timeout}; if it has not finished
 * by then it is interrupted repeatedly and the extraction fails.
 *
 * @param in     the PDF content; must not be {@code null}
 * @param params optional extraction parameters (password, URL, resource name)
 * @return the extracted text, passed through {@code extractMetadata}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException if loading, decrypting or stripping fails, the
 *         worker does not finish in time, or the calling thread is interrupted
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new RobotSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        PDDocument document = null;
        try {
            document = PDDocument.load(in, null, force);
            if (document.isEncrypted() && params != null) {
                String password = params.get(ExtractData.PDF_PASSWORD);
                if (password == null) {
                    password = getPassword(params.get(ExtractData.URL),
                            params.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
                }
                if (password != null) {
                    final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
                    document.openProtection(sdm);
                    final AccessPermission ap = document.getCurrentAccessPermission();

                    if (!ap.canExtractContent()) {
                        throw new IOException("You do not have permission to extract text.");
                    }
                }
            }

            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper(encoding);
            stripper.setForceParsing(force);
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            final Set<Exception> exceptionSet = new HashSet<>();
            // Strip on a separate thread so the caller can give up after the timeout.
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(doc, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join takes milliseconds while the message
            // below says "sec." - confirm the unit of timeout.
            task.join(timeout);
            if (!done.get()) {
                // Keep interrupting for up to ~5 s (100 x 50 ms) to make the worker stop.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final InterruptedException e) {
            // join()/sleep() cleared the interrupt flag; restore it so code
            // further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final ExtractException e) {
            // Already the exception type callers expect (e.g. the timeout
            // above); do not wrap it in a second ExtractException.
            throw e;
        } catch (final Exception e) {
            throw new ExtractException(e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (final IOException e) {
                    // NOP
                }
            }
        }
    }
}

From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PDF2TextConverter.java

License:Apache License

/**
 * Converts the PDF held by {@code blobHolder} into a plain-text blob.
 * <p>
 * When the document's current access permission does not allow content
 * extraction, an empty blob is returned instead of attempting the
 * extraction.
 *
 * @param blobHolder holder whose blob is the PDF to convert
 * @param parameters conversion parameters; not read by this method
 * @return a holder with the extracted text as {@code text/plain} in UTF-8,
 *         or with an empty blob when extraction is not permitted
 * @throws ConversionException if reading the PDF or writing the text fails
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
        throws ConversionException {

    PDDocument document = null;
    File f = null;
    try {
        document = PDDocument.load(blobHolder.getBlob().getStream());
        // NXP-1556: if document is protected an IOException will be raised
        // Instead of catching the exception based on its message string
        // lets avoid sending messages that will generate this error
        // code taken from PDFTextStripper.writeText source.
        // only care about standard encryption and if it was decrypted with
        // the user password
        AccessPermission permission = document.getCurrentAccessPermission();
        if (permission.canExtractContent()) {
            PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();

            // use the position information to heuristically organize the
            // extracted paragraphs. This is also important for
            // right-to-left languages.
            textStripper.setSortByPosition(true);

            String text = textStripper.getText(document);
            // replace non breaking space by regular spaces (why?)
            // text = text.replace("\u00a0", " ");
            f = Framework.createTempFile("pdfboplugin", ".txt");
            // Close the output stream before the file is read back, and let a
            // failed close surface as a conversion error instead of a log line.
            try (OutputStream fas = new FileOutputStream(f)) {
                fas.write(text.getBytes("UTF-8"));
            }
            try (FileInputStream is = new FileInputStream(f)) {
                // The temp file is deleted in the finally block below, so the
                // blob presumably copies the stream content - TODO confirm.
                Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
                return new SimpleCachableBlobHolder(blob);
            }
        } else {
            return new SimpleCachableBlobHolder(Blobs.createBlob(""));
        }
    } catch (IOException e) {
        throw new ConversionException("Error during text extraction with PDFBox", e);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                log.error("Error while closing PDFBox document", e);
            }
        }
        if (f != null) {
            f.delete();
        }
    }
}

From source file:org.nuxeo.pdf.test.PDFEncryptionTest.java

License:Open Source License

/**
 * Asserts that an encrypted PDF blob is read-only for the user password and
 * grants owner permission for the owner password.
 * <p>
 * With the user password, extracting content (also for accessibility) and
 * printing (also degraded) must be allowed, while assembling the document,
 * filling in forms and modifying annotations must not.
 *
 * @param inBlob   the encrypted PDF; must not be {@code null}
 * @param ownerPwd the owner password
 * @param userPwd  the user password
 * @throws Exception if loading or decrypting the document fails
 */
protected void checkIsReadOnly(Blob inBlob, String ownerPwd, String userPwd) throws Exception {

    assertNotNull(inBlob);

    // --- Open with the user password ---
    PDDocument userDoc = utils.loadAndTrack(inBlob);
    assertTrue(userDoc.isEncrypted());

    StandardDecryptionMaterial userMaterial = new StandardDecryptionMaterial(userPwd);
    userDoc.openProtection(userMaterial);
    assertFalse(userDoc.isEncrypted());

    AccessPermission userPermission = userDoc.getCurrentAccessPermission();
    // Allowed operations
    assertTrue(userPermission.canExtractContent());
    assertTrue(userPermission.canExtractForAccessibility());
    assertTrue(userPermission.canPrint());
    assertTrue(userPermission.canPrintDegraded());
    // Denied operations
    assertFalse(userPermission.canAssembleDocument());
    assertFalse(userPermission.canFillInForm());
    assertFalse(userPermission.canModifyAnnotations());

    utils.closeAndUntrack(userDoc);

    // --- Open a fresh copy with the owner password ---
    PDDocument ownerDoc = utils.loadAndTrack(inBlob);
    StandardDecryptionMaterial ownerMaterial = new StandardDecryptionMaterial(ownerPwd);
    ownerDoc.openProtection(ownerMaterial);
    assertFalse(ownerDoc.isEncrypted());

    AccessPermission ownerPermission = ownerDoc.getCurrentAccessPermission();
    assertTrue(ownerPermission.isOwnerPermission());

    utils.closeAndUntrack(ownerDoc);

}

From source file:org.paxle.parser.pdf.impl.PdfParser.java

License:Open Source License

/**
 * Parses a PDF stream into a parser document: metadata, text, URLs and
 * embedded files.
 * <p>
 * Encrypted documents are opened with the password found in the command
 * profile property {@code org.paxle.parser.pdf.impl.decryptionPassword}, or
 * with an empty password when none is configured. If decryption fails or the
 * resulting permissions do not allow content extraction, the returned
 * document carries the status {@code FAILURE} and nothing is extracted.
 *
 * @param location the location of the document; used for log messages and
 *                 passed to the embedded-file extraction
 * @param charset  not read by this method
 * @param fileIn   the PDF content
 * @return the parser document, with status {@code OK} or {@code FAILURE}
 * @throws ParserException wraps any {@link Throwable} raised outside the
 *         decryption step; the other declared exceptions are therefore not
 *         thrown directly by this body
 */
public IParserDocument parse(URI location, String charset, InputStream fileIn)
        throws ParserException, UnsupportedEncodingException, IOException {
    IParserDocument parserDoc = null;
    PDDocument pddDoc = null;

    try {
        final IParserContext pc = this.contextLocal.getCurrentContext();
        final ICommandProfile cmdProfile = pc.getCommandProfile();

        // create an empty document         
        parserDoc = pc.createDocument();

        // parse it
        final PDFParser parser = new PDFParser(fileIn);
        parser.parse();
        pddDoc = parser.getPDDocument();

        // check document encryption
        if (pddDoc.isEncrypted()) {
            if (this.logger.isDebugEnabled()) {
                this.logger.debug(String.format("Document '%s' is encrypted.", location));
            }

            // determine the decryption password
            String pwd = "";
            if (cmdProfile != null) {
                String tmp = (String) cmdProfile.getProperty("org.paxle.parser.pdf.impl.decryptionPassword");
                if (tmp != null)
                    pwd = tmp;
            }

            // try to open document with the given password
            try {
                final StandardDecryptionMaterial dm = new StandardDecryptionMaterial(pwd);
                pddDoc.openProtection(dm);
                final AccessPermission accessPermission = pddDoc.getCurrentAccessPermission();

                if (accessPermission == null || !accessPermission.canExtractContent()) {
                    // The guard and the call now use the same level; previously the
                    // guard checked INFO while the message was emitted at DEBUG.
                    if (this.logger.isInfoEnabled()) {
                        this.logger.info(
                                String.format("No permission to extract content of document '%s'.", location));
                    }
                    parserDoc.setStatus(IParserDocument.Status.FAILURE, "PDF Document is encrypted.");
                    return parserDoc;
                }
            } catch (Throwable e) {
                this.logger.error(String.format("Unable to decrypt document '%s'.", location), e);
                parserDoc.setStatus(IParserDocument.Status.FAILURE, String
                        .format("Unable to decrypt document. %s: %s", e.getClass().getName(), e.getMessage()));
                return parserDoc;
            }
        }

        // extract metadata
        this.extractMetaData(parserDoc, pddDoc);

        // extract text
        final PDFTextStripper stripper = new PDFTextStripper();

        // XXX: we could limit the amount of parsed pages via crawling-profile properties?
        // stripper.setStartPage(startPageValue);
        // stripper.setEndPage(endPageValue);

        final Writer pdocWriter = parserDoc.getTextWriter();
        stripper.writeText(pddDoc, pdocWriter);
        pdocWriter.flush();

        // extracting URIs
        this.extractURLs(parserDoc, pddDoc);

        // extracting embedded files
        this.extractEmbeddedFiles(location, parserDoc, pddDoc);

        parserDoc.setStatus(IParserDocument.Status.OK);
        return parserDoc;
    } catch (Throwable e) {
        throw new ParserException("Error parsing pdf document. " + e.getMessage(), e);
    } finally {
        // Always release the PDFBox document, including on the early returns above.
        if (pddDoc != null)
            try {
                pddDoc.close();
            } catch (Exception e) {
                this.logger.error(e);
            }
    }
}

From source file:org.pdfsam.pdfbox.component.PDDocumentAccessPermission.java

License:Open Source License

/**
 * Wraps the current access permission of the given document.
 *
 * @param document the document whose permissions are read; must not be {@code null}
 * @throws IllegalArgumentException if {@code document} is {@code null}
 */
PDDocumentAccessPermission(PDDocument document) {
    // Fail fast with a descriptive message rather than a bare NullPointerException.
    if (document == null) {
        throw new IllegalArgumentException("Unable to get permissions from a null document.");
    }
    this.permissions = document.getCurrentAccessPermission();
}

From source file:org.pdfsam.pdfbox.component.PDDocumentAccessPermissionTest.java

License:Open Source License

/**
 * Builds the object under test around a mocked document whose
 * {@code getCurrentAccessPermission()} returns the mocked permission.
 */
@Before
public void setUp() {
    permission = mock(AccessPermission.class);
    PDDocument mockedDocument = mock(PDDocument.class);
    when(mockedDocument.getCurrentAccessPermission()).thenReturn(permission);
    victim = new PDDocumentAccessPermission(mockedDocument);
}

From source file:org.pennyledger.docstore.parser.impl.PDFImageExtractor.java

License:Apache License

/**
 * Entry point for the application.
 *
 * @param args
 *          The command-line arguments.
 * @throws IOException
 *           if there is an error reading the file or extracting the images.
 */
//  public static void main(String[] args) throws IOException {
//    // suppress the Dock icon on OS X
//    System.setProperty("apple.awt.UIElement", "true");
//
//    IImageParser imageParser = new TesseractImageOCR();
//    PDFImageExtractor extractor = new PDFImageExtractor(imageParser, 150);
//    extractor.run(args);
//  }

//  private void run(String[] args) throws IOException {
//    //String pdfFile = args[0];
//    String pdfFile = "c:/PennyLedger/1d51-9fe1b211e8039458b2ac4dbbfbf1.pdf";
//    if (pdfFile.length() <= 4) {
//      throw new IllegalArgumentException("Invalid file name: not PDF");
//    }
//    String password = "";
//    Path pdfPath = Paths.get(pdfFile);
//    PDDocument document = PDDocument.load(pdfPath.toFile(), password);
//    IDocumentContents docContents = new DocumentContents();
//    extract(document, id, pdfPath, docContents);
//  }

/**
 * Runs an {@code ImageGraphicsEngine} over every page of the document and
 * merges each page's contents into the given document contents.
 *
 * @param document    the PDF to process
 * @param id          identifier passed to each page's graphics engine
 * @param docContents the contents to merge the page results into
 * @return the result of merging all page contents, in page order
 * @throws IOException if the document does not permit content extraction,
 *         or if processing a page fails
 */
IDocumentContents extract(PDDocument document, String id, IDocumentContents docContents) throws IOException {
    AccessPermission permission = document.getCurrentAccessPermission();
    boolean extractionAllowed = permission.canExtractContent();
    if (!extractionAllowed) {
        throw new IOException("You do not have permission to extract images");
    }

    IDocumentContents merged = docContents;
    int pageIndex = 0;
    while (pageIndex < document.getNumberOfPages()) {
        PDPage currentPage = document.getPage(pageIndex);
        ImageGraphicsEngine engine = new ImageGraphicsEngine(currentPage, pageIndex, id);
        engine.run();
        merged = merged.merge(engine.getPageContents());
        pageIndex++;
    }
    return merged;
}

From source file:org.seasar.robot.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text of a PDF read from {@code in}.
 * <p>
 * If the document is encrypted and {@code params} is given, a password is
 * taken from {@code ExtractData.PDF_PASSWORD} or looked up through
 * {@code getPassword}; when one is found the document is decrypted and the
 * extract-content permission is checked. Text stripping runs on a daemon
 * thread that is joined for at most {@code timeout}; if it has not finished
 * by then it is interrupted repeatedly and the extraction fails.
 *
 * @param in     the PDF content; must not be {@code null}
 * @param params optional extraction parameters (password, URL, resource name)
 * @return the extracted text, passed through {@code extractMetadata}
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException if loading, decrypting or stripping fails, the
 *         worker does not finish in time, or the calling thread is interrupted
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new RobotSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        PDDocument document = null;
        try {
            document = PDDocument.load(in, null, force);
            if (document.isEncrypted() && params != null) {
                String password = params.get(ExtractData.PDF_PASSWORD);
                if (password == null) {
                    password = getPassword(params.get(ExtractData.URL),
                            params.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
                }
                if (password != null) {
                    final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
                    document.openProtection(sdm);
                    final AccessPermission ap = document.getCurrentAccessPermission();

                    if (!ap.canExtractContent()) {
                        throw new IOException("You do not have permission to extract text.");
                    }
                }
            }

            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper(encoding);
            stripper.setForceParsing(force);
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            final Set<Exception> exceptionSet = new HashSet<>();
            // Strip on a separate thread so the caller can give up after the timeout.
            Thread task = new Thread(new Runnable() {
                @Override
                public void run() {
                    try {
                        stripper.writeText(doc, output);
                    } catch (Exception e) {
                        exceptionSet.add(e);
                    } finally {
                        done.set(true);
                    }
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join takes milliseconds while the message
            // below says "sec." - confirm the unit of timeout.
            task.join(timeout);
            if (!done.get()) {
                // Keep interrupting for up to ~5 s (100 x 50 ms) to make the worker stop.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final InterruptedException e) {
            // join()/sleep() cleared the interrupt flag; restore it so code
            // further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final ExtractException e) {
            // Already the exception type callers expect (e.g. the timeout
            // above); do not wrap it in a second ExtractException.
            throw e;
        } catch (final Exception e) {
            throw new ExtractException(e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (final IOException e) {
                    // NOP
                }
            }
        }
    }
}

From source file:org.sejda.impl.pdfbox.component.PDDocumentAccessPermission.java

License:Apache License

/**
 * Wraps the current access permission of the given document.
 *
 * @param document the document whose permissions are read; must not be {@code null}
 * @throws IllegalArgumentException if {@code document} is {@code null}
 */
PDDocumentAccessPermission(PDDocument document) {
    if (document != null) {
        this.permissions = document.getCurrentAccessPermission();
    } else {
        throw new IllegalArgumentException("Unable to get permissions from null instance.");
    }
}