Example usage for org.apache.pdfbox.pdmodel PDDocument getCurrentAccessPermission

Introduction

In this page you can find the example usage for org.apache.pdfbox.pdmodel PDDocument getCurrentAccessPermission.

Prototype

public AccessPermission getCurrentAccessPermission()

Source Link

Document

Returns the access permissions granted when the document was decrypted.

Usage

From source file:com.ackpdfbox.app.Decrypt.java

License:Apache License

private void decrypt() throws IOException {
    PDDocument document = null;
    try {/*w  w w  .  j a  v a  2s.  c  o m*/
        InputStream keyStoreStream = null;
        if (keyStore != null) {
            keyStoreStream = new FileInputStream(keyStore);
        }

        document = PDDocument.load(new File(infile), password, keyStoreStream, alias);

        if (document.isEncrypted()) {
            AccessPermission ap = document.getCurrentAccessPermission();
            if (ap.isOwnerPermission()) {
                document.setAllSecurityToBeRemoved(true);
                document.save(outfile);
            } else {
                throw new IOException(
                        "Error: You are only allowed to decrypt a document with the owner password.");
            }
        } else {
            System.err.println("Error: Document is not encrypted.");
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * @deprecated -- do not use doesn't work properly yet
 * This method extracts images by going through PDF tree structure
 * @param pdfFile name of input PDF file
 * @param prefix /*  w  w w  .ja va 2s  .  c  o  m*/
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet
//    * @param silent -- if true error messages are not written to output otherwise they are
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException("pdfFile must be defined");
    }

    InputStream inputStream = null;
    if (password != null) {
        try {
            log.debug("PDF probably encrypted, trying to decrypt using given password {}", password);
            ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream();
            PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.entrySet().iterator();
                while (xobjIter.hasNext()) {
                    Map.Entry entry = (Map.Entry) xobjIter.next();
                    String key = (String) entry.getKey();
                    PDXObject xobj = (PDXObject) entry.getValue();
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.entrySet().iterator();
                        while (imageIter.hasNext()) {
                            Map.Entry imEntry = (Map.Entry) imageIter.next();
                            String imKey = (String) imEntry.getKey();
                            PDXObjectImage image = (PDXObjectImage) imEntry.getValue();

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List<COSName> filters = pdStr.getFilters();

                            if (image.getBitsPerComponent() > 1 && !binarize) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (filters.contains(COSName.LZW_DECODE)) {
                                log.info("This is LZWDecoded => skipping");
                                continue;

                            }

                            if (filters.contains(COSName.JBIG2_DECODE)) {
                                if (skipJBig2Images) {
                                    log.warn("Allready compressed according to JBIG2 standard => skipping");
                                    continue;
                                } else {
                                    log.debug("JBIG2 image detected");
                                }
                            }

                            // detection of unsupported filters by pdfBox library
                            if (filters.contains(COSName.JPX_DECODE)) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }
                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (RuntimeException ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageProcessor.java

License:Apache License

/**
 * @deprecated -- do not use doesn't work properly yet
 * This method extracts images by going through PDF tree structure
 * @param pdfFile name of input PDF file
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet//w ww.  j a  v  a  2s . com
 * @param silent -- if true error messages are not written to output otherwise they are
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String password, Set<Integer> pagesToProcess,
        Boolean silent, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException(pdfFile);
    }

    String prefix = null;

    InputStream inputStream = null;
    if (password != null) {
        try {
            ByteArrayOutputStream decryptedOutputStream = null;
            PdfReader reader = new PdfReader(pdfFile, password.getBytes());
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.keySet().iterator();
                while (xobjIter.hasNext()) {
                    String key = (String) xobjIter.next();
                    PDXObject xobj = (PDXObject) xobjs.get(key);
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.keySet().iterator();
                        while (imageIter.hasNext()) {
                            String imKey = (String) imageIter.next();
                            PDXObjectImage image = (PDXObjectImage) images.get(imKey);

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List filters = pdStr.getFilters();

                            if (image.getBitsPerComponent() > 1) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (filters.contains(COSName.LZW_DECODE.getName())) {
                                log.info("This is LZWDecoded => skipping");
                                continue;

                            }

                            // detection of unsupported filters by pdfBox library
                            if (filters.contains("JBIG2Decode")) {
                                log.info("Allready compressed according to JBIG2 standard => skipping");
                                continue;
                            }
                            if (filters.contains("JPXDecode")) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }

                }
            }

        }
    } catch (IOException ex) {
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:de.catma.document.source.contenthandler.PDFContentHandler.java

License:Open Source License

public void load(InputStream is) throws IOException {
    PDDocument document = null;
    try {//  ww w  . j  a va  2s .  co  m
        document = PDDocument.load(is, false);

        if (document.isEncrypted()) {
            throw new IOException("can not open pdf document because it is encrypted");
        }

        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract text");
        }

        PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        stripper.setForceParsing(false);
        stripper.setSortByPosition(false);
        stripper.setShouldSeparateByBeads(true);
        stripper.setStartPage(1);
        stripper.setEndPage(Integer.MAX_VALUE);

        ByteArrayOutputStream os = new ByteArrayOutputStream();
        Writer w = new OutputStreamWriter(os);
        try {
            stripper.writeText(document, w);
        } finally {
            w.close();
        }
        // some pdfs seem to include non valid unicode characters
        // and this causes problems when converting text to HTML
        // for GUI delivery and during indexing 
        setContent(os.toString().replaceAll("[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]",
                "?"));
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:edu.ist.psu.sagnik.research.pdfbox2playground.javatest.ExtractImages.java

License:Apache License

private void extract(String pdfFile, String password) throws IOException {
    PDDocument document = null;
    try {//from   w w  w.  ja v a 2 s.  com
        document = PDDocument.load(new File(pdfFile), password);
        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            throw new IOException("You do not have permission to extract images");
        }

        for (int i = 0; i < document.getNumberOfPages(); i++) // todo: ITERATOR would be much better
        {
            PDPage page = document.getPage(i);
            ImageGraphicsEngine extractor = new ImageGraphicsEngine(page);
            extractor.run();
        }
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:net.yacy.cider.parser.idiom.pdfIdiom.java

License:Open Source License

@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true))
            : model.createResource();/*from   w  w w.ja v  a2 s.com*/

    // open pdf document
    final PDDocument theDocument;
    final PDFParser parser;
    try {
        parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        throw new ParserException(e.getMessage(), source.getURI());
    }

    if (theDocument.isEncrypted()) {
        try {
            theDocument.openProtection(new StandardDecryptionMaterial(""));
        } catch (BadSecurityHandlerException e) {
            throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                    source.getURI(), e);
        } catch (IOException e) {
            throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
        } catch (CryptographyException e) {
            throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                    source.getURI(), e);
        }
        final AccessPermission perm = theDocument.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent())
            throw new ParserException("PDF cannot be decrypted", source.getURI());
    }

    // get metadata
    final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
    if (theDocInfo != null) {
        docTitle = theDocInfo.getTitle();
        docSubject = theDocInfo.getSubject();
        docAuthor = theDocInfo.getAuthor();
        docKeywordStr = theDocInfo.getKeywords();
    }

    if (docAuthor != null && docAuthor.length() > 0) {
        resource.addProperty(VCARD.FN, docAuthor);
        resource.addProperty(DC.creator, docAuthor);
    }
    if (docSubject != null && docSubject.length() > 0) {
        resource.addProperty(DC.subject, docSubject);
    }
    if (docTitle != null && docTitle.length() > 0) {
        resource.addProperty(DC.title, docTitle);
    }
    String[] docKeywords = null;
    if (docKeywordStr != null && docKeywordStr.length() > 0) {
        docKeywords = docKeywordStr.split(" |,");
        resource.addProperty(DC.coverage, concat(docKeywords));
    }

    // get the content
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Writer writer;
    try {
        writer = new OutputStreamWriter(baos, "UTF-8");
    } catch (UnsupportedEncodingException e1) {
        writer = new OutputStreamWriter(baos);
    }
    try {
        final PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(theDocument, writer);
        theDocument.close();
        writer.close();
    } catch (IOException e) {
        if (writer != null)
            try {
                writer.close();
            } catch (final Exception ex) {
            }
        throw new ParserException("PDF content reader", source.getURI(), e);
    }
    String content;
    try {
        content = new String(baos.toByteArray(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        content = new String(baos.toByteArray());
    }
    if (content != null && content.length() > 0) {
        resource.addProperty(CIDER.data_content_text, content);
    }

    return model;
}

From source file:net.yacy.document.parser.pdfParser.java

License:Open Source License

@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);/*ww w .  j  a va2  s.  co  m*/

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        //close the writer (in finally)
        //throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // the pdfbox still generates enormeous number of object allocations and don't delete these
    // the following Object are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}

From source file:org.apache.camel.component.pdf.PdfCreationTest.java

License:Apache License

@Test
public void testPdfCreationWithEncryption() throws Exception {
    final String ownerPass = "ownerPass";
    final String userPass = "userPass";
    final String expectedText = "expectedText";
    AccessPermission accessPermission = new AccessPermission();
    accessPermission.setCanPrint(false);
    StandardProtectionPolicy protectionPolicy = new StandardProtectionPolicy(ownerPass, userPass,
            accessPermission);//from w  w  w.  ja  va2s .c  om
    protectionPolicy.setEncryptionKeyLength(128);
    template.sendBodyAndHeader("direct:start", expectedText, PdfHeaderConstants.PROTECTION_POLICY_HEADER_NAME,
            protectionPolicy);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
                PDDocument doc = PDDocument
                        .load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
                assertTrue("Expected encrypted document", doc.isEncrypted());
                doc.decrypt(userPass);
                assertFalse("Printing should not be permitted", doc.getCurrentAccessPermission().canPrint());
                PDFTextStripper pdfTextStripper = new PDFTextStripper();
                String text = pdfTextStripper.getText(doc);
                assertEquals(1, doc.getNumberOfPages());
                assertThat(text, containsString(expectedText));
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();
}

From source file:org.apache.tika.parser.pdf.PDFParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {/* w w w.j  ava2 s  . c om*/
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }

    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }

        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?          
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java

License:Apache License

private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
        throws TikaException {

    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);

    XMPMetadata xmp = null;/*w w w.  ja  v a 2  s  .c o m*/
    if (dom != null) {
        xmp = new XMPMetadata(dom);
    }
    XMPSchemaDublinCore dcSchema = null;

    /*if (xmp != null) {
    try {
        dcSchema = xmp.getDublinCoreSchema();
    } catch (IOException e) {}
            
    JempboxExtractor.extractXMPMM(xmp, metadata);
    }*/

    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);

    addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());

    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords",
            "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getCOSObject().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                    info.getCOSObject().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set(PDF.PDFA_VERSION, version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?          
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\""
                                + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}