Example usage of the org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial constructor

List of usage examples for the org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial constructor

Introduction

On this page you can find example usages of the org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial constructor.

Prototype

public StandardDecryptionMaterial(String pwd) 

Source Link

Document

Create a new standard decryption material with the given password.

Usage

From source file:com.jaeksoft.searchlib.parser.PdfParser.java

License:Open Source License

/**
 * Opens the protection of an encrypted PDF, first with an empty password and,
 * if that is rejected, with a password recovered by the external PDFCrack tool.
 *
 * @param pdf     the encrypted document to unlock
 * @param pdfFile the file handed to PDFCrack for password recovery
 * @return the password that unlocked the document (the empty string when the
 *         first attempt succeeded)
 * @throws BadSecurityHandlerException propagated from {@code openProtection}
 * @throws IOException                 if PDFCrack finds no password (the rejected
 *                                     first attempt is attached as cause), or
 *                                     propagated from the called code
 * @throws CryptographyException       if the empty password is rejected and no
 *                                     PDFCrack command line is configured, or if
 *                                     the recovered password is rejected as well
 */
private String decrypt(PDDocument pdf, File pdfFile)
        throws BadSecurityHandlerException, IOException, CryptographyException {
    // Let's try first with an empty password
    String password = StringUtils.EMPTY;
    try {
        pdf.openProtection(new StandardDecryptionMaterial(password));
    } catch (CryptographyException e) {
        // New attempt with PDFCrack, but only if a command line is configured
        String pdfCrackCommandLine = getStringProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE);
        if (StringUtils.isEmpty(pdfCrackCommandLine)) {
            throw e;
        }
        password = PdfCrack.findPassword(pdfCrackCommandLine, pdfFile);
        if (password == null) {
            // No password found: keep the original decryption failure as the cause
            throw new IOException("Encrypted PDF.", e);
        }
        // Password found, let's open
        pdf.openProtection(new StandardDecryptionMaterial(password));
    }
    return password;
}

From source file:com.opensearchserver.extractor.parser.PdfBox.java

License:Apache License

/**
 * Extracts the metadata and the text content of a PDF document using PDFBox.
 * The document is closed in a {@code finally} block, so it is released whether
 * or not the extraction succeeds.
 *
 * @param pdf the document to read; if it is encrypted it is opened with an
 *            empty password
 * @throws Exception any failure raised while decrypting, extracting or closing
 */
private void parseContent(PDDocument pdf) throws Exception {
    try {
        final boolean needsDecryption = pdf.isEncrypted();
        if (needsDecryption) {
            pdf.openProtection(new StandardDecryptionMaterial(""));
        }
        extractMetaData(pdf);
        // The returned text is discarded; presumably Stripper records what it
        // needs while it processes the document -- TODO confirm against Stripper.
        new Stripper().getText(pdf);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Extracts the plain text of a PDF file.
 * <p>
 * An encrypted file is opened with an empty password; any failure to do so is
 * reported as a missing extraction permission.
 *
 * @param file the PDF file to read
 * @return the text written by the PDFBox text stripper
 * @throws ParseException if the file cannot be decrypted or cannot be read
 */
public String renderText(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        final boolean encrypted = document.isEncrypted();
        if (encrypted) {
            try {
                final StandardDecryptionMaterial emptyPassword = new StandardDecryptionMaterial("");
                document.openProtection(emptyPassword);
            } catch (Exception decryptFailure) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        final PDFTextStripper textStripper = new PDFTextStripper();
        final StringWriter text = new StringWriter();
        textStripper.writeText(document, text);
        return text.toString();
    } catch (IOException readFailure) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        // Closing is best effort: a failure here is only printed, never propagated.
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:net.sourceforge.docfetcher.parse.PDFParser.java

License:Open Source License

/**
 * Parses a PDF file into a {@code Document} holding its text and metadata.
 * <p>
 * The extracted text is followed by the non-null title, author, subject and
 * keywords, each preceded by a single space. The title is passed to the
 * {@code Document} constructor and the author is added afterwards.
 *
 * @param file the PDF file to parse
 * @return the parsed document
 * @throws ParseException if the file cannot be decrypted or cannot be read
 */
public Document parse(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        // Encrypted files are tried with an empty password only
        if (document.isEncrypted()) {
            try {
                final StandardDecryptionMaterial emptyPassword = new StandardDecryptionMaterial("");
                document.openProtection(emptyPassword);
            } catch (Exception decryptFailure) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        // Extract the text first, then collect the metadata fields
        final PDFTextStripper textStripper = new PDFTextStripper();
        final StringWriter text = new StringWriter();
        textStripper.writeText(document, text);
        DocFetcher.getInstance().setExceptionHandlerEnabled(true);

        final PDDocumentInformation info = document.getDocumentInformation();
        final String title = info.getTitle();
        final String author = info.getAuthor();
        final String subject = info.getSubject();
        final String keywords = info.getKeywords();

        final String[] fields = { title, author, subject, keywords };
        for (int i = 0; i < fields.length; i++) {
            if (fields[i] != null) {
                text.append(" ").append(fields[i]); //$NON-NLS-1$
            }
        }
        return new Document(file, title, text.getBuffer()).addAuthor(author);
    } catch (IOException readFailure) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        // Closing is best effort: a failure here is only printed, never propagated.
        if (document != null) {
            try {
                document.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:net.yacy.cider.parser.idiom.pdfIdiom.java

License:Open Source License

/**
 * Parses a PDF data source into an RDF model.
 * <p>
 * The model's resource receives the author (as {@code VCARD.FN} and
 * {@code DC.creator}), subject, title and keywords of the document, when
 * present and non-empty, and the extracted text as
 * {@code CIDER.data_content_text}. An encrypted PDF is opened with an empty
 * password and rejected if content extraction is not permitted.
 * <p>
 * The underlying PDF document is closed on every exit path, including failures.
 *
 * @param source the data source providing the PDF stream and, optionally, its URI
 * @return the populated model
 * @throws ParserException if the PDF cannot be read, decrypted or extracted
 */
@Override
public Model parse(DataSource source) throws ParserException {
    // create an empty Model
    Model model = ModelFactory.createDefaultModel();
    Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true))
            : model.createResource();

    // open pdf document
    final PDDocument theDocument;
    try {
        final PDFParser parser = new PDFParser(source.getStream());
        parser.parse();
        theDocument = parser.getPDDocument();
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        // keep the original exception as cause instead of only its message
        throw new ParserException(e.getMessage(), source.getURI(), e);
    }

    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // set just before the regular close so the finally block never closes twice
    boolean closeAttempted = false;
    try {
        if (theDocument.isEncrypted()) {
            try {
                theDocument.openProtection(new StandardDecryptionMaterial(""));
            } catch (BadSecurityHandlerException e) {
                throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(),
                        source.getURI(), e);
            } catch (IOException e) {
                throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e);
            } catch (CryptographyException e) {
                throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(),
                        source.getURI(), e);
            }
            final AccessPermission perm = theDocument.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent())
                throw new ParserException("PDF cannot be decrypted", source.getURI());
        }

        // get metadata
        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
        if (theDocInfo != null) {
            docTitle = theDocInfo.getTitle();
            docSubject = theDocInfo.getSubject();
            docAuthor = theDocInfo.getAuthor();
            docKeywordStr = theDocInfo.getKeywords();
        }

        if (docAuthor != null && docAuthor.length() > 0) {
            resource.addProperty(VCARD.FN, docAuthor);
            resource.addProperty(DC.creator, docAuthor);
        }
        if (docSubject != null && docSubject.length() > 0) {
            resource.addProperty(DC.subject, docSubject);
        }
        if (docTitle != null && docTitle.length() > 0) {
            resource.addProperty(DC.title, docTitle);
        }
        if (docKeywordStr != null && docKeywordStr.length() > 0) {
            // keywords are separated by blanks or commas
            final String[] docKeywords = docKeywordStr.split(" |,");
            resource.addProperty(DC.coverage, concat(docKeywords));
        }

        // get the content
        Writer writer;
        try {
            writer = new OutputStreamWriter(baos, "UTF-8");
        } catch (UnsupportedEncodingException e1) {
            writer = new OutputStreamWriter(baos);
        }
        try {
            final PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(theDocument, writer);
            closeAttempted = true;
            theDocument.close();
            writer.close();
        } catch (IOException e) {
            try {
                writer.close();
            } catch (final Exception ignored) {
                // best effort; the extraction failure below is the error worth reporting
            }
            throw new ParserException("PDF content reader", source.getURI(), e);
        }
    } finally {
        // release the document on every failure path (decryption, permissions, extraction)
        if (!closeAttempted) {
            try {
                theDocument.close();
            } catch (final IOException ignored) {
                // best effort; an earlier exception, if any, takes precedence
            }
        }
    }

    String content;
    try {
        content = new String(baos.toByteArray(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
        content = new String(baos.toByteArray());
    }
    if (content.length() > 0) {
        resource.addProperty(CIDER.data_content_text, content);
    }

    return model;
}

From source file:net.yacy.document.parser.pdfParser.java

License:Open Source License

/**
 * Parses a PDF stream into index documents: a single document for the whole
 * file, or one document per page when {@code individualPages} is set.
 * <p>
 * An encrypted PDF is opened with an empty password and rejected unless content
 * extraction is permitted. Link and text extraction are best effort: any failure
 * in that phase is swallowed and whatever result was built so far is returned.
 *
 * @param location       URL of the PDF; used in failure messages, as fallback
 *                       title and as base of the per-page virtual URLs
 * @param mimeType       mime type stored in the resulting documents
 * @param charset        not used by this implementation; documents are created as UTF-8
 * @param scraper        not used by this implementation
 * @param timezoneOffset not used by this implementation
 * @param source         stream the PDF is loaded from
 * @return the parsed documents, or {@code null} if extraction failed before any
 *         document was built
 * @throws Parser.Failure       if memory is insufficient, the PDF cannot be loaded, or
 *                              it is encrypted and cannot be opened for extraction
 * @throws InterruptedException declared by the signature; an interrupt received while
 *                              waiting for the extraction thread is re-asserted on
 *                              the calling thread instead of being thrown
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(),
                location);

    // create a pdf parser
    PDDocument pdfDoc;
    //final PDFParser pdfParser;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        pdfDoc = PDDocument.load(source);
        //PDFParser pdfParser = new PDFParser(source);
        //pdfParser.parse();
        //pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
                // best effort close; the decryption failure is reported below
            }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
                // best effort close; the decryption failure is reported below
            }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
                // best effort close; the decryption failure is reported below
            }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try {
                pdfDoc.close();
            } catch (final IOException ee) {
                // best effort close; the permission failure is reported below
            }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty())
            docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null)
                docDate = info.getModificationDate().getTime();
        } catch (IOException e) {
            // unreadable modification date: keep the current time as document date
        }
        // unused:
        // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    // remembers an interrupt swallowed by the best-effort catch below
    boolean interrupted = false;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper("UTF-8");

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual index documents
            // the new documents will get a virtual link with a post argument page=X appended to the original url

            // collect text
            int pagecount = pdfDoc.getNumberOfPages();
            String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                //System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = "
                    + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] = new Document(
                        new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname
                                + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                        mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor,
                        docPublisher, null, null, 0.0f, 0.0f,
                        // bounds guard uses >= so that index == length can never reach the array access
                        pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                        pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false,
                        docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default
                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) {
                            // best effort: text of the first pages is already in the buffer
                        }
                    }
                };
                t.start();
                t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
                if (t.isAlive())
                    t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null)
                    pdflinksCombined.addAll(pdflinksx);
            result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords,
                    singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes,
                    pdflinksCombined, null, null, false, docDate) };
        }
    } catch (final Throwable e) {
        // extraction is best effort: return whatever was built so far
        //throw new Parser.Failure(e.getMessage(), location);
        if (e instanceof InterruptedException) {
            // t.join() cleared the interrupt status; remember it so it is not lost
            interrupted = true;
        }
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) {
            // best effort close
        }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not release them;
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM,
    // so we try to release that memory here with forced clear calls.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    // re-assert a swallowed interrupt only after all cleanup has run
    if (interrupted) {
        Thread.currentThread().interrupt();
    }

    return result;
}

From source file:org.apache.camel.component.fop.FopHelper.java

License:Apache License

/**
 * Opens the protection of an encrypted PDF document with the given password.
 *
 * @param document the document to decrypt; must be encrypted
 * @param password the password used to build the decryption material
 * @throws IOException                 propagated from {@code openProtection}
 * @throws CryptographyException       propagated from {@code openProtection}
 * @throws BadSecurityHandlerException propagated from {@code openProtection}
 * @throws RuntimeException            if the document is not encrypted
 */
public static void decryptPDFN(PDDocument document, String password)
        throws IOException, CryptographyException, BadSecurityHandlerException {
    // Guard clause: there is nothing to decrypt on an unprotected document
    if (!document.isEncrypted()) {
        throw new RuntimeException("Document not encrypted");
    }
    document.openProtection(new StandardDecryptionMaterial(password));
}

From source file:org.apache.camel.component.pdf.PdfAppendTest.java

License:Apache License

/**
 * Checks that text can be appended to a password-protected PDF when the
 * decryption material is supplied as a message header: the result must have
 * two pages and contain both the original and the appended text.
 */
@Test
public void testAppendEncrypted() throws Exception {
    final String originalText = "Test";
    final String textToAppend = "Append";
    PDDocument document = new PDDocument();
    PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
    document.addPage(page);
    PDPageContentStream contentStream = new PDPageContentStream(document, page);
    contentStream.setFont(PDType1Font.HELVETICA, 12);
    contentStream.beginText();
    contentStream.moveTextPositionByAmount(20, 400);
    contentStream.drawString(originalText);
    contentStream.endText();
    contentStream.close();

    final String ownerPass = "ownerPass";
    final String userPass = "userPass";
    AccessPermission accessPermission = new AccessPermission();
    accessPermission.setCanExtractContent(false);
    StandardProtectionPolicy protectionPolicy = new StandardProtectionPolicy(ownerPass, userPass,
            accessPermission);
    protectionPolicy.setEncryptionKeyLength(128);

    document.protect(protectionPolicy);

    ByteArrayOutputStream output = new ByteArrayOutputStream();
    try {
        document.save(output);
    } finally {
        // the saved bytes are all that is needed from here on
        document.close();
    }

    // Encryption happens after saving.
    PDDocument encryptedDocument = PDDocument.load(new ByteArrayInputStream(output.toByteArray()));

    Map<String, Object> headers = new HashMap<String, Object>();
    headers.put(PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME, encryptedDocument);
    headers.put(PdfHeaderConstants.DECRYPTION_MATERIAL_HEADER_NAME, new StandardDecryptionMaterial(userPass));

    template.sendBodyAndHeaders("direct:start", textToAppend, headers);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
                PDDocument doc = PDDocument
                        .load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
                try {
                    PDFTextStripper pdfTextStripper = new PDFTextStripper();
                    String text = pdfTextStripper.getText(doc);
                    assertEquals(2, doc.getNumberOfPages());
                    assertThat(text, containsString(originalText));
                    assertThat(text, containsString(textToAppend));
                } finally {
                    // release the verification copy even when an assertion fails
                    doc.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();

}

From source file:org.apache.camel.component.pdf.PdfTextExtractionTest.java

License:Apache License

/**
 * Checks that text can be extracted from a password-protected PDF when the
 * decryption material is supplied as a message header: the resulting body must
 * be a string containing the text drawn on the page.
 */
@Test
public void testExtractTextFromEncrypted() throws Exception {
    final String expectedText = "Test string";
    final String ownerPass = "ownerPass";
    final String userPass = "userPass";

    // Protection policy: 128-bit key, content extraction not permitted
    AccessPermission permission = new AccessPermission();
    permission.setCanExtractContent(false);
    StandardProtectionPolicy policy = new StandardProtectionPolicy(ownerPass, userPass, permission);
    policy.setEncryptionKeyLength(128);

    // Single A4 page carrying the expected text
    PDDocument document = new PDDocument();
    PDPage firstPage = new PDPage(PDPage.PAGE_SIZE_A4);
    document.addPage(firstPage);
    PDPageContentStream pageContent = new PDPageContentStream(document, firstPage);
    pageContent.setFont(PDType1Font.HELVETICA, 12);
    pageContent.beginText();
    pageContent.moveTextPositionByAmount(20, 400);
    pageContent.drawString(expectedText);
    pageContent.endText();
    pageContent.close();

    document.protect(policy);

    // Encryption happens after saving, so round-trip through a byte array.
    ByteArrayOutputStream saved = new ByteArrayOutputStream();
    document.save(saved);
    PDDocument encryptedDocument = PDDocument.load(new ByteArrayInputStream(saved.toByteArray()));

    StandardDecryptionMaterial decryptionMaterial = new StandardDecryptionMaterial(userPass);
    template.sendBodyAndHeader("direct:start", encryptedDocument,
            PdfHeaderConstants.DECRYPTION_MATERIAL_HEADER_NAME, decryptionMaterial);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object received = exchange.getIn().getBody();
            assertThat(received, instanceOf(String.class));
            assertThat((String) received, containsString(expectedText));
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();
    // NOTE(review): the result of this call is not used -- confirm whether an assertion was intended
    document.isEncrypted();
}

From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text and metadata of a PDF read from the given stream.
 * <p>
 * If the document is encrypted and {@code params} is given, a password is taken
 * from {@code ExtractData.PDF_PASSWORD} or, failing that, looked up via
 * {@code getPassword} using the URL and resource name parameters. Text
 * extraction runs in a daemon thread that is given {@code timeout} to finish;
 * after that it is interrupted repeatedly and the extraction is reported as
 * failed. The whole operation is serialized on {@code pdfBoxLockObj}.
 *
 * @param in     the PDF input stream; must not be {@code null}
 * @param params optional extraction parameters (password, URL, resource name)
 * @return the extracted text together with the document metadata
 * @throws RobotSystemException if {@code in} is {@code null}
 * @throws ExtractException     if loading, decrypting or extracting fails, the
 *                              extraction times out, or the calling thread is
 *                              interrupted (its interrupt status is restored)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new RobotSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        PDDocument document = null;
        try {
            document = PDDocument.load(in, null, force);
            if (document.isEncrypted() && params != null) {
                String password = params.get(ExtractData.PDF_PASSWORD);
                if (password == null) {
                    password = getPassword(params.get(ExtractData.URL),
                            params.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
                }
                if (password != null) {
                    final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password);
                    document.openProtection(sdm);
                    final AccessPermission ap = document.getCurrentAccessPermission();

                    if (!ap.canExtractContent()) {
                        throw new IOException("You do not have permission to extract text.");
                    }
                }
            }

            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper(encoding);
            stripper.setForceParsing(force);
            final AtomicBoolean done = new AtomicBoolean(false);
            final PDDocument doc = document;
            final Set<Exception> exceptionSet = new HashSet<>();
            // run the extraction in its own thread so that it can be abandoned on timeout
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(doc, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            // NOTE(review): Thread.join takes milliseconds while the message below says
            // "sec." -- confirm the unit of the timeout field
            task.join(timeout);
            if (!done.get()) {
                // keep interrupting for up to ~5 seconds until the worker gives up
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final ExtractException e) {
            // already the exception type callers expect; do not wrap it a second time
            throw e;
        } catch (final InterruptedException e) {
            // join()/sleep() cleared the interrupt status; restore it for the caller
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (final IOException e) {
                    // NOP
                }
            }
        }
    }
}