List of usage examples for org.apache.pdfbox.pdmodel PDDocument isEncrypted
public boolean isEncrypted()
From source file:org.knoesis.matvocab.indexer.LucenePDFDocument.java
License:Apache License
/** * This will add the contents to the lucene document. * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. * * @throws IOException If there is an error parsing the document. */// w w w. j av a 2 s .com private void addContent(Document document, InputStream is, String documentLocation, PDFTextStripper stripper) throws IOException { PDDocument pdfDocument = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); if (stripper == null) { stripper = new PDFTextStripper(); } else { stripper.resetEngine(); } stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. String contents = writer.getBuffer().toString(); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
addField(document, "contents", contents); addField(document, "stemmedcontents", contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { addField(document, "Author", info.getAuthor()); try { addField(document, "CreationDate", info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Creator", info.getCreator()); addField(document, "Keywords", info.getKeywords()); try { addField(document, "ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } addField(document, "Producer", info.getProducer()); addField(document, "Subject", info.getSubject()); addField(document, "Title", info.getTitle()); addField(document, "Trapped", info.getTrapped()); } int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. addField(document, "summary", summary); addField(document, "numpages", String.valueOf(pdfDocument.getNumberOfPages())); } catch (CryptographyException e) { throw new IOException("Error decrypting document(" + documentLocation + "): " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. throw new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed."); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.lockss.pdf.pdfbox.PdfBoxDocumentFactory.java
License:Open Source License
/** * <p>/*from w ww. j a v a 2s. c o m*/ * Override this method to alter the processing of the {@link PDDocument} * instance after it has been parsed by {@link PDFParser#parse()}. * </p> * * @param pdDocument * A freshly parsed {@link PDDocument} instance * @throws CryptographyException * if a cryptography exception is thrown * @throws IOException * if an I/O exception is thrown * @since 1.67 */ protected void processAfterParse(PDDocument pdDocument) throws CryptographyException, IOException { pdDocument.setAllSecurityToBeRemoved(true); if (pdDocument.isEncrypted()) { pdDocument.decrypt(""); } }
From source file:org.mitre.xtext.converters.PDFConverter.java
License:Apache License
/** Implementation is informed by PDFBox authors. *//*from w w w . j a v a 2s . com*/ @Override public synchronized ConvertedDocument convert(java.io.File doc) throws IOException { /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Adapted from LucenePDFDocument.java from PDFBox lucene project * * This class is used to create a document for the lucene search engine. * This should easily plug into the IndexHTML or IndexFiles that comes with * the lucene project. This class will populate the following fields. 
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr> <tr> * <td>path</td> <td>File system path if loaded from a file</td> </tr> <tr> * <td>url</td> <td>URL to PDF document</td> </tr> <tr> <td>contents</td> * <td>Entire contents of PDF document, indexed but not stored</td> </tr> * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr> <tr> * <td>modified</td> <td>The modified date/time according to the url or * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the Lucene * document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Creator</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>Keywords</td> <td>From PDF meta-data if * available</td> </tr> <tr> <td>ModificationDate</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Producer</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Subject</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Trapped</td> <td>From PDF * meta-data if available</td> </tr> <tr> <td>Encrypted</td> <td>From PDF * meta-data if available</td> </tr> </table> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.23 $ * * @throws IOException If there is an error parsing the document. */ PDDocument pdfDocument = null; ConvertedDocument textdoc = new ConvertedDocument(doc); try { pdfDocument = PDDocument.load(doc); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on /** * * Exception in thread "main" java.lang.NoClassDefFoundError: * org/bouncycastle/jce/provider/BouncyCastleProvider at * org.apache.pdfbox.pdmodel.PDDocument.openProtection(PDDocument.java:1090) * at * org.apache.pdfbox.pdmodel.PDDocument.decrypt(PDDocument.java:594) * * CRYPTO stuff -- load BouncyCastle crypto JAR files. 
try { * pdfDocument.decrypt(""); } catch (CryptographyException e) { * throw new IOException("Error decrypting document(" + pdf_file * + "): " + e); } catch (InvalidPasswordException e) { //they * didn't suppply a password and the default of "" was wrong. * throw new IOException( "Error: The document(" + pdf_file + ") * is encrypted "); } finally { if (pdfDocument != null) { * pdfDocument.close();} } */ textdoc.addProperty("encrypted", "YES"); } else { //create a writer where to append the text content. StringWriter writer = new StringWriter(); stripper.resetEngine(); stripper.writeText(pdfDocument, writer); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null) { textdoc.addAuthor(info.getAuthor()); try { textdoc.addCreateDate(info.getCreationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } textdoc.addProperty("creator_tool", info.getCreator()); textdoc.addProperty("keywords", info.getKeywords()); /* try { metadata.add("ModificationDate", info.getModificationDate()); } catch (IOException io) { //ignore, bad date but continue with indexing } */ //metadata.add("Producer", info.getProducer()); textdoc.addProperty("subject", info.getSubject()); String ttl = info.getTitle(); if (ttl == null || "untitled".equalsIgnoreCase(ttl)) { ttl = textdoc.filename; } textdoc.addTitle(ttl); // metadata.add("Trapped", info.getTrapped()); // TODO: Character set is what? textdoc.setEncoding("UTF-8"); } // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string // is shared as long as the buffer content is not modified, which will // not occur here. textdoc.setPayload(writer.getBuffer().toString()); } return textdoc; } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.nuxeo.pdf.test.PDFEncryptionTest.java
License:Open Source License
protected void checkIsReadOnly(Blob inBlob, String ownerPwd, String userPwd) throws Exception { assertNotNull(inBlob);/*from w w w . java2s . c o m*/ PDDocument pdfDoc = utils.loadAndTrack(inBlob); assertTrue(pdfDoc.isEncrypted()); // Decrypt as user pdfDoc.openProtection(new StandardDecryptionMaterial(userPwd)); assertFalse(pdfDoc.isEncrypted()); AccessPermission ap = pdfDoc.getCurrentAccessPermission(); assertTrue(ap.canExtractContent()); assertTrue(ap.canExtractForAccessibility()); assertTrue(ap.canPrint()); assertTrue(ap.canPrintDegraded()); assertFalse(ap.canAssembleDocument()); assertFalse(ap.canFillInForm()); assertFalse(ap.canModifyAnnotations()); // Decrypt as owner utils.closeAndUntrack(pdfDoc); pdfDoc = utils.loadAndTrack(inBlob); pdfDoc.openProtection(new StandardDecryptionMaterial(ownerPwd)); assertFalse(pdfDoc.isEncrypted()); ap = pdfDoc.getCurrentAccessPermission(); assertTrue(ap.isOwnerPermission()); utils.closeAndUntrack(pdfDoc); }
From source file:org.nuxeo.pdf.test.PDFEncryptionTest.java
License:Open Source License
@Test public void testRemoveEncryption() throws Exception { // Test with encrypted PDF File f = FileUtils.getResourceFileFromContext(ENCRYPTED_PDF); FileBlob fb = new FileBlob(f); // Just check it is encrypted first PDDocument pdfDoc = utils.loadAndTrack(fb); assertTrue(pdfDoc.isEncrypted()); utils.closeAndUntrack(pdfDoc);/* w w w . java 2s.c o m*/ PDFEncryption pdfe = new PDFEncryption(fb); pdfe.setOriginalOwnerPwd(ENCRYPTED_PDF_PWD); Blob result = pdfe.removeEncryption(); assertNotNull(result); pdfDoc = utils.loadAndTrack(result); assertFalse(pdfDoc.isEncrypted()); utils.closeAndUntrack(pdfDoc); // Test with a non-encrypted PDF (removing encryption should not trigger an error) pdfe = new PDFEncryption(pdfFileBlob); pdfe.setOriginalOwnerPwd(ENCRYPTED_PDF_PWD); result = pdfe.removeEncryption(); assertNotNull(result); pdfDoc = utils.loadAndTrack(result); assertFalse(pdfDoc.isEncrypted()); utils.closeAndUntrack(pdfDoc); }
From source file:org.olat.core.commons.services.image.spi.ImageHelperImpl.java
License:Apache License
@Override public Size thumbnailPDF(VFSLeaf pdfFile, VFSLeaf thumbnailFile, int maxWidth, int maxHeight) { InputStream in = null;/*from w w w .j a va 2 s . com*/ PDDocument document = null; try { WorkThreadInformations.setInfoFiles(null, pdfFile); WorkThreadInformations.set("Generate thumbnail VFSLeaf=" + pdfFile); in = pdfFile.getInputStream(); document = PDDocument.load(in); if (document.isEncrypted()) { try { document.decrypt(""); } catch (Exception e) { log.info("PDF document is encrypted: " + pdfFile); throw new CannotGenerateThumbnailException("PDF document is encrypted: " + pdfFile); } } List pages = document.getDocumentCatalog().getAllPages(); PDPage page = (PDPage) pages.get(0); BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_BGR, 72); Size size = scaleImage(image, thumbnailFile, maxWidth, maxHeight); if (size != null) { return size; } return null; } catch (CannotGenerateThumbnailException e) { return null; } catch (Exception e) { log.warn("Unable to create image from pdf file.", e); return null; } finally { WorkThreadInformations.unset(); FileUtils.closeSafely(in); if (document != null) { try { document.close(); } catch (IOException e) { //only a try, fail silently } } } }
From source file:org.olat.core.commons.services.thumbnail.impl.PDFToThumbnail.java
License:Apache License
@Override public FinalSize generateThumbnail(VFSLeaf pdfFile, VFSLeaf thumbnailFile, int maxWidth, int maxHeight) throws CannotGenerateThumbnailException { InputStream in = null;//w w w .j a va2s. c o m PDDocument document = null; try { in = pdfFile.getInputStream(); document = PDDocument.load(in); if (document.isEncrypted()) { try { document.decrypt(""); } catch (Exception e) { log.info("PDF document is encrypted: " + pdfFile); throw new CannotGenerateThumbnailException("PDF document is encrypted: " + pdfFile); } } List pages = document.getDocumentCatalog().getAllPages(); PDPage page = (PDPage) pages.get(0); BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_BGR, 72); Size size = ImageHelper.scaleImage(image, thumbnailFile, maxWidth, maxHeight); return new FinalSize(size.getWidth(), size.getWidth()); } catch (CannotGenerateThumbnailException e) { throw e; } catch (Exception e) { log.warn("Unable to create image from pdf file.", e); throw new CannotGenerateThumbnailException(e); } finally { FileUtils.closeSafely(in); if (document != null) { try { document.close(); } catch (IOException e) { // only a try, fail silently } } } }
From source file:org.olat.course.certificate.ui.UploadCertificateController.java
License:Apache License
private boolean validatePdf(File template) { boolean allOk = true; PDDocument document = null; try (InputStream in = Files.newInputStream(template.toPath())) { document = PDDocument.load(in);//from w w w . jav a 2 s.co m if (document.isEncrypted()) { fileEl.setErrorKey("upload.error.encrypted", null); allOk &= false; } else { //check if we can write the form PDDocumentCatalog docCatalog = document.getDocumentCatalog(); PDAcroForm acroForm = docCatalog.getAcroForm(); if (acroForm != null) { @SuppressWarnings("unchecked") List<PDField> fields = acroForm.getFields(); for (PDField field : fields) { field.setValue("test"); } } document.save(new DevNullOutputStream()); } } catch (IOException ex) { logError("", ex); if (ex.getMessage() != null && ex.getMessage().contains("Don't know how to calculate the position for non-simple fonts")) { fileEl.setErrorKey("upload.error.simplefonts", null); } else { fileEl.setErrorKey("upload.unkown.error", null); } allOk &= false; } catch (Exception ex) { logError("", ex); fileEl.setErrorKey("upload.unkown.error", null); allOk &= false; } finally { IOUtils.closeQuietly(document); } return allOk; }
From source file:org.olat.search.service.document.file.pdf.PdfBoxExtractor.java
License:Apache License
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException { if (log.isDebug()) log.debug("readContent from pdf starts..."); PDDocument document = null; BufferedInputStream bis = null; try {//from www . j a v a 2 s . c o m bis = new BufferedInputStream(leaf.getInputStream()); document = PDDocument.load(bis); if (document.isEncrypted()) { try { document.decrypt(""); } catch (Exception e) { log.warn("PDF is encrypted. Can not read content file=" + leaf.getName()); LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize()); writer.append(leaf.getName()); writer.close(); return new FileContent(leaf.getName(), writer.toString()); } } String title = getTitle(document); if (log.isDebug()) log.debug("readContent PDDocument loaded"); PDFTextStripper stripper = new PDFTextStripper(); LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize()); stripper.writeText(document, writer); writer.close(); return new FileContent(title, writer.toString()); } finally { if (document != null) { document.close(); } if (bis != null) { bis.close(); } } }
From source file:org.olat.search.service.document.file.PdfDocument.java
License:Apache License
/**
 * Extracts the text of a PDF.
 * <p>
 * An encrypted PDF is first decrypted with the empty password.
 *
 * @param leaf the PDF file to read
 * @return the text of the PDF as produced by {@link PDFTextStripper}
 * @throws IOException if the PDF cannot be loaded, read or closed
 * @throws DocumentAccessException if the PDF is encrypted and cannot be
 *         decrypted with the empty password
 */
private String extractTextFromPdf(final VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebug()) {
        log.debug("readContent from pdf starts...");
    }

    final BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream());
    try {
        final PDDocument document = PDDocument.load(bis);
        try {
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                } catch (final Exception e) {
                    throw new DocumentAccessException(
                            "PDF is encrypted. Can not read content file=" + leaf.getName());
                }
            }
            if (log.isDebug()) {
                log.debug("readContent PDDocument loaded");
            }
            final PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(document);
        } finally {
            document.close();
        }
    } finally {
        // Nested finally: the stream is closed even when document.close() throws.
        bis.close();
    }
}