Usage examples for the StandardDecryptionMaterial constructor of org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
public StandardDecryptionMaterial(String pwd)
From source file:com.jaeksoft.searchlib.parser.PdfParser.java
License:Open Source License
private String decrypt(PDDocument pdf, File pdfFile) throws BadSecurityHandlerException, IOException, CryptographyException { // Let's try first with an empty password String password = StringUtils.EMPTY; try {/* w w w. j a v a 2s. c om*/ pdf.openProtection(new StandardDecryptionMaterial(password)); } catch (CryptographyException e) { // New attempt with PDFCrack String pdfCrackCommandLine = getStringProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE); if (StringUtils.isEmpty(pdfCrackCommandLine)) throw e; password = PdfCrack.findPassword(pdfCrackCommandLine, pdfFile); if (password == null) // No password found throw new IOException("Encrypted PDF."); // Password found, let's open pdf.openProtection(new StandardDecryptionMaterial(password)); } return password; }
From source file:com.opensearchserver.extractor.parser.PdfBox.java
License:Apache License
/**
 * Runs metadata and text extraction over a PDFBox document, then closes it.
 *
 * An encrypted document is first opened with the empty password. The value
 * returned by {@code Stripper.getText} is discarded here; presumably the
 * project's {@code Stripper} collects its output itself (verify against its
 * definition).
 *
 * @param pdf the document to process; it is closed before this method returns
 * @throws Exception anything raised by PDFBox or by the extraction helpers
 */
private void parseContent(PDDocument pdf) throws Exception {
    try {
        final boolean needsDecryption = pdf.isEncrypted();
        if (needsDecryption) {
            final StandardDecryptionMaterial emptyPassword = new StandardDecryptionMaterial("");
            pdf.openProtection(emptyPassword);
        }
        extractMetaData(pdf);
        new Stripper().getText(pdf);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
/**
 * Extracts the plain text of a PDF file.
 *
 * Encrypted files are opened with the empty password; any failure while doing
 * so is reported as a missing extraction permission.
 *
 * @param file the PDF file to read
 * @return the text written by {@code PDFTextStripper}
 * @throws ParseException if the file cannot be read or cannot be decrypted
 */
public String renderText(File file) throws ParseException {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        final boolean encrypted = document.isEncrypted();
        if (encrypted) {
            try {
                final StandardDecryptionMaterial emptyPassword = new StandardDecryptionMaterial("");
                document.openProtection(emptyPassword);
            } catch (Exception e) {
                throw new ParseException(file, Msg.no_extraction_permission.value());
            }
        }

        final StringWriter text = new StringWriter();
        new PDFTextStripper().writeText(document, text);
        return text.toString();
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    } finally {
        // Closing is best effort: a failure is only printed, never rethrown.
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:net.sourceforge.docfetcher.parse.PDFParser.java
License:Open Source License
public Document parse(File file) throws ParseException { PDDocument pdfDoc = null;// ww w .j a v a 2 s .c om try { // Check if PDF file is encrypted pdfDoc = PDDocument.load(file); if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (Exception e) { throw new ParseException(file, Msg.no_extraction_permission.value()); } } // Get tags and contents PDFTextStripper stripper = new PDFTextStripper(); StringWriter writer = new StringWriter(); stripper.writeText(pdfDoc, writer); DocFetcher.getInstance().setExceptionHandlerEnabled(true); PDDocumentInformation pdInfo = pdfDoc.getDocumentInformation(); String[] metaData = new String[] { pdInfo.getTitle(), pdInfo.getAuthor(), pdInfo.getSubject(), pdInfo.getKeywords(), }; for (String field : metaData) if (field != null) writer.append(" ").append(field); //$NON-NLS-1$ return new Document(file, metaData[0], writer.getBuffer()).addAuthor(metaData[1]); } catch (IOException e) { throw new ParseException(file, Msg.file_not_readable.value()); } finally { if (pdfDoc != null) { try { pdfDoc.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:net.yacy.cider.parser.idiom.pdfIdiom.java
License:Open Source License
@Override public Model parse(DataSource source) throws ParserException { // create an empty Model Model model = ModelFactory.createDefaultModel(); Resource resource = source.hasURI() ? model.createResource(source.getURI().toNormalform(true, true)) : model.createResource();/* w ww . j a v a 2 s . c o m*/ // open pdf document final PDDocument theDocument; final PDFParser parser; try { parser = new PDFParser(source.getStream()); parser.parse(); theDocument = parser.getPDDocument(); } catch (IOException e) { log.error(e.getMessage(), e); throw new ParserException(e.getMessage(), source.getURI()); } if (theDocument.isEncrypted()) { try { theDocument.openProtection(new StandardDecryptionMaterial("")); } catch (BadSecurityHandlerException e) { throw new ParserException("PDF Encrypted (BadSecurityHandlerException): " + e.getMessage(), source.getURI(), e); } catch (IOException e) { throw new ParserException("PDF Encrypted (IOException): " + e.getMessage(), source.getURI(), e); } catch (CryptographyException e) { throw new ParserException("PDF Encrypted (CryptographyException): " + e.getMessage(), source.getURI(), e); } final AccessPermission perm = theDocument.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) throw new ParserException("PDF cannot be decrypted", source.getURI()); } // get metadata final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null; if (theDocInfo != null) { docTitle = theDocInfo.getTitle(); docSubject = theDocInfo.getSubject(); docAuthor = theDocInfo.getAuthor(); docKeywordStr = theDocInfo.getKeywords(); } if (docAuthor != null && docAuthor.length() > 0) { resource.addProperty(VCARD.FN, docAuthor); resource.addProperty(DC.creator, docAuthor); } if (docSubject != null && docSubject.length() > 0) { resource.addProperty(DC.subject, docSubject); } if (docTitle != null && docTitle.length() > 0) { resource.addProperty(DC.title, 
docTitle); } String[] docKeywords = null; if (docKeywordStr != null && docKeywordStr.length() > 0) { docKeywords = docKeywordStr.split(" |,"); resource.addProperty(DC.coverage, concat(docKeywords)); } // get the content ByteArrayOutputStream baos = new ByteArrayOutputStream(); Writer writer; try { writer = new OutputStreamWriter(baos, "UTF-8"); } catch (UnsupportedEncodingException e1) { writer = new OutputStreamWriter(baos); } try { final PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(theDocument, writer); theDocument.close(); writer.close(); } catch (IOException e) { if (writer != null) try { writer.close(); } catch (final Exception ex) { } throw new ParserException("PDF content reader", source.getURI(), e); } String content; try { content = new String(baos.toByteArray(), "UTF-8"); } catch (UnsupportedEncodingException e) { content = new String(baos.toByteArray()); } if (content != null && content.length() > 0) { resource.addProperty(CIDER.data_content_text, content); } return model; }
From source file:net.yacy.document.parser.pdfParser.java
License:Open Source License
@Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);/*from w w w . jav a 2s . co m*/ // create a pdf parser PDDocument pdfDoc; //final PDFParser pdfParser; try { Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain pdfDoc = PDDocument.load(source); //PDFParser pdfParser = new PDFParser(source); //pdfParser.parse(); //pdfDoc = pdfParser.getPDDocument(); } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } finally { Thread.currentThread().setPriority(Thread.NORM_PRIORITY); } if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (final BadSecurityHandlerException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location); } catch (final IOException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location); } catch (final CryptographyException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location); } final AccessPermission perm = pdfDoc.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted and cannot be decrypted", location); } } // extracting some metadata PDDocumentInformation info = pdfDoc.getDocumentInformation(); String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, 
docKeywordStr = null; Date docDate = new Date(); if (info != null) { docTitle = info.getTitle(); docSubject = info.getSubject(); docAuthor = info.getAuthor(); docPublisher = info.getProducer(); if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator(); docKeywordStr = info.getKeywords(); try { if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime(); } catch (IOException e) { } // unused: // info.getTrapped()); } info = null; if (docTitle == null || docTitle.isEmpty()) { docTitle = MultiProtocolURL.unescape(location.getFileName()); } if (docTitle == null) { docTitle = docSubject; } String[] docKeywords = null; if (docKeywordStr != null) { docKeywords = docKeywordStr.split(" |,"); } Collection<AnchorURL>[] pdflinks = null; Document[] result = null; try { // get the links pdflinks = extractPdfLinks(pdfDoc); // get the fulltext (either per document or for each page) final PDFTextStripper stripper = new PDFTextStripper("UTF-8"); if (individualPages) { // this is a hack which stores individual pages of the source pdf into individual index documents // the new documents will get a virtual link with a post argument page=X appended to the original url // collect text int pagecount = pdfDoc.getNumberOfPages(); String[] pages = new String[pagecount]; for (int page = 1; page <= pagecount; page++) { stripper.setStartPage(page); stripper.setEndPage(page); pages[page - 1] = stripper.getText(pdfDoc); //System.out.println("PAGE " + page + ": " + pages[page - 1]); } // create individual documents for each page assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length; result = new Document[Math.min(pages.length, pdflinks.length)]; String loc = location.toNormalform(true); for (int page = 0; page < result.length; page++) { result[page] = new Document( new AnchorURL(loc + (loc.indexOf('?') > 0 ? 
'&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false, docDate); } } else { // collect the whole text at once final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); byte[] contentBytes = new byte[0]; stripper.setEndPage(3); // get first 3 pages (always) writer.append(stripper.getText(pdfDoc)); contentBytes = writer.getBytes(); // remember text in case of interrupting thread if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) stripper.setEndPage(Integer.MAX_VALUE); // set to default // we start the pdf parsing in a separate thread to ensure that it can be terminated final PDDocument pdfDocC = pdfDoc; final Thread t = new Thread() { @Override public void run() { Thread.currentThread().setName("pdfParser.getText:" + location); try { writer.append(stripper.getText(pdfDocC)); } catch (final Throwable e) { } } }; t.start(); t.join(3000); // pdfbox likes to forget to terminate ... 
(quite often) if (t.isAlive()) t.interrupt(); } contentBytes = writer.getBytes(); // get final text before closing writer Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>(); for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); result = new Document[] { new Document(location, mimeType, "UTF-8", this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes, pdflinksCombined, null, null, false, docDate) }; } } catch (final Throwable e) { //close the writer (in finally) //throw new Parser.Failure(e.getMessage(), location); } finally { try { pdfDoc.close(); } catch (final Throwable e) { } } // clear resources in pdfbox. they say that is resolved but it's not. see: // https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-351 // https://issues.apache.org/jira/browse/PDFBOX-441 // the pdfbox still generates enormeous number of object allocations and don't delete these // the following Object are statically stored and never flushed: // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary, // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull // the great number of these objects can easily be seen in Java Visual VM // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out. pdfDoc = null; clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); return result; }
From source file:org.apache.camel.component.fop.FopHelper.java
License:Apache License
public static void decryptPDFN(PDDocument document, String password) throws IOException, CryptographyException, BadSecurityHandlerException { if (document.isEncrypted()) { DecryptionMaterial decryptionMaterial = new StandardDecryptionMaterial(password); document.openProtection(decryptionMaterial); } else {//from w w w .j a v a2s.c om throw new RuntimeException("Document not encrypted"); } }
From source file:org.apache.camel.component.pdf.PdfAppendTest.java
License:Apache License
/**
 * Sends an encrypted one-page PDF plus decryption material through the
 * "direct:start" route and checks that the result has two pages containing both
 * the original and the appended text.
 */
@Test
public void testAppendEncrypted() throws Exception {
    final String originalText = "Test";
    final String textToAppend = "Append";
    final String ownerPass = "ownerPass";
    final String userPass = "userPass";

    // Build and protect the source PDF in memory; close it once its bytes are saved.
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    PDDocument document = new PDDocument();
    try {
        PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
        document.addPage(page);
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        contentStream.setFont(PDType1Font.HELVETICA, 12);
        contentStream.beginText();
        contentStream.moveTextPositionByAmount(20, 400);
        contentStream.drawString(originalText);
        contentStream.endText();
        contentStream.close();

        AccessPermission accessPermission = new AccessPermission();
        accessPermission.setCanExtractContent(false);
        StandardProtectionPolicy protectionPolicy =
                new StandardProtectionPolicy(ownerPass, userPass, accessPermission);
        protectionPolicy.setEncryptionKeyLength(128);
        document.protect(protectionPolicy);

        document.save(output);
    } finally {
        document.close();
    }

    // Encryption happens after saving.
    PDDocument encryptedDocument = PDDocument.load(new ByteArrayInputStream(output.toByteArray()));

    Map<String, Object> headers = new HashMap<String, Object>();
    headers.put(PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME, encryptedDocument);
    headers.put(PdfHeaderConstants.DECRYPTION_MATERIAL_HEADER_NAME, new StandardDecryptionMaterial(userPass));

    template.sendBodyAndHeaders("direct:start", textToAppend, headers);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(new Predicate() {
        @Override
        public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
                PDDocument doc = PDDocument
                        .load(new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
                try {
                    PDFTextStripper pdfTextStripper = new PDFTextStripper();
                    String text = pdfTextStripper.getText(doc);
                    assertEquals(2, doc.getNumberOfPages());
                    assertThat(text, containsString(originalText));
                    assertThat(text, containsString(textToAppend));
                } finally {
                    // the verification copy is ours to release
                    doc.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
    });
    resultEndpoint.assertIsSatisfied();
}
From source file:org.apache.camel.component.pdf.PdfTextExtractionTest.java
License:Apache License
@Test public void testExtractTextFromEncrypted() throws Exception { final String ownerPass = "ownerPass"; final String userPass = "userPass"; AccessPermission accessPermission = new AccessPermission(); accessPermission.setCanExtractContent(false); StandardProtectionPolicy protectionPolicy = new StandardProtectionPolicy(ownerPass, userPass, accessPermission);/*from w ww .j a v a2 s . com*/ protectionPolicy.setEncryptionKeyLength(128); PDDocument document = new PDDocument(); final String expectedText = "Test string"; PDPage page = new PDPage(PDPage.PAGE_SIZE_A4); document.addPage(page); PDPageContentStream contentStream = new PDPageContentStream(document, page); contentStream.setFont(PDType1Font.HELVETICA, 12); contentStream.beginText(); contentStream.moveTextPositionByAmount(20, 400); contentStream.drawString(expectedText); contentStream.endText(); contentStream.close(); document.protect(protectionPolicy); ByteArrayOutputStream output = new ByteArrayOutputStream(); document.save(output); // Encryption happens after saving. PDDocument encryptedDocument = PDDocument.load(new ByteArrayInputStream(output.toByteArray())); template.sendBodyAndHeader("direct:start", encryptedDocument, PdfHeaderConstants.DECRYPTION_MATERIAL_HEADER_NAME, new StandardDecryptionMaterial(userPass)); resultEndpoint.setExpectedMessageCount(1); resultEndpoint.expectedMessagesMatches(new Predicate() { @Override public boolean matches(Exchange exchange) { Object body = exchange.getIn().getBody(); assertThat(body, instanceOf(String.class)); assertThat((String) body, containsString(expectedText)); return true; } }); resultEndpoint.assertIsSatisfied(); document.isEncrypted(); }
From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }/*from w w w .j a v a 2 s .c om*/ synchronized (pdfBoxLockObj) { PDDocument document = null; try { document = PDDocument.load(in, null, force); if (document.isEncrypted() && params != null) { String password = params.get(ExtractData.PDF_PASSWORD); if (password == null) { password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); } if (password != null) { final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password); document.openProtection(sdm); final AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text."); } } } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(encoding); stripper.setForceParsing(force); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); final Thread task = new Thread(() -> { try { stripper.writeText(doc, output); } catch (final Exception e) { exceptionSet.add(e); } finally { done.set(true); } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } finally { if (document != null) { try { document.close(); } catch (final 
IOException e) { // NOP } } } } }