List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(InputStream input, String password, MemoryUsageSetting memUsageSetting) throws IOException
From source file:de.redsix.pdfcompare.PdfComparator.java
License:Apache License
public T compare() throws IOException { try {//from w ww .j ava 2 s . c o m if (expectedStreamSupplier == null || actualStreamSupplier == null) { return compareResult; } try (final InputStream expectedStream = expectedStreamSupplier.get()) { try (final InputStream actualStream = actualStreamSupplier.get()) { try (PDDocument expectedDocument = PDDocument.load(expectedStream, expectedPassword, Utilities.getMemorySettings(Environment.getDocumentCacheSize()))) { try (PDDocument actualDocument = PDDocument.load(actualStream, actualPassword, Utilities.getMemorySettings(Environment.getDocumentCacheSize()))) { compare(expectedDocument, actualDocument); } } } catch (NoSuchFileException ex) { addSingleDocumentToResult(expectedStream, MISSING_RGB); compareResult.expectedOnly(); } } catch (NoSuchFileException ex) { try (final InputStream actualStream = actualStreamSupplier.get()) { addSingleDocumentToResult(actualStream, EXTRA_RGB); compareResult.actualOnly(); } catch (NoSuchFileException innerEx) { LOG.warn("No files found to compare. Tried Expected: '{}' and Actual: '{}'", ex.getFile(), innerEx.getFile()); compareResult.noPagesFound(); } } } finally { compareResult.done(); } return compareResult; }
From source file:mj.ocraptor.extraction.tika.parser.pdf.PDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null;//from ww w. jav a 2 s . c o m TemporaryResources tmp = new TemporaryResources(); // config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not // already TikaInputStream tstream = TikaInputStream.cast(stream); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer()); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } if (pdfDocument.isEncrypted()) { String password = null; // Did they supply a new style Password Provider? PasswordProvider passwordProvider = context.get(PasswordProvider.class); if (passwordProvider != null) { password = passwordProvider.getPassword(metadata); } // Fall back on the old style metadata if set if (password == null && metadata.get(PASSWORD) != null) { password = metadata.get(PASSWORD); } // If no password is given, use an empty string as the default if (password == null) { password = ""; } try { pdfDocument.decrypt(password); } catch (Exception e) { // Ignore } } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } catch (Exception e) { // TODO: logging e.printStackTrace(); } finally { if (pdfDocument != null) { pdfDocument.close(); } if (tmp != null) { tmp.dispose(); tmp.close(); } } handler.endDocument(); }
From source file:org.apache.tika.parser.pdf.PDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDDocument pdfDocument = null;//from w ww. j av a 2s. c o m TemporaryResources tmp = new TemporaryResources(); //config from context, or default if not set via context PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based, take that as a cue to use a temporary file RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); } } else { // Go for the normal, stream based in-memory parsing if (localConfig.getUseNonSequentialParser() == true) { pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); } } metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); //if using the classic parser and the doc is encrypted, we must manually decrypt if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { pdfDocument.decrypt(password); } metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata); } else { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (CryptographyException e) { //seq parser throws CryptographyException for bad password throw new EncryptedDocumentException(e); } catch (IOException e) { //nonseq parser throws IOException for bad password //At the Tika level, we want the same exception to be thrown if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) { metadata.set("pdf:encrypted", Boolean.toString(true)); throw new EncryptedDocumentException(e); } //rethrow any other IOExceptions throw e; } finally { if (pdfDocument != null) { pdfDocument.close(); } tmp.dispose(); //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) PDFont.clearResources(); } }
From source file:org.codelibs.robot.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }/* ww w. jav a 2s . co m*/ synchronized (pdfBoxLockObj) { PDDocument document = null; try { document = PDDocument.load(in, null, force); if (document.isEncrypted() && params != null) { String password = params.get(ExtractData.PDF_PASSWORD); if (password == null) { password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); } if (password != null) { final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password); document.openProtection(sdm); final AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text."); } } } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(encoding); stripper.setForceParsing(force); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); final Thread task = new Thread(() -> { try { stripper.writeText(doc, output); } catch (final Exception e) { exceptionSet.add(e); } finally { done.set(true); } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } finally { if (document != null) { try { document.close(); } catch (final IOException e) { // NOP } } } } }