List of usage examples for the method org.apache.pdfbox.pdmodel.PDDocument.load
public static PDDocument load(byte[] input, String password) throws IOException
From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { PDFPureJavaParserConfig localConfig = context.get(PDFPureJavaParserConfig.class, defaultConfig); PDDocument pdfDocument = null;/*from w w w .j a va 2 s . c o m*/ String password = ""; try { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already //TODO: make this configurable via MemoryUsageSetting TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { // File based -- send file directly to PDFBox pdfDocument = PDDocument.load(tstream.getPath().toFile(), password); } else { pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); } metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted())); metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); extractMetadata(pdfDocument, metadata, context); AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser"); // No-ops. Do not support OCR parser. 
} else { if (localConfig.getOcrStrategy() .equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser"); } PDF2XHTMLPureJava.process(pdfDocument, handler, context, metadata, localConfig); } } } catch (InvalidPasswordException e) { metadata.set(PDF.IS_ENCRYPTED, "true"); throw new EncryptedDocumentException(e); } catch (final PdfTimeoutException e) { throw new TikaPdfTimeoutException("PdfTimeoutException", e); } finally { if (pdfDocument != null) { pdfDocument.close(); } } }
From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new CrawlerSystemException("The inputstream is null."); }//from w w w. j a v a 2 s .com synchronized (pdfBoxLockObj) { final String password = getPassword(params); try (PDDocument document = PDDocument.load(in, password)) { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); final Thread task = new Thread(() -> { try { stripper.writeText(doc, output); } catch (final Exception e) { exceptionSet.add(e); } finally { done.set(true); } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } } }
From source file:org.geomajas.plugin.printing.document.DefaultDocumentTest.java
License:Open Source License
@Test public void testToImage() throws Exception { testRender();//from w w w . jav a2 s . c o m PDDocument pdf = PDDocument.load(new File("target/test.pdf"), true); PDFRenderer renderer = new PDFRenderer(pdf); BufferedImage bufferedImage = renderer.renderImageWithDPI(0, 144); pdf.close(); ImageIO.write(bufferedImage, "PNG", new File("target/test.png")); }
From source file:org.geomajas.plugin.printing.document.SinglePageDocument.java
License:Open Source License
private void writeDocument(OutputStream outputStream, Format format, int dpi) throws IOException, DocumentException, PrintingException { if (format == Format.PDF) { baos.writeTo(outputStream);// www. ja va2s .c o m } else { PDDocument pdf = PDDocument.load(new ByteArrayInputStream(baos.toByteArray()), true); PDFRenderer renderer = new PDFRenderer(pdf); BufferedImage bufferedImage = renderer.renderImageWithDPI(0, dpi); pdf.close(); if (format == Format.PNG) { final String formatName = format.getExtension(); for (Iterator<ImageWriter> iw = ImageIO.getImageWritersByFormatName(formatName); iw.hasNext();) { ImageWriter writer1 = iw.next(); ImageWriteParam writeParam = writer1.getDefaultWriteParam(); ImageTypeSpecifier typeSpecifier = ImageTypeSpecifier .createFromBufferedImageType(BufferedImage.TYPE_INT_ARGB); IIOMetadata metadata = writer1.getDefaultImageMetadata(typeSpecifier, writeParam); if (metadata.isReadOnly() || !metadata.isStandardMetadataFormatSupported()) { continue; } setDPI(metadata); // Write bufferedImage to outputStream final ImageOutputStream stream = ImageIO.createImageOutputStream(outputStream); try { writer1.setOutput(stream); writer1.write(metadata, new IIOImage(bufferedImage, null, metadata), writeParam); } finally { stream.flush(); stream.close(); } break; } } else { ImageIO.write(bufferedImage, format.getExtension(), outputStream); } } }