Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of the load method of org.apache.pdfbox.pdmodel.PDDocument.

Prototype

public static PDDocument load(byte[] input, String password) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:org.apache.tika.parser.pdf.PDFPureJavaParser.java

License:Apache License

/**
 * Loads a PDF with PDFBox, records its metadata and, when a handler is given,
 * emits its content through that handler.
 *
 * @param stream   source of the PDF bytes
 * @param handler  receiver of the extracted content; when {@code null} only metadata is populated
 * @param metadata metadata object that is filled in while parsing
 * @param context  parse context; supplies the parser configuration (falling back to
 *                 {@code defaultConfig}) and is passed on to the password lookup
 * @throws IOException   propagated from loading, processing or closing the document
 * @throws SAXException  declared for the content-handling helpers invoked here
 * @throws TikaException {@link EncryptedDocumentException} when PDFBox rejects the password,
 *                       {@link TikaPdfTimeoutException} when a {@code PdfTimeoutException} is raised
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {

    final PDFPureJavaParserConfig config = context.get(PDFPureJavaParserConfig.class, defaultConfig);

    PDDocument document = null;
    try {
        // PDFBox either works fully in memory or spills unpacked/processed resources
        // to a temp file; the choice here follows whether the input is already file backed.
        //TODO: make this configurable via MemoryUsageSetting
        final TikaInputStream tikaStream = TikaInputStream.cast(stream);
        final String password = getPassword(metadata, context);
        final boolean fileBacked = tikaStream != null && tikaStream.hasFile();
        if (fileBacked) {
            // Hand the underlying file straight to PDFBox.
            document = PDDocument.load(tikaStream.getPath().toFile(), password);
        } else {
            document = PDDocument.load(new CloseShieldInputStream(stream), password);
        }

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(document, metadata, context);
        config.getAccessChecker().check(metadata);

        if (handler == null) {
            // Metadata-only request; the finally block still closes the document.
            return;
        }

        if (shouldHandleXFAOnly(document, config)) {
            handleXFAOnly(document, handler, metadata, context);
            return;
        }

        final boolean ocrOnly = config.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY);
        if (ocrOnly) {
            // OCR is not supported by this parser: only the marker is recorded, no content is emitted.
            metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
        } else {
            final boolean ocrAndText = config.getOcrStrategy()
                    .equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
            if (ocrAndText) {
                metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
            }
            PDF2XHTMLPureJava.process(document, handler, context, metadata, config);
        }
    } catch (InvalidPasswordException e) {
        // A rejected password implies the document is encrypted.
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    } catch (final PdfTimeoutException e) {
        throw new TikaPdfTimeoutException("PdfTimeoutException", e);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}

From source file:org.codelibs.fess.crawler.extractor.impl.PdfExtractor.java

License:Apache License

/**
 * Extracts the text and metadata of a PDF read from {@code in}.
 *
 * <p>The whole extraction runs while holding {@code pdfBoxLockObj}. Text stripping is
 * executed on a daemon thread so that it can be abandoned when it does not finish
 * within {@code timeout}.
 *
 * @param in     stream containing the PDF; must not be {@code null}
 * @param params parameters passed to the password lookup
 * @return the extracted text together with the document metadata
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException       if stripping times out, the calling thread is interrupted,
 *                                or loading/stripping fails for any other reason
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    synchronized (pdfBoxLockObj) {
        final String password = getPassword(params);
        try (PDDocument document = PDDocument.load(in, password)) {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final Writer output = new OutputStreamWriter(baos, encoding);
            final PDFTextStripper stripper = new PDFTextStripper();
            final AtomicBoolean done = new AtomicBoolean(false);
            final Set<Exception> exceptionSet = new HashSet<>();
            // The resource variable is implicitly final, so the lambda can capture it directly.
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(document, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            });
            task.setDaemon(true);
            task.start();
            task.join(timeout);
            if (!done.get()) {
                // Keep interrupting the worker (up to 100 times, 50 ms apart) until it stops.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(50);
                }
                // NOTE(review): Thread.join(long) takes milliseconds while this message says
                // "sec" -- confirm the unit of the timeout field.
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec.");
            } else if (!exceptionSet.isEmpty()) {
                // Wrap the worker failure here so it is reported as the direct cause.
                throw new ExtractException(exceptionSet.iterator().next());
            }
            output.flush();
            final ExtractData extractData = new ExtractData(baos.toString(encoding));
            extractMetadata(document, extractData);
            return extractData;
        } catch (final ExtractException e) {
            // Already the right type (e.g. the timeout above): do not bury it in another wrapper.
            throw e;
        } catch (final InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        }
    }
}

From source file:org.geomajas.plugin.printing.document.DefaultDocumentTest.java

License:Open Source License

/**
 * Renders the first page of {@code target/test.pdf} at 144 DPI and writes it to
 * {@code target/test.png}.
 *
 * @throws Exception if rendering the document or reading/writing the files fails
 */
@Test
public void testToImage() throws Exception {
    // Presumably produces target/test.pdf, which is read below -- confirm against testRender().
    testRender();
    final BufferedImage bufferedImage;
    // try-with-resources closes the document even when rendering throws.
    try (PDDocument pdf = PDDocument.load(new File("target/test.pdf"), true)) {
        bufferedImage = new PDFRenderer(pdf).renderImageWithDPI(0, 144);
    }
    ImageIO.write(bufferedImage, "PNG", new File("target/test.png"));
}

From source file:org.geomajas.plugin.printing.document.SinglePageDocument.java

License:Open Source License

/**
 * Writes the document held in {@code baos} to {@code outputStream} in the requested format.
 *
 * <p>For {@code Format.PDF} the buffered bytes are copied unchanged. For any other format
 * the first page is rendered to an image at {@code dpi} and then encoded. PNG output goes
 * through the first image writer whose default metadata is writable and supports the
 * standard metadata format; if no writer qualifies, nothing is written.
 *
 * @param outputStream destination of the encoded document
 * @param format       target format
 * @param dpi          resolution used when rendering the page to an image
 * @throws IOException        if reading the PDF, rendering or writing fails
 * @throws DocumentException  declared by the original signature
 * @throws PrintingException  declared by the original signature
 */
private void writeDocument(OutputStream outputStream, Format format, int dpi)
        throws IOException, DocumentException, PrintingException {
    if (format == Format.PDF) {
        baos.writeTo(outputStream);
    } else {
        final BufferedImage bufferedImage;
        // try-with-resources closes the document even when rendering throws.
        try (PDDocument pdf = PDDocument.load(new ByteArrayInputStream(baos.toByteArray()), true)) {
            bufferedImage = new PDFRenderer(pdf).renderImageWithDPI(0, dpi);
        }
        if (format == Format.PNG) {
            final String formatName = format.getExtension();
            for (Iterator<ImageWriter> iw = ImageIO.getImageWritersByFormatName(formatName); iw.hasNext();) {
                ImageWriter writer1 = iw.next();
                ImageWriteParam writeParam = writer1.getDefaultWriteParam();
                ImageTypeSpecifier typeSpecifier = ImageTypeSpecifier
                        .createFromBufferedImageType(BufferedImage.TYPE_INT_ARGB);
                IIOMetadata metadata = writer1.getDefaultImageMetadata(typeSpecifier, writeParam);
                if (metadata.isReadOnly() || !metadata.isStandardMetadataFormatSupported()) {
                    // This writer cannot take the metadata changes; try the next one.
                    continue;
                }

                // Presumably stores the resolution in the image metadata -- confirm against setDPI.
                setDPI(metadata);
                // Write bufferedImage to outputStream
                final ImageOutputStream stream = ImageIO.createImageOutputStream(outputStream);
                try {
                    writer1.setOutput(stream);
                    writer1.write(metadata, new IIOImage(bufferedImage, null, metadata), writeParam);
                } finally {
                    // Nested so that a failing flush can no longer skip the close.
                    try {
                        stream.flush();
                    } finally {
                        stream.close();
                    }
                }
                // Only the first suitable writer is used.
                break;
            }
        } else {
            ImageIO.write(bufferedImage, format.getExtension(), outputStream);
        }

    }
}