Example usage for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException

Source Link

Document

Parses a PDF.

Usage

From source file:org.elacin.pdfextract.datasource.pdfbox.PDFBoxSource.java

License:Apache License

/**
 * Opens the given PDF file and, when it is encrypted and a password is supplied, decrypts it.
 * <p>
 * The file name is stored under the MDC key {@code "doc"} for the duration of the work; the key is
 * reset to the empty string when opening fails. An encrypted file without a password is returned
 * still encrypted after a warning has been logged.
 *
 * @param pdfFile  the PDF file to open
 * @param password the password used to decrypt the document, or {@code null} if none is available
 * @return the opened document; the caller is responsible for closing it
 * @throws RuntimeException if the file cannot be read or decryption fails
 */
@NotNull
protected static PDDocument openPdfDocument(@NotNull final File pdfFile, @Nullable final String password) {

    long t0 = System.currentTimeMillis();

    MDC.put("doc", pdfFile.getName());
    log.info("LOG00120:Opening PDF file " + pdfFile + ".");

    try {
        final PDDocument document = PDDocument.load(pdfFile);

        if (document.isEncrypted()) {
            if (password != null) {
                try {
                    document.decrypt(password);
                } catch (Exception e) {
                    // The document is never handed to the caller on this path, so release it here
                    // and reset the MDC key exactly like the IOException path below does.
                    MDC.put("doc", "");
                    try {
                        document.close();
                    } catch (IOException closeFailure) {
                        log.warn("Could not close " + pdfFile + " after failed decryption: "
                                + closeFailure.getMessage());
                    }
                    throw new RuntimeException("Error while reading encrypted PDF:", e);
                }
            } else {
                log.warn("File claims to be encrypted, a password should be provided");
            }
        }

        log.debug("load()took" + (System.currentTimeMillis() - t0) + "ms");

        return document;
    } catch (IOException e) {
        MDC.put("doc", "");

        throw new RuntimeException("Error while reading " + pdfFile + ".", e);
    }
}

From source file:org.esteco.jira.pdf.AddImageToPDF.java

License:Apache License

/**
 * Adds an image to the first page of an existing PDF document and saves the result.
 * <p>
 * The image is appended to the page's existing content, drawn at position (20, 20) and scaled to
 * 40% of its pixel size.
 *
 * @param inputFile  The input PDF to add the image to.
 * @param imagePath  The filename of the image to put in the PDF.
 * @param outputFile The file to write the resulting PDF to.
 * @throws IOException If there is an error reading or writing the data.
 */
public void createPDFFromImage(String inputFile, String imagePath, String outputFile) throws IOException {
    PDDocument doc = null;
    try {
        doc = PDDocument.load(new File(inputFile));

        // The image is added to the first page.
        PDPage page = doc.getPage(0);

        // createFromFile is the easiest way with an image file
        // if you already have the image in a BufferedImage,
        // call LosslessFactory.createFromImage() instead
        PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc);
        PDPageContentStream contentStream = new PDPageContentStream(doc, page, AppendMode.APPEND, true, true);
        try {
            // scaling approach inspired by http://stackoverflow.com/a/22318681/535646
            // reduce this value if the image is too large
            float scale = 0.4f;
            contentStream.drawImage(pdImage, 20, 20, pdImage.getWidth() * scale, pdImage.getHeight() * scale);
        } finally {
            // Close the stream even when drawing fails so it is not leaked.
            contentStream.close();
        }

        doc.save(outputFile);
    } finally {
        if (doc != null) {
            doc.close();
        }
    }
}

From source file:org.example.extractimagesfrompdfpages.ExtractImagesFromPDFPagesMain.java

/**
 * Extracts every image XObject of a PDF into PNG files.
 * <p>
 * For the PDF given as the first argument, images of page {@code i} are written to
 * {@code <pdf directory>/temp_images/<i>/<j>.png}, where {@code j} counts the images on that page
 * starting at 1. A page folder is only created when the page has at least one image. The path of
 * each written file is printed to standard output.
 *
 * @param args {@code args[0]} is the path of the PDF file to process
 * @throws IllegalArgumentException if no PDF path is given
 * @throws RuntimeException         wrapping any {@link IOException} raised while reading or writing
 */
public static void main(String[] args) {
    if (args.length < 1) {
        throw new IllegalArgumentException("Usage: ExtractImagesFromPDFPagesMain <pdf-file>");
    }
    try {
        File thePDFFile = new File(args[0]);
        // Resolve against the working directory first: getParentFile() is null for a bare
        // relative name such as "doc.pdf".
        File thePDFFileDirectory = thePDFFile.getAbsoluteFile().getParentFile();
        PDDocument document = PDDocument.load(thePDFFile);
        try {
            int i = 0;
            for (PDPage page : document.getPages()) {
                i++;
                PDResources pdResources = page.getResources();
                if (pdResources == null) {
                    // A page without a resource dictionary cannot reference any image.
                    continue;
                }
                File thePDFPageFolder = new File(thePDFFileDirectory.getAbsolutePath() + "/temp_images" + "/" + i);
                boolean alreadyCreatedFolderForThisPage = false;
                int j = 1;
                for (COSName c : pdResources.getXObjectNames()) {
                    PDXObject o = pdResources.getXObject(c);
                    if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                        if (!alreadyCreatedFolderForThisPage) {
                            thePDFPageFolder.mkdirs();
                            alreadyCreatedFolderForThisPage = true;
                        }

                        File file = new File(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png");
                        ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(),
                                "png", file);
                        System.out.println(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png");

                        j++;
                    }
                }
            }
        } finally {
            // Always release the document, whether extraction succeeded or not.
            document.close();
        }
    } catch (IOException ex) {
        Logger.getLogger(ExtractImagesFromPDFPagesMain.class.getName()).log(Level.SEVERE, null, ex);
        throw new RuntimeException(ex);
    }

}

From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java

License:Open Source License

/**
 * Extracts the plain text of a PDF document, covering all of its pages.
 * <p>
 * The extraction runs as a privileged action. Once the stream has been checked for null it is
 * always closed before the action finishes; failures while closing the document or the stream are
 * only reported at trace level.
 * <p>
 * NOTE(review): an exception raised inside the action is rethrown unchanged only when it is an
 * {@link IOException} or a {@link RuntimeException}; anything else is wrapped in a
 * {@link RuntimeException}. Whether that includes {@code DocumentReadException} depends on its
 * class hierarchy - confirm.
 *
 * @param is an input stream with .pdf file content.
 * @return the text of the document, or an empty string when the stream reports no available bytes.
 * @throws IOException if reading the document or extracting its text fails.
 * @throws IllegalArgumentException if {@code is} is null.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {

    final PrivilegedExceptionAction<String> extraction = new PrivilegedExceptionAction<String>() {
        public String run() throws Exception {
            if (is == null) {
                throw new IllegalArgumentException("InputStream is null.");
            }

            PDDocument document = null;
            try {
                if (is.available() == 0) {
                    return "";
                }

                try {
                    document = PDDocument.load(is);
                } catch (IOException loadFailure) {
                    throw new DocumentReadException("Can not load PDF document.", loadFailure);
                }

                final StringWriter text = new StringWriter();
                final PDFTextStripper stripper = new PDFTextStripper();
                stripper.setStartPage(1);
                stripper.setEndPage(Integer.MAX_VALUE);
                stripper.writeText(document, text);
                return text.toString();
            } finally {
                // Best-effort cleanup: the document first, then the stream it was read from.
                if (document != null) {
                    try {
                        document.close();
                    } catch (IOException closeFailure) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("An exception occurred: " + closeFailure.getMessage());
                        }
                    }
                }
                try {
                    is.close();
                } catch (IOException closeFailure) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("An exception occurred: " + closeFailure.getMessage());
                    }
                }
            }
        }
    };

    try {
        return SecurityHelper.doPrivilegedExceptionAction(extraction);
    } catch (PrivilegedActionException pae) {
        final Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        }
        if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        }
        throw new RuntimeException(cause);
    }

}

From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java

License:Open Source License

/**
 * Reads descriptive properties (title, description, creator, dates, ...) from a PDF document.
 * <p>
 * The XMP metadata of the document catalog is consulted first (Dublin Core, PDF and Basic
 * schemas). Only when that yields no property at all is the document information dictionary used
 * instead. A failure while reading a single value is logged as a warning and does not abort the
 * extraction. Several sources are stored under the same key (for example {@code DCMetaData.DATE});
 * a later {@code put} replaces the earlier value.
 * <p>
 * The work runs as a privileged action. Once the stream has been checked for null it is always
 * closed before the action finishes, including when the document cannot be loaded.
 *
 * @param is an input stream with .pdf file content.
 * @return the properties found; empty when the document carries none.
 * @throws IOException if the document cannot be loaded or closed.
 * @throws IllegalArgumentException if {@code is} is null.
 */
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }

                PDDocument pdDocument = null;
                Properties props = new Properties();
                try {
                    // Load inside the try block so the stream is closed even when parsing fails.
                    pdDocument = PDDocument.load(is);

                    if (pdDocument.isEncrypted()) {
                        try {
                            // Documents protected only by an owner password open with an empty user password.
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }

                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();

                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null)
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null)
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getCreators() != null) {
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getDates() != null) {
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }

                            try {
                                if (pdf.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            try {
                                if (basic.getCreateDate() != null)
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (basic.getModifyDate() != null)
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }

                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }

                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();

                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null)
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null)
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {

                                if (docInfo.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getModificationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null)
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null)
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }

                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    try {
                        if (pdDocument != null) {
                            pdDocument.close();
                        }
                    } finally {
                        // Runs even when closing the document throws, so the stream is never leaked.
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });

    } catch (PrivilegedActionException pae) {
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}

From source file:org.fit.cssbox.demo.PdfBoxBrowser.java

License:Open Source License

/**
 * Parses a PDF document from the given stream.
 *
 * @param is the stream to read the PDF from
 * @return the parsed document, as returned by {@code PDDocument.load}
 * @throws IOException if {@code PDDocument.load} fails
 */
protected PDDocument loadPdf(InputStream is) throws IOException {
    return PDDocument.load(is);
}

From source file:org.fit.pdfdom.PDFToHTML.java

License:Open Source License

/**
 * Converts a PDF file to HTML.
 * <p>
 * Usage: {@code PDFToHTML <infile> [<outfile>]}. When no output file is given, its name is derived
 * from the input name by replacing a trailing ".pdf" (any case) with ".html", or by appending
 * ".html" when there is no such suffix. Errors are reported on standard error.
 *
 * @param args the input file and, optionally, the output file
 */
public static void main(String[] args) {

    if (args.length < 1) {
        System.out.println("Usage: PDFToHTML <infile> [<outfile>]");
        System.exit(1);
    }

    String infile = args[0];
    String outfile;
    if (args.length > 1)
        outfile = args[1];
    else {
        String base = args[0];
        if (base.toLowerCase().endsWith(".pdf"))
            base = base.substring(0, base.length() - 4);
        outfile = base + ".html";
    }

    PDDocument document = null;
    Writer output = null;
    try {
        document = PDDocument.load(new File(infile));
        PDFDomTree parser = new PDFDomTree();
        output = new PrintWriter(outfile, "utf-8");
        parser.writeText(document, output);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
    } finally {
        // Close the writer here so it is flushed and released even when the conversion fails.
        if (output != null) {
            try {
                output.close();
            } catch (IOException e) {
                System.err.println("Error: " + e.getMessage());
            }
        }
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                System.err.println("Error: " + e.getMessage());
            }
        }
    }

}

From source file:org.frameworkset.http.converter.wordpdf.Word2PDFResponse.java

License:Apache License

/**
 * Streams the given PDF file to the HTTP response for inline display.
 * <p>
 * The response content type is set to {@code application/pdf} and the {@code Content-Disposition}
 * header carries the file name as processed by {@code handleCNName}. Any failure is rethrown as an
 * {@code HttpMessageNotWritableException}; failures while closing the output stream or the
 * document are only printed.
 *
 * @param outputMessage provides the servlet response to write to
 * @param inputMessage  provides the servlet request passed to {@code handleCNName}
 * @param file          the PDF file to send
 */
protected void render_(HttpOutputMessage outputMessage, HttpInputMessage inputMessage, File file) {
    PDDocument pdf = null;
    OutputStream body = null;
    try {
        pdf = PDDocument.load(file);

        final HttpServletResponse response = outputMessage.getResponse();
        response.setContentType("application/pdf");

        final String disposition =
                "inline; filename=" + handleCNName(file.getName(), inputMessage.getServletRequest());
        response.setHeader("Content-Disposition", disposition);

        body = response.getOutputStream();
        pdf.save(body);
        body.flush();
    } catch (Exception e) {
        throw new HttpMessageNotWritableException(this.getPdfFile(), e);
    } finally {
        // Release the response stream first, then the document.
        if (body != null) {
            try {
                body.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:org.freeeed.ocr.ImageTextParser.java

License:Apache License

/**
 * Splits a PDF into separate documents and saves each one to its own file.
 * <p>
 * The documents produced by the splitter are saved as {@code <pagePath><index>.pdf}, where
 * {@code pagePath} is the value returned by {@code createTempPath(file)} and {@code index} starts
 * at 0.
 *
 * @param filePath path of the PDF file to split
 * @return the path prefix under which the split files were written
 * @throws IOException if loading, splitting, saving or closing a document fails
 */
private static String splitPages(String filePath) throws IOException {
    File file = new File(filePath);
    String pagePath;
    try (PDDocument document = PDDocument.load(file)) {
        Splitter splitter = new Splitter();
        List<PDDocument> pages = splitter.split(document);
        Iterator<PDDocument> remaining = pages.iterator();
        try {
            pagePath = createTempPath(file);
            LOGGER.debug("pagePath = " + pagePath);
            int i = 0;
            while (remaining.hasNext()) {
                // try-with-resources closes the page document even when saving it fails.
                try (PDDocument pd = remaining.next()) {
                    pd.save(pagePath + i++ + ".pdf");
                }
            }
        } finally {
            // Only non-empty after a failure: release the documents that were never reached.
            while (remaining.hasNext()) {
                try {
                    remaining.next().close();
                } catch (IOException e) {
                    LOGGER.debug("Failed to close split page document: " + e.getMessage());
                }
            }
        }
    }
    return pagePath;
}

From source file:org.freeeed.ocr.PDFImageExtractor.java

License:Apache License

/**
 * Extracts the images embedded in the PDF {@code file} into the configured extraction directory.
 * <p>
 * Each image is written under a unique name built from {@code conf.getPdfImageExtractionDir()} and
 * {@code OCRUtil.createUniqueFileName("image")}. Extraction stops once the project's
 * {@code getOcrMaxImagesPerPDF()} limit has been reached. An {@link IOException} is printed and the
 * images extracted so far are returned.
 *
 * @return the paths of the written image files, including their suffix
 */
@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();

    List<String> result = new ArrayList<String>();

    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        List pages = document.getDocumentCatalog().getAllPages();
        Iterator iter = pages.iterator();
        int i = 1;
        int maxNumberOfImages = Project.getCurrentProject().getOcrMaxImagesPerPDF();

        while (iter.hasNext()) {
            PDPage page = (PDPage) iter.next();
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without resources has no images to extract.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages != null) {
                Iterator imageIter = pageImages.keySet().iterator();
                while (imageIter.hasNext()) {
                    if (i > maxNumberOfImages) {
                        return result;
                    }

                    String key = (String) imageIter.next();
                    PDXObjectImage image = (PDXObjectImage) pageImages.get(key);

                    String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                    image.write2file(fileName);

                    result.add(fileName + "." + image.getSuffix());

                    i++;
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Release the document on every path, including the early return at the image limit.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    return result;
}