Example usage for org.apache.pdfbox.pdmodel PDDocument load

List of usage examples for org.apache.pdfbox.pdmodel PDDocument load

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel PDDocument load.

Prototype

public static PDDocument load(byte[] input) throws IOException 

Source Link

Document

Parses a PDF.

Usage

From source file:org.elacin.pdfextract.datasource.pdfbox.PDFBoxSource.java

License:Apache License

/**
 * Opens the given PDF file and, if it is encrypted and a password was supplied,
 * decrypts it.
 *
 * <p>The file name is stored under the MDC key {@code "doc"} before loading. The key
 * is reset to the empty string whenever this method fails, so a failed open does not
 * leave a stale document name in the logging context.
 *
 * @param pdfFile  the PDF file to open
 * @param password the password used when the document reports itself as encrypted;
 *                 when {@code null} an encrypted document is returned undecrypted
 *                 and a warning is logged
 * @return the loaded document; the caller is responsible for closing it
 * @throws RuntimeException if the file cannot be read or decryption fails
 */
@NotNull
protected static PDDocument openPdfDocument(@NotNull final File pdfFile, @Nullable final String password) {

    long t0 = System.currentTimeMillis();

    MDC.put("doc", pdfFile.getName());
    log.info("LOG00120:Opening PDF file " + pdfFile + ".");

    try {
        final PDDocument document = PDDocument.load(pdfFile);

        if (document.isEncrypted()) {
            if (password != null) {
                try {
                    document.decrypt(password);
                } catch (Exception e) {
                    // The document never reaches the caller on this path, so it has to be
                    // released here; otherwise the underlying file handle leaks.
                    try {
                        document.close();
                    } catch (IOException closeFailure) {
                        log.warn("Could not close " + pdfFile + " after failed decryption: " + closeFailure);
                    }
                    // Same cleanup as the IOException path below.
                    MDC.put("doc", "");

                    throw new RuntimeException("Error while reading encrypted PDF:", e);
                }
            } else {
                log.warn("File claims to be encrypted, a password should be provided");
            }
        }

        log.debug("load()took" + (System.currentTimeMillis() - t0) + "ms");

        return document;
    } catch (IOException e) {
        MDC.put("doc", "");

        throw new RuntimeException("Error while reading " + pdfFile + ".", e);
    }
}

From source file:org.esteco.jira.pdf.AddImageToPDF.java

License:Apache License

/**
 * Add an image to the first page of an existing PDF document and save the result.
 *
 * <p>The image is drawn at position (20, 20) and scaled to 40% of its pixel size.
 * Both the document and the page content stream are closed even when drawing or
 * saving fails.
 *
 * @param inputFile  The input PDF to add the image to.
 * @param imagePath  The filename of the image to put in the PDF.
 * @param outputFile The file to write the resulting PDF to.
 * @throws IOException If there is an error reading the input or writing the data.
 */
public void createPDFFromImage(String inputFile, String imagePath, String outputFile) throws IOException {
    try (PDDocument doc = PDDocument.load(new File(inputFile))) {
        // we will add the image to the first page.
        PDPage page = doc.getPage(0);

        // createFromFile is the easiest way with an image file
        // if you already have the image in a BufferedImage,
        // call LosslessFactory.createFromImage() instead
        PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc);

        // better method inspired by http://stackoverflow.com/a/22318681/535646
        // reduce this value if the image is too large
        float scale = 0.4f;

        // try-with-resources: the stream must be closed before save(), and must not
        // be left open if drawImage() throws.
        try (PDPageContentStream contentStream =
                new PDPageContentStream(doc, page, AppendMode.APPEND, true, true)) {
            contentStream.drawImage(pdImage, 20, 20, pdImage.getWidth() * scale, pdImage.getHeight() * scale);
        }

        doc.save(outputFile);
    }
}

From source file:org.example.extractimagesfrompdfpages.ExtractImagesFromPDFPagesMain.java

/**
 * Writes every image XObject found in the page resources of a PDF to PNG files.
 *
 * <p>Images of page {@code i} are written to
 * {@code <pdf directory>/temp_images/<i>/<j>.png}, where both counters start at 1.
 * A page folder is only created once the first image of that page is found. The path
 * of each written image is printed to standard output.
 *
 * @param args {@code args[0]} is the path of the PDF file to read
 * @throws IllegalArgumentException if no file path is given
 * @throws RuntimeException if the PDF cannot be read or an image cannot be written
 */
public static void main(String[] args) {
    if (args.length < 1) {
        throw new IllegalArgumentException("Expected the path of the PDF file as the first argument");
    }

    File thePDFFile = new File(args[0]);
    // Resolve to an absolute file first: getParentFile() returns null for a bare
    // relative name such as "a.pdf", which previously caused a NullPointerException.
    File thePDFFileDirectory = thePDFFile.getAbsoluteFile().getParentFile();

    // try-with-resources so the document is released on success and on failure.
    try (PDDocument document = PDDocument.load(thePDFFile)) {
        int i = 1;
        for (PDPage page : document.getPages()) {
            File thePDFPageFolder = new File(thePDFFileDirectory.getAbsolutePath() + "/temp_images" + "/" + i);
            i++;

            PDResources pdResources = page.getResources();
            if (pdResources == null) {
                // Page without a resource dictionary: nothing to extract.
                continue;
            }

            boolean alreadyCreatedFolderForThisPage = false;
            int j = 1;
            for (COSName c : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(c);
                if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                    if (!alreadyCreatedFolderForThisPage) {
                        thePDFPageFolder.mkdirs();
                        alreadyCreatedFolderForThisPage = true;
                    }

                    File file = new File(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png");
                    ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(),
                            "png", file);
                    System.out.println(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png");

                    j++;
                }
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(ExtractImagesFromPDFPagesMain.class.getName()).log(Level.SEVERE, null, ex);
        throw new RuntimeException(ex);
    }

}

From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java

License:Open Source License

/**
 * Returns only the text of a PDF file's content.
 *
 * <p>The extraction runs inside a privileged action. The supplied stream is always
 * closed before this method returns, whether extraction succeeds or fails.
 *
 * @param is an input stream with .pdf file content.
 * @return the text extracted from all pages, or an empty string when the stream
 *         reports no available bytes.
 * @throws IOException if reading the stream or extracting the text fails.
 * @throws DocumentReadException if the PDF document cannot be loaded.
 * @throws IllegalArgumentException if {@code is} is {@code null}.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {

    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<String>() {
            public String run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }
                PDDocument pdDocument = null;
                StringWriter sw = new StringWriter();
                try {
                    // NOTE(review): available() == 0 is treated as "empty file"; a stream
                    // that merely has no bytes buffered yet would also yield "" - confirm
                    // callers only pass file-backed or in-memory streams.
                    if (is.available() == 0) {
                        return "";
                    }

                    try {
                        pdDocument = PDDocument.load(is);
                    } catch (IOException e) {
                        throw new DocumentReadException("Can not load PDF document.", e);
                    }

                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.setStartPage(1);
                    stripper.setEndPage(Integer.MAX_VALUE);
                    stripper.writeText(pdDocument, sw);
                } finally {
                    // Close failures are deliberately not propagated so they cannot mask
                    // the result or the primary exception; they are only traced.
                    if (pdDocument != null) {
                        try {
                            pdDocument.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                    try {
                        is.close();
                    } catch (IOException e) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("An exception occurred: " + e.getMessage());
                        }
                    }
                }
                return sw.toString();
            }
        });

    } catch (PrivilegedActionException pae) {
        // Unwrap the exception thrown inside run() and rethrow it with its own type.
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof DocumentReadException) {
            // Declared by this method; previously it fell through to the generic
            // RuntimeException wrapper below, so callers could not catch it by type.
            throw (DocumentReadException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }

}

From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java

License:Open Source License

/**
 * Reads Dublin Core style properties from a PDF document.
 *
 * <p>XMP metadata (Dublin Core, PDF and Basic schemas) is read first. Only when that
 * yields no property at all is the document information dictionary used instead.
 * A failure to read one individual field is logged as a warning and does not abort
 * the extraction. The extraction runs inside a privileged action, and the supplied
 * stream is always closed before this method returns.
 *
 * @param is an input stream with .pdf file content.
 * @return the properties found; empty when the document carries none.
 * @throws IOException if the document cannot be loaded, read or closed.
 * @throws DocumentReadException if the document is encrypted and cannot be decrypted
 *         with an empty password.
 * @throws IllegalArgumentException if {@code is} is {@code null}.
 */
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }

                PDDocument pdDocument = null;
                Properties props = new Properties();
                try {
                    // Loading happens inside the try so the stream is still closed by the
                    // finally block when the PDF cannot be parsed.
                    pdDocument = PDDocument.load(is);

                    if (pdDocument.isEncrypted()) {
                        try {
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }

                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();

                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null) {
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                                }
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null) {
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                                }
                            } catch (Exception e) {
                                // Message now names the call that actually failed.
                                LOG.warn("getDescription failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getCreators() != null) {
                                    // Single-valued property: the last creator wins.
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }

                            try {
                                if (dc.getDates() != null) {
                                    // Single-valued property: the last date wins.
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null) {
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                                }
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }

                            try {
                                if (pdf.getProducer() != null) {
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                                }
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            try {
                                if (basic.getCreateDate() != null) {
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                // Written after the creation date, so it overrides it.
                                if (basic.getModifyDate() != null) {
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                                }
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }

                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }

                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();

                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null) {
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                                }
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null) {
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null) {
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getKeywords() != null) {
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                                }
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                // Written after the creation date, so it overrides it.
                                if (docInfo.getModificationDate() != null) {
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                                }
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null) {
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                                }
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null) {
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                                }
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null) {
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                                }
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }

                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    try {
                        if (pdDocument != null) {
                            pdDocument.close();
                        }
                    } finally {
                        // Nested finally: the stream is closed even when closing the
                        // document itself throws.
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });

    } catch (PrivilegedActionException pae) {
        // Unwrap the exception thrown inside run() and rethrow it with its own type.
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof DocumentReadException) {
            // Declared by this method; previously it fell through to the generic
            // RuntimeException wrapper below, so callers could not catch it by type.
            throw (DocumentReadException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}

From source file:org.fit.cssbox.demo.PdfBoxBrowser.java

License:Open Source License

/**
 * Loads a PDF document from the given stream.
 *
 * @param is the stream to read the PDF from
 * @return the document produced by {@code PDDocument.load(is)}
 * @throws IOException if loading the document fails
 */
protected PDDocument loadPdf(InputStream is) throws IOException {
    return PDDocument.load(is);
}

From source file:org.fit.pdfdom.PDFToHTML.java

License:Open Source License

/**
 * Command-line entry point: converts a PDF file to HTML.
 *
 * <p>Usage: {@code PDFToHTML <infile> [<outfile>]}. When no output file is given, its
 * name is derived from the input name by replacing a trailing ".pdf" (any case) with
 * ".html", or by appending ".html" when there is no such suffix. Conversion errors are
 * reported on standard error.
 *
 * @param args the input file and, optionally, the output file
 */
public static void main(String[] args) {

    if (args.length < 1) {
        System.out.println("Usage: PDFToHTML <infile> [<outfile>]");
        System.exit(1);
    }

    String infile = args[0];
    String outfile;
    if (args.length > 1) {
        outfile = args[1];
    } else {
        String base = args[0];
        if (base.toLowerCase().endsWith(".pdf")) {
            base = base.substring(0, base.length() - 4);
        }
        outfile = base + ".html";
    }

    PDDocument document = null;
    try {
        document = PDDocument.load(new File(infile));
        PDFDomTree parser = new PDFDomTree();
        //parser.setDisableImageData(true);
        // try-with-resources: the writer is flushed and closed even when the
        // conversion throws, so the output file handle is never leaked.
        try (Writer output = new PrintWriter(outfile, "utf-8")) {
            parser.writeText(document, output);
        }
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                System.err.println("Error: " + e.getMessage());
                //e.printStackTrace();
            }
        }
    }

}

From source file:org.frameworkset.http.converter.wordpdf.Word2PDFResponse.java

License:Apache License

/**
 * Streams the given PDF file to the HTTP response as inline content.
 *
 * <p>The file is loaded, the response content type is set to "application/pdf", a
 * "Content-Disposition" header is set from the file name, and the document is saved
 * to the response output stream. The output stream and the document are both closed
 * afterwards; failures while closing are only printed.
 *
 * @param outputMessage supplies the servlet response to write to
 * @param inputMessage  supplies the servlet request passed to {@code handleCNName}
 * @param file          the PDF file to send
 * @throws HttpMessageNotWritableException if loading or writing the document fails
 */
protected void render_(HttpOutputMessage outputMessage, HttpInputMessage inputMessage, File file) {
    OutputStream responseStream = null;
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(file);

        HttpServletResponse response = outputMessage.getResponse();
        response.setContentType("application/pdf");

        String disposition = "inline; filename="
                + handleCNName(file.getName(), inputMessage.getServletRequest());
        response.setHeader("Content-Disposition", disposition);

        responseStream = response.getOutputStream();
        pdf.save(responseStream);
        responseStream.flush();
    } catch (Exception e) {
        throw new HttpMessageNotWritableException(this.getPdfFile(), e);
    } finally {
        // The stream is closed before the document, as in the original ordering.
        if (responseStream != null) {
            try {
                responseStream.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

From source file:org.freeeed.ocr.ImageTextParser.java

License:Apache License

/**
 * Splits a PDF into the documents produced by {@code Splitter} and saves each one as
 * its own file.
 *
 * <p>The parts are written to {@code <pagePath><index>.pdf}, where {@code pagePath}
 * comes from {@code createTempPath(file)} and {@code index} starts at 0. Every split
 * document is closed, including those not yet saved when an error interrupts the loop.
 *
 * @param filePath path of the PDF file to split
 * @return the path prefix under which the part files were written
 * @throws IOException if the PDF cannot be loaded or split, or a part cannot be saved
 */
private static String splitPages(String filePath) throws IOException {
    File file = new File(filePath);
    String pagePath;
    try (PDDocument document = PDDocument.load(file)) {
        Splitter splitter = new Splitter();
        List<PDDocument> pages = splitter.split(document);
        Iterator<PDDocument> iterator = pages.iterator();
        try {
            int i = 0;
            pagePath = createTempPath(file);
            LOGGER.debug("pagePath = " + pagePath);
            while (iterator.hasNext()) {
                // Each part is closed as soon as it has been saved (or failed to save).
                try (PDDocument pd = iterator.next()) {
                    pd.save(pagePath + i++ + ".pdf");
                }
            }
        } finally {
            // Only non-empty when something above threw: release the parts that were
            // never reached. Close failures are logged rather than thrown so they
            // cannot mask the exception that is already propagating.
            while (iterator.hasNext()) {
                try {
                    iterator.next().close();
                } catch (IOException closeFailure) {
                    LOGGER.debug("Could not close split page: " + closeFailure.getMessage());
                }
            }
        }
    }
    return pagePath;
}

From source file:org.freeeed.ocr.PDFImageExtractor.java

License:Apache License

/**
 * Extracts the images found in the page resources of the PDF and writes them into the
 * configured image extraction directory.
 *
 * <p>At most {@code Project.getCurrentProject().getOcrMaxImagesPerPDF()} images are
 * extracted. An {@link IOException} during extraction is printed and the images
 * written so far are still returned. The document is always closed before returning.
 *
 * @return the file names (including suffix) of the images that were written
 */
@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();

    List<String> result = new ArrayList<String>();

    PDDocument document = null;
    try {
        document = PDDocument.load(file);

        List pages = document.getDocumentCatalog().getAllPages();
        Iterator iter = pages.iterator();
        int i = 1;
        int maxNumberOfImages = Project.getCurrentProject().getOcrMaxImagesPerPDF();

        while (iter.hasNext()) {
            PDPage page = (PDPage) iter.next();
            PDResources resources = page.getResources();
            if (resources == null) {
                // Page without its own resource dictionary: nothing to extract here.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages != null) {
                Iterator imageIter = pageImages.keySet().iterator();
                while (imageIter.hasNext()) {
                    if (i > maxNumberOfImages) {
                        // The finally block below still closes the document.
                        return result;
                    }

                    String key = (String) imageIter.next();
                    PDXObjectImage image = (PDXObjectImage) pageImages.get(key);

                    String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                    image.write2file(fileName);

                    result.add(fileName + "." + image.getSuffix());

                    i++;
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Previously the document was never closed, leaking it on every call.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }

    return result;
}