List of usage examples for org.apache.pdfbox.pdmodel.PDDocument.load
public static PDDocument load(byte[] input) throws IOException
From source file:org.elacin.pdfextract.datasource.pdfbox.PDFBoxSource.java
License:Apache License
/**
 * Opens a PDF file with PDFBox and, when the document is encrypted and a password was
 * supplied, decrypts it.
 *
 * <p>The logging context key {@code "doc"} is set to the file name before loading and is
 * reset to the empty string when opening fails.
 *
 * @param pdfFile  the PDF file to open
 * @param password password used to decrypt an encrypted document; when {@code null} an
 *                 encrypted document is returned undecrypted and a warning is logged
 * @return the loaded document; the caller is responsible for closing it
 * @throws RuntimeException if the file cannot be read or decryption fails
 */
@NotNull
protected static PDDocument openPdfDocument(@NotNull final File pdfFile, @Nullable final String password) {
    long t0 = System.currentTimeMillis();
    MDC.put("doc", pdfFile.getName());
    log.info("LOG00120:Opening PDF file " + pdfFile + ".");
    try {
        final PDDocument document = PDDocument.load(pdfFile);
        if (document.isEncrypted()) {
            if (password != null) {
                try {
                    document.decrypt(password);
                } catch (Exception e) {
                    // The document is never handed to the caller on this path, so release it
                    // here instead of leaking it, and reset the logging context just like the
                    // load-failure path below does.
                    try {
                        document.close();
                    } catch (IOException closeFailure) {
                        log.warn("Could not close " + pdfFile + " after failed decryption", closeFailure);
                    }
                    MDC.put("doc", "");
                    throw new RuntimeException("Error while reading encrypted PDF:", e);
                }
            } else {
                log.warn("File claims to be encrypted, a password should be provided");
            }
        }
        log.debug("load()took" + (System.currentTimeMillis() - t0) + "ms");
        return document;
    } catch (IOException e) {
        MDC.put("doc", "");
        throw new RuntimeException("Error while reading " + pdfFile + ".", e);
    }
}
From source file:org.esteco.jira.pdf.AddImageToPDF.java
License:Apache License
/** * Add an image to an existing PDF document. * * @param inputFile The input PDF to add the image to. * @param imagePath The filename of the image to put in the PDF. * @param outputFile The file to write to the pdf to. * @throws IOException If there is an error writing the data. *//* w w w .j a va 2 s . c o m*/ public void createPDFFromImage(String inputFile, String imagePath, String outputFile) throws IOException { // the document PDDocument doc = null; try { doc = PDDocument.load(new File(inputFile)); //we will add the image to the first page. PDPage page = doc.getPage(0); //page.setRotation(90); // createFromFile is the easiest way with an image file // if you already have the image in a BufferedImage, // call LosslessFactory.createFromImage() instead PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, doc); PDPageContentStream contentStream = new PDPageContentStream(doc, page, AppendMode.APPEND, true, true); // contentStream.drawImage(ximage, 20, 20 ); // better method inspired by http://stackoverflow.com/a/22318681/535646 // reduce this value if the image is too large float scale = 0.4f; contentStream.drawImage(pdImage, 20, 20, pdImage.getWidth() * scale, pdImage.getHeight() * scale); contentStream.close(); doc.save(outputFile); } finally { if (doc != null) { doc.close(); } } }
From source file:org.example.extractimagesfrompdfpages.ExtractImagesFromPDFPagesMain.java
public static void main(String[] args) { try {//from w w w . j a v a 2 s . c o m File thePDFFile = new File(args[0]); PDDocument document = PDDocument.load(thePDFFile); PDPageTree list = document.getPages(); int i = 1; for (PDPage page : list) { Boolean alreadyCreatedFolderForThisPage = false; File thePDFFileDirectory = thePDFFile.getParentFile(); File thePDFPageFolder = new File(thePDFFileDirectory.getAbsolutePath() + "/temp_images" + "/" + i); PDResources pdResources = page.getResources(); int j = 1; for (COSName c : pdResources.getXObjectNames()) { PDXObject o = pdResources.getXObject(c); if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) { if (alreadyCreatedFolderForThisPage == false) { thePDFPageFolder.mkdirs(); alreadyCreatedFolderForThisPage = true; } File file = new File(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png"); ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", file); System.out.println(thePDFPageFolder.getAbsolutePath() + "/" + j + ".png"); j++; } } i++; } } catch (IOException ex) { Logger.getLogger(ExtractImagesFromPDFPagesMain.class.getName()).log(Level.SEVERE, null, ex); throw new RuntimeException(ex); } }
From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java
License:Open Source License
/**
 * Returns only the text of a PDF file's content.
 *
 * <p>The work runs inside a privileged action. The input stream is closed before this method
 * returns, whether or not extraction succeeds.
 *
 * @param is an input stream with .pdf file content.
 * @return The text of all pages, or the empty string when the stream reports no available
 *         bytes.
 * @throws IOException if the privileged action fails with an {@link IOException}.
 * @throws DocumentReadException if the PDF document cannot be loaded.
 * @throws IllegalArgumentException if {@code is} is {@code null}.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<String>() {
            public String run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }
                PDDocument pdDocument = null;
                StringWriter sw = new StringWriter();
                try {
                    if (is.available() == 0) {
                        return "";
                    }
                    try {
                        pdDocument = PDDocument.load(is);
                    } catch (IOException e) {
                        throw new DocumentReadException("Can not load PDF document.", e);
                    }
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.setStartPage(1);
                    stripper.setEndPage(Integer.MAX_VALUE);
                    stripper.writeText(pdDocument, sw);
                } finally {
                    // Close failures are only traced so they never mask the real outcome.
                    if (pdDocument != null) {
                        try {
                            pdDocument.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                    if (is != null) {
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return sw.toString();
            }
        });
    } catch (PrivilegedActionException pae) {
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof DocumentReadException) {
            // Surface the declared exception as-is instead of wrapping it in a RuntimeException.
            throw (DocumentReadException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}
From source file:org.exoplatform.services.document.impl.PDFDocumentReader.java
License:Open Source License
public Properties getProperties(final InputStream is) throws IOException, DocumentReadException { try {/*from w ww. ja v a2s.co m*/ return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() { public Properties run() throws Exception { if (is == null) { throw new IllegalArgumentException("InputStream is null."); } PDDocument pdDocument = PDDocument.load(is); Properties props = new Properties(); try { if (pdDocument.isEncrypted()) { try { pdDocument.decrypt(""); } catch (InvalidPasswordException e) { throw new DocumentReadException("The pdf document is encrypted.", e); } catch (org.apache.pdfbox.exceptions.CryptographyException e) { throw new DocumentReadException(e.getMessage(), e); } } PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDMetadata meta = catalog.getMetadata(); if (meta != null) { XMPMetadata metadata = meta.exportXMPMetadata(); XMPSchemaDublinCore dc = metadata.getDublinCoreSchema(); if (dc != null) { try { if (dc.getTitle() != null) props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle())); } catch (Exception e) { LOG.warn("getTitle failed: " + e.getMessage()); } try { if (dc.getDescription() != null) props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription())); } catch (Exception e) { LOG.warn("getSubject failed: " + e.getMessage()); } try { if (dc.getCreators() != null) { for (String creator : dc.getCreators()) { props.put(DCMetaData.CREATOR, fixEncoding(creator)); } } } catch (Exception e) { LOG.warn("getCreator failed: " + e.getMessage()); } try { if (dc.getDates() != null) { for (Calendar date : dc.getDates()) { props.put(DCMetaData.DATE, date); } } } catch (Exception e) { LOG.warn("getDate failed: " + e.getMessage()); } } XMPSchemaPDF pdf = metadata.getPDFSchema(); if (pdf != null) { try { if (pdf.getKeywords() != null) props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords())); } catch (Exception e) { LOG.warn("getKeywords failed: " + e.getMessage()); } try { if (pdf.getProducer() != 
null) props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer())); } catch (Exception e) { LOG.warn("getProducer failed: " + e.getMessage()); } } XMPSchemaBasic basic = metadata.getBasicSchema(); if (basic != null) { try { if (basic.getCreateDate() != null) props.put(DCMetaData.DATE, basic.getCreateDate()); } catch (Exception e) { LOG.warn("getCreationDate failed: " + e.getMessage()); } try { if (basic.getModifyDate() != null) props.put(DCMetaData.DATE, basic.getModifyDate()); } catch (Exception e) { LOG.warn("getModificationDate failed: " + e.getMessage()); } // DCMetaData.PUBLISHER - basic.getCreatorTool() } } if (props.isEmpty()) { // The pdf doesn't contain any metadata, try to use the document // information instead PDDocumentInformation docInfo = pdDocument.getDocumentInformation(); if (docInfo != null) { try { if (docInfo.getAuthor() != null) props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor()); } catch (Exception e) { LOG.warn("getAuthor failed: " + e.getMessage()); } try { if (docInfo.getCreationDate() != null) props.put(DCMetaData.DATE, docInfo.getCreationDate()); } catch (Exception e) { LOG.warn("getCreationDate failed: " + e.getMessage()); } try { if (docInfo.getCreator() != null) props.put(DCMetaData.CREATOR, docInfo.getCreator()); } catch (Exception e) { LOG.warn("getCreator failed: " + e.getMessage()); } try { if (docInfo.getKeywords() != null) props.put(DCMetaData.SUBJECT, docInfo.getKeywords()); } catch (Exception e) { LOG.warn("getKeywords failed: " + e.getMessage()); } try { if (docInfo.getModificationDate() != null) props.put(DCMetaData.DATE, docInfo.getModificationDate()); } catch (Exception e) { LOG.warn("getModificationDate failed: " + e.getMessage()); } try { if (docInfo.getProducer() != null) props.put(DCMetaData.PUBLISHER, docInfo.getProducer()); } catch (Exception e) { LOG.warn("getProducer failed: " + e.getMessage()); } try { if (docInfo.getSubject() != null) props.put(DCMetaData.DESCRIPTION, docInfo.getSubject()); } catch 
(Exception e) { LOG.warn("getSubject failed: " + e.getMessage()); } try { if (docInfo.getTitle() != null) props.put(DCMetaData.TITLE, docInfo.getTitle()); } catch (Exception e) { LOG.warn("getTitle failed: " + e.getMessage()); } // docInfo.getTrapped(); } } } finally { if (pdDocument != null) { pdDocument.close(); } if (is != null) { try { is.close(); } catch (IOException e) { if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } } } return props; } }); } catch (PrivilegedActionException pae) { Throwable cause = pae.getCause(); if (cause instanceof IOException) { throw (IOException) cause; } else if (cause instanceof RuntimeException) { throw (RuntimeException) cause; } else { throw new RuntimeException(cause); } } }
From source file:org.fit.cssbox.demo.PdfBoxBrowser.java
License:Open Source License
/**
 * Loads a PDF document from the given stream via {@code PDDocument.load}.
 *
 * @param is the stream to read the PDF from
 * @return the loaded document
 * @throws IOException if {@code PDDocument.load} fails
 */
protected PDDocument loadPdf(InputStream is) throws IOException {
    return PDDocument.load(is);
}
From source file:org.fit.pdfdom.PDFToHTML.java
License:Open Source License
public static void main(String[] args) { if (args.length < 1) { System.out.println("Usage: PDFToHTML <infile> [<outfile>]"); System.exit(1);/*from w w w. j a va 2s. co m*/ } String infile = args[0]; String outfile; if (args.length > 1) outfile = args[1]; else { String base = args[0]; if (base.toLowerCase().endsWith(".pdf")) base = base.substring(0, base.length() - 4); outfile = base + ".html"; } PDDocument document = null; try { document = PDDocument.load(new File(infile)); PDFDomTree parser = new PDFDomTree(); //parser.setDisableImageData(true); Writer output = new PrintWriter(outfile, "utf-8"); parser.writeText(document, output); output.close(); } catch (Exception e) { System.err.println("Error: " + e.getMessage()); e.printStackTrace(); } finally { if (document != null) { try { document.close(); } catch (IOException e) { System.err.println("Error: " + e.getMessage()); //e.printStackTrace(); } } } }
From source file:org.frameworkset.http.converter.wordpdf.Word2PDFResponse.java
License:Apache License
protected void render_(HttpOutputMessage outputMessage, HttpInputMessage inputMessage, File file) { OutputStream out = null;/* w w w . ja va2 s . co m*/ PDDocument doc2 = null; try { File contract_pdf = file; doc2 = PDDocument.load(contract_pdf); HttpServletResponse response = outputMessage.getResponse(); response.setContentType("application/pdf"); response.setHeader("Content-Disposition", "inline; filename=" + handleCNName(contract_pdf.getName(), inputMessage.getServletRequest())); out = response.getOutputStream(); doc2.save(out); out.flush(); } catch (Exception e) { throw new HttpMessageNotWritableException(this.getPdfFile(), e); } finally { if (out != null) { try { out.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (doc2 != null) try { doc2.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
From source file:org.freeeed.ocr.ImageTextParser.java
License:Apache License
/**
 * Splits a PDF into separate documents using {@code Splitter} and saves each one as
 * {@code <pagePath><index>.pdf}, with the index starting at 0.
 *
 * @param filePath path of the PDF file to split
 * @return the path prefix obtained from {@code createTempPath} under which the files were
 *         written
 * @throws IOException if the PDF cannot be loaded, split or saved
 */
private static String splitPages(String filePath) throws IOException {
    File file = new File(filePath);
    String pagePath;
    try (PDDocument document = PDDocument.load(file)) {
        List<PDDocument> pages = new Splitter().split(document);
        // Index of the first split document that has not been closed yet.
        int firstOpen = 0;
        try {
            pagePath = createTempPath(file);
            LOGGER.debug("pagePath = " + pagePath);
            for (int i = 0; i < pages.size(); i++) {
                firstOpen = i + 1;
                try (PDDocument page = pages.get(i)) {
                    page.save(pagePath + i + ".pdf");
                }
            }
        } finally {
            // Only does work after a failure: release the split documents that were never
            // reached so they are not leaked.
            for (int j = firstOpen; j < pages.size(); j++) {
                try {
                    pages.get(j).close();
                } catch (IOException e) {
                    LOGGER.debug("Could not close split page " + j + ": " + e.getMessage());
                }
            }
        }
    }
    return pagePath;
}
From source file:org.freeeed.ocr.PDFImageExtractor.java
License:Apache License
/**
 * Extracts the images found in the page resources of the PDF {@code file} into the configured
 * extraction directory.
 *
 * <p>Extraction stops once the project's maximum number of images per PDF has been written.
 * An {@link IOException} is printed and the images collected so far are returned.
 *
 * @return the names of the written image files, each with the image's suffix appended
 */
@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();

    List<String> result = new ArrayList<String>();
    PDDocument document = null;
    try {
        document = PDDocument.load(file);
        List pages = document.getDocumentCatalog().getAllPages();
        Iterator iter = pages.iterator();
        int i = 1;
        int maxNumberOfImages = Project.getCurrentProject().getOcrMaxImagesPerPDF();
        while (iter.hasNext()) {
            PDPage page = (PDPage) iter.next();
            PDResources resources = page.getResources();
            if (resources == null) {
                // Nothing to extract from a page without a resource dictionary.
                continue;
            }
            Map pageImages = resources.getImages();
            if (pageImages != null) {
                Iterator imageIter = pageImages.keySet().iterator();
                while (imageIter.hasNext()) {
                    if (i > maxNumberOfImages) {
                        return result;
                    }
                    String key = (String) imageIter.next();
                    PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                    String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                    image.write2file(fileName);
                    result.add(fileName + "." + image.getSuffix());
                    i++;
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Runs on every exit path, including the early return above, so the document is
        // no longer leaked.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
    return result;
}