List of usage examples for org.apache.pdfbox.pdmodel PDDocument load
public static PDDocument load(byte[] input) throws IOException
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
/** * This method extracts creation date/ custom date of a PDF file * @param file is a File object/*from w w w . j a v a2 s.c o m*/ * @return String that contains the creation date/ custom date of the PDF */ public static String extractDate(File file) { PDDocument document = null; boolean isDamaged = false; //to deal with damaged pdf String creationDateMetaData = ""; try { document = PDDocument.load(file.toString()); /*If the PDF file is not damanged --->*/ if (!isDamaged) { /*...but the file is encrypted --->*/ if (document.isEncrypted()) { logger.info("File " + file.getName() + "is encrypted. Trying to decrypt..."); try { /*...then decryptt it --->*/ document.decrypt(""); document.setAllSecurityToBeRemoved(true); logger.info("File " + file.getName() + "successfully decrypted!"); } catch (CryptographyException e) { logger.info("Error decrypting file " + file.getName()); isDamaged = true; } } /*<--work around to decrypt an encrypted pdf ends here*/ /*Metadata extraction --->*/ PDDocumentInformation info = document.getDocumentInformation(); /*We are only interested in date data--->*/ Calendar calendar = info.getCreationDate(); int creationYear = 0, creationMonth = 0, creationDate = 0; if (calendar != null) { creationYear = calendar.get(Calendar.YEAR); creationMonth = calendar.get(Calendar.MONTH) + 1; creationDate = calendar.get(Calendar.DATE); } /*<---Date data extraction complete*/ /*If creation date is not empty --->*/ if (creationYear != 0) { creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate; } //<--- creation date found and the date part of the title is generated /*No creation date is found --->*/ else { SimpleDateFormat dateFormatter = new SimpleDateFormat("MM/dd/yyyy"); Date customDate = null; /*But we have custom date some times --->*/ try { customDate = dateFormatter.parse(info.getCustomMetadataValue("customdate")); } catch (ParseException e) { logger.info("Error parsing date from custom date"); } calendar = 
Calendar.getInstance(); calendar.setTime(customDate); if (calendar != null) { creationYear = calendar.get(Calendar.YEAR); creationMonth = calendar.get(Calendar.MONTH) + 1; creationDate = calendar.get(Calendar.DATE); } /*<---Date data extraction complete from customdate*/ if (creationYear != 0) { creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate; } } //<--- work around if no creation date is found } /*<--- Good to know that the PDF was not damaged*/ } catch (IOException e) { /*If the PDF was not read by the system --->*/ logger.info("Error processing file " + file.getName()); /*... then maybe it is damaged*/ isDamaged = true; } finally { try { /*If the file was good, not damaged, then please close it --->*/ if (!isDamaged) { document.close(); logger.info("File " + file.getName() + " is closed successfully!"); } } catch (IOException e) { logger.info("Error closing file " + file.getName()); } } /*<--- PDF closing done!*/ return creationDateMetaData; }
From source file:com.tekstosense.segmenter.Main.java
License:Open Source License
private TextExtractor parsePdf(File f) throws IOException { PDDocument doc = PDDocument.load(f); if (doc.isEncrypted()) { // Some documents are encrypted with the empty password. Try // to decrypt with this password, or the one passed in on the // command line (if any), and fail if we can't. try {//from www. ja v a 2 s. c om doc.setAllSecurityToBeRemoved(false); //doc.decrypt(password); // Defaults to the empty string. } catch (Exception e) { throw new IOException("Can't decrypt document: ", e); } } TextExtractor te = new TextExtractor(); te.writeText(doc, new OutputStreamWriter(new ByteArrayOutputStream())); return te; }
From source file:com.tekstosense.segmenter.StructurePdf.PdfSections.java
License:Open Source License
private TextExtractor parsePdf(File f) throws IOException { PDDocument doc = PDDocument.load(f); if (doc.isEncrypted()) { // Some documents are encrypted with the empty password. Try // to decrypt with this password, or the one passed in on the // command line (if any), and fail if we can't. try {/* w w w . j av a 2s.co m*/ doc.setAllSecurityToBeRemoved(false); // doc.decrypt(password); // Defaults to the empty string. } catch (Exception e) { throw new IOException("Can't decrypt document: ", e); } } TextExtractor te = new TextExtractor(); te.writeText(doc, new OutputStreamWriter(new ByteArrayOutputStream())); return te; }
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/** * Get the page count of the document./*from w w w . ja v a2 s .c o m*/ * * @param file Absolute file path * @return int No of pages in the document. * @throws java.io.IOException when file is not found. */ public int getPageCount(String file) throws IOException { logger.info("file :" + file); PDDocument doc = PDDocument.load(new File(file)); int pageCount = doc.getNumberOfPages(); logger.info("pageCount :" + pageCount); doc.close(); return pageCount; }
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Returns the text content of the document for the given page range.
 *
 * <p>The requested range is normalised through
 * {@code updateStartAndEndPages} before extraction. When
 * {@code bTrimWhiteSpace} is set, runs of whitespace are collapsed to a
 * single space and the result is trimmed.
 *
 * @param file path of the PDF file
 * @param startPage requested first page (1-based)
 * @param endPage requested last page (1-based)
 * @return the extracted text
 * @throws IOException if the file cannot be loaded or read
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {
    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    String txt;
    // try-with-resources closes the document even when extraction fails.
    try (PDDocument doc = PDDocument.load(new File(file))) {
        // Only build a default stripper when none has been configured.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        txt = localStripper.getText(doc);
    }

    logger.info("PDF Text before trimming : " + txt);
    if (this.bTrimWhiteSpace) {
        txt = txt.trim().replaceAll("\\s+", " ").trim();
        logger.info("PDF Text after trimming : " + txt);
    }
    return txt;
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Saves each page in the given range of the PDF as a PNG image rendered at
 * 300 DPI.
 *
 * <p>This is best-effort: any exception is printed and the names of the
 * images written so far are returned.
 *
 * @param file path of the PDF file
 * @param startPage requested first page (1-based)
 * @param endPage requested last page (1-based)
 * @return paths of the image files that were written
 * @throws IOException declared for callers; failures inside are caught and printed
 */
private List<String> saveAsImage(String file, int startPage, int endPage) throws IOException {
    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    List<String> imgNames = new ArrayList<String>();

    try {
        File sourceFile = new File(file);
        this.createImageDestinationDirectory(file);
        this.updateStartAndEndPages(file, startPage, endPage);

        String fileName = sourceFile.getName().replace(".pdf", "");

        // try-with-resources closes the document even if rendering or
        // writing a page fails part-way through.
        try (PDDocument document = PDDocument.load(sourceFile)) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            // Renderer page indices are 0-based; startPage/endPage are 1-based.
            for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
                logger.info("Page No : " + (iPage + 1));
                String fname = this.imageDestinationPath + fileName + "_" + (iPage + 1) + ".png";
                BufferedImage image = pdfRenderer.renderImageWithDPI(iPage, 300, ImageType.RGB);
                ImageIOUtil.writeImage(image, fname, 300);
                imgNames.add(fname);
                logger.info("PDf Page saved as image : " + fname);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return imgNames;
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
private boolean convertToImageAndCompare(String file1, String file2, int startPage, int endPage) throws IOException { boolean result = true; PDDocument doc1 = null;//from w ww . j av a2s.co m PDDocument doc2 = null; PDFRenderer pdfRenderer1 = null; PDFRenderer pdfRenderer2 = null; try { doc1 = PDDocument.load(new File(file1)); doc2 = PDDocument.load(new File(file2)); pdfRenderer1 = new PDFRenderer(doc1); pdfRenderer2 = new PDFRenderer(doc2); for (int iPage = startPage - 1; iPage < endPage; iPage++) { String fileName = new File(file1).getName().replace(".pdf", "_") + (iPage + 1); fileName = this.getImageDestinationPath() + "/" + fileName + "_diff.png"; logger.info("Comparing Page No : " + (iPage + 1)); BufferedImage image1 = pdfRenderer1.renderImageWithDPI(iPage, 300, ImageType.RGB); BufferedImage image2 = pdfRenderer2.renderImageWithDPI(iPage, 300, ImageType.RGB); result = ImageUtil.compareAndHighlight(image1, image2, fileName, this.bHighlightPdfDifference, this.imgColor.getRGB()) && result; if (!this.bCompareAllPages && !result) { break; } } } catch (Exception e) { e.printStackTrace(); } finally { doc1.close(); doc2.close(); } return result; }
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Extracts all embedded images in the given page range of the PDF and writes
 * each one as a PNG file.
 *
 * <p>This is best-effort: any exception is printed and the names of the
 * images written so far are returned.
 *
 * @param file path of the PDF file
 * @param startPage requested first page (1-based)
 * @param endPage requested last page (1-based)
 * @return paths of the image files that were written
 */
private List<String> extractimages(String file, int startPage, int endPage) {
    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    List<String> imgNames = new ArrayList<String>();
    boolean bImageFound = false;
    try {
        this.createImageDestinationDirectory(file);
        String fileName = this.getFileName(file).replace(".pdf", "_resource");

        // try-with-resources closes the document even if extraction fails.
        try (PDDocument document = PDDocument.load(new File(file))) {
            PDPageTree list = document.getPages();

            this.updateStartAndEndPages(file, startPage, endPage);

            int totalImages = 1;
            for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
                logger.info("Page No : " + (iPage + 1));
                PDResources pdResources = list.get(iPage).getResources();
                if (pdResources == null) {
                    // A page without a resource dictionary has no images;
                    // skip it instead of aborting the whole extraction.
                    continue;
                }
                for (COSName c : pdResources.getXObjectNames()) {
                    PDXObject o = pdResources.getXObject(c);
                    if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
                        bImageFound = true;
                        String fname = this.imageDestinationPath + "/" + fileName + "_" + totalImages + ".png";
                        ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(),
                                "png", new File(fname));
                        imgNames.add(fname);
                        totalImages++;
                    }
                }
            }
        }

        if (bImageFound) {
            logger.info("Images are saved @ " + this.imageDestinationPath);
        } else {
            logger.info("No images were found in the PDF");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return imgNames;
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Normalises the requested page range against the document's page count and
 * stores the result in {@code this.startPage} and {@code this.endPage}.
 *
 * <p>An out-of-range start falls back to page 1. An end that is out of range
 * or before the requested start falls back to the last page.
 *
 * @param file path of the PDF file
 * @param start requested first page (1-based)
 * @param end requested last page (1-based)
 * @throws IOException if the file cannot be loaded or closed
 */
private void updateStartAndEndPages(String file, int start, int end) throws IOException {
    int pagecount;
    // Only the page count is needed; try-with-resources closes the document
    // immediately, also on failure.
    try (PDDocument document = PDDocument.load(new File(file))) {
        pagecount = document.getNumberOfPages();
    }
    logger.info("Page Count : " + pagecount);
    logger.info("Given start page:" + start);
    logger.info("Given end page:" + end);

    if (start > 0 && start <= pagecount) {
        this.startPage = start;
    } else {
        this.startPage = 1;
    }
    if (end > 0 && end >= start && end <= pagecount) {
        this.endPage = end;
    } else {
        this.endPage = pagecount;
    }

    logger.info("Updated start page:" + this.startPage);
    logger.info("Updated end page:" + this.endPage);
}
From source file:com.truckzoo.test.pdf.CustomPageDrawer.java
License:Apache License
/**
 * Renders the first page of {@code custom-render-demo.pdf} with
 * {@code MyPDFRenderer} and writes it to {@code custom-render.png}.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be loaded or rendered, or the image cannot be written
 */
public static void main(String[] args) throws IOException {
    File file = new File("custom-render-demo.pdf");

    // try-with-resources closes the document even if rendering or writing fails.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFRenderer renderer = new MyPDFRenderer(doc);
        BufferedImage image = renderer.renderImage(0);
        ImageIO.write(image, "PNG", new File("custom-render.png"));
    }
}