List of usage examples for org.apache.pdfbox.pdmodel PDDocument close
@Override public void close() throws IOException
From source file:com.qwazr.library.pdfbox.PdfBoxParser.java
License:Apache License
/**
 * Reads the metadata and the text of a PDF document with PDFBox.
 *
 * <p>The metadata is handed to {@code extractMetaData} together with
 * {@code resultBuilder.metas()}; the text is pulled through a {@code Stripper}
 * built around {@code resultBuilder}. The document is closed before this method
 * returns, whether or not extraction succeeded.
 *
 * @param pdf           the document to read; closed by this method when non-null
 * @param resultBuilder the builder handed to the metadata and text extraction steps
 * @throws Exception if metadata extraction, text extraction or closing the document fails
 */
private void parseContent(final PDDocument pdf, final ParserResultBuilder resultBuilder) throws Exception {
    try {
        extractMetaData(pdf, resultBuilder.metas());
        final Stripper textStripper = new Stripper(resultBuilder);
        textStripper.getText(pdf);
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}
From source file:com.santaanna.friendlyreader.pdfstod.pdfstod3.ReplaceStringStreamEngine.java
License:Apache License
private void saveAndClose(String filnamn, PDDocument utfil) { try {//from www.java 2s . c o m if (utfil != null) { SkrivUt(3, "saveAndClose"); utfil.save(filnamn); utfil.close(); } } catch (java.io.IOException jioio) { SkrivUt(7, "IO Fel i saveAndClose."); } catch (COSVisitorException cosv) { SkrivUt(7, "CosVis Fel i saveAndClose."); } }
From source file:com.sinefine.util.pdf.Pdfs.java
License:Apache License
/** * Closes the instance of the PDDocument without throwing an exception. * * @param pdDocument an instance of the class {@linkplain PDDocument}. *///from w ww .j a v a 2 s . c om private static void closeQuietly(final PDDocument pdDocument) { if (pdDocument != null) { try { pdDocument.close(); } catch (IOException ioe) { //ignore exception if (LOGGER.isDebugEnabled()) { LOGGER.debug("An exception occured whilst attempting " + "to close an instance of the PDDocument class. " + "Although this exception will not cause the " + "application to fail, it should be investigated.", ioe); } } } }
From source file:com.stimulus.archiva.extraction.PDFExtractor.java
License:Open Source License
/**
 * Extracts the text of a PDF into a temporary file and returns a reader over that file.
 *
 * <p>The PDF is parsed from {@code is}; if it reports itself as encrypted, decryption
 * with an empty password is attempted. The text is written to a temp file as UTF-8.
 * The temp file is registered with {@code indexInfo.addDeleteFile} and the returned
 * reader with {@code indexInfo.addReader}.
 *
 * @param is        stream holding the PDF data
 * @param charset   not used by this implementation
 * @param indexInfo receives the temp file and the returned reader
 * @return a reader over the extracted text, decoded as UTF-8
 * @throws ExtractionException if parsing, decryption, text extraction or opening the
 *                             result file fails
 */
public Reader getText(InputStream is, Charset charset, IndexInfo indexInfo) throws ExtractionException {
    logger.debug("extracting pdf file");
    File file = null;
    PDDocument document = null;
    Writer output = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        document = parser.getPDDocument();
        if (document.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(document);
            if (logger.isDebugEnabled()) {
                logger.debug("pdf document appears to be encrypted (will attempt decryption)");
            }
            decryptor.decryptDocument("");
        }
        file = File.createTempFile("extract_pdf", ".tmp");
        indexInfo.addDeleteFile(file);
        output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(document, output);
    } catch (Throwable e) {
        throw new ExtractionException("failed to extract pdf (probable password protected document)", e, logger,
                ChainedException.Level.DEBUG);
    } finally {
        // Close each resource on its own so a failure closing the document
        // cannot leave the output writer (and its file handle) open.
        if (document != null) {
            try {
                document.close();
            } catch (IOException io) {
                logger.debug("failed to close pdf document", io);
            }
        }
        if (output != null) {
            try {
                output.close();
            } catch (IOException io) {
                logger.debug("failed to close extraction output", io);
            }
        }
    }
    try {
        logger.debug("returning extracted PDF data");
        // The file was written as UTF-8 above, so it must be read back as UTF-8
        // rather than with the platform default charset that FileReader would use.
        Reader outReader = new java.io.InputStreamReader(new java.io.FileInputStream(file), "UTF-8");
        indexInfo.addReader(outReader);
        return outReader;
    } catch (Exception ex) {
        throw new ExtractionException("failed to extract text from pdf document", ex, logger,
                ChainedException.Level.DEBUG);
    }
}
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
/**
 * Extracts the creation date of a PDF file, falling back to its "customdate"
 * custom metadata entry (expected as {@code MM/dd/yyyy}) when no creation date is set.
 *
 * <p>An encrypted document is decrypted with an empty password first; a decryption
 * failure is logged and extraction is still attempted. Any document that was
 * successfully loaded is closed before returning.
 *
 * @param file the PDF file to inspect
 * @return the date as {@code year-month-day} (no zero padding), or an empty string
 *         when the file cannot be read or carries no usable date
 */
public static String extractDate(File file) {
    PDDocument document = null;
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());
        if (document.isEncrypted()) {
            logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
            try {
                document.decrypt("");
                document.setAllSecurityToBeRemoved(true);
                logger.info("File " + file.getName() + "successfully decrypted!");
            } catch (CryptographyException e) {
                logger.info("Error decrypting file " + file.getName());
            }
        }

        PDDocumentInformation info = document.getDocumentInformation();
        Calendar calendar = info.getCreationDate();

        if (calendar == null) {
            // No creation date: fall back to the custom date entry, if present.
            String customValue = info.getCustomMetadataValue("customdate");
            if (customValue != null) {
                try {
                    Date customDate = new SimpleDateFormat("MM/dd/yyyy").parse(customValue);
                    calendar = Calendar.getInstance();
                    calendar.setTime(customDate);
                } catch (ParseException e) {
                    // Leave calendar null so an empty string is returned instead of failing.
                    logger.info("Error parsing date from custom date");
                }
            }
        }

        if (calendar != null) {
            int year = calendar.get(Calendar.YEAR);
            int month = calendar.get(Calendar.MONTH) + 1; // Calendar months are zero-based
            int day = calendar.get(Calendar.DATE);
            creationDateMetaData = year + "-" + month + "-" + day;
        }
    } catch (IOException e) {
        logger.info("Error processing file " + file.getName());
    } finally {
        // Close whenever the document was loaded, including after a failed decryption,
        // so the underlying file handle is always released.
        if (document != null) {
            try {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            } catch (IOException e) {
                logger.info("Error closing file " + file.getName());
            }
        }
    }
    return creationDateMetaData;
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Get the page count of the document.
 *
 * <p>The document is loaded from {@code file}, queried, and closed again before
 * returning, including when reading the page count fails.
 *
 * @param file Absolute file path
 * @return int No of pages in the document.
 * @throws java.io.IOException when the file cannot be loaded or closed.
 */
public int getPageCount(String file) throws IOException {
    logger.info("file :" + file);
    PDDocument doc = PDDocument.load(new File(file));
    try {
        int pageCount = doc.getNumberOfPages();
        logger.info("pageCount :" + pageCount);
        return pageCount;
    } finally {
        // Release the document even if something above throws.
        doc.close();
    }
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Returns the text content of the document for the requested page range.
 *
 * <p>Uses {@code this.stripper} when one is set, otherwise a fresh
 * {@link PDFTextStripper}. The effective page range is taken from
 * {@code this.startPage} / {@code this.endPage} after
 * {@code updateStartAndEndPages} has run. When {@code bTrimWhiteSpace} is set,
 * every run of whitespace is collapsed to a single space and the result is trimmed.
 * The document is closed before returning, including on failure.
 *
 * @param file      path of the PDF file
 * @param startPage requested first page, passed to {@code updateStartAndEndPages}
 * @param endPage   requested last page, passed to {@code updateStartAndEndPages}
 * @return the extracted (and optionally whitespace-normalised) text
 * @throws IOException if the document cannot be loaded, read or closed
 */
private String getPDFText(String file, int startPage, int endPage) throws IOException {
    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    PDDocument doc = PDDocument.load(new File(file));
    try {
        // Only build a default stripper when no custom one was configured.
        PDFTextStripper localStripper = (null != this.stripper) ? this.stripper : new PDFTextStripper();

        this.updateStartAndEndPages(file, startPage, endPage);
        localStripper.setStartPage(this.startPage);
        localStripper.setEndPage(this.endPage);

        String txt = localStripper.getText(doc);
        logger.info("PDF Text before trimming : " + txt);
        if (this.bTrimWhiteSpace) {
            txt = txt.trim().replaceAll("\\s+", " ").trim();
            logger.info("PDF Text after trimming : " + txt);
        }
        return txt;
    } finally {
        // Release the document even if text extraction throws.
        doc.close();
    }
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Saves each page of the PDF in the effective page range as a PNG image.
 *
 * <p>Pages are rendered at 300 DPI in RGB. Image files are named
 * {@code imageDestinationPath + <pdf name without ".pdf"> + "_" + <page number> + ".png"}.
 * Any exception is printed and swallowed; the names of the images written up to
 * that point are still returned. The document is closed in every case.
 *
 * @param file      path of the PDF file
 * @param startPage requested first page, passed to {@code updateStartAndEndPages}
 * @param endPage   requested last page, passed to {@code updateStartAndEndPages}
 * @return the file names of the images that were written
 * @throws IOException declared for callers; failures inside are caught and printed
 */
private List<String> saveAsImage(String file, int startPage, int endPage) throws IOException {
    logger.info("file : " + file);
    logger.info("startPage : " + startPage);
    logger.info("endPage : " + endPage);

    ArrayList<String> imgNames = new ArrayList<String>();

    try {
        File sourceFile = new File(file);
        this.createImageDestinationDirectory(file);
        this.updateStartAndEndPages(file, startPage, endPage);

        String fileName = sourceFile.getName().replace(".pdf", "");

        PDDocument document = PDDocument.load(sourceFile);
        try {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) {
                logger.info("Page No : " + (iPage + 1));
                String fname = this.imageDestinationPath + fileName + "_" + (iPage + 1) + ".png";
                BufferedImage image = pdfRenderer.renderImageWithDPI(iPage, 300, ImageType.RGB);
                ImageIOUtil.writeImage(image, fname, 300);
                imgNames.add(fname);
                logger.info("PDf Page saved as image : " + fname);
            }
        } finally {
            // Close on the failure path too; rendering errors used to leak the document.
            document.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return imgNames;
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/**
 * Renders the given page range of two PDFs to images and compares them page by page.
 *
 * <p>Pages are rendered at 300 DPI in RGB and compared through
 * {@code ImageUtil.compareAndHighlight}, which is given the path of a
 * {@code "_diff.png"} file under {@code getImageDestinationPath()}. Unless
 * {@code bCompareAllPages} is set, comparison stops at the first differing page.
 *
 * <p>If loading, rendering or comparing throws, the exception is printed and the
 * method returns {@code false}: a comparison that could not be carried out must
 * not be reported as a match.
 *
 * @param file1     path of the first PDF
 * @param file2     path of the second PDF
 * @param startPage first page to compare (1-based)
 * @param endPage   last page to compare (inclusive)
 * @return {@code true} only when every compared page matched
 * @throws IOException if closing one of the documents fails
 */
private boolean convertToImageAndCompare(String file1, String file2, int startPage, int endPage)
        throws IOException {
    boolean result = true;

    PDDocument doc1 = null;
    PDDocument doc2 = null;

    try {
        doc1 = PDDocument.load(new File(file1));
        doc2 = PDDocument.load(new File(file2));

        PDFRenderer pdfRenderer1 = new PDFRenderer(doc1);
        PDFRenderer pdfRenderer2 = new PDFRenderer(doc2);

        // Same for every page, so compute it once.
        String baseName = new File(file1).getName().replace(".pdf", "_");

        for (int iPage = startPage - 1; iPage < endPage; iPage++) {
            String fileName = this.getImageDestinationPath() + "/" + baseName + (iPage + 1) + "_diff.png";

            logger.info("Comparing Page No : " + (iPage + 1));
            BufferedImage image1 = pdfRenderer1.renderImageWithDPI(iPage, 300, ImageType.RGB);
            BufferedImage image2 = pdfRenderer2.renderImageWithDPI(iPage, 300, ImageType.RGB);
            result = ImageUtil.compareAndHighlight(image1, image2, fileName, this.bHighlightPdfDifference,
                    this.imgColor.getRGB()) && result;
            if (!this.bCompareAllPages && !result) {
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        // The comparison did not complete, so the documents cannot be called equal.
        result = false;
    } finally {
        // Either document may be null if loading failed; close each independently
        // so a failure closing the first cannot leak the second.
        try {
            if (doc1 != null) {
                doc1.close();
            }
        } finally {
            if (doc2 != null) {
                doc2.close();
            }
        }
    }
    return result;
}
From source file:com.testautomationguru.utility.PDFUtil.java
License:Apache License
/** * This method extracts all the embedded images of the pdf document *///from w w w . j av a 2 s . co m private List<String> extractimages(String file, int startPage, int endPage) { logger.info("file : " + file); logger.info("startPage : " + startPage); logger.info("endPage : " + endPage); ArrayList<String> imgNames = new ArrayList<String>(); boolean bImageFound = false; try { this.createImageDestinationDirectory(file); String fileName = this.getFileName(file).replace(".pdf", "_resource"); PDDocument document = PDDocument.load(new File(file)); PDPageTree list = document.getPages(); this.updateStartAndEndPages(file, startPage, endPage); int totalImages = 1; for (int iPage = this.startPage - 1; iPage < this.endPage; iPage++) { logger.info("Page No : " + (iPage + 1)); PDResources pdResources = list.get(iPage).getResources(); for (COSName c : pdResources.getXObjectNames()) { PDXObject o = pdResources.getXObject(c); if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) { bImageFound = true; String fname = this.imageDestinationPath + "/" + fileName + "_" + totalImages + ".png"; ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", new File(fname)); imgNames.add(fname); totalImages++; } } } document.close(); if (bImageFound) logger.info("Images are saved @ " + this.imageDestinationPath); else logger.info("No images were found in the PDF"); } catch (Exception e) { e.printStackTrace(); } return imgNames; }