List of usage examples for org.apache.pdfbox.pdmodel PDDocument isEncrypted
public boolean isEncrypted()
From source file:com.jaeksoft.searchlib.web.controller.ViewerController.java
License:Open Source License
private void loadPdfBox() throws IOException, CryptographyException, SearchLibException, InterruptedException { PDDocument document = null; try {//from w w w .ja v a 2s . com document = PDDocument.loadNonSeq(tempFile, null); // Trying to open with empty password boolean isEncrypted = document.isEncrypted(); if (isEncrypted) document.decrypt(""); loadGS(isEncrypted ? "" : null); List<Rectangle> boxList = new ArrayList<Rectangle>(0); checkPdfBoxHighlight(document, boxList); checkHocrHighlight(currentImage.getWidth(), currentImage.getHeight(), boxList); ImageUtils.yellowHighlight(currentImage, boxList, 0.1F); numberOfPages = document.getNumberOfPages(); } finally { if (document != null) IOUtils.close(document); } }
From source file:com.jt.tool.pdf.CreateBookmarks.java
License:Apache License
public static void createBookmark(String srcFile, String targetFile, String reg) throws Exception { PDDocument document = null; try {// w ww. jav a 2 s .com document = PDDocument.load(new File(srcFile)); if (document.isEncrypted()) { System.err.println("Error: Cannot add bookmarks to encrypted document."); System.exit(1); } PDDocumentOutline outline = new PDDocumentOutline(); document.getDocumentCatalog().setDocumentOutline(outline); PDOutlineItem pagesOutline = new PDOutlineItem(); pagesOutline.setTitle("All Pages"); // outline.appendChild(pagesOutline); List pages = new ArrayList(); // document.getDocumentCatalog().getAllPages(); for (int i = 12; i < pages.size(); i++) { String pageText = getPageText(document, i + 1, 0); String[] strings = matchTitle(pageText, reg); if (makeBookmark(strings)) { PDPage page = (PDPage) pages.get(i); PDPageFitWidthDestination dest = new PDPageFitWidthDestination(); dest.setPage(page); PDOutlineItem bookmark = new PDOutlineItem(); bookmark.setDestination(dest); bookmark.setTitle(strings[0]); // pagesOutline.appendChild(bookmark); System.out.println("add " + strings[0]); } } pagesOutline.openNode(); outline.openNode(); document.save(targetFile); } finally { if (document != null) { document.close(); } } }
From source file:com.odc.pdfextractor.parser.CleanPdfParser.java
License:Apache License
/**
 * Processes the content stream of every page of a PDF and returns the document
 * accumulated in {@code docBuilder}.
 *
 * <p>An encrypted document is decrypted with the empty password. Progress is
 * reported on standard output as one dot per page, followed by a newline.
 *
 * @param filename path of the PDF file to load
 * @return the result of {@code docBuilder.getDoc()} after all pages were processed
 * @throws InvalidPasswordException if the document needs a non-empty password
 * @throws Exception                if there is an error parsing the document
 */
public DocumentLocation processPdf(String filename) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(filename);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (InvalidPasswordException e) {
                System.err.println("Error: Document is encrypted with a password.");
                // Let the caller decide what to do; a parser must not terminate the JVM.
                throw e;
            }
        }

        List allPages = document.getDocumentCatalog().getAllPages();
        System.out.print("Extracting text from PDF");
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            System.out.print(".");
            PDStream contents = page.getContents();
            if (contents != null) {
                // Reuse the stream fetched above instead of asking the page again.
                this.processStream(page, page.findResources(), contents.getStream());
            }
            // The page counter advances even for pages without a content stream.
            docBuilder.incrementPage();
        }
    } finally {
        // Terminate the progress line before releasing the document.
        System.out.println();
        if (document != null) {
            document.close();
        }
    }
    return docBuilder.getDoc();
}
From source file:com.openkm.extractor.PdfTextExtractor.java
License:Open Source License
/** * {@inheritDoc}//from www . j a v a 2s . c om */ @SuppressWarnings("rawtypes") public String extractText(InputStream stream, String type, String encoding) throws IOException { try { PDFParser parser = new PDFParser(new BufferedInputStream(stream)); try { parser.parse(); PDDocument document = parser.getPDDocument(); if (document.isEncrypted()) { try { document.decrypt(""); document.setAllSecurityToBeRemoved(true); } catch (Exception e) { throw new IOException("Unable to extract text: document encrypted", e); } } CharArrayWriter writer = new CharArrayWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(document, writer); String st = writer.toString().trim(); log.debug("TextStripped: '{}'", st); if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) { log.warn("PDF does not contains text layer"); // Extract images from PDF StringBuilder sb = new StringBuilder(); if (!Config.SYSTEM_PDFIMAGES.isEmpty()) { File tmpPdf = FileUtils.createTempFile("pdf"); File tmpDir = new File(EnvironmentDetector.getTempDir()); String baseName = FileUtils.getFileName(tmpPdf.getName()); document.save(tmpPdf); int pgNum = 1; try { for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) { HashMap<String, Object> hm = new HashMap<String, Object>(); hm.put("fileIn", tmpPdf.getPath()); hm.put("firstPage", pgNum); hm.put("lastPage", pgNum++); hm.put("imageRoot", tmpDir + File.separator + baseName); String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm); ExecutionUtils.runCmd(cmd); for (File tmp : tmpDir.listFiles()) { if (tmp.getName().startsWith(baseName + "-")) { if (page.findRotation() > 0) { ImageUtils.rotate(tmp, tmp, page.findRotation()); } try { String txt = doOcr(tmp); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(tmp); } } } } } finally { FileUtils.deleteQuietly(tmpPdf); } } else { for (PDPage page : (List<PDPage>) 
document.getDocumentCatalog().getAllPages()) { PDResources resources = page.getResources(); Map<String, PDXObject> images = resources.getXObjects(); if (images != null) { for (String key : images.keySet()) { PDXObjectImage image = (PDXObjectImage) images.get(key); String prefix = "img-" + key + "-"; File pdfImg = null; try { pdfImg = File.createTempFile(prefix, ".png"); log.debug("Writing image: {}", pdfImg.getPath()); // Won't work until PDFBox 1.8.9 ImageIO.write(image.getRGBImage(), "png", pdfImg); if (page.findRotation() > 0) { ImageUtils.rotate(pdfImg, pdfImg, page.findRotation()); } // Do OCR String txt = doOcr(pdfImg); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(pdfImg); } } } } } return sb.toString(); } else { return writer.toString(); } } finally { try { PDDocument doc = parser.getPDDocument(); if (doc != null) { doc.close(); } } catch (IOException e) { // ignore } } } catch (Exception e) { // it may happen that PDFParser throws a runtime // exception when parsing certain pdf documents log.warn("Failed to extract PDF text content", e); throw new IOException(e.getMessage(), e); } finally { stream.close(); } }
From source file:com.opensearchserver.extractor.parser.PdfBox.java
License:Apache License
/**
 * Extracts metadata and text content from a PDF using PDFBox.
 *
 * <p>An encrypted document is opened with an empty-password decryption
 * material first. The document is closed before this method returns, whether
 * or not extraction succeeded.
 *
 * @param pdf the document to read; closed by this method
 * @throws Exception if opening the protection, reading the metadata, stripping
 *                   the text or closing the document fails
 */
private void parseContent(PDDocument pdf) throws Exception {
    try {
        final boolean needsUnlock = pdf.isEncrypted();
        if (needsUnlock) {
            StandardDecryptionMaterial emptyPassword = new StandardDecryptionMaterial("");
            pdf.openProtection(emptyPassword);
        }
        extractMetaData(pdf);
        Stripper textStripper = new Stripper();
        textStripper.getText(pdf);
    } finally {
        if (null != pdf) {
            pdf.close();
        }
    }
}
From source file:com.santaanna.friendlyreader.pdfstod.GUI.PDFReader.java
License:Apache License
/** * This will parse a document./*from w w w .j ava 2 s . c om*/ * * @param input The input stream for the document. * * @return The document. * * @throws IOException If there is an error parsing the document. */ private static PDDocument parseDocument(InputStream input) throws IOException { PDDocument document = PDDocument.load(input); if (document.isEncrypted()) { try { document.decrypt(""); } catch (org.apache.pdfbox.exceptions.InvalidPasswordException e) { System.err.println("Error: The document is encrypted."); } catch (org.apache.pdfbox.exceptions.CryptographyException e) { e.printStackTrace(); } } return document; }
From source file:com.stimulus.archiva.extraction.PDFExtractor.java
License:Open Source License
public Reader getText(InputStream is, Charset charset, IndexInfo indexInfo) throws ExtractionException { logger.debug("extracting pdf file"); File file = null;// w ww . j a v a2 s . c om PDDocument document = null; Writer output = null; try { PDFParser parser = new PDFParser(is); parser.parse(); document = parser.getPDDocument(); if (document.isEncrypted()) { DocumentEncryption decryptor = new DocumentEncryption(document); if (logger.isDebugEnabled()) { logger.debug("pdf document appears to be encrypted (will attempt decryption)"); } decryptor.decryptDocument(""); } file = File.createTempFile("extract_pdf", ".tmp"); indexInfo.addDeleteFile(file); output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8"); PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(document, output); /*logger.debug("PDF extraction completed"); BufferedReader reader; try { reader = new BufferedReader(new FileReader(file)); String line = null; while( (line=reader.readLine()) != null) { logger.debug("PDF>"+line); } reader.close(); } catch(Exception e) { logger.error("failed to open txt file",e); }*/ } catch (Throwable e) { throw new ExtractionException("failed to extract pdf (probable password protected document)", e, logger, ChainedException.Level.DEBUG); } finally { try { if (document != null) document.close(); if (output != null) output.close(); } catch (IOException io) { } } try { logger.debug("returning extracted PDF data"); Reader outReader = new FileReader(file); indexInfo.addReader(outReader); return outReader; } catch (Exception ex) { throw new ExtractionException("failed to extract text from powerpoint document", ex, logger, ChainedException.Level.DEBUG); } }
From source file:com.sustainalytics.crawlerfilter.PDFTitleGeneration.java
License:Apache License
/**
 * Extracts the creation date of a PDF file, falling back to its custom
 * "customdate" metadata value.
 *
 * <p>An encrypted document is decrypted with the empty password first. When no
 * creation date is available, the "customdate" value is parsed with the
 * pattern {@code MM/dd/yyyy}. Failures are logged and never thrown.
 *
 * @param file is a File object
 * @return the date as {@code year-month-day} (numbers are not zero-padded), or
 *         an empty string when the file cannot be read or carries no usable date
 */
public static String extractDate(File file) {
    PDDocument document = null;
    String creationDateMetaData = "";
    try {
        document = PDDocument.load(file.toString());

        if (document.isEncrypted()) {
            logger.info("File " + file.getName() + "is encrypted. Trying to decrypt...");
            try {
                document.decrypt("");
                document.setAllSecurityToBeRemoved(true);
                logger.info("File " + file.getName() + "successfully decrypted!");
            } catch (CryptographyException e) {
                // Metadata extraction is still attempted, as before.
                logger.info("Error decrypting file " + file.getName());
            }
        }

        PDDocumentInformation info = document.getDocumentInformation();
        Calendar calendar = info.getCreationDate();

        if (calendar == null || calendar.get(Calendar.YEAR) == 0) {
            // No usable creation date: fall back to the custom date, if there is one.
            calendar = null;
            String rawCustomDate = info.getCustomMetadataValue("customdate");
            if (rawCustomDate != null) {
                try {
                    Date customDate = new SimpleDateFormat("MM/dd/yyyy").parse(rawCustomDate);
                    calendar = Calendar.getInstance();
                    calendar.setTime(customDate);
                } catch (ParseException e) {
                    logger.info("Error parsing date from custom date");
                }
            }
        }

        if (calendar != null) {
            int creationYear = calendar.get(Calendar.YEAR);
            // Calendar months are zero-based.
            int creationMonth = calendar.get(Calendar.MONTH) + 1;
            int creationDate = calendar.get(Calendar.DATE);
            if (creationYear != 0) {
                creationDateMetaData = creationYear + "-" + creationMonth + "-" + creationDate;
            }
        }
    } catch (IOException e) {
        logger.info("Error processing file " + file.getName());
    } finally {
        // Close whenever the document was opened, including after a failed decryption.
        if (document != null) {
            try {
                document.close();
                logger.info("File " + file.getName() + " is closed successfully!");
            } catch (IOException e) {
                logger.info("Error closing file " + file.getName());
            }
        }
    }
    return creationDateMetaData;
}
From source file:com.tekstosense.segmenter.Main.java
License:Open Source License
private TextExtractor parsePdf(File f) throws IOException { PDDocument doc = PDDocument.load(f); if (doc.isEncrypted()) { // Some documents are encrypted with the empty password. Try // to decrypt with this password, or the one passed in on the // command line (if any), and fail if we can't. try {//from w ww . jav a 2s . c o m doc.setAllSecurityToBeRemoved(false); //doc.decrypt(password); // Defaults to the empty string. } catch (Exception e) { throw new IOException("Can't decrypt document: ", e); } } TextExtractor te = new TextExtractor(); te.writeText(doc, new OutputStreamWriter(new ByteArrayOutputStream())); return te; }
From source file:com.tekstosense.segmenter.StructurePdf.PdfSections.java
License:Open Source License
private TextExtractor parsePdf(File f) throws IOException { PDDocument doc = PDDocument.load(f); if (doc.isEncrypted()) { // Some documents are encrypted with the empty password. Try // to decrypt with this password, or the one passed in on the // command line (if any), and fail if we can't. try {/*from w w w .ja va2 s. c o m*/ doc.setAllSecurityToBeRemoved(false); // doc.decrypt(password); // Defaults to the empty string. } catch (Exception e) { throw new IOException("Can't decrypt document: ", e); } } TextExtractor te = new TextExtractor(); te.writeText(doc, new OutputStreamWriter(new ByteArrayOutputStream())); return te; }