List of usage examples for org.apache.pdfbox.pdmodel PDDocument isEncrypted
public boolean isEncrypted()
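isEncrypted() reports whether the loaded document carries an encryption dictionary, i.e. whether the original file was password protected. A minimal sketch of a typical check (assuming the PDFBox 2.x API; the file name is only a placeholder):

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;

public class EncryptionCheck {
    public static void main(String[] args) throws IOException {
        // "sample.pdf" is a placeholder path for this sketch.
        try (PDDocument document = PDDocument.load(new File("sample.pdf"))) {
            if (document.isEncrypted()) {
                System.out.println("Document is encrypted");
            } else {
                System.out.println("Document is not encrypted");
            }
        }
    }
}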
From source file:pdfsplicer.SplicerModel.java
License:Open Source License
/**
 * Create the new PDF, and save it.
 *
 * @param saveFile the file to save it as
 * @throws IOException if it cannot save the file
 */
public void makeFinalizedPDF(File saveFile) throws IOException {
    PDDocument doc = null;
    PDDocument newdoc = new PDDocument();
    for (int i = 0; i < pageEntryPDFList.size(); ++i) {
        doc = pdfList.get(pageEntryPDFList.get(i));
        if (doc.isEncrypted()) {
            System.out.println("Error: Encrypted PDF");
            System.exit(1);
        }
        List<Integer> pRange = pageRangeList.get(i);
        PDFCloneUtility pdfCloner = new PDFCloneUtility(newdoc);
        for (int pNum : pRange) {
            PDPage page = doc.getPage(pNum - 1);
            COSDictionary clonedDict = (COSDictionary) pdfCloner.cloneForNewDocument(page);
            newdoc.addPage(new PDPage(clonedDict));
        }
    }
    newdoc.save(saveFile);
    if (newdoc != null) {
        newdoc.close();
    }
}
From source file:pl.umk.mat.zawodyweb.pdf.PdfToImage.java
License:Open Source License
public static BufferedImage process(InputStream pdfFile) {
    PDDocument pdf = null;
    BufferedImage output = null;
    try {
        pdf = PDDocument.load(pdfFile, true);
        if (pdf.isEncrypted()) {
            pdf.decrypt("");
        }
        List<PDPage> pdfPages = pdf.getDocumentCatalog().getAllPages();
        if (pdfPages.isEmpty() == false) {
            Iterator<PDPage> it = pdfPages.iterator();
            PDPage page = it.next();
            BufferedImage bi = page.convertToImage(BufferedImage.TYPE_USHORT_565_RGB, 72 * 2);
            if (pdfPages.size() == 1) {
                output = bi;
            } else {
                int width = bi.getWidth();
                int height = bi.getHeight();
                output = new BufferedImage(width, height * pdfPages.size(), BufferedImage.TYPE_USHORT_565_RGB);
                Graphics2D g = output.createGraphics();
                g.drawImage(bi, 0, 0, null);
                g.setColor(Color.red);
                int pageNo = 0;
                while (it.hasNext()) {
                    ++pageNo;
                    page = it.next();
                    bi = page.convertToImage(BufferedImage.TYPE_USHORT_565_RGB, 72 * 2);
                    g.drawImage(bi, 0, pageNo * height, null);
                    g.drawLine(0, pageNo * height, width, pageNo * height);
                }
                g.dispose();
            }
        }
    } catch (Exception ex) {
        throw new RuntimeException("Exception converting pdf to image: ", ex);
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException ex) {
                throw new RuntimeException("Exception when closing pdf: ", ex);
            }
        }
    }
    return output;
}
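The decrypt("") call in the example above is the PDFBox 1.x way of opening a document protected only by an owner password; in PDFBox 2.x the password is supplied to load() instead. A minimal sketch of that newer pattern (assuming PDFBox 2.x; the class and method names are illustrative):

import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.pdmodel.PDDocument;

public class EncryptedPdfLoader {
    // Opens a possibly encrypted PDF by supplying the (empty) user password at load time;
    // isEncrypted() still reports whether the source document carried encryption.
    public static PDDocument loadWithEmptyPassword(InputStream in) throws IOException {
        PDDocument pdf = PDDocument.load(in, "");
        System.out.println("Encrypted source: " + pdf.isEncrypted());
        return pdf;
    }
}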
From source file:Project.data.preparation.ImageExtraction.java
public void extractImages(String sourceDir, String destinationDir)
        throws IOException, CryptographyException, COSVisitorException {
    PDDocument document = null;
    double[] size;
    if (oldFile.exists()) {
        document = PDDocument.load(sourceDir);
        if (document.isEncrypted()) {
            document.decrypt("");
        }
        PrintImageLocation printer;
        // Get image location
        List<PDPage> list = document.getDocumentCatalog().getAllPages();
        String fileName_img = oldFile.getName().replace(".pdf", "_cover");
        int pageNum = 0;
        int totalImages = 1;
        System.out.println("\n" + filename);
        for (PDPage page : list) {
            original_imgName = new ArrayList<String>();
            location_xy = new ArrayList<double[]>();
            size_xy_ordered = new ArrayList<double[]>();
            size_xy_tmp = new ArrayList<double[]>();
            PDResources pdResources = page.getResources();
            Map pageImages = pdResources.getXObjects();
            pageNum++;
            if (pageImages != null && pageImages.size() > 0) {
                Iterator imageIter = pageImages.keySet().iterator();
                while (imageIter.hasNext()) {
                    String key = (String) imageIter.next();
                    PDXObjectImage pdxObjectImage = (PDXObjectImage) pageImages.get(key);
                    String imgName = fileName_img + "_" + totalImages;
                    System.out.println("Page Number : " + pageNum + "\t" + imgName);
                    pdxObjectImage.write2file(destinationDir + imgName);
                    original_imgName.add(imgName + "." + pdxObjectImage.getSuffix());
                    size = new double[] { pdxObjectImage.getWidth(), pdxObjectImage.getHeight() };
                    size_xy_ordered.add(size);
                    totalImages++;
                }
                // Start for detect figure name for image renaming
                printer = new PrintImageLocation(page);
                location_xy = printer.getLocation_xy();
                size_xy_tmp = printer.getSize_xy();
                RearrangeImageOrder(location_xy, size_xy_tmp, size_xy_ordered);
                //PrinttoString();
                DetectFigureName detectFig = new DetectFigureName(original_imgName, filename, pageNum, page,
                        location_ordered, size_xy_ordered);
            }
        }
    } else {
        System.err.println("File not exists");
    }
    if (document != null) {
        document.close();
    }
}
From source file:summarizer.pdfReader.java
List<String> parsePdf(String filePath, int startPage, int endPage, boolean newLine) {
    PDDocument pd;
    BufferedWriter wr;
    List<String> outputStrings = new ArrayList<String>();
    try {
        File input = new File(filePath); // The PDF file from where you would like to extract
        File output = new File("SampleText.txt"); // The text file where you are going to store the extracted data
        pd = PDDocument.load(input);
        System.out.println(pd.getNumberOfPages());
        if (pd.isEncrypted()) {
            System.out.println("Error PDF is encrypted, cannot Parse");
        }
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(startPage); // Start extracting from startPage
        stripper.setEndPage(endPage); // Extract up to and including endPage
        wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));
        stripper.writeText(pd, wr);
        if (pd != null) {
            pd.close();
        }
        wr.close();
        BufferedReader in = new BufferedReader(new FileReader("SampleText.txt"));
        String s;
        StringBuilder sb = new StringBuilder();
        while ((s = in.readLine()) != null) {
            sb.append(" ");
            sb.append(s);
        }
        s = sb.toString();
        String[] tokenizedStrings;
        if (newLine) {
            tokenizedStrings = s.split("\\r?\\n");
        } else {
            tokenizedStrings = s.split("\\.");
        }
        for (String x : tokenizedStrings) {
            if (x.compareTo("") != 0)
                outputStrings.add(x);
        }
        //System.out.println(s);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return outputStrings;
}
From source file:uk.bl.dpt.qa.flint.wrappers.PDFBoxWrapper.java
License:Apache License
/**
 * Check if a PDF file has DRM or not
 * @param pFile file to check
 * @return whether the file has DRM or not
 */
public boolean hasDRM(File pFile) {
    boolean ret = false;
    File tmp = null;
    try {
        System.setProperty("org.apache.pdfbox.baseParser.pushBackSize", "1024768");
        // NOTE: we use loadNonSeq here as it is the latest parser
        // load() and parser.parse() have hung on test files
        tmp = File.createTempFile("flint-", ".tmp");
        tmp.deleteOnExit();
        RandomAccess scratchFile = new RandomAccessFile(tmp, "rw");
        PDDocument doc = PDDocument.loadNonSeq(new FileInputStream(pFile), scratchFile);
        ret = doc.isEncrypted();
        doc.close();
    } catch (IOException e) {
        // This may occur when a suitable security handler cannot be found
        if (e.getMessage().contains("BadSecurityHandlerException")) {
            // if this happens then there must be some sort of DRM here
            ret = true;
        }
    } catch (Exception e) {
        e.printStackTrace();
        // See comments in https://issues.apache.org/jira/browse/PDFBOX-1757
        // PDFBox state that these files have errors and their parser is correct.
        // The only way to find out that the parser doesn't like it is to catch
        // a general Exception.
        // If we reach this point then we have no idea of whether the file contains
        // DRM or not. Return false and hope it is detected elsewhere.
        ret = false;
    } finally {
        if (tmp != null)
            tmp.delete();
    }
    return ret;
}
From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser.java
License:Apache License
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    PDDocument pdfDocument = null;
    TemporaryResources tmp = new TemporaryResources();
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        // for unpacked / processed resources.
        // Decide which to do based on if we're reading from a file or not already.
        TikaInputStream tstream = TikaInputStream.cast(stream);
        pdfDocument = PDDocument.load(new CloseShieldInputStream(stream),
                MemoryUsageSetting.setupMixed(100 * 1024 * 1024));
        if (pdfDocument.isEncrypted()) {
            String password = null;
            // Did they supply a new style Password Provider?
            PasswordProvider passwordProvider = context.get(PasswordProvider.class);
            if (passwordProvider != null) {
                password = passwordProvider.getPassword(metadata);
            }
            // Fall back on the old style metadata if set
            if (password == null && metadata.get(PASSWORD) != null) {
                password = metadata.get(PASSWORD);
            }
            // If no password is given, use an empty string as the default
            if (password == null) {
                password = "";
            }
        }
        metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
        extractMetadata(pdfDocument, metadata);
        PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace,
                suppressDuplicateOverlappingText, sortByPosition);
    } catch (Exception e) {
        log.error("Exception while parsing PDF: " + e);
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
        tmp.dispose();
    }
}
From source file:uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser.java
License:Apache License
private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    addMetadata(metadata, Metadata.TITLE, info.getTitle());
    addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
    addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "pdf:creator", info.getCreator());
    addMetadata(metadata, "pdf:producer", info.getProducer());
    addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList(new String[] { "Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped" });
    if (info.getCOSObject() != null && info.getCOSObject().keySet() != null) {
        for (COSName key : info.getCOSObject().keySet()) {
            String name = key.getName();
            if (!handledMetadata.contains(name)) {
                addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            }
        }
    }

    // ANJ Extensions:
    //
    // Add other data of interest:
    metadata.set("pdf:version", "" + document.getDocument().getVersion());
    metadata.set("pdf:numPages", "" + document.getNumberOfPages());
    //metadata.set("pdf:cryptoMode", "" + getCryptoModeAsString(reader));
    //metadata.set("pdf:openedWithFullPermissions", "" + reader.isOpenedWithFullPermissions());
    metadata.set("pdf:encrypted", "" + document.isEncrypted());
    //metadata.set("pdf:metadataEncrypted", "" + document.isMetadataEncrypted());
    //metadata.set("pdf:128key", "" + reader.is128Key());
    //metadata.set("pdf:tampered", "" + reader.isTampered());
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            XMPMetadata xmp = XMPMetadata.load(document.getDocumentCatalog().getMetadata().exportXMPMetadata());
            // There is a special class for grabbing data in the PDF schema - not sure it will add much here:
            // Could parse xmp:CreatorTool and pdf:Producer etc. etc. out of here.
            XMPSchemaPDF pdfxmp = xmp.getPDFSchema();
            // Added a PDF/A schema class:
            xmp.addXMLNSMapping(XMPSchemaPDFA.NAMESPACE, XMPSchemaPDFA.class);
            XMPSchemaPDFA pdfaxmp = (XMPSchemaPDFA) xmp.getSchemaByClass(XMPSchemaPDFA.class);
            if (pdfaxmp != null) {
                metadata.set("pdfaid:part", pdfaxmp.getPart());
                metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase();
                //metadata.set("pdfa:version", version);
                metadata.set("pdf:version", version);
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        log.error("XMP Parsing failed: " + e);
        metadata.set("pdf:metadata-xmp-parse-failed", "" + e);
    }

    // Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    metadata.set("pdf:version", baseVersion + " Adobe Extension Level " + el);
                }
                // TODO WARN if this embedded version is inconsistent with document header version?
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
    // End Of ANJ Extensions.
}
From source file:zhaw.PDFIndexer.java
License:Apache License
/**
 * This will add the contents to the lucene document.
 *
 * @param document
 *            The document to add the contents to.
 * @param is
 *            The stream to get the contents from.
 * @param documentLocation
 *            The location of the document, used just for debug messages.
 * @throws IOException
 *             If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation) throws IOException {
    PDDocument pdfDocument = null;
    PDFTextStripper stripper;
    try {
        pdfDocument = PDDocument.load(is);
        if (pdfDocument.isEncrypted()) {
            // Just try using the default password and move on
            pdfDocument.decrypt("");
        }
        // create a writer where to append the text content.
        StringWriter writer = new StringWriter();
        stripper = new PDFTextStripper();
        try {
            stripper.writeText(pdfDocument, writer);
        } catch (Exception e) {
            System.out.println("Error in stripper.writeText()");
        }
        String contents = writer.getBuffer().toString();
        StringReader reader = new StringReader(contents);
        addTextField(document, Indexer.contents, reader);
        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null) {
            addTextField(document, Indexer.Author, info.getAuthor());
            try {
                addTextField(document, Indexer.created, info.getCreationDate());
            } catch (IOException io) {
                // ignore, bad date but continue with indexing
            }
            addTextField(document, Indexer.keywords, info.getKeywords());
            try {
                addTextField(document, Indexer.modified, info.getModificationDate());
            } catch (IOException io) {
                // ignore, bad date but continue with indexing
            }
            addTextField(document, "Subject", info.getSubject());
            addTextField(document, Indexer.Title, info.getTitle());
        }
        int summarySize = Math.min(contents.length(), 500);
        String summary = contents.substring(0, summarySize);
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        addUnindexedField(document, Indexer.summary, summary);
    } catch (CryptographyException e) {
        throw new IOException("Error decrypting document(" + documentLocation + "): " + e);
    } catch (InvalidPasswordException e) {
        // they didn't supply a password and the default of "" was wrong.
        throw new IOException(
                "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
    } finally {
        if (pdfDocument != null) {
            pdfDocument.close();
        }
    }
}