List of usage examples for org.apache.pdfbox.pdmodel.PDDocument#isEncrypted()
public boolean isEncrypted()
From source file:org.seasar.robot.extractor.impl.PdfExtractor.java
License:Apache License
@Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) { throw new RobotSystemException("The inputstream is null."); }// w ww .j a v a 2 s.com synchronized (pdfBoxLockObj) { PDDocument document = null; try { document = PDDocument.load(in, null, force); if (document.isEncrypted() && params != null) { String password = params.get(ExtractData.PDF_PASSWORD); if (password == null) { password = getPassword(params.get(ExtractData.URL), params.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); } if (password != null) { final StandardDecryptionMaterial sdm = new StandardDecryptionMaterial(password); document.openProtection(sdm); final AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text."); } } } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Writer output = new OutputStreamWriter(baos, encoding); final PDFTextStripper stripper = new PDFTextStripper(encoding); stripper.setForceParsing(force); final AtomicBoolean done = new AtomicBoolean(false); final PDDocument doc = document; final Set<Exception> exceptionSet = new HashSet<>(); Thread task = new Thread(new Runnable() { @Override public void run() { try { stripper.writeText(doc, output); } catch (Exception e) { exceptionSet.add(e); } finally { done.set(true); } } }); task.setDaemon(true); task.start(); task.join(timeout); if (!done.get()) { for (int i = 0; i < 100 && !done.get(); i++) { task.interrupt(); Thread.sleep(50); } throw new ExtractException("PDFBox process cannot finish in " + timeout + " sec."); } else if (!exceptionSet.isEmpty()) { throw exceptionSet.iterator().next(); } output.flush(); final ExtractData extractData = new ExtractData(baos.toString(encoding)); extractMetadata(document, extractData); return extractData; } catch (final Exception e) { throw new ExtractException(e); } finally { if (document != null) { try { document.close(); } 
catch (final IOException e) { // NOP } } } } }
From source file:org.swiftexplorer.gui.preview.PdfPanel.java
License:Apache License
/**
 * Replaces the preview images with renderings of the given document's pages and closes
 * the document afterwards.
 *
 * <p>The current images are always discarded first. Nothing more happens for a
 * {@code null} document. An encrypted document is logged and closed without being
 * rendered. Otherwise pages are rendered in order until the image list has reached
 * {@code maxPageToPreview} entries, and the panel is repainted.
 *
 * @param pdf the document to preview, may be {@code null}; it is closed by this method
 */
public synchronized void setPdf(PDDocument pdf) {
    listImagePages.clear();
    if (pdf == null) {
        return;
    }

    try {
        if (pdf.isEncrypted()) {
            logger.info("Failed attempt at previewing an encrypted PDF");
            return;
        }

        @SuppressWarnings("unchecked")
        List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
        if (allPages != null) {
            for (PDPage current : allPages) {
                listImagePages.add(current.convertToImage());
                // Checked after adding, so at least one page is rendered.
                if (listImagePages.size() >= maxPageToPreview) {
                    break;
                }
            }
        }
    } catch (IOException e) {
        logger.error("Error occurred while opening the pdf document", e);
    } finally {
        try {
            pdf.close();
        } catch (IOException ex) {
            logger.error("Error occurred while closing the pdf document", ex);
        }
    }

    repaint();
}
From source file:org.terrier.indexing.PDFDocument.java
License:Mozilla Public License
/** * Returns the reader of text, which is suitable for parsing terms out of, * and which is created by converting the file represented by * parameter docStream. This method involves running the stream * through the PDFParser etc provided in the org.pdfbox library. * On error, it returns null, and sets EOD to true, so no terms * can be read from this document.//from w w w .j av a 2 s . c o m * @param is the input stream that represents the document's file. * @return Reader a reader that is fed to an indexer. */ protected Reader getReader(InputStream is) { if ((Files.length(filename) / 1048576) > 300) { logger.info("Skipping document " + filename + " because it's size exceeds 300Mb"); return new StringReader(""); } PDDocument pdfDocument = null; Reader rtr = null; try { pdfDocument = PDDocument.load(is); if (pdfDocument.isEncrypted()) { //Just try using the default password and move on pdfDocument.decrypt(""); } //create a writer where to append the text content. StringWriter writer = new StringWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(pdfDocument, writer); String contents = writer.getBuffer().toString(); int spaceCount = StringUtils.countMatches(contents, " "); for (char badChar : new char[] { '\u00A0', '\u2029', '#' }) { final int count = StringUtils.countMatches(contents, "" + badChar); if (count > spaceCount / 2) { contents = contents.replace(badChar, ' '); spaceCount += count; } } rtr = new StringReader(contents); PDDocumentInformation info = pdfDocument.getDocumentInformation(); if (info != null && USE_PDF_TITLE) { setProperty("title", info.getTitle()); } else { setProperty("title", new java.io.File(super.filename).getName()); } } catch (CryptographyException e) { throw new RuntimeException("Error decrypting PDF document: " + e); } catch (InvalidPasswordException e) { //they didn't suppply a password and the default of "" was wrong. 
throw new RuntimeException("Error: The PDF document is encrypted and will not be indexed."); } catch (Exception e) { throw new RuntimeException("Error extracting PDF document", e); } finally { if (pdfDocument != null) { try { pdfDocument.close(); } catch (IOException ioe) { } } } return rtr; }
From source file:org.vesalainen.ham.pdf.RfaxTest.java
License:Open Source License
public void test() throws IOException { PDDocument document = PDDocument.load(new File("rfax.pdf")); if (!document.isEncrypted()) { PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(document); try (BufferedWriter bw = Files.newBufferedWriter(Paths.get("src", "main", "resources", "rfax.txt"))) { bw.write(text);//from ww w.j a va 2 s .co m } } document.close(); }
From source file:org.xstudiosys.pdfxmp.AddMetadataFromDocInfo.java
License:Apache License
/**
 * Builds XMP metadata from a PDF's document information and saves the result.
 *
 * <p>The document named by {@code args[0]} is loaded, its keywords, producer, dates,
 * creator, title and subject are copied into PDF, Basic and Dublin Core XMP schemas, the
 * metadata is attached to the document catalog, and the document is saved to
 * {@code args[1]}. Encrypted documents are rejected with exit status 1.
 *
 * @param args The command line arguments: input file, then output file.
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
        return;
    }

    final PDDocument doc = PDDocument.load(args[0]);
    try {
        if (doc.isEncrypted()) {
            System.err.println("Error: Cannot add metadata to encrypted document.");
            System.exit(1);
        }
        final PDDocumentCatalog docCatalog = doc.getDocumentCatalog();
        final PDDocumentInformation docInfo = doc.getDocumentInformation();

        final XMPMetadata xmp = new XMPMetadata();

        // PDF schema: keywords and producer.
        final XMPSchemaPDF pdf = xmp.addPDFSchema();
        pdf.setKeywords(docInfo.getKeywords());
        pdf.setProducer(docInfo.getProducer());

        // Basic schema: dates and creating tool; the metadata date is "now".
        final XMPSchemaBasic basic = xmp.addBasicSchema();
        basic.setModifyDate(docInfo.getModificationDate());
        basic.setCreateDate(docInfo.getCreationDate());
        basic.setCreatorTool(docInfo.getCreator());
        basic.setMetadataDate(new GregorianCalendar());

        // Dublin Core schema: title, creator and description.
        final XMPSchemaDublinCore dublinCore = xmp.addDublinCoreSchema();
        dublinCore.setTitle(docInfo.getTitle());
        dublinCore.addCreator("PDFBox");
        dublinCore.setDescription(docInfo.getSubject());

        final PDMetadata xmpStream = new PDMetadata(doc);
        xmpStream.importXMPMetadata(xmp);
        docCatalog.setMetadata(xmpStream);

        doc.save(args[1]);
    } finally {
        doc.close();
    }
}
From source file:org.xstudiosys.pdfxmp.XMPUtil.java
License:Open Source License
/** * Try to read the given BibTexEntry from the XMP-stream of the given * inputstream containing a PDF-file.//from www.ja va 2 s . com * * @param inputStream * The inputstream to read from. * * @throws IOException * Throws an IOException if the file cannot be read, so the user * than remove a lock or cancel the operation. */ @SuppressWarnings("unchecked") public static List<BibtexEntry> readXMP(InputStream inputStream) throws IOException { List<BibtexEntry> result = new LinkedList<BibtexEntry>(); PDDocument document = null; try { document = PDDocument.load(inputStream); if (document.isEncrypted()) { throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document."); } XMPMetadata meta = getXMPMetadata(document); // If we did not find any XMP metadata, search for non XMP metadata if (meta != null) { List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE); for (XMPSchema schema : schemas) { XMPSchemaBibtex bib = (XMPSchemaBibtex) schema; result.add(bib.getBibtexEntry()); } // If we did not find anything have a look if a Dublin Core exists if (result.size() == 0) { schemas = meta.getSchemasByNamespaceURI(XMPSchemaDublinCore.NAMESPACE); for (XMPSchema schema : schemas) { XMPSchemaDublinCore dc = (XMPSchemaDublinCore) schema; BibtexEntry entry = getBibtexEntryFromDublinCore(dc); if (entry != null) result.add(entry); } } } if (result.size() == 0) { BibtexEntry entry = getBibtexEntryFromDocumentInformation(document.getDocumentInformation()); if (entry != null) result.add(entry); } } finally { if (document != null) document.close(); } // return null, if no metadata was found if (result.size() == 0) return null; return result; }
From source file:org.xstudiosys.pdfxmp.XMPUtil.java
License:Open Source License
/** * Will read the XMPMetadata from the given pdf file, closing the file * afterwards./*from www .j av a 2 s . c om*/ * * @param inputStream * The inputStream representing a PDF-file to read the * XMPMetadata from. * @return The XMPMetadata object found in the file or null if none is * found. * @throws IOException */ public static XMPMetadata readRawXMP(InputStream inputStream) throws IOException { PDDocument document = null; try { document = PDDocument.load(inputStream); if (document.isEncrypted()) { throw new EncryptionNotSupportedException("Error: Cannot read metadata from encrypted document."); } return getXMPMetadata(document); } finally { if (document != null) document.close(); } }
From source file:org.xstudiosys.pdfxmp.XMPUtil.java
License:Open Source License
/** * Try to write the given BibTexEntry in the XMP-stream of the given * PDF-file./*from ww w . j av a2s. c o m*/ * * Throws an IOException if the file cannot be read or written, so the user * can remove a lock or cancel the operation. * * The method will overwrite existing BibTeX-XMP-data, but keep other * existing metadata. * * @param file * The file to write the entries to. * @param bibtexEntries * The entries to write to the file. * * @param database * maybenull An optional database which the given bibtex entries * belong to, which will be used to resolve strings. If the * database is null the strings will not be resolved. * @param writePDFInfo * Write information also in PDF document properties * @throws TransformerException * If the entry was malformed or unsupported. * @throws IOException * If the file could not be written to or could not be found. */ @SuppressWarnings("unchecked") public static void writeXMP(File file, Collection<BibtexEntry> bibtexEntries, BibtexDatabase database, boolean writePDFInfo) throws IOException, TransformerException { if (database != null) bibtexEntries = database.resolveForStrings(bibtexEntries, false); PDDocument document = null; try { document = PDDocument.load(file.getAbsoluteFile()); if (document.isEncrypted()) { throw new EncryptionNotSupportedException("Error: Cannot add metadata to encrypted document."); } if (writePDFInfo && bibtexEntries.size() == 1) { writeDocumentInformation(document, bibtexEntries.iterator().next(), null); writeDublinCore(document, bibtexEntries, null); } PDDocumentCatalog catalog = document.getDocumentCatalog(); PDMetadata metaRaw = catalog.getMetadata(); XMPMetadata meta; if (metaRaw != null) { meta = new XMPMetadata(XMLUtil.parse(metaRaw.createInputStream())); } else { meta = new XMPMetadata(); } meta.addXMLNSMapping(XMPSchemaBibtex.NAMESPACE, XMPSchemaBibtex.class); // Remove all current Bibtex-schemas List<XMPSchema> schemas = meta.getSchemasByNamespaceURI(XMPSchemaBibtex.NAMESPACE); for 
(XMPSchema schema : schemas) { XMPSchemaBibtex bib = (XMPSchemaBibtex) schema; bib.getElement().getParentNode().removeChild(bib.getElement()); } for (BibtexEntry e : bibtexEntries) { XMPSchemaBibtex bibtex = new XMPSchemaBibtex(meta); meta.addSchema(bibtex); bibtex.setBibtexEntry(e, null); } // Save to stream and then input that stream to the PDF ByteArrayOutputStream os = new ByteArrayOutputStream(); meta.save(os); ByteArrayInputStream is = new ByteArrayInputStream(os.toByteArray()); PDMetadata metadataStream = new PDMetadata(document, is, false); catalog.setMetadata(metadataStream); // Save try { document.save(file.getAbsolutePath()); } catch (COSVisitorException e) { throw new TransformerException("Could not write XMP-metadata: " + e.getLocalizedMessage()); } } finally { if (document != null) { document.close(); } } }
From source file:pdfedittest.PDFEditTest.java
/** * @param args the command line arguments */// w ww.j a v a 2 s. com public static void main(String[] args) { // TODO code application logic here PDDocument pd; BufferedWriter wr; try { String outputPath = "rayani.txt"; File input = new File("C:\\Users\\Administrator\\Desktop\\FA Feb 16.pdf"); // The PDF file from where you would like to extract File output = new File(outputPath); // The text file where you are going to store the extracted data pd = PDDocument.load(input); System.out.println(pd.getNumberOfPages()); System.out.println(pd.isEncrypted()); //pd.save("CopyOfInvoice.pdf"); // Creates a copy called "CopyOfInvoice.pdf" PDFTextStripper stripper = new PDFTextStripper(); //stripper.setStartPage(3); //Start extracting from page 3 //stripper.setEndPage(5); //Extract till page 5 //stripper.set wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output))); stripper.writeText(pd, wr); if (pd != null) { pd.close(); } // I use close() to flush the stream. wr.close(); TextParser a = new TextParser(outputPath); a.getUserDetail(""); } catch (Exception e) { e.printStackTrace(); } }
From source file:pdfpositional.PdfPositional.java
/** * @param args the command line arguments */// w ww . j a v a 2 s .c o m public static void main(String[] args) { try { // check file param if (args.length == 0) { throw new ParameterException("No file parameter specified"); } String file = args[args.length - 1]; Pattern patternFile = Pattern.compile("(?i)^[\\w,\\s-()/]+\\.pdf$"); Matcher matcherFile = patternFile.matcher(file); // check file is valid format if (!matcherFile.find()) { throw new ParameterException("File parameter invalid: " + file); } // check if file exists File input = new File(file); if (!input.exists()) { throw new ParameterException("File does not exist: " + file); } // ensure it isnt a directory if (input.isDirectory()) { throw new ParameterException("File is a directory: " + file); } PdfPositional pdfPositional = new PdfPositional(input); pdfPositional.setConversion(new Float(1.388888888889)); pdfPositional.processFileArgument(args[args.length - 1]); Pattern patternArgument = Pattern.compile("^-{2}([^=]+)[=]([\\s\\S]+)$"); Matcher matcher; for (int i = 0; i < args.length - 1; i++) { matcher = patternArgument.matcher(args[i]); while (matcher.find()) { switch (matcher.group(1)) { case "page": pdfPositional.setPageNumber(Integer.parseInt(matcher.group(2))); break; case "output": pdfPositional.setOutputFile(matcher.group(2)); break; } } } PDDocument document; document = PDDocument.load(pdfPositional.getInputFile()); // check for encrypted document if (document.isEncrypted()) { try { document.decrypt(""); } catch (CryptographyException | IOException e) { document.close(); throw new EncryptedDocumentException(); } } List allPages = document.getDocumentCatalog().getAllPages(); if (pdfPositional.hasPageNumber()) { if (document.getNumberOfPages() < pdfPositional.getPageNumber()) { throw new ParameterException("illegal page number"); } PDPage page = (PDPage) allPages.get(pdfPositional.getPageNumber() - 1); PDStream contents = page.getContents(); if (contents != null) { 
pdfPositional.processStream(page, page.findResources(), page.getContents().getStream()); pdfPositional.addPageDataToPdfData(); pdfPositional.writeJSONToOutputStream(); } } else { for (int i = 0; i < allPages.size(); i++) { pdfPositional.setPageNumber(i + 1); PDPage page = (PDPage) allPages.get(i); PDStream contents = page.getContents(); if (contents != null) { pdfPositional.processStream(page, page.findResources(), page.getContents().getStream()); pdfPositional.addPageDataToPdfData(); pdfPositional.writeJSONToOutputStream(); } page.clear(); } } pdfPositional.destroyOutputStream(); document.close(); System.exit(0); } catch (ParameterException ex) { System.out.println("Parameter Error: " + ex.getMessage()); System.exit(1); } catch (EncryptedDocumentException ex) { System.out.println("Encrypted Document Error"); System.exit(1); } catch (IOException | NumberFormatException ex) { System.out.println("General Error"); System.exit(1); } }