List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentCatalog
public PDDocumentCatalog getDocumentCatalog()
From source file:com.liferay.portlet.documentlibrary.util.PDFProcessorImpl.java
License:Open Source License
private void _generateImagesPB(FileVersion fileVersion, InputStream inputStream) throws Exception { boolean generatePreview = _isGeneratePreview(fileVersion); boolean generateThumbnail = _isGenerateThumbnail(fileVersion); PDDocument pdDocument = null; try {// ww w. j a v a 2 s . com pdDocument = PDDocument.load(inputStream); PDDocumentCatalog pdDocumentCatalog = pdDocument.getDocumentCatalog(); List<PDPage> pdPages = pdDocumentCatalog.getAllPages(); for (int i = 0; i < pdPages.size(); i++) { PDPage pdPage = pdPages.get(i); if (generateThumbnail && (i == 0)) { _generateImagesPB(fileVersion, pdPage, i); if (_log.isInfoEnabled()) { _log.info("PDFBox generated a thumbnail for " + fileVersion.getFileVersionId()); } } if (!generatePreview) { break; } _generateImagesPB(fileVersion, pdPage, i + 1); } if (_log.isInfoEnabled() && generatePreview) { _log.info("PDFBox generated " + getPreviewFileCount(fileVersion) + " preview pages for " + fileVersion.getFileVersionId()); } } finally { if (pdDocument != null) { pdDocument.close(); } } }
From source file:com.lp.server.system.ejbfac.SystemFacBean.java
License:Open Source License
/**
 * Converts every page of a PDF file into an image and returns the images as
 * byte arrays, one entry per page in page order.
 *
 * @param pdfFile    path of the PDF file to load
 * @param resolution resolution passed to {@code PDPage.convertToImage}
 * @return one byte array per page, as produced by {@code Helper.imageToByteArray}
 * @throws EJBExceptionLP with code {@code EJBExceptionLP.FEHLER} if loading,
 *         rendering or closing the document fails with an {@link IOException}
 */
public byte[][] konvertierePDFFileInEinzelneBilder(String pdfFile, int resolution) {
    PDDocument document = null;
    byte[][] oBilder = null;
    // Tracks whether the conversion finished, so that a failing close() in the
    // finally block never replaces an exception that is already propagating.
    boolean converted = false;
    try {
        document = PDDocument.load(pdfFile);
        List pages = document.getDocumentCatalog().getAllPages();
        oBilder = new byte[pages.size()][];
        for (int i = 0; i < pages.size(); i++) {
            PDPage page = (PDPage) pages.get(i);
            BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_RGB, resolution);
            oBilder[i] = Helper.imageToByteArray(image);
        }
        converted = true;
    } catch (IOException e) {
        e.printStackTrace();
        throw new EJBExceptionLP(EJBExceptionLP.FEHLER, e.getMessage());
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
                if (converted) {
                    // Nothing else failed, so the close error is the one to report.
                    throw new EJBExceptionLP(EJBExceptionLP.FEHLER, e.getMessage());
                }
                // Otherwise keep the original failure: throwing from finally
                // would discard the exception raised by the conversion itself.
            }
        }
    }
    return oBilder;
}
From source file:com.modemo.javase.signature.SigUtils.java
License:Apache License
/**
 * Reads the DocMDP access permission of a document.
 *
 * <p>The lookup walks catalog {@code /Perms} -> {@code /DocMDP} ->
 * {@code Reference} array and returns the {@code /P} value of the first
 * signature reference whose {@code TransformMethod} is {@code DocMDP} and
 * that carries a {@code TransformParams} dictionary. See the table "Entries
 * in the DocMDP transform parameters dictionary" in the PDF specification.
 *
 * @param doc document.
 * @return the permission value. 0 means no DocMDP transform parameters
 * dictionary exists. Other return values are 1, 2 or 3. 2 is also returned
 * if the dictionary is found but has no /P entry, or if the value is outside
 * the valid range.
 */
public static int getMDPPermission(PDDocument doc) {
    COSBase perms = doc.getDocumentCatalog().getCOSObject().getDictionaryObject(COSName.PERMS);
    if (!(perms instanceof COSDictionary)) {
        return 0;
    }

    COSBase docMdp = ((COSDictionary) perms).getDictionaryObject(COSName.DOCMDP);
    if (!(docMdp instanceof COSDictionary)) {
        return 0;
    }

    COSBase reference = ((COSDictionary) docMdp).getDictionaryObject("Reference");
    if (!(reference instanceof COSArray)) {
        return 0;
    }

    COSArray references = (COSArray) reference;
    for (int idx = 0; idx < references.size(); ++idx) {
        COSBase entry = references.getObject(idx);
        if (!(entry instanceof COSDictionary)) {
            continue;
        }

        COSDictionary sigRef = (COSDictionary) entry;
        if (!COSName.DOCMDP.equals(sigRef.getDictionaryObject("TransformMethod"))) {
            continue;
        }

        COSBase params = sigRef.getDictionaryObject("TransformParams");
        if (!(params instanceof COSDictionary)) {
            continue;
        }

        // A missing /P defaults to 2; out-of-range values are normalised to 2 as well.
        int permission = ((COSDictionary) params).getInt(COSName.P, 2);
        return (permission < 1 || permission > 3) ? 2 : permission;
    }

    return 0;
}
From source file:com.modemo.javase.signature.SigUtils.java
License:Apache License
/** * Set the access permissions granted for this document in the DocMDP transform parameters * dictionary. Details are described in the table "Entries in the DocMDP transform parameters * dictionary" in the PDF specification. * * @param doc The document./*w ww . j av a2 s .c o m*/ * @param signature The signature object. * @param accessPermissions The permission value (1, 2 or 3). */ static public void setMDPPermission(PDDocument doc, PDSignature signature, int accessPermissions) { COSDictionary sigDict = signature.getCOSObject(); // DocMDP specific stuff COSDictionary transformParameters = new COSDictionary(); transformParameters.setItem(COSName.TYPE, COSName.getPDFName("TransformParams")); transformParameters.setInt(COSName.P, accessPermissions); transformParameters.setName(COSName.V, "1.2"); transformParameters.setNeedToBeUpdated(true); COSDictionary referenceDict = new COSDictionary(); referenceDict.setItem(COSName.TYPE, COSName.getPDFName("SigRef")); referenceDict.setItem("TransformMethod", COSName.DOCMDP); referenceDict.setItem("DigestMethod", COSName.getPDFName("SHA1")); referenceDict.setItem("TransformParams", transformParameters); referenceDict.setNeedToBeUpdated(true); COSArray referenceArray = new COSArray(); referenceArray.add(referenceDict); sigDict.setItem("Reference", referenceArray); referenceArray.setNeedToBeUpdated(true); // Catalog COSDictionary catalogDict = doc.getDocumentCatalog().getCOSObject(); COSDictionary permsDict = new COSDictionary(); catalogDict.setItem(COSName.PERMS, permsDict); permsDict.setItem(COSName.DOCMDP, signature); catalogDict.setNeedToBeUpdated(true); permsDict.setNeedToBeUpdated(true); }
From source file:com.mycompany.mavenproject1.ragaiproject.PDFManipulation.java
public String[] getFieldNames(PDDocument pdfDocument, JList list) { int i = 0;// www. j a v a2 s. com String[] names = new String[10000]; PDDocumentCatalog docCatalog = pdfDocument.getDocumentCatalog(); PDAcroForm acroForm = docCatalog.getAcroForm(); java.util.List<PDField> fields = acroForm.getFields(); for (PDField field : fields) { names[i] = field.getPartialName(); this.arrayOfFieldNames[i] = names[i]; list.setListData(names); this.fieldLabelPairs.add(new Pair(field.getPartialName(), field.getValueAsString())); System.out.println(this.fieldLabelPairs.size()); i++; } return names; }
From source file:com.netsteadfast.greenstep.util.PdfConvertUtils.java
License:Apache License
@SuppressWarnings("unchecked") public static List<File> toImageFiles(File pdfFile, int resolution) throws Exception { PDDocument document = PDDocument.loadNonSeq(pdfFile, null); List<PDPage> pages = document.getDocumentCatalog().getAllPages(); File tmpDir = new File(Constants.getWorkTmpDir() + "/" + PdfConvertUtils.class.getSimpleName() + "/" + System.currentTimeMillis() + "/"); FileUtils.forceMkdir(tmpDir);//from ww w. j av a 2s . co m List<File> files = new LinkedList<File>(); int len = String.valueOf(pages.size() + 1).length(); for (int i = 0; i < pages.size(); i++) { String name = StringUtils.leftPad(String.valueOf(i + 1), len, "0"); BufferedImage bufImage = pages.get(i).convertToImage(BufferedImage.TYPE_INT_RGB, resolution); File imageFile = new File(tmpDir.getPath() + "/" + name + ".png"); FileOutputStream fos = new FileOutputStream(imageFile); ImageIOUtil.writeImage(bufImage, "png", fos, resolution); fos.flush(); fos.close(); files.add(imageFile); } document.close(); tmpDir = null; return files; }
From source file:com.odc.pdfextractor.parser.CleanPdfParser.java
License:Apache License
/** * This will print the documents docBuilder. * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. *///from w w w . ja v a 2 s. co m public DocumentLocation processPdf(String filename) throws Exception { PDDocument document = null; try { document = PDDocument.load(filename); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } List allPages = document.getDocumentCatalog().getAllPages(); System.out.print("Extracting text from PDF"); for (int i = 0; i < allPages.size(); i++) { PDPage page = (PDPage) allPages.get(i); System.out.print("."); PDStream contents = page.getContents(); if (contents != null) { this.processStream(page, page.findResources(), page.getContents().getStream()); } docBuilder.incrementPage(); } } finally { System.out.println(); if (document != null) { document.close(); } } return docBuilder.getDoc(); }
From source file:com.openkm.extractor.PdfTextExtractor.java
License:Open Source License
/** * {@inheritDoc}//from w w w .j a v a 2 s .c o m */ @SuppressWarnings("rawtypes") public String extractText(InputStream stream, String type, String encoding) throws IOException { try { PDFParser parser = new PDFParser(new BufferedInputStream(stream)); try { parser.parse(); PDDocument document = parser.getPDDocument(); if (document.isEncrypted()) { try { document.decrypt(""); document.setAllSecurityToBeRemoved(true); } catch (Exception e) { throw new IOException("Unable to extract text: document encrypted", e); } } CharArrayWriter writer = new CharArrayWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(document, writer); String st = writer.toString().trim(); log.debug("TextStripped: '{}'", st); if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) { log.warn("PDF does not contains text layer"); // Extract images from PDF StringBuilder sb = new StringBuilder(); if (!Config.SYSTEM_PDFIMAGES.isEmpty()) { File tmpPdf = FileUtils.createTempFile("pdf"); File tmpDir = new File(EnvironmentDetector.getTempDir()); String baseName = FileUtils.getFileName(tmpPdf.getName()); document.save(tmpPdf); int pgNum = 1; try { for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) { HashMap<String, Object> hm = new HashMap<String, Object>(); hm.put("fileIn", tmpPdf.getPath()); hm.put("firstPage", pgNum); hm.put("lastPage", pgNum++); hm.put("imageRoot", tmpDir + File.separator + baseName); String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm); ExecutionUtils.runCmd(cmd); for (File tmp : tmpDir.listFiles()) { if (tmp.getName().startsWith(baseName + "-")) { if (page.findRotation() > 0) { ImageUtils.rotate(tmp, tmp, page.findRotation()); } try { String txt = doOcr(tmp); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(tmp); } } } } } finally { FileUtils.deleteQuietly(tmpPdf); } } else { for (PDPage page : (List<PDPage>) 
document.getDocumentCatalog().getAllPages()) { PDResources resources = page.getResources(); Map<String, PDXObject> images = resources.getXObjects(); if (images != null) { for (String key : images.keySet()) { PDXObjectImage image = (PDXObjectImage) images.get(key); String prefix = "img-" + key + "-"; File pdfImg = null; try { pdfImg = File.createTempFile(prefix, ".png"); log.debug("Writing image: {}", pdfImg.getPath()); // Won't work until PDFBox 1.8.9 ImageIO.write(image.getRGBImage(), "png", pdfImg); if (page.findRotation() > 0) { ImageUtils.rotate(pdfImg, pdfImg, page.findRotation()); } // Do OCR String txt = doOcr(pdfImg); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); } finally { FileUtils.deleteQuietly(pdfImg); } } } } } return sb.toString(); } else { return writer.toString(); } } finally { try { PDDocument doc = parser.getPDDocument(); if (doc != null) { doc.close(); } } catch (IOException e) { // ignore } } } catch (Exception e) { // it may happen that PDFParser throws a runtime // exception when parsing certain pdf documents log.warn("Failed to extract PDF text content", e); throw new IOException(e.getMessage(), e); } finally { stream.close(); } }
From source file:com.opensearchserver.extractor.parser.PdfBox.java
License:Apache License
private void extractMetaData(PDDocument pdf) throws IOException { PDDocumentInformation info = pdf.getDocumentInformation(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(SUBJECT, info.getSubject()); metas.add(AUTHOR, info.getAuthor()); metas.add(PRODUCER, info.getProducer()); metas.add(KEYWORDS, info.getKeywords()); metas.add(CREATION_DATE, getDate(getCreationDate(info))); metas.add(MODIFICATION_DATE, getModificationDate(info)); }//from ww w . j av a2 s . c o m int pages = pdf.getNumberOfPages(); metas.add(NUMBER_OF_PAGES, pages); PDDocumentCatalog catalog = pdf.getDocumentCatalog(); if (catalog != null) metas.add(LANGUAGE, catalog.getLanguage()); }
From source file:com.qwazr.library.pdfbox.PdfBoxParser.java
License:Apache License
/**
 * Copies the document-level metadata of a PDF into the given field builder.
 *
 * <p>The MIME type is set to the first entry of {@code DEFAULT_MIMETYPES}.
 * When the document has an information dictionary, its title, subject,
 * author, producer, keywords and creation/modification dates are added.
 * The page count is always added, and the catalog language is added when a
 * catalog is available.
 *
 * @param pdf   the document to read the metadata from
 * @param metas the builder receiving the metadata fields
 */
private void extractMetaData(final PDDocument pdf, final ParserFieldsBuilder metas) {
    metas.set(MIME_TYPE, DEFAULT_MIMETYPES[0]);

    final PDDocumentInformation docInfo = pdf.getDocumentInformation();

    if (docInfo != null) {
        metas.add(TITLE, docInfo.getTitle());
        metas.add(SUBJECT, docInfo.getSubject());
        metas.add(AUTHOR, docInfo.getAuthor());
        metas.add(PRODUCER, docInfo.getProducer());
        metas.add(KEYWORDS, docInfo.getKeywords());
        metas.add(CREATION_DATE, docInfo.getCreationDate());
        metas.add(MODIFICATION_DATE, docInfo.getModificationDate());
    }

    int pageCount = pdf.getNumberOfPages();
    metas.add(NUMBER_OF_PAGES, pageCount);

    PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();

    if (docCatalog != null) {
        metas.add(LANGUAGE, docCatalog.getLanguage());
    }
}