List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentCatalog
public PDDocumentCatalog getDocumentCatalog()
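getDocumentCatalog() returns the PDDocumentCatalog, the root of the document's object hierarchy, from which the pages, the AcroForm, the outline, and the XMP metadata stream are reached. Before the project-specific examples below, here is a minimal, self-contained sketch of a typical call, assuming the PDFBox 2.x API and a placeholder file name:

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;

public class GetDocumentCatalogExample {
    public static void main(String[] args) throws IOException {
        // "sample.pdf" is a placeholder path
        try (PDDocument document = PDDocument.load(new File("sample.pdf"))) {
            // The catalog is the root of the document's object hierarchy
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            System.out.println("Pages: " + document.getNumberOfPages());
            System.out.println("Has AcroForm: " + (catalog.getAcroForm() != null));
        }
    }
}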
From source file: org.apache.tika.parser.pdf.PDFPureJavaParser.java
License: Apache License

private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFPureJavaParserConfig config) {
    if (config.getIfXFAExtractOnlyXFA() && pdDocument.getDocumentCatalog() != null
            && pdDocument.getDocumentCatalog().getAcroForm() != null
            && pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
        return true;
    }
    return false;
}
From source file: org.apache.tika.parser.pdf.PDFPureJavaParser.java
License: Apache License

private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata,
        ParseContext context) throws SAXException, IOException, TikaException {
    XFAExtractor ex = new XFAExtractor();
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try (InputStream is = new ByteArrayInputStream(
            pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) {
        ex.extract(is, xhtml, metadata, context);
    } catch (XMLStreamException e) {
        throw new TikaException("XML error in XFA", e);
    }
    xhtml.endDocument();
}
From source file: org.apache.tika.parser.pdf18.PDFParser.java
License: Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + baseVersion
                                        + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file: org.argrr.extractor.gdrive.downloader.ChartsDownloader.java
License: Open Source License

public static void extractPictures(String path, String fileName) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(path + "/" + fileName + ".pdf");
    } catch (IOException ex) {
        System.out.println("" + ex);
        throw ex; // rethrow: there is no document to process
    }
    List pages = document.getDocumentCatalog().getAllPages();
    Iterator iter = pages.iterator();
    int i = 1;
    while (iter.hasNext()) {
        PDPage page = (PDPage) iter.next();
        PDResources resources = page.getResources();
        Map pageImages = resources.getImages();
        if (pageImages != null) {
            Iterator imageIter = pageImages.keySet().iterator();
            while (imageIter.hasNext()) {
                String key = (String) imageIter.next();
                PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                image.write2file(ChartsDownloader.rootOutputPathCharts + "/" + fileName + "-" + i);
                i++;
            }
        }
    }
}
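Note: this snippet (and several below) targets the PDFBox 1.x API (getAllPages(), PDResources.getImages(), PDXObjectImage), which was removed in PDFBox 2.x. A minimal sketch of the same image extraction against the PDFBox 2.x API, with placeholder file names, might look like this:

import java.io.File;
import java.io.IOException;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

public class ExtractImagesPdfBox2 {
    public static void main(String[] args) throws IOException {
        int i = 1;
        // "input.pdf" is a placeholder path
        try (PDDocument document = PDDocument.load(new File("input.pdf"))) {
            for (PDPage page : document.getPages()) {
                PDResources resources = page.getResources();
                for (COSName name : resources.getXObjectNames()) {
                    PDXObject xobject = resources.getXObject(name);
                    if (xobject instanceof PDImageXObject) {
                        // Decode the image and write it out as PNG
                        PDImageXObject image = (PDImageXObject) xobject;
                        ImageIO.write(image.getImage(), "png", new File("image-" + i + ".png"));
                        i++;
                    }
                }
            }
        }
    }
}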
From source file: org.dspace.disseminate.CitationDocument.java
License: BSD License

private void addCoverPageToDocument(PDDocument document, PDDocument sourceDocument, PDPage coverPage) {
    List<PDPage> sourcePageList = sourceDocument.getDocumentCatalog().getAllPages();

    if (isCitationFirstPage()) {
        //citation as cover page
        document.addPage(coverPage);
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
    } else {
        //citation as tail page
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
        document.addPage(coverPage);
    }
    sourcePageList.clear();
}
From source file: org.dspace.disseminate.CitationDocumentServiceImpl.java
License: BSD License

protected void addCoverPageToDocument(PDDocument document, PDDocument sourceDocument, PDPage coverPage) {
    PDPageTree sourcePageList = sourceDocument.getDocumentCatalog().getPages();

    if (isCitationFirstPage()) {
        //citation as cover page
        document.addPage(coverPage);
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
    } else {
        //citation as tail page
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
        document.addPage(coverPage);
    }
}
From source file: org.exoplatform.services.document.impl.PDFDocumentReader.java
License: Open Source License

public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }
                PDDocument pdDocument = PDDocument.load(is);
                Properties props = new Properties();
                try {
                    if (pdDocument.isEncrypted()) {
                        try {
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }

                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();

                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null)
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null)
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getCreators() != null) {
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDates() != null) {
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (pdf.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            try {
                                if (basic.getCreateDate() != null)
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (basic.getModifyDate() != null)
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }

                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();
                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null)
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null)
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getModificationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null)
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null)
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    if (pdDocument != null) {
                        pdDocument.close();
                    }
                    if (is != null) {
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });
    } catch (PrivilegedActionException pae) {
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}
From source file: org.freeeed.ocr.PDFImageExtractor.java
License: Apache License

@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();
    List<String> result = new ArrayList<String>();
    PDDocument document = null;
    try {
        document = PDDocument.load(file);
        List pages = document.getDocumentCatalog().getAllPages();
        Iterator iter = pages.iterator();
        int i = 1;
        int maxNumberOfImages = Project.getCurrentProject().getOcrMaxImagesPerPDF();
        while (iter.hasNext()) {
            PDPage page = (PDPage) iter.next();
            PDResources resources = page.getResources();
            Map pageImages = resources.getImages();
            if (pageImages != null) {
                Iterator imageIter = pageImages.keySet().iterator();
                while (imageIter.hasNext()) {
                    if (i > maxNumberOfImages) {
                        return result;
                    }
                    String key = (String) imageIter.next();
                    PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                    String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                    image.write2file(fileName);
                    result.add(fileName + "." + image.getSuffix());
                    i++;
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // close the document even on the early return above
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                // ignore close failure
            }
        }
    }
    return result;
}
From source file: org.ghost4j.document.PDFDocument.java
License: LGPL

public Document extract(int begin, int end) throws DocumentException {
    this.assertValidPageRange(begin, end);
    PDFDocument result = new PDFDocument();
    ByteArrayInputStream bais = null;
    ByteArrayOutputStream baos = null;

    if (content != null) {
        PDDocument document = new PDDocument();
        try {
            bais = new ByteArrayInputStream(content);
            baos = new ByteArrayOutputStream();
            PDDocument inputPDF = PDDocument.load(bais);
            while (begin <= end) {
                document.addPage((PDPage) inputPDF.getDocumentCatalog().getAllPages().get(begin - 1));
                begin++;
            }
            document.save(baos);
            document.close();
            result.load(new ByteArrayInputStream(baos.toByteArray()));
        } catch (Exception e) {
            throw new DocumentException(e);
        } finally {
            IOUtils.closeQuietly(bais);
            IOUtils.closeQuietly(baos);
        }
    }
    return result;
}
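For page-range extraction like the above, PDFBox also ships a PageExtractor helper (org.apache.pdfbox.util.PageExtractor in 1.8, org.apache.pdfbox.multipdf.PageExtractor in 2.x). A minimal sketch using the 2.x package, with placeholder file names and page range:

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.multipdf.PageExtractor;
import org.apache.pdfbox.pdmodel.PDDocument;

public class ExtractPageRange {
    public static void main(String[] args) throws IOException {
        // "input.pdf", "extracted.pdf", and the range 2..5 are placeholders
        try (PDDocument source = PDDocument.load(new File("input.pdf"))) {
            // start and end pages are 1-based and inclusive
            PDDocument extracted = new PageExtractor(source, 2, 5).extract();
            try {
                extracted.save("extracted.pdf");
            } finally {
                extracted.close();
            }
        }
    }
}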
From source file: org.ghost4j.document.PDFDocument.java
License: LGPL

@Override
public void append(Document document) throws DocumentException {
    super.append(document);
    ByteArrayOutputStream baos = null;
    PDDocument mergedDocument = new PDDocument();
    try {
        baos = new ByteArrayOutputStream();

        // copy current document
        ByteArrayInputStream bais = new ByteArrayInputStream(content);
        PDDocument pDocument = PDDocument.load(bais);
        int pageCount = pDocument.getNumberOfPages();
        for (int i = 0; i < pageCount; i++) {
            mergedDocument.addPage((PDPage) pDocument.getDocumentCatalog().getAllPages().get(i));
        }

        // copy new document
        ByteArrayInputStream baisNewDoc = new ByteArrayInputStream(document.getContent());
        PDDocument pNewDocument = PDDocument.load(baisNewDoc);
        pageCount = pNewDocument.getNumberOfPages();
        for (int i = 0; i < pageCount; i++) {
            mergedDocument.addPage((PDPage) pNewDocument.getDocumentCatalog().getAllPages().get(i));
        }

        mergedDocument.save(baos);
        mergedDocument.close();

        // replace content with new content
        content = baos.toByteArray();
    } catch (Exception e) {
        throw new DocumentException(e);
    } finally {
        IOUtils.closeQuietly(baos);
    }
}
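Appending one document to another, as above, can also be done with PDFBox's PDFMergerUtility, which imports pages into the destination instead of copying PDPage objects page by page. A minimal sketch (2.x package, placeholder file names):

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;

public class AppendWithMerger {
    public static void main(String[] args) throws IOException {
        // "first.pdf", "second.pdf", and "merged.pdf" are placeholders
        try (PDDocument target = PDDocument.load(new File("first.pdf"));
                PDDocument addition = PDDocument.load(new File("second.pdf"))) {
            // Appends the pages of "addition" onto the end of "target"
            new PDFMergerUtility().appendDocument(target, addition);
            target.save("merged.pdf");
        }
    }
}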