List of usage examples for org.apache.pdfbox.pdmodel PDDocument getDocumentCatalog
public PDDocumentCatalog getDocumentCatalog()
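getDocumentCatalog() returns the PDDocumentCatalog, the root of the document's object hierarchy, from which the pages, the AcroForm, the outline, and the XMP metadata stream are reached. Before the project-specific examples below, here is a minimal, self-contained sketch of a typical call, assuming the PDFBox 2.x API and a placeholder file name:

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;

public class GetDocumentCatalogExample {
    public static void main(String[] args) throws IOException {
        // "sample.pdf" is a placeholder path
        try (PDDocument document = PDDocument.load(new File("sample.pdf"))) {
            // The catalog is the root of the document's object hierarchy
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            System.out.println("Pages: " + document.getNumberOfPages());
            System.out.println("Has AcroForm: " + (catalog.getAcroForm() != null));
        }
    }
}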
From source file: org.apache.tika.parser.pdf.PDFPureJavaParser.java
License: Apache License

private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFPureJavaParserConfig config) {
    if (config.getIfXFAExtractOnlyXFA() && pdDocument.getDocumentCatalog() != null
            && pdDocument.getDocumentCatalog().getAcroForm() != null
            && pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
        return true;
    }
    return false;
}
From source file: org.apache.tika.parser.pdf.PDFPureJavaParser.java
License: Apache License

private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata,
        ParseContext context) throws SAXException, IOException, TikaException {
    XFAExtractor ex = new XFAExtractor();
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try (InputStream is = new ByteArrayInputStream(
            pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) {
        ex.extract(is, xhtml, metadata, context);
    } catch (XMLStreamException e) {
        throw new TikaException("XML error in XFA", e);
    }
    xhtml.endDocument();
}
From source file: org.apache.tika.parser.pdf18.PDFParser.java
License: Apache License

private void extractMetadata(PDDocument document, Metadata metadata) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
            Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    org.apache.jempbox.xmp.XMPMetadata xmp = null;
    XMPSchemaDublinCore dcSchema = null;
    XMPSchemaMediaManagement mmSchema = null;
    try {
        if (document.getDocumentCatalog().getMetadata() != null) {
            xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
        }
    } catch (IOException e) {
    }
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    try {
        // TODO Remove these in Tika 2.0
        addMetadata(metadata, "created", info.getCreationDate());
        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    try {
        Calendar modified = info.getModificationDate();
        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    } catch (IOException e) {
        // Invalid date format, just ignore
    }
    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
            "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getDictionary().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    //    there is currently a fair amount of redundancy
    //    TikaCoreProperties.FORMAT can be multivalued
    //    There are also three potential pdf specific version keys:
    //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(),
            MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));

    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
                    String version = "A-" + pdfaxmp.getPart()
                            + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set("pdfa:PDFVersion", version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(),
                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }
    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
    COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensions != null) {
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName);
                if (adobeExt != null) {
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set("pdf:PDFExtensionVersion",
                                baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(),
                                MEDIA_TYPE.toString() + "; version=\"" + baseVersion
                                        + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
From source file: org.argrr.extractor.gdrive.downloader.ChartsDownloader.java
License: Open Source License

public static void extractPictures(String path, String fileName) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(path + "/" + fileName + ".pdf");
    } catch (IOException ex) {
        System.out.println("" + ex);
        throw ex; // rethrow: there is no document to process
    }
    List pages = document.getDocumentCatalog().getAllPages();
    Iterator iter = pages.iterator();
    int i = 1;
    while (iter.hasNext()) {
        PDPage page = (PDPage) iter.next();
        PDResources resources = page.getResources();
        Map pageImages = resources.getImages();
        if (pageImages != null) {
            Iterator imageIter = pageImages.keySet().iterator();
            while (imageIter.hasNext()) {
                String key = (String) imageIter.next();
                PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                image.write2file(ChartsDownloader.rootOutputPathCharts + "/" + fileName + "-" + i);
                i++;
            }
        }
    }
}
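Note: this snippet (and several below) targets the PDFBox 1.x API (getAllPages(), PDResources.getImages(), PDXObjectImage), which was removed in PDFBox 2.x. A minimal sketch of the same image extraction against the PDFBox 2.x API, with placeholder file names, might look like this:

import java.io.File;
import java.io.IOException;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

public class ExtractImagesPdfBox2 {
    public static void main(String[] args) throws IOException {
        int i = 1;
        // "input.pdf" is a placeholder path
        try (PDDocument document = PDDocument.load(new File("input.pdf"))) {
            for (PDPage page : document.getPages()) {
                PDResources resources = page.getResources();
                for (COSName name : resources.getXObjectNames()) {
                    PDXObject xobject = resources.getXObject(name);
                    if (xobject instanceof PDImageXObject) {
                        // Decode the image and write it out as PNG
                        PDImageXObject image = (PDImageXObject) xobject;
                        ImageIO.write(image.getImage(), "png", new File("image-" + i + ".png"));
                        i++;
                    }
                }
            }
        }
    }
}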
From source file: org.dspace.disseminate.CitationDocument.java
License: BSD License

private void addCoverPageToDocument(PDDocument document, PDDocument sourceDocument, PDPage coverPage) {
    List<PDPage> sourcePageList = sourceDocument.getDocumentCatalog().getAllPages();

    if (isCitationFirstPage()) {
        //citation as cover page
        document.addPage(coverPage);
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
    } else {
        //citation as tail page
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
        document.addPage(coverPage);
    }
    sourcePageList.clear();
}
From source file: org.dspace.disseminate.CitationDocumentServiceImpl.java
License: BSD License

protected void addCoverPageToDocument(PDDocument document, PDDocument sourceDocument, PDPage coverPage) {
    PDPageTree sourcePageList = sourceDocument.getDocumentCatalog().getPages();

    if (isCitationFirstPage()) {
        //citation as cover page
        document.addPage(coverPage);
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
    } else {
        //citation as tail page
        for (PDPage sourcePage : sourcePageList) {
            document.addPage(sourcePage);
        }
        document.addPage(coverPage);
    }
}
From source file: org.exoplatform.services.document.impl.PDFDocumentReader.java
License: Open Source License

public Properties getProperties(final InputStream is) throws IOException, DocumentReadException {
    try {
        return SecurityHelper.doPrivilegedExceptionAction(new PrivilegedExceptionAction<Properties>() {
            public Properties run() throws Exception {
                if (is == null) {
                    throw new IllegalArgumentException("InputStream is null.");
                }
                PDDocument pdDocument = PDDocument.load(is);
                Properties props = new Properties();
                try {
                    if (pdDocument.isEncrypted()) {
                        try {
                            pdDocument.decrypt("");
                        } catch (InvalidPasswordException e) {
                            throw new DocumentReadException("The pdf document is encrypted.", e);
                        } catch (org.apache.pdfbox.exceptions.CryptographyException e) {
                            throw new DocumentReadException(e.getMessage(), e);
                        }
                    }

                    PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
                    PDMetadata meta = catalog.getMetadata();
                    if (meta != null) {
                        XMPMetadata metadata = meta.exportXMPMetadata();

                        XMPSchemaDublinCore dc = metadata.getDublinCoreSchema();
                        if (dc != null) {
                            try {
                                if (dc.getTitle() != null)
                                    props.put(DCMetaData.TITLE, fixEncoding(dc.getTitle()));
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDescription() != null)
                                    props.put(DCMetaData.DESCRIPTION, fixEncoding(dc.getDescription()));
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getCreators() != null) {
                                    for (String creator : dc.getCreators()) {
                                        props.put(DCMetaData.CREATOR, fixEncoding(creator));
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (dc.getDates() != null) {
                                    for (Calendar date : dc.getDates()) {
                                        props.put(DCMetaData.DATE, date);
                                    }
                                }
                            } catch (Exception e) {
                                LOG.warn("getDate failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaPDF pdf = metadata.getPDFSchema();
                        if (pdf != null) {
                            try {
                                if (pdf.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, fixEncoding(pdf.getKeywords()));
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (pdf.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, fixEncoding(pdf.getProducer()));
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                        }

                        XMPSchemaBasic basic = metadata.getBasicSchema();
                        if (basic != null) {
                            try {
                                if (basic.getCreateDate() != null)
                                    props.put(DCMetaData.DATE, basic.getCreateDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (basic.getModifyDate() != null)
                                    props.put(DCMetaData.DATE, basic.getModifyDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            // DCMetaData.PUBLISHER - basic.getCreatorTool()
                        }
                    }

                    if (props.isEmpty()) {
                        // The pdf doesn't contain any metadata, try to use the document
                        // information instead
                        PDDocumentInformation docInfo = pdDocument.getDocumentInformation();
                        if (docInfo != null) {
                            try {
                                if (docInfo.getAuthor() != null)
                                    props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                            } catch (Exception e) {
                                LOG.warn("getAuthor failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getCreationDate());
                            } catch (Exception e) {
                                LOG.warn("getCreationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getCreator() != null)
                                    props.put(DCMetaData.CREATOR, docInfo.getCreator());
                            } catch (Exception e) {
                                LOG.warn("getCreator failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getKeywords() != null)
                                    props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                            } catch (Exception e) {
                                LOG.warn("getKeywords failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getModificationDate() != null)
                                    props.put(DCMetaData.DATE, docInfo.getModificationDate());
                            } catch (Exception e) {
                                LOG.warn("getModificationDate failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getProducer() != null)
                                    props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                            } catch (Exception e) {
                                LOG.warn("getProducer failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getSubject() != null)
                                    props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                            } catch (Exception e) {
                                LOG.warn("getSubject failed: " + e.getMessage());
                            }
                            try {
                                if (docInfo.getTitle() != null)
                                    props.put(DCMetaData.TITLE, docInfo.getTitle());
                            } catch (Exception e) {
                                LOG.warn("getTitle failed: " + e.getMessage());
                            }
                            // docInfo.getTrapped();
                        }
                    }
                } finally {
                    if (pdDocument != null) {
                        pdDocument.close();
                    }
                    if (is != null) {
                        try {
                            is.close();
                        } catch (IOException e) {
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("An exception occurred: " + e.getMessage());
                            }
                        }
                    }
                }
                return props;
            }
        });
    } catch (PrivilegedActionException pae) {
        Throwable cause = pae.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        } else if (cause instanceof RuntimeException) {
            throw (RuntimeException) cause;
        } else {
            throw new RuntimeException(cause);
        }
    }
}
From source file: org.freeeed.ocr.PDFImageExtractor.java
License: Apache License

@SuppressWarnings("rawtypes")
@Override
public List<String> extractImages() {
    File extractionDir = new File(conf.getPdfImageExtractionDir());
    extractionDir.mkdirs();
    List<String> result = new ArrayList<String>();
    PDDocument document = null;
    try {
        document = PDDocument.load(file);
        List pages = document.getDocumentCatalog().getAllPages();
        Iterator iter = pages.iterator();
        int i = 1;
        int maxNumberOfImages = Project.getCurrentProject().getOcrMaxImagesPerPDF();
        while (iter.hasNext()) {
            PDPage page = (PDPage) iter.next();
            PDResources resources = page.getResources();
            Map pageImages = resources.getImages();
            if (pageImages != null) {
                Iterator imageIter = pageImages.keySet().iterator();
                while (imageIter.hasNext()) {
                    if (i > maxNumberOfImages) {
                        return result;
                    }
                    String key = (String) imageIter.next();
                    PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                    String fileName = conf.getPdfImageExtractionDir() + OCRUtil.createUniqueFileName("image");
                    image.write2file(fileName);
                    result.add(fileName + "." + image.getSuffix());
                    i++;
                }
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // close the document even on the early return above
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                // ignore close failure
            }
        }
    }
    return result;
}
From source file: org.ghost4j.document.PDFDocument.java
License: LGPL

public Document extract(int begin, int end) throws DocumentException {
    this.assertValidPageRange(begin, end);
    PDFDocument result = new PDFDocument();
    ByteArrayInputStream bais = null;
    ByteArrayOutputStream baos = null;

    if (content != null) {
        PDDocument document = new PDDocument();
        try {
            bais = new ByteArrayInputStream(content);
            baos = new ByteArrayOutputStream();
            PDDocument inputPDF = PDDocument.load(bais);
            while (begin <= end) {
                document.addPage((PDPage) inputPDF.getDocumentCatalog().getAllPages().get(begin - 1));
                begin++;
            }
            document.save(baos);
            document.close();
            result.load(new ByteArrayInputStream(baos.toByteArray()));
        } catch (Exception e) {
            throw new DocumentException(e);
        } finally {
            IOUtils.closeQuietly(bais);
            IOUtils.closeQuietly(baos);
        }
    }
    return result;
}
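For page-range extraction like the above, PDFBox also ships a PageExtractor helper (org.apache.pdfbox.util.PageExtractor in 1.8, org.apache.pdfbox.multipdf.PageExtractor in 2.x). A minimal sketch using the 2.x package, with placeholder file names and page range:

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.multipdf.PageExtractor;
import org.apache.pdfbox.pdmodel.PDDocument;

public class ExtractPageRange {
    public static void main(String[] args) throws IOException {
        // "input.pdf", "extracted.pdf", and the range 2..5 are placeholders
        try (PDDocument source = PDDocument.load(new File("input.pdf"))) {
            // start and end pages are 1-based and inclusive
            PDDocument extracted = new PageExtractor(source, 2, 5).extract();
            try {
                extracted.save("extracted.pdf");
            } finally {
                extracted.close();
            }
        }
    }
}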
From source file: org.ghost4j.document.PDFDocument.java
License: LGPL

@Override
public void append(Document document) throws DocumentException {
    super.append(document);
    ByteArrayOutputStream baos = null;
    PDDocument mergedDocument = new PDDocument();
    try {
        baos = new ByteArrayOutputStream();

        // copy current document
        ByteArrayInputStream bais = new ByteArrayInputStream(content);
        PDDocument pDocument = PDDocument.load(bais);
        int pageCount = pDocument.getNumberOfPages();
        for (int i = 0; i < pageCount; i++) {
            mergedDocument.addPage((PDPage) pDocument.getDocumentCatalog().getAllPages().get(i));
        }

        // copy new document
        ByteArrayInputStream baisNewDoc = new ByteArrayInputStream(document.getContent());
        PDDocument pNewDocument = PDDocument.load(baisNewDoc);
        pageCount = pNewDocument.getNumberOfPages();
        for (int i = 0; i < pageCount; i++) {
            mergedDocument.addPage((PDPage) pNewDocument.getDocumentCatalog().getAllPages().get(i));
        }

        mergedDocument.save(baos);
        mergedDocument.close();

        // replace content with new content
        content = baos.toByteArray();
    } catch (Exception e) {
        throw new DocumentException(e);
    } finally {
        IOUtils.closeQuietly(baos);
    }
}
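Appending one document to another, as above, can also be done with PDFBox's PDFMergerUtility, which imports pages into the destination instead of copying PDPage objects page by page. A minimal sketch (2.x package, placeholder file names):

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;

public class AppendWithMerger {
    public static void main(String[] args) throws IOException {
        // "first.pdf", "second.pdf", and "merged.pdf" are placeholders
        try (PDDocument target = PDDocument.load(new File("first.pdf"));
                PDDocument addition = PDDocument.load(new File("second.pdf"))) {
            // Appends the pages of "addition" onto the end of "target"
            new PDFMergerUtility().appendDocument(target, addition);
            target.save("merged.pdf");
        }
    }
}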