Example usage for org.apache.pdfbox.pdmodel.common.filespecification PDEmbeddedFile getSubtype

Introduction

On this page you can find example usages of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile.getSubtype().

Prototype

public String getSubtype()

Source Link

Document

Get the subtype (MIME type) of the embedded file.

Usage

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java

License:Apache License

/**
 * Passes every embedded file listed in the given name map to the embedded-document extractor.
 *
 * <p>For each entry a {@link Metadata} object is built from the entry key (resource name) and
 * the embedded file's subtype and size. The extractor is asked whether it wants the resource
 * and, if so, is given the file's content stream. Entries that are not complex file
 * specifications, or that carry no embedded file, are skipped.
 *
 * @param embeddedFileNames map from embedded-file name to its file specification; may be
 *        {@code null}, in which case nothing is done
 * @param embeddedExtractor extractor that decides whether to parse, and parses, each file
 * @throws IOException if the embedded stream cannot be created or closed
 * @throws SAXException propagated from the extractor
 * @throws TikaException propagated from the extractor
 */
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames,
        EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException {
    if (embeddedFileNames == null) {
        return;
    }
    for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) {
        COSObjectable value = ent.getValue();
        if (!(value instanceof PDComplexFileSpecification)) {
            // Also covers null values: avoid a ClassCastException / NullPointerException below.
            continue;
        }
        PDComplexFileSpecification spec = (PDComplexFileSpecification) value;
        PDEmbeddedFile file = spec.getEmbeddedFile();
        if (file == null) {
            // Nothing embedded to extract for this entry; skip instead of failing with an NPE.
            continue;
        }

        Metadata metadata = new Metadata();
        // TODO: other metadata?
        metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
            TikaInputStream stream = TikaInputStream.get(file.createInputStream());
            try {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            } finally {
                stream.close();
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Offers a single embedded file to the embedded-document extractor and, once it has been
 * parsed, emits an empty {@code div} marker (class {@code embedded}, id = resolved file name).
 *
 * <p>The resource name is resolved in order of preference: {@code fileName}, then
 * {@code unicodeFileName}, then {@code displayName}; a candidate is passed over when it is
 * {@code null} or blank after trimming.
 *
 * @param displayName last-resort name for the resource
 * @param unicodeFileName fallback name used when {@code fileName} is null or blank
 * @param fileName preferred name for the resource
 * @param file the embedded file; when {@code null} the method returns without doing anything
 * @param attributes attribute set that receives the {@code class} and {@code id} attributes
 *        and is used for the emitted {@code div}
 * @throws SAXException propagated from the extractor or the XHTML handler
 * @throws IOException propagated from the extractor
 * @throws TikaException propagated from the extractor
 */
private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName,
        PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException {

    // No embedded file: skip silently.
    if (file == null) {
        return;
    }

    // Walk the fallback chain: fileName -> unicodeFileName -> displayName.
    String resolvedName = fileName;
    if (resolvedName == null || resolvedName.trim().isEmpty()) {
        resolvedName = unicodeFileName;
    }
    if (resolvedName == null || resolvedName.trim().isEmpty()) {
        resolvedName = displayName;
    }

    // TODO: other metadata?
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resolvedName);
    embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, resolvedName);

    boolean wanted = embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata);
    if (!wanted) {
        return;
    }

    final TikaInputStream stream;
    try {
        stream = TikaInputStream.get(file.createInputStream());
    } catch (IOException e) {
        // Record the failure on the parent's metadata rather than aborting the parse.
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return;
    }

    try {
        EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(xhtml);
        embeddedDocumentExtractor.parseEmbedded(stream, embeddedHandler, embeddedMetadata, false);

        // Empty marker element identifying the embedded resource in the output.
        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
        attributes.addAttribute("", "id", "id", "CDATA", resolvedName);
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
    } finally {
        IOUtils.closeQuietly(stream);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

/**
 * Offers a single embedded file to the given extractor and, after parsing, emits an empty
 * {@code div} marker (class {@code embedded}, id = resource name) through {@code handler}.
 *
 * @param defaultName name used when {@code fileName} is {@code null}
 * @param fileName preferred name for the resource; may be {@code null}
 * @param file the embedded file; when {@code null} the method returns without doing anything
 * @param extractor extractor that decides whether to parse, and parses, the file
 * @throws SAXException propagated from the extractor or the content handler
 * @throws IOException if the embedded stream cannot be created, or propagated from the extractor
 * @throws TikaException propagated from the extractor
 */
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
        EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException {

    // No embedded file: skip silently.
    if (file == null) {
        return;
    }

    final String resourceName = (fileName != null) ? fileName : defaultName;

    // TODO: other metadata?
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());

    if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }

    TikaInputStream stream = null;
    try {
        stream = TikaInputStream.get(file.createInputStream());
        EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(handler);
        extractor.parseEmbedded(stream, embeddedHandler, embeddedMetadata, false);

        // Empty marker element identifying the embedded resource in the output.
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", resourceName);
        handler.startElement("div", divAttributes);
        handler.endElement("div");
    } finally {
        // Close failures are deliberately ignored.
        IOUtils.closeQuietly(stream);
    }
}

From source file:org.modeshape.sequencer.pdf.PdfBasicMetadata.java

License:Apache License

/**
 * Loads the PDF from {@code in} and populates this object's metadata fields: encryption flag,
 * page count, first-page orientation, version, document information (only when the document
 * is not encrypted) and, per page, the metadata of all file-attachment annotations.
 *
 * <p>File-attachment annotations whose file specification is not a
 * {@link PDComplexFileSpecification}, or which carry no embedded file, are skipped.
 *
 * @return {@code true} once the document has been read
 * @throws Exception if loading or reading the document fails
 */
public boolean check() throws Exception {
    try (PDDocument document = PDDocument.load(in)) {
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDPageable pageable = new PDPageable(document);
        PageFormat firstPage = pageable.getPageFormat(0);

        encrypted = document.isEncrypted();
        pageCount = document.getNumberOfPages();
        orientation = ORIENTATION_STRINGS[firstPage.getOrientation()];
        version = String.valueOf(document.getDocument().getVersion());
        String catalogVersion = catalog.getVersion();
        if (catalogVersion != null && !catalogVersion.isEmpty()) {
            // According to specs version saved here should be determining instead
            // the version in header. It is barely used, though.
            version = catalogVersion;
        }

        if (!encrypted) {
            PDDocumentInformation metadata = document.getDocumentInformation();
            author = metadata.getAuthor();
            creationDate = metadata.getCreationDate();
            creator = metadata.getCreator();
            keywords = metadata.getKeywords();
            modificationDate = metadata.getModificationDate();
            producer = metadata.getProducer();
            subject = metadata.getSubject();
            title = metadata.getTitle();
        }

        // extract all attached files from all pages
        int pageNumber = 0;
        for (Object page : catalog.getAllPages()) {
            pageNumber += 1;
            PdfPageMetadata pageMetadata = new PdfPageMetadata();
            pageMetadata.setPageNumber(pageNumber);
            for (PDAnnotation annotation : ((PDPage) page).getAnnotations()) {
                if (!(annotation instanceof PDAnnotationFileAttachment)) {
                    continue;
                }
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;

                // Check the type before casting so that other (or missing) file
                // specifications are skipped instead of raising a ClassCastException/NPE.
                Object attachedSpec = fann.getFile();
                if (!(attachedSpec instanceof PDComplexFileSpecification)) {
                    continue;
                }
                PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) attachedSpec;
                PDEmbeddedFile embeddedFile = fileSpec.getEmbeddedFile();
                if (embeddedFile == null) {
                    // No embedded file to read dates, type or data from.
                    continue;
                }

                PdfAttachmentMetadata attachmentMetadata = new PdfAttachmentMetadata();
                attachmentMetadata.setSubject(fann.getSubject());
                attachmentMetadata.setName(fileSpec.getFilename());
                attachmentMetadata.setCreationDate(embeddedFile.getCreationDate());
                attachmentMetadata.setModificationDate(embeddedFile.getModDate());
                attachmentMetadata.setMimeType(embeddedFile.getSubtype());
                attachmentMetadata.setData(embeddedFile.getByteArray());

                pageMetadata.addAttachment(attachmentMetadata);
            }
            pages.add(pageMetadata);
        }
        return true;
    }
}

From source file:org.paxle.parser.pdf.impl.PdfParser.java

License:Open Source License

/**
 * Extracts the content of the files embedded in a PDF document and attaches each parsed
 * result to the given parser document as a sub-document.
 *
 * <p>The method returns without doing anything when the document has no catalog, no name
 * dictionary, no embedded-files name tree, or no names. An individual entry is skipped (with
 * a warning) when it has no embedded file, no mime-type, or no sub-parser registered for its
 * mime-type. A {@code ParserException} raised while parsing an entry is logged and does not
 * stop processing of the remaining entries.
 *
 * @param location location of the PDF document; used in log messages and passed to the
 *        sub-parser
 * @param parserDoc parser document that receives the parsed embedded files as sub-documents
 * @param pddDoc the PDF document to inspect
 * @throws IOException if reading an embedded file fails
 */
protected void extractEmbeddedFiles(URI location, IParserDocument parserDoc, PDDocument pddDoc)
        throws IOException {
    final PDDocumentCatalog pddDocCatalog = pddDoc.getDocumentCatalog();
    if (pddDocCatalog == null) {
        return;
    }

    final PDDocumentNameDictionary nameDic = pddDocCatalog.getNames();
    if (nameDic == null) {
        return;
    }

    final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDic.getEmbeddedFiles();
    if (embeddedFiles == null) {
        return;
    }

    @SuppressWarnings("unchecked")
    final Map<String, Object> names = embeddedFiles.getNames();
    if (names == null || names.isEmpty()) {
        return;
    }

    final IParserContext context = this.contextLocal.getCurrentContext();

    for (Entry<String, Object> name : names.entrySet()) {
        // final String fileDesc = name.getKey();
        final Object fileObj = name.getValue();
        if (fileObj == null) {
            continue;
        }

        if (fileObj instanceof PDComplexFileSpecification) {
            final PDComplexFileSpecification embeddedFileSpec = (PDComplexFileSpecification) fileObj;
            final PDEmbeddedFile embeddedFile = embeddedFileSpec.getEmbeddedFile();

            // getting the embedded file name
            final String fileName = embeddedFileSpec.getFile();

            // Without an embedded file there is nothing to parse; skip the entry rather
            // than fail with a NullPointerException.
            if (embeddedFile == null) {
                this.logger.warn(String.format("No embedded file found for '%s#%s'.", location, fileName));
                continue;
            }

            // getting the mime-type
            final String fileMimeType = embeddedFile.getSubtype();
            if (fileMimeType == null) {
                this.logger.warn(String.format("No mime-type specified form embedded file '%s#%s'.", location,
                        fileName));
                continue;
            }

            // getting a parser to parse the content
            final ISubParser sp = context.getParser(fileMimeType);
            if (sp == null) {
                this.logger.warn(String.format("No parser found to parse embedded file '%s#%s' with type '%s'.",
                        location, fileName, fileMimeType));
                continue;
            }

            // parsing content
            InputStream embeddedFileStream = null;
            try {
                embeddedFileStream = embeddedFile.createInputStream();
                final IParserDocument subParserDoc = sp.parse(location, "UTF-8", embeddedFileStream);
                if (subParserDoc.getMimeType() == null) {
                    // fall back to the mime-type declared in the PDF
                    subParserDoc.setMimeType(fileMimeType);
                }

                parserDoc.addSubDocument(fileName, subParserDoc);
            } catch (ParserException e) {
                this.logger.error(String.format(
                        "Unexpected error while parsing parse embedded file '%s#%s' with type '%s': %s",
                        location, fileName, fileMimeType, e.getMessage()));
            } finally {
                if (embeddedFileStream != null) {
                    try {
                        embeddedFileStream.close();
                    } catch (Exception e) {
                        // a failed close must not abort processing of the remaining entries
                        this.logger.error(e);
                    }
                }
            }
        }
    }
}