Example usage of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile.getSize()

List of usage examples for org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile.getSize()

Introduction

On this page you can find example usages of the getSize() method of org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile.

Prototype

public int getSize() 

Source Link

Document

Get the size of the embedded file.

Usage

From source file:mj.ocraptor.extraction.tika.parser.pdf.PDF2XHTML.java

License:Apache License

/**
 * Offers every embedded file listed in the given name map to the embedded-document extractor.
 *
 * <p>For each entry a fresh {@link Metadata} is filled with the entry key as the resource name,
 * the embedded file's subtype as the content type and its size as the content length. If the
 * extractor accepts that metadata, the file's input stream is parsed and then closed.
 *
 * <p>Entries whose value is {@code null}, or whose file specification carries no embedded file,
 * are skipped instead of failing with a {@link NullPointerException}.
 *
 * @param embeddedFileNames map from embedded-file name to its file specification; when
 *                          {@code null} the method returns without doing anything
 * @param embeddedExtractor extractor that decides whether an embedded file is parsed and
 *                          performs the parsing
 * @throws IOException   propagated from opening, parsing or closing an embedded stream
 * @throws SAXException  propagated from the calls made while processing an embedded file
 * @throws TikaException propagated from the calls made while processing an embedded file
 */
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames,
        EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException {
    if (embeddedFileNames == null) {
        return;
    }
    for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) {
        PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
        if (spec == null) {
            // No file specification behind this name: nothing to extract.
            continue;
        }
        PDEmbeddedFile file = spec.getEmbeddedFile();
        if (file == null) {
            // The specification has no embedded file attached; skip silently.
            continue;
        }

        Metadata metadata = new Metadata();
        // TODO: other metadata?
        metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
            TikaInputStream stream = TikaInputStream.get(file.createInputStream());
            try {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            } finally {
                // Always release the stream, even when parsing fails.
                stream.close();
            }
        }
    }
}

From source file:org.apache.tika.parser.pdf.AbstractPDF2XHTML.java

License:Apache License

/**
 * Passes a single embedded PDF file to the embedded-document extractor and, after parsing,
 * emits an empty {@code div} element marking the attachment in the XHTML output.
 *
 * <p>The resource name is taken from {@code fileName}; if that is {@code null} or blank,
 * {@code unicodeFileName} is used, and if the result is still {@code null} or blank,
 * {@code displayName} is used.
 *
 * @param displayName     last-resort name for the embedded resource
 * @param unicodeFileName second-choice name for the embedded resource
 * @param fileName        preferred name for the embedded resource
 * @param file            the embedded file; when {@code null} the method returns immediately
 * @param attributes      attribute set to which {@code class} and {@code id} attributes are
 *                        appended before it is used for the emitted {@code div}
 * @throws SAXException  propagated from parsing or from writing the {@code div}
 * @throws IOException   propagated from parsing the embedded stream
 * @throws TikaException propagated from parsing the embedded stream
 */
private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName,
        PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException {

    if (file == null) {
        // nothing to extract; skip silently
        return;
    }

    // Resolve the resource name, falling back whenever the current candidate is null or blank.
    String resourceName = fileName;
    if (resourceName == null || "".equals(resourceName.trim())) {
        resourceName = unicodeFileName;
    }
    if (resourceName == null || "".equals(resourceName.trim())) {
        resourceName = displayName;
    }

    // TODO: other metadata?
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, resourceName);

    boolean wanted = embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata);
    if (!wanted) {
        return;
    }

    TikaInputStream stream;
    try {
        stream = TikaInputStream.get(file.createInputStream());
    } catch (IOException e) {
        // Could not open the embedded stream: record the failure in the parent's metadata
        // and give up on this attachment rather than propagating the exception.
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return;
    }

    try {
        EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(xhtml);
        embeddedDocumentExtractor.parseEmbedded(stream, embeddedHandler, embeddedMetadata, false);

        // Marker element for the attachment, written only after parsing completed normally.
        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
        attributes.addAttribute("", "id", "id", "CDATA", resourceName);
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
    } finally {
        IOUtils.closeQuietly(stream);
    }
}

From source file:org.apache.tika.parser.pdf.EnhancedPDF2XHTML.java

License:Apache License

/**
 * Passes a single embedded PDF file to the given extractor and, after parsing, emits an empty
 * {@code div} element with {@code class="embedded"} and the resource name as its {@code id}.
 *
 * @param defaultName name used when {@code fileName} is {@code null}
 * @param fileName    preferred name for the embedded resource; may be {@code null}
 * @param file        the embedded file; when {@code null} the method returns immediately
 * @param extractor   extractor that decides whether the file is parsed and performs the parsing
 * @throws SAXException  propagated from parsing or from writing the {@code div}
 * @throws IOException   propagated from opening or parsing the embedded stream
 * @throws TikaException propagated from parsing the embedded stream
 */
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
        EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException {

    if (file == null) {
        // nothing to extract; skip silently
        return;
    }

    // Only a null name triggers the fallback; a blank name is used as given.
    String resourceName = fileName;
    if (resourceName == null) {
        resourceName = defaultName;
    }

    // TODO: other metadata?
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());

    if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }

    // Starts as null so the finally block is safe even if opening the stream fails.
    TikaInputStream stream = null;
    try {
        stream = TikaInputStream.get(file.createInputStream());
        EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(handler);
        extractor.parseEmbedded(stream, embeddedHandler, embeddedMetadata, false);

        // Marker element for the attachment, written only after parsing completed normally.
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", resourceName);
        handler.startElement("div", divAttributes);
        handler.endElement("div");
    } finally {
        IOUtils.closeQuietly(stream);
    }
}