Example usage for org.apache.poi.poifs.filesystem Ole10Native createFromEmbeddedOleObject

List of usage examples for org.apache.poi.poifs.filesystem Ole10Native createFromEmbeddedOleObject

Introduction

This page collects usage examples for the method org.apache.poi.poifs.filesystem.Ole10Native.createFromEmbeddedOleObject.

Prototype

public static Ole10Native createFromEmbeddedOleObject(DirectoryNode directory)
        throws IOException, Ole10NativeException 

Source Link

Document

Creates an instance of this class from an embedded OLE Object.

Usage

From source file:com.ezdi.rtf.testRTFParser.RTFObjDataParser.java

License:Apache License

/**
 * Opens {@code is} as a POIFS container and extracts the bytes of the object embedded in it.
 *
 * <ul>
 *   <li>If the root has a {@code "Package"} entry, that entry's bytes are returned.</li>
 *   <li>If the root is detected as {@code OLE10_NATIVE}, the record's data buffer is returned;
 *       an invalid record leaves the result {@code null}.</li>
 *   <li>If the root is detected as {@code COMP_OBJ}, the {@code "CONTENTS"} entry (falling back
 *       to {@code "Contents"}) is read in full and returned.</li>
 *   <li>Otherwise {@code is} is reset and copied out in full, and {@code metadata} receives a
 *       generated resource name plus the detected content type.</li>
 * </ul>
 *
 * @param is stream holding the POIFS container; the fallback branch calls {@code reset()} on it,
 *           so it presumably has to support mark/reset (TODO confirm against callers)
 * @param metadata metadata updated only in the fallback branch
 * @param unknownFilenameCount counter used (and incremented) to build the fallback resource name
 * @return the extracted bytes, or {@code null} if the container has no root or the
 *         OLE10Native record is not valid
 * @throws IOException if reading the container or any of its entries fails
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
        throws IOException {

    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

        DirectoryNode root = fs.getRoot();

        if (root == null) {
            return ret;
        }

        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();

            // Close the TikaInputStream (and the document stream it wraps) once the
            // bytes have been copied, instead of leaking it.
            try (TikaInputStream stream = TikaInputStream
                    .get(new DocumentInputStream((DocumentEntry) ooxml))) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            // try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ignored) {
                    // Not a valid OLE10Native record, skip it (ret stays null)
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {

                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case entry name
                    contentsEntry = (DocumentEntry) root.getEntry("Contents");
                }

                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {

                // Unknown layout: hand back the whole container unchanged
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY,
                        "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.AbstractPOIFSExtractor.java

License:Apache License

/**
 * Handle an office document that's embedded at the POIFS level.
 *
 * <p>If {@code dir} contains a {@code "Package"} entry it is treated as OOXML: its type is
 * detected with a {@link ZipContainerDetector} and the stream is passed to
 * {@code handleEmbeddedResource}. Otherwise the type reported by
 * {@link POIFSDocumentType#detectType} decides how the payload is unwrapped
 * ({@code OLE10_NATIVE}, {@code COMP_OBJ}, or passed through as an open container), and the
 * result is handed to the embedded-document extractor if it agrees to parse it.
 *
 * @param dir directory entry of the embedded document
 * @param xhtml handler that receives the output of the embedded parse
 * @throws IOException if reading an entry fails
 * @throws SAXException propagated from the handlers
 * @throws TikaException if the {@code COMP_OBJ} contents cannot be read or typed
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Is it an embedded OLE2 document, or an embedded OOXML document?

    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");

        TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
        try {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = detector.detect(stream, new Metadata());
            handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
            return;
        } finally {
            stream.close();
        }
    }

    // It's regular OLE2:

    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;

    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                // Only record a name when there is a label; otherwise the
                // concatenation would produce a bogus "<dir>/null" resource name.
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                }

                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it
            } catch (Exception e) {
                LOGGER.warn(
                        "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document "
                                + dir.getName(),
                        e);
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }
                DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                byte[] contents = new byte[contentsEntry.getSize()];
                try {
                    inp.readFully(contents);
                } finally {
                    // The bytes are fully buffered (or the read failed); release the stream
                    inp.close();
                }
                embedded = TikaInputStream.get(contents);

                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }

                // Record what we can do about it
                metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
            } catch (Exception e) {
                throw new TikaException("Invalid embedded resource", e);
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
        }

        // Should we parse it?
        if (extractor.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            extractor.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java

License:Apache License

/**
 * Handles an embedded OLE object in the document.
 *
 * <p>The part is opened as a POIFS file system. A root that carries the {@code CONTENTS},
 * {@code \u0001Ole}, {@code \u0001CompObj} and {@code \u0003ObjInfo} entries has its
 * {@code CONTENTS} stream parsed; a root detected as {@code OLE10_NATIVE} has its data buffer
 * parsed; anything else is delegated to {@code handleEmbeddedFile}. A missing entry or an
 * invalid OLE 1.0 record causes the part to be skipped silently.
 *
 * @param part package part holding the OLE2 data
 * @param handler handler that receives the embedded content (wrapped in an
 *                {@link EmbeddedContentHandler})
 * @param rel relationship id recorded as {@code EMBEDDED_RELATIONSHIP_ID}
 * @throws IOException if the part cannot be read
 * @throws SAXException propagated from the embedded parse
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code
    //        if (part.getSize() >= 0 && part.getSize() < 512*3) {
    //           // Too small, skip
    //           return;
    //        }

    // Open the POIFS (OLE2) structure and process
    // NOTE(review): fs is not closed here; whether POIFSFileSystem is Closeable
    // depends on the POI version in use - confirm before adding fs.close().
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    // Declared outside the try so the finally block can release it on every path
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            // Only record a resource name when the record actually has a label
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }

            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    } finally {
        // Release the TikaInputStream, which was previously leaked
        if (stream != null) {
            stream.close();
        }
    }
}

From source file:org.apache.tika.parser.microsoft.AbstractPOIFSExtractor.java

License:Apache License

/**
 * Handle an office document that's embedded at the POIFS level.
 *
 * <p>If {@code dir} contains a {@code "Package"} entry it is treated as OOXML: its type is
 * detected with a {@link ZipContainerDetector} and the stream is passed to
 * {@code handleEmbeddedResource} together with the storage class id. Otherwise the type
 * reported by {@link POIFSDocumentType#detectType} decides how the payload is unwrapped
 * ({@code OLE10_NATIVE}, {@code COMP_OBJ}, or passed through as an open container), and the
 * result is handed to the embedded-document extractor if it agrees to parse it.
 *
 * @param dir directory entry of the embedded document
 * @param xhtml handler that receives the output of the embedded parse
 * @throws IOException if reading an entry fails
 * @throws SAXException propagated from the handlers
 * @throws TikaException if the {@code COMP_OBJ} contents cannot be read or typed
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Is it an embedded OLE2 document, or an embedded OOXML document?

    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");

        try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = detector.detect(stream, new Metadata());
            handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml,
                    true);
            return;
        }
    }

    // It's regular OLE2:

    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    if (dir.getStorageClsid() != null) {
        metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
    }
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;

    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                }
                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it
            } catch (Exception e) {
                logger.warn(
                        "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document "
                                + dir.getName(),
                        e);
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }
                byte[] contents = new byte[contentsEntry.getSize()];
                // Buffer the entry fully, then release the document stream
                // (it was previously left open).
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    inp.readFully(contents);
                }
                embedded = TikaInputStream.get(contents);

                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }

                // Record what we can do about it
                metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
            } catch (Exception e) {
                throw new TikaException("Invalid embedded resource", e);
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
        }

        // Should we parse it?
        if (extractor.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            extractor.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java

License:Apache License

/**
 * Handles an embedded OLE object in the document.
 *
 * <p>Parts with a known size below three 512-byte blocks are skipped. Otherwise the part is
 * opened as a POIFS file system. A root that carries the {@code CONTENTS}, {@code \u0001Ole},
 * {@code \u0001CompObj} and {@code \u0003ObjInfo} entries has its {@code CONTENTS} stream
 * parsed; a root detected as {@code OLE10_NATIVE} has its data buffer parsed; anything else is
 * delegated to {@code handleEmbeddedFile}. A missing entry or an invalid OLE 1.0 record causes
 * the part to be skipped silently.
 *
 * @param part package part holding the OLE2 data
 * @param handler handler that receives the embedded content (wrapped in an
 *                {@link EmbeddedContentHandler})
 * @param rel relationship id recorded as {@code EMBEDDED_RELATIONSHIP_ID}
 * @throws IOException if the part cannot be read
 * @throws SAXException propagated from the embedded parse
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
        // Too small, skip
        return;
    }

    // Open the POIFS (OLE2) structure and process
    // NOTE(review): fs is not closed here; whether POIFSFileSystem is Closeable
    // depends on the POI version in use - confirm before adding fs.close().
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    // Declared outside the try so the finally block can release it on every path
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }

            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    } finally {
        // Release the TikaInputStream, which was previously leaked
        if (stream != null) {
            stream.close();
        }
    }
}

From source file:org.apache.tika.parser.rtf.RTFObjDataParser.java

License:Apache License

/**
 * Opens {@code is} as a POIFS container and extracts the bytes of the object embedded in it.
 *
 * <ul>
 *   <li>If the root has a {@code "Package"} entry, that entry's bytes are returned.</li>
 *   <li>If the root is detected as {@code OLE10_NATIVE}, the record's data buffer is returned;
 *       an invalid record leaves the result {@code null}.</li>
 *   <li>If the root is detected as {@code COMP_OBJ}, the {@code "CONTENTS"} entry (falling back
 *       to {@code "Contents"}) is read in full and returned.</li>
 *   <li>Otherwise {@code is} is reset and copied out in full, and {@code metadata} receives a
 *       generated resource name plus the detected content type.</li>
 * </ul>
 *
 * @param is stream holding the POIFS container; the fallback branch calls {@code reset()} on it,
 *           so it presumably has to support mark/reset (TODO confirm against callers)
 * @param metadata metadata updated only in the fallback branch
 * @param unknownFilenameCount counter used (and incremented) to build the fallback resource name
 * @return the extracted bytes, or {@code null} if the container has no root or the
 *         OLE10Native record is not valid
 * @throws IOException if reading the container or any of its entries fails
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
        throws IOException {

    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

        DirectoryNode root = fs.getRoot();

        if (root == null) {
            return ret;
        }

        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();

            // Close the TikaInputStream (and the document stream it wraps) once the
            // bytes have been copied, instead of leaking it.
            try (TikaInputStream stream = TikaInputStream
                    .get(new DocumentInputStream((DocumentEntry) ooxml))) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ignored) {
                    // Not a valid OLE10Native record, skip it (ret stays null)
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {

                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case entry name
                    contentsEntry = (DocumentEntry) root.getEntry("Contents");
                }

                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {

                // Unknown layout: hand back the whole container unchanged
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY,
                        "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}