Example usage for org.apache.poi.poifs.filesystem DocumentInputStream readFully

List of usage examples for org.apache.poi.poifs.filesystem DocumentInputStream readFully

Introduction

This page collects usage examples of the readFully method of org.apache.poi.poifs.filesystem.DocumentInputStream.

Prototype

@Override
    public void readFully(byte[] buf) 

Source Link

Usage

From source file:mj.ocraptor.extraction.tika.parser.microsoft.AbstractPOIFSExtractor.java

License:Apache License

/**
 * Handle an office document that's embedded at the POIFS level.
 *
 * <p>If {@code dir} contains a "Package" entry, that entry is treated as an
 * embedded OOXML document: its type is detected with a
 * {@link ZipContainerDetector} and the stream is passed to
 * {@code handleEmbeddedResource}. Otherwise the directory is treated as an
 * embedded OLE2 document: its {@link POIFSDocumentType} is detected, metadata
 * is filled in accordingly and, if the extractor agrees, the embedded
 * document is parsed.
 *
 * @param dir   the POIFS directory that holds the embedded document
 * @param xhtml the content handler passed on to the embedded-document calls
 * @throws IOException   declared; may be propagated from the stream and
 *                       directory calls made here
 * @throws SAXException  declared; may be propagated from the calls that
 *                       receive {@code xhtml}
 * @throws TikaException if the contents of a COMP_OBJ entry cannot be read
 *                       or typed (the cause is wrapped); may also be
 *                       propagated from the embedded-document calls
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Is it an embedded OLE2 document, or an embedded OOXML document?

    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");

        TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
        try {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = detector.detect(stream, new Metadata());
            handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
            return;
        } finally {
            stream.close();
        }
    }

    // It's regular OLE2:

    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;

    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                // Only record a resource name when a label exists; otherwise
                // string concatenation would produce a name ending in "/null".
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                }

                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it
            } catch (Exception e) {
                // Best effort: log and fall through so the directory itself
                // can still be offered to the extractor below.
                LOGGER.warn(
                        "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document "
                                + dir.getName(),
                        e);
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case spelling of the entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }

                // Read the whole entry into memory, then release the POIFS
                // stream regardless of whether the read succeeded.
                byte[] contents;
                DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                try {
                    contents = new byte[contentsEntry.getSize()];
                    inp.readFully(contents);
                } finally {
                    inp.close();
                }
                embedded = TikaInputStream.get(contents);

                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }

                // Record what we can do about it
                // NOTE(review): getType() on the detected MediaType is kept as in
                // the original; confirm whether mediaType.toString() was intended.
                metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
            } catch (Exception e) {
                throw new TikaException("Invalid embedded resource", e);
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
        }

        // Should we parse it?
        if (extractor.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            extractor.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}

From source file:org.apache.tika.parser.microsoft.AbstractPOIFSExtractor.java

License:Apache License

/**
 * Handle an office document that's embedded at the POIFS level.
 *
 * <p>If {@code dir} contains a "Package" entry, that entry is treated as an
 * embedded OOXML document: its type is detected with a
 * {@link ZipContainerDetector} and the stream is passed, together with the
 * directory's storage class id, to {@code handleEmbeddedResource}. Otherwise
 * the directory is treated as an embedded OLE2 document: its
 * {@link POIFSDocumentType} is detected, metadata is filled in accordingly
 * and, if the extractor agrees, the embedded document is parsed.
 *
 * @param dir   the POIFS directory that holds the embedded document
 * @param xhtml the content handler passed on to the embedded-document calls
 * @throws IOException   declared; may be propagated from the stream and
 *                       directory calls made here
 * @throws SAXException  declared; may be propagated from the calls that
 *                       receive {@code xhtml}
 * @throws TikaException if the contents of a COMP_OBJ entry cannot be read
 *                       or typed (the cause is wrapped); may also be
 *                       propagated from the embedded-document calls
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {

    // Is it an embedded OLE2 document, or an embedded OOXML document?

    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");

        try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = detector.detect(stream, new Metadata());
            handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml,
                    true);
            return;
        }
    }

    // It's regular OLE2:

    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    if (dir.getStorageClsid() != null) {
        metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
    }
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;

    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                // Only record a resource name when the record carries a label
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                }
                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it
            } catch (Exception e) {
                // Best effort: log and fall through so the directory itself
                // can still be offered to the extractor below.
                logger.warn(
                        "Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document "
                                + dir.getName(),
                        e);
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case spelling of the entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }

                // Read the whole entry into memory; try-with-resources releases
                // the POIFS stream whether or not the read succeeds.
                byte[] contents;
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    contents = new byte[contentsEntry.getSize()];
                    inp.readFully(contents);
                }
                embedded = TikaInputStream.get(contents);

                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }

                // Record what we can do about it
                // NOTE(review): getType() on the detected MediaType is kept as in
                // the original; confirm whether mediaType.toString() was intended.
                metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
            } catch (Exception e) {
                throw new TikaException("Invalid embedded resource", e);
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
        }

        // Should we parse it?
        if (extractor.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            extractor.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}