Example usage for org.apache.poi.poifs.filesystem DirectoryNode hasEntry

List of usage examples for org.apache.poi.poifs.filesystem.DirectoryNode.hasEntry

Introduction

On this page you can find example usages of the org.apache.poi.poifs.filesystem.DirectoryNode.hasEntry method.

Prototype

public boolean hasEntry(String name) 

Source Link

Usage

From source file:com.ezdi.rtf.testRTFParser.RTFObjDataParser.java

License:Apache License

/**
 * Extracts the raw bytes of an object embedded as an OLE2 (POIFS) container.
 * <p>
 * The container is inspected in this order:
 * <ol>
 * <li>a root entry named {@code "Package"}: its content is copied out as-is;</li>
 * <li>an {@code OLE10_NATIVE} container: the wrapped data buffer is returned, or
 * {@code null} if the record is not a valid OLE10Native record;</li>
 * <li>a {@code COMP_OBJ} container: the {@code "CONTENTS"} entry (or, if absent,
 * the {@code "Contents"} entry) is read fully;</li>
 * <li>anything else: {@code is} is reset and copied in full, and a generated
 * resource name plus the detected content type are stored in {@code metadata}.</li>
 * </ol>
 *
 * @param is stream holding the OLE2 container; {@code reset()} is called on it in the
 *        fallback case (NOTE(review): this assumes the caller marked the stream - confirm)
 * @param metadata metadata that receives the resource name and content type in the
 *        fallback case only
 * @param unknownFilenameCount counter used (and incremented) to build the generated
 *        {@code file_N.ext} resource name
 * @return the embedded bytes, or {@code null} if the file system has no root or the
 *         OLE10Native record could not be unwrapped
 * @throws IOException if the container cannot be read, including a
 *         {@link FileNotFoundException} when a {@code COMP_OBJ} container has neither
 *         a {@code "CONTENTS"} nor a {@code "Contents"} entry
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
        throws IOException {

    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

        DirectoryNode root = fs.getRoot();

        if (root == null) {
            return ret;
        }

        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();

            // Close the wrapping stream as soon as its content has been copied
            try (TikaInputStream stream = TikaInputStream
                    .get(new DocumentInputStream((DocumentEntry) ooxml))) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            // try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {

                // Pick the entry name up front instead of catching FileNotFoundException;
                // getEntry still throws if "Contents" is missing too.
                String contentsName = root.hasEntry("CONTENTS") ? "CONTENTS" : "Contents";
                DocumentEntry contentsEntry = (DocumentEntry) root.getEntry(contentsName);

                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {

                // Unknown container type: hand back the whole original stream
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY,
                        "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}

From source file:com.jkoolcloud.tnt4j.streams.inputs.ExcelSXSSFRowStream.java

License:Apache License

/**
 * Reads an HSSF (XLS) format Excel file by pushing its records through the
 * Apache POI {@code HSSFEventFactory} to an {@code XLSEventListener} bound to this stream.
 * <p>
 * If the file has an {@code "EncryptedPackage"} root entry, a decrypted stream is
 * obtained using the workbook password. Otherwise the {@code "Workbook"} document is
 * opened directly, after registering the workbook password (when one is set) with
 * {@code Biff8EncryptionKey}. A password registered here is cleared again before
 * the method returns.
 *
 * @param xlsFile
 *            excel HSSF format file to read
 *
 * @throws IOException
 *             if excel file or workbook can't be read
 */
protected void readXLS(File xlsFile) throws IOException {
    NPOIFSFileSystem poiFs = null;
    InputStream workbookStream = null;
    boolean biffPasswordApplied = false;

    try {
        poiFs = new NPOIFSFileSystem(xlsFile, true);
        boolean encryptedPackage = poiFs.getRoot().hasEntry("EncryptedPackage"); // NON-NLS

        if (encryptedPackage) {
            workbookStream = DocumentFactoryHelper.getDecryptedStream(poiFs, wbPass);
        } else {
            if (wbPass != null) {
                Biff8EncryptionKey.setCurrentUserPassword(wbPass);
                biffPasswordApplied = true;
            }
            workbookStream = poiFs.createDocumentInputStream("Workbook"); // NON-NLS
        }

        HSSFRequest request = new HSSFRequest();

        // The format tracker wraps the row listener so cell formats are known to it
        XLSEventListener rowListener = new XLSEventListener(this);
        FormatTrackingHSSFListener formatTracker = new FormatTrackingHSSFListener(rowListener,
                Locale.getDefault());
        rowListener.setFormatListener(formatTracker);
        request.addListenerForAllRecords(formatTracker);

        new HSSFEventFactory().processEvents(request, workbookStream);
    } finally {
        // Only clear the password if this call was the one that set it
        if (biffPasswordApplied) {
            Biff8EncryptionKey.setCurrentUserPassword((String) null);
        }

        Utils.close(poiFs);
        Utils.close(workbookStream);
    }
}

From source file:com.kplot.web.data.WorkbookFactory.java

License:Apache License

/**
 * Creates a Workbook from the given NPOIFSFileSystem, which may
 *  be password protected.
 * <p>
 * If the root directory contains the encrypted-package entry, the content is treated
 *  as an encrypted OOXML document: the supplied password is tried first, then the
 *  default password, and the decrypted stream is opened as an {@link OPCPackage}.
 *  Otherwise the content is loaded as a regular HSSF (XLS) workbook, with the
 *  password (if any) registered for the duration of the load and cleared afterwards.
 *
 *  @param fs The {@link NPOIFSFileSystem} to read the document from
 *  @param password The password that should be used or null if no password is necessary.
 *
 *  @return The created Workbook
 *
 *  @throws IOException if an error occurs while reading the data, or if password
 *          verification fails with a {@link GeneralSecurityException}
 *  @throws InvalidFormatException if the contents of the file cannot be parsed into a {@link Workbook}
 *  @throws EncryptedDocumentException if the document is encrypted and neither the
 *          supplied nor the default password is accepted
 */
private static Workbook create(NPOIFSFileSystem fs, String password)
        throws IOException, InvalidFormatException {
    DirectoryNode root = fs.getRoot();

    // Encrypted OOXML files go inside OLE2 containers, is this one?
    if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
        EncryptionInfo info = new EncryptionInfo(fs);
        Decryptor d = Decryptor.getInstance(info);

        boolean passwordCorrect = false;
        InputStream stream = null;
        try {
            if (password != null && d.verifyPassword(password)) {
                passwordCorrect = true;
            }
            if (!passwordCorrect && d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
                passwordCorrect = true;
            }
            if (passwordCorrect) {
                stream = d.getDataStream(root);
            }
        } catch (GeneralSecurityException e) {
            throw new IOException(e);
        }

        if (!passwordCorrect) {
            if (password != null) {
                throw new EncryptedDocumentException("Password incorrect");
            } else {
                throw new EncryptedDocumentException(
                        "The supplied spreadsheet is protected, but no password was supplied");
            }
        }

        // Release the decrypted stream once the workbook has been built from it,
        // whether or not loading succeeds.
        try (InputStream packageStream = stream) {
            OPCPackage pkg = OPCPackage.open(packageStream);
            return create(pkg);
        }
    }

    // If we get here, it isn't an encrypted XLSX file
    // So, treat it as a regular HSSF XLS one
    if (password != null) {
        Biff8EncryptionKey.setCurrentUserPassword(password);
    }
    try {
        return new HSSFWorkbook(root, true);
    } finally {
        Biff8EncryptionKey.setCurrentUserPassword(null);
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.ExcelExtractor.java

License:Apache License

/**
 * Extracts content from an Excel workbook stored under the given directory node.
 * <p>
 * Nothing is done when the node has no workbook entry. Otherwise the workbook is
 * processed through a {@code TikaHSSFListener}; any exception raised during that step
 * is printed and otherwise ignored (best effort). Afterwards every child directory
 * whose name starts with {@code "MBD"} is handed to the embedded-document handler.
 *
 * @param root directory node expected to contain the workbook entry
 * @param xhtml handler receiving the extracted content
 * @param locale locale passed to the HSSF listener
 * @throws IOException declared for the embedded-document handling step
 * @throws SAXException declared for the embedded-document handling step
 * @throws TikaException declared by the signature; those thrown by the
 *         embedded-document handler are caught here
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale)
        throws IOException, SAXException, TikaException {
    boolean workbookPresent = root.hasEntry(WORKBOOK_ENTRY);
    if (!workbookPresent) {
        // Corrupt file / very old file, just skip
        return;
    }

    TikaImageHelper imageHelper = new TikaImageHelper(metadata);
    try {
        TikaHSSFListener hssfListener = new TikaHSSFListener(xhtml, locale, this, imageHelper);
        hssfListener.processFile(root, isListenForAllRecords());
        hssfListener.throwStoredException();
        imageHelper.addTextToHandler(xhtml);
    } catch (Exception e) {
        // TODO: logging
        e.printStackTrace();
    } finally {
        imageHelper.close();
    }

    for (Entry child : root) {
        boolean embeddedDirectory = child.getName().startsWith("MBD") && child instanceof DirectoryEntry;
        if (!embeddedDirectory) {
            continue;
        }
        try {
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        } catch (TikaException e) {
            // ignore parse errors from embedded documents
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java

License:Apache License

/**
 * Handles an embedded OLE object in the document.
 * <p>
 * The part is opened as a POIFS (OLE2) file system and dispatched as follows:
 * an OLE 2.0 embedded non-Office document (root has {@code CONTENTS},
 * {@code \u0001Ole}, {@code \u0001CompObj} and {@code \u0003ObjInfo} entries) has its
 * {@code CONTENTS} stream passed to the embedded extractor; an OLE 1.0 native object
 * has its data buffer passed to the embedded extractor; anything else is delegated
 * to {@code handleEmbeddedFile}. A missing {@code CONTENTS} entry or an unreadable
 * OLE 1.0 record causes the part to be skipped silently.
 *
 * @param part the package part holding the OLE object
 * @param handler content handler that receives the embedded content
 * @param rel relationship id, recorded in the embedded document's metadata
 * @throws IOException if the part cannot be read or the embedded stream cannot be closed
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code
    //        if (part.getSize() >= 0 && part.getSize() < 512*3) {
    //           // Too small, skip
    //           return;
    //        }

    // Open the POIFS (OLE2) structure and process
    // NOTE(review): fs itself is not closed here - confirm whether the POI version in
    // use makes POIFSFileSystem closeable.
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            // Only record a resource name when the OLE record actually carries a label
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }

            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    } finally {
        // Release the embedded stream on every path
        if (stream != null) {
            stream.close();
        }
    }
}

From source file:org.apache.tika.parser.microsoft.ExcelExtractor.java

License:Apache License

/**
 * Extracts content from an Excel workbook stored under the given directory node.
 * <p>
 * When the workbook entry is missing, an Excel 5 / Excel 95 "book" entry is parsed
 * with the old-format extractor if present, and nothing is extracted otherwise.
 * When the workbook entry exists, the file is processed in event mode and every child
 * directory whose name starts with {@code "MBD"} is handed to the embedded-document
 * handler.
 *
 * @param root directory node expected to contain the workbook (or old-format book) entry
 * @param xhtml handler receiving the extracted content
 * @param locale locale passed to the HSSF listener
 * @throws IOException if reading fails
 * @throws SAXException if the content handler fails
 * @throws TikaException if parsing the main workbook fails; failures in embedded
 *         documents are ignored
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale)
        throws IOException, SAXException, TikaException {
    if (!root.hasEntry(WORKBOOK_ENTRY)) {
        if (root.hasEntry(BOOK_ENTRY)) {
            // Excel 5 / Excel 95 file
            // Records are in a different structure so needs a
            //  different parser to process them
            OldExcelParser.parse(new OldExcelExtractor(root), xhtml);
        }
        // Otherwise: corrupt file / very old file, just skip text extraction
        return;
    }

    // If a password was supplied, use it, otherwise the default
    // NOTE(review): the password registered here is not cleared in this method -
    // confirm whether that is intentional.
    Biff8EncryptionKey.setCurrentUserPassword(getPassword());

    // Have the file processed in event mode
    TikaHSSFListener hssfListener = new TikaHSSFListener(xhtml, locale, this);
    hssfListener.processFile(root, isListenForAllRecords());
    hssfListener.throwStoredException();

    for (Entry child : root) {
        boolean embeddedDirectory = child.getName().startsWith("MBD") && child instanceof DirectoryEntry;
        if (!embeddedDirectory) {
            continue;
        }
        try {
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        } catch (TikaException e) {
            // ignore parse errors from embedded documents
        }
    }
}

From source file:org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java

License:Apache License

/**
 * Handles an embedded OLE object in the document.
 * <p>
 * Parts smaller than three 512-byte blocks are skipped. Otherwise the part is opened
 * as a POIFS (OLE2) file system and dispatched as follows: an OLE 2.0 embedded
 * non-Office document (root has {@code CONTENTS}, {@code \u0001Ole},
 * {@code \u0001CompObj} and {@code \u0003ObjInfo} entries) has its {@code CONTENTS}
 * stream passed to the embedded extractor; an OLE 1.0 native object has its data
 * buffer passed to the embedded extractor; anything else is delegated to
 * {@code handleEmbeddedFile}. A missing {@code CONTENTS} entry or an unreadable
 * OLE 1.0 record causes the part to be skipped silently.
 *
 * @param part the package part holding the OLE object
 * @param handler content handler that receives the embedded content
 * @param rel relationship id, recorded in the embedded document's metadata
 * @throws IOException if the part cannot be read or the embedded stream cannot be closed
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
        // Too small, skip
        return;
    }

    // Open the POIFS (OLE2) structure and process
    // NOTE(review): fs itself is not closed here - confirm whether the POI version in
    // use makes POIFSFileSystem closeable.
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }

            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    } finally {
        // Release the embedded stream on every path
        if (stream != null) {
            stream.close();
        }
    }
}

From source file:org.apache.tika.parser.rtf.RTFObjDataParser.java

License:Apache License

/**
 * Extracts the raw bytes of an object embedded as an OLE2 (POIFS) container.
 * <p>
 * The container is inspected in this order:
 * <ol>
 * <li>a root entry named {@code "Package"}: its content is copied out as-is;</li>
 * <li>an {@code OLE10_NATIVE} container: the wrapped data buffer is returned, or
 * {@code null} if the record is not a valid OLE10Native record;</li>
 * <li>a {@code COMP_OBJ} container: the {@code "CONTENTS"} entry (or, if absent,
 * the {@code "Contents"} entry) is read fully;</li>
 * <li>anything else: {@code is} is reset and copied in full, and a generated
 * resource name plus the detected content type are stored in {@code metadata}.</li>
 * </ol>
 *
 * @param is stream holding the OLE2 container; {@code reset()} is called on it in the
 *        fallback case (NOTE(review): this assumes the caller marked the stream - confirm)
 * @param metadata metadata that receives the resource name and content type in the
 *        fallback case only
 * @param unknownFilenameCount counter used (and incremented) to build the generated
 *        {@code file_N.ext} resource name
 * @return the embedded bytes, or {@code null} if the file system has no root or the
 *         OLE10Native record could not be unwrapped
 * @throws IOException if the container cannot be read, including a
 *         {@link FileNotFoundException} when a {@code COMP_OBJ} container has neither
 *         a {@code "CONTENTS"} nor a {@code "Contents"} entry
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount)
        throws IOException {

    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {

        DirectoryNode root = fs.getRoot();

        if (root == null) {
            return ret;
        }

        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();

            // Close the wrapping stream as soon as its content has been copied
            try (TikaInputStream stream = TikaInputStream
                    .get(new DocumentInputStream((DocumentEntry) ooxml))) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {

                // Pick the entry name up front instead of catching FileNotFoundException;
                // getEntry still throws if "Contents" is missing too.
                String contentsName = root.hasEntry("CONTENTS") ? "CONTENTS" : "Contents";
                DocumentEntry contentsEntry = (DocumentEntry) root.getEntry(contentsName);

                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {

                // Unknown container type: hand back the whole original stream
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY,
                        "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}

From source file:org.apache.tika.parser.wordperfect.QPWTextExtractor.java

License:Apache License

/**
 * Extracts the text of a QuattroPro document stored inside an OLE2 container.
 * <p>
 * The input is opened as a POIFS file system and the document stream named by
 * {@code OLE_DOCUMENT_NAME} is read record by record: each record's type and body
 * length are read, then the record is passed to the extractor registered for that
 * type, or to the ignoring extractor when none is registered. All output is emitted
 * inside a single {@code <p>} element.
 *
 * @param input stream containing the OLE2 container
 * @param xhtml handler receiving the extracted content
 * @param metadata metadata passed on to the record extractors
 * @throws IOException if the container or document stream cannot be read
 * @throws SAXException if the content handler fails
 * @throws TikaException if extraction fails; in particular an
 *         {@code UnsupportedFormatException} when the container has no root
 *         directory or lacks the expected document entry
 */
@SuppressWarnings("resource")
public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
        throws IOException, SAXException, TikaException {

    POIFSFileSystem pfs = new POIFSFileSystem(input);
    DirectoryNode rootNode = pfs.getRoot();
    if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
        // rootNode may be null here, so it must not be dereferenced unguarded
        // while building the message.
        String found = (rootNode == null) ? "no root directory" : String.valueOf(rootNode.getEntryNames());
        throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \""
                + OLE_DOCUMENT_NAME + "\". Found: " + found);
    }

    //TODO shall we validate and throw warning/error if the file does not 
    //start with a BOF and ends with a EOF?
    xhtml.startElement("p");
    try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
        Context ctx = new Context(in, xhtml, metadata);
        while (hasNext(in)) {
            ctx.type = in.readWPShort();
            ctx.bodyLength = in.readWPShort();
            Extractor extractor = EXTRACTORS.get(ctx.type);
            if (extractor != null) {
                extractor.extract(ctx);
            } else {
                // Use DEBUG to find out what we are ignoring
                //                    Extractor.DEBUG.extract(ctx);
                Extractor.IGNORE.extract(ctx);
            }
        }
    }
    xhtml.endElement("p");
}