List of usage examples for org.apache.poi.poifs.filesystem POIFSFileSystem createDocumentInputStream
public DocumentInputStream createDocumentInputStream(final String documentName) throws IOException
From source file:com.sonicle.webtop.core.io.input.ExcelFileReader.java
License:Open Source License
public HashMap<String, String> listXlsColumnNames(File file) throws IOException, FileReaderException { POIFSFileSystem pfs = null; InputStream is = null;// w w w . j a v a 2 s. co m try { pfs = new POIFSFileSystem(file); is = pfs.createDocumentInputStream("Workbook"); XlsColumnsProcessor processor = new XlsColumnsProcessor(is, headersRow, firstDataRow, lastDataRow, sheet); processor.process(); return processor.columnNames; } finally { IOUtils.closeQuietly(is); IOUtils.closeQuietly(pfs); } }
From source file:com.sonicle.webtop.core.io.input.ExcelFileReader.java
License:Open Source License
public HashMap<String, Integer> listXlsColumnIndexes(File file) throws IOException, FileReaderException { POIFSFileSystem pfs = null; InputStream is = null;//ww w . j av a 2 s . com try { pfs = new POIFSFileSystem(file); is = pfs.createDocumentInputStream("Workbook"); XlsColumnsProcessor processor = new XlsColumnsProcessor(is, headersRow, firstDataRow, lastDataRow, sheet); processor.process(); return processor.columnIndexes; } finally { IOUtils.closeQuietly(is); IOUtils.closeQuietly(pfs); } }
From source file:com.toolsverse.etl.metadata.excel.ExcelFileMetadata.java
License:Open Source License
@Override public DataSet getTablesByType(InputStream inputSteam, String name, String pattern, String type) throws Exception { DataSet dataSet = new DataSet(); dataSet.setName("tables"); FieldDef fieldDef = new FieldDef(); fieldDef.setName("File"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef);//from www .j av a2s .c o m fieldDef = new FieldDef(); fieldDef.setName("Name"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef); dataSet.setKeyFields("Name"); InputStream din = null; try { POIFSFileSystem poifs = new POIFSFileSystem(inputSteam); din = poifs.createDocumentInputStream("Workbook"); HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(new SheetReader(name, dataSet)); HSSFEventFactory factory = new HSSFEventFactory(); try { factory.processEvents(req, din); } catch (Exception ex) { if (!Utils.isParticularException(ex, SHEETS_EXTRACTED_EXCEPTION)) throw ex; } } finally { if (din != null) din.close(); } return dataSet; }
From source file:com.toolsverse.etl.metadata.excel.ExcelFileMetadata.java
License:Open Source License
@Override public DataSet getTablesByType(String catalog, String schema, String pattern, String type) throws Exception { DataSet dataSet = new DataSet(); dataSet.setName(TABLES_DATASET_TYPE); FieldDef fieldDef = new FieldDef(); fieldDef.setName("File"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef);// www . ja v a 2 s .c om fieldDef = new FieldDef(); fieldDef.setName("Name"); fieldDef.setSqlDataType(Types.VARCHAR); dataSet.addField(fieldDef); dataSet.setKeyFields("Name"); FileInputStream fin = null; InputStream din = null; try { fin = new FileInputStream(catalog); POIFSFileSystem poifs = new POIFSFileSystem(fin); din = poifs.createDocumentInputStream("Workbook"); HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(new SheetReader(catalog, dataSet)); HSSFEventFactory factory = new HSSFEventFactory(); try { factory.processEvents(req, din); } catch (Exception ex) { if (!Utils.isParticularException(ex, SHEETS_EXTRACTED_EXCEPTION)) throw ex; } } finally { if (fin != null) fin.close(); if (din != null) din.close(); } return dataSet; }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.java
License:Apache License
/**
 * Handles an embedded OLE object in the document.
 *
 * <p>Opens the part as a POIFS (OLE2) file system and dispatches on its
 * contents: a "CONTENTS" stream plus the standard OLE marker entries is
 * treated as an OLE 2.0 embedded non-Office document (TIKA-704); an
 * OLE10_NATIVE type is unpacked via {@link Ole10Native}; anything else falls
 * back to {@code handleEmbeddedFile}. Missing-entry and malformed-OLE1.0
 * conditions are deliberately treated as "skip this part", not errors.
 *
 * @param part the package part containing the embedded object
 * @param handler receives any extracted content
 * @param rel the relationship id recorded in the embedded-document metadata
 * @throws IOException if the part's stream cannot be read
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
        throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    // TODO: TIKA-1118 Upgrade to POI 4.0 then enable this block of code
    // if (part.getSize() >= 0 && part.getSize() < 512*3) {
    //   // Too small, skip
    //   return;
    // }

    // Open the POIFS (OLE2) structure and process
    // NOTE(review): fs (and the TikaInputStream below) are never closed on the
    // success path — confirm whether the embedded extractor takes ownership.
    POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
    try {
        Metadata metadata = new Metadata();
        TikaInputStream stream = null;
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);

        // the \u0001/\u0003-prefixed names are the standard OLE storage markers
        if (root.hasEntry("CONTENTS") && root.hasEntry("\u0001Ole") && root.hasEntry("\u0001CompObj")
                && root.hasEntry("\u0003ObjInfo")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            // not a recognized embedded OLE shape; treat as a plain embedded file
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    }
}
From source file:net.sf.mmm.content.parser.impl.poi.AbstractContentParserPoi.java
License:Apache License
/**
 * {@inheritDoc}
 *
 * <p>Opens the input as a POIFS (OLE2) file system, copies the title, author
 * and keywords from the document's summary-information stream into the context
 * (only when present), then delegates to {@code extractText} for the body text.
 *
 * @param inputStream stream positioned at the start of an OLE2 document
 * @param filesize forwarded to {@code extractText}
 * @param options forwarded to {@code extractText}
 * @param context receives title / creator / keywords / text variables
 * @throws Exception if the document or its summary stream cannot be parsed
 */
@Override
public void parse(InputStream inputStream, long filesize, ContentParserOptions options,
        MutableGenericContext context) throws Exception {
    POIFSFileSystem poiFs = new POIFSFileSystem(inputStream);
    SummaryInformation summaryInfo;
    // fix: the summary-information stream was never closed in the original
    try (InputStream summaryStream = poiFs
            .createDocumentInputStream(SummaryInformation.DEFAULT_STREAM_NAME)) {
        summaryInfo = (SummaryInformation) PropertySetFactory.create(summaryStream);
    }
    String title = summaryInfo.getTitle();
    if (title != null) {
        context.setVariable(VARIABLE_NAME_TITLE, title);
    }
    String author = summaryInfo.getAuthor();
    if (author != null) {
        context.setVariable(VARIABLE_NAME_CREATOR, author);
    }
    String keywords = summaryInfo.getKeywords();
    if (keywords != null) {
        context.setVariable(VARIABLE_NAME_KEYWORDS, keywords);
    }
    // poiFs is still needed here; subclasses read the document stream from it
    context.setVariable(VARIABLE_NAME_TEXT, extractText(poiFs, filesize, options));
}
From source file:net.sf.mmm.content.parser.impl.poi.ContentParserPpt.java
License:Apache License
/** * {@inheritDoc}// w ww. j a v a 2 s . c o m */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { // PowerPointExtractor pptExtractor = new PowerPointExtractor(poiFs); // return pptExtractor.getText(); DocumentInputStream docStream = poiFs.createDocumentInputStream(POIFS_POWERPOINT_DOC); int length = docStream.available(); int maximumBufferSize = options.getMaximumBufferSize(); if (maximumBufferSize < length) { length = maximumBufferSize; } int capacity = length / 10; StringBuffer textBuffer = new StringBuffer(capacity); byte[] buffer = new byte[length]; docStream.read(buffer); docStream.close(); extractRecursive(buffer, 0, length, textBuffer); return textBuffer.toString(); }
From source file:net.sf.mmm.content.parser.impl.poi.ContentParserXls.java
License:Apache License
/** * {@inheritDoc}/* www .j a v a 2s. c om*/ */ @Override protected String extractText(POIFSFileSystem poiFs, long filesize, ContentParserOptions options) throws Exception { int maxBufferSize = options.getMaximumBufferSize(); int maxCharSize = maxBufferSize / 2; InputStream documentInputStream = poiFs.createDocumentInputStream(POIFS_EXCEL_DOC); // actually there seems no smart guess for the initial capacity of // textBuffer // the text length can have any ration to documentInputStream.available() // the only possibility would be to create the string buffer in the listener // from the size of the SSTRecord. In this case stable code is better than // saving a tiny percent of performance... StringBuilder textBuffer = new StringBuilder(1024); try { HSSFRequest req = new HSSFRequest(); req.addListenerForAllRecords(new ExcelListener(textBuffer, maxCharSize)); HSSFEventFactory factory = new HSSFEventFactory(); factory.processEvents(req, documentInputStream); } finally { documentInputStream.close(); } return textBuffer.toString(); }
From source file:no.trank.openpipe.parse.ms.POIUtils.java
License:Apache License
/** * Fetches the \005SummaryInformation and \005DocumentSummaryInformation streams from the poi * file system and exctracts all properties of primitive type, String or Date. * /* w ww . j a v a 2 s . co m*/ * @param fs the poi filesystem * @return the properties */ public static Map<String, String> getProperties(POIFSFileSystem fs) { Map<String, String> map = new HashMap<String, String>(); try { InputStream stream = fs.createDocumentInputStream(SummaryInformation.DEFAULT_STREAM_NAME); addProperties(map, PropertySetFactory.create(stream)); } catch (Exception e) { // ignore } try { InputStream stream = fs.createDocumentInputStream(DocumentSummaryInformation.DEFAULT_STREAM_NAME); addProperties(map, PropertySetFactory.create(stream)); } catch (Exception e) { // ignore } return map; }
From source file:org.apache.nutch.parse.msword.WordExtractor.java
License:Apache License
/**
 * Gets the text from a Word document.
 *
 * <p>Reads the raw OLE2 "WordDocument" stream, rejects fast-saved and
 * password-protected files, dispatches Word 6.0 documents (nFib 101-104) to
 * {@link Word6Extractor}, and otherwise walks the piece table and character
 * runs to collect all text whose runs are not marked deleted.
 *
 * @param in The InputStream representing the Word file.
 * @return the extracted plain text
 * @throws Exception if the file is fast-saved, password protected, or
 *         otherwise cannot be parsed
 */
protected String extractText(InputStream in) throws Exception {
    // NOTE(review): this list is never used below and could be removed.
    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];
    din.read(header);
    din.close();

    // FIB flags word at offset 0xA: bit 0x4 = fast-saved, bit 0x100 = encrypted
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use (bit 0x200 selects 1Table).
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }
    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];
    din = fsys.createDocumentInputStream(tableName);
    din.read(tableStream);
    din.close();

    // character-run (CHP) bin table offset/size and fcMin from the FIB
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece that contains the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies entirely inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans multiple pieces: append each piece's tail until the
            // run's end is inside the current piece
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of pieces mid-run: emit what we have
                    return finalTextBuf.toString();
                }
            }
            // append the final partial piece covered by this run
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary; append and advance
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}