org.textmining.extraction.excel.ExcelTextExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.textmining.extraction.excel.ExcelTextExtractor.java

Source

/*
 * Textmining.org text extractors
 * 
 * Copyright (C) 2008 Benryan Software Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
package org.textmining.extraction.excel;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.*;

import org.textmining.extraction.TextExtractor;

/**
 * Extracts the text stored in the shared string table (SST record) of a
 * binary Excel workbook.
 *
 * <p>The constructor loads the whole "Workbook" stream of the OLE2 container
 * into memory. {@link #getText(Writer)} then walks the record stream from the
 * start, skips every record that is not the SST, and writes each shared
 * string followed by a single space.
 *
 * <p>Limitations: strings that are split across CONTINUE records are not
 * reassembled, and scanning stops at the first EOF record (type
 * {@code 0xa}). If the data ends before a structure is complete, parsing
 * stops and the text extracted so far is kept.
 */
public class ExcelTextExtractor implements TextExtractor {
    /** Record type that terminates a substream; scanning stops when it is seen. */
    private static final int EOF_RECORD = 0xa;

    /** String flag bit: set when characters are stored as 16-bit UTF-16LE. */
    private static final int FLAG_UNCOMPRESSED = 0x1;

    /** String flag bit: set when a 4-byte size and an extended data block are present. */
    private static final int FLAG_EXTENDED = 0x4;

    /** String flag bit: set when a 2-byte run count and formatting runs are present. */
    private static final int FLAG_RICH_TEXT = 0x8;

    /** Size in bytes of one rich-text formatting run. */
    private static final int FORMATTING_RUN_SIZE = 4;

    /** Raw bytes of the "Workbook" stream. */
    byte[] _recordStream;

    /** Current read position inside {@link #_recordStream}. */
    int _offset;

    /**
     * Reads the complete "Workbook" stream from the given OLE2 document.
     *
     * @param in stream positioned at the start of an OLE2 compound document
     * @throws IOException if the container cannot be read or the "Workbook"
     *         stream ends before its declared size has been read
     */
    public ExcelTextExtractor(InputStream in) throws IOException {
        POIFSFileSystem poifs = new POIFSFileSystem(in);
        DocumentEntry headerProps = (DocumentEntry) poifs.getRoot().getEntry("Workbook");
        _recordStream = new byte[headerProps.getSize()];

        DocumentInputStream din = poifs.createDocumentInputStream("Workbook");
        try {
            // A single read() is allowed to return fewer bytes than requested,
            // so keep reading until the buffer is full.
            int filled = 0;
            while (filled < _recordStream.length) {
                int count = din.read(_recordStream, filled, _recordStream.length - filled);
                if (count <= 0) {
                    throw new IOException("Workbook stream ended after " + filled + " of "
                            + _recordStream.length + " bytes");
                }
                filled += count;
            }
        } finally {
            din.close();
        }
    }

    /**
     * Returns the extracted text as a single string.
     *
     * @return every shared string, each followed by one space
     * @throws IOException if writing to the internal buffer fails
     */
    public String getText() throws IOException {
        StringWriter writer = new StringWriter();
        getText(writer);
        return writer.toString();
    }

    /**
     * Writes every shared string, each followed by one space, to
     * {@code writer}.
     *
     * <p>Scanning always starts at the beginning of the record stream, so the
     * method can be called more than once with the same result.
     *
     * @param writer destination for the extracted text
     * @throws IOException if {@code writer} fails
     */
    public void getText(Writer writer) throws IOException {
        // Restart from the beginning; otherwise a second call would find the
        // offset already at the end and produce nothing.
        _offset = 0;

        // Every record starts with a 2-byte type followed by a 2-byte size.
        final int headerSize = 2 * LittleEndian.SHORT_SIZE;
        while (hasBytes(headerSize)) {
            int type = readUShort();
            if (type == EOF_RECORD) {
                break;
            }
            int size = readUShort();
            if (type == Record.SST_RECORD) {
                writeSharedStrings(writer);
            } else {
                _offset += size;
            }
        }
    }

    /**
     * Parses the body of an SST record located at the current offset and
     * writes its strings. If the data is truncated, the offset is moved to
     * the end of the stream so that the caller stops scanning.
     */
    private void writeSharedStrings(Writer writer) throws IOException {
        if (!hasBytes(2 * LittleEndian.INT_SIZE)) {
            _offset = _recordStream.length;
            return;
        }
        // The first int of the record body is not needed for extraction.
        _offset += LittleEndian.INT_SIZE;
        // The second int is the number of strings stored in this table.
        int sharedStrings = LittleEndian.getInt(_recordStream, _offset);
        _offset += LittleEndian.INT_SIZE;

        for (int x = 0; x < sharedStrings; x++) {
            if (!writeNextString(writer)) {
                _offset = _recordStream.length;
                return;
            }
        }
    }

    /**
     * Reads one string entry at the current offset, writes its characters
     * plus a trailing space, and advances past the whole entry.
     *
     * @return {@code false} if the remaining data is too short for the entry
     */
    private boolean writeNextString(Writer writer) throws IOException {
        // 2-byte character count plus 1-byte option flags.
        if (!hasBytes(LittleEndian.SHORT_SIZE + 1)) {
            return false;
        }
        int strLength = readUShort();
        int flags = _recordStream[_offset++];
        boolean compression = (flags & FLAG_UNCOMPRESSED) == 0;
        boolean asian = (flags & FLAG_EXTENDED) != 0;
        boolean richText = (flags & FLAG_RICH_TEXT) != 0;
        int numRuns = 0;
        int sizeofAsian = 0;

        if (richText) {
            if (!hasBytes(LittleEndian.SHORT_SIZE)) {
                return false;
            }
            numRuns = readUShort();
        }
        if (asian) {
            if (!hasBytes(LittleEndian.INT_SIZE)) {
                return false;
            }
            sizeofAsian = LittleEndian.getInt(_recordStream, _offset);
            // The size field is a 4-byte int, so advance by INT_SIZE.
            _offset += LittleEndian.INT_SIZE;
        }

        int byteLength = compression ? strLength : strLength * 2;
        if (!hasBytes(byteLength)) {
            return false;
        }
        String string = new String(_recordStream, _offset, byteLength,
                compression ? "Cp1252" : "UTF-16LE");
        writer.write(string + ' ');
        _offset += byteLength;

        // Formatting runs follow the characters.
        int runBytes = numRuns * FORMATTING_RUN_SIZE;
        if (!hasBytes(runBytes)) {
            return false;
        }
        _offset += runBytes;

        // The extended data block follows the runs and must be skipped too,
        // otherwise its bytes would be parsed as the next string.
        if (!hasBytes(sizeofAsian)) {
            return false;
        }
        _offset += sizeofAsian;
        return true;
    }

    /**
     * Reads an unsigned 16-bit little-endian value at the current offset and
     * advances past it. The mask keeps values above 32767 from turning
     * negative.
     */
    private int readUShort() {
        int value = LittleEndian.getShort(_recordStream, _offset) & 0xFFFF;
        _offset += LittleEndian.SHORT_SIZE;
        return value;
    }

    /**
     * Tells whether at least {@code count} bytes remain at the current
     * offset. A negative {@code count} is reported as unavailable.
     */
    private boolean hasBytes(int count) {
        return count >= 0 && count <= _recordStream.length - _offset;
    }

}