Example usage for org.apache.poi.util LittleEndian getShort

List of usage examples for org.apache.poi.util LittleEndian getShort

Introduction

On this page you can find example usage of org.apache.poi.util.LittleEndian#getShort.

Prototype

public static short getShort(byte[] data, int offset) 

Source Link

Document

get a short value from a byte array

Usage

From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java

License:Open Source License

/**
 * Extracts the visible (non-deleted) text from a Word 97+ (.doc) file on disk.
 *
 * @param filepath path of the .doc file to read
 * @return the concatenated document text
 * @throws FastSavedException if the FIB flags mark the file as fast-saved
 * @throws IOException        if the file cannot be opened or read as POIFS
 */
public String extractText(String filepath) throws FastSavedException, IOException {
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));
    POIFSFileSystem fsys = new POIFSFileSystem(iStream);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    // NOTE(review): read() return value is ignored -- assumes the stream fills
    // the buffer completely; a short read would leave trailing zero bytes.
    din.read(header);
    din.close();

    // Flags word of the FIB at offset 0xA.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        // bit 2 set => fast-saved (complex) document
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        // bit 8 set => password protected; only reported, extraction continues
        System.out.println("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    // NOTE(review): read() return value ignored here as well.
    din.read(tableStream);
    din.close();

    // Offsets/sizes read from the FIB: character-run bin table and fcMin.
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    // Current text piece and its character range [currentTextStart, currentTextEnd).
    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't
    // been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece containing the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies wholly inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans several pieces: emit piece by piece until consumed
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of pieces -- return what we have so far
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary; emit it and step forward
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java

License:Open Source License

/**
 * Extracts the visible (non-deleted) text from a Word (.doc) file on disk.
 * Unlike the sibling extractor, the Word 6.0 path here is taken after the
 * text piece table has been built and is handed that table.
 *
 * @param filepath path of the .doc file to read
 * @return the concatenated document text
 * @throws FastSavedException if the FIB flags mark the file as fast-saved
 * @throws IOException        if the file cannot be opened or read as POIFS
 */
public String extractText(String filepath) throws FastSavedException, IOException {
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));

    ArrayList text = new ArrayList(); // NOTE(review): never used -- candidate for removal
    POIFSFileSystem fsys = new POIFSFileSystem(iStream);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    // NOTE(review): read() return value ignored -- assumes a full read.
    din.read(header);
    din.close();

    // Flags word of the FIB at offset 0xA.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        // bit 2 set => fast-saved (complex) document
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        // bit 8 set => password protected; only reported, extraction continues
        System.out.println("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    // NOTE(review): read() return value ignored here as well.
    din.read(tableStream);
    din.close();

    // Offsets/sizes read from the FIB: character-run bin table and fcMin.
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header, tpt);
    }
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);
    // load our text pieces and our character runs

    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    // Current text piece and its character range [currentTextStart, currentTextEnd).
    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't
    // been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece containing the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies wholly inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans several pieces: emit piece by piece until consumed
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of pieces -- return what we have so far
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary; emit it and step forward
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.progdan.doc2txt.sprm.SprmOperation.java

License:Apache License

/**
 * Parses one SPRM (single property modifier) out of a grpprl byte array:
 * a two-byte opcode followed by an operand whose width is selected by the
 * opcode's size-code bits.
 *
 * @param grpprl byte array containing the property modifiers
 * @param offset index of the first byte of this SPRM within {@code grpprl}
 */
public SprmOperation(byte[] grpprl, int offset) {
    short sprmStart = LittleEndian.getShort(grpprl, offset);
    offset += 2; // step past the two-byte opcode

    _operation = OP_BITFIELD.getValue(sprmStart);
    _type = TYPE_BITFIELD.getValue(sprmStart);
    int sizeCode = SIZECODE_BITFIELD.getValue(sprmStart);

    switch (sizeCode) {
    case 0:
    case 1:
        // one-byte operand (read unsigned); total SPRM length 3
        _operand = LittleEndian.getUnsignedByte(grpprl, offset);
        _sizeNeeded = 3;
        break;
    case 2:
    case 4:
    case 5:
        // two-byte operand; total SPRM length 4
        _operand = LittleEndian.getShort(grpprl, offset);
        _sizeNeeded = 4;
        break;
    case 3:
        // four-byte operand; total SPRM length 6
        _operand = LittleEndian.getInt(grpprl, offset);
        _sizeNeeded = 6;
        break;
    case 6:
        // variable-length operand: first byte is the length, data follows.
        // NOTE(review): the length byte is read signed -- a value > 127 would
        // produce a negative array size; confirm inputs stay small.
        _varOperand = new byte[grpprl[offset++]];
        System.arraycopy(grpprl, offset, _varOperand, 0, _varOperand.length);
        _sizeNeeded = _varOperand.length + 3;
        break;
    case 7:
        // three-byte little-endian operand, zero-extended into four bytes
        byte threeByteInt[] = new byte[4];
        threeByteInt[0] = grpprl[offset];
        threeByteInt[1] = grpprl[offset + 1];
        threeByteInt[2] = grpprl[offset + 2];
        threeByteInt[3] = (byte) 0;
        _operand = LittleEndian.getInt(threeByteInt, 0);
        _sizeNeeded = 5;
        break;

    // NOTE(review): no default branch -- an unexpected size code leaves
    // _operand/_sizeNeeded at their field defaults.
    }
}

From source file:com.progdan.doc2txt.WordExtractor.java

License:Apache License

/**
 * Gets the visible (non-deleted) text from a Word document.
 *
 * @param in The InputStream representing the Word file.
 * @return the concatenated document text
 * @throws Exception if the stream is not readable as a Word document, is
 *         fast-saved, or is password protected
 */
public String extractText(InputStream in) throws Exception {
    ArrayList text = new ArrayList(); // NOTE(review): never used -- candidate for removal
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    // NOTE(review): read() return value ignored -- assumes a full read.
    din.read(header);
    din.close();

    // Flags word of the FIB at offset 0xA.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        // bit 2 set => fast-saved (complex) document
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        // bit 8 set => password protected; extraction aborts
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    // NOTE(review): read() return value ignored here as well.
    din.read(tableStream);
    din.close();

    // Offsets/sizes read from the FIB: character-run bin table and fcMin.
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    // Current text piece and its character range [currentTextStart, currentTextEnd).
    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece containing the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies wholly inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans several pieces: emit piece by piece until consumed
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of pieces -- return what we have so far
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary; emit it and step forward
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:org.apache.nutch.parse.msword.WordExtractor.java

License:Apache License

/**
 * Gets the visible (non-deleted) text from a Word document.
 *
 * @param in The InputStream representing the Word file.
 * @return the concatenated document text
 * @throws Exception if the stream is not readable as a Word document, is
 *         fast-saved, or is password protected
 */
protected String extractText(InputStream in) throws Exception {

    ArrayList text = new ArrayList(); // NOTE(review): never used -- candidate for removal
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    // NOTE(review): read() return value ignored -- assumes a full read.
    din.read(header);
    din.close();

    // Flags word of the FIB at offset 0xA.
    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        // bit 2 set => fast-saved (complex) document
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        // bit 8 set => password protected; extraction aborts
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    // NOTE(review): read() return value ignored here as well.
    din.read(tableStream);
    din.close();

    // Offsets/sizes read from the FIB: character-run bin table and fcMin.
    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    // Current text piece and its character range [currentTextStart, currentTextEnd).
    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        // advance to the text piece containing the start of this run
        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            // run lies wholly inside the current piece
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            // run spans several pieces: emit piece by piece until consumed
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    // ran out of pieces -- return what we have so far
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            // run ends exactly at the piece boundary; emit it and step forward
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:org.ddt.listener.dsi.HeadingPairProperty.java

License:Apache License

/**
 * The constructor: reads a heading-pair (name string + parts count) out of
 * a property-set byte array.
 *
 * @param data           the data to read from.
 * @param dataOffset     the offset into the <code>data</code> byte array.
 * @param docPartsOffset the offset of the corresponding docparts.
 * @throws IllegalVariantTypeException  if the data is malformed.
 * @throws UnsupportedEncodingException
 */
HeadingPairProperty(byte[] data, int dataOffset, int docPartsOffset)
        throws IllegalVariantTypeException, UnsupportedEncodingException {
    int off = dataOffset;
    // the heading name, stored as a string property
    name = new StringProperty(data, off);
    off += name.getSize();
    // next field must be a VT_I4 variant holding the parts count
    long type = LittleEndian.getUInt(data, off);
    if (type != Variant.VT_I4) {
        log.log(Level.WARNING, "Not a proper VT_I4 type.");
        throw new IllegalVariantTypeException(type, name);
    }
    off += LittleEndian.INT_SIZE;
    //this is a horrible workaround, around the bug in HPSF, that returns
    //cutoff byte arrays from Section.getProperty() (HPFS Bug #52337)
    //It hopes that there aren't too many parts per heading (i.e. worst
    //case it can be store in one byte...)
    int left = data.length - off;
    if (left >= LittleEndian.INT_SIZE) {
        // full 4-byte count available
        partsCount = (int) LittleEndian.getUInt(data, off);
        off += LittleEndian.INT_SIZE;
    } else if (left >= LittleEndian.SHORT_SIZE) {
        // truncated to 2-3 bytes: read a short and consume the remainder
        partsCount = LittleEndian.getShort(data, off);
        off += left;
    } else if (left >= LittleEndian.BYTE_SIZE) {
        // truncated to a single byte
        partsCount = LittleEndian.getUByte(data, off);
        off += left;
    } else {
        partsCount = 1; //default... maybe not a good idea.
    }
    // total number of bytes this property occupied
    size = off - dataOffset;

    this.docPartsOffset = docPartsOffset;
}

From source file:org.ddt.listener.records.DConRefRecord.java

License:Apache License

/**
 * Read constructor: parses a DConRef record, including its 4-byte header
 * (sid + length), from the given byte array.
 *
 * @param data byte array containing a DConRef Record, including the header.
 * @throws RecordFormatException if the sid does not match or the character
 *         count is below the minimum of 2
 */
public DConRefRecord(byte[] data) {
    int offset = 0;
    // the record must start with this record type's sid
    if (!(LittleEndian.getShort(data, offset) == DConRefRecord.sid))
        throw new RecordFormatException("incompatible sid.");
    offset += LittleEndian.SHORT_SIZE;

    //length = LittleEndian.getShort(data, offset);
    offset += LittleEndian.SHORT_SIZE;

    // referenced cell range: rows as unsigned shorts, columns as unsigned bytes
    firstRow = LittleEndian.getUShort(data, offset);
    offset += LittleEndian.SHORT_SIZE;
    lastRow = LittleEndian.getUShort(data, offset);
    offset += LittleEndian.SHORT_SIZE;
    firstCol = LittleEndian.getUByte(data, offset);
    offset += LittleEndian.BYTE_SIZE;
    lastCol = LittleEndian.getUByte(data, offset);
    offset += LittleEndian.BYTE_SIZE;
    // number of characters in the path string that follows
    charCount = LittleEndian.getUShort(data, offset);
    offset += LittleEndian.SHORT_SIZE;
    if (charCount < 2)
        throw new org.apache.poi.hssf.record.RecordFormatException("Character count must be >= 2");

    charType = LittleEndian.getUByte(data, offset);
    offset += LittleEndian.BYTE_SIZE; //7 bits reserved + 1 bit type

    /*
     * bytelength is the length of the string in bytes, which depends on whether the string is
     * made of single- or double-byte chars. This is given by charType, which equals 0 if
     * single-byte, 1 if double-byte.
     */
    int byteLength = charCount * ((charType & 1) + 1);

    path = LittleEndian.getByteArray(data, offset, byteLength);
    offset += byteLength;

    /*
     * If it's a self reference, the last one or two bytes (depending on char type) are the
     * unused field. Not sure If i need to bother with this...
     */
    if (path[0] == 0x02)
        _unused = LittleEndian.getByteArray(data, offset, (charType + 1));

}

From source file:org.textmining.extraction.excel.ExcelTextExtractor.java

License:Open Source License

/**
 * Scans the workbook record stream for the SST (shared strings) record and
 * writes every shared string to the writer, separated by single spaces.
 * Stops at the EOF record (type 0xA); all other record types are skipped
 * using their declared length.
 *
 * @param writer destination for the extracted strings
 * @throws IOException if the writer fails or a string uses an unsupported
 *         encoding
 */
public void getText(Writer writer) throws IOException {
    while (_offset < _recordStream.length) {
        // Each record: 2-byte type, 2-byte length, then payload.
        int type = LittleEndian.getShort(_recordStream, _offset);
        _offset += LittleEndian.SHORT_SIZE;
        if (type == 0xa) {
            // EOF record -- stop scanning.
            break;
        }
        int size = LittleEndian.getShort(_recordStream, _offset);
        _offset += LittleEndian.SHORT_SIZE;
        if (type == Record.SST_RECORD) {
            int totalStrings = LittleEndian.getInt(_recordStream, _offset);
            _offset += LittleEndian.INT_SIZE;
            int sharedStrings = LittleEndian.getInt(_recordStream, _offset);
            _offset += LittleEndian.INT_SIZE;
            // NOTE(review): strings spanning a Continue record are not handled;
            // a very large SST would desynchronize this scan.
            for (int x = 0; x < sharedStrings; x++) {
                // XLUnicodeRichExtendedString: cch, flags, [cRun], [cbExtRst],
                // characters, [rgRun], [ExtRst].
                int strLength = LittleEndian.getShort(_recordStream, _offset);
                _offset += LittleEndian.SHORT_SIZE;

                int flags = _recordStream[_offset++];
                boolean compression = (flags & 0x1) == 0; // bit 0 clear => 8-bit chars
                boolean asian = (flags & 0x4) != 0;       // bit 2 => ExtRst (phonetic) data
                boolean richText = (flags & 8) != 0;      // bit 3 => formatting runs
                int numRuns = 0;
                int sizeofAsian = 0;

                if (richText) {
                    numRuns = LittleEndian.getShort(_recordStream, _offset);
                    _offset += LittleEndian.SHORT_SIZE;
                }
                if (asian) {
                    sizeofAsian = LittleEndian.getInt(_recordStream, _offset);
                    // BUGFIX: getInt consumes 4 bytes, so advance by INT_SIZE.
                    // The original advanced by SHORT_SIZE (2), desynchronizing
                    // the stream for every string with ExtRst data.
                    _offset += LittleEndian.INT_SIZE;
                }
                int byteLength = !compression ? strLength * 2 : strLength;
                String string = new String(_recordStream, _offset, byteLength,
                        compression ? "Cp1252" : "UTF-16LE");
                writer.write(string + ' ');

                _offset += byteLength;
                if (richText) {
                    // each formatting run is 4 bytes
                    _offset += (numRuns * 4);
                }
                if (asian) {
                    // BUGFIX: skip the ExtRst block that follows the runs; the
                    // original read its size but never consumed its bytes.
                    _offset += sizeofAsian;
                }
            }
        } else {
            // not an SST record -- skip its payload
            _offset += size;
        }
    }
}

From source file:org.textmining.extraction.word.ComplexFileTable.java

License:Open Source License

/**
 * Locates the text piece table inside the table stream and builds a
 * {@link TextPieceTable} from it, skipping any leading grpprl blocks left
 * behind by fast saves.
 *
 * @param documentStream the main WordDocument stream
 * @param tableStream    the 0Table/1Table stream containing the piece table
 * @param offset         where in the table stream to start looking
 * @param fcMin          minimum file character position
 * @throws IOException if the expected piece-table marker is not found
 */
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
    // Fast-saved files prepend grpprl blocks before the piece table; each is
    // a type byte, a 2-byte size, then that many payload bytes. Hop over them.
    int pos = offset;
    while (tableStream[pos] == GRPPRL_TYPE) {
        int grpprlSize = LittleEndian.getShort(tableStream, pos + 1);
        pos += 1 + LittleEndian.SHORT_SIZE + grpprlSize;
    }

    // The next byte must introduce the text piece table itself.
    if (tableStream[pos] != TEXT_PIECE_TABLE_TYPE) {
        throw new IOException("The text piece table is corrupted");
    }

    pos++; // step past the type marker
    int pieceTableSize = LittleEndian.getInt(tableStream, pos);
    pos += LittleEndian.INT_SIZE;
    _tpt = new TextPieceTable(documentStream, tableStream, pos, pieceTableSize, fcMin);
}

From source file:org.textmining.extraction.word.model.PieceDescriptor.java

License:Open Source License

/**
 * Decodes one piece descriptor (PCD): a 2-byte descriptor, a 4-byte file
 * character position, and a 2-byte prm, then resolves whether the piece's
 * text is stored as Unicode.
 *
 * @param buf    byte array containing the descriptor
 * @param offset index of the descriptor's first byte within {@code buf}
 */
public PieceDescriptor(byte[] buf, int offset) {
    descriptor = LittleEndian.getShort(buf, offset);
    fc = LittleEndian.getInt(buf, offset + LittleEndian.SHORT_SIZE);
    prm = LittleEndian.getShort(buf, offset + LittleEndian.SHORT_SIZE + LittleEndian.INT_SIZE);

    // Bit 30 of fc clear means the piece is stored as Unicode text. If it is
    // set, the text is compressed: clear the flag and halve fc to get the
    // real byte offset into the document stream.
    unicode = (fc & 0x40000000) == 0;
    if (!unicode) {
        fc &= ~(0x40000000); //gives me FC in doc stream
        fc /= 2;
    }

}