Example usage for org.apache.poi.hwpf.model CHPBinTable CHPBinTable

List of usage examples for org.apache.poi.hwpf.model CHPBinTable CHPBinTable

Introduction

In this page you can find the example usage for org.apache.poi.hwpf.model CHPBinTable CHPBinTable.

Prototype

public CHPBinTable(byte[] documentStream, byte[] tableStream, int offset, int size,
        CharIndexTranslator translator) 

Source Link

Document

Constructor used to read a binTable in from a Word document.

Usage

From source file:com.progdan.doc2txt.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document./*from  w w w . j av a 2  s  . c  o m*/
 *
 * @param in The InputStream representing the Word file.
 */
public String extractText(InputStream in) throws Exception {
    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:org.apache.nutch.parse.msword.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document./*from  ww w. j a  v a2s .  c  om*/
 *
 * @param in The InputStream representing the Word file.
 */
protected String extractText(InputStream in) throws Exception {

    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}