Example usage for org.apache.poi.hwpf.model TextPiece substring

List of usage examples for org.apache.poi.hwpf.model TextPiece substring

Introduction

In this page you can find the example usage for org.apache.poi.hwpf.model TextPiece substring.

Prototype

@Deprecated
public String substring(int start, int end) 

Source Link

Document

Returns part of the string.

Usage

From source file:com.krawler.esp.fileparser.word.ExtractWordFile.java

License:Open Source License

public String extractText(String filepath) throws FastSavedException, IOException {
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));
    POIFSFileSystem fsys = new POIFSFileSystem(iStream);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);/*from w w w  .  j  av a 2 s  . com*/
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        System.out.println("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't
    // been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.krawler.esp.fileparser.wordparser.ExtractWordFile.java

License:Open Source License

public String extractText(String filepath) throws FastSavedException, IOException {
    InputStream iStream = new BufferedInputStream(new FileInputStream(filepath));

    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(iStream);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);// w w  w .jav a  2s. c o m
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        System.out.println("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    // Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    // get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);

    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header, tpt);
    }
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin, tpt);
    // load our text pieces and our character runs

    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't
    // been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:com.progdan.doc2txt.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document.//  w w  w .j a  v a 2 s .  c  o  m
 *
 * @param in The InputStream representing the Word file.
 */
public String extractText(InputStream in) throws Exception {
    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}

From source file:org.apache.nutch.parse.msword.WordExtractor.java

License:Apache License

/**
 * Gets the text from a Word document./*  w w w  .  j  a v a 2s  .c o  m*/
 *
 * @param in The InputStream representing the Word file.
 */
protected String extractText(InputStream in) throws Exception {

    ArrayList text = new ArrayList();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    // load our POIFS document streams.
    DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
    DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
    byte[] header = new byte[headerProps.getSize()];

    din.read(header);
    din.close();

    int info = LittleEndian.getShort(header, 0xa);
    if ((info & 0x4) != 0) {
        throw new FastSavedException("Fast-saved files are unsupported at this time");
    }
    if ((info & 0x100) != 0) {
        throw new PasswordProtectedException("This document is password protected");
    }

    // determine the version of Word this document came from.
    int nFib = LittleEndian.getShort(header, 0x2);
    switch (nFib) {
    case 101:
    case 102:
    case 103:
    case 104:
        // this is a Word 6.0 doc send it to the extractor for that version.
        Word6Extractor oldExtractor = new Word6Extractor();
        return oldExtractor.extractText(header);
    }

    //Get the information we need from the header
    boolean useTable1 = (info & 0x200) != 0;

    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    // determine which table stream we must use.
    String tableName = null;
    if (useTable1) {
        tableName = "1Table";
    } else {
        tableName = "0Table";
    }

    DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
    byte[] tableStream = new byte[table.getSize()];

    din = fsys.createDocumentInputStream(tableName);

    din.read(tableStream);
    din.close();

    int chpOffset = LittleEndian.getInt(header, 0xfa);
    int chpSize = LittleEndian.getInt(header, 0xfe);
    int fcMin = LittleEndian.getInt(header, 0x18);
    CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);

    // load our text pieces and our character runs
    ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
    TextPieceTable tpt = cft.getTextPieceTable();
    List textPieces = tpt.getTextPieces();

    // make the POIFS objects available for garbage collection
    din = null;
    fsys = null;
    table = null;
    headerProps = null;

    List textRuns = cbt.getTextRuns();
    Iterator runIt = textRuns.iterator();
    Iterator textIt = textPieces.iterator();

    TextPiece currentPiece = (TextPiece) textIt.next();
    int currentTextStart = currentPiece.getStart();
    int currentTextEnd = currentPiece.getEnd();

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    // iterate through all text runs extract the text only if they haven't been
    // deleted
    while (runIt.hasNext()) {
        CHPX chpx = (CHPX) runIt.next();
        boolean deleted = isDeleted(chpx.getGrpprl());
        if (deleted) {
            continue;
        }

        int runStart = chpx.getStart();
        int runEnd = chpx.getEnd();

        while (runStart >= currentTextEnd) {
            currentPiece = (TextPiece) textIt.next();
            currentTextStart = currentPiece.getStart();
            currentTextEnd = currentPiece.getEnd();
        }

        if (runEnd < currentTextEnd) {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else if (runEnd > currentTextEnd) {
            while (runEnd > currentTextEnd) {
                String str = currentPiece.substring(runStart - currentTextStart,
                        currentTextEnd - currentTextStart);
                finalTextBuf.append(str);
                if (textIt.hasNext()) {
                    currentPiece = (TextPiece) textIt.next();
                    currentTextStart = currentPiece.getStart();
                    runStart = currentTextStart;
                    currentTextEnd = currentPiece.getEnd();
                } else {
                    return finalTextBuf.toString();
                }
            }
            String str = currentPiece.substring(0, runEnd - currentTextStart);
            finalTextBuf.append(str);
        } else {
            String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
            if (textIt.hasNext()) {
                currentPiece = (TextPiece) textIt.next();
                currentTextStart = currentPiece.getStart();
                currentTextEnd = currentPiece.getEnd();
            }
            finalTextBuf.append(str);
        }
    }
    return finalTextBuf.toString();
}