Example usage for org.apache.poi.hwpf.model TextPiece getRawBytes

List of usage examples for org.apache.poi.hwpf.model TextPiece getRawBytes

Introduction

On this page you can find example usages of org.apache.poi.hwpf.model.TextPiece.getRawBytes().

Prototype

public byte[] getRawBytes() 

Source Link

Usage

From source file:com.xx.platform.util.tools.ms.WordExtractor.java

License:Apache License

/**
 * Grab the text out of the text pieces. Might also include various
 *  bits of crud, but will work in cases where the text piece -> paragraph
 *  mapping is broken. Fast too./*from w  w  w.  j a  va2s.c  o m*/
 */
public String getTextFromPieces() {
    StringBuffer textBuf = new StringBuffer();

    Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = (TextPiece) textPieces.next();

        String encoding = "Cp1252";
        if (piece.isUnicode()) {
            encoding = "UTF-16LE";
        }
        try {
            String text = new String(piece.getRawBytes(), encoding);
            textBuf.append(text);
        } catch (UnsupportedEncodingException e) {
            throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
        }
    }

    String text = textBuf.toString();

    // Fix line endings (Note - won't get all of them
    text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
    text = text.replaceAll("\r\r", "\r\n\r\n");

    if (text.endsWith("\r")) {
        text += "\n";
    }

    return text;
}

From source file:org.opencrx.kernel.text.WordToText.java

License:BSD License

/**
 * Grabs the text straight out of the document's text pieces.
 *
 * <p>The result might include various bits of crud, but this approach still
 * works in cases where the text piece -> paragraph mapping is broken.
 *
 * <p>Each piece is decoded from its raw bytes: pieces flagged as Unicode are
 * read as UTF-16LE, all others as Cp1252. Afterwards runs of two or three
 * carriage returns are expanded to CR LF pairs and a trailing carriage return
 * gets a line feed appended. A lone carriage return in the middle of the text
 * is left untouched, so not every line ending is normalised.
 *
 * @param doc the Word document whose text table is read
 * @return the concatenated text of all text pieces, with the line-ending
 *         fix-ups described above applied
 * @throws IllegalStateException if the JVM does not support the required
 *         encoding (previously the affected piece was silently dropped)
 */
public String getTextFromPieces(HWPFDocument doc) {
    // Local buffer only, so the unsynchronised builder is sufficient.
    StringBuilder textBuf = new StringBuilder();
    Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = textPieces.next();
        String encoding = piece.isUnicode() ? "UTF-16LE" : "Cp1252";
        try {
            textBuf.append(new String(piece.getRawBytes(), encoding));
        } catch (UnsupportedEncodingException e) {
            // Swallowing this would return text with a piece missing and no
            // indication of why; fail loudly and keep the cause instead.
            throw new IllegalStateException("Standard Encoding " + encoding + " not found, JVM broken", e);
        }
    }
    String text = textBuf.toString();
    // Fix line endings (note: this won't get all of them). The longer run must
    // be handled first, otherwise the two-CR rule would split a three-CR run.
    text = text.replace("\r\r\r", "\r\n\r\n\r\n");
    text = text.replace("\r\r", "\r\n\r\n");
    if (text.endsWith("\r")) {
        text += "\n";
    }
    return text;
}

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Grab the text out of the text pieces. Might also include various
 *  bits of crud, but will work in cases where the text piece -> paragraph
 *  mapping is broken. Fast too./*from w w w . ja v a2 s.  c  om*/
 */
public static String getWordTextFromPieces(HWPFDocument doc) {
    StringBuilder textBuf = new StringBuilder();

    Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = (TextPiece) textPieces.next();

        String encoding = "Cp1252";
        if (piece.isUnicode()) {
            encoding = "UTF-16LE";
        }
        try {
            String text = new String(piece.getRawBytes(), encoding);
            textBuf.append(text);
        } catch (UnsupportedEncodingException e) {
            throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
        }
    }

    String text = textBuf.toString();

    // Fix line endings (Note - won't get all of them
    text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
    text = text.replaceAll("\r\r", "\r\n\r\n");

    if (text.endsWith("\r")) {
        text += "\n";
    }

    return text;
}