Example usage for org.apache.poi.hwpf.model TextPiece isUnicode

List of usage examples for org.apache.poi.hwpf.model TextPiece isUnicode

Introduction

On this page you can find example usages of org.apache.poi.hwpf.model TextPiece isUnicode.

Prototype

public boolean isUnicode() 

Source Link

Usage

From source file:com.xx.platform.util.tools.ms.WordExtractor.java

License:Apache License

/**
 * Grab the text out of the text pieces. Might also include various
 *  bits of crud, but will work in cases where the text piece -> paragraph
 *  mapping is broken. Fast too./*from ww w .j av  a2  s.  c  o  m*/
 */
public String getTextFromPieces() {
    StringBuffer textBuf = new StringBuffer();

    Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = (TextPiece) textPieces.next();

        String encoding = "Cp1252";
        if (piece.isUnicode()) {
            encoding = "UTF-16LE";
        }
        try {
            String text = new String(piece.getRawBytes(), encoding);
            textBuf.append(text);
        } catch (UnsupportedEncodingException e) {
            throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
        }
    }

    String text = textBuf.toString();

    // Fix line endings (Note - won't get all of them
    text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
    text = text.replaceAll("\r\r", "\r\n\r\n");

    if (text.endsWith("\r")) {
        text += "\n";
    }

    return text;
}

From source file:org.opencrx.kernel.text.WordToText.java

License:BSD License

/**
 * Grab the text out of the text pieces. Might also include various
 * bits of crud, but will work in cases where the text piece -> paragraph
 * mapping is broken. Fast too./*from   www  . jav  a  2s  .  com*/
 */
public String getTextFromPieces(HWPFDocument doc) {
    StringBuffer textBuf = new StringBuffer();
    Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = textPieces.next();
        String encoding = "Cp1252";
        if (piece.isUnicode()) {
            encoding = "UTF-16LE";
        }
        try {
            String text = new String(piece.getRawBytes(), encoding);
            textBuf.append(text);
        } catch (UnsupportedEncodingException e) {
        }
    }
    String text = textBuf.toString();
    // Fix line endings (Note - won't get all of them
    text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
    text = text.replaceAll("\r\r", "\r\n\r\n");
    if (text.endsWith("\r")) {
        text += "\n";
    }
    return text;
}

From source file:org.wandora.utils.MSOfficeBox.java

License:Open Source License

/**
 * Grab the text out of the text pieces. Might also include various
 *  bits of crud, but will work in cases where the text piece -> paragraph
 *  mapping is broken. Fast too./* w  w w.j  av  a2  s  .  c  o m*/
 */
public static String getWordTextFromPieces(HWPFDocument doc) {
    StringBuilder textBuf = new StringBuilder();

    Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
    while (textPieces.hasNext()) {
        TextPiece piece = (TextPiece) textPieces.next();

        String encoding = "Cp1252";
        if (piece.isUnicode()) {
            encoding = "UTF-16LE";
        }
        try {
            String text = new String(piece.getRawBytes(), encoding);
            textBuf.append(text);
        } catch (UnsupportedEncodingException e) {
            throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
        }
    }

    String text = textBuf.toString();

    // Fix line endings (Note - won't get all of them
    text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
    text = text.replaceAll("\r\r", "\r\n\r\n");

    if (text.endsWith("\r")) {
        text += "\n";
    }

    return text;
}