List of usage examples for org.apache.poi.hwpf.model TextPiece isUnicode
public boolean isUnicode()
From source file:com.xx.platform.util.tools.ms.WordExtractor.java
License:Apache License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too./*from ww w .j av a2 s. c o m*/ */ public String getTextFromPieces() { StringBuffer textBuf = new StringBuffer(); Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = (TextPiece) textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }
From source file:org.opencrx.kernel.text.WordToText.java
License:BSD License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too./*from www . jav a 2s . com*/ */ public String getTextFromPieces(HWPFDocument doc) { StringBuffer textBuf = new StringBuffer(); Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too./* w w w.j av a2 s . c o m*/ */ public static String getWordTextFromPieces(HWPFDocument doc) { StringBuilder textBuf = new StringBuilder(); Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = (TextPiece) textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }