List of usage examples for org.apache.poi.hwpf.model TextPiece getRawBytes
public byte[] getRawBytes()
From source file:com.xx.platform.util.tools.ms.WordExtractor.java
License:Apache License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too./*from w w w. j a va2s.c o m*/ */ public String getTextFromPieces() { StringBuffer textBuf = new StringBuffer(); Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = (TextPiece) textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }
From source file:org.opencrx.kernel.text.WordToText.java
License:BSD License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too.//w ww.java2s .c om */ public String getTextFromPieces(HWPFDocument doc) { StringBuffer textBuf = new StringBuffer(); Iterator<TextPiece> textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
/** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too./*from w w w . ja v a2 s. c om*/ */ public static String getWordTextFromPieces(HWPFDocument doc) { StringBuilder textBuf = new StringBuilder(); Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = (TextPiece) textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch (UnsupportedEncodingException e) { throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if (text.endsWith("\r")) { text += "\n"; } return text; }