List of usage examples for the text() method of org.apache.poi.hwpf.usermodel.CharacterRun
String text();
From source file:com.duroty.lucene.parser.MSWordParser.java
License:Open Source License
/** * DOCUMENT ME!//from w w w .ja v a 2s.c o m * * @return DOCUMENT ME! * * @throws ParserException DOCUMENT ME! */ private String getContents() throws ParserException { String contents = ""; try { HWPFDocument doc = new HWPFDocument(input); Range r = doc.getRange(); StringBuffer buffer = new StringBuffer(); for (int x = 0; x < r.numSections(); x++) { Section s = r.getSection(x); for (int y = 0; y < s.numParagraphs(); y++) { Paragraph p = null; try { p = s.getParagraph(y); } catch (Exception e) { buffer.append("\n"); } if (p != null) { for (int z = 0; z < p.numCharacterRuns(); z++) { try { //character run CharacterRun run = p.getCharacterRun(z); //character run text buffer.append(run.text()); } catch (Exception e) { buffer.append(" "); } } } /*if (sleep > 0) { try { Thread.sleep(sleep); } catch (Exception ex) { } }*/ // use a new line at the paragraph break buffer.append("\n"); } } contents = buffer.toString(); } catch (Exception ex) { throw new ParserException(ex); } return contents; }
From source file:com.example.minireader.WordViewActivity.java
License:Apache License
/**html*/ public void writeParagraphContent(Paragraph paragraph) { Paragraph p = paragraph;/*from w w w.j av a 2s . c o m*/ int pnumCharacterRuns = p.numCharacterRuns(); for (int j = 0; j < pnumCharacterRuns; j++) { CharacterRun run = p.getCharacterRun(j); if (run.getPicOffset() == 0 || run.getPicOffset() >= 1000) { if (presentPicture < pictures.size()) { // writePicture(); } } else { try { String text = run.text(); if (text.length() >= 2 && pnumCharacterRuns < 2) { // output.write(text.getBytes()); } else { // int size = run.getFontSize(); int color = run.getColor(); String fontSizeBegin = "<font size=\"" + decideSize(size) + "\">"; String fontColorBegin = "<font color=\"" + decideColor(color) + "\">"; String fontEnd = "</font>"; String boldBegin = "<b>"; String boldEnd = "</b>"; String islaBegin = "<i>"; String islaEnd = "</i>"; output.write(fontSizeBegin.getBytes()); output.write(fontColorBegin.getBytes()); if (run.isBold()) { output.write(boldBegin.getBytes()); } if (run.isItalic()) { output.write(islaBegin.getBytes()); } output.write(text.getBytes()); if (run.isBold()) { output.write(boldEnd.getBytes()); } if (run.isItalic()) { output.write(islaEnd.getBytes()); } output.write(fontEnd.getBytes()); output.write(fontEnd.getBytes()); } } catch (Exception e) { System.out.println("Write File Exception"); } } } }
From source file:com.google.gdt.handler.impl.WordHandler.java
License:Open Source License
/** * // w w w . ja v a 2s .co m * @param inputFile * @param pLevel * @throws IOException * @throws InvalidFormatException */ @Override public void handle(String inputFile, ProgressLevel pLevel) throws IOException, InvalidFormatException { String outPutFile = getOuputFileName(inputFile); OutputStream outputStream = new FileOutputStream(outPutFile); InputStream inputStream = new FileInputStream(inputFile); HWPFDocument hDocument = new HWPFDocument(inputStream); Range range = hDocument.getRange(); pLevel.setTrFileName(outPutFile); pLevel.setValue(0); pLevel.setStringPainted(true); pLevel.setMaxValue(range.numParagraphs()); int count = 0; for (int i = 0; i < range.numParagraphs(); i++) { Paragraph paragraph = range.getParagraph(i); int numCharRuns = paragraph.numCharacterRuns(); for (int j = 0; j < numCharRuns; j++) { if (isInterrupted) { outputStream.close(); new File(outPutFile).delete(); pLevel.setString("cancelled"); return; } CharacterRun charRun = paragraph.getCharacterRun(j); String inputText = charRun.text(); if ((null == inputText) || (inputText.trim().equals(""))) continue; String translatedTxt = inputText; //in http post method, all key value pairs are seperated with & if (preferenceModel.getTranslatorType() == TranslatorType.HTTP) inputText = inputText.replaceAll("&", "and"); try { translatedTxt = translator.translate(translatedTxt); charRun.replaceText(inputText, translatedTxt); } catch (Exception e) { logger.log(Level.SEVERE, "Input File : " + inputFile + " cannot translate the text : " + inputText, e); } } count++; pLevel.setValue(count); } pLevel.setString("done"); hDocument.write(outputStream); outputStream.close(); }
From source file:javaapplication1.HWPFTest.java
private static HWPFDocument replaceText(HWPFDocument doc, String findText, String replaceText) { Range r1 = doc.getRange();//from ww w.j av a2 s. c om for (int i = 0; i < r1.numSections(); ++i) { Section s = r1.getSection(i); for (int x = 0; x < s.numParagraphs(); x++) { Paragraph p = s.getParagraph(x); /*String text = p.text(); if(text.contains(findText)) { p.replaceText(replaceText, findText); }*/ for (int z = 0; z < p.numCharacterRuns(); z++) { CharacterRun run = p.getCharacterRun(z); String text = run.text(); if (text.contains(findText)) { run.replaceText(findText, replaceText); } } } } return doc; }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { // Note - a poi bug means we can't currently properly recurse // into nested tables, so currently we don't if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { Table t = r.getTable(p);/*from www .j a va 2 s . c o m*/ xhtml.startElement("table"); xhtml.startElement("tbody"); for (int rn = 0; rn < t.numRows(); rn++) { TableRow row = t.getRow(rn); xhtml.startElement("tr"); for (int cn = 0; cn < row.numCells(); cn++) { TableCell cell = row.getCell(cn); xhtml.startElement("td"); for (int pn = 0; pn < cell.numParagraphs(); pn++) { Paragraph cellP = cell.getParagraph(pn); handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, xhtml); } xhtml.endElement("td"); } xhtml.endElement("tr"); } xhtml.endElement("tbody"); xhtml.endElement("table"); return (t.numParagraphs() - 1); } TagAndStyle tas; if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex()); if (style != null && style.getName() != null && style.getName().length() > 0) { tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); } else { tas = new TagAndStyle("p", null); } } else { tas = new TagAndStyle("p", null); } if (tas.getStyleClass() != null) { xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); } else { xhtml.startElement(tas.getTag()); } for (int j = 0; j < p.numCharacterRuns(); j++) { CharacterRun cr = p.getCharacterRun(j); // FIELD_BEGIN_MARK: if (cr.text().getBytes()[0] == 0x13) { Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); // 58 is an embedded document // 56 is a document link if (field != null && (field.getType() == 58 || 
field.getType() == 56)) { // Embedded Object: add a <div // class="embedded" id="_X"/> so consumer can see where // in the main text each embedded document // occurred: String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", id); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } if (cr.text().equals("\u0013")) { j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); } else if (cr.text().startsWith("\u0008")) { // Floating Picture(s) for (int pn = 0; pn < cr.text().length(); pn++) { // Assume they're in the order from the unclaimed list... Picture picture = pictures.nextUnclaimed(); // Output handlePictureCharacterRun(cr, picture, pictures, xhtml); } } else if (pictureTable.hasPicture(cr)) { // Inline Picture Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, tas.isHeading(), xhtml); } } // Close any still open style tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (curBold) { xhtml.endElement("b"); curBold = false; } xhtml.endElement(tas.getTag()); return 0; }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) throws SAXException { // Skip trailing newlines if (!isRendered(cr) || cr.text().equals("\r")) return;// ww w. j a v a 2 s . co m if (!skipStyling) { if (cr.isBold() != curBold) { // Enforce nesting -- must close s and i tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (cr.isBold()) { xhtml.startElement("b"); } else { xhtml.endElement("b"); } curBold = cr.isBold(); } if (cr.isItalic() != curItalic) { // Enforce nesting -- must close s tag if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (cr.isItalic()) { xhtml.startElement("i"); } else { xhtml.endElement("i"); } curItalic = cr.isItalic(); } if (cr.isStrikeThrough() != curStrikeThrough) { if (cr.isStrikeThrough()) { xhtml.startElement("s"); } else { xhtml.endElement("s"); } curStrikeThrough = cr.isStrikeThrough(); } } // Clean up the text String text = cr.text(); text = text.replace('\r', '\n'); if (text.endsWith("\u0007")) { // Strip the table cell end marker text = text.substring(0, text.length() - 1); } // Copied from POI's // org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: // line tabulator as break line text = text.replace((char) 0x000b, '\n'); // Non-breaking hyphens are returned as char 30 text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); // Non-required hyphens to zero-width space text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); // TODO: mj xhtml.characters(text); }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
/** * Can be \13..text..\15 or \13..control..\14..text..\15 . Nesting is allowed *///from w w w. ja va 2s .c om private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException { List<CharacterRun> controls = new ArrayList<CharacterRun>(); List<CharacterRun> texts = new ArrayList<CharacterRun>(); boolean has14 = false; // Split it into before and after the 14 int i; for (i = index + 1; i < p.numCharacterRuns(); i++) { CharacterRun cr = p.getCharacterRun(i); if (cr.text().equals("\u0013")) { // Nested, oh joy... int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml); i += increment; } else if (cr.text().equals("\u0014")) { has14 = true; } else if (cr.text().equals("\u0015")) { if (!has14) { texts = controls; controls = new ArrayList<CharacterRun>(); } break; } else { if (has14) { texts.add(cr); } else { controls.add(cr); } } } // Do we need to do something special with this? if (controls.size() > 0) { String text = controls.get(0).text(); for (int j = 1; j < controls.size(); j++) { text += controls.get(j).text(); } if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) && text.indexOf('"') > -1) { String url = text.substring(text.indexOf('"') + 1, text.lastIndexOf('"')); xhtml.startElement("a", "href", url); for (CharacterRun cr : texts) { handleCharacterRun(cr, skipStyling, xhtml); } xhtml.endElement("a"); } else { // Just output the text ones for (CharacterRun cr : texts) { if (pictures.hasPicture(cr)) { Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, skipStyling, xhtml); } } } } else { // We only had text // Output as-is for (CharacterRun cr : texts) { handleCharacterRun(cr, skipStyling, xhtml); } } // Tell them how many to skip over return i - index; }
From source file:Modelo.EscribirWord.java
/**
 * Replaces {@code findText} with {@code replaceText} in every character run of the document
 * whose text contains {@code findText}.
 *
 * <p>A {@code null} replacement is reported once on standard output when the first match is
 * found and is treated as the empty string from then on.
 *
 * @param doc the document to modify in place
 * @param findText the text to search for
 * @param replaceText the text to insert instead; may be {@code null}
 * @return the same {@code doc} instance that was passed in
 */
private HWPFDocument replaceText(HWPFDocument doc, String findText, String replaceText) {
    Range documentRange = doc.getRange();

    for (int sectionIdx = 0; sectionIdx < documentRange.numSections(); sectionIdx++) {
        Section section = documentRange.getSection(sectionIdx);

        for (int paragraphIdx = 0; paragraphIdx < section.numParagraphs(); paragraphIdx++) {
            Paragraph paragraph = section.getParagraph(paragraphIdx);

            int runIdx = 0;
            while (runIdx < paragraph.numCharacterRuns()) {
                CharacterRun run = paragraph.getCharacterRun(runIdx);
                runIdx++;

                if (!run.text().contains(findText)) {
                    continue;
                }
                if (replaceText == null) {
                    System.out.println("null");
                    replaceText = "";
                }
                run.replaceText(findText, replaceText);
            }
        }
    }

    return doc;
}
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { // Note - a poi bug means we can't currently properly recurse // into nested tables, so currently we don't if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { Table t = r.getTable(p);//from w ww .java 2 s . com xhtml.startElement("table"); xhtml.startElement("tbody"); for (int rn = 0; rn < t.numRows(); rn++) { TableRow row = t.getRow(rn); xhtml.startElement("tr"); for (int cn = 0; cn < row.numCells(); cn++) { TableCell cell = row.getCell(cn); xhtml.startElement("td"); for (int pn = 0; pn < cell.numParagraphs(); pn++) { Paragraph cellP = cell.getParagraph(pn); handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml); } xhtml.endElement("td"); } xhtml.endElement("tr"); } xhtml.endElement("tbody"); xhtml.endElement("table"); return (t.numParagraphs() - 1); } String text = p.text(); if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { // Skip empty paragraphs return 0; } TagAndStyle tas; String numbering = null; if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex()); if (style != null && style.getName() != null && style.getName().length() > 0) { if (p.isInList()) { numbering = listManager.getFormattedNumber(p); } tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); } else { tas = new TagAndStyle("p", null); } } else { tas = new TagAndStyle("p", null); } if (tas.getStyleClass() != null) { xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); } else { xhtml.startElement(tas.getTag()); } if (numbering != null) { xhtml.characters(numbering); } for (int j = 0; j < p.numCharacterRuns(); 
j++) { CharacterRun cr = p.getCharacterRun(j); // FIELD_BEGIN_MARK: if (cr.text().getBytes(UTF_8)[0] == 0x13) { Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); // 58 is an embedded document // 56 is a document link if (field != null && (field.getType() == 58 || field.getType() == 56)) { // Embedded Object: add a <div // class="embedded" id="_X"/> so consumer can see where // in the main text each embedded document // occurred: String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", id); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } if (cr.text().equals("\u0013")) { j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); } else if (cr.text().startsWith("\u0008")) { // Floating Picture(s) for (int pn = 0; pn < cr.text().length(); pn++) { // Assume they're in the order from the unclaimed list... Picture picture = pictures.nextUnclaimed(); // Output handlePictureCharacterRun(cr, picture, pictures, xhtml); } } else if (pictureTable.hasPicture(cr)) { // Inline Picture Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, tas.isHeading(), xhtml); } } // Close any still open style tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (curBold) { xhtml.endElement("b"); curBold = false; } xhtml.endElement(tas.getTag()); return 0; }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) throws SAXException { // Skip trailing newlines if (!isRendered(cr) || cr.text().equals("\r")) return;/*from ww w. j a v a 2 s . c om*/ if (!skipStyling) { if (cr.isBold() != curBold) { // Enforce nesting -- must close s and i tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (cr.isBold()) { xhtml.startElement("b"); } else { xhtml.endElement("b"); } curBold = cr.isBold(); } if (cr.isItalic() != curItalic) { // Enforce nesting -- must close s tag if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (cr.isItalic()) { xhtml.startElement("i"); } else { xhtml.endElement("i"); } curItalic = cr.isItalic(); } if (cr.isStrikeThrough() != curStrikeThrough) { if (cr.isStrikeThrough()) { xhtml.startElement("s"); } else { xhtml.endElement("s"); } curStrikeThrough = cr.isStrikeThrough(); } } // Clean up the text String text = cr.text(); text = text.replace('\r', '\n'); if (text.endsWith("\u0007")) { // Strip the table cell end marker text = text.substring(0, text.length() - 1); } // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: // Non-breaking hyphens are returned as char 30 text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); // Non-required hyphens to zero-width space text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); // Control characters as line break text = text.replaceAll("[\u0000-\u001f]", "\n"); xhtml.characters(text); }