List of usage examples for org.apache.poi.hwpf.usermodel Range numParagraphs
public int numParagraphs()
From source file:com.google.gdt.handler.impl.WordHandler.java
License:Open Source License
/** * /* w w w .j a va 2 s . c o m*/ * @param inputFile * @param pLevel * @throws IOException * @throws InvalidFormatException */ @Override public void handle(String inputFile, ProgressLevel pLevel) throws IOException, InvalidFormatException { String outPutFile = getOuputFileName(inputFile); OutputStream outputStream = new FileOutputStream(outPutFile); InputStream inputStream = new FileInputStream(inputFile); HWPFDocument hDocument = new HWPFDocument(inputStream); Range range = hDocument.getRange(); pLevel.setTrFileName(outPutFile); pLevel.setValue(0); pLevel.setStringPainted(true); pLevel.setMaxValue(range.numParagraphs()); int count = 0; for (int i = 0; i < range.numParagraphs(); i++) { Paragraph paragraph = range.getParagraph(i); int numCharRuns = paragraph.numCharacterRuns(); for (int j = 0; j < numCharRuns; j++) { if (isInterrupted) { outputStream.close(); new File(outPutFile).delete(); pLevel.setString("cancelled"); return; } CharacterRun charRun = paragraph.getCharacterRun(j); String inputText = charRun.text(); if ((null == inputText) || (inputText.trim().equals(""))) continue; String translatedTxt = inputText; //in http post method, all key value pairs are seperated with & if (preferenceModel.getTranslatorType() == TranslatorType.HTTP) inputText = inputText.replaceAll("&", "and"); try { translatedTxt = translator.translate(translatedTxt); charRun.replaceText(inputText, translatedTxt); } catch (Exception e) { logger.log(Level.SEVERE, "Input File : " + inputFile + " cannot translate the text : " + inputText, e); } } count++; pLevel.setValue(count); } pLevel.setString("done"); hDocument.write(outputStream); outputStream.close(); }
From source file:com.thuvienkhoahoc.wordtomwtext.examples.WordToMwtext.java
License:Apache License
public WordToMwtext(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException { // bagd/* ww w. j av a2s. c om*/ OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8"); _out = out; _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(sectionLevel); writePlainText(text.trim()); closeTitle(sectionLevel); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:com.xx.platform.util.tools.ms.WordExtractor.java
License:Apache License
/** * Get the text from the word file, as an array with one String * per paragraph/*from ww w. j a v a 2 s . c o m*/ */ public String[] getParagraphText() { String[] ret; // Extract using the model code try { Range r = doc.getRange(); ret = new String[r.numParagraphs()]; for (int i = 0; i < ret.length; i++) { Paragraph p = r.getParagraph(i); ret[i] = p.text(); // Fix the line ending if (ret[i].endsWith("\r")) { ret[i] = ret[i] + "\n"; } } } catch (Exception e) { // Something's up with turning the text pieces into paragraphs // Fall back to ripping out the text pieces ret = new String[1]; ret[0] = getTextFromPieces(); } return ret; }
From source file:com.zhch.example.poi.Word2Forrest.java
License:Apache License
@SuppressWarnings("unused") public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException { OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8")); _out = out;/*ww w . j ava 2s.c o m*/ _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(); writePlainText(text); closeTitle(); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document;/*from w w w. j a va2 s .c om*/ try { document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); // mj extractImageText(xhtml, document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for (Picture p = 
pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
/**
 * Sums the paragraph counts of the given ranges, skipping {@code null} entries.
 *
 * @param ranges ranges to count; individual elements may be {@code null}
 * @return total number of paragraphs over all non-null ranges
 */
private static int countParagraphs(Range... ranges) {
    int total = 0;
    for (int idx = 0; idx < ranges.length; idx++) {
        Range current = ranges[idx];
        if (current == null) {
            continue;
        }
        total += current.numParagraphs();
    }
    return total;
}
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (countParagraphs(ranges) > 0) { xhtml.startElement("div", "class", type); for (Range r : ranges) { if (r != null) { for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); String text = p.text(); if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { // Skip empty header or footer paragraphs } else { i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml); }/* ww w .jav a 2s.co m*/ } } } xhtml.endElement("div"); } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { HWPFDocument document;/*from w w w. j a va2 s . c o m*/ try { document = new HWPFDocument(root); } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor( document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, // the pictures should be in order, and may be directly // placed or referenced from an anchor PicturesTable pictureTable = document.getPicturesTable(); PicturesSource pictures = new PicturesSource(document); // Do any headers, if present Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); ListManager listManager = new ListManager(document); for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); } // Do everything else for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getCommentsText()) { xhtml.element("p", paragraph); } for (String paragraph : wordExtractor.getEndnoteText()) { xhtml.element("p", paragraph); } // Do any footers, if present Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet for 
(Picture p = pictures.nextUnclaimed(); p != null;) { handlePictureCharacterRun(null, p, pictures, xhtml); p = pictures.nextUnclaimed(); } // Handle any embeded office documents try { DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); for (Entry entry : op) { if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } } catch (FileNotFoundException e) { } }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
/**
 * Writes the paragraphs of the given header or footer ranges inside a
 * {@code <div class="type">} element. Nothing is written when the ranges hold no
 * paragraphs at all.
 *
 * @param ranges header/footer ranges; individual elements may be {@code null}
 * @param type value of the {@code class} attribute of the wrapping div
 * @param document document the ranges belong to
 * @param pictures picture source passed through to {@code handleParagraph}
 * @param pictureTable picture table passed through to {@code handleParagraph}
 * @param xhtml handler receiving the generated XHTML
 */
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, PicturesSource pictures,
        PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    if (countParagraphs(ranges) <= 0) {
        return;
    }

    xhtml.startElement("div", "class", type);
    // One list manager is shared by all ranges of this header/footer group
    ListManager lists = new ListManager(document);
    for (Range section : ranges) {
        if (section == null) {
            continue;
        }
        int index = 0;
        while (index < section.numParagraphs()) {
            Paragraph para = section.getParagraph(index);
            index += handleParagraph(para, 0, section, document, FieldsDocumentPart.HEADER, pictures,
                    pictureTable, lists, xhtml);
            index++;
        }
    }
    xhtml.endElement("div");
}
From source file:org.esmerilprogramming.pdfcake.DocumentReplace.java
License:Open Source License
/**
 * Reads the document searching for the {@code $$$<key>$$$} placeholders and replaces each
 * one with the matching value from the template. Every occurrence of a placeholder in a
 * paragraph is replaced, not just the first one.
 *
 * @param document the document whose paragraphs are scanned and modified in place
 * @param template supplies the attribute keys and their replacement values
 * @return the same {@code document} instance, after replacement
 */
private static HWPFDocument replaceKeys(HWPFDocument document, DocumentTemplate template) {
    Range range = document.getRange();
    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph p = range.getParagraph(i);
        for (Enumeration<String> e = template.getAttributes().keys(); e.hasMoreElements();) {
            String key = e.nextElement();
            String attributeKey = "$$$" + key + "$$$";
            String replacement = template.getAttributes().get(key);
            if (replacement == null) {
                // No value to insert; leave the placeholder visible rather than fail.
                continue;
            }

            String text;
            try {
                text = p.text();
            } catch (Exception ex) {
                // Best effort: a paragraph whose text cannot be read is skipped for this key
                // instead of reusing stale text whose offsets no longer match.
                continue;
            }
            if (text == null) {
                continue;
            }

            // Mirror each replacement locally so that the offsets of later occurrences stay
            // in step with the paragraph after earlier replacements changed its length.
            StringBuilder mirror = new StringBuilder(text);
            int offset = mirror.indexOf(attributeKey);
            while (offset > -1) {
                p.replaceText(attributeKey, replacement, offset);
                mirror.replace(offset, offset + attributeKey.length(), replacement);
                // Resume after the inserted value so a value containing its own
                // placeholder cannot cause an endless loop.
                offset = mirror.indexOf(attributeKey, offset + replacement.length());
            }
        }
    }
    return document;
}