List of usage examples for org.apache.poi.hwpf.usermodel Paragraph getStyleIndex
public short getStyleIndex()
From source file:com.thuvienkhoahoc.wordtomwtext.examples.WordToMwtext.java
License:Apache License
public WordToMwtext(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException { // bagd/*from w ww. ja v a2 s.c om*/ OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8"); _out = out; _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(sectionLevel); writePlainText(text.trim()); closeTitle(sectionLevel); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:com.zhch.example.poi.Word2Forrest.java
License:Apache License
@SuppressWarnings("unused") public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException { OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8")); _out = out;//w w w . j a v a 2 s. c om _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(); writePlainText(text); closeTitle(); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java
License:Apache License
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { // Note - a poi bug means we can't currently properly recurse // into nested tables, so currently we don't if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { Table t = r.getTable(p);//from w ww . j a v a2s. c o m xhtml.startElement("table"); xhtml.startElement("tbody"); for (int rn = 0; rn < t.numRows(); rn++) { TableRow row = t.getRow(rn); xhtml.startElement("tr"); for (int cn = 0; cn < row.numCells(); cn++) { TableCell cell = row.getCell(cn); xhtml.startElement("td"); for (int pn = 0; pn < cell.numParagraphs(); pn++) { Paragraph cellP = cell.getParagraph(pn); handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, xhtml); } xhtml.endElement("td"); } xhtml.endElement("tr"); } xhtml.endElement("tbody"); xhtml.endElement("table"); return (t.numParagraphs() - 1); } TagAndStyle tas; if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex()); if (style != null && style.getName() != null && style.getName().length() > 0) { tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); } else { tas = new TagAndStyle("p", null); } } else { tas = new TagAndStyle("p", null); } if (tas.getStyleClass() != null) { xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); } else { xhtml.startElement(tas.getTag()); } for (int j = 0; j < p.numCharacterRuns(); j++) { CharacterRun cr = p.getCharacterRun(j); // FIELD_BEGIN_MARK: if (cr.text().getBytes()[0] == 0x13) { Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); // 58 is an embedded document // 56 is a document link if (field != null && (field.getType() == 58 || 
field.getType() == 56)) { // Embedded Object: add a <div // class="embedded" id="_X"/> so consumer can see where // in the main text each embedded document // occurred: String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", id); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } if (cr.text().equals("\u0013")) { j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); } else if (cr.text().startsWith("\u0008")) { // Floating Picture(s) for (int pn = 0; pn < cr.text().length(); pn++) { // Assume they're in the order from the unclaimed list... Picture picture = pictures.nextUnclaimed(); // Output handlePictureCharacterRun(cr, picture, pictures, xhtml); } } else if (pictureTable.hasPicture(cr)) { // Inline Picture Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, tas.isHeading(), xhtml); } } // Close any still open style tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (curBold) { xhtml.endElement("b"); curBold = false; } xhtml.endElement(tas.getTag()); return 0; }
From source file:org.apache.tika.parser.microsoft.WordExtractor.java
License:Apache License
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { // Note - a poi bug means we can't currently properly recurse // into nested tables, so currently we don't if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { Table t = r.getTable(p);/*from ww w . ja v a 2 s . c o m*/ xhtml.startElement("table"); xhtml.startElement("tbody"); for (int rn = 0; rn < t.numRows(); rn++) { TableRow row = t.getRow(rn); xhtml.startElement("tr"); for (int cn = 0; cn < row.numCells(); cn++) { TableCell cell = row.getCell(cn); xhtml.startElement("td"); for (int pn = 0; pn < cell.numParagraphs(); pn++) { Paragraph cellP = cell.getParagraph(pn); handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml); } xhtml.endElement("td"); } xhtml.endElement("tr"); } xhtml.endElement("tbody"); xhtml.endElement("table"); return (t.numParagraphs() - 1); } String text = p.text(); if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { // Skip empty paragraphs return 0; } TagAndStyle tas; String numbering = null; if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex()); if (style != null && style.getName() != null && style.getName().length() > 0) { if (p.isInList()) { numbering = listManager.getFormattedNumber(p); } tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); } else { tas = new TagAndStyle("p", null); } } else { tas = new TagAndStyle("p", null); } if (tas.getStyleClass() != null) { xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); } else { xhtml.startElement(tas.getTag()); } if (numbering != null) { xhtml.characters(numbering); } for (int j = 0; j < 
p.numCharacterRuns(); j++) { CharacterRun cr = p.getCharacterRun(j); // FIELD_BEGIN_MARK: if (cr.text().getBytes(UTF_8)[0] == 0x13) { Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); // 58 is an embedded document // 56 is a document link if (field != null && (field.getType() == 58 || field.getType() == 56)) { // Embedded Object: add a <div // class="embedded" id="_X"/> so consumer can see where // in the main text each embedded document // occurred: String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", id); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } if (cr.text().equals("\u0013")) { j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); } else if (cr.text().startsWith("\u0008")) { // Floating Picture(s) for (int pn = 0; pn < cr.text().length(); pn++) { // Assume they're in the order from the unclaimed list... Picture picture = pictures.nextUnclaimed(); // Output handlePictureCharacterRun(cr, picture, pictures, xhtml); } } else if (pictureTable.hasPicture(cr)) { // Inline Picture Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); } else { handleCharacterRun(cr, tas.isHeading(), xhtml); } } // Close any still open style tags if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; } if (curItalic) { xhtml.endElement("i"); curItalic = false; } if (curBold) { xhtml.endElement("b"); curBold = false; } xhtml.endElement(tas.getTag()); return 0; }
From source file:org.docx4j.convert.in.Doc.java
License:Apache License
private static org.docx4j.wml.P handleP(WordprocessingMLPackage wordMLPackage, HWPFDocument doc, Paragraph p, org.apache.poi.hwpf.model.StyleSheet stylesheet, MainDocumentPart documentPart, org.docx4j.wml.ObjectFactory factory) { org.docx4j.wml.P wmlP = null;//from w w w .java 2s .c o m if (p.getStyleIndex() > 0) { log.debug("Styled paragraph, with index: " + p.getStyleIndex()); String styleName = stylesheet.getStyleDescription(p.getStyleIndex()).getName(); log.debug(styleName); wmlP = documentPart.createStyledParagraphOfText(stripSpace(styleName), null); } else { wmlP = documentPart.createParagraphOfText(null); } // LineSpacingDescriptor lsd = p.getLineSpacing(); // if (lsd==null || lsd.isEmpty()) { // // do nothing // } else { // PPr pPr = wmlP.getPPr(); // if (pPr==null) { // pPr = Context.getWmlObjectFactory().createPPr(); // wmlP.setPPr(pPr); // } // Spacing spacing = // Context.getWmlObjectFactory().createPPrBaseSpacing(); // spacing.setLine(lsd._dyaLine); // not visible // spacing.setLineRule(STLineSpacingRule.AUTO); // pPr.setSpacing(spacing); // } for (int z = 0; z < p.numCharacterRuns(); z++) { // character run CharacterRun run = p.getCharacterRun(z); // No character styles defined in there?? 
org.docx4j.wml.RPr rPr = null; if (run.isBold()) { // TODO - HIGH PRIORITY- handle other run properties // esp underline, font size if (rPr == null) { rPr = factory.createRPr(); } org.docx4j.wml.BooleanDefaultTrue boldOn = factory.createBooleanDefaultTrue(); boldOn.setVal(Boolean.TRUE); rPr.setB(boldOn); } //Process image if (doc instanceof HWPFDocument && ((HWPFDocument) doc).getPicturesTable().hasPicture(run)) { Picture picture = doc.getPicturesTable().extractPicture(run, true); Inline inline; try { BinaryPartAbstractImage imagePart = BinaryPartAbstractImage.createImagePart(wordMLPackage, picture.getContent()); long cx = UnitsOfMeasurement .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getWidthMpt() * ((double) picture.getHorizontalScalingFactor() * 0.00001d))) * 2L; long cy = UnitsOfMeasurement .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getHeightMpt() * ((double) picture.getVerticalScalingFactor() * 0.00001d))) * 2L; inline = imagePart.createImageInline(null, "", ID1++, ID2++, cx, cy, false); org.docx4j.wml.R imgrun = factory.createR(); org.docx4j.wml.Drawing drawing = factory.createDrawing(); imgrun.getContent().add(drawing); drawing.getAnchorOrInline().add(inline); wmlP.getContent().add(imgrun); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } else { // character run text String text = run.text(); // show us the text log.debug("Processing: " + text); String cleansed = stripNonValidXMLCharacters(text); // Necessary to avoid org.xml.sax.SAXParseException: An invalid // XML character // (Unicode: 0xb) was found in the element content of the // document. // when trying to open the resulting docx. // ie JAXB happily writes (marshals) it, but doesn't want to // unmarshall. 
if (!text.equals(cleansed)) { log.warn("Cleansed.."); } org.docx4j.wml.Text t = factory.createText(); t.setValue(cleansed); org.docx4j.wml.R wmlRun = factory.createR(); if (rPr != null) { wmlRun.setRPr(rPr); } wmlRun.getRunContent().add(t); wmlP.getParagraphContent().add(wmlRun); } } System.out.println(XmlUtils.marshaltoString(wmlP, true, true)); return wmlP; }
From source file:org.modeshape.sequencer.msoffice.word.WordMetadataReader.java
License:Apache License
public static WordMetadata instance(InputStream stream) throws IOException { WordMetadata metadata = new WordMetadata(); List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>(); HWPFDocument document = new HWPFDocument(stream); Range range = document.getRange();//from w ww. j a v a2 s . c o m StyleSheet stylesheet = document.getStyleSheet(); for (int i = 0; i < range.numParagraphs(); i++) { Paragraph paragraph = range.getParagraph(i); String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName(); if (styleName.startsWith(HEADER_PREFIX)) { String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim(); int levelNum = 0; try { levelNum = Integer.parseInt(rawLevelNum); } catch (NumberFormatException nfe) { log.debug("Could not parse heading level from: " + styleName); } String text = Paragraph.stripFields(paragraph.text()); if ('\r' == text.charAt(text.length() - 1)) { text = text.substring(0, text.length() - 1); } headings.add(new WordMetadata.WordHeading(text, levelNum)); } } metadata.setHeadings(headings); metadata.setMetadata(document.getSummaryInformation()); return metadata; }
From source file:poi.hwpf.Word2Forrest.java
License:Apache License
public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException { OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8"); _out = out;/*w w w . ja v a 2 s . co m*/ _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(); writePlainText(text); closeTitle(); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } openParagraph(); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); }
From source file:textextractor.WordManager.java
/**
 * Reads a Word document and returns the space-separated tokens of all its paragraphs.
 *
 * <p>Each paragraph's text is printed to standard output (preceded by its style name when
 * that is not {@code "Normal"}), split on single spaces, and appended to {@code listDoc},
 * which is reset at the start of every call.
 *
 * @param fis stream positioned at the start of the document; it is not closed here
 * @return the list (also stored in {@code listDoc}) of tokens from every paragraph, in
 *         document order; empty when the document has no paragraphs
 * @throws IOException declared for the document construction
 */
public ArrayList extractDoc(FileInputStream fis) throws IOException {
    HWPFDocument doc = new HWPFDocument(fis);
    Range range = doc.getRange();

    // Reset once per call. Re-creating the list inside the loop discarded every paragraph
    // but the last one, and left a stale value when the document had no paragraphs.
    listDoc = new ArrayList();

    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph p = range.getParagraph(i);

        StyleDescription style = doc.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && !"Normal".equals(style.getName())) {
            System.out.println(style.getName());
        }

        String[] ary = p.text().split(" ");
        System.out.println(p.text());

        listDoc.addAll(Arrays.asList(ary));
    }
    return listDoc;
}