List of usage examples for org.apache.poi.xwpf.usermodel XWPFStyle getName
public String getName()
From source file:kz.service.DocumentReader.java
public static String readDocxFile(String fileName) { try {// w w w .j a v a2 s . co m File file = new File(fileName); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); StringBuffer content = new StringBuffer(); XWPFDocument document = new XWPFDocument(fis); XWPFStyles styles = document.getStyles(); List<XWPFParagraph> paragraphs = document.getParagraphs(); List<XWPFTable> tables = document.getTables(); List<XWPFPictureData> pictures = document.getAllPictures(); //int Picture_ID = 0; for (XWPFPictureData picture : pictures) { //XWPFPictureData picture = pictures.get(Picture_ID); System.out.println("Picture: " + picture.getFileName()); byte[] pictureData = picture.getData(); BufferedImage image = ImageIO.read(new ByteArrayInputStream(pictureData)); ImageIO.write(image, picture.getFileName(), file); content.append("<p>"); content.append("Here must be image"); content.append("</p>"); //Picture_ID++; } Iterator<IBodyElement> bodyElementIterator = document.getBodyElementsIterator(); int Table_ID = 0; int Paragraph_ID = 0; while (bodyElementIterator.hasNext()) { IBodyElement element = bodyElementIterator.next(); System.out.println(element.getElementType().name());//prints Element type name if ("TABLE".equalsIgnoreCase(element.getElementType().name())) { content.append("<table>"); XWPFTable table = tables.get(Table_ID); CTTbl cttbl = table.getCTTbl(); CTTblPr cttblPr = cttbl.getTblPr(); List<XWPFTableRow> tblRows = table.getRows(); for (XWPFTableRow tblRow : tblRows) { content.append("<tr>"); List<XWPFTableCell> tblCells = tblRow.getTableCells(); for (XWPFTableCell tblCell : tblCells) { content.append("<td>"); content.append(tblCell.getText()); content.append("</td>"); } content.append("</tr>"); } content.append("</table>"); Table_ID++; } else if ("PARAGRAPH".equalsIgnoreCase(element.getElementType().name())) { XWPFParagraph paragraph = paragraphs.get(Paragraph_ID); String styleClass = null; if (paragraph.getStyleID() != null) { content.append("<p class=''>"); XWPFStyle style = styles.getStyle(paragraph.getStyleID()); if (style != null && style.getName() != null) { //here will be code creation of tag with class style } } else { content.append("<p>"); } content.append(paragraph.getText()); content.append("</p>"); Paragraph_ID++; } } fis.close(); return content.toString(); } catch (Exception e) { return e.toString(); } }
From source file:mj.ocraptor.extraction.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.java
License:Apache License
private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { // If this paragraph is actually a whole new section, then // it could have its own headers and footers // Check and handle if so XWPFHeaderFooterPolicy headerFooterPolicy = null; if (paragraph.getCTP().getPPr() != null) { CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr(); if (ctSectPr != null) { headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); extractHeaders(xhtml, headerFooterPolicy); }//from w w w. j a va 2 s .c o m } // Is this a paragraph, or a heading? String tag = "p"; String styleClass = null; if (paragraph.getStyleID() != null) { XWPFStyle style = styles.getStyle(paragraph.getStyleID()); if (style != null && style.getName() != null) { TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(), paragraph.getPartType() == BodyType.TABLECELL); tag = tas.getTag(); styleClass = tas.getStyleClass(); } } if (styleClass == null) { xhtml.startElement(tag); } else { xhtml.startElement(tag, "class", styleClass); } // Output placeholder for any embedded docs: // TODO: replace w/ XPath/XQuery: for (XWPFRun run : paragraph.getRuns()) { XmlCursor c = run.getCTR().newCursor(); c.selectPath("./*"); while (c.toNextSelection()) { XmlObject o = c.getObject(); if (o instanceof CTObject) { XmlCursor c2 = o.newCursor(); c2.selectPath("./*"); while (c2.toNextSelection()) { XmlObject o2 = c2.getObject(); XmlObject embedAtt = o2.selectAttribute(new QName("Type")); if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) { // Type is "Embed" XmlObject relIDAtt = o2.selectAttribute(new QName( "http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); if (relIDAtt != null) { String relID = relIDAtt.getDomNode().getNodeValue(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", relID); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } } c2.dispose(); } } c.dispose(); } // Attach bookmarks for the paragraph // (In future, we might put them in the right place, for now // we just put them in the correct paragraph) for (CTBookmark bookmark : paragraph.getCTP().getBookmarkStartList()) { xhtml.startElement("a", "name", bookmark.getName()); xhtml.endElement("a"); } TmpFormatting fmtg = new TmpFormatting(false, false); // Do the iruns for (IRunElement run : paragraph.getIRuns()) { if (run instanceof XWPFSDT) { fmtg = closeStyleTags(xhtml, fmtg); processSDTRun((XWPFSDT) run, xhtml); // for now, we're ignoring formatting in sdt // if you hit an sdt reset to false fmtg.setBold(false); fmtg.setItalic(false); } else { fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg); } } closeStyleTags(xhtml, fmtg); // Now do any comments for the paragraph XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null); String commentText = comments.getCommentText(); if (commentText != null && commentText.length() > 0) { xhtml.characters(commentText); } String footnameText = paragraph.getFootnoteText(); if (footnameText != null && footnameText.length() > 0) { xhtml.characters(footnameText + "\n"); } // Also extract any paragraphs embedded in text boxes: for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath( "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) { extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), xhtml); } // Finish this paragraph xhtml.endElement(tag); if (headerFooterPolicy != null) { extractFooters(xhtml, headerFooterPolicy); } }
From source file:org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.java
License:Apache License
private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { // If this paragraph is actually a whole new section, then // it could have its own headers and footers // Check and handle if so XWPFHeaderFooterPolicy headerFooterPolicy = null; if (paragraph.getCTP().getPPr() != null) { CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr(); if (ctSectPr != null) { headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); extractHeaders(xhtml, headerFooterPolicy, listManager); }//www. j a va2 s . c o m } // Is this a paragraph, or a heading? String tag = "p"; String styleClass = null; if (paragraph.getStyleID() != null) { XWPFStyle style = styles.getStyle(paragraph.getStyleID()); if (style != null && style.getName() != null) { TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(), paragraph.getPartType() == BodyType.TABLECELL); tag = tas.getTag(); styleClass = tas.getStyleClass(); } } if (styleClass == null) { xhtml.startElement(tag); } else { xhtml.startElement(tag, "class", styleClass); } writeParagraphNumber(paragraph, listManager, xhtml); // Output placeholder for any embedded docs: // TODO: replace w/ XPath/XQuery: for (XWPFRun run : paragraph.getRuns()) { XmlCursor c = run.getCTR().newCursor(); c.selectPath("./*"); while (c.toNextSelection()) { XmlObject o = c.getObject(); if (o instanceof CTObject) { XmlCursor c2 = o.newCursor(); c2.selectPath("./*"); while (c2.toNextSelection()) { XmlObject o2 = c2.getObject(); XmlObject embedAtt = o2.selectAttribute(new QName("Type")); if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) { // Type is "Embed" XmlObject relIDAtt = o2.selectAttribute(new QName( "http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); if (relIDAtt != null) { String relID = relIDAtt.getDomNode().getNodeValue(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", relID); xhtml.startElement("div", attributes); xhtml.endElement("div"); } } } c2.dispose(); } } c.dispose(); } // Attach bookmarks for the paragraph // (In future, we might put them in the right place, for now // we just put them in the correct paragraph) for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) { CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i); xhtml.startElement("a", "name", bookmark.getName()); xhtml.endElement("a"); } TmpFormatting fmtg = new TmpFormatting(false, false); // Do the iruns for (IRunElement run : paragraph.getIRuns()) { if (run instanceof XWPFSDT) { fmtg = closeStyleTags(xhtml, fmtg); processSDTRun((XWPFSDT) run, xhtml); //for now, we're ignoring formatting in sdt //if you hit an sdt reset to false fmtg.setBold(false); fmtg.setItalic(false); } else { fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg); } } closeStyleTags(xhtml, fmtg); // Now do any comments for the paragraph XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null); String commentText = comments.getCommentText(); if (commentText != null && commentText.length() > 0) { xhtml.characters(commentText); } String footnameText = paragraph.getFootnoteText(); if (footnameText != null && footnameText.length() > 0) { xhtml.characters(footnameText + "\n"); } // Also extract any paragraphs embedded in text boxes: for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath( "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) { extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml); } // Finish this paragraph xhtml.endElement(tag); if (headerFooterPolicy != null) { extractFooters(xhtml, headerFooterPolicy, listManager); } }