Example usage for org.apache.poi.hwpf.usermodel Paragraph getStyleIndex

List of usage examples for org.apache.poi.hwpf.usermodel.Paragraph.getStyleIndex()

Introduction

On this page you can find usage examples for the getStyleIndex() method of org.apache.poi.hwpf.usermodel.Paragraph.

Prototype

public short getStyleIndex() 

Source Link

Document

Returns the index of the style which applies to this Paragraph.

Usage

From source file:com.thuvienkhoahoc.wordtomwtext.examples.WordToMwtext.java

License:Apache License

/**
 * Walks every paragraph of {@code doc} and feeds it to this converter's
 * open/close/write hooks. A UTF-8 writer wrapping {@code stream} is stored in
 * {@code _out} and flushed at the end.
 *
 * <p>Paragraph handling:
 * <ul>
 * <li>blank paragraphs (empty after {@code trim()}) are skipped;</li>
 * <li>paragraphs whose style name is {@code "Heading"} followed by a positive
 * number start a new section with a title;</li>
 * <li>paragraphs whose first character run uses a font starting with
 * {@code "Courier"} are grouped into one source block;</li>
 * <li>everything else is written as an ordinary paragraph.</li>
 * </ul>
 *
 * @param doc    the Word document to convert
 * @param stream destination of the converted text
 * @throws IOException if writing to or flushing the underlying writer fails
 */
public WordToMwtext(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException {
    _out = new OutputStreamWriter(stream, "UTF-8");
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    // Heading levels of the sections that are currently open, innermost on top.
    // Tracking the real levels (instead of a single "current level" number)
    // keeps openSection()/closeSection() balanced even when the document
    // skips levels, e.g. Heading 1 -> Heading 3 -> Heading 1.
    java.util.Deque<Integer> openLevels = new java.util.ArrayDeque<Integer>();
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle == null ? null : paragraphStyle.getName();
        int headerLevel = headingLevelOf(styleName);
        if (headerLevel > 0) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            // Close every open section that is at the same or a deeper level
            // before opening the section for this heading.
            while (!openLevels.isEmpty() && openLevels.peek() >= headerLevel) {
                openLevels.pop();
                closeSection();
            }
            openSection();
            openLevels.push(headerLevel);

            openTitle(headerLevel);
            writePlainText(text.trim());
            closeTitle(headerLevel);
        } else {
            // Only the first run's font decides whether this is a code paragraph.
            CharacterRun run = p.getCharacterRun(0);
            String fontName = run.getFontName();
            if (fontName != null && fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    // A document may end inside a code block; close it before unwinding sections.
    if (inCode) {
        closeSource();
    }
    while (!openLevels.isEmpty()) {
        openLevels.pop();
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();
}

/**
 * Extracts the numeric level from a heading style name such as
 * {@code "Heading 2"}.
 *
 * @param styleName paragraph style name, may be {@code null}
 * @return the heading level, or {@code -1} when the name is {@code null},
 *         does not start with {@code "Heading"}, or carries no parsable number
 */
private static int headingLevelOf(String styleName) {
    final String prefix = "Heading";
    if (styleName == null || !styleName.startsWith(prefix)) {
        return -1;
    }
    try {
        return Integer.parseInt(styleName.substring(prefix.length()).trim());
    } catch (NumberFormatException notANumberedHeading) {
        // e.g. "Heading" or "Heading Custom": treat as ordinary body text.
        return -1;
    }
}

From source file:com.zhch.example.poi.Word2Forrest.java

License:Apache License

/**
 * Walks every paragraph of {@code doc} and feeds it to this converter's
 * open/close/write hooks. A UTF-8 writer wrapping {@code stream} is stored in
 * {@code _out} and flushed at the end.
 *
 * <p>Blank paragraphs are skipped. Paragraphs styled {@code "Heading N"}
 * (N a positive number) start a new titled section. Paragraphs whose first
 * character run uses a font starting with {@code "Courier"} are grouped into
 * one source block. Everything else becomes an ordinary paragraph.
 *
 * @param doc    the Word document to convert
 * @param stream destination of the converted text
 * @throws IOException if flushing the underlying writer fails
 */
public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException {
    _out = new OutputStreamWriter(stream, Charset.forName("UTF-8"));
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    // Heading levels of the currently open sections, innermost on top. Using
    // the real levels keeps openSection()/closeSection() balanced even when
    // the document skips levels (Heading 1 -> Heading 3 -> Heading 1).
    java.util.Deque<Integer> openLevels = new java.util.ArrayDeque<Integer>();
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle == null ? null : paragraphStyle.getName();
        int headerLevel = headingLevelOf(styleName);
        if (headerLevel > 0) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            // Close sections at the same or a deeper level, then open this one.
            while (!openLevels.isEmpty() && openLevels.peek() >= headerLevel) {
                openLevels.pop();
                closeSection();
            }
            openSection();
            openLevels.push(headerLevel);

            openTitle();
            writePlainText(text);
            closeTitle();
        } else {
            // Only the first run's font decides whether this is a code paragraph.
            CharacterRun run = p.getCharacterRun(0);
            String fontName = run.getFontName();
            if (fontName != null && fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    // A document may end inside a code block; close it before unwinding sections.
    if (inCode) {
        closeSource();
    }
    while (!openLevels.isEmpty()) {
        openLevels.pop();
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();
}

/**
 * Extracts the numeric level from a heading style name such as
 * {@code "Heading 2"}.
 *
 * @param styleName paragraph style name, may be {@code null}
 * @return the heading level, or {@code -1} when the name is {@code null},
 *         does not start with {@code "Heading"}, or carries no parsable number
 */
private static int headingLevelOf(String styleName) {
    final String prefix = "Heading";
    if (styleName == null || !styleName.startsWith(prefix)) {
        return -1;
    }
    try {
        return Integer.parseInt(styleName.substring(prefix.length()).trim());
    } catch (NumberFormatException notANumberedHeading) {
        // e.g. "Heading" or "Heading Custom": treat as ordinary body text.
        return -1;
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes one paragraph to {@code xhtml}. If the paragraph is the start of a
 * top-level table, the whole table is written instead, recursing into each
 * cell's paragraphs.
 *
 * @param p                the paragraph to handle
 * @param parentTableLevel table level of the caller; 0 when not inside a table
 * @param r                the range containing {@code p}; used to look up its
 *                         table and field separator runs
 * @param document         the owning document (style sheet and fields)
 * @param docPart          document part used for field lookup
 * @param pictures         source of inline and floating pictures
 * @param pictureTable     used to test whether a run carries a picture
 * @param xhtml            output handler
 * @return {@code t.numParagraphs() - 1} when a table was written (presumably
 *         so the caller can skip the paragraphs already consumed), otherwise 0
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
        FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
        XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Note - a poi bug means we can't currently properly recurse
    // into nested tables, so currently we don't
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");

                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable,
                            xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }

    // Pick the element/class from the paragraph's style name; fall back to a
    // plain <p> when the style index is out of range or the style is unnamed.
    TagAndStyle tas;

    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }

    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }

    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();

        // FIELD_BEGIN_MARK (0x13). Compare characters rather than bytes:
        // getBytes() without a charset depends on the platform encoding and
        // indexing [0] throws on an empty run.
        if (runText.startsWith("\u0013")) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 58 is an embedded document
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }

        if (runText.equals("\u0013")) {
            // The helper's return value is added to j to skip the runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();

                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }

    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }

    xhtml.endElement(tas.getTag());

    return 0;
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes one paragraph to {@code xhtml}. If the paragraph is the start of a
 * top-level table, the whole table is written instead, recursing into each
 * cell's paragraphs. Paragraphs that contain only whitespace are skipped.
 *
 * @param p                the paragraph to handle
 * @param parentTableLevel table level of the caller; 0 when not inside a table
 * @param r                the range containing {@code p}; used to look up its
 *                         table and field separator runs
 * @param document         the owning document (style sheet and fields)
 * @param docPart          document part used for field lookup
 * @param pictures         source of inline and floating pictures
 * @param pictureTable     used to test whether a run carries a picture
 * @param listManager      supplies the formatted number for list paragraphs
 * @param xhtml            output handler
 * @return {@code t.numParagraphs() - 1} when a table was written (presumably
 *         so the caller can skip the paragraphs already consumed), otherwise 0
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
        FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
        ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Note - a poi bug means we can't currently properly recurse
    //  into nested tables, so currently we don't
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");

                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable,
                            listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }

    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }

    // Pick the element/class from the paragraph's style name; fall back to a
    // plain <p> when the style index is out of range or the style is unnamed.
    TagAndStyle tas;
    String numbering = null;

    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }

    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }

    if (numbering != null) {
        xhtml.characters(numbering);
    }

    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();

        // FIELD_BEGIN_MARK (0x13). A character comparison gives the same
        // answer as inspecting the first UTF-8 byte, but does not encode the
        // whole run and does not throw when the run text is empty.
        if (runText.startsWith("\u0013")) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 58 is an embedded document
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }

        if (runText.equals("\u0013")) {
            // The helper's return value is added to j to skip the runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();

                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }

    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }

    xhtml.endElement(tas.getTag());

    return 0;
}

From source file:org.docx4j.convert.in.Doc.java

License:Apache License

/**
 * Converts one HWPF paragraph into a docx4j {@code P}.
 *
 * <p>A paragraph with a style index greater than 0 is created as a styled
 * paragraph named after its (space-stripped) style name; otherwise a plain
 * paragraph is created. Each character run becomes either an inline image
 * (when the document's pictures table reports a picture for the run) or a
 * text run. Bold is the only run property carried over.
 *
 * @param wordMLPackage target package; image parts are created in it
 * @param doc           source document, used to look up pictures
 * @param p             the paragraph to convert
 * @param stylesheet    style sheet used to resolve the paragraph style name
 * @param documentPart  part used to create the new paragraph
 * @param factory       docx4j object factory
 * @return the newly created paragraph
 */
private static org.docx4j.wml.P handleP(WordprocessingMLPackage wordMLPackage, HWPFDocument doc, Paragraph p,
        org.apache.poi.hwpf.model.StyleSheet stylesheet, MainDocumentPart documentPart,
        org.docx4j.wml.ObjectFactory factory) {

    org.docx4j.wml.P wmlP = null;

    if (p.getStyleIndex() > 0) {
        log.debug("Styled paragraph, with index: " + p.getStyleIndex());
        String styleName = stylesheet.getStyleDescription(p.getStyleIndex()).getName();

        log.debug(styleName);

        wmlP = documentPart.createStyledParagraphOfText(stripSpace(styleName), null);

    } else {
        wmlP = documentPart.createParagraphOfText(null);
    }

    // TODO: line spacing (p.getLineSpacing()) is not carried over; an earlier
    // attempt was abandoned because LineSpacingDescriptor._dyaLine is not visible.

    for (int z = 0; z < p.numCharacterRuns(); z++) {
        // character run
        CharacterRun run = p.getCharacterRun(z);

        // No character styles defined in there??

        org.docx4j.wml.RPr rPr = null;

        if (run.isBold()) {
            // TODO - HIGH PRIORITY- handle other run properties
            // esp underline, font size
            rPr = factory.createRPr();

            org.docx4j.wml.BooleanDefaultTrue boldOn = factory.createBooleanDefaultTrue();
            boldOn.setVal(Boolean.TRUE);

            rPr.setB(boldOn);
        }

        // Process image. doc is already typed HWPFDocument, so a null check is
        // all the former instanceof test contributed.
        if (doc != null && doc.getPicturesTable().hasPicture(run)) {

            Picture picture = doc.getPicturesTable().extractPicture(run, true);
            try {
                BinaryPartAbstractImage imagePart = BinaryPartAbstractImage.createImagePart(wordMLPackage,
                        picture.getContent());

                long cx = UnitsOfMeasurement
                        .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getWidthMpt()
                                * ((double) picture.getHorizontalScalingFactor() * 0.00001d)))
                        * 2L;
                long cy = UnitsOfMeasurement
                        .twipToEMU(Math.round((double) imagePart.getImageInfo().getSize().getHeightMpt()
                                * ((double) picture.getVerticalScalingFactor() * 0.00001d)))
                        * 2L;

                Inline inline = imagePart.createImageInline(null, "", ID1++, ID2++, cx, cy, false);

                org.docx4j.wml.R imgrun = factory.createR();
                org.docx4j.wml.Drawing drawing = factory.createDrawing();
                imgrun.getContent().add(drawing);
                drawing.getAnchorOrInline().add(inline);
                wmlP.getContent().add(imgrun);
            } catch (Exception e1) {
                // Best effort: a picture that cannot be converted is skipped, but
                // the failure goes to the log instead of stderr.
                log.error("Could not convert picture in character run " + z + "; skipping it", e1);
            }

        } else {
            // character run text
            String text = run.text();

            // show us the text
            log.debug("Processing: " + text);

            String cleansed = stripNonValidXMLCharacters(text);
            // Necessary to avoid org.xml.sax.SAXParseException: An invalid
            // XML character
            // (Unicode: 0xb) was found in the element content of the
            // document.
            // when trying to open the resulting docx.
            // ie JAXB happily writes (marshals) it, but doesn't want to
            // unmarshall.

            if (!text.equals(cleansed)) {
                log.warn("Cleansed..");
            }

            org.docx4j.wml.Text t = factory.createText();
            t.setValue(cleansed);

            org.docx4j.wml.R wmlRun = factory.createR();

            if (rPr != null) {
                wmlRun.setRPr(rPr);
            }
            wmlRun.getRunContent().add(t);
            wmlP.getParagraphContent().add(wmlRun);
        }
    }

    // Marshalling is only worth doing when someone will read the result;
    // previously this was printed unconditionally to System.out.
    if (log.isDebugEnabled()) {
        log.debug(XmlUtils.marshaltoString(wmlP, true, true));
    }

    return wmlP;

}

From source file:org.modeshape.sequencer.msoffice.word.WordMetadataReader.java

License:Apache License

/**
 * Reads a Word document from {@code stream} and collects its headings and
 * summary information into a {@link WordMetadata}.
 *
 * <p>A paragraph counts as a heading when its style name starts with
 * {@code HEADER_PREFIX}. The level is parsed from the text after the prefix
 * and its one-character separator; when that text is missing or not a number
 * the heading is recorded with level 0.
 *
 * @param stream the Word document to read
 * @return metadata holding the headings found and the document's summary information
 * @throws IOException if the document cannot be read
 */
public static WordMetadata instance(InputStream stream) throws IOException {
    WordMetadata metadata = new WordMetadata();
    List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();

    HWPFDocument document = new HWPFDocument(stream);
    Range range = document.getRange();

    StyleSheet stylesheet = document.getStyleSheet();

    int paragraphCount = range.numParagraphs();
    for (int i = 0; i < paragraphCount; i++) {
        Paragraph paragraph = range.getParagraph(i);

        String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();
        if (styleName == null || !styleName.startsWith(HEADER_PREFIX)) {
            continue;
        }

        // Skip the prefix plus its separator. A style named exactly like the
        // prefix has nothing after it; substring() would throw, so fall back
        // to "" and let the parse below report it.
        String rawLevelNum = styleName.length() > HEADER_PREFIX.length()
                ? styleName.substring(HEADER_PREFIX.length() + 1).trim()
                : "";
        int levelNum = 0;

        try {
            levelNum = Integer.parseInt(rawLevelNum);
        } catch (NumberFormatException nfe) {
            log.debug("Could not parse heading level from: " + styleName);
        }

        String text = Paragraph.stripFields(paragraph.text());

        // Drop the trailing carriage return; endsWith is safe on empty text,
        // where charAt(length - 1) would throw.
        if (text.endsWith("\r")) {
            text = text.substring(0, text.length() - 1);
        }

        headings.add(new WordMetadata.WordHeading(text, levelNum));
    }

    metadata.setHeadings(headings);
    metadata.setMetadata(document.getSummaryInformation());
    return metadata;
}

From source file:poi.hwpf.Word2Forrest.java

License:Apache License

/**
 * Walks every paragraph of {@code doc} and feeds it to this converter's
 * open/close/write hooks. A UTF-8 writer wrapping {@code stream} is stored in
 * {@code _out} and flushed at the end.
 *
 * <p>Blank paragraphs are skipped. Paragraphs styled {@code "Heading N"}
 * (N a positive number) start a new titled section. Paragraphs whose first
 * character run uses a font starting with {@code "Courier"} are grouped into
 * one source block. Everything else becomes an ordinary paragraph.
 *
 * @param doc    the Word document to convert
 * @param stream destination of the converted text
 * @throws IOException if writing to or flushing the underlying writer fails
 */
public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException {
    _out = new OutputStreamWriter(stream, "UTF-8");
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    // Heading levels of the currently open sections, innermost on top. Using
    // the real levels keeps openSection()/closeSection() balanced even when
    // the document skips levels (Heading 1 -> Heading 3 -> Heading 1).
    java.util.Deque<Integer> openLevels = new java.util.ArrayDeque<Integer>();
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle == null ? null : paragraphStyle.getName();
        int headerLevel = headingLevelOf(styleName);
        if (headerLevel > 0) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            // Close sections at the same or a deeper level, then open this one.
            while (!openLevels.isEmpty() && openLevels.peek() >= headerLevel) {
                openLevels.pop();
                closeSection();
            }
            openSection();
            openLevels.push(headerLevel);

            openTitle();
            writePlainText(text);
            closeTitle();
        } else {
            // Only the first run's font decides whether this is a code paragraph.
            CharacterRun run = p.getCharacterRun(0);
            String fontName = run.getFontName();
            if (fontName != null && fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    // A document may end inside a code block; close it before unwinding sections.
    if (inCode) {
        closeSource();
    }
    while (!openLevels.isEmpty()) {
        openLevels.pop();
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();
}

/**
 * Extracts the numeric level from a heading style name such as
 * {@code "Heading 2"}.
 *
 * @param styleName paragraph style name, may be {@code null}
 * @return the heading level, or {@code -1} when the name is {@code null},
 *         does not start with {@code "Heading"}, or carries no parsable number
 */
private static int headingLevelOf(String styleName) {
    final String prefix = "Heading";
    if (styleName == null || !styleName.startsWith(prefix)) {
        return -1;
    }
    try {
        return Integer.parseInt(styleName.substring(prefix.length()).trim());
    } catch (NumberFormatException notANumberedHeading) {
        // e.g. "Heading" or "Heading Custom": treat as ordinary body text.
        return -1;
    }
}

From source file:textextractor.WordManager.java

/**
 * Reads a Word document and collects the space-separated tokens of every
 * paragraph into {@code listDoc}.
 *
 * <p>As a side effect, each paragraph's text is printed to standard output,
 * preceded by its style name when that name is not {@code "Normal"}.
 *
 * @param fis stream positioned at the start of a Word document
 * @return the list stored in {@code listDoc}: the tokens of all paragraphs in
 *         document order; empty when the document has no paragraphs
 * @throws IOException if the document cannot be read
 */
public ArrayList extractDoc(FileInputStream fis) throws IOException {
    HWPFDocument doc = new HWPFDocument(fis);
    Range range = doc.getRange();
    // Create the result once, before the loop. Re-creating it per paragraph
    // discarded everything except the last paragraph's tokens, and left a
    // stale (possibly null) list when there were no paragraphs at all.
    listDoc = new ArrayList();
    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph p = range.getParagraph(i);
        StyleDescription style = doc.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (!"Normal".equals(style.getName())) {
            System.out.println(style.getName());
        }
        String[] ary = p.text().split(" ");
        System.out.println(p.text());
        listDoc.addAll(Arrays.asList(ary));
    }
    return listDoc;

}