Example usage for org.apache.poi.hwpf.usermodel Range numParagraphs

List of usage examples for org.apache.poi.hwpf.usermodel Range numParagraphs

Introduction

On this page you can find example usages of org.apache.poi.hwpf.usermodel.Range.numParagraphs().

Prototype


public int numParagraphs() 

Source Link

Document

Used to get the number of paragraphs in a range.

Usage

From source file:com.google.gdt.handler.impl.WordHandler.java

License:Open Source License

/**
 * /* w w  w  .j a  va 2 s . c o m*/
 * @param inputFile
 * @param pLevel
 * @throws IOException
 * @throws InvalidFormatException
 */
@Override
public void handle(String inputFile, ProgressLevel pLevel) throws IOException, InvalidFormatException {
    String outPutFile = getOuputFileName(inputFile);
    OutputStream outputStream = new FileOutputStream(outPutFile);
    InputStream inputStream = new FileInputStream(inputFile);

    HWPFDocument hDocument = new HWPFDocument(inputStream);
    Range range = hDocument.getRange();

    pLevel.setTrFileName(outPutFile);
    pLevel.setValue(0);
    pLevel.setStringPainted(true);
    pLevel.setMaxValue(range.numParagraphs());
    int count = 0;
    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph paragraph = range.getParagraph(i);
        int numCharRuns = paragraph.numCharacterRuns();
        for (int j = 0; j < numCharRuns; j++) {
            if (isInterrupted) {
                outputStream.close();
                new File(outPutFile).delete();
                pLevel.setString("cancelled");
                return;
            }
            CharacterRun charRun = paragraph.getCharacterRun(j);
            String inputText = charRun.text();
            if ((null == inputText) || (inputText.trim().equals("")))
                continue;
            String translatedTxt = inputText;
            //in http post method, all key value pairs are seperated with &
            if (preferenceModel.getTranslatorType() == TranslatorType.HTTP)
                inputText = inputText.replaceAll("&", "and");
            try {
                translatedTxt = translator.translate(translatedTxt);
                charRun.replaceText(inputText, translatedTxt);
            } catch (Exception e) {
                logger.log(Level.SEVERE,
                        "Input File : " + inputFile + " cannot translate the text : " + inputText, e);
            }
        }
        count++;
        pLevel.setValue(count);
    }
    pLevel.setString("done");
    hDocument.write(outputStream);
    outputStream.close();
}

From source file:com.thuvienkhoahoc.wordtomwtext.examples.WordToMwtext.java

License:Apache License

/**
 * Walks every non-blank paragraph of {@code doc} and emits it through this
 * class's open/close/write helpers, encoding the output as UTF-8.
 * <p>
 * Paragraphs whose style name starts with "Heading" open a new section and are
 * written as titles. Paragraphs whose first character run uses a font whose name
 * starts with "Courier" are grouped into a source block. Everything else is
 * written as a plain paragraph.
 *
 * @param doc the Word document to convert
 * @param stream destination of the converted text; flushed but not closed here
 * @throws IOException declared by the signature; presumably raised by the writer
 *         helpers — confirm against their definitions
 * @throws UnsupportedEncodingException if the UTF-8 charset name is not supported
 */
public WordToMwtext(HWPFDocument doc, OutputStream stream) throws IOException, UnsupportedEncodingException {

    OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8");
    _out = out;
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    int sectionLevel = 0;
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle.getName();
        if (styleName.startsWith("Heading")) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            // Assumes style names of the form "Heading N": the level starts at index 8.
            int headerLevel = Integer.parseInt(styleName.substring(8));
            if (headerLevel > sectionLevel) {
                openSection();
            } else {
                for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {
                    closeSection();
                }
                openSection();
            }
            // NOTE(review): only one section is opened even when the level jumps by
            // more than one, yet sectionLevel sections are closed later — confirm
            // whether skipped heading levels can occur in the input.
            sectionLevel = headerLevel;
            openTitle(sectionLevel);
            writePlainText(text.trim());
            closeTitle(sectionLevel);
        } else {
            // Guard against a paragraph without runs or a run without a font name.
            String fontName = p.numCharacterRuns() > 0 ? p.getCharacterRun(0).getFontName() : null;
            if (fontName != null && fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    // A document that ends inside a source block would otherwise leave it open.
    if (inCode) {
        closeSource();
    }
    for (int x = 0; x < sectionLevel; x++) {
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();

}

From source file:com.xx.platform.util.tools.ms.WordExtractor.java

License:Apache License

/**
 * Get the text from the word file, as an array with one String
 *  per paragraph/*from   ww w.  j  a v  a 2  s  .  c  o  m*/
 */
public String[] getParagraphText() {
    String[] ret;

    // Extract using the model code
    try {
        Range r = doc.getRange();

        ret = new String[r.numParagraphs()];
        for (int i = 0; i < ret.length; i++) {
            Paragraph p = r.getParagraph(i);
            ret[i] = p.text();

            // Fix the line ending
            if (ret[i].endsWith("\r")) {
                ret[i] = ret[i] + "\n";
            }
        }
    } catch (Exception e) {
        // Something's up with turning the text pieces into paragraphs
        // Fall back to ripping out the text pieces
        ret = new String[1];
        ret[0] = getTextFromPieces();
    }

    return ret;
}

From source file:com.zhch.example.poi.Word2Forrest.java

License:Apache License

/**
 * Walks every non-blank paragraph of {@code doc} and emits it through this
 * class's open/close/write helpers, encoding the output as UTF-8.
 * <p>
 * Paragraphs whose style name starts with "Heading" open a new section and are
 * written as titles. Paragraphs whose first character run uses a font whose name
 * starts with "Courier" are grouped into a source block. Everything else is
 * written as a plain paragraph.
 *
 * @param doc the Word document to convert
 * @param stream destination of the converted text; flushed but not closed here
 * @throws IOException declared by the signature; presumably raised by the writer
 *         helpers — confirm against their definitions
 */
public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException {
    OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8"));
    _out = out;
    _doc = doc;

    init();
    openDocument();
    openBody();

    Range r = doc.getRange();
    StyleSheet styleSheet = doc.getStyleSheet();

    int sectionLevel = 0;
    int lenParagraph = r.numParagraphs();
    boolean inCode = false;
    for (int x = 0; x < lenParagraph; x++) {
        Paragraph p = r.getParagraph(x);
        String text = p.text();
        if (text.trim().length() == 0) {
            continue;
        }
        StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());
        String styleName = paragraphStyle.getName();
        if (styleName.startsWith("Heading")) {
            if (inCode) {
                closeSource();
                inCode = false;
            }

            // Assumes style names of the form "Heading N": the level starts at index 8.
            int headerLevel = Integer.parseInt(styleName.substring(8));
            if (headerLevel > sectionLevel) {
                openSection();
            } else {
                for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {
                    closeSection();
                }
                openSection();
            }
            // NOTE(review): only one section is opened even when the level jumps by
            // more than one, yet sectionLevel sections are closed later — confirm
            // whether skipped heading levels can occur in the input.
            sectionLevel = headerLevel;
            openTitle();
            writePlainText(text);
            closeTitle();
        } else {
            // Guard against a paragraph without runs or a run without a font name.
            String fontName = p.numCharacterRuns() > 0 ? p.getCharacterRun(0).getFontName() : null;
            if (fontName != null && fontName.startsWith("Courier")) {
                if (!inCode) {
                    openSource();
                    inCode = true;
                }
                writePlainText(p.text());
            } else {
                if (inCode) {
                    inCode = false;
                    closeSource();
                }
                openParagraph();
                writePlainText(p.text());
                closeParagraph();
            }
        }
    }
    // A document that ends inside a source block would otherwise leave it open.
    if (inCode) {
        closeSource();
    }
    for (int x = 0; x < sectionLevel; x++) {
        closeSection();
    }
    closeBody();
    closeDocument();
    _out.flush();

}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Emits the content of the Word document stored under {@code root} as XHTML:
 * image text, headers, main paragraphs, text boxes, footnotes, comments,
 * endnotes, footers, pictures not yet output and embedded office documents,
 * in that order. Documents in the old Word format are delegated to
 * {@code parseWord6}.
 *
 * @param root directory node holding the Word document
 * @param xhtml handler receiving the generated XHTML events
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument wordDoc;
    try {
        wordDoc = new HWPFDocument(root);
    } catch (OldWordFileFormatException oldFormat) {
        // Pre-Word-97 file: handled by the dedicated parser.
        parseWord6(root, xhtml);
        return;
    }

    org.apache.poi.hwpf.extractor.WordExtractor textExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(wordDoc);

    // mj
    extractImageText(xhtml, wordDoc);

    HeaderStories stories = new HeaderStories(wordDoc);

    // Collect the pictures up front. As far as we can tell they are in
    // document order, and may be placed directly or referenced from an anchor.
    PicturesTable picTable = wordDoc.getPicturesTable();
    PicturesSource picSource = new PicturesSource(wordDoc);

    // Headers first, if there are any.
    Range firstHeader = stories.getFirstHeaderSubrange();
    Range evenHeader = stories.getEvenHeaderSubrange();
    Range oddHeader = stories.getOddHeaderSubrange();
    handleHeaderFooter(new Range[] { firstHeader, evenHeader, oddHeader }, "header", wordDoc, picSource,
            picTable, xhtml);

    // Main body text; handleParagraph reports how many extra paragraphs it consumed.
    Range body = wordDoc.getRange();
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph current = body.getParagraph(index);
        index += handleParagraph(current, 0, body, wordDoc, FieldsDocumentPart.MAIN, picSource, picTable,
                xhtml);
        index++;
    }

    // Remaining text stories, each entry as its own <p>.
    for (String boxText : textExtractor.getMainTextboxText()) {
        xhtml.element("p", boxText);
    }

    for (String footnote : textExtractor.getFootnoteText()) {
        xhtml.element("p", footnote);
    }

    for (String comment : textExtractor.getCommentsText()) {
        xhtml.element("p", comment);
    }

    for (String endnote : textExtractor.getEndnoteText()) {
        xhtml.element("p", endnote);
    }

    // Footers, if there are any.
    Range firstFooter = stories.getFirstFooterSubrange();
    Range evenFooter = stories.getEvenFooterSubrange();
    Range oddFooter = stories.getOddFooterSubrange();
    handleHeaderFooter(new Range[] { firstFooter, evenFooter, oddFooter }, "footer", wordDoc, picSource,
            picTable, xhtml);

    // Output every picture that has not been claimed so far.
    Picture leftover = picSource.nextUnclaimed();
    while (leftover != null) {
        handlePictureCharacterRun(null, leftover, picSource, xhtml);
        leftover = picSource.nextUnclaimed();
    }

    // Embedded office documents live under the "ObjectPool" entry.
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            boolean embeddedDoc = child.getName().startsWith("_") && child instanceof DirectoryEntry;
            if (embeddedDoc) {
                handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
            }
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: presumably there is no ObjectPool entry, so nothing is embedded.
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Sums the paragraph counts of all given ranges, skipping {@code null} entries.
 *
 * @param ranges ranges to inspect; individual elements may be {@code null}
 * @return total number of paragraphs across the non-null ranges
 */
private static int countParagraphs(Range... ranges) {
    int total = 0;
    for (int idx = 0; idx < ranges.length; idx++) {
        Range candidate = ranges[idx];
        if (candidate == null) {
            continue;
        }
        total += candidate.numParagraphs();
    }
    return total;
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the paragraphs of the given header or footer ranges inside a
 * {@code <div class="type">} element. Nothing is written when the ranges hold
 * no paragraphs at all; paragraphs consisting only of whitespace are skipped.
 *
 * @param ranges header or footer ranges; elements may be {@code null}
 * @param type value of the div's class attribute
 * @param document the Word document the ranges belong to
 * @param pictures picture source handed on to {@code handleParagraph}
 * @param pictureTable picture table handed on to {@code handleParagraph}
 * @param xhtml handler receiving the generated XHTML events
 */
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, PicturesSource pictures,
        PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    if (countParagraphs(ranges) <= 0) {
        return;
    }

    xhtml.startElement("div", "class", type);
    for (Range section : ranges) {
        if (section == null) {
            continue;
        }
        int idx = 0;
        while (idx < section.numParagraphs()) {
            Paragraph para = section.getParagraph(idx);

            // Empty header or footer paragraphs are skipped.
            boolean blank = para.text().replaceAll("[\\r\\n\\s]+", "").isEmpty();
            if (!blank) {
                // handleParagraph reports how many extra paragraphs it consumed.
                idx += handleParagraph(para, 0, section, document, FieldsDocumentPart.HEADER, pictures,
                        pictureTable, xhtml);
            }
            idx++;
        }
    }
    xhtml.endElement("div");
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Emits the content of the Word document stored under {@code root} as XHTML:
 * headers, main paragraphs, text boxes, footnotes, comments, endnotes,
 * footers, pictures not yet output and embedded office documents, in that
 * order. Documents in the old Word format are delegated to {@code parseWord6}.
 *
 * @param root directory node holding the Word document
 * @param xhtml handler receiving the generated XHTML events
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument wordDoc;
    try {
        wordDoc = new HWPFDocument(root);
    } catch (OldWordFileFormatException oldFormat) {
        // Pre-Word-97 file: handled by the dedicated parser.
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor textExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(wordDoc);
    HeaderStories stories = new HeaderStories(wordDoc);

    // Collect the pictures up front. As far as we can tell they are in
    // document order, and may be placed directly or referenced from an anchor.
    PicturesTable picTable = wordDoc.getPicturesTable();
    PicturesSource picSource = new PicturesSource(wordDoc);

    // Headers first, if there are any.
    Range firstHeader = stories.getFirstHeaderSubrange();
    Range evenHeader = stories.getEvenHeaderSubrange();
    Range oddHeader = stories.getOddHeaderSubrange();
    handleHeaderFooter(new Range[] { firstHeader, evenHeader, oddHeader }, "header", wordDoc, picSource,
            picTable, xhtml);

    // Main body text; handleParagraph reports how many extra paragraphs it consumed.
    Range body = wordDoc.getRange();
    ListManager lists = new ListManager(wordDoc);
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph current = body.getParagraph(index);
        index += handleParagraph(current, 0, body, wordDoc, FieldsDocumentPart.MAIN, picSource, picTable,
                lists, xhtml);
        index++;
    }

    // Remaining text stories, each entry as its own <p>.
    for (String boxText : textExtractor.getMainTextboxText()) {
        xhtml.element("p", boxText);
    }

    for (String footnote : textExtractor.getFootnoteText()) {
        xhtml.element("p", footnote);
    }

    for (String comment : textExtractor.getCommentsText()) {
        xhtml.element("p", comment);
    }

    for (String endnote : textExtractor.getEndnoteText()) {
        xhtml.element("p", endnote);
    }

    // Footers, if there are any.
    Range firstFooter = stories.getFirstFooterSubrange();
    Range evenFooter = stories.getEvenFooterSubrange();
    Range oddFooter = stories.getOddFooterSubrange();
    handleHeaderFooter(new Range[] { firstFooter, evenFooter, oddFooter }, "footer", wordDoc, picSource,
            picTable, xhtml);

    // Output every picture that has not been claimed so far.
    Picture leftover = picSource.nextUnclaimed();
    while (leftover != null) {
        handlePictureCharacterRun(null, leftover, picSource, xhtml);
        leftover = picSource.nextUnclaimed();
    }

    // Embedded office documents live under the "ObjectPool" entry.
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            boolean embeddedDoc = child.getName().startsWith("_") && child instanceof DirectoryEntry;
            if (embeddedDoc) {
                handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
            }
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: presumably there is no ObjectPool entry, so nothing is embedded.
    }
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the paragraphs of the given header or footer ranges inside a
 * {@code <div class="type">} element. Nothing is written when the ranges hold
 * no paragraphs at all.
 *
 * @param ranges header or footer ranges; elements may be {@code null}
 * @param type value of the div's class attribute
 * @param document the Word document the ranges belong to
 * @param pictures picture source handed on to {@code handleParagraph}
 * @param pictureTable picture table handed on to {@code handleParagraph}
 * @param xhtml handler receiving the generated XHTML events
 */
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, PicturesSource pictures,
        PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    if (countParagraphs(ranges) <= 0) {
        return;
    }

    xhtml.startElement("div", "class", type);
    // One list manager is shared by all ranges of this header/footer group.
    ListManager lists = new ListManager(document);
    for (Range section : ranges) {
        if (section == null) {
            continue;
        }
        int idx = 0;
        while (idx < section.numParagraphs()) {
            Paragraph para = section.getParagraph(idx);

            // handleParagraph reports how many extra paragraphs it consumed.
            idx += handleParagraph(para, 0, section, document, FieldsDocumentPart.HEADER, pictures, pictureTable,
                    lists, xhtml);
            idx++;
        }
    }
    xhtml.endElement("div");
}

From source file:org.esmerilprogramming.pdfcake.DocumentReplace.java

License:Open Source License

/**
 * Scans every paragraph of the document for {@code $$$key$$$} placeholders and
 * replaces each occurrence with the value the template holds for that key.
 * <p>
 * Every occurrence of a placeholder within a paragraph is replaced, not just
 * the first one. The search resumes after the inserted value, so a value that
 * itself contains the placeholder cannot cause an endless loop.
 *
 * @param document the Word document whose paragraphs are rewritten in place
 * @param template source of the placeholder keys and their replacement values
 * @return the same {@code document} instance that was passed in
 */
private static HWPFDocument replaceKeys(HWPFDocument document, DocumentTemplate template) {
    Range range = document.getRange();
    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph p = range.getParagraph(i);
        for (Enumeration<String> e = template.getAttributes().keys(); e.hasMoreElements();) {
            String key = e.nextElement();
            String attributeKey = "$$$" + key + "$$$";
            String replacement = template.getAttributes().get(key);
            if (replacement == null) {
                // Nothing to substitute for this key; leave the placeholder untouched.
                continue;
            }

            int searchFrom = 0;
            while (true) {
                String text;
                try {
                    // Re-read after each replacement so offsets match the current paragraph.
                    text = p.text();
                } catch (RuntimeException ex) {
                    // Best effort: a paragraph whose text cannot be read is skipped for this key.
                    break;
                }
                int offset = (text == null) ? -1 : text.indexOf(attributeKey, searchFrom);
                if (offset < 0) {
                    break;
                }
                p.replaceText(attributeKey, replacement, offset);
                searchFrom = offset + replacement.length();
            }
        }
    }
    return document;
}