Example usage for org.apache.poi.hwpf.usermodel CharacterRun text

List of usage examples for org.apache.poi.hwpf.usermodel CharacterRun text

Introduction

On this page you can find example usages of the text() method of org.apache.poi.hwpf.usermodel.CharacterRun.

Prototype

String text();

Source Link

Usage

From source file:com.duroty.lucene.parser.MSWordParser.java

License:Open Source License

/**
 * Extracts the plain text of the Word document read from {@code input}.
 *
 * <p>Every section, paragraph and character run is visited in document order
 * and the text of each run is concatenated. Each paragraph is terminated by a
 * newline. Extraction is best-effort: a paragraph that cannot be loaded
 * contributes an extra newline and a run that cannot be read contributes a
 * single space, so one damaged element does not abort the whole document.
 *
 * @return the concatenated text; empty when the document has no paragraphs
 *
 * @throws ParserException if the document cannot be opened or traversed; the
 *         underlying exception is preserved as the cause
 */
private String getContents() throws ParserException {
    try {
        HWPFDocument doc = new HWPFDocument(input);
        Range range = doc.getRange();
        // The builder never leaves this method, so the unsynchronized
        // StringBuilder is sufficient.
        StringBuilder buffer = new StringBuilder();

        for (int x = 0; x < range.numSections(); x++) {
            Section section = range.getSection(x);

            for (int y = 0; y < section.numParagraphs(); y++) {
                Paragraph paragraph = null;

                try {
                    paragraph = section.getParagraph(y);
                } catch (Exception e) {
                    // Unreadable paragraph: keep going, leave a blank line in its place.
                    buffer.append("\n");
                }

                if (paragraph != null) {
                    for (int z = 0; z < paragraph.numCharacterRuns(); z++) {
                        try {
                            CharacterRun run = paragraph.getCharacterRun(z);
                            buffer.append(run.text());
                        } catch (Exception e) {
                            // Unreadable run: substitute a space so neighbouring
                            // words are not glued together.
                            buffer.append(" ");
                        }
                    }
                }

                // use a new line at the paragraph break
                buffer.append("\n");
            }
        }

        return buffer.toString();
    } catch (Exception ex) {
        throw new ParserException(ex);
    }
}

From source file:com.example.minireader.WordViewActivity.java

License:Apache License

/**
 * Writes one paragraph to {@code output} as HTML, one character run at a time.
 *
 * <p>A run whose picture offset is 0 or at least 1000 is handled as a picture
 * placeholder: {@code writePicture()} is called as long as
 * {@code presentPicture} is still below {@code pictures.size()}.
 * NOTE(review): why these offsets identify pictures is not visible here - confirm.
 *
 * <p>Any other run is written as text. When the paragraph consists of a single
 * run of two or more characters the text is written bare; otherwise it is
 * wrapped in font size / font colour tags plus {@code <b>} and {@code <i>}
 * as required by the run's formatting.
 *
 * <p>Failures while writing a run are reported on standard output and the
 * remaining runs are still processed.
 *
 * @param paragraph the paragraph whose runs are written
 */
public void writeParagraphContent(Paragraph paragraph) {
    int pnumCharacterRuns = paragraph.numCharacterRuns();

    for (int j = 0; j < pnumCharacterRuns; j++) {
        CharacterRun run = paragraph.getCharacterRun(j);

        if (run.getPicOffset() == 0 || run.getPicOffset() >= 1000) {
            if (presentPicture < pictures.size()) {
                writePicture();
            }
        } else {
            try {
                // NOTE(review): getBytes() uses the platform default charset and the
                // text is not HTML-escaped; confirm this matches the encoding declared
                // by the generated page.
                String text = run.text();
                if (text.length() >= 2 && pnumCharacterRuns < 2) {
                    output.write(text.getBytes());
                } else {
                    boolean bold = run.isBold();
                    boolean italic = run.isItalic();
                    String fontSizeBegin = "<font size=\"" + decideSize(run.getFontSize()) + "\">";
                    String fontColorBegin = "<font color=\"" + decideColor(run.getColor()) + "\">";
                    String fontEnd = "</font>";

                    output.write(fontSizeBegin.getBytes());
                    output.write(fontColorBegin.getBytes());

                    if (bold) {
                        output.write("<b>".getBytes());
                    }
                    if (italic) {
                        output.write("<i>".getBytes());
                    }

                    output.write(text.getBytes());

                    // Close in reverse order of opening so the tags nest correctly
                    // (<b><i>text</i></b> rather than <b><i>text</b></i>).
                    if (italic) {
                        output.write("</i>".getBytes());
                    }
                    if (bold) {
                        output.write("</b>".getBytes());
                    }
                    // One closing tag for each of the two <font> elements opened above.
                    output.write(fontEnd.getBytes());
                    output.write(fontEnd.getBytes());
                }
            } catch (Exception e) {
                System.out.println("Write File Exception");
            }
        }
    }
}

From source file:com.google.gdt.handler.impl.WordHandler.java

License:Open Source License

/**
 * Translates a Word (.doc) file character run by character run and writes the
 * result to the file named by {@code getOuputFileName(inputFile)}.
 *
 * <p>Progress is reported through {@code pLevel}, one step per paragraph. If
 * {@code isInterrupted} becomes true while runs are being processed, the
 * partially produced output file is deleted and the progress string is set to
 * {@code "cancelled"}; otherwise it is set to {@code "done"} and the document
 * is written out.
 *
 * <p>Runs that are null or blank are skipped. When the HTTP translator is
 * configured, {@code &} is replaced by {@code and} in the text sent to the
 * translator, because {@code &} separates key/value pairs in an HTTP POST
 * body. A run whose translation fails is logged and left untranslated.
 *
 * @param inputFile path of the document to translate
 * @param pLevel progress sink updated while the document is processed
 * @throws IOException if the input cannot be read or the output cannot be written
 * @throws InvalidFormatException declared by the overridden method
 */
@Override
public void handle(String inputFile, ProgressLevel pLevel) throws IOException, InvalidFormatException {
    String outPutFile = getOuputFileName(inputFile);
    boolean cancelled = false;

    // Open the input first so that a missing input file does not leave an empty
    // output file behind; both streams are closed on every exit path.
    try (InputStream inputStream = new FileInputStream(inputFile);
            OutputStream outputStream = new FileOutputStream(outPutFile)) {
        HWPFDocument hDocument = new HWPFDocument(inputStream);
        Range range = hDocument.getRange();

        pLevel.setTrFileName(outPutFile);
        pLevel.setValue(0);
        pLevel.setStringPainted(true);
        pLevel.setMaxValue(range.numParagraphs());

        int count = 0;
        paragraphs:
        for (int i = 0; i < range.numParagraphs(); i++) {
            Paragraph paragraph = range.getParagraph(i);
            int numCharRuns = paragraph.numCharacterRuns();
            for (int j = 0; j < numCharRuns; j++) {
                if (isInterrupted) {
                    cancelled = true;
                    break paragraphs;
                }
                CharacterRun charRun = paragraph.getCharacterRun(j);
                String originalText = charRun.text();
                if ((null == originalText) || (originalText.trim().equals(""))) {
                    continue;
                }
                // Only the copy sent to the translator is sanitized; the run still
                // contains the original text, so that is what must be searched for
                // when replacing.
                String textToTranslate = originalText;
                if (preferenceModel.getTranslatorType() == TranslatorType.HTTP) {
                    textToTranslate = originalText.replace("&", "and");
                }
                try {
                    String translatedTxt = translator.translate(textToTranslate);
                    charRun.replaceText(originalText, translatedTxt);
                } catch (Exception e) {
                    logger.log(Level.SEVERE,
                            "Input File : " + inputFile + " cannot translate the text : " + textToTranslate, e);
                }
            }
            count++;
            pLevel.setValue(count);
        }

        if (!cancelled) {
            pLevel.setString("done");
            hDocument.write(outputStream);
        }
    }

    if (cancelled) {
        // The output stream is closed by now, so the partial file can be removed.
        new File(outPutFile).delete();
        pLevel.setString("cancelled");
    }
}

From source file:javaapplication1.HWPFTest.java

/**
 * Runs a find-and-replace over every character run of the document.
 *
 * <p>The search is done per run: a run is modified only when its own text
 * contains {@code findText}, so a match that spans two runs is not found.
 *
 * @param doc the document to modify in place
 * @param findText the text to look for
 * @param replaceText the text passed to the run as replacement
 * @return the same {@code doc} instance
 */
private static HWPFDocument replaceText(HWPFDocument doc, String findText, String replaceText) {
    Range whole = doc.getRange();

    int sectionIdx = 0;
    while (sectionIdx < whole.numSections()) {
        Section section = whole.getSection(sectionIdx);

        for (int paraIdx = 0; paraIdx < section.numParagraphs(); paraIdx++) {
            Paragraph para = section.getParagraph(paraIdx);

            // Counts are re-read on every iteration because a replacement may
            // change the run layout.
            for (int runIdx = 0; runIdx < para.numCharacterRuns(); runIdx++) {
                CharacterRun candidate = para.getCharacterRun(runIdx);
                if (!candidate.text().contains(findText)) {
                    continue;
                }
                candidate.replaceText(findText, replaceText);
            }
        }

        ++sectionIdx;
    }

    return doc;
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Emits XHTML for one paragraph, or for the whole table that starts at it.
 *
 * <p>If the paragraph opens a top-level table, the complete table is written
 * as {@code table/tbody/tr/td} with each cell paragraph handled recursively,
 * and the method returns {@code t.numParagraphs() - 1}. Otherwise the
 * paragraph is written inside the tag chosen from its style name (plain
 * {@code p} when no usable style exists), its character runs are dispatched
 * to the field, picture or plain-text handlers, any style tags still open are
 * closed, and 0 is returned.
 * NOTE(review): the return value looks like the number of extra paragraphs the
 * caller should skip - confirm against the call site.
 *
 * @param p the paragraph to render
 * @param parentTableLevel table nesting level of the caller; 0 outside tables
 * @param r the range used to look up the table and field separator runs
 * @param document the document supplying the style sheet and fields
 * @param docPart the document part used to look up fields by offset
 * @param pictures source of inline and floating pictures
 * @param pictureTable used to test whether a run carries an inline picture
 * @param xhtml the handler receiving the output
 * @return {@code t.numParagraphs() - 1} for a table, otherwise 0
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
        FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
        XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Note - a poi bug means we can't currently properly recurse
    // into nested tables, so currently we don't
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");

                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable,
                            xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }

    TagAndStyle tas;

    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }

    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }

    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();

        // FIELD_BEGIN_MARK:
        // Compare the first character directly: this does not depend on the
        // platform charset and cannot fail on an empty run.
        if (runText.length() > 0 && runText.charAt(0) == '\u0013') {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 58 is an embedded document
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }

        if (runText.equals("\u0013")) {
            // Field: the handler reports how many following runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();

                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }

    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }

    xhtml.endElement(tas.getTag());

    return 0;
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the text of one character run to {@code xhtml}, first opening or
 * closing {@code b}, {@code i} and {@code s} elements so that the open tags
 * match the run's formatting.
 *
 * <p>The open-tag state lives in {@code curBold}, {@code curItalic} and
 * {@code curStrikeThrough}. Tags nest as b &gt; i &gt; s, so changing an outer
 * one first closes the inner ones; they are reopened further down if the run
 * still needs them.
 *
 * @param cr the run to write
 * @param skipStyling when true no style tags are touched
 * @param xhtml the handler receiving the output
 * @throws SAXException if the handler rejects an event
 */
private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
        throws SAXException {
    // Skip trailing newlines
    if (!isRendered(cr)) {
        return;
    }
    if (cr.text().equals("\r")) {
        return;
    }

    if (!skipStyling) {
        final boolean wantBold = cr.isBold();
        final boolean wantItalic = cr.isItalic();
        final boolean wantStrike = cr.isStrikeThrough();

        if (wantBold != curBold) {
            // b is the outermost tag: s and i must be closed before it changes
            if (curStrikeThrough) {
                xhtml.endElement("s");
                curStrikeThrough = false;
            }
            if (curItalic) {
                xhtml.endElement("i");
                curItalic = false;
            }
            if (wantBold) {
                xhtml.startElement("b");
            } else {
                xhtml.endElement("b");
            }
            curBold = wantBold;
        }

        if (wantItalic != curItalic) {
            // i encloses s: close s before i changes
            if (curStrikeThrough) {
                xhtml.endElement("s");
                curStrikeThrough = false;
            }
            if (wantItalic) {
                xhtml.startElement("i");
            } else {
                xhtml.endElement("i");
            }
            curItalic = wantItalic;
        }

        if (wantStrike != curStrikeThrough) {
            if (wantStrike) {
                xhtml.startElement("s");
            } else {
                xhtml.endElement("s");
            }
            curStrikeThrough = wantStrike;
        }
    }

    // Clean up the text
    String cleaned = cr.text().replace('\r', '\n');
    if (cleaned.endsWith("\u0007")) {
        // Strip the table cell end marker
        cleaned = cleaned.substring(0, cleaned.length() - 1);
    }

    // Copied from POI's
    // org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
    //  - line tabulator (0x0b) becomes a line break
    //  - char 30 is a non-breaking hyphen
    //  - char 31 is a non-required hyphen, mapped to a zero-width space
    cleaned = cleaned.replace((char) 0x000b, '\n')
            .replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN)
            .replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);

    // TODO: mj
    xhtml.characters(cleaned);
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Handles a field, i.e. the runs following a field-begin mark (0x13).
 *
 * <p>Can be \13..text..\15 or \13..control..\14..text..\15 . Nesting is allowed.
 * Runs before the separator (0x14) are the field's control instruction, runs
 * after it are its displayed text; a field without a separator consists of
 * text only. A {@code HYPERLINK} instruction containing a quoted target is
 * written as an {@code a} element around the text runs; every other field
 * just has its text runs (or their pictures) written out.
 *
 * @param p the paragraph containing the field
 * @param index index of the field-begin run within {@code p}
 * @param skipStyling passed through to {@code handleCharacterRun}
 * @param pictures source used to resolve pictures attached to text runs
 * @param xhtml the handler receiving the output
 * @return how many runs after {@code index} were consumed
 */
private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, PicturesSource pictures,
        XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
    List<CharacterRun> controls = new ArrayList<CharacterRun>();
    List<CharacterRun> texts = new ArrayList<CharacterRun>();
    boolean has14 = false;

    // Split it into before and after the 14
    int i;
    for (i = index + 1; i < p.numCharacterRuns(); i++) {
        CharacterRun cr = p.getCharacterRun(i);
        String runText = cr.text();
        if (runText.equals("\u0013")) {
            // Nested, oh joy...
            int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml);
            i += increment;
        } else if (runText.equals("\u0014")) {
            has14 = true;
        } else if (runText.equals("\u0015")) {
            if (!has14) {
                // No separator seen: everything collected so far was display text.
                texts = controls;
                controls = new ArrayList<CharacterRun>();
            }
            break;
        } else {
            if (has14) {
                texts.add(cr);
            } else {
                controls.add(cr);
            }
        }
    }

    // Do we need to do something special with this?
    if (controls.size() > 0) {
        StringBuilder instruction = new StringBuilder();
        for (CharacterRun control : controls) {
            instruction.append(control.text());
        }
        String text = instruction.toString();

        int openQuote = text.indexOf('"');
        if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) && openQuote > -1) {
            // With a single quote in the instruction, indexOf and lastIndexOf
            // coincide and a plain substring(open + 1, close) would throw; in that
            // case take everything after the opening quote as the target.
            int closeQuote = text.lastIndexOf('"');
            String url = (closeQuote > openQuote)
                    ? text.substring(openQuote + 1, closeQuote)
                    : text.substring(openQuote + 1);
            xhtml.startElement("a", "href", url);
            for (CharacterRun cr : texts) {
                handleCharacterRun(cr, skipStyling, xhtml);
            }
            xhtml.endElement("a");
        } else {
            // Just output the text ones
            for (CharacterRun cr : texts) {
                if (pictures.hasPicture(cr)) {
                    Picture picture = pictures.getFor(cr);
                    handlePictureCharacterRun(cr, picture, pictures, xhtml);
                } else {
                    handleCharacterRun(cr, skipStyling, xhtml);
                }
            }
        }
    } else {
        // We only had text
        // Output as-is
        for (CharacterRun cr : texts) {
            handleCharacterRun(cr, skipStyling, xhtml);
        }
    }

    // Tell them how many to skip over
    return i - index;
}

From source file:Modelo.EscribirWord.java

/**
 * Runs a find-and-replace over every character run of the document.
 *
 * <p>A run is modified only when its own text contains {@code findText}. A
 * {@code null} replacement is treated as the empty string; the first time
 * that substitution is needed, {@code "null"} is printed to standard output.
 *
 * @param doc the document to modify in place
 * @param findText the text to look for
 * @param replaceText the replacement, or {@code null} for the empty string
 * @return the same {@code doc} instance
 */
private HWPFDocument replaceText(HWPFDocument doc, String findText, String replaceText) {
    Range whole = doc.getRange();
    // Working copy so the parameter itself is never reassigned.
    String replacement = replaceText;

    for (int sectionIdx = 0; sectionIdx < whole.numSections(); sectionIdx++) {
        Section section = whole.getSection(sectionIdx);

        for (int paraIdx = 0; paraIdx < section.numParagraphs(); paraIdx++) {
            Paragraph para = section.getParagraph(paraIdx);

            for (int runIdx = 0; runIdx < para.numCharacterRuns(); runIdx++) {
                CharacterRun candidate = para.getCharacterRun(runIdx);
                if (!candidate.text().contains(findText)) {
                    continue;
                }

                // Normalised lazily, on the first actual match.
                if (replacement == null) {
                    System.out.println("null");
                    replacement = "";
                }

                candidate.replaceText(findText, replacement);
            }
        }
    }

    return doc;
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Emits XHTML for one paragraph, or for the whole table that starts at it.
 *
 * <p>If the paragraph opens a top-level table, the complete table is written
 * as {@code table/tbody/tr/td} with each cell paragraph handled recursively,
 * and the method returns {@code t.numParagraphs() - 1}. Paragraphs that
 * contain only whitespace are skipped. Otherwise the paragraph is written
 * inside the tag chosen from its style name (plain {@code p} when no usable
 * style exists), preceded by its list number when it is a list item, and its
 * character runs are dispatched to the field, picture or plain-text handlers.
 * Any style tags still open are closed before the paragraph tag is closed.
 * NOTE(review): the return value looks like the number of extra paragraphs the
 * caller should skip - confirm against the call site.
 *
 * @param p the paragraph to render
 * @param parentTableLevel table nesting level of the caller; 0 outside tables
 * @param r the range used to look up the table and field separator runs
 * @param document the document supplying the style sheet and fields
 * @param docPart the document part used to look up fields by offset
 * @param pictures source of inline and floating pictures
 * @param pictureTable used to test whether a run carries an inline picture
 * @param listManager supplies the formatted number of list paragraphs
 * @param xhtml the handler receiving the output
 * @return {@code t.numParagraphs() - 1} for a table, otherwise 0
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
        FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable,
        ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Note - a poi bug means we can't currently properly recurse
    //  into nested tables, so currently we don't
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");

                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable,
                            listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }

    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }

    TagAndStyle tas;
    String numbering = null;

    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }

    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }

    if (numbering != null) {
        xhtml.characters(numbering);
    }

    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();

        // FIELD_BEGIN_MARK:
        // Compare the first character directly: equivalent to checking the first
        // UTF-8 byte for 0x13, but without encoding the run and without failing
        // on an empty run.
        if (runText.length() > 0 && runText.charAt(0) == '\u0013') {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 58 is an embedded document
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }

        if (runText.equals("\u0013")) {
            // Field: the handler reports how many following runs it consumed.
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();

                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }

    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }

    xhtml.endElement(tas.getTag());

    return 0;
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the text of one character run to {@code xhtml}, first opening or
 * closing {@code b}, {@code i} and {@code s} elements so that the open tags
 * match the run's formatting.
 *
 * <p>The open-tag state lives in {@code curBold}, {@code curItalic} and
 * {@code curStrikeThrough}. Tags nest as b &gt; i &gt; s, so changing an outer
 * one first closes the inner ones; they are reopened further down if the run
 * still needs them.
 *
 * @param cr the run to write
 * @param skipStyling when true no style tags are touched
 * @param xhtml the handler receiving the output
 * @throws SAXException if the handler rejects an event
 */
private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
        throws SAXException {
    // Skip trailing newlines
    if (!isRendered(cr)) {
        return;
    }
    if (cr.text().equals("\r")) {
        return;
    }

    if (!skipStyling) {
        final boolean wantBold = cr.isBold();
        final boolean wantItalic = cr.isItalic();
        final boolean wantStrike = cr.isStrikeThrough();

        if (wantBold != curBold) {
            // b is the outermost tag: s and i must be closed before it changes
            if (curStrikeThrough) {
                xhtml.endElement("s");
                curStrikeThrough = false;
            }
            if (curItalic) {
                xhtml.endElement("i");
                curItalic = false;
            }
            if (wantBold) {
                xhtml.startElement("b");
            } else {
                xhtml.endElement("b");
            }
            curBold = wantBold;
        }

        if (wantItalic != curItalic) {
            // i encloses s: close s before i changes
            if (curStrikeThrough) {
                xhtml.endElement("s");
                curStrikeThrough = false;
            }
            if (wantItalic) {
                xhtml.startElement("i");
            } else {
                xhtml.endElement("i");
            }
            curItalic = wantItalic;
        }

        if (wantStrike != curStrikeThrough) {
            if (wantStrike) {
                xhtml.startElement("s");
            } else {
                xhtml.endElement("s");
            }
            curStrikeThrough = wantStrike;
        }
    }

    // Clean up the text
    String cleaned = cr.text().replace('\r', '\n');
    if (cleaned.endsWith("\u0007")) {
        // Strip the table cell end marker
        cleaned = cleaned.substring(0, cleaned.length() - 1);
    }

    // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
    //  - char 30 is a non-breaking hyphen
    //  - char 31 is a non-required hyphen, mapped to a zero-width space
    cleaned = cleaned.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN)
            .replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);

    // Control characters as line break
    cleaned = cleaned.replaceAll("[\u0000-\u001f]", "\n");
    xhtml.characters(cleaned);
}