Example usage for org.apache.poi.hslf.usermodel HSLFTextRun getRawText

List of usage examples for org.apache.poi.hslf.usermodel.HSLFTextRun#getRawText()

Introduction

On this page you can find example usages of org.apache.poi.hslf.usermodel.HSLFTextRun#getRawText().

Prototype

@Override
public String getRawText() 

Source Link

Document

Fetch the text, in raw storage form

Usage

From source file:com.jaeksoft.searchlib.parser.PptParser.java

License:Open Source License

@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {

    HSLFSlideShow ppt = new HSLFSlideShow(streamLimiter.getNewInputStream());
    List<HSLFSlide> slides = ppt.getSlides();
    ParserResultItem result = getNewParserResultItem();
    for (HSLFSlide slide : slides) {
        List<List<HSLFTextParagraph>> textLevel0 = slide.getTextParagraphs();
        for (List<HSLFTextParagraph> textLevel1 : textLevel0) {
            for (HSLFTextParagraph textPara : textLevel1) {
                ParserFieldEnum field;//from  www .j  a v a2  s .c  o  m
                switch (textPara.getRunType()) {
                case TextHeaderAtom.TITLE_TYPE:
                case TextHeaderAtom.CENTER_TITLE_TYPE:
                    field = ParserFieldEnum.title;
                    break;
                case TextHeaderAtom.NOTES_TYPE:
                    field = ParserFieldEnum.note;
                    break;
                case TextHeaderAtom.BODY_TYPE:
                case TextHeaderAtom.CENTRE_BODY_TYPE:
                case TextHeaderAtom.HALF_BODY_TYPE:
                case TextHeaderAtom.QUARTER_BODY_TYPE:
                    field = ParserFieldEnum.body;
                    break;
                case TextHeaderAtom.OTHER_TYPE:
                default:
                    field = ParserFieldEnum.other;
                    break;
                }
                StringBuilder sb = new StringBuilder();
                for (HSLFTextRun textRun : textPara.getTextRuns()) {
                    sb.append(textRun.getRawText());
                    sb.append(' ');
                }
                result.addField(field, StringUtils.replaceConsecutiveSpaces(sb.toString(), " "));
            }
        }
    }
    result.langDetection(10000, ParserFieldEnum.body);
}

From source file:com.qwazr.extractor.parser.Ppt.java

License:Apache License

@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {

    HSLFSlideShow ppt = new HSLFSlideShow(inputStream);

    List<HSLFSlide> slides = ppt.getSlides();
    for (HSLFSlide slide : slides) {
        ParserDocument document = getNewParserDocument();
        List<List<HSLFTextParagraph>> textLevel0 = slide.getTextParagraphs();
        for (List<HSLFTextParagraph> textLevel1 : textLevel0) {
            for (HSLFTextParagraph textPara : textLevel1) {
                ParserField parserField;
                switch (textPara.getRunType()) {
                case TextHeaderAtom.TITLE_TYPE:
                case TextHeaderAtom.CENTER_TITLE_TYPE:
                    parserField = TITLE;
                    break;
                case TextHeaderAtom.NOTES_TYPE:
                    parserField = NOTES;
                    break;
                case TextHeaderAtom.BODY_TYPE:
                case TextHeaderAtom.CENTRE_BODY_TYPE:
                case TextHeaderAtom.HALF_BODY_TYPE:
                case TextHeaderAtom.QUARTER_BODY_TYPE:
                    parserField = BODY;//from w  w  w.  j  a  va  2s.c o m
                    break;
                case TextHeaderAtom.OTHER_TYPE:
                default:
                    parserField = OTHER;
                    break;
                }
                StringBuilder sb = new StringBuilder();
                for (HSLFTextRun textRun : textPara.getTextRuns()) {
                    sb.append(textRun.getRawText());
                    sb.append(' ');
                }
                document.add(parserField, sb.toString().trim());
            }
        }
        document.add(LANG_DETECTION, languageDetection(document, BODY, 10000));
    }

}

From source file:com.qwazr.library.poi.PptParser.java

License:Apache License

@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
        final String extension, final String mimeType, final ParserResultBuilder resultBuilder)
        throws Exception {

    final HSLFSlideShow ppt = new HSLFSlideShow(inputStream);

    final ParserFieldsBuilder metas = resultBuilder.metas();
    metas.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));

    final List<HSLFSlide> slides = ppt.getSlides();
    for (HSLFSlide slide : slides) {
        final ParserFieldsBuilder document = resultBuilder.newDocument();
        final List<List<HSLFTextParagraph>> textLevel0 = slide.getTextParagraphs();
        for (List<HSLFTextParagraph> textLevel1 : textLevel0) {
            for (HSLFTextParagraph textPara : textLevel1) {
                final ParserField parserField;
                switch (textPara.getRunType()) {
                case TextHeaderAtom.TITLE_TYPE:
                case TextHeaderAtom.CENTER_TITLE_TYPE:
                    parserField = TITLE;
                    break;
                case TextHeaderAtom.NOTES_TYPE:
                    parserField = NOTES;
                    break;
                case TextHeaderAtom.BODY_TYPE:
                case TextHeaderAtom.CENTRE_BODY_TYPE:
                case TextHeaderAtom.HALF_BODY_TYPE:
                case TextHeaderAtom.QUARTER_BODY_TYPE:
                    parserField = BODY;// ww w .j  av  a  2  s  .c  o  m
                    break;
                case TextHeaderAtom.OTHER_TYPE:
                default:
                    parserField = OTHER;
                    break;
                }
                StringBuilder sb = new StringBuilder();
                for (HSLFTextRun textRun : textPara.getTextRuns()) {
                    sb.append(textRun.getRawText());
                    sb.append(' ');
                }
                final String text = sb.toString().trim();
                document.add(parserField, text);
                if (parserField != TITLE)
                    document.add(CONTENT, text);
            }
        }
        document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
    }

}

From source file:org.apache.tika.parser.microsoft.HSLFExtractor.java

License:Apache License

private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList)
        throws SAXException {
    if (paragraphsList == null) {
        return;/*from   w  ww.  j  ava 2s. c o  m*/
    }

    for (List<HSLFTextParagraph> run : paragraphsList) {
        // Leaving in wisdom from TIKA-712 for easy revert.
        // Avoid boiler-plate text on the master slide (0
        // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
        //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {

        boolean isBullet = false;
        for (HSLFTextParagraph htp : run) {
            boolean nextBullet = htp.isBullet();
            // TODO: identify bullet/list type
            if (isBullet != nextBullet) {
                isBullet = nextBullet;
                if (isBullet) {
                    xhtml.startElement("ul");
                } else {
                    xhtml.endElement("ul");
                }
            }

            List<HSLFTextRun> textRuns = htp.getTextRuns();
            String firstLine = removePBreak(textRuns.get(0).getRawText());
            boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
            String paraTag = showBullet ? "li" : "p";

            xhtml.startElement(paraTag);
            for (HSLFTextRun htr : textRuns) {
                String line = htr.getRawText();
                if (line != null) {
                    boolean isfirst = true;
                    for (String fragment : line.split("\\u000b")) {
                        if (!isfirst) {
                            xhtml.startElement("br");
                            xhtml.endElement("br");
                        }
                        isfirst = false;
                        xhtml.characters(removePBreak(fragment));
                    }
                    if (line.endsWith("\u000b")) {
                        xhtml.startElement("br");
                        xhtml.endElement("br");
                    }
                }
            }
            xhtml.endElement(paraTag);
        }
        if (isBullet) {
            xhtml.endElement("ul");
        }
    }
}