Example usage for org.apache.poi.hslf.usermodel HSLFSlide getTextParagraphs

List of usage examples for org.apache.poi.hslf.usermodel HSLFSlide getTextParagraphs

Introduction

On this page you can find example usages of org.apache.poi.hslf.usermodel.HSLFSlide.getTextParagraphs().

Prototype

@Override
public List<List<HSLFTextParagraph>> getTextParagraphs() 

Source Link

Document

Returns all the text paragraphs found, as a list of paragraph lists (see the prototype above).

Usage

From source file:com.jaeksoft.searchlib.parser.PptParser.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint (HSLF) presentation into a single parser result item.
 *
 * Every paragraph of every slide is mapped to a {@link ParserFieldEnum} according to its run
 * type (title, note, body or other). The raw text of the paragraph's runs is concatenated, each
 * run followed by one space, passed through {@code StringUtils.replaceConsecutiveSpaces} with
 * a single space as replacement, and added to the selected field. After all slides have been
 * visited, {@code langDetection} is invoked on the result for the body field.
 *
 * @param streamLimiter supplies the input stream that holds the presentation
 * @param lang          not used by this implementation
 * @throws IOException if the presentation cannot be read or closed
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {

    // The slide show is Closeable: try-with-resources releases it even when extraction fails.
    try (HSLFSlideShow ppt = new HSLFSlideShow(streamLimiter.getNewInputStream())) {
        List<HSLFSlide> slides = ppt.getSlides();
        ParserResultItem result = getNewParserResultItem();
        for (HSLFSlide slide : slides) {
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    // Choose the target field from the paragraph's run type.
                    ParserFieldEnum field;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        field = ParserFieldEnum.title;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        field = ParserFieldEnum.note;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        field = ParserFieldEnum.body;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        field = ParserFieldEnum.other;
                        break;
                    }
                    // Join the runs of the paragraph, one space after each run.
                    StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    result.addField(field, StringUtils.replaceConsecutiveSpaces(sb.toString(), " "));
                }
            }
        }
        // NOTE(review): 10000 is presumably a size limit for the detection sample - confirm.
        result.langDetection(10000, ParserFieldEnum.body);
    }
}

From source file:com.qwazr.extractor.parser.Ppt.java

License:Apache License

/**
 * Extracts the text of a PowerPoint (HSLF) presentation, producing one parser document per slide.
 *
 * Each paragraph of a slide is mapped to the TITLE, NOTES, BODY or OTHER field according to its
 * run type. The raw text of the paragraph's runs is concatenated, each run followed by one space,
 * trimmed, and added to the selected field. After a slide has been processed, the result of
 * {@code languageDetection(document, BODY, 10000)} is added to its LANG_DETECTION field.
 *
 * @param inputStream the stream holding the presentation
 * @param extension   not used by this implementation
 * @param mimeType    not used by this implementation
 * @throws Exception if the presentation cannot be read or closed, or if a helper fails
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {

    // The slide show is Closeable: try-with-resources releases it even when extraction fails.
    try (HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {

        List<HSLFSlide> slides = ppt.getSlides();
        for (HSLFSlide slide : slides) {
            ParserDocument document = getNewParserDocument();
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    // Choose the target field from the paragraph's run type.
                    ParserField parserField;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        parserField = TITLE;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        parserField = NOTES;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        parserField = BODY;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        parserField = OTHER;
                        break;
                    }
                    // Join the runs of the paragraph, one space after each run.
                    StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    document.add(parserField, sb.toString().trim());
                }
            }
            // NOTE(review): 10000 is presumably a size limit for the detection sample - confirm.
            document.add(LANG_DETECTION, languageDetection(document, BODY, 10000));
        }
    }

}

From source file:com.qwazr.library.poi.PptParser.java

License:Apache License

/**
 * Extracts the text of a PowerPoint (HSLF) presentation, producing one result document per slide.
 *
 * The MIME_TYPE meta field is set from {@code findMimeType}. Each paragraph of a slide is then
 * mapped to the TITLE, NOTES, BODY or OTHER field according to its run type. The raw text of the
 * paragraph's runs is concatenated, each run followed by one space, trimmed, and added to the
 * selected field; every non-title text is additionally added to the CONTENT field. After a slide
 * has been processed, the result of {@code languageDetection(document, CONTENT, 10000)} is added
 * to its LANG_DETECTION field.
 *
 * @param parameters    not used by this implementation
 * @param inputStream   the stream holding the presentation
 * @param extension     passed to {@code findMimeType}
 * @param mimeType      passed to {@code findMimeType}
 * @param resultBuilder receives the meta fields and the per-slide documents
 * @throws Exception if the presentation cannot be read or closed, or if a helper fails
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
        final String extension, final String mimeType, final ParserResultBuilder resultBuilder)
        throws Exception {

    // The slide show is Closeable: try-with-resources releases it even when extraction fails.
    try (HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {

        final ParserFieldsBuilder metas = resultBuilder.metas();
        metas.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));

        final List<HSLFSlide> slides = ppt.getSlides();
        for (HSLFSlide slide : slides) {
            final ParserFieldsBuilder document = resultBuilder.newDocument();
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    // Choose the target field from the paragraph's run type.
                    final ParserField parserField;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        parserField = TITLE;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        parserField = NOTES;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        parserField = BODY;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        parserField = OTHER;
                        break;
                    }
                    // Join the runs of the paragraph, one space after each run.
                    final StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    final String text = sb.toString().trim();
                    document.add(parserField, text);
                    // Titles are kept out of the aggregated CONTENT field.
                    if (parserField != TITLE) {
                        document.add(CONTENT, text);
                    }
                }
            }
            // NOTE(review): 10000 is presumably a size limit for the detection sample - confirm.
            document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
        }
    }

}

From source file:org.apache.tika.parser.microsoft.HSLFExtractor.java

License:Apache License

/**
 * Writes the content of an HSLF slide show to the given XHTML handler.
 *
 * Output layout, in order:
 * a "slideShow" div containing one "slide" div per slide (optional header paragraph, master
 * sheet output, "slide-content" div, table text, optional footer paragraph, one "slide-comment"
 * paragraph per comment, embedded resources, optional "slide-notes" div), followed by a
 * separate "slide-notes" div that lists each distinct notes sheet once and then the embedded
 * pictures of the slide show.
 *
 * The helpers extractMaster, textRunsToText, extractTableText, handleSlideEmbeddedResources
 * and handleSlideEmbeddedPictures are defined outside this block.
 *
 * @param root  directory node the slide show is read from
 * @param xhtml handler receiving the generated elements and text
 * @throws IOException   declared by this method; propagated from the calls below
 * @throws SAXException  declared by this method; propagated from the calls below
 * @throws TikaException declared by this method; propagated from the calls below
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    // NOTE(review): the slide show is not closed here; presumably the caller owns the
    // file system behind 'root' - confirm before adding a close().
    HSLFSlideShow slideShow = new HSLFSlideShow(root);
    List<HSLFSlide> allSlides = slideShow.getSlides();

    xhtml.startElement("div", "class", "slideShow");

    // First pass: one "slide" div per slide
    for (HSLFSlide current : allSlides) {
        xhtml.startElement("div", "class", "slide");

        HeadersFooters slideHf = current.getHeadersFooters();

        // Header paragraph, only when visible and non-null
        if (slideHf != null && slideHf.isHeaderVisible() && slideHf.getHeaderText() != null) {
            xhtml.startElement("p", "class", "slide-header");
            xhtml.characters(slideHf.getHeaderText());
            xhtml.endElement("p");
        }

        // Master sheet output
        extractMaster(xhtml, current.getMasterSheet());

        // Text paragraphs of the slide itself
        xhtml.startElement("div", "class", "slide-content");
        textRunsToText(xhtml, current.getTextParagraphs());
        xhtml.endElement("div");

        // Text held in table shapes
        for (HSLFShape candidate : current.getShapes()) {
            if (!(candidate instanceof HSLFTable)) {
                continue;
            }
            extractTableText(xhtml, (HSLFTable) candidate);
        }

        // Footer paragraph, only when visible and non-null
        if (slideHf != null && slideHf.isFooterVisible() && slideHf.getFooterText() != null) {
            xhtml.startElement("p", "class", "slide-footer");
            xhtml.characters(slideHf.getFooterText());
            xhtml.endElement("p");
        }

        // One paragraph per comment: bold "author (initials) - " prefix, then the comment text
        for (Comment remark : current.getComments()) {
            xhtml.startElement("p", "class", "slide-comment");

            StringBuilder byline = new StringBuilder();
            if (remark.getAuthor() != null) {
                byline.append(remark.getAuthor());
            }
            if (remark.getAuthorInitials() != null) {
                if (byline.length() > 0) {
                    byline.append(" ");
                }
                byline.append("(").append(remark.getAuthorInitials()).append(")");
            }
            if (byline.length() > 0) {
                // The separator is only needed when comment text follows the byline
                if (remark.getText() != null) {
                    byline.append(" - ");
                }
                xhtml.startElement("b");
                xhtml.characters(byline.toString());
                xhtml.endElement("b");
            }
            if (remark.getText() != null) {
                xhtml.characters(remark.getText());
            }

            xhtml.endElement("p");
        }

        // Embedded resources of this slide
        handleSlideEmbeddedResources(current, xhtml);

        // Notes attached to this slide, written inline
        HSLFNotes inlineNotes = current.getNotes();
        if (inlineNotes != null) {
            xhtml.startElement("div", "class", "slide-notes");
            textRunsToText(xhtml, inlineNotes.getTextParagraphs());
            xhtml.endElement("div");
        }

        // Close the "slide" div
        xhtml.endElement("div");
    }

    // Close the "slideShow" div
    xhtml.endElement("div");

    // Second pass: every distinct notes sheet once, in its own "slide-notes" div
    xhtml.startElement("div", "class", "slide-notes");
    HashSet<Integer> emittedNotes = new HashSet<>();
    HeadersFooters notesHf = slideShow.getNotesHeadersFooters();

    for (HSLFSlide current : allSlides) {
        HSLFNotes notes = current.getNotes();
        // add() returns false when this sheet number was already recorded
        if (notes == null || !emittedNotes.add(notes._getSheetNumber())) {
            continue;
        }

        // Notes header, repeated for each notes sheet when set
        if (notesHf != null && notesHf.isHeaderVisible() && notesHf.getHeaderText() != null) {
            xhtml.startElement("p", "class", "slide-note-header");
            xhtml.characters(notesHf.getHeaderText());
            xhtml.endElement("p");
        }

        // Notes text
        textRunsToText(xhtml, notes.getTextParagraphs());

        // Notes footer, repeated for each notes sheet when set
        if (notesHf != null && notesHf.isFooterVisible() && notesHf.getFooterText() != null) {
            xhtml.startElement("p", "class", "slide-note-footer");
            xhtml.characters(notesHf.getFooterText());
            xhtml.endElement("p");
        }
    }

    handleSlideEmbeddedPictures(slideShow, xhtml);

    xhtml.endElement("div");
}