Usage examples for org.apache.poi.hslf.usermodel.HSLFSlide#getTextParagraphs()
@Override
public List<List<HSLFTextParagraph>> getTextParagraphs()
From source file:com.jaeksoft.searchlib.parser.PptParser.java
License:Open Source License
/**
 * Reads a PowerPoint (HSLF) slide show from the given stream limiter and adds the text of
 * every paragraph on every slide to a single parser result item.
 *
 * <p>Each paragraph is routed to a field chosen from its run type: title and centered title
 * go to {@code title}, notes to {@code note}, the body variants to {@code body}, and anything
 * else to {@code other}. The raw text of the paragraph's runs is joined with single spaces
 * and passed through {@code StringUtils.replaceConsecutiveSpaces} before being stored.
 *
 * @param streamLimiter supplies the input stream the slide show is read from
 * @param lang not used by this implementation
 * @throws IOException if the slide show cannot be read or closed
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {
    // try-with-resources so the slide show is closed even when extraction fails part-way.
    try (HSLFSlideShow ppt = new HSLFSlideShow(streamLimiter.getNewInputStream())) {
        List<HSLFSlide> slides = ppt.getSlides();
        ParserResultItem result = getNewParserResultItem();
        for (HSLFSlide slide : slides) {
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    ParserFieldEnum field;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        field = ParserFieldEnum.title;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        field = ParserFieldEnum.note;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        field = ParserFieldEnum.body;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        field = ParserFieldEnum.other;
                        break;
                    }
                    StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    result.addField(field, StringUtils.replaceConsecutiveSpaces(sb.toString(), " "));
                }
            }
        }
        // Language detection is run on the accumulated body field.
        // NOTE(review): 10000 is presumably a character limit - confirm against langDetection.
        result.langDetection(10000, ParserFieldEnum.body);
    }
}
From source file:com.qwazr.extractor.parser.Ppt.java
License:Apache License
/**
 * Reads a PowerPoint (HSLF) slide show from {@code inputStream} and produces one parser
 * document per slide.
 *
 * <p>Each paragraph is added to the slide's document under a field chosen from its run type:
 * title and centered title go to {@code TITLE}, notes to {@code NOTES}, the body variants to
 * {@code BODY}, and anything else to {@code OTHER}. The raw text of the paragraph's runs is
 * joined with single spaces and trimmed. After all paragraphs of a slide are added, the
 * result of {@code languageDetection} on {@code BODY} is stored under {@code LANG_DETECTION}.
 *
 * @param inputStream stream containing the slide show
 * @param extension not used by this implementation
 * @param mimeType not used by this implementation
 * @throws Exception if the slide show cannot be read, processed or closed
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    // try-with-resources so the slide show is closed even when extraction fails part-way.
    try (HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {
        List<HSLFSlide> slides = ppt.getSlides();
        for (HSLFSlide slide : slides) {
            ParserDocument document = getNewParserDocument();
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    ParserField parserField;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        parserField = TITLE;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        parserField = NOTES;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        parserField = BODY;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        parserField = OTHER;
                        break;
                    }
                    StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    document.add(parserField, sb.toString().trim());
                }
            }
            // NOTE(review): 10000 is presumably a character limit - confirm against languageDetection.
            document.add(LANG_DETECTION, languageDetection(document, BODY, 10000));
        }
    }
}
From source file:com.qwazr.library.poi.PptParser.java
License:Apache License
/**
 * Reads a PowerPoint (HSLF) slide show from {@code inputStream}, records the MIME type in
 * the result metas and produces one document per slide.
 *
 * <p>Each paragraph is added to the slide's document under a field chosen from its run type:
 * title and centered title go to {@code TITLE}, notes to {@code NOTES}, the body variants to
 * {@code BODY}, and anything else to {@code OTHER}. Every paragraph that is not a title is
 * additionally added under {@code CONTENT}. After all paragraphs of a slide are added, the
 * result of {@code languageDetection} on {@code CONTENT} is stored under
 * {@code LANG_DETECTION}.
 *
 * @param parameters not used by this implementation
 * @param inputStream stream containing the slide show
 * @param extension file extension passed to {@code findMimeType}
 * @param mimeType MIME type passed to {@code findMimeType}
 * @param resultBuilder receives the metas and the per-slide documents
 * @throws Exception if the slide show cannot be read, processed or closed
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
        final String extension, final String mimeType, final ParserResultBuilder resultBuilder)
        throws Exception {
    // try-with-resources so the slide show is closed even when extraction fails part-way.
    try (final HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {
        final ParserFieldsBuilder metas = resultBuilder.metas();
        metas.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));

        final List<HSLFSlide> slides = ppt.getSlides();
        for (HSLFSlide slide : slides) {
            final ParserFieldsBuilder document = resultBuilder.newDocument();
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    final ParserField parserField;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        parserField = TITLE;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        parserField = NOTES;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        parserField = BODY;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        parserField = OTHER;
                        break;
                    }
                    final StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    final String text = sb.toString().trim();
                    document.add(parserField, text);
                    // Titles are kept out of CONTENT; everything else is mirrored there.
                    if (parserField != TITLE) {
                        document.add(CONTENT, text);
                    }
                }
            }
            // NOTE(review): 10000 is presumably a character limit - confirm against languageDetection.
            document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
        }
    }
}
From source file:org.apache.tika.parser.microsoft.HSLFExtractor.java
License:Apache License
/**
 * Writes the content of a PowerPoint (HSLF) slide show as XHTML.
 *
 * <p>Output is a {@code div.slideShow} containing one {@code div.slide} per slide, followed
 * by a separate {@code div.slide-notes} section. For each slide, in order: the header (when
 * visible and non-null), the master sheet via {@code extractMaster}, the slide text, the text
 * of any {@code HSLFTable} shapes, the footer (when visible and non-null), the comments, the
 * embedded resources, and the slide's notes inline. The trailing notes section emits each
 * distinct notes sheet once, wrapped in the notes header/footer when those are visible, and
 * ends with the embedded pictures of the slide show.
 *
 * @param root directory node the slide show is read from
 * @param xhtml handler that receives the generated elements and text
 * @throws IOException declared by this method; raised by the underlying calls
 * @throws SAXException declared by this method; raised by the underlying calls
 * @throws TikaException declared by this method; raised by the underlying calls
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    // NOTE(review): ss is intentionally not closed here. Closing it would presumably also
    // close the file system behind the caller-supplied root - confirm before adding a close.
    HSLFSlideShow ss = new HSLFSlideShow(root);
    List<HSLFSlide> _slides = ss.getSlides();

    xhtml.startElement("div", "class", "slideShow");

    /* Iterate over slides and extract text */
    for (HSLFSlide slide : _slides) {
        xhtml.startElement("div", "class", "slide");

        // Slide header, if present
        HeadersFooters hf = slide.getHeadersFooters();
        if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
            xhtml.startElement("p", "class", "slide-header");
            xhtml.characters(hf.getHeaderText());
            xhtml.endElement("p");
        }

        // Slide master, if present
        extractMaster(xhtml, slide.getMasterSheet());

        // Slide text
        xhtml.startElement("div", "class", "slide-content");
        textRunsToText(xhtml, slide.getTextParagraphs());
        xhtml.endElement("div");

        // Table text
        for (HSLFShape shape : slide.getShapes()) {
            if (shape instanceof HSLFTable) {
                extractTableText(xhtml, (HSLFTable) shape);
            }
        }

        // Slide footer, if present
        if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
            xhtml.startElement("p", "class", "slide-footer");
            xhtml.characters(hf.getFooterText());
            xhtml.endElement("p");
        }

        // Comments, if present. One builder is reused and reset for each comment.
        StringBuilder authorStringBuilder = new StringBuilder();
        for (Comment comment : slide.getComments()) {
            authorStringBuilder.setLength(0);
            xhtml.startElement("p", "class", "slide-comment");

            if (comment.getAuthor() != null) {
                authorStringBuilder.append(comment.getAuthor());
            }
            if (comment.getAuthorInitials() != null) {
                if (authorStringBuilder.length() > 0) {
                    authorStringBuilder.append(" ");
                }
                authorStringBuilder.append("(").append(comment.getAuthorInitials()).append(")");
            }
            if (authorStringBuilder.length() > 0) {
                // The separator is only added when comment text follows the author.
                if (comment.getText() != null) {
                    authorStringBuilder.append(" - ");
                }
                xhtml.startElement("b");
                xhtml.characters(authorStringBuilder.toString());
                xhtml.endElement("b");
            }
            if (comment.getText() != null) {
                xhtml.characters(comment.getText());
            }
            xhtml.endElement("p");
        }

        // Now any embedded resources
        handleSlideEmbeddedResources(slide, xhtml);

        // Find the Notes for this slide and extract inline
        HSLFNotes notes = slide.getNotes();
        if (notes != null) {
            xhtml.startElement("div", "class", "slide-notes");
            textRunsToText(xhtml, notes.getTextParagraphs());
            xhtml.endElement("div");
        }

        // Slide complete
        xhtml.endElement("div");
    }

    // All slides done
    xhtml.endElement("div");

    /* notes */
    xhtml.startElement("div", "class", "slide-notes");
    HashSet<Integer> seenNotes = new HashSet<>();
    HeadersFooters hf = ss.getNotesHeadersFooters();

    for (HSLFSlide slide : _slides) {
        HSLFNotes notes = slide.getNotes();
        if (notes == null) {
            continue;
        }
        Integer id = notes._getSheetNumber();
        // add() returns false when the sheet number was already seen, so each notes sheet
        // is emitted only once.
        if (!seenNotes.add(id)) {
            continue;
        }

        // Repeat the Notes header, if set
        if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
            xhtml.startElement("p", "class", "slide-note-header");
            xhtml.characters(hf.getHeaderText());
            xhtml.endElement("p");
        }

        // Notes text
        textRunsToText(xhtml, notes.getTextParagraphs());

        // Repeat the notes footer, if set
        if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
            xhtml.startElement("p", "class", "slide-note-footer");
            xhtml.characters(hf.getFooterText());
            xhtml.endElement("p");
        }
    }

    handleSlideEmbeddedPictures(ss, xhtml);

    xhtml.endElement("div");
}