Usage examples for org.apache.poi.hslf.usermodel.HSLFTextParagraph#getTextRuns
@Override
public List<HSLFTextRun> getTextRuns()
From source file:com.jaeksoft.searchlib.parser.PptParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { HSLFSlideShow ppt = new HSLFSlideShow(streamLimiter.getNewInputStream()); List<HSLFSlide> slides = ppt.getSlides(); ParserResultItem result = getNewParserResultItem(); for (HSLFSlide slide : slides) { List<List<HSLFTextParagraph>> textLevel0 = slide.getTextParagraphs(); for (List<HSLFTextParagraph> textLevel1 : textLevel0) { for (HSLFTextParagraph textPara : textLevel1) { ParserFieldEnum field;//from ww w . j ava2 s . c o m switch (textPara.getRunType()) { case TextHeaderAtom.TITLE_TYPE: case TextHeaderAtom.CENTER_TITLE_TYPE: field = ParserFieldEnum.title; break; case TextHeaderAtom.NOTES_TYPE: field = ParserFieldEnum.note; break; case TextHeaderAtom.BODY_TYPE: case TextHeaderAtom.CENTRE_BODY_TYPE: case TextHeaderAtom.HALF_BODY_TYPE: case TextHeaderAtom.QUARTER_BODY_TYPE: field = ParserFieldEnum.body; break; case TextHeaderAtom.OTHER_TYPE: default: field = ParserFieldEnum.other; break; } StringBuilder sb = new StringBuilder(); for (HSLFTextRun textRun : textPara.getTextRuns()) { sb.append(textRun.getRawText()); sb.append(' '); } result.addField(field, StringUtils.replaceConsecutiveSpaces(sb.toString(), " ")); } } } result.langDetection(10000, ParserFieldEnum.body); }
From source file:com.qwazr.extractor.parser.Ppt.java
License:Apache License
/**
 * Extracts the text of a PowerPoint (HSLF) document, producing one parser document
 * per slide.
 *
 * <p>Each paragraph is routed to {@code TITLE}, {@code NOTES}, {@code BODY} or
 * {@code OTHER} according to its run type. The raw text of the paragraph's runs is
 * concatenated with a space after each run and trimmed before being added. After a
 * slide's paragraphs are processed, the result of
 * {@code languageDetection(document, BODY, 10000)} is added under
 * {@code LANG_DETECTION}.
 *
 * @param inputStream the stream the slide show is read from
 * @param extension   not read by this method
 * @param mimeType    not read by this method
 * @throws Exception if the slide show cannot be read, processed or closed
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    // try-with-resources: the slide show was previously never closed.
    try (HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {
        for (HSLFSlide slide : ppt.getSlides()) {
            ParserDocument document = getNewParserDocument();
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    ParserField parserField;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        parserField = TITLE;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        parserField = NOTES;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        parserField = BODY;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        parserField = OTHER;
                        break;
                    }
                    StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    document.add(parserField, sb.toString().trim());
                }
            }
            document.add(LANG_DETECTION, languageDetection(document, BODY, 10000));
        }
    }
}
From source file:com.qwazr.library.poi.PptParser.java
License:Apache License
/**
 * Extracts the text of a PowerPoint (HSLF) document, producing one result document
 * per slide.
 *
 * <p>The MIME type meta is set first from
 * {@code findMimeType(extension, mimeType, this::findMimeTypeUsingDefault)}. Each
 * paragraph is then routed to {@code TITLE}, {@code NOTES}, {@code BODY} or
 * {@code OTHER} according to its run type. The raw text of the paragraph's runs is
 * concatenated with a space after each run and trimmed; every non-title text is
 * additionally added under {@code CONTENT}. After a slide's paragraphs are processed,
 * the result of {@code languageDetection(document, CONTENT, 10000)} is added under
 * {@code LANG_DETECTION}.
 *
 * @param parameters    not read by this method
 * @param inputStream   the stream the slide show is read from
 * @param extension     passed to {@code findMimeType}
 * @param mimeType      passed to {@code findMimeType}
 * @param resultBuilder receives the metas and the per-slide documents
 * @throws Exception if the slide show cannot be read, processed or closed
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
        final String extension, final String mimeType, final ParserResultBuilder resultBuilder)
        throws Exception {
    // try-with-resources: the slide show was previously never closed.
    try (final HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {
        final ParserFieldsBuilder metas = resultBuilder.metas();
        metas.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));

        for (HSLFSlide slide : ppt.getSlides()) {
            final ParserFieldsBuilder document = resultBuilder.newDocument();
            for (List<HSLFTextParagraph> paragraphs : slide.getTextParagraphs()) {
                for (HSLFTextParagraph textPara : paragraphs) {
                    final ParserField parserField;
                    switch (textPara.getRunType()) {
                    case TextHeaderAtom.TITLE_TYPE:
                    case TextHeaderAtom.CENTER_TITLE_TYPE:
                        parserField = TITLE;
                        break;
                    case TextHeaderAtom.NOTES_TYPE:
                        parserField = NOTES;
                        break;
                    case TextHeaderAtom.BODY_TYPE:
                    case TextHeaderAtom.CENTRE_BODY_TYPE:
                    case TextHeaderAtom.HALF_BODY_TYPE:
                    case TextHeaderAtom.QUARTER_BODY_TYPE:
                        parserField = BODY;
                        break;
                    case TextHeaderAtom.OTHER_TYPE:
                    default:
                        parserField = OTHER;
                        break;
                    }
                    final StringBuilder sb = new StringBuilder();
                    for (HSLFTextRun textRun : textPara.getTextRuns()) {
                        sb.append(textRun.getRawText());
                        sb.append(' ');
                    }
                    final String text = sb.toString().trim();
                    document.add(parserField, text);
                    // Title text is kept out of CONTENT.
                    if (parserField != TITLE) {
                        document.add(CONTENT, text);
                    }
                }
            }
            document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
        }
    }
}
From source file:org.apache.tika.parser.microsoft.HSLFExtractor.java
License:Apache License
private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException { if (paragraphsList == null) { return;// ww w . jav a2 s.c o m } for (List<HSLFTextParagraph> run : paragraphsList) { // Leaving in wisdom from TIKA-712 for easy revert. // Avoid boiler-plate text on the master slide (0 // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { boolean isBullet = false; for (HSLFTextParagraph htp : run) { boolean nextBullet = htp.isBullet(); // TODO: identify bullet/list type if (isBullet != nextBullet) { isBullet = nextBullet; if (isBullet) { xhtml.startElement("ul"); } else { xhtml.endElement("ul"); } } List<HSLFTextRun> textRuns = htp.getTextRuns(); String firstLine = removePBreak(textRuns.get(0).getRawText()); boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine))); String paraTag = showBullet ? "li" : "p"; xhtml.startElement(paraTag); for (HSLFTextRun htr : textRuns) { String line = htr.getRawText(); if (line != null) { boolean isfirst = true; for (String fragment : line.split("\\u000b")) { if (!isfirst) { xhtml.startElement("br"); xhtml.endElement("br"); } isfirst = false; xhtml.characters(removePBreak(fragment)); } if (line.endsWith("\u000b")) { xhtml.startElement("br"); xhtml.endElement("br"); } } } xhtml.endElement(paraTag); } if (isBullet) { xhtml.endElement("ul"); } } }