List of usage examples for org.apache.poi.hslf.extractor.PowerPointExtractor#getText()
@Override
public String getText()
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java
License:Open Source License
/** * ?ppt //from www . jav a 2s. c o m * @param path * @return */ public String readPowerPoint(InputStream in) { String content = null; try { HSLFSlideShow slideShow = new HSLFSlideShow(in); org.apache.poi.hslf.extractor.PowerPointExtractor extractor = new PowerPointExtractor(slideShow); this.m_documentSummary = extractor.getDocSummaryInformation(); this.m_summary = extractor.getSummaryInformation(); content = extractor.getText(); // SlideShow ss = new SlideShow(new HSLFSlideShow(in));// is // // InputStreamSlideShow // Slide[] slides = ss.getSlides();// ?? // for (int i = 0; i < slides.length; i++) { // TextRun[] t = slides[i].getTextRuns();// ??TextRun // for (int j = 0; j < t.length; j++) { // content.append(t[j].getText());// content // } // } } catch (Exception ex) { System.out.println(ex.toString()); } return content; }
From source file:com.krawler.esp.fileparser.pptparser.MsPPTParser.java
License:Open Source License
/**
 * Extracts the text of the PowerPoint file at the given path.
 *
 * @param filepath path of the PowerPoint file to read
 * @return the text returned by the extractor
 * @throws Exception if the file cannot be opened or the extractor fails
 */
public String extractText(String filepath) throws Exception {
    // try-with-resources guarantees the file stream is closed even when the
    // extractor constructor or getText() throws.
    try (InputStream input = new BufferedInputStream(new FileInputStream(filepath))) {
        PowerPointExtractor ppt = new PowerPointExtractor(input);
        return ppt.getText();
    }
}
From source file:de.micromata.genome.gwiki.plugin.msotextextractor_1_0.PowerPointTextExtractor.java
License:Apache License
public String extractText(String fileName, InputStream data) { try {//from www. j a v a2 s . c om PowerPointExtractor extr = new PowerPointExtractor(data); String text = extr.getText(); text = WordTextExtractor.reworkWordText(text); return text; } catch (IOException ex) { throw new RuntimeIOException("Failure to extract word from " + fileName + "; " + ex.getMessage(), ex); } }
From source file:module.fileSupport.metadata.parsing.parsers.PowerPointParser.java
License:Open Source License
/**
 * Extracts the text of a PowerPoint document read from the given stream.
 *
 * @param stream stream supplying the PowerPoint document
 * @return the extracted text, or {@code null} if an {@link IOException}
 *         occurred (its stack trace is printed)
 */
@Override
protected String extract(InputStream stream) {
    try {
        return new PowerPointExtractor(stream).getText();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:net.ontopia.topicmaps.classify.PowerPointFormatModule.java
License:Apache License
/**
 * Extracts the text of the PowerPoint content held by {@code cc} and passes
 * it to the handler as a single region named "document".
 *
 * @param cc source of the raw PowerPoint bytes
 * @param handler receiver of the extracted text
 * @throws OntopiaRuntimeException wrapping any exception raised while
 *         extracting or handing over the text
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        BufferedInputStream source = new BufferedInputStream(new ByteArrayInputStream(cc.getContent()));
        char[] chars = new PowerPointExtractor(source).getText().toCharArray();

        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}
From source file:no.trank.openpipe.parse.ms.PowerPointParser.java
License:Apache License
@Override public ParserResult parse(ParseData data) throws IOException, ParserException { final HSLFSlideShow doc = new HSLFSlideShow(data.getInputStream()); final ParserResultImpl result = new ParserResultImpl(); result.setTitle(doc.getSummaryInformation().getTitle()); final PowerPointExtractor extractor = new PowerPointExtractor(doc); result.setText(POIUtils.getCleanText(extractor.getText())); if (data.includeProperties()) { result.setProperties(POIUtils.getProperties(doc)); }//from w w w .j a v a2 s. co m return result; }
From source file:org.kimios.kernel.index.filters.PowerPointFilter.java
License:Open Source License
/**
 * Extracts the text of a PowerPoint document read from the given stream.
 *
 * <p>The extracted text is encoded to UTF-8 bytes and decoded again before
 * being returned.
 *
 * @param in stream supplying the PowerPoint document
 * @return the extracted text after the UTF-8 round trip
 * @throws IOException if the extractor cannot read the document
 */
public String getBody(InputStream in) throws IOException {
    final String text = new PowerPointExtractor(in).getText();
    final byte[] utf8 = text.getBytes("UTF-8");
    return new String(utf8, "UTF-8");
}
From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PPT2TextConverter.java
License:Apache License
/**
 * Converts the PowerPoint blob held by {@code blobHolder} into a
 * {@code text/plain} blob containing the extracted text.
 *
 * <p>The text is written to a temporary file, a blob is created from that
 * file's contents, and the temporary file is deleted before returning.
 *
 * @param blobHolder holder of the PowerPoint blob to convert
 * @param parameters conversion parameters; not read by this method
 * @return a holder wrapping the plain-text blob
 * @throws ConversionException wrapping any {@link IOException} raised during conversion
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
        throws ConversionException {
    PowerPointExtractor pptExtractor = null;
    File tempFile = null;
    OutputStream tempOut = null;
    try {
        pptExtractor = new PowerPointExtractor(blobHolder.getBlob().getStream());
        byte[] textBytes = pptExtractor.getText().getBytes();

        tempFile = Framework.createTempFile("po-ppt2text", ".txt");
        tempOut = new FileOutputStream(tempFile);
        tempOut.write(textBytes);

        Blob textBlob;
        try (InputStream in = new FileInputStream(tempFile)) {
            textBlob = Blobs.createBlob(in);
        }
        textBlob.setMimeType("text/plain");
        return new SimpleCachableBlobHolder(textBlob);
    } catch (IOException e) {
        throw new ConversionException("Error during PPT2Text conversion", e);
    } finally {
        // Cleanup is best effort: close failures are logged, never rethrown.
        if (pptExtractor != null) {
            try {
                pptExtractor.close();
            } catch (IOException e) {
                log.error(e, e);
            }
        }
        if (tempOut != null) {
            try {
                tempOut.close();
            } catch (IOException e) {
                log.error(e, e);
            }
        }
        // NOTE(review): deleting here assumes the blob no longer needs the temp file — confirm.
        if (tempFile != null) {
            tempFile.delete();
        }
    }
}
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.MSPowerpointIndexerTest.java
License:Open Source License
@Test public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception { POIFSFileSystem ppExtractor = Mockito.mock(POIFSFileSystem.class); PowerPointExtractor powerPointExtractor = Mockito.mock(PowerPointExtractor.class); XSLFPowerPointExtractor xslfExtractor = Mockito.mock(XSLFPowerPointExtractor.class); XMLSlideShow xmlSlideShow = Mockito.mock(XMLSlideShow.class); PowerMockito.whenNew(POIFSFileSystem.class).withArguments(Mockito.anyObject()) .thenThrow(OfficeXmlFileException.class).thenReturn(ppExtractor) .thenThrow(APIManagementException.class); PowerMockito.whenNew(PowerPointExtractor.class).withArguments(ppExtractor).thenReturn(powerPointExtractor); PowerMockito.whenNew(XMLSlideShow.class).withParameterTypes(InputStream.class).withArguments(Mockito.any()) .thenReturn(xmlSlideShow);/*from w w w . j a va2s .com*/ PowerMockito.whenNew(XSLFPowerPointExtractor.class).withArguments(xmlSlideShow).thenReturn(xslfExtractor); Mockito.when(powerPointExtractor.getText()).thenReturn(""); Mockito.when(xslfExtractor.getText()).thenReturn(""); MSPowerpointIndexer indexer = new MSPowerpointIndexer(); IndexDocument ppDoc = indexer.getIndexedDocument(file2Index); // should return the default media type when media type is not defined in file2Index if (!"application/vnd.ms-powerpoint" .equals(ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index file2Index.mediaType = "text/html"; ppDoc = indexer.getIndexedDocument(file2Index); if (!"text/html".equals(ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index even if exception occurred while reading the file ppDoc = indexer.getIndexedDocument(file2Index); if (!"text/html".equals(ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } }
From source file:org.wso2.carbon.registry.indexing.indexer.MSPowerpointIndexer.java
License:Open Source License
/**
 * Builds an index document from the text of a PowerPoint file held in memory.
 *
 * @param fileData file to index; its {@code data} bytes are parsed and its
 *        {@code path} is passed to the resulting document
 * @return an {@code IndexDocument} built from the file path and the extracted text
 * @throws SolrException with {@code SERVER_ERROR} if reading the PowerPoint
 *         data raises an {@link IOException}; the original exception is
 *         attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
        PowerPointExtractor extractor = new PowerPointExtractor(fs);
        String ppText = extractor.getText();
        return new IndexDocument(fileData.path, ppText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the IOException as the cause so callers can see the root failure.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    }
}