Example usage for org.apache.poi.hslf.extractor PowerPointExtractor getText

List of usage examples for org.apache.poi.hslf.extractor PowerPointExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.hslf.extractor PowerPointExtractor getText.

Prototype

@Override
public String getText() 

Source Link

Document

Fetches all the slide text from the slideshow, but not the notes, unless you've called setSlidesByDefault() and setNotesByDefault() to change this.

Usage

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document read from a stream.
 * <p>
 * As a side effect, the document summary information and the summary
 * information reported by the extractor are stored in
 * {@code m_documentSummary} and {@code m_summary}.
 *
 * @param in stream supplying the PowerPoint document
 * @return the text returned by the extractor, or {@code null} if any
 *         exception was thrown along the way (the exception is printed
 *         to standard output and otherwise ignored)
 */
public String readPowerPoint(InputStream in) {
    String text = null;
    try {
        PowerPointExtractor pptExtractor = new PowerPointExtractor(new HSLFSlideShow(in));
        // Metadata is captured before the text, so it may already be set
        // even when text extraction fails afterwards.
        this.m_documentSummary = pptExtractor.getDocSummaryInformation();
        this.m_summary = pptExtractor.getSummaryInformation();
        text = pptExtractor.getText();
    } catch (Exception e) {
        System.out.println(e.toString());
    }
    return text;
}

From source file:com.krawler.esp.fileparser.pptparser.MsPPTParser.java

License:Open Source License

/**
 * Extracts the text of the PowerPoint file stored at the given path.
 *
 * @param filepath path of the PowerPoint file to read
 * @return the text returned by {@code PowerPointExtractor.getText()}
 * @throws Exception if the file cannot be opened, read or closed
 */
public String extractText(String filepath) throws Exception {
    // try-with-resources guarantees the file handle is released even when
    // the extractor throws; the previous manual close() was skipped on failure.
    try (InputStream input = new BufferedInputStream(new FileInputStream(filepath))) {
        PowerPointExtractor ppt = new PowerPointExtractor(input);
        return ppt.getText();
    }
}

From source file:de.micromata.genome.gwiki.plugin.msotextextractor_1_0.PowerPointTextExtractor.java

License:Apache License

/**
 * Extracts the text of a PowerPoint document and post-processes it with
 * {@code WordTextExtractor.reworkWordText}.
 *
 * @param fileName name of the document; used only in the error message
 * @param data stream supplying the document
 * @return the post-processed text
 * @throws RuntimeIOException if an {@code IOException} occurs during extraction
 */
public String extractText(String fileName, InputStream data) {
    try {
        String rawText = new PowerPointExtractor(data).getText();
        return WordTextExtractor.reworkWordText(rawText);
    } catch (IOException ex) {
        throw new RuntimeIOException("Failure to extract word from " + fileName + "; " + ex.getMessage(), ex);
    }
}

From source file:module.fileSupport.metadata.parsing.parsers.PowerPointParser.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document read from a stream.
 *
 * @param stream stream supplying the document
 * @return the extracted text, or {@code null} if an {@code IOException}
 *         occurred (its stack trace is printed)
 */
@Override
protected String extract(InputStream stream) {
    try {
        return new PowerPointExtractor(stream).getText();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:net.ontopia.topicmaps.classify.PowerPointFormatModule.java

License:Apache License

/**
 * Extracts the text of the PowerPoint content and feeds it to the handler
 * as a single region named "document".
 *
 * @param cc content whose bytes are parsed as a PowerPoint document
 * @param handler receiver of the region start, the text and the region end
 * @throws OntopiaRuntimeException wrapping any exception raised while
 *         extracting the text or notifying the handler
 */
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        BufferedInputStream source = new BufferedInputStream(new ByteArrayInputStream(cc.getContent()));
        char[] chars = new PowerPointExtractor(source).getText().toCharArray();

        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}

From source file:no.trank.openpipe.parse.ms.PowerPointParser.java

License:Apache License

/**
 * Parses a PowerPoint document into a {@code ParserResult} holding its
 * title, its cleaned text and, when requested, its properties.
 *
 * @param data source of the document stream and of the include-properties flag
 * @return the populated parser result
 * @throws IOException declared by the signature; presumably raised when the
 *         document cannot be read
 * @throws ParserException declared by the signature
 */
@Override
public ParserResult parse(ParseData data) throws IOException, ParserException {
    final HSLFSlideShow slideShow = new HSLFSlideShow(data.getInputStream());
    final ParserResultImpl parsed = new ParserResultImpl();

    // Title comes from the document's summary information.
    parsed.setTitle(slideShow.getSummaryInformation().getTitle());

    // Text is run through POIUtils.getCleanText before being stored.
    final String rawText = new PowerPointExtractor(slideShow).getText();
    parsed.setText(POIUtils.getCleanText(rawText));

    // Properties are only collected when the caller asked for them.
    if (data.includeProperties()) {
        parsed.setProperties(POIUtils.getProperties(slideShow));
    }
    return parsed;
}

From source file:org.kimios.kernel.index.filters.PowerPointFilter.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document and returns it after a
 * round trip through its UTF-8 byte encoding.
 *
 * @param in stream supplying the document
 * @return the extracted text, re-decoded from its UTF-8 bytes
 * @throws IOException if the document cannot be read
 */
public String getBody(InputStream in) throws IOException {
    String text = new PowerPointExtractor(in).getText();
    byte[] utf8Bytes = text.getBytes("UTF-8");
    return new String(utf8Bytes, "UTF-8");
}

From source file:org.nuxeo.ecm.core.convert.plugins.text.extractors.PPT2TextConverter.java

License:Apache License

/**
 * Converts the PowerPoint blob held by {@code blobHolder} into a
 * {@code text/plain} blob containing the extracted text.
 * <p>
 * The text is written to a temporary file, read back into a blob, and the
 * temporary file is deleted before the method returns.
 *
 * @param blobHolder holder of the PowerPoint blob to convert
 * @param parameters conversion parameters; not used by this method
 * @return a holder wrapping the plain-text blob
 * @throws ConversionException if an {@code IOException} occurs during conversion
 */
@Override
public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
        throws ConversionException {
    PowerPointExtractor pptExtractor = null;
    File tempFile = null;
    OutputStream tempOut = null;
    try {
        pptExtractor = new PowerPointExtractor(blobHolder.getBlob().getStream());

        // getBytes() without arguments encodes with the default charset.
        byte[] textBytes = pptExtractor.getText().getBytes();
        tempFile = Framework.createTempFile("po-ppt2text", ".txt");
        tempOut = new FileOutputStream(tempFile);
        tempOut.write(textBytes);

        Blob textBlob;
        try (InputStream tempIn = new FileInputStream(tempFile)) {
            textBlob = Blobs.createBlob(tempIn);
        }
        textBlob.setMimeType("text/plain");
        return new SimpleCachableBlobHolder(textBlob);
    } catch (IOException e) {
        throw new ConversionException("Error during PPT2Text conversion", e);
    } finally {
        // Cleanup order: extractor, then the output stream, then the temp file.
        // Close failures are logged rather than propagated.
        if (pptExtractor != null) {
            try {
                pptExtractor.close();
            } catch (IOException e) {
                log.error(e, e);
            }
        }
        if (tempOut != null) {
            try {
                tempOut.close();
            } catch (IOException e) {
                log.error(e, e);
            }
        }
        if (tempFile != null) {
            tempFile.delete();
        }
    }
}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.MSPowerpointIndexerTest.java

License:Open Source License

/**
 * Checks the media type field of the document produced by
 * {@code MSPowerpointIndexer.getIndexedDocument} across three consecutive
 * calls, with the POI constructors replaced by PowerMockito stubs.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    POIFSFileSystem poiFileSystem = Mockito.mock(POIFSFileSystem.class);
    PowerPointExtractor hslfExtractor = Mockito.mock(PowerPointExtractor.class);
    XSLFPowerPointExtractor ooxmlExtractor = Mockito.mock(XSLFPowerPointExtractor.class);
    XMLSlideShow ooxmlSlideShow = Mockito.mock(XMLSlideShow.class);

    // Consecutive stubbing of the POIFSFileSystem constructor: the first
    // construction throws OfficeXmlFileException, the second returns the mock,
    // and the next throws APIManagementException.
    PowerMockito.whenNew(POIFSFileSystem.class).withArguments(Mockito.anyObject())
            .thenThrow(OfficeXmlFileException.class).thenReturn(poiFileSystem)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(PowerPointExtractor.class).withArguments(poiFileSystem).thenReturn(hslfExtractor);
    PowerMockito.whenNew(XMLSlideShow.class).withParameterTypes(InputStream.class).withArguments(Mockito.any())
            .thenReturn(ooxmlSlideShow);
    PowerMockito.whenNew(XSLFPowerPointExtractor.class).withArguments(ooxmlSlideShow).thenReturn(ooxmlExtractor);
    Mockito.when(hslfExtractor.getText()).thenReturn("");
    Mockito.when(ooxmlExtractor.getText()).thenReturn("");
    MSPowerpointIndexer indexer = new MSPowerpointIndexer();

    // should return the default media type when media type is not defined in file2Index
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    Object mediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"application/vnd.ms-powerpoint".equals(mediaType)) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index
    file2Index.mediaType = "text/html";
    indexed = indexer.getIndexedDocument(file2Index);
    mediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(mediaType)) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if exception occurred while reading the file
    indexed = indexer.getIndexedDocument(file2Index);
    mediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(mediaType)) {
        Assert.fail();
    }
}

From source file:org.wso2.carbon.registry.indexing.indexer.MSPowerpointIndexer.java

License:Open Source License

/**
 * Builds an index document from the text of a binary PowerPoint file.
 *
 * @param fileData file to index; its {@code data} bytes are parsed and its
 *        {@code path} is passed to the resulting document
 * @return an {@code IndexDocument} created from the file path and the
 *         extracted text
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} if an
 *         {@code IOException} occurs while reading the data; the
 *         {@code IOException} is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
        PowerPointExtractor extractor = new PowerPointExtractor(fs);
        String ppText = extractor.getText();

        return new IndexDocument(fileData.path, ppText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Chain the original exception so callers can see the root cause
        // instead of only the generic message.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    }
}