Example usage for org.apache.poi.hslf.extractor PowerPointExtractor PowerPointExtractor

List of usage examples for org.apache.poi.hslf.extractor PowerPointExtractor PowerPointExtractor

Introduction

On this page you can find example usages of the PowerPointExtractor constructor of org.apache.poi.hslf.extractor.PowerPointExtractor.

Prototype

public PowerPointExtractor(HSLFSlideShowImpl ss) 

Source Link

Document

Creates a PowerPointExtractor from an HSLFSlideShowImpl.

Usage

From source file:com.docdoku.server.esindexer.ESTools.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document supplied as a stream.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream} and probed with
 * {@code POIFSFileSystem.hasPOIFSHeader}: when the header is present the
 * document is read with {@code PowerPointExtractor}, otherwise it is read as
 * an {@code XMLSlideShow} with {@code XSLFPowerPointExtractor}.
 *
 * @param inputStream the document content; it is closed before this method returns
 * @return the extracted text
 * @throws IOException if the stream cannot be read or an extractor cannot be closed
 */
private static String microsoftPowerPointDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream pptStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(pptStream)) {
            // Extractors are closeable; scope them so they are released even when getText fails.
            try (PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream)) {
                return pptExtractor.getText(true, true);
            }
        }
        try (XSLFPowerPointExtractor pptExtractor = new XSLFPowerPointExtractor(new XMLSlideShow(pptStream))) {
            return pptExtractor.getText(true, true, true);
        }
    }
}

From source file:com.docdoku.server.IndexerBean.java

License:Open Source License

/**
 * Extracts the text of the file at {@code pathName} and hands it to
 * {@code addDoc} so that it is indexed under {@code fullName}.
 *
 * <p>The extraction strategy is picked from the file extension (compared
 * case-sensitively): OpenDocument files ({@code content.xml} inside the zip),
 * {@code .doc}, {@code .ppt}/{@code .pps} and {@code .txt}. Every other
 * extension is ignored. Any failure is rethrown wrapped in an
 * {@link EJBException}.
 *
 * @param fullName the name passed to {@code addDoc} for the indexed document
 * @param pathName the path of the file whose content is read
 */
@Asynchronous
@Lock(LockType.WRITE)
public void addToIndex(String fullName, String pathName) {
    IndexWriter indexWriter = null;
    Directory indexDir = null;
    try {
        indexDir = FSDirectory.open(new File(indexPath));
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                IndexWriter.MaxFieldLength.LIMITED);
        int ext = pathName.lastIndexOf('.');
        String extension = "";
        if (ext != -1) {
            extension = pathName.substring(ext);
        }

        if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp")
                || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf")
                || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) {
            //OpenDocument: the text lives in the content.xml entry of the zip archive
            final StringBuilder text = new StringBuilder();
            ZipInputStream zipOpenDoc = new ZipInputStream(
                    new BufferedInputStream(new FileInputStream(pathName)));
            try {
                ZipEntry zipEntry;
                while ((zipEntry = zipOpenDoc.getNextEntry()) != null) {
                    if (zipEntry.getName().equals("content.xml")) {
                        // NOTE(review): the parser runs with default settings; if documents can come
                        // from untrusted sources, consider disabling DTDs / external entities.
                        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
                        SAXParser parser = saxParserFactory.newSAXParser();
                        parser.parse(zipOpenDoc, new DefaultHandler() {

                            @Override
                            public void characters(char[] ch, int start, int length) throws SAXException {
                                // Each character run is stored on its own line.
                                text.append(ch, start, length).append("\r\n");
                            }
                        });
                        break;
                    }
                }
            } finally {
                // Release the file handle even when parsing fails.
                zipOpenDoc.close();
            }
            Reader contentReader = new StringReader(text.toString());
            try {
                addDoc(indexWriter, contentReader, fullName);
            } finally {
                contentReader.close();
            }
        } else if (extension.equals(".doc")) {
            //MSWord Document
            InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName));
            String wordText;
            try {
                wordText = new WordExtractor(wordStream).getText();
            } finally {
                wordStream.close();
            }
            Reader contentReader = new StringReader(wordText);
            try {
                addDoc(indexWriter, contentReader, fullName);
            } finally {
                contentReader.close();
            }
        } else if (extension.equals(".ppt") || extension.equals(".pps")) {
            //MSPowerPoint Document
            InputStream pptStream = new BufferedInputStream(new FileInputStream(pathName));
            PowerPointExtractor pptExtractor = null;
            String pptText;
            try {
                pptExtractor = new PowerPointExtractor(pptStream);
                pptText = pptExtractor.getText(true, true);
            } finally {
                // The text is already materialised as a String, so both can be released here.
                try {
                    if (pptExtractor != null) {
                        pptExtractor.close();
                    }
                } finally {
                    pptStream.close();
                }
            }
            Reader contentReader = new StringReader(pptText);
            try {
                addDoc(indexWriter, contentReader, fullName);
            } finally {
                contentReader.close();
            }
        } else if (extension.equals(".txt")) {
            //Text Document
            Reader contentReader = new BufferedReader(new FileReader(pathName));
            try {
                addDoc(indexWriter, contentReader, fullName);
            } finally {
                contentReader.close();
            }
        }
        // .xls, .html, .htm, .csv, .xml, .rtf, .pdf and .msg are recognised file types
        // but are not indexed: no extraction is implemented for them.
    } catch (CorruptIndexException ex) {
        throw new EJBException(ex);
    } catch (LockObtainFailedException ex) {
        // Clear the index lock if it is still held, then report the original failure.
        try {
            if (IndexWriter.isLocked(indexDir)) {
                IndexWriter.unlock(indexDir);
            }
        } catch (IOException pIOEx) {
            throw new EJBException(pIOEx);
        }
        throw new EJBException(ex);
    } catch (ParserConfigurationException ex) {
        throw new EJBException(ex);
    } catch (SAXException ex) {
        throw new EJBException(ex);
    } catch (IOException ex) {
        throw new EJBException(ex);
    } finally {
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
        } catch (IOException ex) {
            throw new EJBException(ex);
        }
    }
}

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java

License:Open Source License

/** 
* ?ppt /*from   www.  j  a v  a  2  s .c o  m*/
  * @param path 
  * @return 
  */
public String readPowerPoint(InputStream in) {
    String content = null;
    try {
        HSLFSlideShow slideShow = new HSLFSlideShow(in);
        org.apache.poi.hslf.extractor.PowerPointExtractor extractor = new PowerPointExtractor(slideShow);
        this.m_documentSummary = extractor.getDocSummaryInformation();
        this.m_summary = extractor.getSummaryInformation();
        content = extractor.getText();
        //                 SlideShow ss = new SlideShow(new HSLFSlideShow(in));// is  
        //                // InputStreamSlideShow  
        //                Slide[] slides = ss.getSlides();// ??  
        //                 for (int i = 0; i < slides.length; i++) {  
        //                    TextRun[] t = slides[i].getTextRuns();// ??TextRun  
        //                     for (int j = 0; j < t.length; j++) {  
        //                         content.append(t[j].getText());// content  
        //                    }  
        //                 }  
    } catch (Exception ex) {
        System.out.println(ex.toString());
    }
    return content;
}

From source file:com.krawler.esp.fileparser.pptparser.MsPPTParser.java

License:Open Source License

/**
 * Extracts the text of the PowerPoint file at the given path.
 *
 * @param filepath path of the file to read
 * @return the text returned by the extractor
 * @throws Exception if the file cannot be opened or parsed
 */
public String extractText(String filepath) throws Exception {
    InputStream input = new BufferedInputStream(new FileInputStream(filepath));
    try {
        PowerPointExtractor ppt = new PowerPointExtractor(input);
        return ppt.getText();
    } finally {
        // Close the file even when the extractor throws.
        input.close();
    }
}

From source file:com.openkm.extractor.MsPowerPointTextExtractor.java

License:Open Source License

/**
 * {@inheritDoc}/*from w  w  w . j a v  a2  s.  c om*/
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PowerPointExtractor extractor = new PowerPointExtractor(stream);
        return extractor.getText(true, true);
    } catch (RuntimeException e) {
        logger.warn("Failed to extract PowerPoint text content", e);
        throw new IOException(e.getMessage(), e);
    } finally {
        try {
            stream.close();
        } catch (IOException ignored) {
        }
    }
}

From source file:com.openkm.util.metadata.MetadataExtractor.java

License:Open Source License

/**
 * Extracts summary metadata from a Word, Excel or PowerPoint document read
 * through a {@code POIFSFileSystem}.
 *
 * <p>The extractor is chosen from {@code mimeType}; for any other MIME type
 * the returned metadata object is left empty. Date properties missing from
 * the summary information are left unset on the result instead of causing a
 * {@link NullPointerException}.
 *
 * @param is the document content
 * @param mimeType the MIME type used to select the extractor
 * @return the populated metadata, never {@code null}
 * @throws IOException if the document cannot be read
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
    POIFSFileSystem fs = new POIFSFileSystem(is);
    OfficeMetadata md = new OfficeMetadata();
    SummaryInformation si = null;

    if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
        si = new WordExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
        si = new ExcelExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
        si = new PowerPointExtractor(fs).getSummaryInformation();
    }

    if (si != null) {
        md.setTitle(si.getTitle());
        md.setSubject(si.getSubject());
        md.setAuthor(si.getAuthor());
        md.setLastAuthor(si.getLastAuthor());
        md.setKeywords(si.getKeywords());
        md.setComments(si.getComments());
        md.setTemplate(si.getTemplate());
        md.setRevNumber(si.getRevNumber());
        md.setApplicationName(si.getApplicationName());
        md.setEditTime(si.getEditTime());
        md.setPageCount(si.getPageCount());
        md.setWordCount(si.getWordCount());
        md.setCharCount(si.getCharCount());
        md.setSecurity(si.getSecurity());

        // A date property may be absent (e.g. a document that was never printed);
        // Calendar.setTime(null) would throw, so only set the dates that exist.
        Calendar createDateTime = toCalendar(si.getCreateDateTime());
        if (createDateTime != null) {
            md.setCreateDateTime(createDateTime);
        }

        Calendar lastSaveDateTime = toCalendar(si.getLastSaveDateTime());
        if (lastSaveDateTime != null) {
            md.setLastSaveDateTime(lastSaveDateTime);
        }

        Calendar lastPrinted = toCalendar(si.getLastPrinted());
        if (lastPrinted != null) {
            md.setLastPrinted(lastPrinted);
        }
    }

    log.info("officeExtractor: {}", md);
    return md;
}

/**
 * Converts a date to a calendar in the default time zone and locale.
 *
 * @param date the date to convert, may be {@code null}
 * @return a calendar set to {@code date}, or {@code null} when {@code date} is {@code null}
 */
private static Calendar toCalendar(java.util.Date date) {
    if (date == null) {
        return null;
    }
    Calendar calendar = Calendar.getInstance();
    calendar.setTime(date);
    return calendar;
}

From source file:com.xpn.xwiki.plugin.lucene.textextraction.MSPowerPointTextExtractor.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document held in memory.
 *
 * @param data the raw bytes of the document
 * @return the text produced by {@code getText(true, true)} on the extractor
 * @throws Exception if the document cannot be parsed
 */
public String getText(byte[] data) throws Exception {
    return new PowerPointExtractor(new ByteArrayInputStream(data)).getText(true, true);
}

From source file:de.maklerpoint.office.Lucene.Indexer.java

License:Open Source License

/**
 * Indexes the given file or directory.
 *
 * <p>{@code listFiles} is called for {@code fileName}; every file in
 * {@code queue} is then indexed according to its name suffix (plain
 * text/markup, PDF, Word, Excel, PowerPoint). Files whose name starts with a
 * dot and files with any other suffix add nothing to the index. A failure on
 * one file is logged at debug level and does not stop the others. Finally the
 * index is optimized and the queue cleared.
 *
 * @param fileName path of the file or directory to index
 * @throws IOException if listing the files or optimizing the index fails
 */
private void indexFileorDir(String fileName) throws IOException {
    listFiles(new File(fileName));

    for (File f : queue) {
        FileReader fr = null;
        FileInputStream in = null;
        try {
            String name = f.getName();
            if (name.startsWith(".")) {
                // Hidden file: skipped.
                // TODO add html, xml parsers
            } else if (name.endsWith(".htm") || name.endsWith(".html")
                    || name.endsWith(".xml") || name.endsWith(".txt")) {
                // Text-like files are indexed straight from a reader (content not stored).
                Document doc = new Document();
                fr = new FileReader(f);
                doc.add(new Field("contents", fr));
                addFileDocument(doc, f, String.valueOf(FileTypes.TXT));
            } else if (name.endsWith(".pdf")) {
                in = new FileInputStream(f);
                PDFParser parser = new PDFParser(in);
                parser.parse();
                COSDocument cd = parser.getDocument();
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    String text = stripper.getText(new PDDocument(cd));
                    addFileDocument(newContentsDocument(text), f, String.valueOf(FileTypes.PDF));
                } finally {
                    // Release the parsed document even when stripping or indexing fails.
                    cd.close();
                }
            } else if (name.endsWith(".doc") || name.endsWith(".docx")) {
                // NOTE(review): .docx/.xlsx/.pptx are OOXML files; POIFSFileSystem presumably
                // rejects them, so they end up in the catch block below - confirm.
                in = new FileInputStream(f);
                WordExtractor extractor = new WordExtractor(new POIFSFileSystem(in));
                addFileDocument(newContentsDocument(extractor.getText()), f, String.valueOf(FileTypes.DOC));
            } else if (name.endsWith(".xls") || name.endsWith(".xlsx")) {
                in = new FileInputStream(f);
                ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(in));
                addFileDocument(newContentsDocument(extractor.getText()), f, String.valueOf(FileTypes.XLS));
            } else if (name.endsWith(".ppt") || name.endsWith(".pptx")) {
                in = new FileInputStream(f);
                PowerPointExtractor extractor = new PowerPointExtractor(new POIFSFileSystem(in));
                addFileDocument(newContentsDocument(extractor.getText()), f, String.valueOf(FileTypes.PPT));
            }

            if (Log.logger.isDebugEnabled()) {
                Log.logger.debug("Lucene | Neue Datei indexiert: " + f);
            }
        } catch (Exception e) {
            // One unreadable file must not abort the whole run.
            if (Log.logger.isDebugEnabled()) {
                Log.logger.debug("Datei konnte nicht indexiert werden: " + f, e);
            }
        } finally {
            // The reader/stream is fully consumed once the document has been added,
            // so it can be released here whether or not indexing succeeded.
            if (fr != null) {
                try {
                    fr.close();
                } catch (IOException e) {
                    if (Log.logger.isDebugEnabled()) {
                        Log.logger.debug("Datei konnte nicht geschlossen werden: " + f, e);
                    }
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    if (Log.logger.isDebugEnabled()) {
                        Log.logger.debug("Datei konnte nicht geschlossen werden: " + f, e);
                    }
                }
            }
        }
    }

    writer.optimize();
    queue.clear();

}

/** Creates a document whose stored, analyzed "contents" field holds the given text. */
private Document newContentsDocument(String text) {
    Document doc = new Document();
    doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
    return doc;
}

/** Adds the path, type, modified and filesize fields for {@code f} to {@code doc} and writes it to the index. */
private void addFileDocument(Document doc, File f, String type) throws IOException {
    doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES,
            Field.Index.NOT_ANALYZED));
    doc.add(new Field("filesize",
            String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);
}

From source file:de.micromata.genome.gwiki.plugin.msotextextractor_1_0.PowerPointTextExtractor.java

License:Apache License

/**
 * Extracts the text of a PowerPoint document and post-processes it with
 * {@code WordTextExtractor.reworkWordText}.
 *
 * @param fileName name of the document, used only in the error message
 * @param data the document content
 * @return the reworked text
 * @throws RuntimeIOException if reading the document raises an {@link IOException}
 */
public String extractText(String fileName, InputStream data) {
    try {
        String rawText = new PowerPointExtractor(data).getText();
        return WordTextExtractor.reworkWordText(rawText);
    } catch (IOException ex) {
        throw new RuntimeIOException("Failure to extract word from " + fileName + "; " + ex.getMessage(), ex);
    }
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the trimmed text of a PowerPoint document into a locked
 * {@code Stream}, encoded with {@code Constants.XML_CHARACTER_ENCODING}.
 *
 * <p>When extraction fails and the current configuration asks to ignore text
 * extraction errors, the failure is logged and an error stream built from
 * {@code pptTextExtractionErrorString} is returned instead.
 *
 * @param doc the document content
 * @return the stream holding the extracted text, or the error stream
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromPPT(InputStream doc) throws GenericSearchException {
    final long startedAt = System.currentTimeMillis();
    final boolean ignoreErrors = Boolean
            .parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    try {
        String text = new PowerPointExtractor(doc).getText(true, true).trim();
        Stream result = new Stream();
        result.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        result.lock();
        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from ppt needed " + (System.currentTimeMillis() - startedAt));
        }
        return result;
    } catch (Exception e) {
        if (!ignoreErrors) {
            throw new GenericSearchException("cannot parse ppt-file", e);
        }
        logger.warn("", e);
        return createErrorStream(pptTextExtractionErrorString);
    }
}