Example usage for org.apache.poi.hslf.extractor PowerPointExtractor getText

List of usage examples for org.apache.poi.hslf.extractor PowerPointExtractor getText

Introduction

On this page you can find example usages of the method org.apache.poi.hslf.extractor.PowerPointExtractor.getText(boolean, boolean).

Prototype

public String getText(boolean getSlideText, boolean getNoteText) 

Source Link

Document

Fetches text from the slideshow, be it slide text or note text.

Usage

From source file:com.docdoku.server.esindexer.ESTools.java

License:Open Source License

/**
 * Extracts the text of a PowerPoint document supplied as a stream.
 *
 * The stream is wrapped in a {@link BufferedInputStream} and probed with
 * {@code POIFSFileSystem.hasPOIFSHeader}: when the header is present the
 * {@code PowerPointExtractor} is used (slide text and note text), otherwise
 * the content is handed to {@code XSLFPowerPointExtractor} with all three of
 * its extraction flags enabled.
 *
 * @param inputStream the document content; it is closed before this method returns
 * @return the extracted text
 * @throws IOException if reading or closing the stream fails
 */
private static String microsoftPowerPointDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream buffered = new BufferedInputStream(inputStream)) {
        final boolean hasPoifsHeader = POIFSFileSystem.hasPOIFSHeader(buffered);
        if (!hasPoifsHeader) {
            final XSLFPowerPointExtractor xmlExtractor = new XSLFPowerPointExtractor(new XMLSlideShow(buffered));
            return xmlExtractor.getText(true, true, true);
        }
        final PowerPointExtractor binaryExtractor = new PowerPointExtractor(buffered);
        return binaryExtractor.getText(true, true);
    }
}

From source file:com.docdoku.server.IndexerBean.java

License:Open Source License

/**
 * Extracts the text of the file at {@code pathName} and adds it to the
 * Lucene index located at {@code indexPath} under the name {@code fullName}.
 *
 * The extraction strategy is chosen from the (case-sensitive) file extension:
 * <ul>
 *   <li>OpenDocument files ({@code .odt}, {@code .ods}, ...): character data of
 *       the {@code content.xml} zip entry;</li>
 *   <li>{@code .doc}: text returned by {@code WordExtractor};</li>
 *   <li>{@code .ppt}/{@code .pps}: slide and note text returned by
 *       {@code PowerPointExtractor};</li>
 *   <li>{@code .txt}: the raw file content.</li>
 * </ul>
 * Files with any other extension are not indexed.
 *
 * Every stream and reader opened here is closed even when extraction or
 * indexing fails, and the index writer and directory are released in all cases.
 *
 * @param fullName the name under which the document is passed to {@code addDoc}
 * @param pathName the path of the file to read
 * @throws EJBException wrapping any index, parsing or I/O failure
 */
@Asynchronous
@Lock(LockType.WRITE)
public void addToIndex(String fullName, String pathName) {
    IndexWriter indexWriter = null;
    Directory indexDir = null;
    try {
        indexDir = FSDirectory.open(new File(indexPath));
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30),
                IndexWriter.MaxFieldLength.LIMITED);
        int ext = pathName.lastIndexOf('.');
        String extension = "";
        if (ext != -1) {
            extension = pathName.substring(ext);
        }

        if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp")
                || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf")
                || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) {
            //OpenDocument file: the text lives in the content.xml zip entry
            final StringBuilder text = new StringBuilder();
            try (ZipInputStream zipOpenDoc = new ZipInputStream(
                    new BufferedInputStream(new FileInputStream(pathName)))) {
                ZipEntry zipEntry;
                while ((zipEntry = zipOpenDoc.getNextEntry()) != null) {
                    if (zipEntry.getName().equals("content.xml")) {
                        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
                        // The document comes from a user-supplied file: never resolve external entities (XXE).
                        saxParserFactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
                        saxParserFactory.setFeature("http://xml.org/sax/features/external-parameter-entities",
                                false);
                        SAXParser parser = saxParserFactory.newSAXParser();
                        parser.parse(zipOpenDoc, new DefaultHandler() {

                            @Override
                            public void characters(char[] ch, int start, int length) throws SAXException {
                                // Each chunk of character data is followed by a line break.
                                text.append(ch, start, length);
                                text.append("\r\n");
                            }
                        });
                        break;
                    }
                }
            }
            try (Reader contentReader = new StringReader(text.toString())) {
                addDoc(indexWriter, contentReader, fullName);
            }
        } else if (extension.equals(".doc")) {
            //MSWord Document
            try (InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName))) {
                WordExtractor wordExtractor = new WordExtractor(wordStream);
                try (Reader contentReader = new StringReader(wordExtractor.getText())) {
                    addDoc(indexWriter, contentReader, fullName);
                }
            }
        } else if (extension.equals(".ppt") || extension.equals(".pps")) {
            //MSPowerPoint Document
            try (InputStream pptStream = new BufferedInputStream(new FileInputStream(pathName))) {
                PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream);
                try (Reader contentReader = new StringReader(pptExtractor.getText(true, true))) {
                    addDoc(indexWriter, contentReader, fullName);
                } finally {
                    pptExtractor.close();
                }
            }
        } else if (extension.equals(".txt")) {
            //Text Document
            // NOTE(review): FileReader decodes with the platform default charset - confirm this is intended.
            try (Reader contentReader = new BufferedReader(new FileReader(pathName))) {
                addDoc(indexWriter, contentReader, fullName);
            }
        }
        // TODO: .xls, .html, .htm, .csv, .xml, .rtf, .pdf and .msg files are not indexed yet.
    } catch (CorruptIndexException ex) {
        throw new EJBException(ex);
    } catch (LockObtainFailedException ex) {
        // A stale write lock prevented opening the index: release it so that a later call can succeed.
        try {
            if (IndexWriter.isLocked(indexDir)) {
                IndexWriter.unlock(indexDir);
            }
        } catch (IOException pIOEx) {
            throw new EJBException(pIOEx);
        }
        throw new EJBException(ex);
    } catch (ParserConfigurationException ex) {
        throw new EJBException(ex);
    } catch (SAXException ex) {
        throw new EJBException(ex);
    } catch (IOException ex) {
        throw new EJBException(ex);
    } finally {
        try {
            try {
                if (indexWriter != null) {
                    indexWriter.close();
                }
            } finally {
                // Closing the writer does not release the directory, so close it explicitly.
                if (indexDir != null) {
                    indexDir.close();
                }
            }
        } catch (IOException ex) {
            throw new EJBException(ex);
        }
    }
}

From source file:com.openkm.extractor.MsPowerPointTextExtractor.java

License:Open Source License

/**
 * {@inheritDoc}
 *
 * Reads slide text and note text through a {@code PowerPointExtractor}.
 * Runtime failures of the extractor are logged and rethrown as
 * {@link IOException}; the stream is always closed, and a failure while
 * closing it is deliberately ignored.
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        final String slidesAndNotes = new PowerPointExtractor(stream).getText(true, true);
        return slidesAndNotes;
    } catch (RuntimeException failure) {
        logger.warn("Failed to extract PowerPoint text content", failure);
        throw new IOException(failure.getMessage(), failure);
    } finally {
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort: a close failure must not mask the extraction result
        }
    }
}

From source file:com.xpn.xwiki.plugin.lucene.textextraction.MSPowerPointTextExtractor.java

License:Open Source License

/**
 * Extracts slide text and note text from an in-memory PowerPoint document.
 *
 * @param data the raw bytes of the presentation
 * @return the text returned by {@code PowerPointExtractor.getText(true, true)}
 * @throws Exception if the extractor cannot be created or fails to read the data
 */
public String getText(byte[] data) throws Exception {
    final ByteArrayInputStream rawDocument = new ByteArrayInputStream(data);
    final PowerPointExtractor extractor = new PowerPointExtractor(rawDocument);
    return extractor.getText(true, true);
}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the trimmed slide and note text of a PowerPoint document into a
 * locked {@code Stream}, encoded with {@code Constants.XML_CHARACTER_ENCODING}.
 *
 * When extraction fails, the outcome depends on the
 * {@code getIgnoreTextExtractionErrors} configuration value: if it parses to
 * {@code true} the failure is logged and an error stream is returned,
 * otherwise a {@code GenericSearchException} is thrown.
 *
 * @param doc the PowerPoint document content
 * @return the stream holding the extracted text, or an error stream
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromPPT(InputStream doc) throws GenericSearchException {
    final long startedAt = System.currentTimeMillis();
    final boolean ignoreErrors = Boolean
            .parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    try {
        final PowerPointExtractor extractor = new PowerPointExtractor(doc);
        final String extractedText = extractor.getText(true, true).trim();
        final Stream result = new Stream();
        result.write(extractedText.getBytes(Constants.XML_CHARACTER_ENCODING));
        result.lock();
        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from ppt needed " + (System.currentTimeMillis() - startedAt));
        }
        return result;
    } catch (Exception e) {
        if (!ignoreErrors) {
            throw new GenericSearchException("cannot parse ppt-file", e);
        }
        logger.warn("", e);
        return createErrorStream(pptTextExtractionErrorString);
    }
}

From source file:edu.ur.ir.index.DefaultPowerPointTextExtractor.java

License:Apache License

/**
 * Extract text from a power point 97-2003 document.
 * @throws Exception /*ww w  .jav  a2 s .c o m*/
 * 
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    String text = null;
    if (isFileTooLarge(f) || f.length() <= 0l) {
        return text;
    }

    FileInputStream inputStream = null;
    try {
        inputStream = new FileInputStream(f);
        HSLFSlideShow powerPointDocument = new HSLFSlideShow(inputStream);
        PowerPointExtractor pptExtractor = new PowerPointExtractor(powerPointDocument);

        String myText = pptExtractor.getText(true, true);
        if (myText != null && !myText.trim().equals("")) {
            text = myText;
        }
    } catch (OutOfMemoryError oome) {
        text = null;
        log.error("could not extract text", oome);
        throw (oome);
    } catch (Exception e) {
        text = null;
        log.error("could not get text for power point document " + f.getAbsolutePath(), e);
        throw (e);
    }

    finally {
        closeInputStream(inputStream);
        inputStream = null;
    }
    return text;

}

From source file:net.sourceforge.docfetcher.parse.MSPowerPointParser.java

License:Open Source License

/**
 * Renders the slide and note text of a PowerPoint file.
 *
 * The file stream is closed as soon as the extractor has been constructed,
 * before the text is requested from it.
 *
 * @param file the PowerPoint file to read
 * @return the extracted text
 * @throws ParseException if the file is missing, unreadable, or cannot be
 *         opened by the extractor
 */
public String renderText(File file) throws ParseException {
    try {
        final InputStream source = new FileInputStream(file);
        PowerPointExtractor pptExtractor;
        try {
            pptExtractor = new PowerPointExtractor(source);
        } catch (Exception e) {
            // This can happen if the file has the "ppt" extension, but is not a PowerPoint document
            throw new ParseException(file, Msg.file_corrupted.value());
        } finally {
            source.close();
        }
        return pptExtractor.getText(true, true);
    } catch (FileNotFoundException e) {
        throw new ParseException(file, Msg.file_not_found.value());
    } catch (IOException e) {
        throw new ParseException(file, Msg.file_not_readable.value());
    }
}

From source file:net.yacy.document.parser.pptParser.java

License:Open Source License

/**
 * Parses a PowerPoint document into a single-element {@code Document} array.
 *
 * The body text is the trimmed slide and note text. The title is derived from
 * that text: line breaks and tabs become spaces, the result is trimmed and cut
 * to at most 80 characters, and runs of spaces are then collapsed. Keywords,
 * subject, author and company are read from the document's summary information.
 *
 * @throws Parser.Failure if anything other than an interruption goes wrong
 * @throws InterruptedException if an {@code InterruptedException} surfaces during parsing
 */
@Override
public Document[] parse(final AnchorURL location, final String mimeType, final String charset,
        final VocabularyScraper scraper, final int timezoneOffset, final InputStream source)
        throws Parser.Failure, InterruptedException {
    try {
        // extract slide text and notes of the document
        final PowerPointExtractor extractor = new PowerPointExtractor(new BufferedInputStream(source));
        final String bodyText = extractor.getText(true, true).trim();

        // derive a short single-line title from the body text
        String title = bodyText.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').trim();
        if (title.length() > 80) {
            title = title.substring(0, 80);
        }
        int lengthBefore;
        do {
            // repeat until no pair of adjacent spaces is left
            lengthBefore = title.length();
            title = title.replaceAll("  ", " ");
        } while (title.length() != lengthBefore);

        // get keywords (for yacy as array)
        final String keywords = extractor.getSummaryInformation().getKeywords();
        final String[] keywordList = (keywords == null || keywords.isEmpty()) ? null
                : CommonPattern.COMMA.split(keywords);

        final String subject = extractor.getSummaryInformation().getSubject();
        final List<String> descriptions = new ArrayList<String>();
        if (!(subject == null || subject.isEmpty())) {
            descriptions.add(subject);
        }

        // create the document for the database and set shortText and bodyText properly
        return new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywordList,
                singleList(title), extractor.getSummaryInformation().getAuthor(), // may be null
                extractor.getDocSummaryInformation().getCompany(), null, descriptions, 0.0f, 0.0f, bodyText,
                null, null, null, false, new Date()) };
    } catch (final Exception e) {
        if (e instanceof InterruptedException) {
            throw (InterruptedException) e;
        }

        // an unexpected error occurred, log it and throw a Parser.Failure
        ConcurrentLog.logException(e);
        final String errorMsg = "Unable to parse the ppt document '" + location + "':" + e.getMessage();
        AbstractParser.log.severe(errorMsg);
        throw new Parser.Failure(errorMsg, location);
    }
}

From source file:org.exoplatform.services.document.impl.PPTDocumentReader.java

License:Open Source License

/**
 * Returns only a text from .ppt file content.
 * /*w w  w  . j  a v a2 s  .  c o m*/
 * @param is an input stream with .ppt file content.
 * @return The string only with text from file content.
 */
public String getContentAsText(InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }
    try {

        if (is.available() == 0) {
            return "";
        }

        PowerPointExtractor ppe;
        try {
            ppe = new PowerPointExtractor(is);
        } catch (IOException e) {
            throw new DocumentReadException("Can't open presentation.", e);
        }
        return ppe.getText(true, true);
    } finally {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                if (LOG.isTraceEnabled()) {
                    LOG.trace("An exception occurred: " + e.getMessage());
                }
            }
        }
    }
}

From source file:org.paxle.parser.msoffice.impl.MsPowerpointParser.java

License:Open Source License

/**
 * Appends the slide and note text of the presentation to the parser document.
 * Nothing is appended when the extractor yields {@code null} or an empty string.
 *
 * @param fs the file system holding the PowerPoint document
 * @param parserDoc the document that receives the extracted text
 */
@Override
protected void extractText(POIFSFileSystem fs, IParserDocument parserDoc) throws ParserException, IOException {
    // extract plain text
    final String slidesAndNotes = new PowerPointExtractor(fs).getText(true, true);
    if (slidesAndNotes == null || slidesAndNotes.isEmpty()) {
        return;
    }
    parserDoc.append(slidesAndNotes);
}