List of usage examples for the org.apache.poi.hslf.extractor.PowerPointExtractor method getText
public String getText(boolean getSlideText, boolean getNoteText)
From source file:com.docdoku.server.esindexer.ESTools.java
License:Open Source License
private static String microsoftPowerPointDocumentToString(InputStream inputStream) throws IOException { String strRet;//from ww w. j a v a2s . c o m try (InputStream pptStream = new BufferedInputStream(inputStream)) { if (POIFSFileSystem.hasPOIFSHeader(pptStream)) { PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream); strRet = pptExtractor.getText(true, true); } else { XSLFPowerPointExtractor pptExtractor = new XSLFPowerPointExtractor(new XMLSlideShow(pptStream)); strRet = pptExtractor.getText(true, true, true); } } return strRet; }
From source file:com.docdoku.server.IndexerBean.java
License:Open Source License
@Asynchronous @Lock(LockType.WRITE)//from w ww .jav a2 s .co m public void addToIndex(String fullName, String pathName) { IndexWriter indexWriter = null; Directory indexDir = null; try { indexDir = FSDirectory.open(new File(indexPath)); indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.LIMITED); int ext = pathName.lastIndexOf('.'); String extension = ""; if (ext != -1) { extension = pathName.substring(ext); } if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp") || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf") || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) { final StringBuilder text = new StringBuilder(); ZipInputStream zipOpenDoc = new ZipInputStream( new BufferedInputStream(new FileInputStream(pathName))); ZipEntry zipEntry; while ((zipEntry = zipOpenDoc.getNextEntry()) != null) { if (zipEntry.getName().equals("content.xml")) { SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); SAXParser parser = saxParserFactory.newSAXParser(); parser.parse(zipOpenDoc, new DefaultHandler() { @Override public void characters(char[] ch, int start, int length) throws SAXException { for (int i = start; i < start + length; i++) { text.append(ch[i]); } text.append("\r\n"); } }); break; } } zipOpenDoc.close(); Reader contentReader = new StringReader(text.toString()); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".doc")) { //MSWord Document InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName)); WordExtractor wordExtractor = new WordExtractor(wordStream); Reader contentReader = new StringReader(wordExtractor.getText()); wordStream.close(); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".ppt") || extension.equals(".pps")) { //MSPowerPoint Document InputStream pptStream = new 
BufferedInputStream(new FileInputStream(pathName)); PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream); Reader contentReader = new StringReader(pptExtractor.getText(true, true)); pptStream.close(); addDoc(indexWriter, contentReader, fullName); pptExtractor.close(); contentReader.close(); } else if (extension.equals(".txt")) { //Text Document Reader contentReader = new BufferedReader(new FileReader(pathName)); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".xls")) { //MSExcelExtractor Document //InputStream excelStream=new BufferedInputStream(new FileInputStream(pathName)); //ExcelExtractor excelExtractor= new ExcelExtractor(excelStream); //Reader contentReader=new StringReader(excelExtractor.getText()); //excelStream.close(); //addDoc(indexWriter,contentReader,fullName); //excelExtractor.close(); //contentReader.close(); } else if (extension.equals(".html") || extension.equals(".htm")) { } else if (extension.equals(".csv")) { } else if (extension.equals(".xml")) { } else if (extension.equals(".rtf")) { } else if (extension.equals(".pdf")) { } else if (extension.equals(".msg")) { } } catch (CorruptIndexException ex) { throw new EJBException(ex); } catch (LockObtainFailedException ex) { try { if (IndexWriter.isLocked(indexDir)) { IndexWriter.unlock(indexDir); } } catch (IOException pIOEx) { throw new EJBException(pIOEx); } throw new EJBException(ex); } catch (ParserConfigurationException ex) { throw new EJBException(ex); } catch (SAXException ex) { throw new EJBException(ex); } catch (IOException ex) { throw new EJBException(ex); } finally { try { if (indexWriter != null) { indexWriter.close(); } } catch (IOException ex) { throw new EJBException(ex); } } }
From source file:com.openkm.extractor.MsPowerPointTextExtractor.java
License:Open Source License
/**
 * {@inheritDoc}
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        return new PowerPointExtractor(stream).getText(true, true);
    } catch (RuntimeException failure) {
        // Runtime failures from the extractor are reported to callers as I/O errors.
        logger.warn("Failed to extract PowerPoint text content", failure);
        throw new IOException(failure.getMessage(), failure);
    } finally {
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best-effort close: a failure here must not hide the extraction result.
        }
    }
}
From source file:com.xpn.xwiki.plugin.lucene.textextraction.MSPowerPointTextExtractor.java
License:Open Source License
/**
 * Extracts the text of a PowerPoint document held in memory.
 *
 * @param data the raw bytes of the document
 * @return the text returned by {@code PowerPointExtractor.getText(true, true)}
 * @throws Exception if the extractor cannot read the data
 */
public String getText(byte[] data) throws Exception {
    final ByteArrayInputStream source = new ByteArrayInputStream(data);
    final PowerPointExtractor extractor = new PowerPointExtractor(source);
    return extractor.getText(true, true);
}
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
/**
 * Extracts the trimmed text of a PowerPoint document into a locked {@code Stream}.
 *
 * <p>When extraction fails and the configuration flag returned by
 * {@code getIgnoreTextExtractionErrors()} parses to {@code true}, the failure
 * is logged and an error stream is returned instead; otherwise a
 * {@code GenericSearchException} wrapping the cause is thrown.
 *
 * @param doc the PowerPoint document content
 * @return a stream holding the extracted text, or the error stream
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromPPT(InputStream doc) throws GenericSearchException {
    final long startMillis = System.currentTimeMillis();
    final boolean ignoreErrors =
            Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    try {
        final String text = new PowerPointExtractor(doc).getText(true, true).trim();
        final Stream result = new Stream();
        result.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        result.lock();
        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from ppt needed " + (System.currentTimeMillis() - startMillis));
        }
        return result;
    } catch (Exception e) {
        if (!ignoreErrors) {
            throw new GenericSearchException("cannot parse ppt-file", e);
        }
        logger.warn("", e);
        return createErrorStream(pptTextExtractionErrorString);
    }
}
From source file:edu.ur.ir.index.DefaultPowerPointTextExtractor.java
License:Apache License
/** * Extract text from a power point 97-2003 document. * @throws Exception /*ww w .jav a2 s .c o m*/ * * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File) */ public String getText(File f) throws Exception { String text = null; if (isFileTooLarge(f) || f.length() <= 0l) { return text; } FileInputStream inputStream = null; try { inputStream = new FileInputStream(f); HSLFSlideShow powerPointDocument = new HSLFSlideShow(inputStream); PowerPointExtractor pptExtractor = new PowerPointExtractor(powerPointDocument); String myText = pptExtractor.getText(true, true); if (myText != null && !myText.trim().equals("")) { text = myText; } } catch (OutOfMemoryError oome) { text = null; log.error("could not extract text", oome); throw (oome); } catch (Exception e) { text = null; log.error("could not get text for power point document " + f.getAbsolutePath(), e); throw (e); } finally { closeInputStream(inputStream); inputStream = null; } return text; }
From source file:net.sourceforge.docfetcher.parse.MSPowerPointParser.java
License:Open Source License
public String renderText(File file) throws ParseException { InputStream in = null;//ww w . ja va 2 s. c o m try { in = new FileInputStream(file); PowerPointExtractor extractor = null; try { extractor = new PowerPointExtractor(in); } catch (Exception e) { // This can happen if the file has the "ppt" extension, but is not a PowerPoint document throw new ParseException(file, Msg.file_corrupted.value()); } finally { in.close(); } return extractor.getText(true, true); } catch (FileNotFoundException e) { throw new ParseException(file, Msg.file_not_found.value()); } catch (IOException e) { throw new ParseException(file, Msg.file_not_readable.value()); } }
From source file:net.yacy.document.parser.pptParser.java
License:Open Source License
@Override public Document[] parse(final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { try {/*from www .j a v a2 s. c o m*/ /* * create new PowerPointExtractor and extract text and notes * of the document */ final PowerPointExtractor pptExtractor = new PowerPointExtractor(new BufferedInputStream(source)); final String contents = pptExtractor.getText(true, true).trim(); String title = contents.replaceAll("\r", " ").replaceAll("\n", " ").replaceAll("\t", " ").trim(); if (title.length() > 80) title = title.substring(0, 80); int l = title.length(); while (true) { title = title.replaceAll(" ", " "); if (title.length() == l) break; l = title.length(); } // get keywords (for yacy as array) final String keywords = pptExtractor.getSummaryInformation().getKeywords(); final String[] keywlist; if (keywords != null && !keywords.isEmpty()) { keywlist = CommonPattern.COMMA.split(keywords); } else keywlist = null; final String subject = pptExtractor.getSummaryInformation().getSubject(); List<String> descriptions = new ArrayList<String>(); if (subject != null && !subject.isEmpty()) descriptions.add(subject); /* * create the plasmaParserDocument for the database * and set shortText and bodyText properly */ final Document[] docs = new Document[] { new Document(location, mimeType, "UTF-8", this, null, keywlist, singleList(title), pptExtractor.getSummaryInformation().getAuthor(), // may be null pptExtractor.getDocSummaryInformation().getCompany(), null, descriptions, 0.0f, 0.0f, contents, null, null, null, false, new Date()) }; return docs; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; /* * an unexpected error occurred, log it and throw a Parser.Failure */ ConcurrentLog.logException(e); final String errorMsg = "Unable to parse the ppt document '" + location + "':" + e.getMessage(); 
AbstractParser.log.severe(errorMsg); throw new Parser.Failure(errorMsg, location); } }
From source file:org.exoplatform.services.document.impl.PPTDocumentReader.java
License:Open Source License
/** * Returns only a text from .ppt file content. * /*w w w . j a v a2 s . c o m*/ * @param is an input stream with .ppt file content. * @return The string only with text from file content. */ public String getContentAsText(InputStream is) throws IOException, DocumentReadException { if (is == null) { throw new IllegalArgumentException("InputStream is null."); } try { if (is.available() == 0) { return ""; } PowerPointExtractor ppe; try { ppe = new PowerPointExtractor(is); } catch (IOException e) { throw new DocumentReadException("Can't open presentation.", e); } return ppe.getText(true, true); } finally { if (is != null) { try { is.close(); } catch (IOException e) { if (LOG.isTraceEnabled()) { LOG.trace("An exception occurred: " + e.getMessage()); } } } } }
From source file:org.paxle.parser.msoffice.impl.MsPowerpointParser.java
License:Open Source License
@Override protected void extractText(POIFSFileSystem fs, IParserDocument parserDoc) throws ParserException, IOException { // extract plain text final PowerPointExtractor parser = new PowerPointExtractor(fs); final String text = parser.getText(true, true); if (text != null && text.length() > 0) { parserDoc.append(text);/* w w w . j a va2s . c om*/ } }