List of usage examples for the constructor org.apache.poi.hslf.extractor.PowerPointExtractor.PowerPointExtractor
public PowerPointExtractor(HSLFSlideShowImpl ss)
From source file:com.docdoku.server.esindexer.ESTools.java
License:Open Source License
/**
 * Extracts the text of a PowerPoint document supplied as a stream.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream} and probed with
 * {@code POIFSFileSystem.hasPOIFSHeader}. When the header is present the content is read
 * with {@code PowerPointExtractor}; otherwise it is loaded as an {@code XMLSlideShow} and
 * read with {@code XSLFPowerPointExtractor}. The wrapping stream, and with it
 * {@code inputStream}, is closed before this method returns.
 *
 * @param inputStream the document content
 * @return the text produced by whichever extractor was selected
 * @throws IOException if the stream cannot be read
 */
private static String microsoftPowerPointDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream buffered = new BufferedInputStream(inputStream)) {
        if (!POIFSFileSystem.hasPOIFSHeader(buffered)) {
            XMLSlideShow slideShow = new XMLSlideShow(buffered);
            return new XSLFPowerPointExtractor(slideShow).getText(true, true, true);
        }
        return new PowerPointExtractor(buffered).getText(true, true);
    }
}
From source file:com.docdoku.server.IndexerBean.java
License:Open Source License
@Asynchronous @Lock(LockType.WRITE)//from w w w . j a v a2s . c om public void addToIndex(String fullName, String pathName) { IndexWriter indexWriter = null; Directory indexDir = null; try { indexDir = FSDirectory.open(new File(indexPath)); indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.LIMITED); int ext = pathName.lastIndexOf('.'); String extension = ""; if (ext != -1) { extension = pathName.substring(ext); } if (extension.equals(".odt") || extension.equals(".ods") || extension.equals(".odp") || extension.equals(".odg") || extension.equals(".odc") || extension.equals(".odf") || extension.equals(".odb") || extension.equals(".odi") || extension.equals(".odm")) { final StringBuilder text = new StringBuilder(); ZipInputStream zipOpenDoc = new ZipInputStream( new BufferedInputStream(new FileInputStream(pathName))); ZipEntry zipEntry; while ((zipEntry = zipOpenDoc.getNextEntry()) != null) { if (zipEntry.getName().equals("content.xml")) { SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); SAXParser parser = saxParserFactory.newSAXParser(); parser.parse(zipOpenDoc, new DefaultHandler() { @Override public void characters(char[] ch, int start, int length) throws SAXException { for (int i = start; i < start + length; i++) { text.append(ch[i]); } text.append("\r\n"); } }); break; } } zipOpenDoc.close(); Reader contentReader = new StringReader(text.toString()); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".doc")) { //MSWord Document InputStream wordStream = new BufferedInputStream(new FileInputStream(pathName)); WordExtractor wordExtractor = new WordExtractor(wordStream); Reader contentReader = new StringReader(wordExtractor.getText()); wordStream.close(); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".ppt") || extension.equals(".pps")) { //MSPowerPoint Document InputStream pptStream = new 
BufferedInputStream(new FileInputStream(pathName)); PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream); Reader contentReader = new StringReader(pptExtractor.getText(true, true)); pptStream.close(); addDoc(indexWriter, contentReader, fullName); pptExtractor.close(); contentReader.close(); } else if (extension.equals(".txt")) { //Text Document Reader contentReader = new BufferedReader(new FileReader(pathName)); addDoc(indexWriter, contentReader, fullName); contentReader.close(); } else if (extension.equals(".xls")) { //MSExcelExtractor Document //InputStream excelStream=new BufferedInputStream(new FileInputStream(pathName)); //ExcelExtractor excelExtractor= new ExcelExtractor(excelStream); //Reader contentReader=new StringReader(excelExtractor.getText()); //excelStream.close(); //addDoc(indexWriter,contentReader,fullName); //excelExtractor.close(); //contentReader.close(); } else if (extension.equals(".html") || extension.equals(".htm")) { } else if (extension.equals(".csv")) { } else if (extension.equals(".xml")) { } else if (extension.equals(".rtf")) { } else if (extension.equals(".pdf")) { } else if (extension.equals(".msg")) { } } catch (CorruptIndexException ex) { throw new EJBException(ex); } catch (LockObtainFailedException ex) { try { if (IndexWriter.isLocked(indexDir)) { IndexWriter.unlock(indexDir); } } catch (IOException pIOEx) { throw new EJBException(pIOEx); } throw new EJBException(ex); } catch (ParserConfigurationException ex) { throw new EJBException(ex); } catch (SAXException ex) { throw new EJBException(ex); } catch (IOException ex) { throw new EJBException(ex); } finally { try { if (indexWriter != null) { indexWriter.close(); } } catch (IOException ex) { throw new EJBException(ex); } } }
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java
License:Open Source License
/** * ?ppt /*from www. j a v a 2 s .c o m*/ * @param path * @return */ public String readPowerPoint(InputStream in) { String content = null; try { HSLFSlideShow slideShow = new HSLFSlideShow(in); org.apache.poi.hslf.extractor.PowerPointExtractor extractor = new PowerPointExtractor(slideShow); this.m_documentSummary = extractor.getDocSummaryInformation(); this.m_summary = extractor.getSummaryInformation(); content = extractor.getText(); // SlideShow ss = new SlideShow(new HSLFSlideShow(in));// is // // InputStreamSlideShow // Slide[] slides = ss.getSlides();// ?? // for (int i = 0; i < slides.length; i++) { // TextRun[] t = slides[i].getTextRuns();// ??TextRun // for (int j = 0; j < t.length; j++) { // content.append(t[j].getText());// content // } // } } catch (Exception ex) { System.out.println(ex.toString()); } return content; }
From source file:com.krawler.esp.fileparser.pptparser.MsPPTParser.java
License:Open Source License
public String extractText(String filepath) throws Exception { InputStream input = new BufferedInputStream(new FileInputStream(filepath)); String resultText = ""; PowerPointExtractor ppt = new PowerPointExtractor(input); resultText = ppt.getText();//from w w w . j a v a 2 s .c o m if (input != null) { input.close(); } return resultText; }
From source file:com.openkm.extractor.MsPowerPointTextExtractor.java
License:Open Source License
/** * {@inheritDoc}/*from w w w . j a v a2 s. c om*/ */ public String extractText(InputStream stream, String type, String encoding) throws IOException { try { PowerPointExtractor extractor = new PowerPointExtractor(stream); return extractor.getText(true, true); } catch (RuntimeException e) { logger.warn("Failed to extract PowerPoint text content", e); throw new IOException(e.getMessage(), e); } finally { try { stream.close(); } catch (IOException ignored) { } } }
From source file:com.openkm.util.metadata.MetadataExtractor.java
License:Open Source License
/**
 * Extracts summary metadata from a Microsoft Office (OLE2) document.
 *
 * <p>The summary information is read through {@code WordExtractor}, {@code ExcelExtractor} or
 * {@code PowerPointExtractor} depending on {@code mimeType}. For any other MIME type an empty
 * {@code OfficeMetadata} is returned.
 *
 * <p>The creation, last-save and last-printed timestamps are only set when the document
 * provides them; a missing date (for instance a document that was never printed) no longer
 * causes a {@link NullPointerException}.
 *
 * @param is stream holding the document
 * @param mimeType MIME type used to select the extractor
 * @return the metadata read from the document's summary information
 * @throws IOException if the document cannot be read
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
    POIFSFileSystem fs = new POIFSFileSystem(is);
    OfficeMetadata md = new OfficeMetadata();
    SummaryInformation si = null;

    if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
        si = new WordExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
        si = new ExcelExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
        si = new PowerPointExtractor(fs).getSummaryInformation();
    }

    if (si != null) {
        md.setTitle(si.getTitle());
        md.setSubject(si.getSubject());
        md.setAuthor(si.getAuthor());
        md.setLastAuthor(si.getLastAuthor());
        md.setKeywords(si.getKeywords());
        md.setComments(si.getComments());
        md.setTemplate(si.getTemplate());
        md.setRevNumber(si.getRevNumber());
        md.setApplicationName(si.getApplicationName());
        md.setEditTime(si.getEditTime());
        md.setPageCount(si.getPageCount());
        md.setWordCount(si.getWordCount());
        md.setCharCount(si.getCharCount());
        md.setSecurity(si.getSecurity());

        Calendar createDateTime = toCalendar(si.getCreateDateTime());
        if (createDateTime != null) {
            md.setCreateDateTime(createDateTime);
        }

        Calendar lastSaveDateTime = toCalendar(si.getLastSaveDateTime());
        if (lastSaveDateTime != null) {
            md.setLastSaveDateTime(lastSaveDateTime);
        }

        Calendar lastPrinted = toCalendar(si.getLastPrinted());
        if (lastPrinted != null) {
            md.setLastPrinted(lastPrinted);
        }
    }

    log.info("officeExtractor: {}", md);
    return md;
}

/**
 * Converts a date to a calendar in the default time zone and locale.
 *
 * @param date the date to convert, may be {@code null}
 * @return a calendar set to {@code date}, or {@code null} when {@code date} is {@code null}
 */
private static Calendar toCalendar(java.util.Date date) {
    if (date == null) {
        return null;
    }
    Calendar calendar = Calendar.getInstance();
    calendar.setTime(date);
    return calendar;
}
From source file:com.xpn.xwiki.plugin.lucene.textextraction.MSPowerPointTextExtractor.java
License:Open Source License
/**
 * Extracts the text of a PowerPoint document held in memory.
 *
 * @param data the raw bytes of the presentation
 * @return the text returned by {@code PowerPointExtractor.getText(true, true)}
 * @throws Exception if the extractor cannot read the data
 */
public String getText(byte[] data) throws Exception {
    ByteArrayInputStream source = new ByteArrayInputStream(data);
    PowerPointExtractor extractor = new PowerPointExtractor(source);
    String text = extractor.getText(true, true);
    return text;
}
From source file:de.maklerpoint.office.Lucene.Indexer.java
License:Open Source License
private void indexFileorDir(String fileName) throws IOException { listFiles(new File(fileName)); for (File f : queue) { FileReader fr = null;/*from w w w . ja va 2 s. c o m*/ try { if (f.getName().startsWith(".")) { // System.out.println("Versteckte datei: " + f.getName()); // TODO add html, xml parsers } else if (f.getName().endsWith(".htm") || f.getName().endsWith(".html") || f.getName().endsWith(".xml") || f.getName().endsWith(".txt")) { Document doc = new Document(); //=================================================== // add contents of file //=================================================== fr = new FileReader(f); doc.add(new Field("contents", fr)); //=================================================== //adding second field which contains the path of the file //=================================================== doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); /** * Adding Typ */ doc.add(new Field("type", String.valueOf(FileTypes.TXT), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } else if (f.getName().endsWith(".pdf")) { PDFParser parser = new PDFParser(new FileInputStream(f)); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(new PDDocument(cd)); Document doc = new Document(); doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.PDF), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", 
String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); cd.close(); } else if (f.getName().endsWith(".doc") || f.getName().endsWith(".docx")) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f)); WordExtractor extractor = new WordExtractor(fs); String wordText = extractor.getText(); Document doc = new Document(); doc.add(new Field("contents", wordText, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.DOC), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } else if (f.getName().endsWith(".xls") || f.getName().endsWith(".xlsx")) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f)); ExcelExtractor extractor = new ExcelExtractor(fs); String excelText = extractor.getText(); Document doc = new Document(); doc.add(new Field("contents", excelText, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.XLS), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } else if (f.getName().endsWith(".ppt") || f.getName().endsWith(".pptx")) { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f)); PowerPointExtractor extractor = new PowerPointExtractor(fs); String ppttext = 
extractor.getText(); Document doc = new Document(); doc.add(new Field("contents", ppttext, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("modified", df.format(f.lastModified()), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("type", String.valueOf(FileTypes.PPT), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filesize", String.valueOf(FormatFileSize.formatSize(f.length(), FormatFileSize.KB)), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); } if (Log.logger.isDebugEnabled()) { Log.logger.debug("Lucene | Neue Datei indexiert: " + f); } } catch (Exception e) { if (Log.logger.isDebugEnabled()) { Log.logger.debug("Datei konnte nicht indexiert werden: " + f, e); } continue; } finally { // fr.close(); } } writer.optimize(); queue.clear(); }
From source file:de.micromata.genome.gwiki.plugin.msotextextractor_1_0.PowerPointTextExtractor.java
License:Apache License
public String extractText(String fileName, InputStream data) { try {//from w w w .j a va 2 s . c om PowerPointExtractor extr = new PowerPointExtractor(data); String text = extr.getText(); text = WordTextExtractor.reworkWordText(text); return text; } catch (IOException ex) { throw new RuntimeIOException("Failure to extract word from " + fileName + "; " + ex.getMessage(), ex); } }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
/**
 * Extracts the trimmed text of a PowerPoint document into a locked {@code Stream}.
 *
 * <p>When extraction fails, the outcome depends on the configuration value returned by
 * {@code getIgnoreTextExtractionErrors()}: if it parses to {@code true} the failure is logged
 * as a warning and an error stream built from {@code pptTextExtractionErrorString} is
 * returned, otherwise a {@code GenericSearchException} is thrown.
 *
 * @param doc stream holding the presentation
 * @return a locked stream containing the text encoded with {@code Constants.XML_CHARACTER_ENCODING}
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromPPT(InputStream doc) throws GenericSearchException {
    final long startMillis = System.currentTimeMillis();
    final boolean ignoreErrors =
            Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    try {
        String text = new PowerPointExtractor(doc).getText(true, true).trim();
        Stream result = new Stream();
        result.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        result.lock();
        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from ppt needed " + (System.currentTimeMillis() - startMillis));
        }
        return result;
    } catch (Exception e) {
        if (!ignoreErrors) {
            throw new GenericSearchException("cannot parse ppt-file", e);
        }
        logger.warn("", e);
        return createErrorStream(pptTextExtractionErrorString);
    }
}