Usage examples for the org.apache.poi.xslf.extractor.XSLFPowerPointExtractor constructor
public XSLFPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException
From source file:com.docdoku.server.esindexer.ESTools.java
License:Open Source License
private static String microsoftPowerPointDocumentToString(InputStream inputStream) throws IOException { String strRet;//from w w w .ja v a2s . c o m try (InputStream pptStream = new BufferedInputStream(inputStream)) { if (POIFSFileSystem.hasPOIFSHeader(pptStream)) { PowerPointExtractor pptExtractor = new PowerPointExtractor(pptStream); strRet = pptExtractor.getText(true, true); } else { XSLFPowerPointExtractor pptExtractor = new XSLFPowerPointExtractor(new XMLSlideShow(pptStream)); strRet = pptExtractor.getText(true, true, true); } } return strRet; }
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java
License:Open Source License
/** * ?ppt //w ww . j a v a2 s . c o m * @param path * @return */ public String readPowerPoint2007(InputStream in) { String content = null; try { XMLSlideShow xmlslideshow = new XMLSlideShow(in); org.apache.poi.xslf.extractor.XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor( xmlslideshow); this.cp = extractor.getCoreProperties(); content = extractor.getText(); // SlideShow ss = new SlideShow(new HSLFSlideShow(in));// is // // InputStreamSlideShow // Slide[] slides = ss.getSlides();// ?? // for (int i = 0; i < slides.length; i++) { // TextRun[] t = slides[i].getTextRuns();// ??TextRun // for (int j = 0; j < t.length; j++) { // content.append(t[j].getText());// content // } // } } catch (Exception ex) { System.out.println(ex.toString()); } return content; }
From source file:com.jaeksoft.searchlib.parser.PptxParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { // TODO Optimise if it is already a file File tempFile = File.createTempFile("oss", ".pptx"); FileOutputStream fos = null;//from w w w.ja va 2s. c o m try { fos = new FileOutputStream(tempFile); IOUtils.copy(streamLimiter.getNewInputStream(), fos); fos.close(); } catch (IOException e) { IOUtils.close(fos); throw e; } XSLFPowerPointExtractor poiExtractor = null; try { XSLFSlideShow pptSlideShow = new XSLFSlideShow(tempFile.getAbsolutePath()); poiExtractor = new XSLFPowerPointExtractor(pptSlideShow); ParserResultItem result = getNewParserResultItem(); CoreProperties info = poiExtractor.getCoreProperties(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.creator, info.getCreator()); result.addField(ParserFieldEnum.subject, info.getSubject()); result.addField(ParserFieldEnum.description, info.getDescription()); result.addField(ParserFieldEnum.keywords, info.getKeywords()); } String content = poiExtractor.getText(true, true); result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(content, " ")); result.langDetection(10000, ParserFieldEnum.content); } catch (OpenXML4JException e) { throw new IOException(e); } catch (XmlException e) { throw new IOException(e); } finally { IOUtils.close(poiExtractor); } }
From source file:com.krawler.esp.fileparser.ppt.MsPPTParser.java
License:Open Source License
public String extractText(String filepath) throws Exception { String resultText = ""; try {//from w w w. j ava2 s . c om InputStream input = new BufferedInputStream(new FileInputStream(filepath)); XSLFSlideShow xsslsh = new XSLFSlideShow(filepath); XMLSlideShow xslsh = new XMLSlideShow(xsslsh); XSLFPowerPointExtractor ppt = new XSLFPowerPointExtractor(xslsh); resultText = ppt.getText(); if (input != null) { input.close(); } } catch (XmlException e) { System.out.print(e.getMessage()); } return resultText; }
From source file:com.opensearchserver.extractor.parser.Pptx.java
License:Apache License
@Override protected void parseContent(File file, String extension, String mimeType) throws Exception { XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath()); XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage()); // Extract metadata XSLFPowerPointExtractor poiExtractor = null; try {//from w ww. j ava 2 s . com poiExtractor = new XSLFPowerPointExtractor(slideshow); CoreProperties info = poiExtractor.getCoreProperties(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(CREATOR, info.getCreator()); metas.add(SUBJECT, info.getSubject()); metas.add(DESCRIPTION, info.getDescription()); metas.add(KEYWORDS, info.getKeywords()); metas.add(CREATION_DATE, info.getCreated()); metas.add(MODIFICATION_DATE, info.getModified()); } } finally { poiExtractor.close(); } extractSides(slideshow); }
From source file:com.opensearchserver.textextractor.parser.Pptx.java
License:Open Source License
@Override protected void parseContent(File file) throws Exception { XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath()); XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage()); // Extract metadata XSLFPowerPointExtractor poiExtractor = null; try {/*from w w w. ja va 2s . co m*/ poiExtractor = new XSLFPowerPointExtractor(slideshow); CoreProperties info = poiExtractor.getCoreProperties(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(CREATOR, info.getCreator()); metas.add(SUBJECT, info.getSubject()); metas.add(DESCRIPTION, info.getDescription()); metas.add(KEYWORDS, info.getKeywords()); metas.add(CREATION_DATE, info.getCreated()); metas.add(MODIFICATION_DATE, info.getModified()); } } finally { poiExtractor.close(); } extractSides(slideshow); }
From source file:com.qwazr.library.poi.PptxParser.java
License:Apache License
@Override public void parseContent(final MultivaluedMap<String, String> parameters, final Path filePath, final String extension, final String mimeType, final ParserResultBuilder resultBuilder) throws Exception { final XSLFSlideShow pptSlideShow = new XSLFSlideShow(filePath.toAbsolutePath().toString()); final XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage()); final ParserFieldsBuilder metas = resultBuilder.metas(); metas.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault)); // Extract metadata try (XSLFPowerPointExtractor poiExtractor = new XSLFPowerPointExtractor(slideshow)) { final CoreProperties info = poiExtractor.getCoreProperties(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(CREATOR, info.getCreator()); metas.add(SUBJECT, info.getSubject()); metas.add(DESCRIPTION, info.getDescription()); metas.add(KEYWORDS, info.getKeywords()); metas.add(CREATION_DATE, info.getCreated()); metas.add(MODIFICATION_DATE, info.getModified()); }/* ww w . j av a2s. c o m*/ } extractSides(slideshow, resultBuilder); }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
private static Stream getTextFromPPTX(InputStream doc) throws GenericSearchException { long time = System.currentTimeMillis(); boolean errorFlag = Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors()); XSLFPowerPointExtractor powerPointExtractor = null; try {//from w w w . j a va2 s . c o m powerPointExtractor = new XSLFPowerPointExtractor(OPCPackage.open(doc)); StringBuffer buffer = new StringBuffer(powerPointExtractor.getText(true, true).trim()); Stream stream = new Stream(); stream.write(buffer.toString().getBytes(Constants.XML_CHARACTER_ENCODING)); stream.lock(); if (logger.isDebugEnabled()) { logger.debug("extracting text from pptx needed " + (System.currentTimeMillis() - time)); } return stream; } catch (Exception e) { if (errorFlag) { logger.warn("", e); return createErrorStream(pptxTextExtractionErrorString); } else { throw new GenericSearchException("cannot parse pptx-file", e); } } finally { powerPointExtractor = null; } }
From source file:edu.ur.ir.index.DefaultPowerPointXmlTextExtractor.java
License:Apache License
/**
 * Extracts text from a PowerPoint XML (.pptx) document.
 *
 * <p>Files rejected by {@code isFileTooLarge} and empty files are skipped.
 * Extracted text that is {@code null} or blank is reported as {@code null}.
 *
 * @param f the presentation file
 * @return the extracted text, or {@code null} if the file was skipped or
 *         contained no text
 * @throws Exception if the text cannot be extracted; the failure is logged first
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    OPCPackage p = null;
    try {
        p = XSLFSlideShow.openPackage(f.getAbsolutePath());
        XSLFSlideShow slideShow = new XSLFSlideShow(p);
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slideShow);

        String myText = extractor.getText();
        if (myText != null && !myText.trim().equals("")) {
            return myText;
        }
        return null;
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        // The message used to say "word document" (copied from the Word extractor).
        log.error("could not get text for PowerPoint document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        if (p != null) {
            // The package is only read here. close() would save it back to the
            // source file; revert() releases it without writing anything.
            p.revert();
        }
    }
}
From source file:IO.search.SearchWordFile.java
private void search(File scrFile, String word) { //split the key word in different way //there are two way to split key word //space or hyphen //space 's meaning is any one key word contain in the file we search //hyphen 's meaning is all key word must contain in the file we seach String[] arrStr = null;// w w w .j a v a 2 s .c om String[] arrStrA = null; if (word.contains(" ")) { arrStr = word.split(" "); } else if (word.contains("-")) { arrStrA = word.split("-"); System.out.println("reach"); } //regular expression mean suffixes must contain doc. boolean is03word = scrFile.getName().matches("^.+\\.(?i)(doc)$"); if (is03word) { try { InputStream is = new FileInputStream(scrFile); WordExtractor ex = new WordExtractor(is); String text2003 = ex.getText(); if (arrStr != null && arrStr.length > 0) { //if keyword has space ,then we do spilt it //invoke the method finding(text2003, arrStr, scrFile); } else if (arrStrA != null && arrStrA.length > 0) { //if keyword has hyphen, it mean that the file we search must contain these key word. //we are using count varible to count the text of the file containing keyword whether enough or not. int count = 0; for (int i = 0; i < arrStrA.length; i++) { if (text2003.toLowerCase().contains(arrStrA[i].toLowerCase())) { count++; } } //if count varible if equal with amount of keyword that the file is we want. 
if (count == arrStrA.length) { nameList.add(scrFile.getPath()); } } else if (text2003.toLowerCase().contains(word.toLowerCase())) { System.out.println("true"); nameList.add(scrFile.getPath()); } } catch (Exception ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } } else if (scrFile.getName().matches("^.+\\.(?i)(docx)$")) { try { OPCPackage opcPackage = POIXMLDocument.openPackage(scrFile.getPath()); POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); String text2007 = extractor.getText(); if (arrStr != null && arrStr.length > 0) { finding(text2007, arrStr, scrFile); } else if (arrStrA != null && arrStrA.length > 0) { int count = 0; for (int i = 0; i < arrStrA.length; i++) { if (text2007.toLowerCase().contains(arrStrA[i].toLowerCase())) { count++; } } if (count == arrStrA.length) { nameList.add(scrFile.getPath()); } } else if (text2007.toLowerCase().contains(word.toLowerCase())) { System.out.println("true"); nameList.add(scrFile.getPath()); } } catch (Exception ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } } else if (scrFile.getName().matches("^.+\\.(?i)(pdf)$")) { FileInputStream input = null; PDDocument pdfDocument = null; try { input = new FileInputStream(scrFile); PDFParser pdfParser = new PDFParser(input); pdfParser.parse(); pdfDocument = pdfParser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); String content = stripper.getText(pdfDocument); if (arrStr != null && arrStr.length > 0) { finding(content, arrStr, scrFile); } else if (arrStrA != null && arrStrA.length > 0) { int count = 0; for (int i = 0; i < arrStrA.length; i++) { if (content.toLowerCase().contains(arrStrA[i].toLowerCase())) { count++; } } if (count == arrStrA.length) { nameList.add(scrFile.getPath()); } } else if (content.toLowerCase().contains(word.toLowerCase())) { System.out.println("true"); nameList.add(scrFile.getPath()); } } catch (Exception ex) { 
Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } finally { try { input.close(); pdfDocument.close(); } catch (IOException ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } } } else if (scrFile.getName().matches("^.+\\.(?i)(txt)$")) { BufferedReader in = null; StringBuffer sb = new StringBuffer(); try { in = new BufferedReader((new InputStreamReader(new FileInputStream(scrFile), getCharset(scrFile.getAbsolutePath())))); String line = null; while ((line = in.readLine()) != null) { sb.append(line); } if (arrStr != null && arrStr.length > 0) { if (finding(sb.toString(), arrStr, scrFile)) { return; } } else if (arrStrA != null && arrStrA.length > 0) { int count = 0; for (int i = 0; i < arrStrA.length; i++) { if (sb.toString().contains(arrStrA[i])) { count++; } } if (count == arrStrA.length) { nameList.add(scrFile.getPath()); } } else if (line.toLowerCase().contains(word.toLowerCase())) { System.out.println("true"); nameList.add(scrFile.getPath()); return; } } catch (Exception ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } finally { try { in.close(); } catch (IOException ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } } } else if (scrFile.getName().matches("^.+\\.(?i)(ppt)$")) {//find the key word in ppt file InputStream is = null; try { StringBuffer content = new StringBuffer(""); is = new FileInputStream(scrFile); //get core API HSLFSlideShow ss = new HSLFSlideShow(is); //get how many page in this PPT List<HSLFSlide> slides = ss.getSlides(); System.out.println("total have " + slides.size() + " page PPT"); for (int i = 0; i < slides.size(); i++) { //get each page of ppt content, retrun is List List<List<HSLFTextParagraph>> textParagraphs = slides.get(i).getTextParagraphs(); if (textParagraphs != null) { for (int j = 0; j < textParagraphs.size(); j++) { content.append("\n"); //get each row of the page List<HSLFTextParagraph> 
hslfTextParagraph = textParagraphs.get(j); for (int f = 0; f < hslfTextParagraph.size(); f++) { //get the text of this row content.append(hslfTextParagraph.get(f).toString()); } } } } if (arrStr != null && arrStr.length > 0) { finding(content.toString(), arrStr, scrFile); } else if (arrStrA != null && arrStrA.length > 0) { int count = 0; for (int i = 0; i < arrStrA.length; i++) { if (content.toString().toLowerCase().contains(arrStrA[i].toLowerCase())) { count++; } } if (count == arrStrA.length) { nameList.add(scrFile.getPath()); } } else if (content.toString().toLowerCase().contains(word.toLowerCase())) { System.out.println("true"); nameList.add(scrFile.getPath()); } } catch (Exception ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } finally { try { is.close(); } catch (IOException ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } } } else if (scrFile.getName().matches("^.+\\.(?i)(pptx)$")) {//if powerpoint is 2007 or after we use this method String conetxt = null; try { conetxt = new XSLFPowerPointExtractor(POIXMLDocument.openPackage(scrFile.getPath())).getText(); if (arrStr != null && arrStr.length > 0) { finding(conetxt, arrStr, scrFile); } else if (arrStrA != null && arrStrA.length > 0) { int count = 0; for (int i = 0; i < arrStrA.length; i++) { if (conetxt.toLowerCase().contains(arrStrA[i].toLowerCase())) { count++; } } if (count == arrStrA.length) { nameList.add(scrFile.getPath()); } } else if (conetxt.toLowerCase().contains(word.toLowerCase())) { System.out.println("true"); nameList.add(scrFile.getPath()); } } catch (Exception ex) { Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex); } } }