Usage examples for org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
@Override
public String getText()
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsPowerPoint.java
License:Open Source License
/** * ?ppt // w w w . j a v a 2s . co m * @param path * @return */ public String readPowerPoint2007(InputStream in) { String content = null; try { XMLSlideShow xmlslideshow = new XMLSlideShow(in); org.apache.poi.xslf.extractor.XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor( xmlslideshow); this.cp = extractor.getCoreProperties(); content = extractor.getText(); // SlideShow ss = new SlideShow(new HSLFSlideShow(in));// is // // InputStreamSlideShow // Slide[] slides = ss.getSlides();// ?? // for (int i = 0; i < slides.length; i++) { // TextRun[] t = slides[i].getTextRuns();// ??TextRun // for (int j = 0; j < t.length; j++) { // content.append(t[j].getText());// content // } // } } catch (Exception ex) { System.out.println(ex.toString()); } return content; }
From source file:com.krawler.esp.fileparser.ppt.MsPPTParser.java
License:Open Source License
public String extractText(String filepath) throws Exception { String resultText = ""; try {//from www . j ava 2 s .co m InputStream input = new BufferedInputStream(new FileInputStream(filepath)); XSLFSlideShow xsslsh = new XSLFSlideShow(filepath); XMLSlideShow xslsh = new XMLSlideShow(xsslsh); XSLFPowerPointExtractor ppt = new XSLFPowerPointExtractor(xslsh); resultText = ppt.getText(); if (input != null) { input.close(); } } catch (XmlException e) { System.out.print(e.getMessage()); } return resultText; }
From source file:edu.ur.ir.index.DefaultPowerPointXmlTextExtractor.java
License:Apache License
/**
 * Extracts the text of a PowerPoint OOXML presentation.
 *
 * @param f the presentation file to read
 * @return the extracted text, or {@code null} when the file has no content, is rejected by
 *         {@code isFileTooLarge}, or yields only whitespace
 * @throws Exception if the package cannot be opened or the text cannot be extracted; the
 *         failure is logged before it is rethrown
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String text = null;
    OPCPackage p = null;
    try {
        p = XSLFSlideShow.openPackage(f.getAbsolutePath());
        XSLFSlideShow slideShow = new XSLFSlideShow(p);
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slideShow);
        String myText = extractor.getText();
        // Treat whitespace-only content the same as no content.
        if (myText != null && !myText.trim().equals("")) {
            text = myText;
        }
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        // The message used to say "word document", which was misleading for this extractor.
        log.error("could not get text for PowerPoint document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        if (p != null) {
            try {
                p.close();
            } catch (IOException e) {
                // Closing is best effort; a failure here must not hide the extraction result.
                log.debug(e);
            }
        }
    }
    return text;
}
From source file:net.ontopia.topicmaps.classify.OOXMLPowerpointFormatModule.java
License:Apache License
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) { try {//from www . j ava 2 s. com OPCPackage opc = OPCPackage.open(new ByteArrayInputStream(cc.getContent())); XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(opc); String s = extractor.getText(); char[] c = s.toCharArray(); handler.startRegion("document"); handler.text(c, 0, c.length); handler.endRegion(); } catch (Exception e) { throw new OntopiaRuntimeException(e); } }
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }//www. j av a 2 s . com } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java
License:Open Source License
/** * Write document content to document artifact as its raw content * * @param registry// w w w . ja v a 2s . co m * @param documentResource * @return * @throws RegistryException * @throws IOException * @throws APIManagementException */ private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException { GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY); GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID()); String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE); String contentString = null; if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) { Association fileAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_FILE_ASSOCIATION); Association fileAssociation; if (fileAssociations.length < 1) { String error = "No document associated to API"; log.error(error); throw new APIManagementException(error); } //a file document can have one file association fileAssociation = fileAssociations[0]; String contentPath = fileAssociation.getDestinationPath(); if (!registry.resourceExists(contentPath)) { String error = "API not found at " + contentPath; log.error(error); throw new APIManagementException(error); } Resource contentResource = registry.get(contentPath); String fileName = ((ResourceImpl) contentResource).getName(); String extension = FilenameUtils.getExtension(fileName); InputStream inputStream = null; try { inputStream = contentResource.getContentStream(); switch (extension) { case APIConstants.PDF_EXTENSION: PDFParser pdfParser = new PDFParser(inputStream); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); contentString = stripper.getText(new PDDocument(cosDocument)); break; case APIConstants.DOC_EXTENSION: { POIFSFileSystem pfs = 
new POIFSFileSystem(inputStream); WordExtractor msWord2003Extractor = new WordExtractor(pfs); contentString = msWord2003Extractor.getText(); break; } case APIConstants.DOCX_EXTENSION: XWPFDocument doc = new XWPFDocument(inputStream); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); contentString = msWord2007Extractor.getText(); break; case APIConstants.XLS_EXTENSION: { POIFSFileSystem pfs = new POIFSFileSystem(inputStream); ExcelExtractor extractor = new ExcelExtractor(pfs); contentString = extractor.getText(); break; } case APIConstants.XLSX_EXTENSION: XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets); contentString = xssfExcelExtractor.getText(); break; case APIConstants.PPT_EXTENSION: { POIFSFileSystem fs = new POIFSFileSystem(inputStream); PowerPointExtractor extractor = new PowerPointExtractor(fs); contentString = extractor.getText(); break; } case APIConstants.PPTX_EXTENSION: XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream); XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow); contentString = xslfPowerPointExtractor.getText(); break; case APIConstants.TXT_EXTENSION: case APIConstants.WSDL_EXTENSION: case APIConstants.XML_DOC_EXTENSION: BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); break; } } finally { IOUtils.closeQuietly(inputStream); } } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) { Association contentAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION); Association contentAssociation; //an inline document can have one or no content associations if (contentAssociations.length == 1) { 
contentAssociation = contentAssociations[0]; String contentPath = contentAssociation.getDestinationPath(); if (registry.resourceExists(contentPath)) { Resource contentResource = registry.get(contentPath); InputStream instream = null; BufferedReader reader = null; String line; try { instream = contentResource.getContentStream(); reader = new BufferedReader(new InputStreamReader(instream)); StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); } finally { if (reader != null) { IOUtils.closeQuietly(reader); } } } } } return contentString; }
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.MSPowerpointIndexerTest.java
License:Open Source License
@Test public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception { POIFSFileSystem ppExtractor = Mockito.mock(POIFSFileSystem.class); PowerPointExtractor powerPointExtractor = Mockito.mock(PowerPointExtractor.class); XSLFPowerPointExtractor xslfExtractor = Mockito.mock(XSLFPowerPointExtractor.class); XMLSlideShow xmlSlideShow = Mockito.mock(XMLSlideShow.class); PowerMockito.whenNew(POIFSFileSystem.class).withArguments(Mockito.anyObject()) .thenThrow(OfficeXmlFileException.class).thenReturn(ppExtractor) .thenThrow(APIManagementException.class); PowerMockito.whenNew(PowerPointExtractor.class).withArguments(ppExtractor).thenReturn(powerPointExtractor); PowerMockito.whenNew(XMLSlideShow.class).withParameterTypes(InputStream.class).withArguments(Mockito.any()) .thenReturn(xmlSlideShow);/*from w w w. j a v a2 s .co m*/ PowerMockito.whenNew(XSLFPowerPointExtractor.class).withArguments(xmlSlideShow).thenReturn(xslfExtractor); Mockito.when(powerPointExtractor.getText()).thenReturn(""); Mockito.when(xslfExtractor.getText()).thenReturn(""); MSPowerpointIndexer indexer = new MSPowerpointIndexer(); IndexDocument ppDoc = indexer.getIndexedDocument(file2Index); // should return the default media type when media type is not defined in file2Index if (!"application/vnd.ms-powerpoint" .equals(ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index file2Index.mediaType = "text/html"; ppDoc = indexer.getIndexedDocument(file2Index); if (!"text/html".equals(ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index even if exception occurred while reading the file ppDoc = indexer.getIndexedDocument(file2Index); if (!"text/html".equals(ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) { Assert.fail(); } }