List of usage examples for the XSSFExcelExtractor constructor of org.apache.poi.xssf.extractor.XSSFExcelExtractor
public XSSFExcelExtractor(XSSFWorkbook workbook)
From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsExcel.java
License:Open Source License
/** * ?excel2007 //from ww w . ja v a 2 s . c o m * @param path * @return * @throws IOException */ public String readExcel2007(InputStream in) throws IOException { // StringBuffer content = new StringBuffer(); // XSSFWorkbook strPath String content = null; XSSFWorkbook xwb = new XSSFWorkbook(in); XSSFExcelExtractor extractor = new XSSFExcelExtractor(xwb); extractor.setFormulasNotResults(true); extractor.setIncludeSheetNames(false); content = extractor.getText(); this.cp = extractor.getCoreProperties(); return content; // // Sheet // for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) { // XSSFSheet xSheet = xwb.getSheetAt(numSheet); // if (xSheet == null) { // continue; // } // // Row // for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) { // XSSFRow xRow = xSheet.getRow(rowNum); // if (xRow == null) { // continue; // } // // Cell // for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) { // XSSFCell xCell = xRow.getCell(cellNum); // if (xCell == null) { // continue; // } // if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) { // content.append(xCell.getBooleanCellValue()); // } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) { // content.append(xCell.getNumericCellValue()); // } else { // content.append(xCell.getStringCellValue()); // } // } // } // } // // return content.toString(); }
From source file:com.jaeksoft.searchlib.parser.XlsxParser.java
License:Open Source License
@Override protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { XSSFWorkbook workbook = new XSSFWorkbook(streamLimiter.getNewInputStream()); XSSFExcelExtractor excelExtractor = null; try {// w w w. j ava 2 s . c om excelExtractor = new XSSFExcelExtractor(workbook); ParserResultItem result = getNewParserResultItem(); CoreProperties info = excelExtractor.getCoreProperties(); if (info != null) { result.addField(ParserFieldEnum.title, info.getTitle()); result.addField(ParserFieldEnum.creator, info.getCreator()); result.addField(ParserFieldEnum.subject, info.getSubject()); result.addField(ParserFieldEnum.description, info.getDescription()); result.addField(ParserFieldEnum.keywords, info.getKeywords()); } excelExtractor.setIncludeCellComments(true); excelExtractor.setIncludeHeadersFooters(true); excelExtractor.setIncludeSheetNames(true); String content = excelExtractor.getText(); result.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(content, " ")); result.langDetection(10000, ParserFieldEnum.content); } finally { IOUtils.close(excelExtractor); } }
From source file:com.krawler.esp.fileparser.excel.XlsxParser.java
License:Open Source License
/**
 * Extracts the text of the .xlsx file at the given path.
 *
 * <p>Extraction is best-effort: any failure is logged as a warning and the
 * text gathered so far (possibly empty) is returned. The file stream is
 * always closed before returning.
 *
 * @param filepath path of the .xlsx file to read
 * @return the extracted text, or an empty string when extraction failed
 * @throws FileNotFoundException declared for interface compatibility; failures are logged, not thrown
 * @throws IOException declared for interface compatibility; failures are logged, not thrown
 */
public String extractText(String filepath) throws FileNotFoundException, IOException {
    StringBuilder sb = new StringBuilder();
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(filepath);
        XSSFWorkbook workbook = new XSSFWorkbook(fis);
        XSSFExcelExtractor es = new XSSFExcelExtractor(workbook);
        sb.append(es.getText());
    } catch (Exception e) {
        logger.warn(e.getMessage(), e);
    } finally {
        // Release the file handle whether or not extraction succeeded.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException e) {
                logger.warn(e.getMessage(), e);
            }
        }
    }
    return sb.toString();
}
From source file:com.opensearchserver.extractor.parser.Xlsx.java
License:Apache License
/**
 * Parses an .xlsx stream into metadata and a content document.
 *
 * <p>Core properties, when present, are added to {@code metas}. The extracted
 * text (cell comments, headers/footers and sheet names included) is added to
 * a new parser document together with the language detection result. Both the
 * extractor and the workbook are closed quietly on exit.
 *
 * @param inputStream stream supplying the .xlsx data
 * @param extension   not used by this implementation
 * @param mimeType    not used by this implementation
 * @throws Exception if the workbook cannot be read or parsed
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {
    XSSFWorkbook xlsx = null;
    XSSFExcelExtractor extractor = null;
    try {
        xlsx = new XSSFWorkbook(inputStream);
        extractor = new XSSFExcelExtractor(xlsx);

        // Configure the extractor up front; these flags only affect getText().
        extractor.setIncludeCellComments(true);
        extractor.setIncludeHeadersFooters(true);
        extractor.setIncludeSheetNames(true);

        CoreProperties properties = extractor.getCoreProperties();
        if (properties != null) {
            metas.add(TITLE, properties.getTitle());
            metas.add(CREATOR, properties.getCreator());
            metas.add(CREATION_DATE, properties.getCreated());
            metas.add(MODIFICATION_DATE, properties.getModified());
            metas.add(SUBJECT, properties.getSubject());
            metas.add(DESCRIPTION, properties.getDescription());
            metas.add(KEYWORDS, properties.getKeywords());
        }

        ParserDocument document = getNewParserDocument();
        document.add(CONTENT, extractor.getText());
        document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        if (extractor != null) {
            IOUtils.closeQuietly(extractor);
        }
        if (xlsx != null) {
            IOUtils.closeQuietly(xlsx);
        }
    }
}
From source file:com.opensearchserver.textextractor.parser.Xlsx.java
License:Apache License
/**
 * Extracts metadata and text from an already opened workbook.
 *
 * <p>Core properties, when present, are added to {@code metas}; the text
 * (cell comments, headers/footers and sheet names included) and the language
 * detection result are added to a new parser document. The extractor is
 * closed quietly on exit.
 *
 * @param workbook the workbook to extract from
 * @throws Exception if extraction fails
 */
private void parseContent(XSSFWorkbook workbook) throws Exception {
    XSSFExcelExtractor extractor = null;
    try {
        extractor = new XSSFExcelExtractor(workbook);

        // These flags only influence getText(), so they can be set first.
        extractor.setIncludeCellComments(true);
        extractor.setIncludeHeadersFooters(true);
        extractor.setIncludeSheetNames(true);

        CoreProperties coreProps = extractor.getCoreProperties();
        if (coreProps != null) {
            metas.add(TITLE, coreProps.getTitle());
            metas.add(CREATOR, coreProps.getCreator());
            metas.add(CREATION_DATE, coreProps.getCreated());
            metas.add(MODIFICATION_DATE, coreProps.getModified());
            metas.add(SUBJECT, coreProps.getSubject());
            metas.add(DESCRIPTION, coreProps.getDescription());
            metas.add(KEYWORDS, coreProps.getKeywords());
        }

        ParserDocument parsed = getNewParserDocument();
        String text = extractor.getText();
        parsed.add(CONTENT, text);
        parsed.add(LANG_DETECTION, languageDetection(CONTENT, 10000));
    } finally {
        if (extractor != null) {
            IOUtils.closeQuietly(extractor);
        }
    }
}
From source file:com.qwazr.library.poi.XlsxParser.java
License:Apache License
@Override public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream, String extension, final String mimeType, final ParserResultBuilder resultBuilder) throws Exception { try (final XSSFWorkbook workbook = new XSSFWorkbook(inputStream)) { try (final XSSFExcelExtractor excelExtractor = new XSSFExcelExtractor(workbook)) { final ParserFieldsBuilder metas = resultBuilder.metas(); metas.set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault)); final CoreProperties info = excelExtractor.getCoreProperties(); if (info != null) { metas.add(TITLE, info.getTitle()); metas.add(CREATOR, info.getCreator()); metas.add(CREATION_DATE, info.getCreated()); metas.add(MODIFICATION_DATE, info.getModified()); metas.add(SUBJECT, info.getSubject()); metas.add(DESCRIPTION, info.getDescription()); metas.add(KEYWORDS, info.getKeywords()); }//from w w w. j a va 2 s. c o m final ParserFieldsBuilder result = resultBuilder.newDocument(); excelExtractor.setIncludeCellComments(true); excelExtractor.setIncludeHeadersFooters(true); excelExtractor.setIncludeSheetNames(true); result.add(CONTENT, excelExtractor.getText()); result.add(LANG_DETECTION, languageDetection(result, CONTENT, 10000)); } } }
From source file:dk.defxws.fedoragsearch.server.TransformerToText.java
License:Open Source License
private static Stream getTextFromXLSX(InputStream doc) throws GenericSearchException { long time = System.currentTimeMillis(); boolean errorFlag = Boolean.parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors()); XSSFExcelExtractor excelExtractor = null; try {//from w ww . jav a 2 s .c o m excelExtractor = new XSSFExcelExtractor(OPCPackage.open(doc)); StringBuffer buffer = new StringBuffer(excelExtractor.getText().trim()); Stream stream = new Stream(); stream.write(buffer.toString().getBytes(Constants.XML_CHARACTER_ENCODING)); stream.lock(); if (logger.isDebugEnabled()) { logger.debug("extracting text from xlsx needed " + (System.currentTimeMillis() - time)); } return stream; } catch (Exception e) { if (errorFlag) { logger.warn("", e); return createErrorStream(xlsxTextExtractionErrorString); } else { throw new GenericSearchException("cannot parse xlsx-file", e); } } finally { excelExtractor = null; } }
From source file:edu.ur.ir.index.DefaultExcelXmlTextExtractor.java
License:Apache License
/** * Extract text from a word 97-2003 document. * @throws Exception /*from ww w .j a va 2 s .com*/ * * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File) */ public String getText(File f) throws Exception { String text = null; if (isFileTooLarge(f) || f.length() <= 0l) { return text; } OPCPackage p = null; try { p = XSSFWorkbook.openPackage(f.getAbsolutePath()); XSSFWorkbook workbook = new XSSFWorkbook(p); XSSFExcelExtractor extractor = new XSSFExcelExtractor(workbook); String myText = extractor.getText(); if (myText != null && !myText.trim().equals("")) { text = myText; } } catch (OutOfMemoryError oome) { text = null; log.error("could not extract text", oome); throw (oome); } catch (Exception e) { text = null; log.error("could not get text for word document " + f.getAbsolutePath()); throw (e); } finally { if (p != null) { try { p.close(); p = null; } catch (IOException e) { log.debug(e); p = null; } } } return text; }
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }/*www. j a v a2 s. c om*/ } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java
License:Open Source License
/** * Write document content to document artifact as its raw content * * @param registry//from w w w . j a v a2 s . c o m * @param documentResource * @return * @throws RegistryException * @throws IOException * @throws APIManagementException */ private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException { GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY); GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID()); String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE); String contentString = null; if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) { Association fileAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_FILE_ASSOCIATION); Association fileAssociation; if (fileAssociations.length < 1) { String error = "No document associated to API"; log.error(error); throw new APIManagementException(error); } //a file document can have one file association fileAssociation = fileAssociations[0]; String contentPath = fileAssociation.getDestinationPath(); if (!registry.resourceExists(contentPath)) { String error = "API not found at " + contentPath; log.error(error); throw new APIManagementException(error); } Resource contentResource = registry.get(contentPath); String fileName = ((ResourceImpl) contentResource).getName(); String extension = FilenameUtils.getExtension(fileName); InputStream inputStream = null; try { inputStream = contentResource.getContentStream(); switch (extension) { case APIConstants.PDF_EXTENSION: PDFParser pdfParser = new PDFParser(inputStream); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); contentString = stripper.getText(new PDDocument(cosDocument)); break; case APIConstants.DOC_EXTENSION: { POIFSFileSystem 
pfs = new POIFSFileSystem(inputStream); WordExtractor msWord2003Extractor = new WordExtractor(pfs); contentString = msWord2003Extractor.getText(); break; } case APIConstants.DOCX_EXTENSION: XWPFDocument doc = new XWPFDocument(inputStream); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); contentString = msWord2007Extractor.getText(); break; case APIConstants.XLS_EXTENSION: { POIFSFileSystem pfs = new POIFSFileSystem(inputStream); ExcelExtractor extractor = new ExcelExtractor(pfs); contentString = extractor.getText(); break; } case APIConstants.XLSX_EXTENSION: XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets); contentString = xssfExcelExtractor.getText(); break; case APIConstants.PPT_EXTENSION: { POIFSFileSystem fs = new POIFSFileSystem(inputStream); PowerPointExtractor extractor = new PowerPointExtractor(fs); contentString = extractor.getText(); break; } case APIConstants.PPTX_EXTENSION: XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream); XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow); contentString = xslfPowerPointExtractor.getText(); break; case APIConstants.TXT_EXTENSION: case APIConstants.WSDL_EXTENSION: case APIConstants.XML_DOC_EXTENSION: BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); break; } } finally { IOUtils.closeQuietly(inputStream); } } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) { Association contentAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION); Association contentAssociation; //an inline document can have one or no content associations if (contentAssociations.length == 1) { 
contentAssociation = contentAssociations[0]; String contentPath = contentAssociation.getDestinationPath(); if (registry.resourceExists(contentPath)) { Resource contentResource = registry.get(contentPath); InputStream instream = null; BufferedReader reader = null; String line; try { instream = contentResource.getContentStream(); reader = new BufferedReader(new InputStreamReader(instream)); StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); } finally { if (reader != null) { IOUtils.closeQuietly(reader); } } } } } return contentString; }