Example usage for org.apache.poi.xssf.extractor XSSFExcelExtractor getText

List of usage examples for org.apache.poi.xssf.extractor XSSFExcelExtractor getText

Introduction

On this page you can find example usages of org.apache.poi.xssf.extractor XSSFExcelExtractor getText.

Prototype

public String getText() 

Source Link

Document

Retrieves the text contents of the file

Usage

From source file:com.frameworkset.platform.cms.searchmanager.extractors.CmsExtractorMsExcel.java

License:Open Source License

/**
 * Reads an Excel 2007 (.xlsx) workbook from the given stream and returns its text.
 * <p>
 * The extractor is configured to emit formulas rather than their results and to
 * leave sheet names out of the text. As a side effect, the workbook's core
 * properties are stored in {@code this.cp}.
 * <p>
 * Neither the stream nor the workbook is closed by this method.
 *
 * @param in stream supplying the .xlsx content
 * @return the text returned by {@link XSSFExcelExtractor#getText()}
 * @throws IOException if the workbook cannot be read from the stream
 */
public String readExcel2007(InputStream in) throws IOException {
    XSSFExcelExtractor textExtractor = new XSSFExcelExtractor(new XSSFWorkbook(in));
    textExtractor.setIncludeSheetNames(false);
    textExtractor.setFormulasNotResults(true);

    String text = textExtractor.getText();
    // Remember the document metadata on the instance for later use.
    this.cp = textExtractor.getCoreProperties();
    return text;
}

From source file:com.jaeksoft.searchlib.parser.XlsxParser.java

License:Open Source License

/**
 * Parses an .xlsx document obtained from the stream limiter and fills a new
 * parser result item with the workbook's core properties (title, creator,
 * subject, description, keywords) and its text content.
 * <p>
 * The text includes sheet names, headers/footers and cell comments. The
 * {@code lang} argument is not read by this method.
 *
 * @param streamLimiter source of the input stream for the workbook
 * @param lang          not used here
 * @throws IOException if the workbook cannot be read
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException {

    final XSSFWorkbook xlsx = new XSSFWorkbook(streamLimiter.getNewInputStream());
    XSSFExcelExtractor extractor = null;
    try {
        extractor = new XSSFExcelExtractor(xlsx);
        final ParserResultItem item = getNewParserResultItem();

        // Document metadata, when the workbook carries any.
        final CoreProperties props = extractor.getCoreProperties();
        if (props != null) {
            item.addField(ParserFieldEnum.title, props.getTitle());
            item.addField(ParserFieldEnum.creator, props.getCreator());
            item.addField(ParserFieldEnum.subject, props.getSubject());
            item.addField(ParserFieldEnum.description, props.getDescription());
            item.addField(ParserFieldEnum.keywords, props.getKeywords());
        }

        // Extract as much text as the extractor can provide.
        extractor.setIncludeSheetNames(true);
        extractor.setIncludeHeadersFooters(true);
        extractor.setIncludeCellComments(true);
        final String text = extractor.getText();
        item.addField(ParserFieldEnum.content, StringUtils.replaceConsecutiveSpaces(text, " "));

        item.langDetection(10000, ParserFieldEnum.content);
    } finally {
        IOUtils.close(extractor);
    }

}

From source file:com.krawler.esp.fileparser.excel.XlsxParser.java

License:Open Source License

/**
 * Extracts the text of the .xlsx workbook stored at {@code filepath}.
 * <p>
 * Extraction is best-effort: any failure (including a missing file) is logged
 * as a warning and an empty string is returned instead of propagating the error.
 * The file stream is always closed before returning.
 *
 * @param filepath path of the .xlsx file to read
 * @return the extracted text, or an empty string when extraction fails
 * @throws FileNotFoundException kept in the signature for callers; failures are logged, not thrown
 * @throws IOException           kept in the signature for callers; failures are logged, not thrown
 */
public String extractText(String filepath) throws FileNotFoundException, IOException {
    StringBuilder sb = new StringBuilder();
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(filepath);
        XSSFWorkbook workbook = new XSSFWorkbook(fis);
        XSSFExcelExtractor es = new XSSFExcelExtractor(workbook);
        sb.append(es.getText());

    } catch (Exception e) {
        logger.warn(e.getMessage(), e);
    } finally {
        // Release the file handle on every path; the original never closed it.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException closeFailure) {
                logger.warn(closeFailure.getMessage(), closeFailure);
            }
        }
    }
    return sb.toString();
}

From source file:com.opensearchserver.extractor.parser.Xlsx.java

License:Apache License

/**
 * Parses an .xlsx document from {@code inputStream}: records the workbook's
 * core properties in {@code metas} and adds the extracted text plus a language
 * detection result to a new parser document.
 * <p>
 * The text includes sheet names, headers/footers and cell comments. The
 * {@code extension} and {@code mimeType} arguments are not read by this method.
 * The extractor and the workbook are closed quietly before returning.
 *
 * @param inputStream stream supplying the .xlsx content
 * @param extension   not used here
 * @param mimeType    not used here
 * @throws Exception if the workbook cannot be read or parsed
 */
@Override
protected void parseContent(InputStream inputStream, String extension, String mimeType) throws Exception {

    XSSFWorkbook xlsx = null;
    XSSFExcelExtractor extractor = null;
    try {
        xlsx = new XSSFWorkbook(inputStream);
        extractor = new XSSFExcelExtractor(xlsx);

        // Document metadata, when the workbook carries any.
        final CoreProperties props = extractor.getCoreProperties();
        if (props != null) {
            metas.add(TITLE, props.getTitle());
            metas.add(CREATOR, props.getCreator());
            metas.add(CREATION_DATE, props.getCreated());
            metas.add(MODIFICATION_DATE, props.getModified());
            metas.add(SUBJECT, props.getSubject());
            metas.add(DESCRIPTION, props.getDescription());
            metas.add(KEYWORDS, props.getKeywords());
        }

        final ParserDocument document = getNewParserDocument();
        extractor.setIncludeSheetNames(true);
        extractor.setIncludeHeadersFooters(true);
        extractor.setIncludeCellComments(true);
        document.add(CONTENT, extractor.getText());
        document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));

    } finally {
        if (extractor != null) {
            IOUtils.closeQuietly(extractor);
        }
        if (xlsx != null) {
            IOUtils.closeQuietly(xlsx);
        }
    }

}

From source file:com.opensearchserver.textextractor.parser.Xlsx.java

License:Apache License

/**
 * Extracts metadata and text from an already opened workbook: core properties
 * go into {@code metas}, and the text plus a language detection result go into
 * a new parser document.
 * <p>
 * The text includes sheet names, headers/footers and cell comments. The
 * extractor is closed quietly before returning.
 *
 * @param workbook the opened .xlsx workbook
 * @throws Exception if extraction fails
 */
private void parseContent(XSSFWorkbook workbook) throws Exception {

    XSSFExcelExtractor extractor = null;
    try {
        extractor = new XSSFExcelExtractor(workbook);

        // Document metadata, when the workbook carries any.
        final CoreProperties props = extractor.getCoreProperties();
        if (props != null) {
            metas.add(TITLE, props.getTitle());
            metas.add(CREATOR, props.getCreator());
            metas.add(CREATION_DATE, props.getCreated());
            metas.add(MODIFICATION_DATE, props.getModified());
            metas.add(SUBJECT, props.getSubject());
            metas.add(DESCRIPTION, props.getDescription());
            metas.add(KEYWORDS, props.getKeywords());
        }

        final ParserDocument document = getNewParserDocument();
        extractor.setIncludeSheetNames(true);
        extractor.setIncludeHeadersFooters(true);
        extractor.setIncludeCellComments(true);
        document.add(CONTENT, extractor.getText());
        document.add(LANG_DETECTION, languageDetection(CONTENT, 10000));

    } finally {
        if (extractor != null) {
            IOUtils.closeQuietly(extractor);
        }
    }

}

From source file:dk.defxws.fedoragsearch.server.TransformerToText.java

License:Open Source License

/**
 * Extracts the trimmed text of an .xlsx document and returns it as a locked
 * {@code Stream} encoded with {@code Constants.XML_CHARACTER_ENCODING}.
 * <p>
 * When extraction fails and the "ignore text extraction errors" setting parses
 * to {@code true}, the failure is logged and an error stream is returned;
 * otherwise the failure is wrapped in a {@link GenericSearchException}.
 *
 * @param doc stream supplying the .xlsx content
 * @return stream holding the extracted text, or an error stream (see above)
 * @throws GenericSearchException if extraction fails and errors are not ignored
 */
private static Stream getTextFromXLSX(InputStream doc) throws GenericSearchException {
    final long startMillis = System.currentTimeMillis();
    final boolean ignoreErrors = Boolean
            .parseBoolean(Config.getCurrentConfig().getIgnoreTextExtractionErrors());
    try {
        // NOTE(review): neither the package opened here nor the extractor is closed
        // in this method - confirm whether the caller owns and closes `doc`.
        final XSSFExcelExtractor extractor = new XSSFExcelExtractor(OPCPackage.open(doc));
        final String text = extractor.getText().trim();

        final Stream out = new Stream();
        out.write(text.getBytes(Constants.XML_CHARACTER_ENCODING));
        out.lock();

        if (logger.isDebugEnabled()) {
            logger.debug("extracting text from xlsx needed " + (System.currentTimeMillis() - startMillis));
        }
        return out;
    } catch (Exception e) {
        if (!ignoreErrors) {
            throw new GenericSearchException("cannot parse xlsx-file", e);
        }
        logger.warn("", e);
        return createErrorStream(xlsxTextExtractionErrorString);
    }
}

From source file:edu.ur.ir.index.DefaultExcelXmlTextExtractor.java

License:Apache License

/**
 * Extracts the text of an Excel 2007+ (.xlsx) workbook.
 *
 * @param f the workbook file
 * @return the extracted text, or {@code null} when the file is empty, is
 *         rejected by {@code isFileTooLarge}, or yields only whitespace
 * @throws Exception if the workbook cannot be opened or its text extracted
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String text = null;
    OPCPackage p = null;
    try {

        p = XSSFWorkbook.openPackage(f.getAbsolutePath());
        XSSFWorkbook workbook = new XSSFWorkbook(p);
        XSSFExcelExtractor extractor = new XSSFExcelExtractor(workbook);

        // Treat whitespace-only output the same as "no text".
        String myText = extractor.getText();
        if (myText != null && !myText.trim().equals("")) {
            text = myText;
        }
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        // Log the cause as well; the message previously named the wrong document type.
        log.error("could not get text for excel document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        if (p != null) {
            try {
                // NOTE(review): presumably the package is opened read-write, in which
                // case close() may write back to the file - confirm revert() is not wanted.
                p.close();
            } catch (IOException e) {
                log.debug(e);
            }
        }
    }
    return text;
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Extracts the text of every file in {@code listOfFile}, tokenizes it and
 * builds two lookups: token -> names of the files containing it
 * ({@code lookup1}) and file name -> its distinct tokens ({@code lookup2}).
 * <p>
 * The extraction strategy is chosen by file extension: .docx, .pptx, .xlsx,
 * .doc and .pdf go through their respective extractors; media files
 * (.gif, .jpeg, .wmv, .mpeg, .mp4) contribute only their file name; everything
 * else is read as UTF-8 text lines. A file whose content cannot be read is
 * reported on standard output and contributes no tokens.
 *
 * @param listOfFile files to process
 * @return a {@code TextExtractPar} wrapping both lookups
 * @throws FileNotFoundException if one of the files cannot be opened
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    for (File file : listOfFile) {

        // Progress report: print when the running file counter (declared outside
        // this method) matches one of the percentage boundaries.
        for (int j = 0; j < 100; j++) {

            if (counter == (int) ((j + 1) * listOfFile.length / 100)) {
                System.out.println("Number of files read equals " + j + " %");
                break;
            }
        }

        List<String> lines = new ArrayList<String>();
        counter++;
        String fileName = file.getName();
        FileInputStream fis = new FileInputStream(file);

        try {

            ///////////////////// .docx /////////////////////////////
            if (fileName.endsWith(".docx")) {
                try {
                    XWPFDocument doc = new XWPFDocument(fis);
                    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
                    lines.add(ex.getText());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .pptx /////////////////////////////
            else if (fileName.endsWith(".pptx")) {
                try {
                    OPCPackage ppt = OPCPackage.open(fis);
                    XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt);
                    lines.add(xw.getText());
                } catch (XmlException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .xlsx /////////////////////////////
            else if (fileName.endsWith(".xlsx")) {
                try {
                    OPCPackage xls = OPCPackage.open(fis);
                    XSSFExcelExtractor xe = new XSSFExcelExtractor(xls);
                    lines.add(xe.getText());
                } catch (InvalidFormatException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (XmlException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .doc /////////////////////////////
            else if (fileName.endsWith(".doc")) {
                try {
                    // Opened from the File directly, not from fis.
                    NPOIFSFileSystem fs = new NPOIFSFileSystem(file);
                    WordExtractor extractor = new WordExtractor(fs.getRoot());
                    for (String rawText : extractor.getParagraphText()) {
                        lines.add(extractor.stripFields(rawText));
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .pdf /////////////////////////////
            else if (fileName.endsWith(".pdf")) {
                try {
                    PDFParser parser = new PDFParser(fis);
                    parser.parse();
                    COSDocument cd = parser.getDocument();
                    PDFTextStripper stripper = new PDFTextStripper();
                    PDDocument pdf = new PDDocument(cd);
                    try {
                        lines.add(stripper.getText(pdf));
                    } finally {
                        try {
                            pdf.close();
                        } catch (IOException ignored) {
                            // The text is already extracted; a failed close is not actionable here.
                        }
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// Media files: gif, jpeg, wmv, mpeg, mp4 /////////////////////////////
            // A name can match at most one extension, so the checks must be OR-ed;
            // with AND this branch could never be taken.
            else if (fileName.endsWith(".gif") || fileName.endsWith(".jpeg")
                    || fileName.endsWith(".wmv") || fileName.endsWith(".mpeg")
                    || fileName.endsWith(".mp4")) {

                // Only the file name is indexed for media files.
                lines.add(file.getName());

            }

            ///////////////////// raw text extensions /////////////////////////////
            else {
                try {
                    lines = Files.readLines(file, Charsets.UTF_8);
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }
            }

        } finally {
            // Release the file handle for every file type, not only for raw text.
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done about a failed close.
            }
        }

        ///////////////////// Begin word extraction /////////////////////////////
        for (int i = 0; i < lines.size(); i++) {

            CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();

            // A standard analyzer configured with the default English stop set,
            // so that words such as "the" or "a" are dropped. A stemming
            // analyzer such as Porter could be used instead.
            Analyzer analyzer = new StandardAnalyzer(noise);
            List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i));
            for (int j = 0; j < token.size(); j++) {

                // Record each token only once per file name.
                if (!lookup2.get(file.getName()).contains(token.get(j))) {
                    lookup2.put(file.getName(), token.get(j));
                }

                // Record each file name only once per token.
                if (!lookup1.get(token.get(j)).contains(file.getName())) {
                    lookup1.put(token.get(j), file.getName());
                }

            }

        }

    }

    return new TextExtractPar(lookup1, lookup2);

}

From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java

License:Open Source License

/**
 * Write document content to document artifact as its raw content
 *
 * @param registry/*from   w w w . j  a  v  a  2s. c om*/
 * @param documentResource
 * @return
 * @throws RegistryException
 * @throws IOException
 * @throws APIManagementException
 */
private String fetchDocumentContent(Registry registry, Resource documentResource)
        throws RegistryException, IOException, APIManagementException {
    GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry,
            APIConstants.DOCUMENTATION_KEY);
    GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID());
    String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE);

    String contentString = null;
    if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) {
        Association fileAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_FILE_ASSOCIATION);
        Association fileAssociation;

        if (fileAssociations.length < 1) {
            String error = "No document associated to API";
            log.error(error);
            throw new APIManagementException(error);
        }

        //a file document can have one file association
        fileAssociation = fileAssociations[0];
        String contentPath = fileAssociation.getDestinationPath();

        if (!registry.resourceExists(contentPath)) {
            String error = "API not found at " + contentPath;
            log.error(error);
            throw new APIManagementException(error);
        }

        Resource contentResource = registry.get(contentPath);

        String fileName = ((ResourceImpl) contentResource).getName();
        String extension = FilenameUtils.getExtension(fileName);
        InputStream inputStream = null;
        try {
            inputStream = contentResource.getContentStream();
            switch (extension) {
            case APIConstants.PDF_EXTENSION:
                PDFParser pdfParser = new PDFParser(inputStream);
                pdfParser.parse();
                COSDocument cosDocument = pdfParser.getDocument();
                PDFTextStripper stripper = new PDFTextStripper();
                contentString = stripper.getText(new PDDocument(cosDocument));
                break;
            case APIConstants.DOC_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                WordExtractor msWord2003Extractor = new WordExtractor(pfs);
                contentString = msWord2003Extractor.getText();
                break;
            }
            case APIConstants.DOCX_EXTENSION:
                XWPFDocument doc = new XWPFDocument(inputStream);
                XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
                contentString = msWord2007Extractor.getText();
                break;
            case APIConstants.XLS_EXTENSION: {
                POIFSFileSystem pfs = new POIFSFileSystem(inputStream);
                ExcelExtractor extractor = new ExcelExtractor(pfs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.XLSX_EXTENSION:
                XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream);
                XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets);
                contentString = xssfExcelExtractor.getText();
                break;
            case APIConstants.PPT_EXTENSION: {
                POIFSFileSystem fs = new POIFSFileSystem(inputStream);
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                contentString = extractor.getText();
                break;
            }
            case APIConstants.PPTX_EXTENSION:
                XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream);
                XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow);
                contentString = xslfPowerPointExtractor.getText();
                break;
            case APIConstants.TXT_EXTENSION:
            case APIConstants.WSDL_EXTENSION:
            case APIConstants.XML_DOC_EXTENSION:
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
                String line;
                StringBuilder contentBuilder = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    contentBuilder.append(line);
                }
                contentString = contentBuilder.toString();
                break;
            }
        } finally {
            IOUtils.closeQuietly(inputStream);
        }

    } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) {
        Association contentAssociations[] = registry.getAssociations(documentResource.getPath(),
                APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION);
        Association contentAssociation;

        //an inline document can have one or no content associations
        if (contentAssociations.length == 1) {
            contentAssociation = contentAssociations[0];
            String contentPath = contentAssociation.getDestinationPath();

            if (registry.resourceExists(contentPath)) {
                Resource contentResource = registry.get(contentPath);

                InputStream instream = null;
                BufferedReader reader = null;
                String line;
                try {
                    instream = contentResource.getContentStream();
                    reader = new BufferedReader(new InputStreamReader(instream));
                    StringBuilder contentBuilder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        contentBuilder.append(line);
                    }
                    contentString = contentBuilder.toString();
                } finally {
                    if (reader != null) {
                        IOUtils.closeQuietly(reader);
                    }
                }
            }
        }
    }
    return contentString;
}