com.xpn.xwiki.plugin.lucene.textextraction.MSExcelTextExtractor.java Source code

Java tutorial

Introduction

Here is the source code for com.xpn.xwiki.plugin.lucene.textextraction.MSExcelTextExtractor.java

Source

/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package com.xpn.xwiki.plugin.lucene.textextraction;

import java.io.ByteArrayInputStream;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFDataFormat;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MSExcelTextExtractor implements MimetypeTextExtractor {
    /** Logging helper. */
    private static final Logger LOG = LoggerFactory.getLogger(MSExcelTextExtractor.class);

    /** The currently preparing Excel workbook. */
    private HSSFWorkbook mWorkbook;

    /**
     * Contains all data formats used in the currently preparing Excel workbook.
     */
    private HSSFDataFormat mDataFormat;

    public static final int DEFAULT_BUFFER_SIZE = 16384; // 16 k

    /**
     * Extracts all text from an Excel by parsing all the sheets in that excel document.
     * 
     * @return String
     */
    public String getText(byte[] data) throws Exception {
        POIFSFileSystem poiFs = new POIFSFileSystem(new ByteArrayInputStream(data));
        mWorkbook = new HSSFWorkbook(poiFs);
        mDataFormat = mWorkbook.createDataFormat();

        StringBuffer cleanBuffer = new StringBuffer(DEFAULT_BUFFER_SIZE);
        for (int sheetIdx = 0; sheetIdx < mWorkbook.getNumberOfSheets(); sheetIdx++) {
            HSSFSheet sheet = mWorkbook.getSheetAt(sheetIdx);

            if (sheet != null) {
                parseSheet(sheet, cleanBuffer);
            }
        }
        return cleanBuffer.toString();
    }

    /**
     * It will parse the sheet with row wise and get the text from the sheet.
     */
    private void parseSheet(HSSFSheet sheet, StringBuffer cleanBuffer) {
        int firstRow = sheet.getFirstRowNum();
        int lastRow = sheet.getLastRowNum();
        for (int rowIdx = firstRow; rowIdx <= lastRow; rowIdx++) {
            HSSFRow row = sheet.getRow(rowIdx);

            if (row != null) {
                parseRow(row, cleanBuffer);
            }
        }
    }

    /**
     * It will parse row and return the text
     */
    private void parseRow(HSSFRow row, StringBuffer cleanBuffer) {
        short firstCell = row.getFirstCellNum();
        short lastCell = row.getLastCellNum();
        for (short cellIdx = firstCell; cellIdx <= lastCell; cellIdx++) {
            HSSFCell cell = row.getCell(cellIdx);

            if (cell != null) {
                parseCell(cell, cleanBuffer);
            }
        }
    }

    /**
     * Extracts all text from each cell of the sheet
     */
    private void parseCell(HSSFCell cell, StringBuffer cleanBuffer) {
        String cellValue = null;

        if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
            cellValue = cell.getRichStringCellValue().getString();
        } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
            HSSFCellStyle style = cell.getCellStyle();
            short formatId = style.getDataFormat();
            String formatPattern = mDataFormat.getFormat(formatId);
            formatPattern = replace(formatPattern, "\\ ", " ");

            if (isCellDateFormatted(cell)) {
                // This is a date
                formatPattern = replace(formatPattern, "mmmm", "MMMM");
                formatPattern = replace(formatPattern, "/", ".");
                SimpleDateFormat format;
                try {
                    format = new SimpleDateFormat(formatPattern);
                } catch (Throwable thr) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Creating date format failed: '" + formatPattern + "'", thr);
                    }
                    format = new SimpleDateFormat();
                }

                double numberValue = cell.getNumericCellValue();
                Date date = HSSFDateUtil.getJavaDate(numberValue);
                cellValue = format.format(date);
            } else {
                // This is a Number
                DecimalFormat format;
                try {
                    format = new DecimalFormat(formatPattern);
                } catch (Throwable thr) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Creating number format failed: '" + formatPattern + "'", thr);
                    }
                    format = new DecimalFormat();
                }

                double numberValue = cell.getNumericCellValue();
                cellValue = format.format(numberValue);
            }
        }

        if (cellValue != null) {
            cellValue = cellValue.trim();
            if (cellValue.length() != 0) {
                cleanBuffer.append(cellValue);
                cleanBuffer.append(" ");
            }
        }
    }

    /**
     * Checks cell is date formatted or not.
     * 
     * @return boolean
     */
    private boolean isCellDateFormatted(HSSFCell cell) {
        short format = cell.getCellStyle().getDataFormat();

        if (HSSFDateUtil.isValidExcelDate(cell.getNumericCellValue())) {
            if (HSSFDateUtil.isCellDateFormatted(cell)) {
                return true;
            } else {
                String fmtText = mDataFormat.getFormat(format);

                if (fmtText != null) {
                    fmtText = fmtText.toLowerCase();

                    if (fmtText.indexOf("d") >= 0 || fmtText.indexOf("m") >= 0 || fmtText.indexOf("y") >= 0
                            || fmtText.indexOf("h") >= 0 || fmtText.indexOf("s") >= 0) {
                        return true;
                    }
                }
            }
        }

        return false;
    }

    /**
     * It will replace all occurances of pattern in the source with replacement value
     * 
     * @return String
     */
    public static String replace(String source, String pattern, String replacement) {
        // Check whether the pattern occurs in the source at all
        int firstPatternPos = source.indexOf(pattern);
        if (firstPatternPos == -1) {
            // The pattern does not occur in the source -> return the source
            return source;
        }

        // Build a new String where pattern is replaced by the replacement
        StringBuffer target = new StringBuffer(source.length());
        int start = 0; // The start of a part without the pattern
        int end = firstPatternPos; // The end of a part without the pattern
        do {
            target.append(source.substring(start, end));
            target.append(replacement);
            start = end + pattern.length();
        } while ((end = source.indexOf(pattern, start)) != -1);
        target.append(source.substring(start, source.length()));

        // return the String
        return target.toString();
    }
}