com.ask.hive.hbase.PentahoTableCustomInputFormat.java Source code

Introduction

Here is the source code for com.ask.hive.hbase.PentahoTableCustomInputFormat.java.

Source

/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2012 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package com.ask.hive.hbase;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapred.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;

/**
 * Extends the mapred TableInputFormat and adds the ability to specify 
 * the table to read from via a property (rather than abusing the input
 * path). Also adds more configuration properties (like those in the 
 * mapreduce package's implementation).<p>
 * 
 * The following properties can be set in Pentaho MR job to configure the 
 * split:<br><br>
 * 
 * <code>
 * hbase.mapred.inputtable // name of the HBase table to read from
 * hbase.mapred.tablecolumns // space delimited list of columns in ColFam:ColName format (ColName can be omitted to read all columns from a family)
 * hbase.mapreduce.scan.cachedrows // number of rows for caching that will be passed to scanners
 * hbase.mapreduce.scan.timestamp // timestamp used to filter columns with a specific time stamp
 * hbase.mapreduce.scan.timerange.start // starting timestamp to filter in a given timestamp range
 * hbase.mapreduce.scan.timerange.end // end timestamp to filter in a given timestamp range
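 * hbase.include.filter.value // comma-separated ColFam:ColName:value triples; each becomes a SingleColumnValueFilter (EQUAL)
 * hbase.exclude.filter.value // comma-separated ColFam:ColName:value triples; each becomes a SingleColumnValueExcludeFilter (NOT_EQUAL)
 * hbase.max.version // maximum number of cell versions passed through to the record reader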
 * </code>
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 */
public class PentahoTableCustomInputFormat extends TableInputFormat {

    // Note that the hbase.mapred.tablecolumns property is inherited
    // from TableInputFormat. This property expects a space-delimited list
    // of column names to read in the format "ColumnFamily:ColumnName". The
    // ColumnName may be omitted in order to read *all* columns from the 
    // specified family

    /** The name of the table to read from */
    public static final String INPUT_TABLE = "hbase.mapred.inputtable";

    /** The number of rows (integer) for caching that will be passed to scanners. */
    public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";

    /** The timestamp (long) used to filter columns with a specific timestamp. */
    public static final String SCAN_TIMESTAMP = "hbase.mapreduce.scan.timestamp";

    /** The starting timestamp (long) used to filter columns with a specific range of versions. */
    public static final String SCAN_TIMERANGE_START = "hbase.mapreduce.scan.timerange.start";

    /** The ending timestamp (long) used to filter columns with a specific range of versions. */
    public static final String SCAN_TIMERANGE_END = "hbase.mapreduce.scan.timerange.end";

    /** Comma-separated ColFam:ColName:value triples; each becomes a SingleColumnValueExcludeFilter (NOT_EQUAL). */
    public static final String SCAN_EXCLUDE_FILTER = "hbase.exclude.filter.value";

    /** Comma-separated ColFam:ColName:value triples; each becomes a SingleColumnValueFilter (EQUAL). */
    public static final String SCAN_INCLUDE_FILTER = "hbase.include.filter.value";

    protected final Log PLOG = LogFactory.getLog(PentahoTableCustomInputFormat.class);

    public void configure(JobConf job) {

        String tableName = job.get(INPUT_TABLE);

        // columns can be colFam:colName or colFam:
        // the latter can be used to set up a scan that reads all columns from the family
        String colArg = job.get(COLUMN_LIST);

        if (StringUtils.isNotEmpty(colArg)) {
            String[] colNames = colArg.split(" ");
            byte[][] m_cols = new byte[colNames.length][];
            for (int i = 0; i < m_cols.length; i++) {
                String colN = colNames[i];
                m_cols[i] = Bytes.toBytes(colN);
            }
            setInputColumns(m_cols);
        }

        try {
            setHTable(new HTable(HBaseConfiguration.create(job), tableName));
        } catch (Exception e) {
            PLOG.error("Unable to connect to table '" + tableName + "'", e);
        }

        // set our table record reader
        PentahoTableRecordReader rr = new PentahoTableRecordReader();
        String cacheSize = job.get(SCAN_CACHEDROWS);
        if (StringUtils.isNotEmpty(cacheSize)) {
            rr.setScanCacheRowSize(Integer.parseInt(cacheSize));
        }
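        // build a filter list with AND semantics: a row must pass every include/exclude filter added below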
        FilterList list = new FilterList(FilterList.Operator.MUST_PASS_ALL);
        String hvalue = job.get(SCAN_INCLUDE_FILTER);
        if (StringUtils.isNotEmpty(hvalue)) {
            String[] columns = hvalue.split(",");
            if (columns.length > 0) {
                for (String column : columns) {
                    String[] fv = column.split(":");
                    if (fv.length > 2) {
                        SingleColumnValueFilter rowfilter = new SingleColumnValueFilter(Bytes.toBytes(fv[0]),
                                Bytes.toBytes(fv[1]), CompareOp.EQUAL, Bytes.toBytes(fv[2]));
                        list.addFilter(rowfilter);
                    }
                }
            }
        }
        String hexvalue = job.get(SCAN_EXCLUDE_FILTER);
        if (StringUtils.isNotEmpty(hexvalue)) {
            String[] columns = hexvalue.split(",");
            if (columns.length > 0) {
                for (String column : columns) {
                    String[] fv = column.split(":");
                    if (fv.length > 2) {
                        SingleColumnValueExcludeFilter rowfilter = new SingleColumnValueExcludeFilter(
                                Bytes.toBytes(fv[0]), Bytes.toBytes(fv[1]), CompareOp.NOT_EQUAL,
                                Bytes.toBytes(fv[2]));
                        list.addFilter(rowfilter);
                    }
                }
            }
        }
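        // optional limit on the number of cell versions, passed through to the record reader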
        String hmax = job.get("hbase.max.version");
        if (StringUtils.isNotEmpty(hmax)) {
            rr.setMaxVersion(Integer.parseInt(hmax));
        }
        rr.setValueFilter(list);
        setTableRecordReader(rr);
    }

    public void validateInput(JobConf job) throws IOException {
        // expecting a table name
        String tableName = job.get(INPUT_TABLE);
        if (StringUtils.isEmpty(tableName)) {
            throw new IOException("expecting one table name");
        }

        // connected to table?                                                                                                                             
        if (getHTable() == null) {
            throw new IOException("could not connect to table '" + tableName + "'");
        }

        // expecting at least one column/column family                                                                                                                   
        String colArg = job.get(COLUMN_LIST);
        if (StringUtils.isEmpty(colArg)) {
            throw new IOException("expecting at least one column/column family");
        }
    }
}
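
Below is a minimal usage sketch (not part of the original listing) showing how the configuration properties read by configure() might be set on a JobConf before submitting a Pentaho MR job. The class name, table name, column list, and filter values are hypothetical placeholders.

package com.ask.hive.hbase;

import org.apache.hadoop.mapred.JobConf;

public class PentahoTableCustomInputFormatExample {

    public static void main(String[] args) {
        JobConf job = new JobConf();

        // table to read and a space-delimited column list (ColFam:ColName, or ColFam: for a whole family)
        job.set(PentahoTableCustomInputFormat.INPUT_TABLE, "web_logs");              // hypothetical table
        job.set("hbase.mapred.tablecolumns", "details:status details:referrer");     // hypothetical columns

        // scanner caching and an optional cap on cell versions
        job.set(PentahoTableCustomInputFormat.SCAN_CACHEDROWS, "500");
        job.set("hbase.max.version", "1");

        // include/exclude filters as comma-separated ColFam:ColName:value triples
        job.set(PentahoTableCustomInputFormat.SCAN_INCLUDE_FILTER, "details:status:200");
        job.set(PentahoTableCustomInputFormat.SCAN_EXCLUDE_FILTER, "details:agent:robot");

        // use the custom input format so configure() above is invoked with these settings
        job.setInputFormat(PentahoTableCustomInputFormat.class);
    }
}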