// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.cloudera.impala.catalog;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hive.hbase.HBaseSerDe;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.log4j.Logger;

import com.cloudera.impala.common.Pair;
import com.cloudera.impala.thrift.TCatalogObjectType;
import com.cloudera.impala.thrift.TColumn;
import com.cloudera.impala.thrift.THBaseTable;
import com.cloudera.impala.thrift.TResultSet;
import com.cloudera.impala.thrift.TResultSetMetadata;
import com.cloudera.impala.thrift.TTable;
import com.cloudera.impala.thrift.TTableDescriptor;
import com.cloudera.impala.thrift.TTableType;
import com.cloudera.impala.util.StatsHelper;
import com.cloudera.impala.util.TResultRowBuilder;
import com.google.common.base.Preconditions;

/**
 * Impala representation of HBase table metadata, as loaded from Hive's
 * metastore. This implies that we inherit the metastore's limitations related
 * to HBase, for example the lack of support for composite HBase row keys.
 * We sort the HBase columns (cols) by family/qualifier to simplify the
 * retrieval logic in the backend, since HBase returns data ordered by
 * family/qualifier. This implies that a "SELECT *" query on an HBase table
 * will not return the columns in the order they were declared in the DDL;
 * they will be ordered by family/qualifier.
 */
public class HBaseTable extends Table {
  // Maximum deviation from the average to stop querying more regions
  // to estimate the row count.
  private static final double DELTA_FROM_AVERAGE = 0.15;

  private static final Logger LOG = Logger.getLogger(HBaseTable.class);
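
  // Illustrative consequence of the family/qualifier ordering described in the
  // class comment (hypothetical column names, not from this file): a table
  // declared as (rowkey, colB mapped to cf2:x, colA mapped to cf1:y) returns
  // "SELECT *" results ordered rowkey, colA (cf1:y), colB (cf2:x), i.e. sorted
  // by family/qualifier rather than by declaration order.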
  // Copied from Hive's HBaseStorageHandler.java.
  public static final String DEFAULT_PREFIX = "default.";

  // Number of rows fetched during the row count estimation per region.
  public static final int ROW_COUNT_ESTIMATE_BATCH_SIZE = 10;

  // Minimum number of regions that are checked to estimate the row count.
  private static final int MIN_NUM_REGIONS_TO_CHECK = 5;

  // Column referring to the HBase row key.
  // Hive (including the metastore) currently doesn't support composite HBase keys.
  protected HBaseColumn rowKey_;

  // Name of the table in HBase.
  // 'this.name' is the alias of the HBase table in Hive.
  protected String hbaseTableName_;

  // Input format class for HBase tables read by Hive.
  private static final String HBASE_INPUT_FORMAT =
      "org.apache.hadoop.hive.hbase.HiveHBaseTableInputFormat";

  // Storage handler class for HBase tables read by Hive.
  private static final String HBASE_STORAGE_HANDLER =
      "org.apache.hadoop.hive.hbase.HBaseStorageHandler";

  // Column family of the HBase row key.
  private static final String ROW_KEY_COLUMN_FAMILY = ":key";

  // Keep the conf around.
  private static final Configuration hbaseConf_ = HBaseConfiguration.create();

  private HTable hTable_ = null;

  // Cached column families. Used primarily for speeding up row stats estimation
  // (see CDH-19292).
  private HColumnDescriptor[] columnFamilies_ = null;

  protected HBaseTable(TableId id, org.apache.hadoop.hive.metastore.api.Table msTbl,
      Db db, String name, String owner) {
    super(id, msTbl, db, name, owner);
  }
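
  // Illustrative Hive DDL (assumed, not part of this file) for the kind of table
  // this class models; the storage handler class and the hbase.columns.mapping
  // property are the ones referenced by HBASE_STORAGE_HANDLER and
  // parseColumnMapping() below:
  //   CREATE EXTERNAL TABLE hbase_tbl (key STRING, a STRING, b INT)
  //   STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
  //   WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:a,cf:b#b")
  //   TBLPROPERTIES ("hbase.table.name" = "some_hbase_table");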
  // Parses the column description string into column families and column
  // qualifiers. This is a copy of HBaseSerDe.parseColumnMapping and
  // parseColumnStorageTypes with the parts we don't use removed; the Hive
  // functions are not public.
  // tableDefaultStorageIsBinary - true if the table defaults to binary encoding
  // columnsMappingSpec - input string format describing the table
  // fieldSchemas - input field schema from the metastore table
  // columnFamilies/columnQualifiers/colIsBinaryEncoded - out parameters that will
  // be filled with the column family, column qualifier and encoding for each column.
  private void parseColumnMapping(boolean tableDefaultStorageIsBinary,
      String columnsMappingSpec, List<FieldSchema> fieldSchemas,
      List<String> columnFamilies, List<String> columnQualifiers,
      List<Boolean> colIsBinaryEncoded) throws SerDeException {
    if (columnsMappingSpec == null) {
      throw new SerDeException(
          "Error: hbase.columns.mapping missing for this HBase table.");
    }

    if (columnsMappingSpec.equals("")
        || columnsMappingSpec.equals(HBaseSerDe.HBASE_KEY_COL)) {
      throw new SerDeException("Error: hbase.columns.mapping specifies only "
          + "the HBase table row key. A valid Hive-HBase table must specify at "
          + "least one additional column.");
    }

    int rowKeyIndex = -1;
    String[] columnSpecs = columnsMappingSpec.split(",");
    // If there was an implicit key column mapping, the number of columns
    // (fieldSchemas) will be one more than the number of column mapping specs.
    int fsStartIdxOffset = fieldSchemas.size() - columnSpecs.length;
    if (fsStartIdxOffset != 0 && fsStartIdxOffset != 1) {
      // This should never happen - Hive blocks creating a mismatched table and both
      // Hive and Impala currently block all column-level DDL on HBase tables.
      throw new SerDeException(String.format("Number of entries in "
          + "'hbase.columns.mapping' does not match the number of columns in the "
          + "table: %d != %d (counting the key if implicit)",
          columnSpecs.length, fieldSchemas.size()));
    }

    for (int i = 0; i < columnSpecs.length; ++i) {
      String mappingSpec = columnSpecs[i];
      String[] mapInfo = mappingSpec.split("#");
      // Trim the column info so that serdeproperties with new lines still parse
      // correctly.
      String colInfo = mapInfo[0].trim();
      int idxFirst = colInfo.indexOf(":");
      int idxLast = colInfo.lastIndexOf(":");
      if (idxFirst < 0 || idxFirst != idxLast) {
        throw new SerDeException("Error: the HBase columns mapping contains a "
            + "badly formed column family, column qualifier specification.");
      }

      if (colInfo.equals(HBaseSerDe.HBASE_KEY_COL)) {
        Preconditions.checkState(fsStartIdxOffset == 0);
        rowKeyIndex = i;
        columnFamilies.add(colInfo);
        columnQualifiers.add(null);
      } else {
        String[] parts = colInfo.split(":");
        Preconditions.checkState(parts.length > 0 && parts.length <= 2);
        columnFamilies.add(parts[0]);
        if (parts.length == 2) {
          columnQualifiers.add(parts[1]);
        } else {
          columnQualifiers.add(null);
        }
      }

      // Set the column's binary encoding.
      FieldSchema fieldSchema = fieldSchemas.get(i + fsStartIdxOffset);
      boolean supportsBinaryEncoding = supportsBinaryEncoding(fieldSchema);
      if (mapInfo.length == 1) {
        // There is no column-level storage specification. Use the table storage spec.
        colIsBinaryEncoded.add(tableDefaultStorageIsBinary && supportsBinaryEncoding);
      } else if (mapInfo.length == 2) {
        // There is a storage specification for the column.
        String storageOption = mapInfo[1];
        if (!(storageOption.equals("-") || "string".startsWith(storageOption)
            || "binary".startsWith(storageOption))) {
          throw new SerDeException("Error: A column storage specification is one of"
              + " the following: '-', a prefix of 'string', or a prefix of 'binary'. "
              + storageOption + " is not a valid storage option specification for "
              + fieldSchema.getName());
        }

        boolean isBinaryEncoded = false;
        if ("-".equals(storageOption)) {
          isBinaryEncoded = tableDefaultStorageIsBinary;
        } else if ("binary".startsWith(storageOption)) {
          isBinaryEncoded = true;
        }
        if (isBinaryEncoded && !supportsBinaryEncoding) {
          // Use string encoding and log a warning if the column spec is binary but
          // the column type does not support it.
          // TODO: Hive/HBase does not raise an exception, but should we?
          LOG.warn("Column storage specification for column " + fieldSchema.getName()
              + " is binary but the column type " + fieldSchema.getType()
              + " does not support binary encoding. Falling back to string format.");
          isBinaryEncoded = false;
        }
        colIsBinaryEncoded.add(isBinaryEncoded);
      } else {
        // Error in the storage specification.
        throw new SerDeException("Error: " + HBaseSerDe.HBASE_COLUMNS_MAPPING
            + " storage specification " + mappingSpec + " is not valid for column: "
            + fieldSchema.getName());
      }
    }

    if (rowKeyIndex == -1) {
      columnFamilies.add(0, HBaseSerDe.HBASE_KEY_COL);
      columnQualifiers.add(0, null);
      colIsBinaryEncoded.add(0,
          supportsBinaryEncoding(fieldSchemas.get(0)) && tableDefaultStorageIsBinary);
    }
  }

  private boolean supportsBinaryEncoding(FieldSchema fs) {
    try {
      Type colType = parseColumnType(fs);
      // Only boolean, integer and floating point types can use binary storage.
      return colType.isBoolean() || colType.isIntegerType()
          || colType.isFloatingPointType();
    } catch (TableLoadingException e) {
      return false;
    }
  }
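
  // Worked example for parseColumnMapping() above (illustrative values only):
  // with tableDefaultStorageIsBinary == false, fieldSchemas of
  // (key STRING, a STRING, b INT) and the mapping spec
  //   ":key,cf:a,cf:b#b"
  // the out parameters are filled as:
  //   columnFamilies     = [":key", "cf", "cf"]
  //   columnQualifiers   = [null,   "a",  "b"]
  //   colIsBinaryEncoded = [false,  false, true]
  // ("#b" is a prefix of "binary", and INT supports binary encoding while
  // STRING does not.)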
  /**
   * For HBase tables, we can support tables with columns we don't understand at
   * all (e.g. map) as long as the user does not select those. This is in contrast
   * to HDFS tables, where we typically need to understand all columns to make
   * sense of the file at all.
   */
  @Override
  public void load(Table oldValue, HiveMetaStoreClient client,
      org.apache.hadoop.hive.metastore.api.Table msTbl) throws TableLoadingException {
    Preconditions.checkNotNull(getMetaStoreTable());
    try {
      hbaseTableName_ = getHBaseTableName(getMetaStoreTable());
      hTable_ = new HTable(hbaseConf_, hbaseTableName_);
      columnFamilies_ = null;
      Map<String, String> serdeParams =
          getMetaStoreTable().getSd().getSerdeInfo().getParameters();
      String hbaseColumnsMapping = serdeParams.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
      if (hbaseColumnsMapping == null) {
        throw new MetaException("No hbase.columns.mapping defined in Serde.");
      }

      String hbaseTableDefaultStorageType = getMetaStoreTable().getParameters()
          .get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE);
      boolean tableDefaultStorageIsBinary = false;
      if (hbaseTableDefaultStorageType != null
          && !hbaseTableDefaultStorageType.isEmpty()) {
        if (hbaseTableDefaultStorageType.equalsIgnoreCase("binary")) {
          tableDefaultStorageIsBinary = true;
        } else if (!hbaseTableDefaultStorageType.equalsIgnoreCase("string")) {
          throw new SerDeException("Error: "
              + HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE
              + " parameter must be specified as 'string' or 'binary'; '"
              + hbaseTableDefaultStorageType
              + "' is not a valid specification for this table/serde property.");
        }
      }

      // Parse the HBase column-mapping string.
      List<FieldSchema> fieldSchemas = getMetaStoreTable().getSd().getCols();
      List<String> hbaseColumnFamilies = new ArrayList<String>();
      List<String> hbaseColumnQualifiers = new ArrayList<String>();
      List<Boolean> hbaseColumnBinaryEncodings = new ArrayList<Boolean>();
      parseColumnMapping(tableDefaultStorageIsBinary, hbaseColumnsMapping,
          fieldSchemas, hbaseColumnFamilies, hbaseColumnQualifiers,
          hbaseColumnBinaryEncodings);
      Preconditions.checkState(
          hbaseColumnFamilies.size() == hbaseColumnQualifiers.size());
      Preconditions.checkState(fieldSchemas.size() == hbaseColumnFamilies.size());

      // Populate tmp cols in the order they appear in the Hive metastore.
      // We will reorder the cols below.
      List<HBaseColumn> tmpCols = new ArrayList<HBaseColumn>();
      // Store the key column separately.
      // TODO: Change this to an ArrayList once we support composite row keys.
      HBaseColumn keyCol = null;
      for (int i = 0; i < fieldSchemas.size(); ++i) {
        FieldSchema s = fieldSchemas.get(i);
        Type t = Type.INVALID;
        try {
          t = parseColumnType(s);
        } catch (TableLoadingException e) {
          // Ignore HBase types we don't support yet. We can load the metadata
          // but won't be able to select from it.
        }
        HBaseColumn col = new HBaseColumn(s.getName(), hbaseColumnFamilies.get(i),
            hbaseColumnQualifiers.get(i), hbaseColumnBinaryEncodings.get(i),
            t, s.getComment(), -1);
        // Load column stats from the Hive metastore into col.
        loadColumnStats(col, client);
        if (col.getColumnFamily().equals(ROW_KEY_COLUMN_FAMILY)) {
          // Store the row key column separately from the rest.
          keyCol = col;
        } else {
          tmpCols.add(col);
        }
      }
      Preconditions.checkState(keyCol != null);

      // The backend assumes that the row key column is always first and that the
      // remaining HBase columns are ordered by columnFamily,columnQualifier, so
      // the final position depends on the other mapped HBase columns.
      // Sort the columns and update their positions.
      Collections.sort(tmpCols);
      clearColumns();

      keyCol.setPosition(0);
      addColumn(keyCol);
      // Update the positions of the remaining columns.
      for (int i = 0; i < tmpCols.size(); ++i) {
        HBaseColumn col = tmpCols.get(i);
        col.setPosition(i + 1);
        addColumn(col);
      }

      // Set table stats.
      numRows_ = getRowCount(super.getMetaStoreTable().getParameters());

      // Since we don't support composite HBase row keys yet, all HBase tables
      // have a single clustering col.
      numClusteringCols_ = 1;
    } catch (Exception e) {
      throw new TableLoadingException(
          "Failed to load metadata for HBase table: " + name_, e);
    }
  }
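
  // Illustrative DDL (assumed, not from this file) for the default storage type
  // read in load() above; columns without a per-column "#..." storage suffix
  // then default to binary encoding if their type supports it:
  //   ALTER TABLE hbase_tbl
  //   SET TBLPROPERTIES ("hbase.table.default.storage.type" = "binary");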
  @Override
  protected void loadFromThrift(TTable table) throws TableLoadingException {
    super.loadFromThrift(table);
    try {
      hbaseTableName_ = getHBaseTableName(getMetaStoreTable());
      hTable_ = new HTable(hbaseConf_, hbaseTableName_);
      columnFamilies_ = null;
    } catch (Exception e) {
      throw new TableLoadingException("Failed to load metadata for HBase table "
          + "from thrift table: " + name_, e);
    }
  }

  // This method is completely copied from Hive's HBaseStorageHandler.java.
  private String getHBaseTableName(org.apache.hadoop.hive.metastore.api.Table tbl) {
    // Give preference to TBLPROPERTIES over SERDEPROPERTIES (really we should
    // only use TBLPROPERTIES, so this is just for backwards compatibility with
    // the original specs).
    String tableName = tbl.getParameters().get(HBaseSerDe.HBASE_TABLE_NAME);
    if (tableName == null) {
      tableName =
          tbl.getSd().getSerdeInfo().getParameters().get(HBaseSerDe.HBASE_TABLE_NAME);
    }
    if (tableName == null) {
      tableName = tbl.getDbName() + "." + tbl.getTableName();
      if (tableName.startsWith(DEFAULT_PREFIX)) {
        tableName = tableName.substring(DEFAULT_PREFIX.length());
      }
    }
    return tableName;
  }
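
  // Example of the fallback chain in getHBaseTableName() above (illustrative
  // table names): a Hive table default.t1 with neither TBLPROPERTIES nor
  // SERDEPROPERTIES setting "hbase.table.name" resolves to the HBase table
  // "t1" (the "default." prefix is stripped), while sales.t1 resolves to
  // "sales.t1".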
  /**
   * Estimates the number of rows for a single region and returns a pair with
   * the estimated row count and the estimated size in bytes per row.
   */
  private Pair<Long, Long> getEstimatedRowStatsForRegion(HRegionLocation location,
      boolean isCompressed) throws IOException {
    HRegionInfo info = location.getRegionInfo();

    Scan s = new Scan(info.getStartKey());
    // Get a small sample of rows.
    s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
    // Try and get every version so the row's size can be used to estimate.
    s.setMaxVersions(Short.MAX_VALUE);
    // Don't cache the blocks as we don't think these are necessarily important
    // blocks.
    s.setCacheBlocks(false);
    // Use a regular (non-raw) scan; delete markers are not included in the sample.
    s.setRaw(false);

    ResultScanner rs = hTable_.getScanner(s);
    long currentRowSize = 0;
    long currentRowCount = 0;
    try {
      // Fetch up to ROW_COUNT_ESTIMATE_BATCH_SIZE rows as a representative sample.
      for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; ++i) {
        Result r = rs.next();
        if (r == null) break;
        // Check for empty rows, see IMPALA-1451.
        if (r.isEmpty()) continue;
        ++currentRowCount;
        // To estimate the number of rows we simply use the amount of bytes
        // returned from the underlying buffer. Since HBase internally works
        // with these structures as well, this gives us OK estimates.
        ImmutableBytesWritable bts = r.getBytes();
        currentRowSize += bts.getSize();
      }
    } finally {
      rs.close();
    }
    // If there are no rows then there is no need to estimate.
    if (currentRowCount == 0) return new Pair<Long, Long>(0L, 0L);
    // Get the size on HDFS.
    long currentHdfsSize = getHdfsSize(info);
    // Estimate the number of rows.
    double bytesPerRow = currentRowSize / (double) currentRowCount;
    // A compression factor of two is only a best-effort guess.
    long estimatedRowCount =
        (long) ((isCompressed ? 2 : 1) * (currentHdfsSize / bytesPerRow));
    return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
  }

  /**
   * Gets an estimate of the number of rows and bytes per row in the regions
   * between startRowKey and endRowKey.
   *
   * This number is calculated by incrementally checking as many region servers as
   * necessary until we observe a relatively constant row size per region on
   * average. Depending on the skew of data in the regions this can either mean
   * that we need to check only a minimal number of regions or that we will scan
   * all regions.
   *
   * The accuracy of this number is determined by the number of rows that are
   * written and kept in the memstore and have not been flushed until now. A large
   * number of key-value pairs in the memstore will lead to bad estimates as this
   * number is not reflected in the file size on HDFS that is used to estimate
   * this number.
   *
   * Currently, the algorithm does not consider the case that the key range used
   * as a parameter might be generally of different size than the rest of the
   * region.
   *
   * The values computed here should be cached so that in high-QPS workloads the
   * NameNode is not overwhelmed. This could be done in load(). Synchronized to
   * make sure that only one thread at a time is using the htable.
   *
   * @param startRowKey First row key in the range
   * @param endRowKey Last row key in the range
   * @return The estimated number of rows in the regions between the row keys
   *         (first) and the estimated row size in bytes (second).
   */
  public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey,
      byte[] endRowKey) {
    Preconditions.checkNotNull(startRowKey);
    Preconditions.checkNotNull(endRowKey);

    boolean isCompressed = false;
    long rowCount = 0;
    long rowSize = 0;
    try {
      // Check to see if things are compressed. If they are, we'll estimate a
      // compression factor.
      if (columnFamilies_ == null) {
        columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
      }
      Preconditions.checkNotNull(columnFamilies_);
      for (HColumnDescriptor desc : columnFamilies_) {
        isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
      }

      // Fetch all regions for the key range.
      List<HRegionLocation> locations =
          getRegionsInRange(hTable_, startRowKey, endRowKey);
      Collections.shuffle(locations);
      // The following variables track the number and size of 'rows' in HBase
      // and allow incremental calculation of the average and standard deviation.
      StatsHelper<Long> statsCount = new StatsHelper<Long>();
      StatsHelper<Long> statsSize = new StatsHelper<Long>();

      // Collect stats samples from at least MIN_NUM_REGIONS_TO_CHECK regions,
      // and at most all regions, until the delta is small enough.
      while ((statsSize.count() < MIN_NUM_REGIONS_TO_CHECK
          || statsSize.stddev() > statsSize.mean() * DELTA_FROM_AVERAGE)
          && statsSize.count() < locations.size()) {
        Pair<Long, Long> tmp = getEstimatedRowStatsForRegion(
            locations.get((int) statsCount.count()), isCompressed);
        statsCount.addSample(tmp.first);
        statsSize.addSample(tmp.second);
      }

      rowCount = (long) (getHdfsSize(null) / statsSize.mean());
      rowSize = (long) statsSize.mean();
    } catch (IOException ioe) {
      // Log the exception, but ignore it as this is just an estimate.
      // TODO: Put this into the per-query log.
      LOG.error("Error computing HBase row count estimate", ioe);
      return new Pair<Long, Long>(-1L, -1L);
    }
    return new Pair<Long, Long>(rowCount, rowSize);
  }
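
  // Worked example of the estimate above (illustrative numbers only): if the
  // sampled regions converge on a mean row size of 200 bytes with
  // stddev <= 0.15 * mean (DELTA_FROM_AVERAGE), and the table occupies 2 GB
  // on HDFS, then
  //   rowCount ~= 2 * 1024^3 / 200 ~= 10.7 million rows, rowSize ~= 200 bytes.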
  /**
   * Returns the HDFS size of the given region in bytes. NULL can be passed as
   * a parameter to retrieve the size of the complete table.
   */
  public long getHdfsSize(HRegionInfo info) throws IOException {
    Path tableDir = HTableDescriptor.getTableDir(
        FSUtils.getRootDir(hbaseConf_), Bytes.toBytes(hbaseTableName_));
    FileSystem fs = tableDir.getFileSystem(hbaseConf_);
    if (info != null) {
      Path regionDir = tableDir.suffix("/" + info.getEncodedName());
      return fs.getContentSummary(regionDir).getLength();
    } else {
      return fs.getContentSummary(tableDir).getLength();
    }
  }

  /**
   * Hive returns the columns in order of their declaration for HBase tables.
   */
  @Override
  public ArrayList<Column> getColumnsInHiveOrder() { return getColumns(); }

  @Override
  public TTableDescriptor toThriftDescriptor(Set<Long> referencedPartitions) {
    TTableDescriptor tableDescriptor =
        new TTableDescriptor(id_.asInt(), TTableType.HBASE_TABLE,
            getColumns().size(), numClusteringCols_, hbaseTableName_, db_.getName());
    tableDescriptor.setHbaseTable(getTHBaseTable());
    tableDescriptor.setColNames(getColumnNames());
    return tableDescriptor;
  }

  public String getHBaseTableName() { return hbaseTableName_; }
  public HTable getHTable() { return hTable_; }
  public static Configuration getHBaseConf() { return hbaseConf_; }

  @Override
  public int getNumNodes() {
    // TODO: implement
    return 100;
  }

  @Override
  public TCatalogObjectType getCatalogObjectType() {
    return TCatalogObjectType.TABLE;
  }

  @Override
  public TTable toThrift() {
    TTable table = super.toThrift();
    table.setTable_type(TTableType.HBASE_TABLE);
    table.setHbase_table(getTHBaseTable());
    return table;
  }

  private THBaseTable getTHBaseTable() {
    THBaseTable tHbaseTable = new THBaseTable();
    tHbaseTable.setTableName(hbaseTableName_);
    for (Column c : getColumns()) {
      HBaseColumn hbaseCol = (HBaseColumn) c;
      tHbaseTable.addToFamilies(hbaseCol.getColumnFamily());
      if (hbaseCol.getColumnQualifier() != null) {
        tHbaseTable.addToQualifiers(hbaseCol.getColumnQualifier());
      } else {
        tHbaseTable.addToQualifiers("");
      }
      tHbaseTable.addToBinary_encoded(hbaseCol.isBinaryEncoded());
    }
    return tHbaseTable;
  }
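
  // Shape of the thrift struct built by getTHBaseTable() above (illustrative
  // values): for columns (rowkey, cf:a, cf:b) the THBaseTable carries three
  // parallel lists aligned by column position:
  //   families       = [":key", "cf", "cf"]
  //   qualifiers     = ["",     "a",  "b"]
  //   binary_encoded = [false,  false, true]
  // A null qualifier (the row key) is sent as the empty string.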
  /**
   * This is copied from org.apache.hadoop.hbase.client.HTable. The only
   * difference is that it does not use the cache when calling getRegionLocation.
   * TODO: Remove this function and use HTable.getRegionsInRange when the
   * non-cache version has been ported to CDH (DISTRO-477).
   * Get the corresponding regions for an arbitrary range of keys.
   * <p>
   * @param startKey Starting row in range, inclusive
   * @param endKey Ending row in range, exclusive
   * @return A list of HRegionLocations corresponding to the regions that
   *         contain the specified range
   * @throws IOException if a remote or network exception occurs
   */
  public static List<HRegionLocation> getRegionsInRange(HTable hbaseTbl,
      final byte[] startKey, final byte[] endKey) throws IOException {
    final boolean endKeyIsEndOfTable = Bytes.equals(endKey, HConstants.EMPTY_END_ROW);
    if ((Bytes.compareTo(startKey, endKey) > 0) && !endKeyIsEndOfTable) {
      throw new IllegalArgumentException("Invalid range: "
          + Bytes.toStringBinary(startKey) + " > " + Bytes.toStringBinary(endKey));
    }
    final List<HRegionLocation> regionList = new ArrayList<HRegionLocation>();
    byte[] currentKey = startKey;
    // Make sure only one thread is accessing the hbaseTbl.
    synchronized (hbaseTbl) {
      do {
        // Always reload the region location info.
        HRegionLocation regionLocation = hbaseTbl.getRegionLocation(currentKey, true);
        regionList.add(regionLocation);
        currentKey = regionLocation.getRegionInfo().getEndKey();
      } while (!Bytes.equals(currentKey, HConstants.EMPTY_END_ROW)
          && (endKeyIsEndOfTable || Bytes.compareTo(currentKey, endKey) < 0));
    }
    return regionList;
  }

  /**
   * Returns the input-format class string for HBase tables read by Hive.
   */
  public static String getInputFormat() { return HBASE_INPUT_FORMAT; }

  /**
   * Returns the storage handler class for HBase tables read by Hive.
   */
  @Override
  public String getStorageHandlerClassName() { return HBASE_STORAGE_HANDLER; }

  /**
   * Returns statistics on this table as a tabular result set. Used for the
   * SHOW TABLE STATS statement. The schema of the returned TResultSet is set
   * inside this method.
   */
  public TResultSet getTableStats() {
    TResultSet result = new TResultSet();
    TResultSetMetadata resultSchema = new TResultSetMetadata();
    result.setSchema(resultSchema);
    resultSchema.addToColumns(new TColumn("Region Location", Type.STRING.toThrift()));
    resultSchema.addToColumns(new TColumn("Start RowKey", Type.STRING.toThrift()));
    resultSchema.addToColumns(new TColumn("Est. #Rows", Type.BIGINT.toThrift()));
    resultSchema.addToColumns(new TColumn("Size", Type.STRING.toThrift()));

    // TODO: Consider fancier stats maintenance techniques for speeding up this
    // process. Currently, we list all regions and perform a mini-scan of each of
    // them to estimate the number of rows, the data size, etc., which is rather
    // expensive.
    try {
      long totalNumRows = 0;
      long totalHdfsSize = 0;
      List<HRegionLocation> regions = HBaseTable.getRegionsInRange(hTable_,
          HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
      for (HRegionLocation region : regions) {
        TResultRowBuilder rowBuilder = new TResultRowBuilder();
        HRegionInfo regionInfo = region.getRegionInfo();
        Pair<Long, Long> estRowStats = getEstimatedRowStatsForRegion(region, false);
        long numRows = estRowStats.first.longValue();
        long hdfsSize = getHdfsSize(regionInfo);
        totalNumRows += numRows;
        totalHdfsSize += hdfsSize;

        // Add the region location, start row key, number of rows and raw HDFS size.
        rowBuilder.add(String.valueOf(region.getHostname()))
            .add(Bytes.toString(regionInfo.getStartKey()))
            .add(numRows)
            .addBytes(hdfsSize);
        result.addToRows(rowBuilder.get());
      }

      // Total number of rows and raw HDFS size.
      if (regions.size() > 1) {
        TResultRowBuilder rowBuilder = new TResultRowBuilder();
        rowBuilder.add("Total").add("").add(totalNumRows).addBytes(totalHdfsSize);
        result.addToRows(rowBuilder.get());
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return result;
  }
}
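
// Example output of SHOW TABLE STATS for a two-region HBase table, as produced
// by getTableStats() above (illustrative hostnames, counts and formatting):
//   Region Location   Start RowKey   Est. #Rows   Size
//   host-1                           1204000      240.5MB
//   host-2            k500000        1198000      239.1MB
//   Total                            2402000      479.6MB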