com.github.dryangkun.hbase.tidx.hive.HBaseSerDe.java Source code

Introduction

Here is the source code for com.github.dryangkun.hbase.tidx.hive.HBaseSerDe.java.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.dryangkun.hbase.tidx.hive;

import com.github.dryangkun.hbase.tidx.TxUtils;
import com.github.dryangkun.hbase.tidx.hive.ColumnMappings.ColumnMapping;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.*;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/**
 * HBaseSerDe can be used to serialize objects into an HBase table and
 * to deserialize objects from an HBase table.
 */
@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
        serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM,
        serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT,
        serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST, serdeConstants.ESCAPE_CHAR,
        serdeConstants.SERIALIZATION_ENCODING, LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS,
        LazySerDeParameters.SERIALIZATION_EXTEND_ADDITIONAL_NESTING_LEVELS, HBaseSerDe.HBASE_COLUMNS_MAPPING,
        HBaseSerDe.HBASE_TABLE_NAME, HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, HBaseSerDe.HBASE_KEY_COL,
        HBaseSerDe.HBASE_PUT_TIMESTAMP, HBaseSerDe.HBASE_COMPOSITE_KEY_CLASS, HBaseSerDe.HBASE_COMPOSITE_KEY_TYPES,
        HBaseSerDe.HBASE_COMPOSITE_KEY_FACTORY, HBaseSerDe.HBASE_STRUCT_SERIALIZER_CLASS,
        HBaseSerDe.HBASE_SCAN_CACHE, HBaseSerDe.HBASE_SCAN_CACHEBLOCKS, HBaseSerDe.HBASE_SCAN_BATCH,
        HBaseSerDe.HBASE_AUTOGENERATE_STRUCT })
public class HBaseSerDe extends AbstractSerDe {
    public static final Log LOG = LogFactory.getLog(HBaseSerDe.class);

    public static final String HBASE_COLUMNS_MAPPING = "hbase.columns.mapping";
    public static final String HBASE_TABLE_NAME = "hbase.table.name";
    public static final String HBASE_TABLE_DEFAULT_STORAGE_TYPE = "hbase.table.default.storage.type";
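    // virtual column names: ":key" maps a Hive column to the HBase row key,
    // ":timestamp" maps one to the cell timestamp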
    public static final String HBASE_KEY_COL = ":key";
    public static final String HBASE_TIMESTAMP_COL = ":timestamp";
    public static final String HBASE_PUT_TIMESTAMP = "hbase.put.timestamp";
    public static final String HBASE_COMPOSITE_KEY_CLASS = "hbase.composite.key.class";
    public static final String HBASE_COMPOSITE_KEY_TYPES = "hbase.composite.key.types";
    public static final String HBASE_COMPOSITE_KEY_FACTORY = "hbase.composite.key.factory";
    public static final String HBASE_STRUCT_SERIALIZER_CLASS = "hbase.struct.serialization.class";
    public static final String HBASE_SCAN_CACHE = "hbase.scan.cache";
    public static final String HBASE_SCAN_CACHEBLOCKS = "hbase.scan.cacheblock";
    public static final String HBASE_SCAN_BATCH = "hbase.scan.batch";
    public static final String HBASE_AUTOGENERATE_STRUCT = "hbase.struct.autogenerate";
    /**
     * Determines whether regex matching should be done on the columns. Defaults to true.
     * <strong>WARNING: currently this only supports the suffix wildcard .*</strong>
     */
    public static final String HBASE_COLUMNS_REGEX_MATCHING = "hbase.columns.mapping.regex.matching";
    /**
     * Defines the type for a column.
     **/
    public static final String SERIALIZATION_TYPE = "serialization.type";

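    // job-conf keys for the tidx time-index integration: TX_HIVE_TIME_COL
    // ("family:qualifier") is consumed by getTxTimeColumnIndex below; the
    // other keys are read elsewhere in the tidx code paths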
    public static final String TX_HIVE_TIME_COL = "tx.hive.time.col";
    public static final String TX_HIVE_PHOENIX_INDEX_ID = "tx.hive.pidx.id";
    public static final String TX_HIVE_TIME_CHECK = "tx.hive.tcheck";
    public static final String TX_HIVE_INDEX_SCAN = "tx.hive.index.scan";

    private ObjectInspector cachedObjectInspector;
    private LazyHBaseRow cachedHBaseRow;

    private HBaseSerDeParameters serdeParams;
    private HBaseRowSerializer serializer;

    @Override
    public String toString() {
        return getClass() + "[" + serdeParams + "]";
    }

    public HBaseSerDe() throws SerDeException {
    }

    /**
     * Initialize the SerDe from the given configuration and table properties.
     * @see SerDe#initialize(Configuration, Properties)
     */
    @Override
    public void initialize(Configuration conf, Properties tbl) throws SerDeException {
        serdeParams = new HBaseSerDeParameters(conf, tbl, getClass().getName());

        cachedObjectInspector = HBaseLazyObjectFactory.createLazyHBaseStructInspector(serdeParams.getSerdeParams(),
                serdeParams.getKeyIndex(), serdeParams.getKeyFactory(), serdeParams.getValueFactories());

        cachedHBaseRow = new LazyHBaseRow((LazySimpleStructObjectInspector) cachedObjectInspector, serdeParams);

        serializer = new HBaseRowSerializer(serdeParams);

        if (LOG.isDebugEnabled()) {
            LOG.debug("HBaseSerDe initialized with : " + serdeParams);
        }
    }

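    /**
     * Looks up the position of the tx time column in the column mappings.
     * The column is named by the {@code tx.hive.time.col} job-conf entry in
     * {@code family:qualifier} form; returns -1 if the entry is absent or no
     * mapped column matches.
     */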
    public static int getTxTimeColumnIndex(ColumnMappings columnMappings, JobConf jobConf) throws SerDeException {
        String timeColumn = jobConf.get(TX_HIVE_TIME_COL);
        if (TxUtils.isEmpty(timeColumn)) {
            LOG.warn(TX_HIVE_TIME_COL + " is empty in job conf");
            return -1;
        }

        String[] items = timeColumn.split(":", 2);
        if (items.length != 2) {
            throw new SerDeException(TX_HIVE_TIME_COL + "=" + timeColumn + " is invalid (expected family:qualifier)");
        }
        byte[] timeFamily = Bytes.toBytes(items[0]);
        byte[] timeQualifier = Bytes.toBytes(items[1]);

        ColumnMapping[] columns = columnMappings.getColumnsMapping();
        for (int i = 0; i < columns.length; i++) {
            ColumnMapping columnMapping = columns[i];
            if (Bytes.equals(timeFamily, columnMapping.familyNameBytes)
                    && Bytes.equals(timeQualifier, columnMapping.qualifierNameBytes)) {
                return i;
            }
        }
        return -1;
    }

    public static ColumnMappings parseColumnsMapping(String columnsMappingSpec) throws SerDeException {
        return parseColumnsMapping(columnsMappingSpec, true);
    }

    /**
     * Parses the HBase columns mapping specifier to identify the column families and
     * qualifiers, and caches the byte arrays corresponding to them. One of the Hive table
     * columns maps to the HBase row key; by default, the first column.
     *
     * @param columnsMappingSpec the hbase.columns.mapping value specified when creating the table
     * @param doColumnRegexMatching whether to do regex matching on the columns or not
     * @return ColumnMappings containing the column mapping information by position
     * @throws org.apache.hadoop.hive.serde2.SerDeException if the mapping specifier is malformed
     */
    public static ColumnMappings parseColumnsMapping(String columnsMappingSpec, boolean doColumnRegexMatching)
            throws SerDeException {

        if (columnsMappingSpec == null) {
            throw new SerDeException("Error: hbase.columns.mapping missing for this HBase table.");
        }

        if (columnsMappingSpec.isEmpty() || columnsMappingSpec.equals(HBASE_KEY_COL)) {
            throw new SerDeException("Error: hbase.columns.mapping specifies only the HBase table"
                    + " row key. A valid Hive-HBase table must specify at least one additional column.");
        }

        int rowKeyIndex = -1;
        int timestampIndex = -1;
        List<ColumnMapping> columnsMapping = new ArrayList<ColumnMapping>();
        String[] columnSpecs = columnsMappingSpec.split(",");

        for (int i = 0; i < columnSpecs.length; i++) {
            String mappingSpec = columnSpecs[i].trim();
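            // a spec may carry an optional storage-type suffix after '#'
            // (e.g. "cf:q#b" for binary); only the part before '#' names the column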
            String[] mapInfo = mappingSpec.split("#");
            String colInfo = mapInfo[0];

            int idxFirst = colInfo.indexOf(":");
            int idxLast = colInfo.lastIndexOf(":");

            if (idxFirst < 0 || idxFirst != idxLast) {
                throw new SerDeException("Error: the HBase columns mapping contains a badly formed "
                        + "column family, column qualifier specification.");
            }

            ColumnMapping columnMapping = new ColumnMapping();

            if (colInfo.equals(HBASE_KEY_COL)) {
                rowKeyIndex = i;
                columnMapping.familyName = colInfo;
                columnMapping.familyNameBytes = Bytes.toBytes(colInfo);
                columnMapping.qualifierName = null;
                columnMapping.qualifierNameBytes = null;
                columnMapping.hbaseRowKey = true;
            } else if (colInfo.equals(HBASE_TIMESTAMP_COL)) {
                timestampIndex = i;
                columnMapping.familyName = colInfo;
                columnMapping.familyNameBytes = Bytes.toBytes(colInfo);
                columnMapping.qualifierName = null;
                columnMapping.qualifierNameBytes = null;
                columnMapping.hbaseTimestamp = true;
            } else {
                String[] parts = colInfo.split(":");
                assert (parts.length > 0 && parts.length <= 2);
                columnMapping.familyName = parts[0];
                columnMapping.familyNameBytes = Bytes.toBytes(parts[0]);
                columnMapping.hbaseRowKey = false;
                columnMapping.hbaseTimestamp = false;

                if (parts.length == 2) {

                    if (doColumnRegexMatching && parts[1].endsWith(".*")) {
                        // we have a prefix with a wildcard
                        columnMapping.qualifierPrefix = parts[1].substring(0, parts[1].length() - 2);
                        columnMapping.qualifierPrefixBytes = Bytes.toBytes(columnMapping.qualifierPrefix);
                        // no concrete qualifier name was provided; set these to null
                        columnMapping.qualifierName = null;
                        columnMapping.qualifierNameBytes = null;
                    } else {
                        // set the regular provided qualifier names
                        columnMapping.qualifierName = parts[1];
                        columnMapping.qualifierNameBytes = Bytes.toBytes(parts[1]);
                    }
                } else {
                    columnMapping.qualifierName = null;
                    columnMapping.qualifierNameBytes = null;
                }
            }

            columnMapping.mappingSpec = mappingSpec;

            columnsMapping.add(columnMapping);
        }

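        // if the mapping never named :key explicitly, prepend a synthetic :key
        // entry so the first Hive column becomes the HBase row key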
        if (rowKeyIndex == -1) {
            rowKeyIndex = 0;
            ColumnMapping columnMapping = new ColumnMapping();
            columnMapping.familyName = HBaseSerDe.HBASE_KEY_COL;
            columnMapping.familyNameBytes = Bytes.toBytes(HBaseSerDe.HBASE_KEY_COL);
            columnMapping.qualifierName = null;
            columnMapping.qualifierNameBytes = null;
            columnMapping.hbaseRowKey = true;
            columnMapping.mappingSpec = HBaseSerDe.HBASE_KEY_COL;
            columnsMapping.add(0, columnMapping);
        }

        return new ColumnMappings(columnsMapping, rowKeyIndex, timestampIndex);
    }

    public LazySerDeParameters getSerdeParams() {
        return serdeParams.getSerdeParams();
    }

    public HBaseSerDeParameters getHBaseSerdeParam() {
        return serdeParams;
    }

    /**
     * Deserialize a row from the HBase Result writable to a LazyObject
     * @param result the HBase Result Writable containing the row
     * @return the deserialized object
     * @see SerDe#deserialize(Writable)
     */
    @Override
    public Object deserialize(Writable result) throws SerDeException {
        if (!(result instanceof ResultWritable)) {
            throw new SerDeException(getClass().getName() + ": expects ResultWritable!");
        }

        cachedHBaseRow.init(((ResultWritable) result).getResult());

        return cachedHBaseRow;
    }

    @Override
    public ObjectInspector getObjectInspector() throws SerDeException {
        return cachedObjectInspector;
    }

    @Override
    public Class<? extends Writable> getSerializedClass() {
        return PutWritable.class;
    }

    @Override
    public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
        try {
            return serializer.serialize(obj, objInspector);
        } catch (SerDeException e) {
            throw e;
        } catch (Exception e) {
            throw new SerDeException(e);
        }
    }

    @Override
    public SerDeStats getSerDeStats() {
        // no support for statistics
        return null;
    }

    public HBaseKeyFactory getKeyFactory() {
        return serdeParams.getKeyFactory();
    }

    public static void configureJobConf(TableDesc tableDesc, JobConf jobConf) throws Exception {
        HBaseSerDeParameters serdeParams = new HBaseSerDeParameters(jobConf, tableDesc.getProperties(),
                HBaseSerDe.class.getName());
        serdeParams.getKeyFactory().configureJobConf(tableDesc, jobConf);
    }
}
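
Example

A minimal sketch of exercising the column-mapping parser above, placed in the same package so the ColumnMapping fields used here are visible. The class name ColumnsMappingDemo is hypothetical, and the Hive and HBase dependencies are assumed to be on the classpath.

package com.github.dryangkun.hbase.tidx.hive;

import com.github.dryangkun.hbase.tidx.hive.ColumnMappings.ColumnMapping;

public class ColumnsMappingDemo {
    public static void main(String[] args) throws Exception {
        // ":key" maps the first Hive column to the row key, "cf:a" is a plain
        // qualifier, and "cf:b.*" is a wildcard qualifier prefix (regex matching on)
        ColumnMappings mappings =
                HBaseSerDe.parseColumnsMapping(":key,cf:a,cf:b.*", true);

        for (ColumnMapping m : mappings.getColumnsMapping()) {
            System.out.println(m.mappingSpec
                    + " -> family=" + m.familyName
                    + ", qualifier=" + m.qualifierName
                    + ", qualifierPrefix=" + m.qualifierPrefix
                    + (m.hbaseRowKey ? " (row key)" : ""));
        }
    }
}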