org.apache.hadoop.hive.cassandra.CassandraPushdownPredicate.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.cassandra.CassandraPushdownPredicate.java
Source

/**
 * Licensed to Tuplejump Software Pvt. Ltd. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Tuplejump Software Pvt. Ltd. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.cassandra;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.TypeParser;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.exceptions.SyntaxException;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.Hex;
import org.apache.hadoop.hive.cassandra.serde.AbstractCassandraSerDe;
import org.apache.hadoop.hive.ql.exec.ExprNodeConstantEvaluator;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.udf.generic.*;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyCassandraUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class CassandraPushdownPredicate {

    private static final Logger logger = LoggerFactory.getLogger(CassandraPushdownPredicate.class);

    /**
     * Get metadata for the columns which have secondary indexes
     * @param host
     * @param port
     * @param ksName keyspace name
     * @param cfName column family name
     * @return A set of ColumnDefs representing the indexed columns of the cf.
     *         Only the name of the column and its validation class are required,
     *         at the moment, so all other fields are left unset
     * @throws CassandraException if a problem is encountered communicating with Cassandra
     */
    public static Set<ColumnDef> getIndexedColumns(String host, int port, String ksName, String cfName)
            throws CassandraException {
        final CassandraProxyClient client = new CassandraProxyClient(host, port, true, true);
        Set<ColumnDef> indexedColumns = new HashSet<ColumnDef>();
        try {
            KsDef ks = client.getProxyConnection().describe_keyspace(ksName);
            List<CfDef> cfs = ks.getCf_defs();
            CfDef cfDef = null;
            for (CfDef thisCf : cfs) {
                if (thisCf.getName().equalsIgnoreCase(cfName)) {
                    cfDef = thisCf;
                    break;
                }
            }

            List<ColumnDef> columns = cfDef.getColumn_metadata();
            for (ColumnDef thisColumn : columns) {
                if (thisColumn.isSetIndex_type()) {
                    ColumnDef indexed = new ColumnDef();
                    indexed.setName(thisColumn.getName());
                    indexed.setValidation_class(thisColumn.getValidation_class());
                    indexedColumns.add(indexed);
                }
            }
        } catch (TException e) {
            throw new CassandraException(e);
        } catch (InvalidRequestException e) {
            throw new CassandraException(e);
        } catch (NotFoundException e) {
            throw new CassandraException(e);
        }
        return indexedColumns;
    }

    /**
     * Serialize a set of ColumnDefs for indexed columns, so that
     * it can be written to Job configuration
     *
     * @param columns column metadata
     * @return serialized form
     */
    public static String serializeIndexedColumns(Set<ColumnDef> columns) {
        TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
        try {
            List<String> hexStrings = new ArrayList<String>();
            for (ColumnDef column : columns) {
                String encoded = Hex.bytesToHex(serializer.serialize(column));
                logger.info("Encoded column def: " + encoded);
                hexStrings.add(encoded);
            }
            return Joiner.on(AbstractCassandraSerDe.DELIMITER).join(hexStrings);
        } catch (TException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Serialize a set of ColumnDefs for indexed columns, read
     * from Job configuration
     *
     * @param serialized column metadata
     * @return list of column metadata objects which may be empty, but not null
     */
    public static Set<ColumnDef> deserializeIndexedColumns(String serialized) {
        Set<ColumnDef> columns = new HashSet<ColumnDef>();
        if (null == serialized) {
            return columns;
        }

        Iterable<String> strings = Splitter.on(AbstractCassandraSerDe.DELIMITER).omitEmptyStrings().trimResults()
                .split(serialized);
        TDeserializer deserializer = new TDeserializer(new TBinaryProtocol.Factory());
        for (String encoded : strings) {
            ColumnDef column = new ColumnDef();
            try {
                logger.info("Encoded column def: " + encoded);
                deserializer.deserialize(column, Hex.hexToBytes(encoded));
            } catch (TException e) {
                logger.warn("Error deserializing indexed column definition", e);
            }
            if (null == column.getName() || null == column.validation_class) {
                continue;
            }
            columns.add(column);
        }
        return columns;
    }

    /**
     * Given a set of indexed column names, return an IndexPredicateAnalyzer
     *
     * @param indexedColumns names of indexed columns
     * @return IndexPredicateAnalyzer
     */
    public static IndexPredicateAnalyzer newIndexPredicateAnalyzer(Set<ColumnDef> indexedColumns) {
        IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();

        // we only support C*'s set of comparisons = > >= =< <
        analyzer.addComparisonOp(GenericUDFOPEqual.class.getName());
        analyzer.addComparisonOp(GenericUDFOPEqualOrGreaterThan.class.getName());
        analyzer.addComparisonOp(GenericUDFOPGreaterThan.class.getName());
        analyzer.addComparisonOp(GenericUDFOPEqualOrLessThan.class.getName());
        analyzer.addComparisonOp(GenericUDFOPLessThan.class.getName());

        for (ColumnDef column : indexedColumns) {
            analyzer.allowColumnName(new String(column.getName()));
        }

        return analyzer;
    }

    /**
     * An IndexClause in C* must always include at least 1 EQ condition.
     * Validate this constraint is satisified by the list of IndexSearchConditions
     *
     * @return true if there is an EQ operator present, otherwise false
     */
    public static boolean verifySearchConditions(List<IndexSearchCondition> conditions) {
        for (IndexSearchCondition thisCon : conditions) {
            if (thisCon.getComparisonOp().equals(GenericUDFOPEqual.class.getName())) {
                return true;
            }
        }

        return false;
    }

    /**
     * Translate the list of Hive SearchConditions into C* IndexExpressions.
     *
     * @param conditions a list of index search condition
     * @return list of IndexExpressions, which may be empty but not null
     */
    public static List<IndexExpression> translateSearchConditions(List<IndexSearchCondition> conditions,
            Set<ColumnDef> indexedColumns) throws IOException {
        List<IndexExpression> exps = new ArrayList<IndexExpression>();
        for (IndexSearchCondition thisCond : conditions) {
            exps.add(translateSearchCondition(thisCond, indexedColumns));
        }
        return exps;
    }

    private static IndexExpression translateSearchCondition(IndexSearchCondition condition,
            Set<ColumnDef> columnInfos) throws IOException {
        IndexExpression expr = new IndexExpression();
        String columnName = condition.getColumnDesc().getColumn();
        expr.setColumn_name(columnName.getBytes());
        expr.setOp(getIndexOperator(condition.getComparisonOp()));

        ExprNodeConstantEvaluator eval = new ExprNodeConstantEvaluator(condition.getConstantDesc());
        byte[] value;
        try {
            ObjectInspector objInspector = eval.initialize(null);
            Object writable = eval.evaluate(null);
            ByteStream.Output serializeStream = new ByteStream.Output();

            PrimitiveObjectInspector poi = (PrimitiveObjectInspector) objInspector;
            AbstractType validator = getValidator(columnInfos, columnName);
            ByteBuffer bytes = getIndexExpressionValue(condition.getConstantDesc(), poi, writable, validator);
            serializeStream.write(ByteBufferUtil.getArray(bytes));

            value = new byte[serializeStream.getCount()];
            System.arraycopy(serializeStream.getData(), 0, value, 0, serializeStream.getCount());
        } catch (HiveException e) {
            throw new IOException(e);
        }
        expr.setValue(value);
        logger.info("IndexExpression.value : {}", new String(expr.getValue()));
        return expr;
    }

    private static AbstractType getValidator(Set<ColumnDef> columnInfos, String columnName) {
        for (ColumnDef column : columnInfos) {
            if (new String(column.getName()).equals(columnName)) {
                try {
                    return TypeParser.parse(column.validation_class);
                } catch (ConfigurationException e) {
                    logger.error("Error creating validator from string {}", column.validation_class);
                    throw new RuntimeException(e);
                } catch (SyntaxException e) {
                    logger.error("Syntax exception in parsing: \n {}", e.getMessage());
                    throw new RuntimeException(e);
                }
            }
        }
        logger.error("Error finding validator class for column {}", columnName);
        throw new RuntimeException("Error finding validator class for column " + columnName);
    }

    private static ByteBuffer getIndexExpressionValue(ExprNodeConstantDesc constantDesc,
            PrimitiveObjectInspector poi, Object writable, AbstractType validator) {
        logger.info("Primitive Category: {}, Validation class: {}, CassandraType: {}",
                new Object[] { poi.getPrimitiveCategory(), validator.getClass().getName(),
                        LazyCassandraUtils.getCassandraType(poi) });
        switch (poi.getPrimitiveCategory()) {
        case TIMESTAMP:
            String dateString = new java.sql.Date(
                    ((java.sql.Timestamp) poi.getPrimitiveJavaObject(writable)).getTime()).toString();
            return validator.fromString(dateString);
        case BINARY:
            byte[] bytes = ((ByteArrayRef) poi.getPrimitiveJavaObject(writable)).getData();

            // this will only work if the value has been cast using one of the UDFs
            // UDFHexToBytes, UDFUuid, UDFDecimal, UDFVarint
            return ByteBuffer.wrap(bytes);
        default:
            return validator.fromString(constantDesc.getValue().toString());
        }
    }

    private static IndexOperator getIndexOperator(String str) throws IOException {
        if (str.equals(GenericUDFOPEqual.class.getName())) {
            return IndexOperator.EQ;
        } else if (str.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) {
            return IndexOperator.GTE;
        } else if (str.equals(GenericUDFOPGreaterThan.class.getName())) {
            return IndexOperator.GT;
        } else if (str.equals(GenericUDFOPEqualOrLessThan.class.getName())) {
            return IndexOperator.LTE;
        } else if (str.equals(GenericUDFOPLessThan.class.getName())) {
            return IndexOperator.LT;
        } else {
            throw new IOException("Unable to get index operator matches " + str);
        }
    }

}