com.bah.culvert.hive.CulvertInputFormat.java Source code

Introduction

Here is the source code for com.bah.culvert.hive.CulvertInputFormat.java
Source

/**
 * Copyright 2011 Booz Allen Hamilton.
 * 
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership. Booz Allen Hamilton
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bah.culvert.hive;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.hive.ql.exec.ExprNodeConstantEvaluator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import com.bah.culvert.Client;
import com.bah.culvert.constraints.And;
import com.bah.culvert.constraints.Constraint;
import com.bah.culvert.constraints.IndexRangeConstraint;
import com.bah.culvert.data.CRange;
import com.bah.culvert.data.Result;
import com.bah.culvert.data.index.Index;
import com.bah.culvert.util.Bytes;
import com.bah.culvert.util.LexicographicBytesComparator;

@SuppressWarnings("deprecation")
public class CulvertInputFormat implements InputFormat<NullWritable, Result> {

    @Override
    public RecordReader<NullWritable, Result> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
            throws IOException {
        CulvertRecordReader crr = new CulvertRecordReader((CulvertInputSplit) split);
        return crr;
    }

    @Override
    public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
        String expression = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
        if (expression == null) {
            // return what??
        }
        ExprNodeDesc filterExpr = Utilities.deserializeExpression(expression, conf);
        CulvertPredicateHandlerDelegate predicateHandler = new CulvertPredicateHandlerDelegate();
        CulvertIndexMapping[] culvertIndexMappings = CulvertHiveUtils.getIndexMappings(conf);
        /*
         * there should be no residual predicate, just the pushed predicate, since
         * we already handled that in the storage handler.
         */
        List<IndexSearchCondition> indexSearchConditions = predicateHandler.decomposePredicateToList(filterExpr,
                culvertIndexMappings);
        List<Constraint> compiledConstraints = new ArrayList<Constraint>();
        Map<String, CRange> rangeMap = new HashMap<String, CRange>();
        // get start and end ranges from the conditions
        for (IndexSearchCondition condition : indexSearchConditions) {
            /*
             * The hive column is a simple string -- easy This column name will let us
             * get an index from the culvert configuration specified in the conf.
             */
            String hiveColumn = condition.getColumnDesc().getColumn();
            /*
             * the operation maps to the op name specified in
             * CulvertPredicateHandlerDelegate.createPredicateAnalyzerForCulvert()
             */
            String operation = condition.getComparisonOp();
            /*
             * We need to manipuate this quite a bit- we need to turn it into an
             * object and an object inspector, and then use the object inspector to
             * find a byte value for the key.
             */
            ExprNodeConstantEvaluator nodeEvaluator = new ExprNodeConstantEvaluator(condition.getConstantDesc());
            PrimitiveObjectInspector inspector;
            Object value;
            // we can assume that the inspector is a primitive
            try {
                inspector = (PrimitiveObjectInspector) nodeEvaluator.initialize(null);
                value = nodeEvaluator.evaluate(null);
            } catch (HiveException e) {
                throw new RuntimeException(
                        "Unable to get constant portion of query expression expression " + condition, e);
            }
            Object primitive = inspector.getPrimitiveJavaObject(value);
            // look for the storage type in the column mapping
            CulvertIndexMapping culvertIndexMapping = getMapping(hiveColumn, culvertIndexMappings);

            if (culvertIndexMapping == null) {
                throw new IllegalArgumentException("Could not find a culvert mapping for this hive column "
                        + hiveColumn + " out of " + Arrays.toString(culvertIndexMappings));
            }
            // we have a utility to get the byte representation based on the hive
            // column type and the actual primitive itself.
            byte[] culvertValue = CulvertHiveUtils.getBytesForPrimitiveObject(primitive,
                    culvertIndexMapping.getType());
            CRange range = rangeMap.get(hiveColumn);
            if (range == null) {
                // ok, start a new range
                // we know we'll always want to insert for this operation
                if (operation.equals(GenericUDFOPEqual.class.getName())) {
                    // =
                    range = new CRange(culvertValue);
                } else if (operation.equals(GenericUDFOPGreaterThan.class.getName())) {
                    // >
                    range = new CRange(Bytes.lexIncrement(culvertValue), Bytes.START_END_KEY);
                } else if (operation.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) {
                    // =>
                    range = new CRange(culvertValue, Bytes.START_END_KEY);
                } else if (operation.equals(GenericUDFOPEqualOrLessThan.class.getName())) {
                    // =<
                    range = new CRange(Bytes.START_END_KEY, culvertValue);
                }
                /*
                 * TODO we can't implement < because lexDecrement is undefined (in math
                 * speak). Need to work around that by pushing down inclusive/exclusive
                 * on the CRange.
                 */
                rangeMap.put(hiveColumn, range);
            } else {
                LexicographicBytesComparator c = LexicographicBytesComparator.INSTANCE;
                if (operation.equals(GenericUDFOPEqual.class.getName())) {
                    // = (if two conflicting equals, the whole predicate becomes a no-op
                    if (c.compare(range.getStart(), culvertValue) != 0) {
                        rangeMap.clear();
                        return new InputSplit[0];
                    }
                } else if (operation.equals(GenericUDFOPGreaterThan.class.getName())) {
                    // >
                    // narrow the existing range
                    byte[] exclusiveValue = Bytes.lexIncrement(culvertValue);
                    int comp = c.compare(exclusiveValue, range.getEnd());
                    if (comp < 1) {
                        rangeMap.put(hiveColumn, new CRange(Bytes.max(exclusiveValue, range.getStart())));
                    } else if (comp == 0) {
                        // no-op - this is the same as what we're doing already
                    } else {
                        // if we get conflicting statements, kill it all...
                        rangeMap.clear();
                        return new InputSplit[0];
                    }
                } else if (operation.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) {
                    // =>
                    // narrow the existing range
                    int comp = c.compare(culvertValue, range.getEnd());
                    if (comp < 1) {
                        rangeMap.put(hiveColumn, new CRange(Bytes.max(culvertValue, range.getStart())));
                    } else if (comp == 0) {
                        // no-op - this is the same as what we're doing already
                    } else {
                        // if we get conflicting statements, kill it all...
                        rangeMap.clear();
                        return new InputSplit[0];
                    }
                } else if (operation.equals(GenericUDFOPEqualOrLessThan.class.getName())) {
                    // =<
                    int comp = c.compare(culvertValue, range.getStart());
                    if (comp > 1) {
                        rangeMap.put(hiveColumn, new CRange(Bytes.min(culvertValue, range.getEnd())));
                    } else if (comp == 0) {
                        // no-op
                    } else {
                        // if we get conflicting statements, kill it...
                        rangeMap.clear();
                        return new InputSplit[0];
                    }
                }
            }
        }
        // <-- end getting start/end ranges
        Client culvertClient = new Client(conf);

        Index[] culvertIndices = culvertClient.getIndicesForTable(CulvertHiveUtils.getCulvertTable(conf));
        // create a map with the broken up ranges
        SortedMap<String, List<Constraint>> brokenRanges = new TreeMap<String, List<Constraint>>();
        for (Entry<String, CRange> rangeEntry : rangeMap.entrySet()) {
            String hiveColumn = rangeEntry.getKey();
            CRange totalRange = rangeEntry.getValue();
            Index indexToUse = getIndexForHiveColumn(hiveColumn, culvertIndexMappings, culvertIndices);
            /*
             * There should be an index from when we did the predicate pushdown in the
             * storage handler
             */
            if (indexToUse == null) {
                throw new RuntimeException("No indices exist over the requested column");
            }
            /*
             * Set the start and end on the splits to start where the overall range
             * starts and ends so we don't query the entire shard, which would give
             * the wrong result.
             */
            List<CRange> startEnds = indexToUse.getSplits();
            List<Constraint> newRanges = new ArrayList<Constraint>();
            if (startEnds.size() == 1) {
                newRanges.add(new IndexRangeConstraint(indexToUse, totalRange));
                brokenRanges.put(hiveColumn, newRanges);
                continue;
            }
            assert (startEnds.size() != 0);
            CRange firstShard = startEnds.get(0);
            CRange first = new CRange(totalRange.getStart(), firstShard.getEnd());
            newRanges.add(new IndexRangeConstraint(indexToUse, first));
            for (int i = 1; i < startEnds.size() - 1; i++) {
                newRanges.add(new IndexRangeConstraint(indexToUse, startEnds.get(i)));
            }
            CRange lastShard = startEnds.get(startEnds.size() - 1);
            CRange last = new CRange(lastShard.getStart(), totalRange.getEnd());
            newRanges.add(new IndexRangeConstraint(indexToUse, last));
            brokenRanges.put(hiveColumn, newRanges);
        }
        // <-- finished creating fragment map

        /*
         * Finally, take the cross product of all the fragments in order to find the
         * splits.
         */
        List<List<Constraint>> andRangeList = recursiveBuildRangeTuples(brokenRanges);
        InputSplit[] splits = new InputSplit[andRangeList.size()];
        for (int i = 0; i < splits.length; i++) {
            List<Constraint> constraints = andRangeList.get(i);
            Set<String> locations = new HashSet<String>();
            for (Constraint c : constraints) {
                locations.addAll(((IndexRangeConstraint) c).getIndex().getPreferredHosts());
            }
            // add field selection too...
            List<Constraint> fieldSelectingConstraints = new ArrayList<Constraint>(constraints.size());
            // TODO - wrap these constraints in field selecting constraints now that
            // we've pulled the preferred hosts out of them
            splits[i] = new CulvertInputSplit(new And(constraints), locations);
        }
        return null;
    }

    private Index getIndexForHiveColumn(String hiveColumn, CulvertIndexMapping[] culvertIndexMappings,
            Index[] culvertIndices) {
        CulvertIndexMapping culvertIndexMapping = getMapping(hiveColumn, culvertIndexMappings);
        Index indexToUse = null;
        /*
         * iterate the indices to find one to use on the column, in the future we
         * should select indices intelligently
         */
        for (Index ix : culvertIndices) {
            LexicographicBytesComparator c = LexicographicBytesComparator.INSTANCE;
            if (c.compare(ix.getColumnFamily(), culvertIndexMapping.getColumnFamily()) == 0
                    && c.compare(ix.getColumnQualifier(), culvertIndexMapping.getColumnQualifier()) == 0) {
                indexToUse = ix;
            }
        }
        return indexToUse;
    }

    /**
     * Builds the cross product of each element in a map of CRange Lists.
     * 
     * @param brokenRanges
     * @return
     */
    private static List<List<Constraint>> recursiveBuildRangeTuples(
            final SortedMap<String, List<Constraint>> brokenRanges) {
        int size = brokenRanges.size();
        /*
         * Special 0 case - no ranges, then return an empty list. Happens if there's
         * an empty query, but not during normal operation.
         */
        if (size == 0) {
            return new ArrayList<List<Constraint>>();
        }
        /*
         * Special 1 case - is the base case for normal operation (ie. the query
         * isn't empty)
         */
        else if (size == 1) {
            Entry<String, List<Constraint>> entry = brokenRanges.entrySet().iterator().next();
            List<List<Constraint>> rangeList = new ArrayList<List<Constraint>>();
            for (Constraint range : entry.getValue()) {
                ArrayList<Constraint> newList = new ArrayList<Constraint>();
                newList.add(range);
                rangeList.add(newList);
            }
            return rangeList;
        }
        /*
         * General case - we need to do the cross product of the different map
         * elements.
         */
        else {
            List<List<Constraint>> newListList = new ArrayList<List<Constraint>>();
            List<List<Constraint>> nextListList = recursiveBuildRangeTuples(
                    brokenRanges.tailMap(getSecondKey(brokenRanges)));
            for (Constraint range : brokenRanges.entrySet().iterator().next().getValue()) {
                for (List<Constraint> nextList : nextListList) {
                    nextList = new ArrayList<Constraint>(nextList);
                    nextList.add(range);
                    newListList.add(nextList);
                }
            }
            return newListList;
        }
    }

    /**
     * Gets the second key in the keySet of a sortedMap.
     * 
     * @param brokenRanges The sorted map to get the second key of
     * @return The second key in the map, or null if the map has less than two
     *         keys.
     */
    private static String getSecondKey(SortedMap<String, ?> brokenRanges) {
        Iterator<String> it = brokenRanges.keySet().iterator();
        if (it.hasNext())
            it.next();
        else
            return null;
        if (it.hasNext())
            return it.next();
        else
            return null;
    }

    /**
     * Get the index mapping for particular hive column from an array of index
     * mappings.
     * 
     * @param hiveColumn The hive column to look for.
     * @param culvertIndexMappings The index mappings to look through.
     * @return The first index mapping with the name of the hive column.
     */
    private static CulvertIndexMapping getMapping(String hiveColumn, CulvertIndexMapping[] culvertIndexMappings) {
        for (CulvertIndexMapping mapping : culvertIndexMappings) {
            if (mapping.getHiveColumn().equals(hiveColumn)) {
                return mapping;
            }
        }
        throw new RuntimeException("Cannot find culvert mapping for column " + hiveColumn + " in mappings: "
                + Arrays.toString(culvertIndexMappings));
    }
}