co.cask.cdap.data2.dataset2.lib.table.SplitsUtil.java Source code

Introduction

Here is the source code for co.cask.cdap.data2.dataset2.lib.table.SplitsUtil.java
Source

/*
 * Copyright  2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.dataset2.lib.table;

import co.cask.cdap.api.common.Bytes;
import com.google.common.collect.Lists;

import java.util.Collections;
import java.util.List;

/**
 * Provides handy methods for simple table splits calculation.
 * <p>
 * This is copied from old data-fabric, could be improved.
 * <p>
 * NOTE: there's also seem to be a bug: first split should have open start and last one open end... TODO: fix it
 */
public class SplitsUtil {
    // Simplest stateless getSplits method implementation (doesn't use the actual stored data)

    /**
     * If the number of splits is not given, and we have no hints from the table structure (that can be implemented in
     * overriding implementations, though), the primitive getSplits methos will return up to this many splits. Note that
     * we cannot read this number from configuration, because the current OVCTable(Handle) does not pass configuration
     * down into the tables anywhere. See ENG-2395 for the fix.
     */
    static final int DEFAULT_NUMBER_OF_SPLITS = 8;

    /**
     * Simplest possible implementation of getSplits. Takes the given start and end and divides the key space in
     * between into (almost) even partitions, using a long integer approximation of the keys.
     */
    static List<KeyRange> primitiveGetSplits(int numSplits, byte[] start, byte[] stop) {
        // if the range is empty, return no splits
        if (start != null && stop != null && Bytes.compareTo(start, stop) >= 0) {
            return Collections.emptyList();
        }
        if (numSplits <= 0) {
            numSplits = DEFAULT_NUMBER_OF_SPLITS;
        }
        // for simplicity, we construct a long from the begin and end, divide the resulting long range into approximately
        // even splits, and convert the boundaries back to nyte array keys.
        long begin = longForKey(start, false);
        long end = longForKey(stop, true);
        double splitSize = ((double) (end - begin)) / ((double) numSplits);

        // each range will start with the stop key of the previous range.
        // start key of the first range is either the given start, or the least possible key {0};
        List<KeyRange> ranges = Lists.newArrayListWithExpectedSize(numSplits);
        byte[] current = start == null ? new byte[] { 0x00 } : start;
        for (int i = 1; i < numSplits; i++) {
            long bound = begin + (long) (splitSize * i);
            byte[] next = keyForBound(bound);
            // due to rounding and truncation, we may get a bound that is the same as the previous (or if the previous is
            // the start key, less than that). We may also get a bound that exceeds the stop key. In both cases we want to
            // ignore this bound and continue.
            if (Bytes.compareTo(current, next) < 0 && (stop == null || Bytes.compareTo(next, stop) < 0)) {
                ranges.add(new KeyRange(current, next));
                current = next;
            }
        }
        // end of the last range is always the given stop of the range to cover
        ranges.add(new KeyRange(current, stop));

        return ranges;
    }

    // helper method to approximate a row key as a long value. Takes the first 7 bytes from the key and prepends a 0x0;
    // if the key is less than 7 bytes, pads it with zeros to the right.
    static long longForKey(byte[] key, boolean isStop) {
        if (key == null) {
            return isStop ? 0xffffffffffffffL : 0L;
        } else {
            // leading zero helps avoid negative long values for keys beginning with a byte > 0x80
            final byte[] leadingZero = { 0x00 };
            byte[] x;
            if (key.length >= Bytes.SIZEOF_LONG - 1) {
                x = Bytes.add(leadingZero, Bytes.head(key, Bytes.SIZEOF_LONG - 1));
            } else {
                x = Bytes.padTail(Bytes.add(leadingZero, key), Bytes.SIZEOF_LONG - 1 - key.length);
            }
            return Bytes.toLong(x);
        }
    }

    // helper method to convert a long approximation of a long key into a range bound.
    // the following invariant holds: keyForBound(longForKey(key)) == removeTrailingZeros(key).
    // removing the trailing zeros is ok in the context that this is used (only for split bounds)
    // this is called keyForBound on purpose, and not keyForLong.
    static byte[] keyForBound(long value) {
        byte[] bytes = Bytes.tail(Bytes.toBytes(value), Bytes.SIZEOF_LONG - 1);
        int lastNonZero = bytes.length - 1;
        while (lastNonZero > 0 && bytes[lastNonZero] == 0) {
            lastNonZero--;
        }
        return Bytes.head(bytes, lastNonZero + 1);
    }
}