com.willetinc.hadoop.mapreduce.dynamodb.BinarySplitter.java Source code

Introduction

Here is the source code for com.willetinc.hadoop.mapreduce.dynamodb.BinarySplitter.java
Source

/**
 * Copyright 2012 Willet Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.willetinc.hadoop.mapreduce.dynamodb;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;

import com.amazonaws.services.dynamodb.model.AttributeValue;
import com.amazonaws.services.dynamodb.model.ComparisonOperator;

/**
 * This method needs to determine the splits between two user-provided byte
 * arrays. In the case where the user's bytes are 0x0 and 0xF, this is not hard;
 * we could create two splits from [0x0, 0x7] and [0x8, 0xF], 16 splits for
 * bytes.
 * 
 * If a user has provided us with the byte arrays "[0xD, 0xA, 0xD]" and [0xD,
 * 0xA, 0xB], however, we need to create splits that differ in the third byte.
 * 
 * The algorithm used is as follows: Since there are 16 values per bit, we
 * interpret byts as digits in base 16. Given a byte array b containing bytes
 * b_0, b_1 .. b_n, we interpret the string as the number: 0.b_0 b_1 b_2.. b_n
 * in base 16. Having mapped the low and high strings into floating-point
 * values, we then use the BigDecimalSplitter to establish the even split
 * points, then map the resulting floating point values back into byte arrays.
 */
public class BinarySplitter extends BigDecimalSplitter {

    private static final Log LOG = LogFactory.getLog(BinarySplitter.class);

    private final static int MAX_BYTES = 16;

    private final static BigDecimal ONE_PLACE = new BigDecimal(16);

    @Override
    void generateRangeKeySplits(Configuration conf, List<InputSplit> splits, Types hashKeyType,
            AttributeValue hashKeyValue, Types rangeKeyType, AttributeValue minRangeKeyValue,
            AttributeValue maxRangeKeyValue, int numRangeSplits) {

        byte[] minBytes = minRangeKeyValue.getB().array();
        byte[] maxBytes = maxRangeKeyValue.getB().array();

        // If there is a common prefix between minString and maxString,
        // establish it
        // and pull it out of minString and maxString.
        int maxPrefixLen = Math.min(minBytes.length, maxBytes.length);
        int sharedLen;
        for (sharedLen = 0; sharedLen < maxPrefixLen; sharedLen++) {
            byte b1 = minBytes[sharedLen];
            byte b2 = maxBytes[sharedLen];
            if (b1 != b2) {
                break;
            }
        }

        // The common prefix has length 'sharedLen'. Extract it from both.
        byte[] commonPrefix = Arrays.copyOfRange(minBytes, 0, sharedLen);
        minBytes = Arrays.copyOfRange(minBytes, sharedLen, minBytes.length);
        maxBytes = Arrays.copyOfRange(maxBytes, sharedLen, maxBytes.length);

        List<BigDecimal> splitValues = split(numRangeSplits, minBytes, maxBytes);

        // Convert the list of split point strings into an actual set of
        // InputSplits.
        byte[] start = ArrayUtils.addAll(commonPrefix, bigDecimalToByteArray(splitValues.get(0), MAX_BYTES));
        for (int i = 1; i < splitValues.size(); i++) {
            byte[] end = ArrayUtils.addAll(commonPrefix, bigDecimalToByteArray(splitValues.get(i), MAX_BYTES));

            //if (compareBytes(start, end) >= 0)
            //   continue;

            List<AttributeValue> rangeKeyValues = new ArrayList<AttributeValue>();
            rangeKeyValues.add(new AttributeValue().withB(ByteBuffer.wrap(start)));
            rangeKeyValues.add(new AttributeValue().withB(ByteBuffer.wrap(end)));

            splits.add(new DynamoDBQueryInputFormat.DynamoDBQueryInputSplit(hashKeyType, hashKeyValue, rangeKeyType,
                    rangeKeyValues, ComparisonOperator.BETWEEN));

            start = ArrayUtils.addAll(end, new byte[] { 0x0 });
        }
    }

    public static int compareBytes(byte[] a, byte[] b) {
        for (int i = 0; i < a.length && i < b.length; i++) {
            if (a[i] < b[i]) {
                return -1;
            } else if (a[i] > b[i]) {
                return 1;
            }
        }

        return (a.length < b.length) ? -1 : (a.length > b.length) ? 1 : 0;
    }

    List<BigDecimal> split(int numSplits, byte[] minBytes, byte[] maxBytes) {

        BigDecimal minVal = byteArrayToBigDecimal(minBytes, MAX_BYTES);
        BigDecimal maxVal = byteArrayToBigDecimal(maxBytes, MAX_BYTES);

        List<BigDecimal> splitPoints = split(new BigDecimal(numSplits), minVal, maxVal);
        List<BigDecimal> splitValues = new ArrayList<BigDecimal>();

        // Convert the BigDecimal splitPoints into their string representations.
        for (BigDecimal bd : splitPoints) {
            splitValues.add(bd);
        }

        // Make sure that our user-specified boundaries are the first and last
        // entries in the array.
        if (splitValues.size() == 0 || (0 != splitValues.get(0).compareTo(minVal))) {
            splitValues.add(0, minVal);
        }
        if (splitValues.size() == 1 || (0 != splitValues.get(splitValues.size() - 1).compareTo(maxVal))) {
            splitValues.add(maxVal);
        }

        return splitValues;
    }

    /**
     * Return a BigDecimal representation of byte[] array suitable for use in a
     * numerically-sorting order.
     */
    static BigDecimal byteArrayToBigDecimal(byte[] array, int maxBytes) {
        BigDecimal result = BigDecimal.ZERO;
        BigDecimal curPlace = ONE_PLACE; // start with 1/16 to compute the
        // first digit.

        int len = Math.min(array.length, maxBytes);

        for (int i = 0; i < len; i++) {
            byte codePoint = array[i];
            result = result.add(tryDivide(new BigDecimal(codePoint), curPlace));
            // advance to the next less significant place. e.g., 1/(16^2) for
            // the second char.
            curPlace = curPlace.multiply(ONE_PLACE);
        }

        return result;
    }

    /**
     * Return the string encoded in a BigDecimal. Repeatedly multiply the input
     * value by 16; the integer portion after such a multiplication represents a
     * single character in base 16. Convert that back into a char and create a
     * string out of these until we have no data left.
     * 
     * @throws IOException
     */
    static byte[] bigDecimalToByteArray(BigDecimal bd, int maxBytes) {
        BigDecimal cur = bd.stripTrailingZeros();
        ByteArrayOutputStream sb = new ByteArrayOutputStream();

        try {
            byte[] curCodePoint = new byte[1];
            for (int numConverted = 0; numConverted < maxBytes; numConverted++) {
                cur = cur.multiply(ONE_PLACE);
                curCodePoint[0] = cur.byteValue();
                if (0x0 == curCodePoint[0]) {
                    break;
                }

                cur = cur.subtract(new BigDecimal(new BigInteger(curCodePoint)));
                sb.write(curCodePoint);
            }
        } catch (IOException e) {
            LOG.error("Error writing byte array", e);
        }

        return sb.toByteArray();
    }

}