edu.uci.ics.hivesterix.serde.lazy.LazyUtils.java Source code

Java tutorial

Introduction

Here is the source code for edu.uci.ics.hivesterix.serde.lazy.LazyUtils.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.uci.ics.hivesterix.serde.lazy;

import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.WritableUtils;

import edu.uci.ics.hivesterix.serde.lazy.objectinspector.LazyObjectInspectorFactory;
import edu.uci.ics.hivesterix.serde.lazy.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * LazyUtils.
 */
public final class LazyUtils {

    /**
     * Convert the byte array to an int starting from the given offset. Refer to
     * code by aeden on DZone Snippets:
     * 
     * @param b
     *            the byte array
     * @param offset
     *            the array offset
     * @return the integer
     */
    public static int byteArrayToInt(byte[] b, int offset) {
        int value = 0;
        for (int i = 0; i < 4; i++) {
            int shift = (4 - 1 - i) * 8;
            value += (b[i + offset] & 0x000000FF) << shift;
        }
        return value;
    }

    /**
     * Convert the byte array to a long starting from the given offset.
     * 
     * @param b
     *            the byte array
     * @param offset
     *            the array offset
     * @return the long
     */
    public static long byteArrayToLong(byte[] b, int offset) {
        long value = 0;
        for (int i = 0; i < 8; i++) {
            int shift = (8 - 1 - i) * 8;
            value += ((long) (b[i + offset] & 0x00000000000000FF)) << shift;
        }
        return value;
    }

    /**
     * Convert the byte array to a short starting from the given offset.
     * 
     * @param b
     *            the byte array
     * @param offset
     *            the array offset
     * @return the short
     */
    public static short byteArrayToShort(byte[] b, int offset) {
        short value = 0;
        value += (b[offset] & 0x000000FF) << 8;
        value += (b[offset + 1] & 0x000000FF);
        return value;
    }

    /**
     * Record is the unit that data is serialized in. A record includes two
     * parts. The first part stores the size of the element and the second part
     * stores the real element. size element record ->
     * |----|-------------------------|
     * A RecordInfo stores two information of a record, the size of the "size"
     * part which is the element offset and the size of the element part which
     * is element size.
     */
    public static class RecordInfo {
        public RecordInfo() {
            elementOffset = 0;
            elementSize = 0;
        }

        public byte elementOffset;
        public int elementSize;

        @Override
        public String toString() {
            return "(" + elementOffset + ", " + elementSize + ")";
        }
    }

    static VInt vInt = new LazyUtils.VInt();

    /**
     * Check a particular field and set its size and offset in bytes based on
     * the field type and the bytes arrays.
     * For void, boolean, byte, short, int, long, float and double, there is no
     * offset and the size is fixed. For string, map, list, struct, the first
     * four bytes are used to store the size. So the offset is 4 and the size is
     * computed by concating the first four bytes together. The first four bytes
     * are defined with respect to the offset in the bytes arrays.
     * 
     * @param objectInspector
     *            object inspector of the field
     * @param bytes
     *            bytes arrays store the table row
     * @param offset
     *            offset of this field
     * @param recordInfo
     *            modify this byteinfo object and return it
     */
    public static void checkObjectByteInfo(ObjectInspector objectInspector, byte[] bytes, int offset,
            RecordInfo recordInfo) {
        Category category = objectInspector.getCategory();
        switch (category) {
        case PRIMITIVE:
            PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) objectInspector)
                    .getPrimitiveCategory();
            switch (primitiveCategory) {
            case VOID:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = 0;
                break;
            case BOOLEAN:
            case BYTE:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = 1;
                break;
            case SHORT:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = 2;
                break;
            case FLOAT:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = 4;
                break;
            case DOUBLE:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = 8;
                break;
            case INT:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
                break;
            case LONG:
                recordInfo.elementOffset = 0;
                recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
                break;
            case STRING:
                // using vint instead of 4 bytes
                LazyUtils.readVInt(bytes, offset, vInt);
                recordInfo.elementOffset = vInt.length;
                recordInfo.elementSize = vInt.value;
                break;
            default: {
                throw new RuntimeException("Unrecognized primitive type: " + primitiveCategory);
            }
            }
            break;
        case LIST:
        case MAP:
        case STRUCT:
            recordInfo.elementOffset = 4;
            recordInfo.elementSize = LazyUtils.byteArrayToInt(bytes, offset);
            break;
        default: {
            throw new RuntimeException("Unrecognized non-primitive type: " + category);
        }
        }
    }

    /**
     * A zero-compressed encoded long.
     */
    public static class VLong {
        public VLong() {
            value = 0;
            length = 0;
        }

        public long value;
        public byte length;
    };

    /**
     * Reads a zero-compressed encoded long from a byte array and returns it.
     * 
     * @param bytes
     *            the byte array
     * @param offset
     *            offset of the array to read from
     * @param vlong
     *            storing the deserialized long and its size in byte
     */
    public static void readVLong(byte[] bytes, int offset, VLong vlong) {
        byte firstByte = bytes[offset];
        vlong.length = (byte) WritableUtils.decodeVIntSize(firstByte);
        if (vlong.length == 1) {
            vlong.value = firstByte;
            return;
        }
        long i = 0;
        for (int idx = 0; idx < vlong.length - 1; idx++) {
            byte b = bytes[offset + 1 + idx];
            i = i << 8;
            i = i | (b & 0xFF);
        }
        vlong.value = (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1L) : i);
    }

    /**
     * A zero-compressed encoded integer.
     */
    public static class VInt implements Serializable {
        private static final long serialVersionUID = 1L;

        public VInt() {
            value = 0;
            length = 0;
        }

        public int value;
        public byte length;
    };

    /**
     * Reads a zero-compressed encoded int from a byte array and returns it.
     * 
     * @param bytes
     *            the byte array
     * @param offset
     *            offset of the array to read from
     * @param vInt
     *            storing the deserialized int and its size in byte
     */
    public static void readVInt(byte[] bytes, int offset, VInt vInt) {
        byte firstByte = bytes[offset];
        vInt.length = (byte) WritableUtils.decodeVIntSize(firstByte);
        if (vInt.length == 1) {
            vInt.value = firstByte;
            return;
        }
        int i = 0;
        for (int idx = 0; idx < vInt.length - 1; idx++) {
            byte b = bytes[offset + 1 + idx];
            i = i << 8;
            i = i | (b & 0xFF);
        }
        vInt.value = (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1) : i);
    }

    /**
     * Writes a zero-compressed encoded int to a byte array.
     * 
     * @param byteStream
     *            the byte array/stream
     * @param i
     *            the int
     */
    public static void writeVInt(Output byteStream, int i) {
        writeVLong(byteStream, i);
    }

    /**
     * Write a zero-compressed encoded long to a byte array.
     * 
     * @param byteStream
     *            the byte array/stream
     * @param l
     *            the long
     */
    public static void writeVLong(Output byteStream, long l) {
        if (l >= -112 && l <= 127) {
            byteStream.write((byte) l);
            return;
        }

        int len = -112;
        if (l < 0) {
            l ^= -1L; // take one's complement'
            len = -120;
        }

        long tmp = l;
        while (tmp != 0) {
            tmp = tmp >> 8;
            len--;
        }

        byteStream.write((byte) len);

        len = (len < -120) ? -(len + 120) : -(len + 112);

        for (int idx = len; idx != 0; idx--) {
            int shiftbits = (idx - 1) * 8;
            long mask = 0xFFL << shiftbits;
            byteStream.write((byte) ((l & mask) >> shiftbits));
        }
    }

    static Map<TypeInfo, ObjectInspector> cachedLazyObjectInspector = new ConcurrentHashMap<TypeInfo, ObjectInspector>();

    /**
     * Returns the lazy binary object inspector that can be used to inspect an
     * lazy binary object of that typeInfo
     * For primitive types, we use the standard writable object inspector.
     */
    public static ObjectInspector getLazyObjectInspectorFromTypeInfo(TypeInfo typeInfo, boolean topLevel) {
        if (typeInfo == null)
            throw new IllegalStateException("illegal type null ");
        ObjectInspector result = cachedLazyObjectInspector.get(typeInfo);
        if (result == null) {
            switch (typeInfo.getCategory()) {
            case PRIMITIVE: {
                result = PrimitiveObjectInspectorFactory
                        .getPrimitiveLazyObjectInspector(((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory());
                break;
            }
            case LIST: {
                ObjectInspector elementObjectInspector = getLazyObjectInspectorFromTypeInfo(
                        ((ListTypeInfo) typeInfo).getListElementTypeInfo(), false);
                result = LazyObjectInspectorFactory.getLazyListObjectInspector(elementObjectInspector);
                break;
            }
            case MAP: {
                MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
                ObjectInspector keyObjectInspector = getLazyObjectInspectorFromTypeInfo(
                        mapTypeInfo.getMapKeyTypeInfo(), false);
                ObjectInspector valueObjectInspector = getLazyObjectInspectorFromTypeInfo(
                        mapTypeInfo.getMapValueTypeInfo(), false);
                result = LazyObjectInspectorFactory.getLazyMapObjectInspector(keyObjectInspector,
                        valueObjectInspector);
                break;
            }
            case STRUCT: {
                StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
                List<String> fieldNames = structTypeInfo.getAllStructFieldNames();
                List<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
                List<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>(fieldTypeInfos.size());

                for (int i = 0; i < fieldTypeInfos.size(); i++) {
                    fieldObjectInspectors.add(getLazyObjectInspectorFromTypeInfo(fieldTypeInfos.get(i), false));
                }

                // if it is top level then create columnar
                if (topLevel)
                    result = LazyObjectInspectorFactory.getLazyColumnarObjectInspector(fieldNames,
                            fieldObjectInspectors);
                // if it is not top level then create struct
                else
                    result = LazyObjectInspectorFactory.getLazyStructObjectInspector(fieldNames,
                            fieldObjectInspectors);

                break;
            }
            default: {
                result = null;
            }
            }
            cachedLazyObjectInspector.put(typeInfo, result);
        }
        return result;
    }

    /**
     * get top-level lazy object inspector
     * 
     * @param fieldNames
     * @param fieldTypeInfos
     * @return
     */
    public static ObjectInspector getLazyObjectInspector(List<String> fieldNames, List<TypeInfo> fieldTypeInfos) {
        List<ObjectInspector> fieldObjectInspectors = new ArrayList<ObjectInspector>(fieldTypeInfos.size());
        for (int i = 0; i < fieldTypeInfos.size(); i++) {
            fieldObjectInspectors.add(getLazyObjectInspectorFromTypeInfo(fieldTypeInfos.get(i), false));
        }

        return LazyObjectInspectorFactory.getLazyColumnarObjectInspector(fieldNames, fieldObjectInspectors);
    }

    private LazyUtils() {
        // prevent instantiation
    }

    /**
     * Returns -1 if the first byte sequence is lexicographically less than the
     * second; returns +1 if the second byte sequence is lexicographically less
     * than the first; otherwise return 0.
     */
    public static int compare(byte[] b1, int start1, int length1, byte[] b2, int start2, int length2) {

        int min = Math.min(length1, length2);

        for (int i = 0; i < min; i++) {
            if (b1[start1 + i] == b2[start2 + i]) {
                continue;
            }
            if (b1[start1 + i] < b2[start2 + i]) {
                return -1;
            } else {
                return 1;
            }
        }

        if (length1 < length2) {
            return -1;
        }
        if (length1 > length2) {
            return 1;
        }
        return 0;
    }

    public static int hashBytes(byte[] data, int start, int len) {
        int hash = 1;
        for (int i = start; i < len; i++) {
            hash = (31 * hash) + data[i];
        }
        return hash;
    }

    /**
     * Writes a zero-compressed encoded int to a byte array.
     * 
     * @param byteStream
     *            the byte array/stream
     * @param i
     *            the int
     */
    public static void writeVInt(DataOutput byteStream, int i) throws IOException {
        writeVLong(byteStream, i);
    }

    /**
     * Write a zero-compressed encoded long to a byte array.
     * 
     * @param byteStream
     *            the byte array/stream
     * @param l
     *            the long
     */
    public static void writeVLong(DataOutput byteStream, long l) throws IOException {
        if (l >= -112 && l <= 127) {
            byteStream.write((byte) l);
            return;
        }

        int len = -112;
        if (l < 0) {
            l ^= -1L; // take one's complement'
            len = -120;
        }

        long tmp = l;
        while (tmp != 0) {
            tmp = tmp >> 8;
            len--;
        }

        byteStream.write((byte) len);

        len = (len < -120) ? -(len + 120) : -(len + 112);

        for (int idx = len; idx != 0; idx--) {
            int shiftbits = (idx - 1) * 8;
            long mask = 0xFFL << shiftbits;
            byteStream.write((byte) ((l & mask) >> shiftbits));
        }
    }
}