org.apache.hadoop.hive.ql.io.parquet.TestVectorizedMapColumnReader.java Source code


Introduction

Here is the source code for org.apache.hadoop.hive.ql.io.parquet.TestVectorizedMapColumnReader.java
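The class extends VectorizedColumnReaderTestBase and exercises Hive's VectorizedParquetRecordReader on Parquet MAP columns. It writes test files containing map columns with int, long, double, float, string (binary) and decimal keys and values, then reads them back and verifies the resulting MapColumnVector contents for row counts just below, equal to and just above the default batch size of 1024, for repeating (identical) maps, and for a map nested one level deeper (map_field).

Every test follows the same write-then-read pattern. The outline below is a minimal sketch condensed from testMapReadLessOneBatch and testMapRead further down; it reuses the class's own helpers (initWriterFromFile, createTestParquetReader and removeFile come from the base class) and is not a standalone program:

    // Sketch only: condensed from the test methods in the listing below.
    removeFile();                                        // start from a clean test file
    writeMapData(initWriterFromFile(), false, 1023);     // write 1023 rows of map data
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "map_int32");
    conf.set(IOConstants.COLUMNS_TYPES, "map<int,int>");
    VectorizedParquetRecordReader reader = createTestParquetReader(getSchema("int"), conf);
    VectorizedRowBatch batch = reader.createValue();
    while (reader.next(NullWritable.get(), batch)) {
        MapColumnVector map = (MapColumnVector) batch.cols[0];
        // keys and values sit in map.keys and map.values, addressed via map.offsets and map.lengths
    }
    reader.close();
    removeFile();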

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.io.api.Binary;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class TestVectorizedMapColumnReader extends VectorizedColumnReaderTestBase {

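    /**
     * Writes elementNum rows to the Parquet file. Each non-null row (see isNull(i) in the base
     * class) carries between 1 and 4 entries per map column, covering int, long, double, float,
     * binary and decimal keys/values plus the nested map_field group; null rows get no entries.
     */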
    protected static void writeMapData(ParquetWriter<Group> writer, boolean isDictionaryEncoding, int elementNum)
            throws IOException {
        SimpleGroupFactory f = new SimpleGroupFactory(schema);
        int mapMaxSize = 4;
        int mapElementIndex = 0;
        for (int i = 0; i < elementNum; i++) {
            boolean isNull = isNull(i);
            Group group = f.newGroup();

            int mapSize = i % mapMaxSize + 1;
            if (!isNull) {
                // map_field exercises a multi-level (nested) map definition
                Group multipleLevelGroup = group.addGroup("map_field");
                for (int j = 0; j < mapSize; j++) {
                    int intValForMap = getIntValue(isDictionaryEncoding, mapElementIndex);
                    long longValForMap = getLongValue(isDictionaryEncoding, mapElementIndex);
                    double doubleValForMap = getDoubleValue(isDictionaryEncoding, mapElementIndex);
                    float floatValForMap = getFloatValue(isDictionaryEncoding, mapElementIndex);
                    Binary binaryValForMap = getBinaryValue(isDictionaryEncoding, mapElementIndex);
                    HiveDecimal hd = getDecimal(isDictionaryEncoding, mapElementIndex).setScale(2);
                    HiveDecimalWritable hdw = new HiveDecimalWritable(hd);
                    Binary decimalValForMap = Binary.fromConstantByteArray(hdw.getInternalStorage());
                    group.addGroup("map_int32").append("key", intValForMap).append("value", intValForMap);
                    group.addGroup("map_int64").append("key", longValForMap).append("value", longValForMap);
                    group.addGroup("map_double").append("key", doubleValForMap).append("value", doubleValForMap);
                    group.addGroup("map_float").append("key", floatValForMap).append("value", floatValForMap);
                    group.addGroup("map_binary").append("key", binaryValForMap).append("value", binaryValForMap);
                    group.addGroup("map_decimal").append("key", decimalValForMap).append("value", decimalValForMap);
                    multipleLevelGroup.addGroup("map").append("key", binaryValForMap).append("value",
                            binaryValForMap);
                    mapElementIndex++;
                }
            }
            writer.write(group);
        }
        writer.close();
    }

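    /**
     * Writes elementNum rows for the repeating-map tests: when isNull is false every row gets the
     * same four (j, j) entries in map_int32_for_repeat_test, otherwise every row is left empty.
     */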
    protected static void writeRepeateMapData(ParquetWriter<Group> writer, int elementNum, boolean isNull)
            throws IOException {
        SimpleGroupFactory f = new SimpleGroupFactory(schema);
        int mapMaxSize = 4;
        for (int i = 0; i < elementNum; i++) {
            Group group = f.newGroup();
            if (!isNull) {
                for (int j = 0; j < mapMaxSize; j++) {
                    group.addGroup("map_int32_for_repeat_test").append("key", j).append("value", j);
                }
            }
            writer.write(group);
        }
        writer.close();
    }

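    // The next three tests read 1023, 1024 and 1025 rows, i.e. one fewer than, exactly, and one
    // more than the default VectorizedRowBatch size of 1024, both with and without dictionary
    // encoding.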
    @Test
    public void testMapReadLessOneBatch() throws Exception {
        boolean isDictionaryEncoding = false;
        removeFile();
        writeMapData(initWriterFromFile(), isDictionaryEncoding, 1023);
        testMapReadAllType(isDictionaryEncoding, 1023);
        removeFile();
        isDictionaryEncoding = true;
        writeMapData(initWriterFromFile(), isDictionaryEncoding, 1023);
        testMapReadAllType(isDictionaryEncoding, 1023);
        removeFile();
    }

    @Test
    public void testMapReadEqualOneBatch() throws Exception {
        boolean isDictionaryEncoding = false;
        removeFile();
        writeMapData(initWriterFromFile(), isDictionaryEncoding, 1024);
        testMapReadAllType(isDictionaryEncoding, 1024);
        removeFile();
        isDictionaryEncoding = true;
        writeMapData(initWriterFromFile(), isDictionaryEncoding, 1024);
        testMapReadAllType(isDictionaryEncoding, 1024);
        removeFile();
    }

    @Test
    public void testMapReadMoreOneBatch() throws Exception {
        boolean isDictionaryEncoding = false;
        removeFile();
        writeMapData(initWriterFromFile(), isDictionaryEncoding, 1025);
        testMapReadAllType(isDictionaryEncoding, 1025);
        removeFile();
        isDictionaryEncoding = true;
        writeMapData(initWriterFromFile(), isDictionaryEncoding, 1025);
        testMapReadAllType(isDictionaryEncoding, 1025);
        removeFile();
    }

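    /** Exercises the repeating-map path around the batch boundary (1023, 1024 and 1025 rows), with and without nulls. */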
    @Test
    public void testRepeateMapRead() throws Exception {
        removeFile();
        writeRepeateMapData(initWriterFromFile(), 1023, false);
        testRepeateMapRead(1023, false);
        removeFile();
        writeRepeateMapData(initWriterFromFile(), 1023, true);
        testRepeateMapRead(1023, true);
        removeFile();
        writeRepeateMapData(initWriterFromFile(), 1024, false);
        testRepeateMapRead(1024, false);
        removeFile();
        writeRepeateMapData(initWriterFromFile(), 1024, true);
        testRepeateMapRead(1024, true);
        removeFile();
        writeRepeateMapData(initWriterFromFile(), 1025, false);
        testRepeateMapRead(1025, false);
        removeFile();
        writeRepeateMapData(initWriterFromFile(), 1025, true);
        testRepeateMapRead(1025, true);
        removeFile();
    }

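    /** Reads the map_field column, whose MAP annotation sits one group level deeper than the other map columns. */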
    @Test
    public void testMultipleDefinitionMapRead() throws Exception {
        removeFile();
        writeMapData(initWriterFromFile(), false, 1023);
        testMapRead(false, "multipleLevel", 1023);
        removeFile();
    }

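    /** Runs testMapRead once per key/value type produced by writeMapData. */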
    private void testMapReadAllType(boolean isDictionaryEncoding, int elementNum) throws Exception {
        testMapRead(isDictionaryEncoding, "int", elementNum);
        testMapRead(isDictionaryEncoding, "long", elementNum);
        testMapRead(isDictionaryEncoding, "double", elementNum);
        testMapRead(isDictionaryEncoding, "float", elementNum);
        testMapRead(isDictionaryEncoding, "binary", elementNum);
        testMapRead(isDictionaryEncoding, "decimal", elementNum);
    }

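    /**
     * Reads the written file back through VectorizedParquetRecordReader and checks every batch's
     * MapColumnVector: the repeating flag, the null flag for rows written as null, and each
     * key/value against the value the write helpers generated for the same index.
     */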
    private void testMapRead(boolean isDictionaryEncoding, String type, int elementNum) throws Exception {
        Configuration conf = new Configuration();
        setTypeConfiguration(type, conf);
        conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
        VectorizedParquetRecordReader reader = createTestParquetReader(getSchema(type), conf);
        VectorizedRowBatch previous = reader.createValue();
        int row = 0;
        int index = 0;
        try {
            while (reader.next(NullWritable.get(), previous)) {
                MapColumnVector mapVector = (MapColumnVector) previous.cols[0];

                // the vector is flagged as repeating exactly when the offsets array has length 1
                assertEquals((mapVector.offsets.length == 1), mapVector.isRepeating);

                for (int i = 0; i < mapVector.offsets.length; i++) {
                    if (row == elementNum) {
                        assertEquals(i, mapVector.offsets.length - 1);
                        break;
                    }
                    long start = mapVector.offsets[i];
                    long length = mapVector.lengths[i];
                    boolean isNull = isNull(row);
                    if (isNull) {
                        assertEquals(mapVector.isNull[i], true);
                    } else {
                        for (long j = 0; j < length; j++) {
                            assertValue(type, mapVector.keys, isDictionaryEncoding, index, (int) (start + j));
                            assertValue(type, mapVector.values, isDictionaryEncoding, index, (int) (start + j));
                            index++;
                        }
                    }
                    row++;
                }
            }
            assertEquals("It doesn't exit at expected position", elementNum, row);
        } finally {
            reader.close();
        }
    }

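    /**
     * Verifies that batches in which every row holds the identical map (or is entirely null) are
     * flagged as isRepeating, so only element 0 of the vector carries the data.
     */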
    private void testRepeateMapRead(int elementNum, boolean isNull) throws Exception {
        Configuration conf = new Configuration();
        conf.set(IOConstants.COLUMNS, "map_int32_for_repeat_test");
        conf.set(IOConstants.COLUMNS_TYPES, "map<int,int>");
        conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
        conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
        String schema = "message hive_schema {\n" + "  repeated group map_int32_for_repeat_test (MAP_KEY_VALUE) {\n"
                + "    required int32 key;\n" + "    optional int32 value;\n" + "  }\n" + "}\n";
        VectorizedParquetRecordReader reader = createTestParquetReader(schema, conf);
        VectorizedRowBatch previous = reader.createValue();
        int row = 0;
        try {
            while (reader.next(NullWritable.get(), previous)) {
                MapColumnVector mapVector = (MapColumnVector) previous.cols[0];

                assertTrue(mapVector.isRepeating);
                assertEquals(isNull, mapVector.isNull[0]);

                for (int i = 0; i < mapVector.offsets.length; i++) {
                    if (row == elementNum) {
                        assertEquals(i, mapVector.offsets.length - 1);
                        break;
                    }
                    row++;
                }
            }
            assertEquals("It doesn't exit at expected position", elementNum, row);
        } finally {
            reader.close();
        }
    }

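    /** Sets IOConstants.COLUMNS and COLUMNS_TYPES so Hive projects the single map column matching the requested type. */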
    private void setTypeConfiguration(String type, Configuration conf) {
        if ("int".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_int32");
            conf.set(IOConstants.COLUMNS_TYPES, "map<int,int>");
        } else if ("long".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_int64");
            conf.set(IOConstants.COLUMNS_TYPES, "map<bigint,bigint>");
        } else if ("double".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_double");
            conf.set(IOConstants.COLUMNS_TYPES, "map<double,double>");
        } else if ("float".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_float");
            conf.set(IOConstants.COLUMNS_TYPES, "map<float,float>");
        } else if ("binary".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_binary");
            conf.set(IOConstants.COLUMNS_TYPES, "map<string,string>");
        } else if ("decimal".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_decimal");
            conf.set(IOConstants.COLUMNS_TYPES, "map<decimal(5,2),decimal(5,2)>");
        } else if ("multipleLevel".equals(type)) {
            conf.set(IOConstants.COLUMNS, "map_field");
            conf.set(IOConstants.COLUMNS_TYPES, "map<string,string>");
        }
    }

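    /**
     * Builds the Parquet message type for the requested map type; "decimal" maps to binary columns
     * annotated with DECIMAL(5,2), and "multipleLevel" wraps the key/value group in an extra
     * MAP-annotated group.
     */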
    private String getSchema(String type) {
        String schemaFormat = "message hive_schema {\n" + "  repeated group map_%s (MAP_KEY_VALUE) {\n"
                + "    required %s key %s;\n" + "    optional %s value %s;\n" + "  }\n" + "}\n";
        switch (type) {
        case "int":
            return String.format(schemaFormat, "int32", "int32", "", "int32", "");
        case "long":
            return String.format(schemaFormat, "int64", "int64", "", "int64", "");
        case "double":
            return String.format(schemaFormat, "double", "double", "", "double", "");
        case "float":
            return String.format(schemaFormat, "float", "float", "", "float", "");
        case "binary":
            return String.format(schemaFormat, "binary", "binary", "", "binary", "");
        case "decimal":
            return String.format(schemaFormat, "decimal", "binary", "(DECIMAL(5,2))", "binary", "(DECIMAL(5,2))");
        case "multipleLevel":
            return "message hive_schema {\n" + "optional group map_field (MAP) {\n"
                    + "  repeated group map (MAP_KEY_VALUE) {\n" + "    required binary key;\n"
                    + "    optional binary value;\n" + "  }\n" + "}\n" + "}\n";
        default:
            throw new RuntimeException("Unsupported type for TestVectorizedMapColumnReader!");
        }
    }

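    /**
     * Compares the key or value stored at the given vector position with the expected value that
     * the write helpers produced for valueIndex, dispatching on the column vector subtype.
     */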
    private void assertValue(String type, ColumnVector childVector, boolean isDictionaryEncoding, int valueIndex,
            int position) {
        if ("int".equals(type)) {
            assertEquals(getIntValue(isDictionaryEncoding, valueIndex),
                    ((LongColumnVector) childVector).vector[position]);
        } else if ("long".equals(type)) {
            assertEquals(getLongValue(isDictionaryEncoding, valueIndex),
                    ((LongColumnVector) childVector).vector[position]);
        } else if ("double".equals(type)) {
            assertEquals(getDoubleValue(isDictionaryEncoding, valueIndex),
                    ((DoubleColumnVector) childVector).vector[position], 0);
        } else if ("float".equals(type)) {
            assertEquals(getFloatValue(isDictionaryEncoding, valueIndex),
                    ((DoubleColumnVector) childVector).vector[position], 0);
        } else if ("binary".equals(type) || "multipleLevel".equals(type)) {
            String actual = new String(ArrayUtils.subarray(((BytesColumnVector) childVector).vector[position],
                    ((BytesColumnVector) childVector).start[position],
                    ((BytesColumnVector) childVector).start[position]
                            + ((BytesColumnVector) childVector).length[position]));
            assertEquals(getStr(isDictionaryEncoding, valueIndex), actual);
        } else if ("decimal".equals(type)) {
            assertEquals(getDecimal(isDictionaryEncoding, valueIndex),
                    ((DecimalColumnVector) childVector).vector[position].getHiveDecimal());
        } else {
            throw new RuntimeException("Unsupported type for TestVectorizedMapColumnReader!");
        }
    }
}