org.apache.impala.util.AvroSchemaParser.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.impala.util.AvroSchemaParser.java

Source

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.impala.util;

import static org.apache.avro.Schema.Type.BOOLEAN;
import static org.apache.avro.Schema.Type.DOUBLE;
import static org.apache.avro.Schema.Type.FLOAT;
import static org.apache.avro.Schema.Type.INT;
import static org.apache.avro.Schema.Type.LONG;
import static org.apache.avro.Schema.Type.STRING;

import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import org.apache.avro.Schema;
import org.apache.avro.SchemaParseException;
import org.apache.impala.analysis.ColumnDef;
import org.apache.impala.analysis.TypeDef;
import org.apache.impala.catalog.ArrayType;
import org.apache.impala.catalog.MapType;
import org.apache.impala.catalog.ScalarType;
import org.apache.impala.catalog.StructField;
import org.apache.impala.catalog.StructType;
import org.apache.impala.catalog.Type;
import org.apache.impala.common.AnalysisException;
import org.codehaus.jackson.JsonNode;

/**
 * Utility class used to parse Avro schemas. Checks that the schema is valid
 * and performs mapping of Avro types to Impala types.
 * Note: This code is loosely based off the parsing code in the Hive AvroSerDe.
 */
public class AvroSchemaParser {
    // Map of Avro to Impala primitive types. Wrapped as unmodifiable once populated.
    private static final Map<Schema.Type, Type> avroToImpalaPrimitiveTypeMap_;
    static {
        // Keys are enum constants, so an EnumMap is the natural container.
        Map<Schema.Type, Type> typeMap = new EnumMap<Schema.Type, Type>(Schema.Type.class);
        typeMap.put(STRING, Type.STRING);
        typeMap.put(INT, Type.INT);
        typeMap.put(BOOLEAN, Type.BOOLEAN);
        typeMap.put(LONG, Type.BIGINT);
        typeMap.put(FLOAT, Type.FLOAT);
        typeMap.put(DOUBLE, Type.DOUBLE);
        avroToImpalaPrimitiveTypeMap_ = Collections.unmodifiableMap(typeMap);
    }

    /**
     * Parses the Avro schema string literal, mapping the Avro types to Impala types.
     * Each top-level field of the record becomes one ColumnDef whose name, type and
     * (when the field has a doc string) comment are set. Every ColumnDef is analyzed
     * before it is added to the result.
     *
     * @param schemaStr the Avro schema as a JSON string literal
     * @return one ColumnDef per top-level field, in schema order
     * @throws SchemaParseException if the Avro schema was invalid
     * @throws AnalysisException if an Avro type maps to a type that Impala does not
     *         yet support, including record types that refer to themselves
     * @throws UnsupportedOperationException if the top-level schema is not a RECORD
     */
    public static List<ColumnDef> parse(String schemaStr) throws SchemaParseException, AnalysisException {
        Schema schema = new Schema.Parser().parse(schemaStr);
        if (schema.getType() != Schema.Type.RECORD) {
            throw new UnsupportedOperationException(
                    "Schema for table must be of type RECORD. Received type: " + schema.getType());
        }

        // Full names of the records currently being expanded. Seeded with the table's
        // own record so that a field referring back to it is rejected instead of
        // being expanded forever.
        Set<String> recordsInProgress = new HashSet<String>();
        recordsInProgress.add(schema.getFullName());

        List<Schema.Field> fields = schema.getFields();
        List<ColumnDef> colDefs = Lists.newArrayListWithCapacity(fields.size());
        for (Schema.Field field : fields) {
            Map<ColumnDef.Option, Object> options = Maps.newHashMap();
            String comment = field.doc();
            if (comment != null) {
                options.put(ColumnDef.Option.COMMENT, comment);
            }
            Type colType = getTypeInfo(field.schema(), field.name(), recordsInProgress);
            ColumnDef colDef = new ColumnDef(field.name(), new TypeDef(colType), options);
            colDef.analyze(null);
            colDefs.add(colDef);
        }
        return colDefs;
    }

    /**
     * Parses the given Avro schema and returns the matching Impala type
     * for this field. Handles primitive and complex types.
     *
     * @param schema the Avro schema of the field (or of a nested element of it)
     * @param colName name of the top-level column; only used in error messages
     * @param recordsInProgress full names of the records on the current expansion
     *        path; used to detect self-referential record definitions
     * @throws AnalysisException if the type is unsupported or a record is recursive
     */
    private static Type getTypeInfo(Schema schema, String colName, Set<String> recordsInProgress)
            throws AnalysisException {
        // Avro requires NULLable types to be defined as unions of some type T
        // and NULL.  This is annoying and we're going to hide it from the user.
        if (isNullableType(schema)) {
            return getTypeInfo(getColumnType(schema), colName, recordsInProgress);
        }

        Schema.Type type = schema.getType();
        Type primitiveType = avroToImpalaPrimitiveTypeMap_.get(type);
        if (primitiveType != null) {
            return primitiveType;
        }

        switch (type) {
        case ARRAY: {
            Type itemType = getTypeInfo(schema.getElementType(), colName, recordsInProgress);
            return new ArrayType(itemType);
        }
        case MAP: {
            // Only the value schema is consulted; the key type is always STRING.
            Type valueType = getTypeInfo(schema.getValueType(), colName, recordsInProgress);
            return new MapType(Type.STRING, valueType);
        }
        case RECORD: {
            // A record that (directly or indirectly) contains itself cannot be turned
            // into a finite struct type; fail cleanly rather than recursing until the
            // stack overflows.
            String recordName = schema.getFullName();
            if (!recordsInProgress.add(recordName)) {
                throw new AnalysisException(String.format(
                        "Unsupported recursive record type '%s' of column '%s'", recordName, colName));
            }
            StructType structType = new StructType();
            for (Schema.Field field : schema.getFields()) {
                Type fieldType = getTypeInfo(field.schema(), colName, recordsInProgress);
                structType.addField(new StructField(field.name(), fieldType, field.doc()));
            }
            // Leaving this record: the same record type may legitimately appear again
            // in a sibling field.
            recordsInProgress.remove(recordName);
            return structType;
        }
        case BYTES: {
            String logicalType = schema.getProp("logicalType");
            if (logicalType == null) {
                throw new AnalysisException(String.format(
                        "logicalType for column '%s' specified at wrong level or was not specified", colName));
            }
            // Decimal is stored in Avro as a BYTE.
            if (logicalType.equalsIgnoreCase("decimal")) {
                return getDecimalType(schema);
            }
            throw new AnalysisException(String.format(
                    "Unsupported logicalType: '%s' for column '%s' with type BYTES", logicalType, colName));
        }
        // TODO: Add support for stored Avro UNIONs by exposing them as STRUCTs in Impala.
        case UNION:
        case ENUM:
        case FIXED:
        case NULL:
        default: {
            throw new AnalysisException(
                    String.format("Unsupported type '%s' of column '%s'", type.getName(), colName));
        }
        }
    }

    /**
     * Returns true if this is a nullable type (a Union[T, Null]), false otherwise.
     */
    private static boolean isNullableType(Schema schema) {
        if (schema.getType() != Schema.Type.UNION) {
            return false;
        }
        List<Schema> branches = schema.getTypes();
        // [null, null] not allowed, so this check is ok.
        return branches.size() == 2
                && (branches.get(0).getType() == Schema.Type.NULL
                        || branches.get(1).getType() == Schema.Type.NULL);
    }

    /**
     * If a nullable type, get the schema for the non-nullable type which will
     * provide Impala column type information. Must only be called on a schema for
     * which isNullableType() returned true.
     */
    private static Schema getColumnType(Schema schema) {
        List<Schema> branches = schema.getTypes();
        boolean nullIsFirst = branches.get(0).getType() == Schema.Type.NULL;
        return nullIsFirst ? branches.get(1) : branches.get(0);
    }

    /**
     * Attempts to parse decimal type information from the Avro schema, returning
     * a decimal ColumnType if successful.
     * Decimal is defined in Avro as a BYTE type with the logicalType property
     * set to "decimal" and a specified scale/precision.
     * Throws a SchemaParseException if logicalType=decimal, but precision is not
     * specified or scale/precision is in the incorrect format.
     */
    private static Type getDecimalType(Schema schema) {
        Preconditions.checkState(schema.getType() == Schema.Type.BYTES);
        // Parse the scale/precision of the decimal type.
        Integer scale = getDecimalProp(schema, "scale");
        // The Avro spec states that scale should default to zero if not set.
        if (scale == null) {
            scale = 0;
        }

        // Precision is a required property according to the Avro spec.
        Integer precision = getDecimalProp(schema, "precision");
        if (precision == null) {
            throw new SchemaParseException("No 'precision' property specified for 'decimal' logicalType");
        }
        return ScalarType.createDecimalType(precision, scale);
    }

    /**
     * Parses a decimal property and returns the value as an integer, or null
     * if the property isn't set. Used to parse decimal scale/precision.
     * Throws a SchemaParseException if the property doesn't parse to a
     * natural number.
     */
    private static Integer getDecimalProp(Schema schema, String propName) throws SchemaParseException {
        JsonNode node = schema.getJsonProp(propName);
        if (node == null) {
            return null;
        }
        // -1 is the fallback handed to getValueAsInt(), so a value that cannot be read
        // as an int is rejected by the same check as a negative one.
        int propValue = node.getValueAsInt(-1);
        if (propValue < 0) {
            throw new SchemaParseException(
                    String.format("Invalid decimal '%s' property value: %s", propName, node.getValueAsText()));
        }
        return propValue;
    }
}