com.cloudera.impala.util.AvroSchemaUtils.java Source code

Introduction

Here is the source code for com.cloudera.impala.util.AvroSchemaUtils.java
Source

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package com.cloudera.impala.util;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;

import com.cloudera.impala.analysis.ColumnDef;
import com.cloudera.impala.catalog.PrimitiveType;
import com.cloudera.impala.common.AnalysisException;
import com.cloudera.impala.common.FileSystemUtil;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;

/**
 * Contains utility functions for dealing with Avro schemas.
 */
public class AvroSchemaUtils {

    /** Scheme prefix identifying schema URLs that are fetched over plain HTTP. */
    private static final String HTTP_PREFIX = "http://";

    /**
     * Gets an Avro table's JSON schema from the list of given table property search
     * locations. The schema may be specified as a string literal or provided as a
     * Hadoop FileSystem or http URL that points to the schema. Apart from ensuring
     * that the JSON schema is not SCHEMA_NONE, this function does not perform any
     * additional validation on the returned string (e.g., it may not be a valid
     * schema).
     *
     * Locations are searched in order; null locations are skipped. Within a location
     * a schema literal takes precedence over a schema URL. A literal or URL whose
     * value is SCHEMA_NONE is treated as not specified.
     *
     * @param schemaSearchLocations table property maps to search, in priority order;
     *     individual entries may be null.
     * @return the Avro schema, or null if none was specified in the search locations.
     * @throws AnalysisException if a schema URL was specified, but the schema could
     *     not be retrieved, e.g., because of an invalid URL.
     */
    public static String getAvroSchema(List<Map<String, String>> schemaSearchLocations) throws AnalysisException {
        String url = null;
        // Search all locations and break out on the first valid schema found.
        for (Map<String, String> schemaLocation : schemaSearchLocations) {
            if (schemaLocation == null) {
                continue;
            }

            String literal = schemaLocation.get(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName());
            if (isSpecified(literal)) {
                return literal;
            }

            // Only remember the URL once it is known to be usable. Assigning it
            // unconditionally would leave a SCHEMA_NONE value from the last location
            // in 'url' and cause an attempt to read it as a path below.
            String candidateUrl = schemaLocation.get(AvroSerdeUtils.AvroTableProperties.SCHEMA_URL.getPropName());
            if (isSpecified(candidateUrl)) {
                url = candidateUrl.trim();
                break;
            }
        }
        if (url == null) {
            return null;
        }
        return readSchemaFromUrl(url);
    }

    /**
     * Returns true if the given table property value is set and is not SCHEMA_NONE.
     */
    private static boolean isSpecified(String propertyValue) {
        return propertyValue != null && !propertyValue.equals(AvroSerdeUtils.SCHEMA_NONE);
    }

    /**
     * Reads the contents of the schema stored at the given location. Locations that
     * start with "http://" (case-insensitive) are fetched as a URL; every other
     * location is treated as a Hadoop FileSystem path.
     *
     * @param url the trimmed, non-null schema location.
     * @return the text read from the location.
     * @throws AnalysisException if the location is invalid, unreachable or unreadable.
     */
    private static String readSchemaFromUrl(String url) throws AnalysisException {
        InputStream urlStream = null;
        try {
            // TODO: Add support for https:// here.
            // Case-insensitive prefix match that does not depend on the default locale.
            if (url.regionMatches(true, 0, HTTP_PREFIX, 0, HTTP_PREFIX.length())) {
                urlStream = new URL(url).openStream();
                // Decode explicitly as UTF-8 so that the result does not vary with
                // the platform default charset of the JVM.
                return IOUtils.toString(urlStream, "UTF-8");
            }

            Path path = new Path(url);
            FileSystem fs = path.getFileSystem(FileSystemUtil.getConfiguration());
            StringBuilder errorMsg = new StringBuilder();
            if (!FileSystemUtil.isPathReachable(path, fs, errorMsg)) {
                throw new AnalysisException(String.format("Invalid avro.schema.url: %s. %s", url, errorMsg));
            }
            return FileSystemUtil.readFile(path);
        } catch (AnalysisException e) {
            // Already carries a user-facing message; do not re-wrap it below.
            throw e;
        } catch (IOException e) {
            throw new AnalysisException(
                    String.format("Failed to read Avro schema at: %s. %s ", url, e.getMessage()));
        } catch (Exception e) {
            throw new AnalysisException(String.format("Invalid avro.schema.url: %s. %s", url, e.getMessage()));
        } finally {
            // closeQuietly() tolerates null, so no guard is needed for the non-http path.
            IOUtils.closeQuietly(urlStream);
        }
    }

    /**
     * Reconciles differences in names/types between the given list of column definitions
     * and the column definitions corresponding to an Avro Schema. Populates 'warning'
     * if there are inconsistencies between the column definitions and the Avro schema.
     * Returns the reconciled column definitions according to the following conflict
     * resolution policy:
     *
     * Mismatched number of columns -> Prefer Avro columns.
     * Always prefer Avro schema except for column type CHAR/VARCHAR/STRING:
     *   A CHAR/VARCHAR/STRING column definition maps to an Avro STRING. The reconciled
     *   column will preserve the type in the column definition but use the column name
     *   and comment from the Avro schema.
     *
     * @param colDefs the explicitly given column definitions.
     * @param avroCols the column definitions derived from the Avro schema.
     * @param warning receives a description of every inconsistency that was resolved;
     *     the preamble line is only added if 'warning' is empty at the first mismatch.
     * @return 'avroCols' itself if the column counts differ, otherwise a new list
     *     holding the reconciled column definitions.
     * @throws IllegalStateException if analyzing a reconciled string column fails,
     *     which is not expected to happen.
     */
    public static List<ColumnDef> reconcileSchemas(List<ColumnDef> colDefs, List<ColumnDef> avroCols,
            StringBuilder warning) {
        if (colDefs.size() != avroCols.size()) {
            warning.append(String.format(
                    "Ignoring column definitions in favor of Avro schema.\n"
                            + "The Avro schema has %s column(s) but %s column definition(s) were given.",
                    avroCols.size(), colDefs.size()));
            return avroCols;
        }

        List<ColumnDef> result = Lists.newArrayListWithCapacity(colDefs.size());
        for (int i = 0; i < avroCols.size(); ++i) {
            ColumnDef colDef = colDefs.get(i);
            ColumnDef avroCol = avroCols.get(i);
            Preconditions.checkNotNull(colDef.getType());
            Preconditions.checkNotNull(avroCol.getType());

            // By default the Avro column wins.
            ColumnDef resolvedCol = avroCol;

            // A CHAR/VARCHAR/STRING column definition maps to an Avro STRING, and is preserved
            // as a CHAR/VARCHAR/STRING in the reconciled schema. Column name and comment
            // are taken from the Avro schema.
            if (colDef.getType().isStringType() && avroCol.getType().isStringType()) {
                Preconditions.checkState(avroCol.getType().getPrimitiveType() == PrimitiveType.STRING);
                resolvedCol = new ColumnDef(avroCol.getColName(), colDef.getTypeDef(), avroCol.getComment());
                try {
                    resolvedCol.analyze();
                } catch (AnalysisException e) {
                    // Signal the broken invariant directly and keep the cause, instead of
                    // provoking a NullPointerException that hides what actually failed.
                    throw new IllegalStateException("reconciledColDef.analyze() should never throw.", e);
                }
            }
            result.add(resolvedCol);

            // Populate warning string if there are name and/or type inconsistencies.
            boolean sameName = colDef.getColName().equals(avroCol.getColName());
            boolean sameType = colDef.getType().equals(avroCol.getType());
            if (!sameName || !sameType) {
                if (warning.length() == 0) {
                    // Add warning preamble for the first mismatch.
                    warning.append("Resolved the following name and/or type inconsistencies "
                            + "between the column definitions and the Avro schema.\n");
                }
                warning.append(String.format("Column definition at position %s:  %s %s\n", i,
                        colDef.getColName(), colDef.getType().toSql()));
                warning.append(String.format("Avro schema column at position %s: %s %s\n", i,
                        avroCol.getColName(), avroCol.getType().toSql()));
                warning.append(String.format("Resolution at position %s: %s %s\n", i,
                        resolvedCol.getColName(), resolvedCol.getType().toSql()));
            }
        }
        Preconditions.checkState(result.size() == avroCols.size());
        Preconditions.checkState(result.size() == colDefs.size());
        return result;
    }

    /**
     * Sets the comment of each column definition to 'from deserializer' if not already
     * set. The purpose of this function is to provide behavioral consistency with
     * Hive ('deserializer' is not applicable to Impala) with respect to column comments
     * set for Avro tables.
     *
     * @param colDefs the column definitions to update in place; a null or empty comment
     *     counts as not set.
     */
    public static void setFromSerdeComment(List<ColumnDef> colDefs) {
        for (ColumnDef colDef : colDefs) {
            if (Strings.isNullOrEmpty(colDef.getComment())) {
                colDef.setComment("from deserializer");
            }
        }
    }
}