org.apache.gobblin.util.AvroFlattener.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.gobblin.util.AvroFlattener.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonNode;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

/***
 * This class provides methods to flatten an Avro Schema to make it more optimal for ORC
 * (Hive does not support predicate pushdown for ORC with nested fields: ETL-7214)
 *
 * The behavior of Avro Schema un-nesting is listed below:
 *
 * 1. Record within Record (and so on recursively) are flattened into the parent Record
 * Record R1 {
 *   fields: {[
 *      {
 *        Record R2 {
 *          fields: {[
 *              {
 *                Record R3 {
 *                  fields: {[
 *                      {
 *                        String S2
 *                      }
 *                  ]}
 *                }, {
 *                  String S3
 *                }
 *              }
 *
 *          ]}
 *        }
 *      }, {
 *        String S1
 *      }
 *   ]}
 * }
 * will be flattened to:
 * Record R1 {
 *   fields: {[
 *      {
 *        String S1
 *      }, {
 *        String S2
 *      }, {
 *        String S3
 *      }
 *   ]}
 * }
 *
 * 2. All fields un-nested from a Record within an Option (ie. Union of the type [null, Record] or [Record, null])
 * within a Record are moved to parent Record as a list of Option fields
 * Record R1 {
 *   fields : {[
 *      {
 *        Union : [
 *          null,
 *          Record R2 {
 *            fields : {[
 *                {
 *                  String S1
 *                }, {
 *                  String S2
 *                }
 *            ]}
 *          }
 *        ]
 *      }
 *   ]}
 * }
 * will be flattened to:
 * Record R1 {
 *   fields : {[
 *      {
 *        Union : [ null, String S1]
 *      }, {
 *        Union : [ null, String S2]
 *      }
 *   ]}
 * }
 *
 * 3. Array or Map will not be un-nested, however Records within it will be un-nested as described above
 *
 * 4. All un-nested fields are decorated with a new property "flatten_source" which is a dot separated string
 * concatenation of parent fields name, similarly un-nested fields are renamed to double-underscore string
 * concatenation of parent fields name
 *
 * 5. Primitive Types are not un-nested
 */
public class AvroFlattener {

    private static final Logger LOG = Logger.getLogger(AvroFlattener.class);

    // Default separator used when building the name of an un-nested field (parent__child)
    private static final String FLATTENED_NAME_JOINER = "__";
    // Default separator used when building the lineage path of an un-nested field (parent.child)
    private static final String FLATTENED_SOURCE_JOINER = ".";
    // Name of the field property that carries the lineage path of an un-nested field
    private static final String FLATTENED_SOURCE_KEY = "flatten_source";

    // Separators in effect; (re)assigned from the defaults on every call to the public flatten(...)
    private String flattenedNameJoiner;
    private String flattenedSourceJoiner;

    /***
     * Flatten the Schema to un-nest recursive Records (to make it optimal for ORC)
     * @param schema Avro Schema to flatten, must not be null
     * @param flattenComplexTypes Flatten complex types recursively other than Record and Option
     * @return Flattened Avro Schema
     */
    public Schema flatten(Schema schema, boolean flattenComplexTypes) {
        Preconditions.checkNotNull(schema);

        // To help make it configurable later
        this.flattenedNameJoiner = FLATTENED_NAME_JOINER;
        this.flattenedSourceJoiner = FLATTENED_SOURCE_JOINER;

        // The root schema is never moved up a hierarchy, hence no lineage to populate
        Schema flattenedSchema = flatten(schema, false, flattenComplexTypes);

        // Rendering whole schemas as Strings is wasted work unless debug logging is enabled
        if (LOG.isDebugEnabled()) {
            LOG.debug("Original Schema : " + schema);
            LOG.debug("Flattened Schema: " + flattenedSchema);
        }

        return flattenedSchema;
    }

    /***
     * Flatten the Schema to un-nest recursive Records (to make it optimal for ORC)
     * @param schema Schema to flatten
     * @param shouldPopulateLineage is set to true if the field is going to be flattened and moved up the hierarchy -
     *                              so that lineage information can be tagged to it; which happens when there is a
     *                              Record within a Record OR Record within Option within Record and so on,
     *                              however not when there is a Record within Map or Array
     * @param flattenComplexTypes Flatten complex types recursively other than Record and Option
     * @return Flattened Avro Schema, carrying a copy of the properties of the input schema
     * @throws AvroRuntimeException if the schema type is not handled by this method
     */
    private Schema flatten(Schema schema, boolean shouldPopulateLineage, boolean flattenComplexTypes) {
        Schema flattenedSchema;

        // Process all Schema Types
        switch (schema.getType()) {
        case ARRAY:
            // Array might be an array of recursive Records, flatten them
            // NOTE(review): this re-enters the public flatten(...) with flattenComplexTypes=false, so complex
            // types nested deeper inside the element type are not flattened - confirm this is intended
            if (flattenComplexTypes) {
                flattenedSchema = Schema.createArray(flatten(schema.getElementType(), false));
            } else {
                flattenedSchema = Schema.createArray(schema.getElementType());
            }
            break;
        case MAP:
            // Same treatment as ARRAY, applied to the value type (see the review note above)
            if (flattenComplexTypes) {
                flattenedSchema = Schema.createMap(flatten(schema.getValueType(), false));
            } else {
                flattenedSchema = Schema.createMap(schema.getValueType());
            }
            break;
        case ENUM:
            flattenedSchema = Schema.createEnum(schema.getName(), schema.getDoc(), schema.getNamespace(),
                    schema.getEnumSymbols());
            break;
        case FIXED:
            flattenedSchema = Schema.createFixed(schema.getName(), schema.getDoc(), schema.getNamespace(),
                    schema.getFixedSize());
            break;
        case RECORD:
            flattenedSchema = flattenRecord(schema, shouldPopulateLineage, flattenComplexTypes);
            break;
        case UNION:
            flattenedSchema = flattenUnion(schema, shouldPopulateLineage, flattenComplexTypes);
            break;
        case BOOLEAN:
        case BYTES:
        case DOUBLE:
        case FLOAT:
        case INT:
        case LONG:
        case NULL:
        case STRING:
            // Primitives are simply cloned
            flattenedSchema = Schema.create(schema.getType());
            break;
        default:
            String exceptionMessage = String.format("Schema flattening failed for \"%s\" ", schema);
            LOG.error(exceptionMessage);

            throw new AvroRuntimeException(exceptionMessage);
        }

        // Copy schema metadata
        copyProperties(schema, flattenedSchema);

        return flattenedSchema;
    }

    /***
     * Flatten Record schema: every field is flattened individually and the resulting fields are collected, in
     * order, into a new Record that keeps the name, doc, namespace and error flag of the original
     * @param schema Record Schema to flatten
     * @param shouldPopulateLineage If lineage information should be tagged in the field, this is true when we are
     *                              un-nesting fields
     * @param flattenComplexTypes Flatten complex types recursively other than Record and Option
     * @return Flattened Record Schema
     */
    private Schema flattenRecord(Schema schema, boolean shouldPopulateLineage, boolean flattenComplexTypes) {
        Preconditions.checkNotNull(schema);
        Preconditions.checkArgument(Schema.Type.RECORD.equals(schema.getType()));

        List<Schema.Field> flattenedFields = new ArrayList<>();
        for (Schema.Field oldField : schema.getFields()) {
            // Fields of this record start a fresh lineage and are not inside any Option yet
            flattenedFields.addAll(flattenField(oldField, ImmutableList.<String>of(), shouldPopulateLineage,
                    flattenComplexTypes, Optional.<Schema>absent()));
        }

        Schema flattenedSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(),
                schema.isError());
        flattenedSchema.setFields(flattenedFields);

        return flattenedSchema;
    }

    /***
     * Flatten Union Schema
     * @param schema Union Schema to flatten
     * @param shouldPopulateLineage If lineage information should be tagged in the field, this is true when we are
     *                              un-nesting fields
     * @param flattenComplexTypes Flatten complex types recursively other than Record and Option; when false the
     *                            members of the Union are reused as they are
     * @return Flattened Union Schema
     */
    private Schema flattenUnion(Schema schema, boolean shouldPopulateLineage, boolean flattenComplexTypes) {
        Preconditions.checkNotNull(schema);
        Preconditions.checkArgument(Schema.Type.UNION.equals(schema.getType()));

        List<Schema> flattenedUnionMembers = new ArrayList<>();
        for (Schema oldUnionMember : schema.getTypes()) {
            if (flattenComplexTypes) {
                // Its member might still recursively contain records
                flattenedUnionMembers.add(flatten(oldUnionMember, shouldPopulateLineage, flattenComplexTypes));
            } else {
                flattenedUnionMembers.add(oldUnionMember);
            }
        }

        return Schema.createUnion(flattenedUnionMembers);
    }

    /***
     * Flatten Record field, and compute a list of flattened fields
     *
     * Note: Lineage represents the source path from root for the flattened field. For example, if the original
     * schema is:
     * {
     *    "type" : "record",
     *    "name" : "parentRecordName",
     *    "fields" : [ {
     *      "name" : "parentFieldRecord",
     *      "type" : {
     *        "type" : "record",
     *        "name" : "nestedRecordName",
     *        "fields" : [ {
     *            "name" : "nestedFieldString",
     *            "type" : "string"
     *          }, {
     *            "name" : "nestedFieldInt",
     *            "type" : "int"
     *          } ]
     *       }
     *     }, {
     *      "name" : "parentFieldInt",
     *      "type" : "int"
     *     } ]
     * }
     * The expected output schema is:
     * {
     *    "type" : "record",
     *    "name" : "parentRecordName",
     *    "fields" : [ {
     *      "name" : "parentFieldRecord__nestedFieldString",
     *      "type" : "string",
     *      "flatten_source" : "parentFieldRecord.nestedFieldString"
     *    }, {
     *      "name" : "parentFieldRecord__nestedFieldInt",
     *      "type" : "int",
     *      "flatten_source" : "parentFieldRecord.nestedFieldInt"
     *    }, {
     *      "name" : "parentFieldInt",
     *      "type" : "int"
     *    } ]
     * }
     * Here, 'flatten_source' and field 'name' have been modified to represent their origination from the nested
     * schema; lineage helps to determine that
     *
     * @param f Field to flatten
     * @param parentLineage Parent's lineage represented as a List of Strings
     * @param shouldPopulateLineage If lineage information should be tagged in the field, this is true when we are
     *                              un-nesting fields
     * @param flattenComplexTypes Flatten complex types recursively other than Record and Option
     * @param shouldWrapInOption If present, it is the OPTION (Union) schema the field is being un-nested from; all
     *                           fields un-nested from within an OPTION are made OPTIONs themselves, including
     *                           fields of plain Records nested (at any depth) inside that OPTION
     * @return List of flattened Record fields, never null
     */
    private List<Schema.Field> flattenField(Schema.Field f, ImmutableList<String> parentLineage,
            boolean shouldPopulateLineage, boolean flattenComplexTypes, Optional<Schema> shouldWrapInOption) {
        Preconditions.checkNotNull(f);
        Preconditions.checkNotNull(f.schema());
        Preconditions.checkNotNull(f.name());

        List<Schema.Field> flattenedFields = new ArrayList<>();
        ImmutableList<String> lineage = ImmutableList.<String>builder().addAll(parentLineage.iterator())
                .add(f.name()).build();

        // If field.Type = RECORD, un-nest its fields and return them
        if (Schema.Type.RECORD.equals(f.schema().getType())) {
            for (Schema.Field field : f.schema().getFields()) {
                // Keep the enclosing OPTION (if any): when an ancestor Record may be null, every field
                // un-nested from beneath it may be null as well
                flattenedFields.addAll(
                        flattenField(field, lineage, true, flattenComplexTypes, shouldWrapInOption));
            }
            return flattenedFields;
        }

        // If field.Type = OPTION, un-nest its fields and return them
        Optional<Schema> optionalRecord = isOfOptionType(f.schema());
        if (optionalRecord.isPresent()) {
            for (Schema.Field field : optionalRecord.get().getFields()) {
                flattenedFields.addAll(
                        flattenField(field, lineage, true, flattenComplexTypes, Optional.of(f.schema())));
            }
            return flattenedFields;
        }

        // If field.Type = any-other, copy and return it
        // Compute name and source using lineage
        String flattenName = f.name();
        String flattenSource = StringUtils.EMPTY;
        if (shouldPopulateLineage) {
            flattenName = StringUtils.join(lineage, flattenedNameJoiner);
            flattenSource = StringUtils.join(lineage, flattenedSourceJoiner);
        }

        // Copy field
        Schema flattenedFieldSchema = flatten(f.schema(), shouldPopulateLineage, flattenComplexTypes);
        if (shouldWrapInOption.isPresent()) {
            flattenedFieldSchema = wrapInOption(flattenedFieldSchema, shouldWrapInOption.get());
        }
        Schema.Field field = new Schema.Field(flattenName, flattenedFieldSchema, f.doc(), f.defaultValue(),
                f.order());

        // Lineage is tagged first, then the properties of the original field are carried over
        if (StringUtils.isNotBlank(flattenSource)) {
            field.addProp(FLATTENED_SOURCE_KEY, flattenSource);
        }
        for (Map.Entry<String, JsonNode> entry : f.getJsonProps().entrySet()) {
            field.addProp(entry.getKey(), entry.getValue());
        }
        flattenedFields.add(field);

        return flattenedFields;
    }

    /***
     * Make a field schema optional, placing NULL at the same end of the Union as in the OPTION it was un-nested from
     *
     * If the field schema is already a Union its non-NULL members are re-used instead of wrapping it
     * (Union within Union is not supported), otherwise the schema becomes the single non-NULL member
     * @param fieldSchema Schema of the un-nested field
     * @param optionSchema OPTION (Union) schema the field was un-nested from
     * @return Union Schema that contains NULL plus the non-NULL member(s) of the field schema
     */
    private static Schema wrapInOption(Schema fieldSchema, Schema optionSchema) {
        boolean isNullFirstMember = Schema.Type.NULL.equals(optionSchema.getTypes().get(0).getType());

        List<Schema> unionMembers = new ArrayList<>();
        if (isNullFirstMember) {
            unionMembers.add(Schema.create(Schema.Type.NULL));
        }
        if (Schema.Type.UNION.equals(fieldSchema.getType())) {
            // Drop any NULL member of the original Union, NULL is (re)added exactly once by this method
            for (Schema member : fieldSchema.getTypes()) {
                if (!Schema.Type.NULL.equals(member.getType())) {
                    unionMembers.add(member);
                }
            }
        } else {
            unionMembers.add(fieldSchema);
        }
        if (!isNullFirstMember) {
            unionMembers.add(Schema.create(Schema.Type.NULL));
        }

        return Schema.createUnion(unionMembers);
    }

    /***
     * Check if the Avro Schema is of type OPTION
     * ie. [null, RECORD] or [RECORD, null]
     * @param schema Avro Schema to check
     * @return Optional Avro Record if schema is of type OPTION, absent otherwise
     */
    private static Optional<Schema> isOfOptionType(Schema schema) {
        Preconditions.checkNotNull(schema);

        // If not of type UNION, can't be an OPTION
        if (!Schema.Type.UNION.equals(schema.getType())) {
            return Optional.<Schema>absent();
        }

        // If it does not have exactly two members, can't be an OPTION
        List<Schema> types = schema.getTypes();
        if (null != types && types.size() == 2) {
            Schema first = types.get(0);
            Schema second = types.get(1);

            // One member should be of type NULL and other of type RECORD
            if (Schema.Type.NULL.equals(first.getType()) && Schema.Type.RECORD.equals(second.getType())) {
                return Optional.of(second);
            } else if (Schema.Type.RECORD.equals(first.getType()) && Schema.Type.NULL.equals(second.getType())) {
                return Optional.of(first);
            }
        }

        return Optional.<Schema>absent();
    }

    /***
     * Copy properties from old Avro Schema to new Avro Schema
     * @param oldSchema Old Avro Schema to copy properties from
     * @param newSchema New Avro Schema to copy properties to
     */
    private static void copyProperties(Schema oldSchema, Schema newSchema) {
        Preconditions.checkNotNull(oldSchema);
        Preconditions.checkNotNull(newSchema);

        Map<String, JsonNode> props = oldSchema.getJsonProps();
        copyProperties(props, newSchema);
    }

    /***
     * Copy properties to an Avro Schema
     * @param props Properties to copy to Avro Schema, may be null in which case nothing is copied
     * @param schema Avro Schema to copy properties to
     */
    private static void copyProperties(Map<String, JsonNode> props, Schema schema) {
        Preconditions.checkNotNull(schema);

        // (if null, don't copy but do not throw exception)
        if (null != props) {
            for (Map.Entry<String, JsonNode> prop : props.entrySet()) {
                schema.addProp(prop.getKey(), prop.getValue());
            }
        }
    }
}