org.apache.gobblin.data.management.conversion.hive.avro.AvroSchemaManager.java Source code

Introduction

Here is the source code for org.apache.gobblin.data.management.conversion.hive.avro.AvroSchemaManager.java.
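
Before the listing, a minimal usage sketch, assuming a reachable Hadoop FileSystem and a Hive Table already fetched from the metastore. The wrapper class name and the job id below are hypothetical placeholders, not part of Gobblin:

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.data.management.conversion.hive.avro.AvroSchemaManager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Table;

public class AvroSchemaManagerUsage {

    /** Resolves and reads a table's avro schema, then removes the temporary schema files. */
    public static Schema resolveSchema(Table table) throws IOException {
        State state = new State();
        state.setProp(ConfigurationKeys.JOB_ID_KEY, "job_example_123"); // hypothetical job id
        FileSystem fs = FileSystem.get(new Configuration());

        AvroSchemaManager manager = new AvroSchemaManager(fs, state);
        try {
            Path schemaUrl = manager.getSchemaUrl(table);
            // Read the schema back before cleanup, since the url may point into the temp dir
            return AvroSchemaManager.getSchemaFromUrl(schemaUrl, fs);
        } finally {
            manager.cleanupTempSchemas(); // deletes the job-scoped temporary schema directory
        }
    }
}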

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.data.management.conversion.hive.avro;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;

import org.apache.avro.Schema;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.serde2.avro.TypeInfoToSchema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.hash.Hashing;

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.data.management.conversion.hive.query.HiveAvroORCQueryGenerator;
import org.apache.gobblin.hive.avro.HiveAvroSerDeManager;
import org.apache.gobblin.util.AvroUtils;
import org.apache.gobblin.util.HadoopUtils;

/**
 * The Avro schema for a {@link Partition} or {@link Table} can be available at multiple locations. This class decides
 * which schema to use. It also creates a temporary schema file on the {@link FileSystem}.
 *
 * <ul>
 * <li>1. The {@link Schema} can be set as a literal in the serde info</li>
 * <li>2. The {@link Schema} url can be set as a property in the serde info</li>
 * <li>3. The {@link Schema} can be inferred from the physical data location of the {@link Table} or {@link Partition}</li>
 * </ul>
 *
 * Callers request the schema url using {@link #getSchemaUrl(Partition)} or {@link #getSchemaUrl(Table)}.
 * <ul>
 * <li>In case (1.), the literal is written as a {@link Schema} file under {@link #schemaDir}. The {@link Path} to this
 * file is used as the {@link Schema} url</li>
 * <li>In case (2.), the url itself is used as the {@link Schema} url</li>
 * <li>In case (3.), a {@link Schema} file is created under {@link #schemaDir} for the {@link Schema} of the latest
 * data file</li>
 * </ul>
 *
 * In all three cases the mapping of {@link Schema} to temporary schema file path is cached. If multiple
 * {@link Partition}s have the same {@link Schema}, a duplicate schema file is not created; the already existing
 * {@link Schema} url for that {@link Schema} is reused.
 */
@Slf4j
public class AvroSchemaManager {

    private static final String HIVE_SCHEMA_TEMP_DIR_PATH_KEY = "hive.schema.dir";
    private static final String DEFAULT_HIVE_SCHEMA_TEMP_DIR_PATH = "/tmp/gobblin_schemas";

    private final FileSystem fs;
    /**
     * A mapping of {@link Schema} hash to its {@link Path} on {@link FileSystem}
     */
    private final Map<String, Path> schemaPaths;

    /**
     * A temporary directory to hold all schema files. The path is job-id specific,
     * so deleting it will not affect other job executions.
     */
    private final Path schemaDir;

    public AvroSchemaManager(FileSystem fs, State state) {
        this.fs = fs;
        this.schemaPaths = Maps.newHashMap();
        this.schemaDir = new Path(
                state.getProp(HIVE_SCHEMA_TEMP_DIR_PATH_KEY, DEFAULT_HIVE_SCHEMA_TEMP_DIR_PATH),
                state.getProp(ConfigurationKeys.JOB_ID_KEY));
    }

    /**
     * Get the url to <code>table</code>'s avro {@link Schema} file.
     *
     * @param table whose avro schema is to be returned
     * @return a {@link Path} to the table's avro {@link Schema} file.
     */
    public Path getSchemaUrl(Table table) throws IOException {
        return getSchemaUrl(table.getTTable().getSd());
    }

    /**
     * Get the url to <code>partition</code>'s avro {@link Schema} file.
     *
     * @param partition whose avro schema is to be returned
     * @return a {@link Path} to the partition's avro {@link Schema} file.
     */
    public Path getSchemaUrl(Partition partition) throws IOException {
        return getSchemaUrl(partition.getTPartition().getSd());
    }

    /**
     * Delete the temporary {@link #schemaDir}
     */
    public void cleanupTempSchemas() throws IOException {
        HadoopUtils.deleteIfExists(this.fs, this.schemaDir, true);
    }

    public static Schema getSchemaFromUrl(Path schemaUrl, FileSystem fs) throws IOException {
        return AvroUtils.parseSchemaFromFile(schemaUrl, fs);
    }

    private Path getSchemaUrl(StorageDescriptor sd) throws IOException {

        String schemaString = StringUtils.EMPTY;
        try {
            // Try to fetch from SCHEMA URL
            Map<String, String> serdeParameters = sd.getSerdeInfo().getParameters();
            if (serdeParameters != null && serdeParameters.containsKey(HiveAvroSerDeManager.SCHEMA_URL)) {
                String schemaUrl = serdeParameters.get(HiveAvroSerDeManager.SCHEMA_URL);
                if (schemaUrl.startsWith("http")) {
                    // Fetch schema literal via HTTP GET if scheme is http(s)
                    schemaString = IOUtils.toString(new URI(schemaUrl), StandardCharsets.UTF_8);
                    log.debug("Schema string is: " + schemaString);
                    Schema schema = HiveAvroORCQueryGenerator.readSchemaFromString(schemaString);

                    return getOrGenerateSchemaFile(schema);
                } else {
                    // .. else the url points to a file on HDFS or the local filesystem
                    return new Path(schemaUrl);
                }
            }
            // Try to fetch from SCHEMA LITERAL
            else if (serdeParameters != null && serdeParameters.containsKey(HiveAvroSerDeManager.SCHEMA_LITERAL)) {
                schemaString = serdeParameters.get(HiveAvroSerDeManager.SCHEMA_LITERAL);
                log.debug("Schema string is: " + schemaString);
                Schema schema = HiveAvroORCQueryGenerator.readSchemaFromString(schemaString);

                return getOrGenerateSchemaFile(schema);
            } else { // Generate schema from the Hive column metadata
                List<FieldSchema> fields = sd.getCols();
                List<String> colNames = fields.stream().map(FieldSchema::getName).collect(Collectors.toList());
                List<TypeInfo> typeInfos = fields.stream()
                        .map(field -> TypeInfoUtils.getTypeInfoFromTypeString(field.getType()))
                        .collect(Collectors.toList());
                List<String> comments = fields.stream().map(FieldSchema::getComment).collect(Collectors.toList());
                Schema schema = new TypeInfoToSchema().convert(colNames, typeInfos, comments, null, null, null);
                return getOrGenerateSchemaFile(schema);
            }
        } catch (URISyntaxException e) {
            log.error("Failed to parse the schema url as a URI. Falling back to the schema at the data location.", e);
        }

        // Fall back to inferring the schema from the data files at the storage location
        Schema schema = AvroUtils.getDirectorySchema(new Path(sd.getLocation()), this.fs, true);

        if (schema == null) {
            throw new SchemaNotFoundException("Failed to get avro schema");
        }

        return getOrGenerateSchemaFile(schema);
    }

    /**
     * If a url for this schema already exists, return it. If not, create a new temporary schema file and return its url.
     */
    private Path getOrGenerateSchemaFile(Schema schema) throws IOException {

        Preconditions.checkNotNull(schema, "Avro Schema should not be null");

        String hashedSchema = Hashing.sha256().hashString(schema.toString(), StandardCharsets.UTF_8).toString();

        if (!this.schemaPaths.containsKey(hashedSchema)) {
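            // The file name is just a millisecond timestamp; uniqueness comes from the
            // hash-keyed cache, which writes each distinct schema at most once per job.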

            Path schemaFilePath = new Path(this.schemaDir, System.currentTimeMillis() + ".avsc");
            AvroUtils.writeSchemaToFile(schema, schemaFilePath, fs, true);

            this.schemaPaths.put(hashedSchema, schemaFilePath);
        }

        return this.schemaPaths.get(hashedSchema);
    }
}
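
The final else branch of getSchemaUrl(StorageDescriptor) builds an Avro schema directly from the Hive column metadata. Below is a standalone sketch of that conversion with hypothetical columns; the class name is made up for illustration, and the three trailing null arguments (presumably namespace, record name, and doc) mirror the call in the listing:

import java.util.Arrays;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.avro.TypeInfoToSchema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class HiveToAvroSketch {

    public static void main(String[] args) {
        // Hypothetical Hive columns: (id bigint, attrs map<string,string>)
        List<String> colNames = Arrays.asList("id", "attrs");
        List<TypeInfo> typeInfos = Arrays.asList(
                TypeInfoUtils.getTypeInfoFromTypeString("bigint"),
                TypeInfoUtils.getTypeInfoFromTypeString("map<string,string>"));
        List<String> comments = Arrays.asList("primary key", "free-form attributes");

        // Same conversion as the else branch in getSchemaUrl(StorageDescriptor)
        Schema schema = new TypeInfoToSchema().convert(colNames, typeInfos, comments, null, null, null);
        System.out.println(schema.toString(true));
    }
}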