org.apache.hive.hcatalog.mapreduce.FileOutputFormatContainer.java Source code

Introduction

Here is the source code for org.apache.hive.hcatalog.mapreduce.FileOutputFormatContainer.java, the file-based storage (RCFile, Text, etc.) implementation of HCatalog's OutputFormatContainer.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hive.hcatalog.mapreduce;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hive.hcatalog.common.ErrorType;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.thrift.TException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * File-based storage (i.e. RCFile, Text, etc.) implementation of OutputFormatContainer.
 * This implementation supports the following HCatalog features: partitioning, dynamic partitioning, Hadoop Archiving, etc.
 */
class FileOutputFormatContainer extends OutputFormatContainer {

    /**
     * @param of base OutputFormat to contain
     */
    public FileOutputFormatContainer(
            org.apache.hadoop.mapred.OutputFormat<? super WritableComparable<?>, ? super Writable> of) {
        super(of);
    }

    @Override
    public RecordWriter<WritableComparable<?>, HCatRecord> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        //This needs to be set manually; under normal circumstances the MR Task does it.
        setWorkOutputPath(context);

        //Configure the output key and value classes.
        // This is required for writing null as the key for file-based tables.
        context.getConfiguration().set("mapred.output.key.class", NullWritable.class.getName());
        String jobInfoString = context.getConfiguration().get(HCatConstants.HCAT_KEY_OUTPUT_INFO);
        OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil.deserialize(jobInfoString);
        StorerInfo storeInfo = jobInfo.getTableInfo().getStorerInfo();
        HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), storeInfo);
        Class<? extends SerDe> serde = storageHandler.getSerDeClass();
        SerDe sd = (SerDe) ReflectionUtils.newInstance(serde, context.getConfiguration());
        context.getConfiguration().set("mapred.output.value.class", sd.getSerializedClass().getName());

        RecordWriter<WritableComparable<?>, HCatRecord> rw;
        if (HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed()) {
            // When Dynamic partitioning is used, the RecordWriter instance initialized here isn't used. Can use null.
            // (That's because records can't be written until the values of the dynamic partitions are deduced.
            // By that time, a new local instance of RecordWriter, with the correct output-path, will be constructed.)
            rw = new DynamicPartitionFileRecordWriterContainer((org.apache.hadoop.mapred.RecordWriter) null,
                    context);
        } else {
            Path parentDir = new Path(context.getConfiguration().get("mapred.work.output.dir"));
            Path childPath = new Path(parentDir,
                    FileOutputFormat.getUniqueName(new JobConf(context.getConfiguration()),
                            context.getConfiguration().get("mapreduce.output.basename", "part")));

            rw = new StaticPartitionFileRecordWriterContainer(getBaseOutputFormat().getRecordWriter(
                    parentDir.getFileSystem(context.getConfiguration()), new JobConf(context.getConfiguration()),
                    childPath.toString(), InternalUtil.createReporter(context)), context);
        }
        return rw;
    }

    @Override
    public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
        OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());
        IMetaStoreClient client = null;
        try {
            HiveConf hiveConf = HCatUtil.getHiveConf(context.getConfiguration());
            client = HCatUtil.getHiveMetastoreClient(hiveConf);
            handleDuplicatePublish(context, jobInfo, client, new Table(jobInfo.getTableInfo().getTable()));
        } catch (MetaException e) {
            throw new IOException(e);
        } catch (TException e) {
            throw new IOException(e);
        } finally {
            HCatUtil.closeHiveClientQuietly(client);
        }

        if (!jobInfo.isDynamicPartitioningUsed()) {
            JobConf jobConf = new JobConf(context.getConfiguration());
            getBaseOutputFormat().checkOutputSpecs(null, jobConf);
            //checkOutputSpecs might have set some properties; make sure the context reflects them
            HCatUtil.copyConf(jobConf, context.getConfiguration());
        }
    }

    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
        //This needs to be set manually; under normal circumstances the MR Task does it.
        setWorkOutputPath(context);
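        // Wrap the base OutputFormat's committer for static partitioning; for dynamic
        // partitioning there is no single base committer to wrap, so null is passed and
        // FileOutputCommitterContainer manages the output itself.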
        return new FileOutputCommitterContainer(context,
                HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed() ? null
                        : new JobConf(context.getConfiguration()).getOutputCommitter());
    }

    /**
     * Handles a duplicate publish of a partition, or of data into an unpartitioned table,
     * when the table is immutable.
     *
     * For partitioned tables, fails if the partition already exists.
     * For unpartitioned tables, fails if files are already present in the table directory.
     * For dynamic partition publish, does nothing; the check would need to be done at record-writer time.
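     * For example, publishing partition (ds='20120401') into an immutable partitioned table
     * that already contains that partition fails with ERROR_DUPLICATE_PARTITION.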
     * @param context the job
     * @param outputInfo the output info
     * @param client the metastore client
     * @param table the table being written to
     * @throws IOException
     * @throws org.apache.hadoop.hive.metastore.api.MetaException
     * @throws org.apache.thrift.TException
     */
    private static void handleDuplicatePublish(JobContext context, OutputJobInfo outputInfo,
            IMetaStoreClient client, Table table)
            throws IOException, MetaException, TException, NoSuchObjectException {

        /*
         * For a fully specified partition, strictly check the metastore for the partition's existence.
         * For unpartitioned tables, check for files in the table directory.
         * For partially specified partitions (dynamic partitioning):
         *    file checks would be needed at the start of each partition write;
         *    metadata checks can get potentially very expensive (fat conf) if
         *    there are a large number of partitions that match the partial specification.
         */

        if (!table.isImmutable()) {
            return;
        }
        if (table.getPartitionKeys().size() > 0) {
            if (!outputInfo.isDynamicPartitioningUsed()) {
                List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
                // fully-specified partition
                List<String> currentParts = client.listPartitionNames(outputInfo.getDatabaseName(),
                        outputInfo.getTableName(), partitionValues, (short) 1);

                if (currentParts.size() > 0) {
                    // If a table is partitioned and immutable, then the presence
                    // of the partition alone is enough to throw an error - we do
                    // not need to check for emptiness to decide to throw an error
                    throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION);
                }
            }
        } else {
            List<String> partitionValues = getPartitionValueList(table, outputInfo.getPartitionValues());
            // non-partitioned table

            Path tablePath = new Path(table.getTTable().getSd().getLocation());
            FileSystem fs = tablePath.getFileSystem(context.getConfiguration());

            if (!MetaStoreUtils.isDirEmpty(fs, tablePath)) {
                throw new HCatException(ErrorType.ERROR_NON_EMPTY_TABLE,
                        table.getDbName() + "." + table.getTableName());
            }
        }
    }

    /**
     * Convert the partition value map to a value list in the partition key order.
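     * For example, for partition keys (ds, region) and a value map of {region=US, ds=20120401},
     * the returned list is [20120401, US]; key names are looked up in lower case.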
     * @param table the table being written to
     * @param valueMap the partition value map
     * @return the partition value list
     * @throws java.io.IOException
     */
    static List<String> getPartitionValueList(Table table, Map<String, String> valueMap) throws IOException {

        if (valueMap.size() != table.getPartitionKeys().size()) {
            throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, "Table " + table.getTableName()
                    + " has " + table.getPartitionKeys().size() + " partition keys, got " + valueMap.size());
        }

        List<String> values = new ArrayList<String>();

        for (FieldSchema schema : table.getPartitionKeys()) {
            String value = valueMap.get(schema.getName().toLowerCase());

            if (value == null) {
                throw new HCatException(ErrorType.ERROR_MISSING_PARTITION_KEY,
                        "Key " + schema.getName() + " of table " + table.getTableName());
            }

            values.add(value);
        }

        return values;
    }

    static void setWorkOutputPath(TaskAttemptContext context) throws IOException {
        String outputPath = context.getConfiguration().get("mapred.output.dir");
        //We need to do this to get the task work path and set it for the mapred implementation,
        //since it can't be done automatically because of the mapreduce -> mapred abstraction.
        if (outputPath != null)
            context.getConfiguration().set("mapred.work.output.dir",
                    new FileOutputCommitter(new Path(outputPath), context).getWorkPath().toString());
    }
}
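
Example Usage

FileOutputFormatContainer is package-private and is not used directly; HCatOutputFormat wraps the table's underlying OutputFormat in it for file-based storage formats (see the class javadoc above). Below is a minimal sketch of a MapReduce job that writes HCatRecords through HCatOutputFormat, which in turn exercises the getRecordWriter, checkOutputSpecs, and getOutputCommitter paths shown in this file. The database name ("default"), table name ("my_table"), the comma-separated input, and the two-column table layout are illustrative assumptions, and exact HCatOutputFormat method signatures can vary slightly between Hive releases.

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

public class HCatWriteExample {

    // Parses "word,count" lines and emits them as HCatRecords; the key is ignored,
    // which is why FileOutputFormatContainer forces NullWritable as the output key class.
    public static class WriteMapper extends Mapper<LongWritable, Text, NullWritable, DefaultHCatRecord> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            DefaultHCatRecord record = new DefaultHCatRecord(2);
            record.set(0, fields[0]);
            record.set(1, Integer.parseInt(fields[1]));
            context.write(NullWritable.get(), record);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "hcat-write-example");
        job.setJarByClass(HCatWriteExample.class);

        job.setMapperClass(WriteMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // HCatOutputFormat wraps the table's OutputFormat in an OutputFormatContainer;
        // for RCFile/text tables that container is FileOutputFormatContainer.
        job.setOutputFormatClass(HCatOutputFormat.class);
        Map<String, String> partitionValues = null; // null => unpartitioned table or dynamic partitioning
        HCatOutputFormat.setOutput(job, OutputJobInfo.create("default", "my_table", partitionValues));
        HCatSchema schema = HCatOutputFormat.getTableSchema(job.getConfiguration());
        HCatOutputFormat.setSchema(job, schema);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(DefaultHCatRecord.class);
        job.setNumReduceTasks(0);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

For a static partition publish, the partition-value map would carry the partition key/value pairs instead of null, and checkOutputSpecs above would reject a duplicate publish into an immutable table via handleDuplicatePublish.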