org.apache.crunch.io.CrunchOutputs.java Source code

Introduction

Here is the source code for org.apache.crunch.io.CrunchOutputs.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.io;

import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.hadoop.mapreduce.TaskAttemptContextFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.util.ReflectionUtils;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * An analogue of {@link CrunchInputs} for handling multiple {@code OutputFormat} instances
 * writing to multiple files within a single MapReduce job.
 *
 * <p>Named outputs are registered at job-setup time via {@code addNamedOutput}, which appends one
 * record per output to the {@link #CRUNCH_OUTPUTS} configuration property. Inside a task, an
 * instance of this class parses that property and lazily creates one {@code RecordWriter} per
 * named output the first time it is written to.
 *
 * <p>Only the record-writer lookup is {@code synchronized}; the task-context cache and
 * {@link #close()} are not, so instances should not be shared between threads.
 *
 * @param <K> key type written by the task
 * @param <V> value type written by the task
 */
public class CrunchOutputs<K, V> {
    public static final String CRUNCH_OUTPUTS = "crunch.outputs.dir";

    // Serialized layout of CRUNCH_OUTPUTS:
    //   name;bundle;keyClass;valueClass,name;bundle;keyClass;valueClass,...
    private static final char RECORD_SEP = ',';
    private static final char FIELD_SEP = ';';
    private static final Joiner JOINER = Joiner.on(FIELD_SEP);
    private static final Splitter SPLITTER = Splitter.on(FIELD_SEP);
    private static final Splitter RECORD_SPLITTER = Splitter.on(RECORD_SEP);

    /**
     * Registers a named output on the job using a bundle built from the given output format class.
     *
     * @param job the job whose configuration receives the named output
     * @param name the identifier later passed to {@link #write(String, Object, Object)}
     * @param outputFormatClass the output format used for this named output
     * @param keyClass the output key class
     * @param valueClass the output value class
     * @throws IllegalArgumentException if {@code name} contains a separator character
     */
    public static void addNamedOutput(Job job, String name, Class<? extends OutputFormat> outputFormatClass,
            Class keyClass, Class valueClass) {
        addNamedOutput(job, name, FormatBundle.forOutput(outputFormatClass), keyClass, valueClass);
    }

    /**
     * Registers a named output on the job by appending a serialized record to the
     * {@link #CRUNCH_OUTPUTS} configuration property.
     *
     * @param job the job whose configuration receives the named output
     * @param name the identifier later passed to {@link #write(String, Object, Object)}
     * @param outputBundle the output format bundle used for this named output
     * @param keyClass the output key class
     * @param valueClass the output value class
     * @throws IllegalArgumentException if {@code name} contains a separator character
     */
    public static void addNamedOutput(Job job, String name, FormatBundle<? extends OutputFormat> outputBundle,
            Class keyClass, Class valueClass) {
        // A separator inside the name would shift or split the fields when the record is parsed
        // back inside the task, so reject it here where the caller can still react.
        if (name.indexOf(RECORD_SEP) >= 0 || name.indexOf(FIELD_SEP) >= 0) {
            throw new IllegalArgumentException("Named output '" + name + "' must not contain '"
                    + RECORD_SEP + "' or '" + FIELD_SEP + "'");
        }
        Configuration conf = job.getConfiguration();
        String record = JOINER.join(name, outputBundle.serialize(), keyClass.getName(), valueClass.getName());
        String existing = conf.get(CRUNCH_OUTPUTS);
        conf.set(CRUNCH_OUTPUTS, existing == null ? record : existing + RECORD_SEP + record);
    }

    /** Deserialized description of a single named output. */
    private static class OutputConfig<K, V> {
        public final FormatBundle<OutputFormat<K, V>> bundle;
        public final Class<K> keyClass;
        public final Class<V> valueClass;

        public OutputConfig(FormatBundle<OutputFormat<K, V>> bundle, Class<K> keyClass, Class<V> valueClass) {
            this.bundle = bundle;
            this.keyClass = keyClass;
            this.valueClass = valueClass;
        }
    }

    /**
     * Parses the {@link #CRUNCH_OUTPUTS} property of the context's configuration.
     *
     * @param context the task context whose configuration is read
     * @return a map from output name to its configuration; empty if the property is unset or empty
     * @throws CrunchRuntimeException if a recorded key or value class cannot be loaded
     */
    private static Map<String, OutputConfig> getNamedOutputs(TaskInputOutputContext<?, ?, ?, ?> context) {
        Map<String, OutputConfig> out = Maps.newHashMap();
        Configuration conf = context.getConfiguration();
        String serialized = conf.get(CRUNCH_OUTPUTS);
        // A job without named outputs leaves the property unset; splitting null would throw and
        // splitting "" would yield one bogus empty record.
        if (serialized == null || serialized.isEmpty()) {
            return out;
        }
        for (String record : RECORD_SPLITTER.split(serialized)) {
            List<String> fields = Lists.newArrayList(SPLITTER.split(record));
            String name = fields.get(0);
            FormatBundle<OutputFormat> bundle = FormatBundle.fromSerialized(fields.get(1), OutputFormat.class);
            try {
                Class<?> keyClass = Class.forName(fields.get(2));
                Class<?> valueClass = Class.forName(fields.get(3));
                out.put(name, new OutputConfig(bundle, keyClass, valueClass));
            } catch (ClassNotFoundException e) {
                throw new CrunchRuntimeException(e);
            }
        }
        return out;
    }

    private static final String BASE_OUTPUT_NAME = "mapreduce.output.basename";
    private static final String COUNTERS_GROUP = CrunchOutputs.class.getName();

    private final TaskInputOutputContext<?, ?, K, V> baseContext;
    private final Map<String, OutputConfig> namedOutputs;
    private final Map<String, RecordWriter<K, V>> recordWriters;
    private final Map<String, TaskAttemptContext> taskContextCache;

    /**
     * Creates and initializes multiple outputs support,
     * it should be instantiated in the Mapper/Reducer setup method.
     *
     * @param context the TaskInputOutputContext object
     */
    public CrunchOutputs(TaskInputOutputContext<?, ?, K, V> context) {
        this.baseContext = context;
        this.namedOutputs = getNamedOutputs(context);
        this.recordWriters = Maps.newHashMap();
        this.taskContextCache = Maps.newHashMap();
    }

    /**
     * Writes a key/value pair to the given named output and increments the counter named after
     * the output in this class's counter group.
     *
     * @param namedOutput the name the output was registered under
     * @param key the key to write
     * @param value the value to write
     * @throws IllegalArgumentException if no output was registered under {@code namedOutput}
     * @throws IOException if the record writer cannot be created or the write fails
     * @throws InterruptedException if the underlying writer is interrupted
     */
    @SuppressWarnings("unchecked")
    public void write(String namedOutput, K key, V value) throws IOException, InterruptedException {
        if (!namedOutputs.containsKey(namedOutput)) {
            throw new IllegalArgumentException("Undefined named output '" + namedOutput + "'");
        }
        TaskAttemptContext taskContext = getContext(namedOutput);
        baseContext.getCounter(COUNTERS_GROUP, namedOutput).increment(1);
        getRecordWriter(taskContext, namedOutput).write(key, value);
    }

    /**
     * Closes every record writer that was opened by {@link #write(String, Object, Object)}.
     *
     * <p>Each writer is closed with the task context it was created from. An {@code IOException}
     * from one writer does not prevent the remaining writers from being closed; the first such
     * exception is rethrown once all writers have been attempted.
     *
     * @throws IOException the first I/O failure encountered while closing the writers
     * @throws InterruptedException if closing a writer is interrupted (propagated immediately)
     */
    public void close() throws IOException, InterruptedException {
        IOException firstFailure = null;
        for (Map.Entry<String, RecordWriter<K, V>> entry : recordWriters.entrySet()) {
            // write() always populates the context cache before creating the writer, so a context
            // is expected here; fall back to the base context defensively.
            TaskAttemptContext taskContext = taskContextCache.get(entry.getKey());
            try {
                entry.getValue().close(taskContext != null ? taskContext : baseContext);
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Returns the task attempt context for a named output, building and caching it on first use.
     *
     * @param nameOutput the name of a registered output
     * @return the cached or newly created context for that output
     * @throws IOException if the temporary job used to build the configuration cannot be created
     */
    private TaskAttemptContext getContext(String nameOutput) throws IOException {
        TaskAttemptContext taskContext = taskContextCache.get(nameOutput);
        if (taskContext != null) {
            return taskContext;
        }

        // The following trick leverages the instantiation of a record writer via
        // the job thus supporting arbitrary output formats.
        OutputConfig outConfig = namedOutputs.get(nameOutput);
        // Work on a copy so per-output settings do not leak into the task's own configuration.
        Configuration conf = new Configuration(baseContext.getConfiguration());
        Job job = new Job(conf);
        job.getConfiguration().set("crunch.namedoutput", nameOutput);
        job.setOutputFormatClass(outConfig.bundle.getFormatClass());
        job.setOutputKeyClass(outConfig.keyClass);
        job.setOutputValueClass(outConfig.valueClass);
        outConfig.bundle.configure(job.getConfiguration());
        taskContext = TaskAttemptContextFactory.create(job.getConfiguration(), baseContext.getTaskAttemptID());

        taskContextCache.put(nameOutput, taskContext);
        return taskContext;
    }

    /**
     * Returns the record writer for a named output, creating and caching it on first use.
     *
     * @param taskContext the context built for this named output
     * @param namedOutput the name of a registered output
     * @return the cached or newly created record writer
     * @throws IOException if the output format class cannot be found or the writer cannot be created
     * @throws InterruptedException if creating the writer is interrupted
     */
    private synchronized RecordWriter<K, V> getRecordWriter(TaskAttemptContext taskContext, String namedOutput)
            throws IOException, InterruptedException {
        // look for record-writer in the cache
        RecordWriter<K, V> writer = recordWriters.get(namedOutput);

        // If not in cache, create a new one
        if (writer == null) {
            // get the record writer from context output format
            // NOTE(review): the base-name property presumably makes file-based formats name their
            // files after the named output -- confirm against the output formats in use.
            taskContext.getConfiguration().set(BASE_OUTPUT_NAME, namedOutput);
            try {
                OutputFormat format = ReflectionUtils.newInstance(taskContext.getOutputFormatClass(),
                        taskContext.getConfiguration());
                writer = format.getRecordWriter(taskContext);
            } catch (ClassNotFoundException e) {
                throw new IOException(e);
            }
            recordWriters.put(namedOutput, writer);
        }

        return writer;
    }
}