cascading.flow.hadoop.MapReduceFlow.java Source code

Introduction

Below is the full source of cascading.flow.hadoop.MapReduceFlow.java from the Cascading project. MapReduceFlow lets a hand-built Hadoop MapReduce job, pre-configured through a JobConf, be scheduled alongside ordinary Flows in a Cascade.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import cascading.CascadingException;
import cascading.flow.FlowStep;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.process.FlowStepGraph;
import cascading.scheme.NullScheme;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;

/**
 * Class MapReduceFlow is a {@link cascading.flow.hadoop.HadoopFlow} subclass that supports custom MapReduce jobs
 * pre-configured via the {@link JobConf} object.
 * <p>
 * Use this class to allow custom MapReduce jobs to participate in the {@link cascading.cascade.Cascade} scheduler. If
 * other Flow instances in the Cascade share resources with this Flow instance, all participants will be scheduled
 * according to their dependencies (topologically).
 * <p>
 * Set the parameter {@code deleteSinkOnInit} to {@code true} if the output path in the JobConf should be deleted
 * before the MapReduce job executes.
 * <p>
 * MapReduceFlow assumes the underlying input and output paths are compatible with the {@link Hfs} Tap.
 * <p>
 * If the configured JobConf instance uses some identifier other than Hadoop FS paths, override the
 * {@link #createSources(org.apache.hadoop.mapred.JobConf)}, {@link #createSinks(org.apache.hadoop.mapred.JobConf)}, and
 * {@link #createTraps(org.apache.hadoop.mapred.JobConf)} methods to resolve the configured identifiers into
 * usable {@link Tap} instances. By default, createTraps returns an empty collection and can usually be left as is.
 * <p>
 * MapReduceFlow accepts jobs written against both the org.apache.hadoop.mapred.* and org.apache.hadoop.mapreduce.* APIs.
 */
public class MapReduceFlow extends HadoopFlow {
    /** Field deleteSinkOnInit; when true, the configured output path is deleted before the job executes */
    protected boolean deleteSinkOnInit = false;

    /**
     * Constructor MapReduceFlow creates a new MapReduceFlow instance.
     *
     * @param jobConf of type JobConf
     */
    @ConstructorProperties({ "jobConf" })
    public MapReduceFlow(JobConf jobConf) {
        this(jobConf.getJobName(), jobConf, false);
    }

    /**
     * Constructor MapReduceFlow creates a new MapReduceFlow instance.
     *
     * @param jobConf          of type JobConf
     * @param deleteSinkOnInit of type boolean
     */
    @ConstructorProperties({ "jobConf", "deleteSinkOnInit" })
    public MapReduceFlow(JobConf jobConf, boolean deleteSinkOnInit) {
        this(jobConf.getJobName(), jobConf, deleteSinkOnInit);
    }

    /**
     * Constructor MapReduceFlow creates a new MapReduceFlow instance.
     *
     * @param name    of type String
     * @param jobConf of type JobConf
     */
    @ConstructorProperties({ "name", "jobConf" })
    public MapReduceFlow(String name, JobConf jobConf) {
        this(name, jobConf, false);
    }

    /**
     * Constructor MapReduceFlow creates a new MapReduceFlow instance.
     *
     * @param name             of type String
     * @param jobConf          of type JobConf
     * @param deleteSinkOnInit of type boolean
     */
    @ConstructorProperties({ "name", "jobConf", "deleteSinkOnInit" })
    public MapReduceFlow(String name, JobConf jobConf, boolean deleteSinkOnInit) {
        this(name, jobConf, deleteSinkOnInit, true);
    }

    /**
     * Constructor MapReduceFlow creates a new MapReduceFlow instance.
     *
     * @param name             of type String
     * @param jobConf          of type JobConf
     * @param deleteSinkOnInit of type boolean
     * @param stopJobsOnExit   of type boolean
     */
    @ConstructorProperties({ "name", "jobConf", "deleteSinkOnInit", "stopJobsOnExit" })
    public MapReduceFlow(String name, JobConf jobConf, boolean deleteSinkOnInit, boolean stopJobsOnExit) {
        super(HadoopUtil.getPlatformInfo(), new Properties(), jobConf, name, null);
        this.deleteSinkOnInit = deleteSinkOnInit;
        this.stopJobsOnExit = stopJobsOnExit;

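        // resolve the job's configured input and output paths into Taps, then build the single-step graph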
        setSources(createSources(jobConf));
        setSinks(createSinks(jobConf));
        setTraps(createTraps(jobConf));
        setFlowStepGraph(makeStepGraph(jobConf));
        initSteps();

        initializeNewJobsMap();
    }

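    /**
     * Wraps the given JobConf in a single {@link MapReduceFlowStep} and returns a one-vertex step graph;
     * a pre-configured MapReduce job always corresponds to exactly one Flow step.
     */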
    private FlowStepGraph makeStepGraph(JobConf jobConf) {
        FlowStepGraph flowStepGraph = new FlowStepGraph();

        Tap sink = getSinksCollection().iterator().next();
        FlowStep<JobConf> step = new MapReduceFlowStep(getName(), sink.toString(), jobConf, sink);

        flowStepGraph.addVertex(step);

        return flowStepGraph;
    }

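    /**
     * Creates one source Tap per input path configured on the job. The old mapred API configuration is
     * consulted first; if it yields no paths, the new mapreduce API configuration is read through a
     * temporary {@link Job} instance.
     */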
    protected Map<String, Tap> createSources(JobConf jobConf) {
        Path[] paths = FileInputFormat.getInputPaths(jobConf);

        if (paths.length == 0) {
            try {
                paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
            } catch (IOException exception) {
                throw new CascadingException(exception);
            }
        }

        // fail fast if neither API yields an input path; an empty sources map would
        // only surface as a more obscure error later in Flow initialization
        if (paths.length == 0)
            throw new CascadingException("unable to determine input paths from the given JobConf");

        Map<String, Tap> taps = new HashMap<String, Tap>();

        for (Path path : paths)
            taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

        return taps;
    }

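    /**
     * Creates a single sink Tap from the output path configured on the job, checking the old mapred API
     * first and falling back to the new mapreduce API. When deleteSinkOnInit is set, the Tap is created
     * with {@link SinkMode#REPLACE} so the path is deleted on Flow initialization.
     */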
    protected Map<String, Tap> createSinks(JobConf jobConf) {
        Map<String, Tap> taps = new HashMap<String, Tap>();

        Path path = FileOutputFormat.getOutputPath(jobConf);

        if (path == null) {
            try {
                path = org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.getOutputPath(new Job(jobConf));
            } catch (IOException exception) {
                throw new CascadingException(exception);
            }
        }

        // fail fast rather than NPE below when the job has no output path configured
        if (path == null)
            throw new CascadingException("unable to determine the output path from the given JobConf");

        taps.put(path.toString(),
                new Hfs(new NullScheme(), path.toString(), deleteSinkOnInit ? SinkMode.REPLACE : SinkMode.KEEP));

        return taps;
    }

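    /** Returns an empty trap map by default; override if this Flow should supply trap Taps. */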
    protected Map<String, Tap> createTraps(JobConf jobConf) {
        return new HashMap<String, Tap>();
    }
}
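
Example

As a minimal usage sketch, the class below configures a plain Hadoop job through the old mapred API, wraps it in a MapReduceFlow, and runs it via a Cascade. The identity mapper/reducer, the class name MapReduceFlowExample, and the input/output paths are illustrative choices for this sketch, not part of the Cascading source above.

import cascading.cascade.CascadeConnector;
import cascading.flow.Flow;
import cascading.flow.hadoop.MapReduceFlow;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class MapReduceFlowExample {
    public static void main(String[] args) {
        // A pass-through job: the identity mapper and reducer copy records unchanged.
        JobConf jobConf = new JobConf();
        jobConf.setJobName("identity copy");
        jobConf.setMapperClass(IdentityMapper.class);
        jobConf.setReducerClass(IdentityReducer.class);
        jobConf.setInputFormat(TextInputFormat.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
        jobConf.setOutputKeyClass(LongWritable.class);
        jobConf.setOutputValueClass(Text.class);

        // Placeholder paths; createSources/createSinks resolve these into Hfs Taps.
        FileInputFormat.setInputPaths(jobConf, new Path("some/input/path"));
        FileOutputFormat.setOutputPath(jobConf, new Path("some/output/path"));

        // deleteSinkOnInit = true maps the sink to SinkMode.REPLACE, so the
        // output path is deleted before the job runs.
        Flow<JobConf> flow = new MapReduceFlow("identity copy", jobConf, true);

        // Run standalone with flow.complete(), or hand the Flow to a Cascade so
        // it is scheduled topologically with other Flows that share its paths.
        new CascadeConnector().connect(flow).complete();
    }
}

Other Flows connected into the same Cascade that read some/output/path would be scheduled to run after this job completes, per the dependency ordering described in the class Javadoc.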