cascading.flow.hadoop.planner.HadoopPlanner.java Source code

Introduction

Here is the source code for cascading.flow.hadoop.planner.HadoopPlanner.java, followed by a short illustrative usage sketch.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.planner;

import java.net.URI;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.flow.FlowStep;
import cascading.flow.hadoop.HadoopFlow;
import cascading.flow.hadoop.HadoopFlowStep;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.FlowPlanner;
import cascading.flow.planner.PlannerInfo;
import cascading.flow.planner.PlatformInfo;
import cascading.flow.planner.graph.ElementGraph;
import cascading.flow.planner.process.FlowNodeGraph;
import cascading.flow.planner.rule.RuleRegistry;
import cascading.flow.planner.rule.transformer.IntermediateTapElementFactory;
import cascading.property.AppProps;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.util.TempHfs;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class HadoopPlanner is the core Hadoop MapReduce planner used by default through a {@link cascading.flow.FlowConnector}
 * sub-class.
 * <p/>
 * Notes:
 * <p/>
 * <strong>Custom JobConf properties</strong><br/>
 * A custom JobConf instance can be passed to this planner by calling {@link #copyJobConf(java.util.Map, org.apache.hadoop.mapred.JobConf)}
 * on a map properties object before constructing a new {@link cascading.flow.FlowConnector} sub-class.
 * <p/>
 * A better practice would be to set Hadoop properties directly on the map properties object handed to the FlowConnector.
 * All values in the map will be passed to a new default JobConf instance to be used as defaults for all resulting
 * Flow instances.
 * <p/>
 * For example, {@code properties.setProperty("mapred.child.java.opts", "-Xmx512m");} would have Hadoop
 * spawn every child JVM with a 512MB heap.
 */
public class HadoopPlanner extends FlowPlanner<HadoopFlow, JobConf> {
    /** Field LOG */
    private static final Logger LOG = LoggerFactory.getLogger(HadoopPlanner.class);

    public static final String PLATFORM_NAME = "hadoop";

    /** Field defaultJobConf */
    private JobConf defaultJobConf;
    /** Field intermediateSchemeClass */
    private Class intermediateSchemeClass;

    /**
     * Method copyJobConf adds the given JobConf values to the given properties object. Use this method to pass
     * custom default Hadoop JobConf properties to Hadoop.
     *
     * @param properties of type Map
     * @param jobConf    of type JobConf
     */
    public static void copyJobConf(Map<Object, Object> properties, JobConf jobConf) {
        for (Map.Entry<String, String> entry : jobConf)
            properties.put(entry.getKey(), entry.getValue());
    }
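
    // Illustrative usage, not part of the original source: seed planner
    // properties from an existing JobConf before building a connector.
    //
    //   Properties properties = new Properties();
    //   HadoopPlanner.copyJobConf(properties, existingJobConf); // existingJobConf assumed
    //   FlowConnector connector = new HadoopFlowConnector(properties);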

    /**
     * Method createJobConf returns a new JobConf instance using the values in the given properties argument.
     *
     * @param properties of type Map
     * @return a JobConf instance
     */
    public static JobConf createJobConf(Map<Object, Object> properties) {
        JobConf conf = new JobConf();

        copyProperties(conf, properties);

        return conf;
    }

    /**
     * Method copyProperties adds the given Map values to the given JobConf object.
     *
     * @param jobConf    of type JobConf
     * @param properties of type Map
     */
    public static void copyProperties(JobConf jobConf, Map<Object, Object> properties) {
        if (properties instanceof Properties) {
            Properties props = (Properties) properties;
            Set<String> keys = props.stringPropertyNames();

            for (String key : keys)
                jobConf.set(key, props.getProperty(key));
        } else {
            for (Map.Entry<Object, Object> entry : properties.entrySet()) {
                if (entry.getValue() != null)
                    jobConf.set(entry.getKey().toString(), entry.getValue().toString());
            }
        }
    }
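
    // Illustrative round trip, not part of the original source: values placed on
    // the properties map become JobConf defaults, and vice versa.
    //
    //   Properties props = new Properties();
    //   props.setProperty("mapreduce.job.reduces", "4");
    //   JobConf conf = HadoopPlanner.createJobConf(props); // copies props into conf
    //   HadoopPlanner.copyJobConf(props, conf);            // copies conf back into props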

    @Override
    public PlannerInfo getPlannerInfo(String registryName) {
        return new PlannerInfo(getClass().getSimpleName(), PLATFORM_NAME, registryName);
    }

    @Override
    public JobConf getDefaultConfig() {
        return defaultJobConf;
    }

    @Override
    public PlatformInfo getPlatformInfo() {
        return HadoopUtil.getPlatformInfo();
    }

    @Override
    public void initialize(FlowConnector flowConnector, Map<Object, Object> properties) {
        super.initialize(flowConnector, properties);

        defaultJobConf = HadoopUtil.createJobConf(properties, createJobConf(properties));
        checkPlatform(defaultJobConf);
        intermediateSchemeClass = flowConnector.getIntermediateSchemeClass(properties);

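        // resolve the application jar in order of precedence: an explicitly
        // configured jar class, an explicit jar path, and finally the jar
        // enclosing this application's main class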
        Class type = AppProps.getApplicationJarClass(properties);
        if (defaultJobConf.getJar() == null && type != null)
            defaultJobConf.setJarByClass(type);

        String path = AppProps.getApplicationJarPath(properties);
        if (defaultJobConf.getJar() == null && path != null)
            defaultJobConf.setJar(path);

        if (defaultJobConf.getJar() == null)
            defaultJobConf.setJarByClass(HadoopUtil.findMainClass(HadoopPlanner.class));

        AppProps.setApplicationJarPath(properties, defaultJobConf.getJar());

        LOG.info("using application jar: {}", defaultJobConf.getJar());
    }

    @Override
    public void configRuleRegistryDefaults(RuleRegistry ruleRegistry) {
        super.configRuleRegistryDefaults(ruleRegistry);

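        // register the factory that materializes intermediate temporary taps
        // (see makeTempTap below) wherever the planner rules require one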
        ruleRegistry.addDefaultElementFactory(IntermediateTapElementFactory.TEMP_TAP, new TempTapElementFactory());
    }

    protected void checkPlatform(Configuration conf) {
        if (HadoopUtil.isYARN(conf))
            LOG.warn("running YARN based flows on Hadoop 1.x may cause problems, please use the 'cascading-hadoop2-mr1' dependencies");
    }

    @Override
    protected HadoopFlow createFlow(FlowDef flowDef) {
        return new HadoopFlow(getPlatformInfo(), getDefaultProperties(), getDefaultConfig(), flowDef);
    }

    @Override
    public FlowStep<JobConf> createFlowStep(ElementGraph stepElementGraph, FlowNodeGraph flowNodeGraph) {
        return new HadoopFlowStep(stepElementGraph, flowNodeGraph);
    }

    public URI getDefaultURIScheme(Tap tap) {
        return ((Hfs) tap).getDefaultFileSystemURIScheme(defaultJobConf);
    }

    public URI getURIScheme(Tap tap) {
        return ((Hfs) tap).getURIScheme(defaultJobConf);
    }

    @Override
    protected Tap makeTempTap(String prefix, String name) {
        // must give Taps unique names
        return new TempHfs(defaultJobConf, Util.makePath(prefix, name), intermediateSchemeClass, prefix == null);
    }
}
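
Example usage

The snippet below is an illustrative sketch, not part of the listing above. It follows the practice recommended in the class javadoc: Hadoop properties are set directly on the properties object handed to the FlowConnector, and the application jar is registered through AppProps so initialize() can resolve it. The class name WordCopy, the pass-through assembly, and the input/output paths taken from args are assumptions made for this example.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;

public class WordCopy {
    public static void main(String[] args) {
        Properties properties = new Properties();

        // becomes a default on every JobConf the planner creates
        properties.setProperty("mapred.child.java.opts", "-Xmx512m");

        // lets HadoopPlanner.initialize() resolve and ship the application jar
        AppProps.setApplicationJarClass(properties, WordCopy.class);

        Tap source = new Hfs(new TextLine(), args[0]);
        Tap sink = new Hfs(new TextLine(), args[1], SinkMode.REPLACE);

        // a pass-through assembly; the planner turns it into MapReduce steps
        Pipe pipe = new Pipe("copy");

        Flow flow = new HadoopFlowConnector(properties).connect(source, sink, pipe);
        flow.complete();
    }
}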