org.opencloudengine.flamingo.mapreduce.core.AbstractJob.java Source code

Introduction

Here is the source code for org.opencloudengine.flamingo.mapreduce.core.AbstractJob.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.opencloudengine.flamingo.mapreduce.core;

import com.google.common.base.Preconditions;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.Tool;
import org.opencloudengine.flamingo.mapreduce.util.CommandLineUtil;
import org.opencloudengine.flamingo.mapreduce.util.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Base Hadoop Job Driver class for Flamingo MapReduce.
 * A MapReduce Driver that extends this class inherits common handling of the
 * command line arguments that Map and Reduce tasks need, wired into the Hadoop
 * Configuration, so that individual MapReduce Drivers do not have to repeat
 * the same argument-handling code.
 * <p/>
 * The following options are available to every MapReduce Driver based on this class:
 * <ul>
 * <li>--tempDir (path): directory for the Job's temporary output (default: "<tt>/temp/${user.home}</tt>")</li>
 * <li>--help: prints the usage message</li>
 * </ul>
 * <p/>
 * In addition, the following properties can be passed to any MapReduce Job:
 * <p/>
 * <ul>
 * <li>-Dmapred.job.name=(name): name of the Hadoop MapReduce Job; defaults to the Driver class name.</li>
 * <li>-Dmapred.output.compress={true,false}: whether to compress the Job output (default: true)</li>
 * <li>-Dmapred.input.dir=(path): input path of the Job (required)</li>
 * <li>-Dmapred.output.dir=(path): output path of the Job (required)</li>
 * </ul>
 * <tt>-D</tt> properties are used as fallbacks when the corresponding command line option is not supplied.
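 * <p/>
 * A minimal driver sketch; the subclass and the Mapper/Reducer/format classes
 * named here are illustrative, not part of this API:
 * <pre>
 * public class WordCountDriver extends AbstractJob {
 *     &#64;Override
 *     public int run(String[] args) throws Exception {
 *         addInputOption();
 *         addOutputOption();
 *         if (parseArguments(args) == null) {
 *             return -1; // help was printed or parsing failed
 *         }
 *         Job job = prepareJob(getInputPath(), getOutputPath(),
 *                 TextInputFormat.class, TokenizerMapper.class, Text.class,
 *                 IntWritable.class, CountReducer.class, Text.class,
 *                 IntWritable.class, TextOutputFormat.class);
 *         return job.waitForCompletion(true) ? 0 : 1;
 *     }
 *
 *     public static void main(String[] args) throws Exception {
 *         System.exit(ToolRunner.run(new WordCountDriver(), args));
 *     }
 * }
 * </pre>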
 */
public abstract class AbstractJob extends Configured implements Tool {

    /**
     * SLF4J API
     */
    private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);

    /**
     * Command line option for the MapReduce input path.
     */
    private Option inputOption;

    /**
     * Command line option for the MapReduce output path.
     */
    private Option outputOption;

    /**
     * MapReduce input path set by {@link #parseArguments(String[])}.
     */
    private Path inputPath;

    /**
     * MapReduce output path set by {@link #parseArguments(String[])}.
     */
    private Path outputPath;

    /**
     * Temporary path set by {@link #parseArguments(String[])}.
     * Defaults to the <tt>tempDir</tt> value of the <tt>flamingo-mapreduce-site.xml</tt>
     * file on the CLASSPATH, which is <tt>/temp/${user.home}</tt>.
     */
    private Path tempPath;

    /**
     * Key-value map of the command line arguments passed to the MapReduce Job.
     */
    private Map<String, String> argMap;

    /**
     * List of command line options registered by this driver.
     */
    private final List<Option> options;

    /**
     * Default constructor.
     */
    protected AbstractJob() {
        options = new LinkedList<Option>();
        if (getConf() == null) {
            setConf(new Configuration());
            // Load the default Hadoop Configuration resource for Flamingo MapReduce.
            getConf().addResource(getClass().getResource("/flamingo-mapreduce-site.xml"));
        }
    }

    /**
     * Returns the input path set by {@link #parseArguments(String[])}.
     * It is set when the Hadoop MapReduce Driver has registered the input option via
     * {@link #addInputOption()} and a value was supplied either on the command line
     * or through the {@code mapred.input.dir} property of the Hadoop Configuration.
     *
     * @return the input path
     */
    protected Path getInputPath() {
        return inputPath;
    }

    /**
     * Returns the output path set by {@link #parseArguments(String[])}.
     * It is set when the Hadoop MapReduce Driver has registered the output option via
     * {@link #addOutputOption()} and a value was supplied either on the command line
     * or through the {@code mapred.output.dir} property of the Hadoop Configuration.
     *
     * @return the output path
     */
    protected Path getOutputPath() {
        return outputPath;
    }

    /**
     * Returns the path obtained by appending the given child path to the output path.
     *
     * @param child child path
     * @return the resulting Path
     */
    protected Path getOutputPath(String child) {
        return new Path(outputPath, child);
    }

    /**
     * Returns the path obtained by appending the given child path to the temporary path.
     *
     * @param path child path
     * @return the resulting path
     */
    protected Path getTempPath(String path) {
        return new Path(getTempPath(), path);
    }

    /**
     * Returns the temporary path.
     * The default comes from the {@link Constants#TEMP_DIR} entry of the
     * <tt>flamingo-mapreduce-site.xml</tt> file on the CLASSPATH and can be
     * overridden per job with the <tt>--tempDir</tt> command line option.
     * A Hadoop MapReduce Driver can obtain the temporary path as follows:
     * <p/>
     * <pre>
     *     Path tempDir = getTempPath();
     * </pre>
     *
     * @return the temporary path
     */
    protected Path getTempPath() {
        String defaultTempDir = getConf().get("tempDir");
        if (argMap.containsKey(keyFor(Constants.TEMP_DIR))) {
            defaultTempDir = argMap.get(keyFor(Constants.TEMP_DIR));
        }
        return new Path(defaultTempDir);
    }

    /**
     * Returns a timestamped temporary path. The timestamp format is taken from
     * the <tt>tempDir.date.pattern</tt> property of the <tt>flamingo-mapreduce-site.xml</tt>
     * file and defaults to <tt>yyyyMMdd-HHmmss-SSS</tt>.
     *
     * @return the temporary path
     */
    protected Path getTimestampTempPath() {
        SimpleDateFormat formatter = new SimpleDateFormat(getConf().get("tempDir.date.pattern"));
        return getTempPath(formatter.format(new Date()));
    }

    /**
     * Returns a temporary path under the given prefix directory.
     *
     * @param prefix prefix directory
     * @param path   child path
     * @return the temporary path
     */
    protected Path getTempPath(String prefix, String path) {
        Path tempPath = getTempPath();
        Path prefixPath = new Path(tempPath, prefix);
        return new Path(prefixPath, path);
    }

    /**
     * Adds a flag option that takes no argument; its presence on the command line can simply be checked for.
     *
     * @param name        option name (e.g. <tt>inputPath</tt>)
     * @param shortName   short option name (e.g. <tt>i</tt>)
     * @param description description shown in the usage message
     */
    protected void addFlag(String name, String shortName, String description) {
        options.add(buildOption(name, shortName, description, false, false, null));
    }

    /**
     * Adds an option that takes a single argument. The option is optional and has no default value.
     *
     * @param name        option name (e.g. <tt>inputPath</tt>)
     * @param shortName   short option name (e.g. <tt>i</tt>)
     * @param description description shown in the usage message
     */
    protected void addOption(String name, String shortName, String description) {
        options.add(buildOption(name, shortName, description, true, false, null));
    }

    /**
     * Adds an option to the set of options that {@link #parseArguments(String[])}
     * parses from the command line arguments of the Hadoop MapReduce Job.
     *
     * @param name        option name (e.g. <tt>inputPath</tt>)
     * @param shortName   short option name (e.g. <tt>i</tt>)
     * @param description description shown in the usage message
     * @param required    if <tt>true</tt>, parsing fails and the usage message is
     *                    printed when the option is missing. The option has no default value.
     */
    protected void addOption(String name, String shortName, String description, boolean required) {
        options.add(buildOption(name, shortName, description, true, required, null));
    }

    /**
     * Adds an option to the set of options that {@link #parseArguments(String[])}
     * parses from the command line arguments of the Hadoop MapReduce Job.
     *
     * @param name         option name (e.g. <tt>inputPath</tt>)
     * @param shortName    short option name (e.g. <tt>i</tt>)
     * @param description  description shown in the usage message
     * @param defaultValue value used when the option is not specified on the command line; may be null.
     */
    protected void addOption(String name, String shortName, String description, String defaultValue) {
        options.add(buildOption(name, shortName, description, true, false, defaultValue));
    }

    /**
     * Adds an arbitrary option to the set of options that {@link #parseArguments(String[])}
     * parses from the command line arguments of the Hadoop MapReduce Job.
     * If the option takes no argument, its presence can be checked with
     * {@code containsKey} on the map returned by {@code parseArguments}.
     * Parsed values are stored in that map keyed by the option's preferred
     * (long) name prefixed with '--'.
     *
     * @param option the option to add
     * @return the option that was added
     */
    protected Option addOption(Option option) {
        options.add(option);
        return option;
    }

    /**
     * Registers the standard input path option. On the command line it is
     * specified with <tt>'-i'</tt>, and {@link #parseArguments(String[])} sets
     * the input path from its value. The option is not marked <tt>required</tt>,
     * because the path may also be supplied through the Hadoop Job configuration.
     */
    protected void addInputOption() {
        this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
    }

    /**
     * Registers the standard output path option. On the command line it is
     * specified with <tt>'-o'</tt>, and {@link #parseArguments(String[])} sets
     * the output path from its value. The option is not marked <tt>required</tt>,
     * because the path may also be supplied through the Hadoop Job configuration.
     */
    protected void addOutputOption() {
        this.outputOption = addOption(DefaultOptionCreator.outputOption().create());
    }

    /**
     * Builds a command line option from the given parameters.
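     * <p/>
     * For example, a hypothetical optional <tt>--limit</tt> option with a default value:
     * <pre>
     * Option limit = buildOption("limit", "l", "maximum number of records", true, false, "100");
     * </pre>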
     *
     * @param name         long option name, given on the command line prefixed with '--'
     * @param shortName    short option name, given on the command line prefixed with '-'; may be null
     * @param description  description shown in the usage message
     * @param hasArg       <tt>true</tt> if the option takes an argument
     * @param required     <tt>true</tt> if the option must be present on the command line
     * @param defaultValue default value of the option's argument; may be <tt>null</tt>
     * @return the option
     */
    protected static Option buildOption(String name, String shortName, String description, boolean hasArg,
            boolean required, String defaultValue) {

        DefaultOptionBuilder optBuilder = new DefaultOptionBuilder().withLongName(name).withDescription(description)
                .withRequired(required);

        if (shortName != null) {
            optBuilder.withShortName(shortName);
        }

        if (hasArg) {
            ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(1).withMaximum(1);

            if (defaultValue != null) {
                argBuilder = argBuilder.withDefault(defaultValue);
            }

            optBuilder.withArgument(argBuilder.create());
        }

        return optBuilder.create();
    }

    /**
     * Parses the given arguments against the options registered for this job.
     * If <tt>-h</tt> is specified, or an error occurs during parsing, the usage
     * message is printed and <tt>null</tt> is returned.
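     * <p/>
     * Typical use in a driver's {@code run} method (the <tt>--verbose</tt> flag is illustrative):
     * <pre>
     * Map&lt;String, String&gt; parsed = parseArguments(args);
     * if (parsed == null) {
     *     return -1; // help was printed or parsing failed
     * }
     * boolean verbose = parsed.containsKey("--verbose");
     * </pre>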
     *
     * @param args command line arguments
     * @return a {@code Map<String, String>} of option names to argument values.
     *         Keys are the option names prefixed with '--'; the presence of a
     *         flag can be tested with {@code containsKey} on the returned map.
     */
    public Map<String, String> parseArguments(String[] args) throws Exception {
        Option helpOpt = addOption(DefaultOptionCreator.helpOption());
        addOption("tempDir", null, " ", false);
        addOption("startPhase", null, "  ", "0");
        addOption("endPhase", null, "  ", String.valueOf(Integer.MAX_VALUE));

        GroupBuilder groupBuilder = new GroupBuilder().withName("Hadoop MapReduce Job Options:");

        for (Option opt : options) {
            groupBuilder = groupBuilder.withOption(opt);
        }

        Group group = groupBuilder.create();

        CommandLine cmdLine;
        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            parser.setHelpOption(helpOpt);
            cmdLine = parser.parse(args);
        } catch (OptionException e) {
            log.error(e.getMessage());
            CommandLineUtil.printHelpWithGenericOptions(group, e);
            return null;
        }

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelpWithGenericOptions(group);
            return null;
        }

        try {
            parseDirectories(cmdLine);
        } catch (IllegalArgumentException e) {
            log.error(e.getMessage());
            CommandLineUtil.printHelpWithGenericOptions(group);
            return null;
        }

        argMap = new TreeMap<String, String>();
        maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));
        log.info("Command line arguments: ", argMap);
        Set<String> keySet = argMap.keySet();
        for (Iterator<String> iterator = keySet.iterator(); iterator.hasNext();) {
            String key = iterator.next();
            log.info("   {} = {}", key, argMap.get(key));
        }
        return argMap;
    }

    /**
     * Returns the argument-map key for the given option name; the key for option <tt>name</tt> is <tt>--name</tt>.
     *
     * @param optionName the option name
     * @return the key for the option
     */
    public static String keyFor(String optionName) {
        return "--" + optionName;
    }

    /**
     * Returns the value of the option with the given name.
     *
     * @param optionName the option name
     * @return the value of the option, or <tt>null</tt> if the option was not specified.
     */
    public String getOption(String optionName) {
        return argMap.get(keyFor(optionName));
    }

    /**
     * Checks whether the option with the given name was specified.
     *
     * @param optionName the option name
     * @return <tt>true</tt> if the option was specified
     */
    public boolean hasOption(String optionName) {
        return argMap.containsKey(keyFor(optionName));
    }

    /**
     * Resolves the input and output directories from the command line options
     * and the Hadoop Configuration. If {@code addInputOption} or
     * {@code addOutputOption} has been called, this method throws an exception
     * when no source for the corresponding value is present on either the
     * command line or the Hadoop Configuration; otherwise the value is ignored.
     * If a path is given both as a command line option and as a Hadoop
     * Configuration property, the command line option takes precedence. After
     * this method returns, {@code inputPath} and {@code outputPath} are
     * <tt>non-null</tt> whenever the corresponding option was registered.
     *
     * @param cmdLine the parsed command line
     * @throws IllegalArgumentException if the input option was registered but neither
     *                                  {@code --input} nor {@code -Dmapred.input.dir} was specified,
     *                                  or the output option was registered but neither
     *                                  {@code --output} nor {@code -Dmapred.output.dir} was specified
     */
    protected void parseDirectories(CommandLine cmdLine) {
        Configuration conf = getConf();

        if (inputOption != null && cmdLine.hasOption(inputOption)) {
            this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
        }
        if (inputPath == null && conf.get("mapred.input.dir") != null) {
            this.inputPath = new Path(conf.get("mapred.input.dir"));
        }

        if (outputOption != null && cmdLine.hasOption(outputOption)) {
            this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
        }
        if (outputPath == null && conf.get("mapred.output.dir") != null) {
            this.outputPath = new Path(conf.get("mapred.output.dir"));
        }

        // Resolve the temporary path; the default comes from the flamingo-mapreduce-site.xml file on the CLASSPATH.
        if (tempPath == null && conf.get("tempDir") != null) {
            this.tempPath = new Path(conf.get("tempDir"));
        }

        Preconditions.checkArgument(inputOption == null || inputPath != null,
                "No input path specified: set either the input path option or the -Dmapred.input.dir property.");
        Preconditions.checkArgument(outputOption == null || outputPath != null,
                "No output path specified: set either the output path option or the -Dmapred.output.dir property.");
    }

    protected static void maybePut(Map<String, String> args, CommandLine cmdLine, Option... opt) {
        for (Option o : opt) {

            // include the option if it was specified on the command line or has a default value
            if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null) {

                // nulls are ok, for cases where options are simple flags.
                Object vo = cmdLine.getValue(o);
                String value = vo == null ? null : vo.toString();
                args.put(o.getPreferredName(), value);
            }
        }
    }

    /**
     * Decides, based on the parsed <tt>--startPhase</tt>/<tt>--endPhase</tt>
     * arguments, whether the current phase of a multi-phase job should run.
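     * <p/>
     * A sketch of a multi-phase driver; {@code parsedArgs} is the map returned
     * by {@code parseArguments} and the phase bodies are illustrative:
     * <pre>
     * AtomicInteger currentPhase = new AtomicInteger();
     * if (shouldRunNextPhase(parsedArgs, currentPhase)) {
     *     // phase 0: run the first Job
     * }
     * if (shouldRunNextPhase(parsedArgs, currentPhase)) {
     *     // phase 1: run the second Job
     * }
     * </pre>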
     *
     * @param args         parsed key-value argument map
     * @param currentPhase counter holding the current phase; incremented as a side effect
     * @return <tt>true</tt> if the current phase should run, <tt>false</tt> if it is skipped
     */
    protected static boolean shouldRunNextPhase(Map<String, String> args, AtomicInteger currentPhase) {
        int phase = currentPhase.getAndIncrement();
        String startPhase = args.get("--startPhase");
        String endPhase = args.get("--endPhase");
        boolean phaseSkipped = (startPhase != null && phase < Integer.parseInt(startPhase))
                || (endPhase != null && phase > Integer.parseInt(endPhase));
        if (phaseSkipped) {
            log.info("Skipping phase {}", phase);
        }
        return !phaseSkipped;
    }

    /**
     * Prepares a Hadoop Job with the given Mapper and Reducer.
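     * <p/>
     * For example (the Mapper/Reducer classes named here are illustrative):
     * <pre>
     * Job job = prepareJob(getInputPath(), getOutputPath(),
     *         SequenceFileInputFormat.class, MyMapper.class, Text.class,
     *         IntWritable.class, MyReducer.class, Text.class, IntWritable.class,
     *         SequenceFileOutputFormat.class);
     * boolean succeeded = job.waitForCompletion(true);
     * </pre>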
     *
     * @param inputPath    input path
     * @param outputPath   output path
     * @param inputFormat  input format class
     * @param mapper       Mapper class
     * @param mapperKey    Mapper output key class
     * @param mapperValue  Mapper output value class
     * @param reducer      Reducer class
     * @param reducerKey   Reducer output key class
     * @param reducerValue Reducer output value class
     * @param outputFormat output format class
     * @return the Hadoop Job
     * @throws IOException if the Job cannot be created
     */
    protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
            Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
            Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
            Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
            Class<? extends OutputFormat> outputFormat) throws IOException {

        Job job = new Job(new Configuration(getConf()));
        Configuration jobConf = job.getConfiguration();

        if (reducer.equals(Reducer.class)) {
            if (mapper.equals(Mapper.class)) {
                throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
            }
            job.setJarByClass(mapper);
        } else {
            job.setJarByClass(reducer);
        }

        job.setInputFormatClass(inputFormat);
        jobConf.set("mapred.input.dir", inputPath.toString());

        job.setMapperClass(mapper);
        job.setMapOutputKeyClass(mapperKey);
        job.setMapOutputValueClass(mapperValue);

        jobConf.setBoolean("mapred.compress.map.output", true);

        job.setReducerClass(reducer);
        job.setOutputKeyClass(reducerKey);
        job.setOutputValueClass(reducerValue);

        job.setJobName(getCustomJobName(job, mapper, reducer));

        job.setOutputFormatClass(outputFormat);
        jobConf.set("mapred.output.dir", outputPath.toString());

        return job;
    }

    /**
     * Prepares a Hadoop Job with the given Mapper and Reducer and an explicit number of reduce tasks.
     *
     * @param inputPath    input path
     * @param outputPath   output path
     * @param inputFormat  input format class
     * @param mapper       Mapper class
     * @param mapperKey    Mapper output key class
     * @param mapperValue  Mapper output value class
     * @param reducer      Reducer class
     * @param reducerKey   Reducer output key class
     * @param reducerValue Reducer output value class
     * @param outputFormat output format class
     * @param reduceTask   number of reduce tasks
     * @return the Hadoop Job
     * @throws IOException if the Job cannot be created
     */
    protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
            Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
            Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
            Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
            Class<? extends OutputFormat> outputFormat, int reduceTask) throws IOException {

        Job job = new Job(new Configuration(getConf()));
        Configuration jobConf = job.getConfiguration();

        if (reducer.equals(Reducer.class)) {
            if (mapper.equals(Mapper.class)) {
                throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
            }
            job.setJarByClass(mapper);
        } else {
            job.setJarByClass(reducer);
        }

        job.setInputFormatClass(inputFormat);
        jobConf.set("mapred.input.dir", inputPath.toString());

        job.setMapperClass(mapper);
        job.setMapOutputKeyClass(mapperKey);
        job.setMapOutputValueClass(mapperValue);

        jobConf.setBoolean("mapred.compress.map.output", true);

        job.setReducerClass(reducer);
        job.setOutputKeyClass(reducerKey);
        job.setOutputValueClass(reducerValue);

        job.setJobName(getCustomJobName(job, mapper, reducer));

        job.setNumReduceTasks(reduceTask);

        job.setOutputFormatClass(outputFormat);
        jobConf.set("mapred.output.dir", outputPath.toString());

        return job;
    }

    /**
     * Prepares a map-only Hadoop Job with the given Mapper.
     * Use this to configure a MapReduce Job that runs without a Reducer; the number of reduce tasks is set to zero.
     *
     * @param inputPath    input path
     * @param outputPath   output path
     * @param inputFormat  input format class
     * @param mapper       Mapper class
     * @param mapperKey    Mapper output key class
     * @param mapperValue  Mapper output value class
     * @param outputFormat output format class
     * @return the Hadoop Job
     * @throws IOException if the Job cannot be created
     */
    protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
            Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
            Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat) throws IOException {

        Job job = new Job(new Configuration(getConf()));
        Configuration jobConf = job.getConfiguration();

        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper");
        }
        job.setJarByClass(mapper);

        job.setInputFormatClass(inputFormat);
        jobConf.set("mapred.input.dir", inputPath.toString());

        job.setMapperClass(mapper);
        job.setMapOutputKeyClass(mapperKey);
        job.setMapOutputValueClass(mapperValue);

        jobConf.setBoolean("mapred.compress.map.output", true);

        job.setNumReduceTasks(0);

        job.setJobName(getCustomJobName(job, mapper));

        job.setOutputFormatClass(outputFormat);
        jobConf.set("mapred.output.dir", outputPath.toString());

        return job;
    }

    /**
     * Builds the name of a Hadoop Job configured with a Mapper and a Reducer.
     *
     * @param job     Hadoop Job context
     * @param mapper  {@link org.apache.hadoop.mapreduce.Mapper} class
     * @param reducer Reducer class
     * @return the Job name
     */
    private String getCustomJobName(JobContext job, Class<? extends Mapper> mapper,
            Class<? extends Reducer> reducer) {
        StringBuilder name = new StringBuilder(100);
        String customJobName = job.getJobName();
        if (customJobName == null || customJobName.trim().length() == 0) {
            name.append(getClass().getSimpleName());
        } else {
            name.append(customJobName);
        }
        return name.toString();
    }

    /**
     * Builds the name of a map-only Hadoop Job.
     *
     * @param job    Hadoop Job context
     * @param mapper {@link org.apache.hadoop.mapreduce.Mapper} class
     * @return the Job name
     */
    private String getCustomJobName(JobContext job, Class<? extends Mapper> mapper) {
        StringBuilder name = new StringBuilder(100);
        String customJobName = job.getJobName();
        if (customJobName == null || customJobName.trim().length() == 0) {
            name.append(getClass().getSimpleName());
        } else {
            name.append(customJobName);
        }
        return name.toString();
    }

    /**
     * Builds a command line key-value pair string from the given key and value.
     * This is useful when, for example, a Workflow Engine has to turn a map of
     * key-value arguments into command line arguments for a MapReduce Job.
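     * <p/>
     * For example:
     * <pre>
     * getKeyValuePair("inputPath", "/data/input"); // returns "--inputPath /data/input"
     * </pre>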
     *
     * @param key   key of the argument passed to the MapReduce Job
     * @param value value of that key
     * @return the command line key-value pair string
     */
    public static String getKeyValuePair(String key, String value) {
        return keyFor(key) + " " + value;
    }

    /**
     * Dumps the Hadoop Configuration to standard output.
     *
     * @throws Exception if the Hadoop Configuration cannot be dumped
     */
    public void dump() throws Exception {
        StringWriter writer = new StringWriter();
        Configuration.dumpConfiguration(this.getConf(), writer);
        System.out.println(writer.toString());
    }
}