org.springframework.data.hadoop.mapreduce.JobFactoryBean.java Source code

Java tutorial

Introduction

Here is the source code for org.springframework.data.hadoop.mapreduce.JobFactoryBean.java

Source

/*
 * Copyright 2011 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.springframework.data.hadoop.mapreduce;

import java.io.IOException;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.security.PrivilegedExceptionAction;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.springframework.beans.factory.BeanClassLoaderAware;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.FactoryBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.ResourcePatternResolver;
import org.springframework.data.hadoop.configuration.ConfigurationUtils;
import org.springframework.util.Assert;
import org.springframework.util.ClassUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;

/**
 * Factory bean for creating a Hadoop Map-Reduce job.
 * Note that the setters for the class access class names (String) instead of direct
 * classes. This is done on purpose in case the job itself and its dependencies are
 * on a jar not available on the classpath ({@link #setJar(Resource)}) in which case
 * a special, on-the-fly classloader is used.
 * <p>
 * The {@link Job} is built once, in {@link #afterPropertiesSet()}, and then handed out
 * unchanged by {@link #getObject()}. The job name is taken from {@link #setName(String)}
 * when given, otherwise from the bean name.
 * 
 * @author Costin Leau
 */
// TODO: extract input/output format configs
public class JobFactoryBean extends JobGenericOptions
        implements InitializingBean, FactoryBean<Job>, BeanNameAware, BeanClassLoaderAware {

    private Job job;
    private Configuration configuration;
    private Properties properties;

    // explicitly configured job name (takes precedence over beanName)
    private String name;
    // name of this bean in the container; used as the job name only when no explicit name is set
    private String beanName;

    private String key;
    private String value;

    private String mapKey;
    private String mapValue;

    private String mapper;
    private String reducer;
    private String combiner;
    private String inputFormat;
    private String outputFormat;
    private String partitioner;
    private String sortComparator;
    private String groupingComparator;

    private String workingDir;
    private Integer numReduceTasks;

    private Class<?> jarClass;
    private Resource jar;

    private List<String> inputPaths;
    private String outputPath;
    private Boolean compressOutput;
    private String codecClass;
    private ClassLoader beanClassLoader;

    @Override
    public void setBeanClassLoader(ClassLoader classLoader) {
        this.beanClassLoader = classLoader;
    }

    /**
     * Records the bean name, which serves as the default job name.
     * <p>
     * The container invokes this callback after the regular properties have been
     * populated, so the value is kept separately to avoid overwriting a name
     * configured through {@link #setName(String)}.
     * 
     * @param name the name of this bean in the container.
     */
    public void setBeanName(String name) {
        this.beanName = name;
    }

    /**
     * Returns the job built by {@link #afterPropertiesSet()}.
     * 
     * @return the configured job, or {@code null} if the bean has not been initialized yet.
     */
    public Job getObject() throws Exception {
        return job;
    }

    /**
     * Returns the type of object produced by this factory.
     * 
     * @return the runtime class of the job if already created, {@link Job} otherwise.
     */
    public Class<?> getObjectType() {
        return (job != null ? job.getClass() : Job.class);
    }

    public boolean isSingleton() {
        return true;
    }

    /**
     * Creates the {@link Job} and applies every property that has been set on this factory.
     * <p>
     * Properties left unset ({@code null}, or blank for the text-checked ones) are skipped
     * so that the defaults of the underlying configuration remain in effect.
     * 
     * @throws Exception if the job cannot be created or configured.
     */
    @SuppressWarnings("rawtypes")
    public void afterPropertiesSet() throws Exception {
        final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);

        // 'buildGenericOptions' and 'user' are not declared in this class
        // (presumably inherited from JobGenericOptions)
        buildGenericOptions(cfg);

        if (StringUtils.hasText(user)) {
            // create the job on behalf of the configured user
            UserGroupInformation ugi = UserGroupInformation.createProxyUser(user,
                    UserGroupInformation.getLoginUser());
            ugi.doAs(new PrivilegedExceptionAction<Void>() {

                @Override
                public Void run() throws Exception {
                    job = new Job(cfg);
                    return null;
                }
            });
        } else {
            job = new Job(cfg);
        }

        ClassLoader loader = (beanClassLoader != null ? beanClassLoader : ClassUtils.getDefaultClassLoader());

        if (jar != null) {
            JobConf conf = (JobConf) job.getConfiguration();
            conf.setJar(jar.getURI().toString());
            // NOTE(review): the raw beanClassLoader (possibly null) is handed over here rather
            // than the resolved fallback above - confirm ExecutionUtils copes with a null parent
            loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
            conf.setClassLoader(loader);
        }

        // set first to enable auto-detection of K/V to skip the key/value types to be specified
        if (mapper != null) {
            Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class);
            job.setMapperClass(mapperClass);
            configureMapperTypesIfPossible(job, mapperClass);
        }

        if (reducer != null) {
            Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class);
            job.setReducerClass(reducerClass);
            configureReducerTypesIfPossible(job, reducerClass);
        }

        // an explicit name wins; the bean name is only the fallback
        String jobName = (StringUtils.hasText(name) ? name : beanName);
        if (StringUtils.hasText(jobName)) {
            job.setJobName(jobName);
        }
        if (combiner != null) {
            job.setCombinerClass(resolveClass(combiner, loader, Reducer.class));
        }
        if (groupingComparator != null) {
            job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class));
        }
        if (inputFormat != null) {
            job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class));
        }
        // explicit map output types are applied after (and thus override) the auto-detected ones
        if (mapKey != null) {
            job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class));
        }
        if (mapValue != null) {
            job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class));
        }
        if (numReduceTasks != null) {
            job.setNumReduceTasks(numReduceTasks);
        }
        if (key != null) {
            job.setOutputKeyClass(resolveClass(key, loader, Object.class));
        }
        if (value != null) {
            job.setOutputValueClass(resolveClass(value, loader, Object.class));
        }
        if (outputFormat != null) {
            job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class));
        }
        if (partitioner != null) {
            job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class));
        }
        if (sortComparator != null) {
            job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class));
        }
        if (StringUtils.hasText(workingDir)) {
            job.setWorkingDirectory(new Path(workingDir));
        }
        if (jarClass != null) {
            job.setJarByClass(jarClass);
        }

        if (!CollectionUtils.isEmpty(inputPaths)) {
            for (String path : inputPaths) {
                FileInputFormat.addInputPath(job, new Path(path));
            }
        }

        if (StringUtils.hasText(outputPath)) {
            FileOutputFormat.setOutputPath(job, new Path(outputPath));
        }

        if (compressOutput != null) {
            FileOutputFormat.setCompressOutput(job, compressOutput);
        }

        if (codecClass != null) {
            FileOutputFormat.setOutputCompressorClass(job,
                    resolveClass(codecClass, loader, CompressionCodec.class));
        }

        // extension hook, invoked once the job is fully configured
        processJob(job);
    }

    /**
     * Loads the named class through the given class loader.
     * <p>
     * The {@code type} argument only drives generic inference at the call site; the
     * cast is unchecked and the loaded class is not verified against it.
     * 
     * @param className name of the class to load.
     * @param cl class loader to use.
     * @param type expected super type of the loaded class.
     * @return the loaded class.
     */
    @SuppressWarnings("unchecked")
    private <T> Class<? extends T> resolveClass(String className, ClassLoader cl, Class<T> type) {
        return (Class<? extends T>) ClassUtils.resolveClassName(className, cl);
    }

    /**
     * Tries to derive the map output key/value classes from the generic signature
     * with which the given mapper extends {@link Mapper}.
     * <p>
     * Only type arguments that are concrete, non-interface classes are applied;
     * anything else (type variables, nested generics, interfaces) is silently skipped.
     * 
     * @param j job to configure.
     * @param mapper mapper class to inspect.
     */
    private void configureMapperTypesIfPossible(Job j, Class<? extends Mapper> mapper) {
        Class<?> targetClass = mapper;
        Type targetType;

        // climb the hierarchy until the direct subclass of Mapper is reached; at that point
        // targetType holds the (possibly parameterized) Mapper<KI, VI, KO, VO> declaration
        do {
            targetType = targetClass.getGenericSuperclass();
            targetClass = targetClass.getSuperclass();
        } while (targetClass != null && targetClass != Object.class && !Mapper.class.equals(targetClass));

        if (targetType instanceof ParameterizedType) {
            Type[] params = ((ParameterizedType) targetType).getActualTypeArguments();
            if (params.length == 4) {
                // third type argument - map output key
                if (params[2] instanceof Class) {
                    Class<?> clz = (Class<?>) params[2];
                    if (!clz.isInterface()) {
                        j.setMapOutputKeyClass(clz);
                    }
                }

                // fourth type argument - map output value
                if (params[3] instanceof Class) {
                    Class<?> clz = (Class<?>) params[3];
                    if (!clz.isInterface()) {
                        j.setMapOutputValueClass(clz);
                    }
                }
            }
        }
    }

    /**
     * Placeholder for reducer type auto-detection; currently does nothing.
     * 
     * @param j job to configure.
     * @param reducer reducer class to inspect.
     */
    private void configureReducerTypesIfPossible(Job j, Class<? extends Reducer> reducer) {
        // don't do anything yet
    }

    /**
     * Asserts that the given path does (or does not) exist.
     * Not invoked anywhere in this class at the moment.
     * 
     * @param path location to check.
     * @param resourceLoader resolver used to look up the path.
     * @param shouldExist {@code true} to require the path to exist (input),
     * {@code false} to require it to be absent (output).
     * @throws IOException declared for callers; not thrown by the current body.
     */
    private void validatePaths(String path, ResourcePatternResolver resourceLoader, boolean shouldExist)
            throws IOException {
        Resource res = resourceLoader.getResource(path);

        if (shouldExist) {
            Assert.isTrue(res.exists(), "The input path [" + path + "] does not exist");
        } else {
            Assert.isTrue(!res.exists(), "The output path [" + path + "] already exists");
        }
    }

    /**
     * Callback for subclasses to further customize the job; called at the end of
     * {@link #afterPropertiesSet()}. The default implementation does nothing.
     * 
     * @param job the fully configured job.
     * @throws Exception if the customization fails.
     */
    protected void processJob(Job job) throws Exception {
        // no-op
    }

    /**
     * Sets the Hadoop configuration to use.
     * 
     * @param configuration The configuration to set.
     */
    public void setConfiguration(Configuration configuration) {
        this.configuration = configuration;
    }

    /**
     * Sets the job name. When not set (or blank), the bean name is used instead.
     * 
     * @param name The name to set.
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * Sets the job key class.
     * 
     * @param key The keyClass to set.
     */
    public void setKey(String key) {
        this.key = key;
    }

    /**
     * Sets the job value class.
     * 
     * @param value The valueClass to set.
     */
    public void setValue(String value) {
        this.value = value;
    }

    /**
     * Sets the job map key class. Overrides the type auto-detected from the mapper, if any.
     * 
     * @param mapKey The mapKeyClass to set.
     */
    public void setMapKey(String mapKey) {
        this.mapKey = mapKey;
    }

    /**
     * Sets the job map value class. Overrides the type auto-detected from the mapper, if any.
     * 
     * @param mapValue The mapValueClass to set.
     */
    public void setMapValue(String mapValue) {
        this.mapValue = mapValue;
    }

    /**
     * Sets the job mapper class.
     * 
     * @param mapper The mapper to set.
     */
    public void setMapper(String mapper) {
        this.mapper = mapper;
    }

    /**
     * Sets the job reducer class.
     * 
     * @param reducer The reducer to set.
     */
    public void setReducer(String reducer) {
        this.reducer = reducer;
    }

    /**
     * Sets the job combiner class.
     * 
     * @param combiner The combiner to set.
     */
    public void setCombiner(String combiner) {
        this.combiner = combiner;
    }

    /**
     * Sets the job input format class.
     * 
     * @param inputFormat The inputFormat to set.
     */
    public void setInputFormat(String inputFormat) {
        this.inputFormat = inputFormat;
    }

    /**
     * Sets the job output format class.
     * 
     * @param outputFormat The outputFormat to set.
     */
    public void setOutputFormat(String outputFormat) {
        this.outputFormat = outputFormat;
    }

    /**
     * Sets the job partitioner class.
     * 
     * @param partitioner The partitioner to set.
     */
    public void setPartitioner(String partitioner) {
        this.partitioner = partitioner;
    }

    /**
     * Sets the job sort comparator class.
     * 
     * @param sortComparator The sortComparator to set.
     */
    public void setSortComparator(String sortComparator) {
        this.sortComparator = sortComparator;
    }

    /**
     * Sets the job grouping comparator class.
     * 
     * @param groupingComparator The groupingComparator to set.
     */
    public void setGroupingComparator(String groupingComparator) {
        this.groupingComparator = groupingComparator;
    }

    /**
     * Sets the job working directory.
     * 
     * @param workingDir The workingDir to set.
     */
    public void setWorkingDir(String workingDir) {
        this.workingDir = workingDir;
    }

    /**
     * Sets the number of reduce task for this job.
     *  
     * @param numReduceTasks The numReduceTasks to set.
     */
    public void setNumberReducers(Integer numReduceTasks) {
        this.numReduceTasks = numReduceTasks;
    }

    /**
     * Determines the job jar (available on the classpath) based on the given class.
     * 
     * @param jarClass The jarClass to set.
     */
    public void setJarByClass(Class<?> jarClass) {
        this.jarClass = jarClass;
    }

    /**
     * Sets the job jar (which might not be on the classpath).
     * 
     * @param jar The jar to set.
     */
    public void setJar(Resource jar) {
        this.jar = jar;
    }

    /**
     * Sets the job input path(s).
     * <p>
     * A single argument is treated as a comma-delimited list and split accordingly;
     * several arguments are taken as they are. Passing {@code null} clears any
     * previously configured input paths.
     * 
     * @param inputPath job input path(s).
     */
    public void setInputPath(String... inputPath) {
        if (inputPath == null) {
            // nothing to wrap - Arrays.asList would fail on a null array
            this.inputPaths = null;
            return;
        }
        // handle , strings here instead of the namespace to allow SpEL to kick in (if needed)
        if (inputPath.length == 1) {
            inputPath = StringUtils.commaDelimitedListToStringArray(inputPath[0]);
        }
        this.inputPaths = Arrays.asList(inputPath);
    }

    /**
     * Sets the job output path.
     * 
     * @param outputPath The outputPath to set.
     */
    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

    /**
     * Indicates whether the job output should be compressed or not.
     * 
     * @param compressOutput The compressOutput to set.
     */
    public void setCompressOutput(Boolean compressOutput) {
        this.compressOutput = compressOutput;
    }

    /**
     * Sets the job codec class.
     * 
     * @param codecClass The codecClass to set.
     */
    public void setCodec(String codecClass) {
        this.codecClass = codecClass;
    }

    /**
     * The configuration properties to set for this job.
     * 
     * @param properties The properties to set.
     */
    public void setProperties(Properties properties) {
        this.properties = properties;
    }
}