com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileJob.java Source code

Java tutorial

Introduction

Here is the source code for com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileJob.java

Source

/*
 * Copyright 2013 Alex Holmes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.alexholmes.hadooputils.combine.seqfile.mapred;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Date;
import java.util.concurrent.TimeUnit;

/**
 * This is a simple map-only job that uses the {@link com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileInputFormat} in an identity
 * job.
 */
public class CombineSequenceFileJob extends Configured implements Tool {

    /**
     * Usage string.
     */
    private static final String[] USAGE = { "bin/hadoop jar hadoop-utils-<version>.jar "
            + CombineSequenceFileJob.class.getName() + "[OPTION]... INPUT_DIR OUTPUT_DIR", };

    /**
     * Print the usage.
     *
     * @return the Java exit code
     */
    static int printUsage() {
        System.out.println(StringUtils.join(USAGE, "\n"));
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * The driver for program which works with command-line arguments.
     *
     * @param args command-line arguments
     * @return 0 if everything went well, non-zero for everything else
     * @throws Exception When there is communication problems with the
     *                   job tracker.
     */
    @SuppressWarnings("unchecked")
    public int run(final String[] args) throws Exception {

        if (args.length != 2) {
            return printUsage();
        }

        if (runJob(getConf(), args[0], args[1])) {
            return 0;
        }
        return 1;
    }

    /**
     * The driver for the MapReduce job.
     *
     * @param conf           configuration
     * @param inputDirAsString  input directory in CSV-form
     * @param outputDirAsString output directory
     * @return true if the job completed successfully
     * @throws java.io.IOException         if something went wrong
     * @throws java.net.URISyntaxException if a URI wasn't correctly formed
     */
    public boolean runJob(final Configuration conf, final String inputDirAsString, final String outputDirAsString)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

        JobConf job = new JobConf(conf);

        job.setJarByClass(CombineSequenceFileJob.class);
        job.setJobName("seqfilecombiner");

        job.setNumReduceTasks(0);

        job.setMapperClass(IdentityMapper.class);

        job.setInputFormat(CombineSequenceFileInputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, inputDirAsString);
        FileOutputFormat.setOutputPath(job, new Path(outputDirAsString));

        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        RunningJob jobResult = JobClient.runJob(job);

        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);
        System.out.println("The job took "
                + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

        return jobResult.isSuccessful();
    }

    /**
     * Main entry point for the utility.
     *
     * @param args arguments
     * @throws Exception when something goes wrong
     */
    public static void main(final String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CombineSequenceFileJob(), args);
        System.exit(res);
    }
}