Example usage for org.apache.hadoop.mapred JobConf setPartitionerClass

List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> theClass) 

Source Link

Document

Set the Partitioner class used to partition Mapper -outputs to be sent to the Reducer s.

Usage

From source file:MRDriver.java

License:Apache License

public int run(String args[]) throws Exception {
    FileSystem fs = null;/*from w  ww.ja va 2 s  . c o m*/
    Path samplesMapPath = null;

    float epsilon = Float.parseFloat(args[0]);
    double delta = Double.parseDouble(args[1]);
    int minFreqPercent = Integer.parseInt(args[2]);
    int d = Integer.parseInt(args[3]);
    int datasetSize = Integer.parseInt(args[4]);
    int numSamples = Integer.parseInt(args[5]);
    double phi = Double.parseDouble(args[6]);
    Random rand;

    /************************ Job 1 (local FIM) Configuration ************************/

    JobConf conf = new JobConf(getConf());

    /*
     * Compute the number of required "votes" for an itemsets to be
     * declared frequent    
     */
    // The +1 at the end is needed to ensure reqApproxNum > numsamples / 2.
    int reqApproxNum = (int) Math
            .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
    int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
    //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

    conf.setInt("PARMM.reducersNum", numSamples);
    conf.setInt("PARMM.datasetSize", datasetSize);
    conf.setInt("PARMM.minFreqPercent", minFreqPercent);
    conf.setInt("PARMM.sampleSize", sampleSize);
    conf.setFloat("PARMM.epsilon", epsilon);

    // Set the number of reducers equal to the number of samples, to
    // maximize parallelism. Required by our Partitioner.
    conf.setNumReduceTasks(numSamples);

    // XXX: why do we disable the speculative execution? MR
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    /* 
     * Enable compression of map output.
     *
     * We do it for this job and not for the aggregation one because
     * each mapper there only print out one record for each itemset,
     * so there isn't much to compress, I'd say. MR
     *
     * In Amazon MapReduce compression of the map output seems to be
     * happen by default and the Snappy codec is used, which is
     * extremely fast.
     */
    conf.setBoolean("mapred.compress.map.output", true);
    //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class);

    conf.setJarByClass(MRDriver.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    // We write the collections found in a reducers as a SequenceFile 
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

    // set the mapper class based on command line option
    switch (Integer.parseInt(args[7])) {
    case 1:
        System.out.println("running partition mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(PartitionMapper.class);
        break;
    case 2:
        System.out.println("running binomial mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(BinomialSamplerMapper.class);
        break;
    case 3:
        System.out.println("running coin mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(CoinFlipSamplerMapper.class);
    case 4:
        System.out.println("running sampler mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(InputSamplerMapper.class);

        // create a random sample of size T*m
        rand = new Random();
        long sampling_start_time = System.nanoTime();
        int[] samples = new int[numSamples * sampleSize];
        for (int i = 0; i < numSamples * sampleSize; i++) {
            samples[i] = rand.nextInt(datasetSize);
        }

        // for each key in the sample, create a list of all T samples to which this key belongs
        Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
        for (int i = 0; i < numSamples * sampleSize; i++) {
            ArrayList<IntWritable> sampleIDs = null;
            LongWritable key = new LongWritable(samples[i]);
            if (hashTable.containsKey(key))
                sampleIDs = hashTable.get(key);
            else
                sampleIDs = new ArrayList<IntWritable>();
            sampleIDs.add(new IntWritable(i % numSamples));
            hashTable.put(key, sampleIDs);
        }

        /*
         * Convert the Hastable to a MapWritable which we will
         * write to HDFS and distribute to all Mappers using
         * DistributedCache
         */
        MapWritable map = new MapWritable();
        for (LongWritable key : hashTable.keySet()) {
            ArrayList<IntWritable> sampleIDs = hashTable.get(key);
            IntArrayWritable sampleIDsIAW = new IntArrayWritable();
            sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
            map.put(key, sampleIDsIAW);
        }

        fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
        samplesMapPath = new Path("samplesMap.ser");
        FSDataOutputStream out = fs.create(samplesMapPath, true);
        map.write(out);
        out.sync();
        out.close();
        DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"),
                conf);
        // stop the sampling timer   
        long sampling_end_time = System.nanoTime();
        long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
        System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
        break; // end switch case
    case 5:
        System.out.println("running random integer partition mapper...");
        conf.setInputFormat(WholeSplitInputFormat.class);
        Path inputFilePath = new Path(args[8]);
        WholeSplitInputFormat.addInputPath(conf, inputFilePath);
        conf.setMapperClass(RandIntPartSamplerMapper.class);
        // Compute number of map tasks.
        fs = inputFilePath.getFileSystem(conf);
        FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
        long len = inputFileStatus.getLen();
        long blockSize = inputFileStatus.getBlockSize();
        conf.setLong("mapred.min.split.size", blockSize);
        conf.setLong("mapred.max.split.size", blockSize);
        int mapTasksNum = ((int) (len / blockSize)) + 1;
        conf.setNumMapTasks(mapTasksNum);
        //System.out.println("len: " + len + " blockSize: " 
        //      + blockSize + " mapTasksNum: " + mapTasksNum);
        // Extract random integer partition of total sample
        // size into up to mapTasksNum partitions.
        // XXX I'm not sure this is a correct way to do
        // it.
        rand = new Random();
        IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
        for (int j = 0; j < numSamples; j++) {
            IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
            int sum = 0;
            int i;
            for (i = 0; i < mapTasksNum - 1; i++) {
                int size = rand.nextInt(sampleSize - sum);
                tempToSampleArr[i] = new IntWritable(size);
                sum += size;
                if (sum > numSamples * sampleSize) {
                    System.out.println("Something went wrong generating the sample Sizes");
                    System.exit(1);
                }
                if (sum == sampleSize) {
                    break;
                }
            }
            if (i == mapTasksNum - 1) {
                tempToSampleArr[i] = new IntWritable(sampleSize - sum);
            } else {
                for (; i < mapTasksNum; i++) {
                    tempToSampleArr[i] = new IntWritable(0);
                }
            }
            Collections.shuffle(Arrays.asList(tempToSampleArr));
            for (i = 0; i < mapTasksNum; i++) {
                toSampleArr[i][j] = tempToSampleArr[i];
            }
        }

        for (int i = 0; i < mapTasksNum; i++) {
            DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
        }
        break;
    default:
        System.err.println("Wrong Mapper ID. Can only be in [1,5]");
        System.exit(1);
        break;
    }

    /*
     * We don't use the default hash partitioner because we want to
     * maximize the parallelism. That's why we also fix the number
     * of reducers.
     */
    conf.setPartitionerClass(FIMPartitioner.class);

    conf.setReducerClass(FIMReducer.class);

    /************************ Job 2 (aggregation) Configuration ************************/

    JobConf confAggr = new JobConf(getConf());

    confAggr.setInt("PARMM.reducersNum", numSamples);
    confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
    confAggr.setInt("PARMM.sampleSize", sampleSize);
    confAggr.setFloat("PARMM.epsilon", epsilon);

    // XXX: Why do we disable speculative execution? MR
    confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    confAggr.setJarByClass(MRDriver.class);

    confAggr.setMapOutputKeyClass(Text.class);
    confAggr.setMapOutputValueClass(DoubleWritable.class);

    confAggr.setOutputKeyClass(Text.class);
    confAggr.setOutputValueClass(Text.class);

    confAggr.setMapperClass(AggregateMapper.class);
    confAggr.setReducerClass(AggregateReducer.class);

    confAggr.setInputFormat(CombineSequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));

    FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

    long FIMjob_start_time = System.currentTimeMillis();
    RunningJob FIMjob = JobClient.runJob(conf);
    long FIMjob_end_time = System.currentTimeMillis();

    RunningJob aggregateJob = JobClient.runJob(confAggr);
    long aggrJob_end_time = System.currentTimeMillis();

    long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;

    long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

    if (args[7].equals("4")) {
        // Remove samplesMap file 
        fs.delete(samplesMapPath, false);
    }

    Counters counters = FIMjob.getCounters();
    Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
    long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
    int i = 0;
    for (Counters.Counter counter : FIMMapperStartTimesCounters) {
        FIMMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
    long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMMapperEndTimesCounters) {
        FIMMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
    long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerStartTimesCounters) {
        FIMReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
    long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerEndTimesCounters) {
        FIMReducerEndTimes[i++] = counter.getCounter();
    }

    Counters countersAggr = aggregateJob.getCounters();
    Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
    long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
        AggregateMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
    long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
        AggregateMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
    long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
        AggregateReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
    long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
        AggregateReducerEndTimes[i++] = counter.getCounter();
    }

    long FIMMapperStartMin = FIMMapperStartTimes[0];
    for (long l : FIMMapperStartTimes) {
        if (l < FIMMapperStartMin) {
            FIMMapperStartMin = l;
        }
    }
    long FIMMapperEndMax = FIMMapperEndTimes[0];
    for (long l : FIMMapperEndTimes) {
        if (l > FIMMapperEndMax) {
            FIMMapperEndMax = l;
        }
    }
    System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
    System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));
    long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
    long FIMMapperRunTimesSum = 0;
    for (int l = 0; l < FIMMapperStartTimes.length; l++) {
        FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
        FIMMapperRunTimesSum += FIMMapperRunTimes[l];
    }
    System.out.println("FIMMapper average task runtime (milliseconds): "
            + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
    long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
    long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
    for (long l : FIMMapperRunTimes) {
        if (l < FIMMapperRunTimesMin) {
            FIMMapperRunTimesMin = l;
        }
        if (l > FIMMapperRunTimesMax) {
            FIMMapperRunTimesMax = l;
        }
    }
    System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
    System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

    long FIMReducerStartMin = FIMReducerStartTimes[0];
    for (long l : FIMReducerStartTimes) {
        if (l < FIMReducerStartMin) {
            FIMReducerStartMin = l;
        }
    }
    long FIMReducerEndMax = FIMReducerEndTimes[0];
    for (long l : FIMReducerEndTimes) {
        if (l > FIMReducerEndMax) {
            FIMReducerEndMax = l;
        }
    }
    System.out
            .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
    System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));
    long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
    long FIMReducerRunTimesSum = 0;
    for (int l = 0; l < FIMReducerStartTimes.length; l++) {
        FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
        FIMReducerRunTimesSum += FIMReducerRunTimes[l];
    }
    System.out.println("FIMReducer average task runtime (milliseconds): "
            + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
    long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
    long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
    for (long l : FIMReducerRunTimes) {
        if (l < FIMReducerRunTimesMin) {
            FIMReducerRunTimesMin = l;
        }
        if (l > FIMReducerRunTimesMax) {
            FIMReducerRunTimesMax = l;
        }
    }
    System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
    System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
    System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

    long AggregateMapperStartMin = AggregateMapperStartTimes[0];
    for (long l : AggregateMapperStartTimes) {
        if (l < AggregateMapperStartMin) {
            AggregateMapperStartMin = l;
        }
    }
    long AggregateMapperEndMax = AggregateMapperEndTimes[0];
    for (long l : AggregateMapperEndTimes) {
        if (l > AggregateMapperEndMax) {
            AggregateMapperEndMax = l;
        }
    }
    System.out.println(
            "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time));
    System.out.println("AggregateMapper total runtime (milliseconds): "
            + (AggregateMapperEndMax - AggregateMapperStartMin));
    long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
    long AggregateMapperRunTimesSum = 0;
    for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
        AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
        AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
    }
    System.out.println("AggregateMapper average task runtime (milliseconds): "
            + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
    long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
    long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
    for (long l : AggregateMapperRunTimes) {
        if (l < AggregateMapperRunTimesMin) {
            AggregateMapperRunTimesMin = l;
        }
        if (l > AggregateMapperRunTimesMax) {
            AggregateMapperRunTimesMax = l;
        }
    }
    System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
    System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

    long AggregateReducerStartMin = AggregateReducerStartTimes[0];
    for (long l : AggregateReducerStartTimes) {
        if (l < AggregateReducerStartMin) {
            AggregateReducerStartMin = l;
        }
    }
    long AggregateReducerEndMax = AggregateReducerEndTimes[0];
    for (long l : AggregateReducerEndTimes) {
        if (l > AggregateReducerEndMax) {
            AggregateReducerEndMax = l;
        }
    }
    System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
            + (AggregateReducerStartMin - AggregateMapperEndMax));
    System.out.println("AggregateReducer total runtime (milliseconds): "
            + (AggregateReducerEndMax - AggregateReducerStartMin));
    long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
    long AggregateReducerRunTimesSum = 0;
    for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
        AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
        AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
    }
    System.out.println("AggregateReducer average task runtime (milliseconds): "
            + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
    long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
    long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
    for (long l : AggregateReducerRunTimes) {
        if (l < AggregateReducerRunTimesMin) {
            AggregateReducerRunTimesMin = l;
        }
        if (l > AggregateReducerRunTimesMax) {
            AggregateReducerRunTimesMax = l;
        }
    }
    System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
    System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);

    System.out.println(
            "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax));

    System.out
            .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time));
    System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
            + (AggregateReducerEndMax - FIMMapperStartMin));
    System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
    System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
    System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin));
    System.out.println(
            "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
    System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
            + (AggregateReducerEndMax - AggregateMapperStartMin));

    return 0;
}

From source file:SleepJob.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJob.class);
    job.setNumMapTasks(numMapper);/* w w  w.  ja  v  a2  s  .  c  o m*/
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJob.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJob.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJob.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:NaivePageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    int iteration = -1;
    String inputPath = args[0];/*  www .ja v  a  2 s .c o  m*/
    String outputPath = args[1];
    int specIteration = 0;
    if (args.length > 2) {
        specIteration = Integer.parseInt(args[2]);
    }
    int numNodes = 100000;
    if (args.length > 3) {
        numNodes = Integer.parseInt(args[3]);
    }
    int numReducers = 32;
    if (args.length > 4) {
        numReducers = Integer.parseInt(args[4]);
    }
    System.out.println("specified iteration: " + specIteration);
    long start = System.currentTimeMillis();

    /**
     * job to count out-going links for each url
     */
    JobConf conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Count");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(CountMapper.class);
    conf.setReducerClass(CountReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/count"));
    conf.setNumReduceTasks(numReducers);
    JobClient.runJob(conf);

    /******************** Initial Rank Assignment Job ***********************/
    conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Initialize");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(InitialRankAssignmentMapper.class);
    conf.setReducerClass(InitialRankAssignmentReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
    conf.setNumReduceTasks(numReducers);
    // conf.setIterative(false);
    JobClient.runJob(conf);
    iteration++;

    do {
        /****************** Join Job ********************************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Join");
        conf.setOutputKeyClass(Text.class);
        // conf.setOutputValueClass(Text.class);
        conf.setMapperClass(ComputeRankMap.class);
        conf.setReducerClass(ComputeRankReduce.class);
        conf.setMapOutputKeyClass(TextPair.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setPartitionerClass(FirstPartitioner.class);
        conf.setOutputKeyComparatorClass(KeyComparator.class);
        conf.setOutputValueGroupingComparator(GroupComparator.class);

        // relation table
        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        // rank table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/i" + (iteration - 1)));
        // count table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/count"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        JobClient.runJob(conf);
        iteration++;

        /******************** Rank Aggregate Job ***********************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Aggregate");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapperClass(RankAggregateMapper.class);
        conf.setReducerClass(RankAggregateReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(outputPath + "/i" + (iteration - 1)));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        conf.setInt("haloop.num.nodes", numNodes);
        JobClient.runJob(conf);
        iteration++;
    } while (iteration < 2 * specIteration);

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");
}

From source file:SleepJobWithArray.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJobWithArray.class);
    job.setNumMapTasks(numMapper);/*from www  . j av a 2  s . c o  m*/
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJobWithArray.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJobWithArray.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJobWithArray.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

protected JobConf getJobConf(JobConf parentConf) throws IOException {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf);

    // set values first so they can't break things downstream
    if (hasProperties()) {
        for (Map.Entry entry : getProperties().entrySet())
            conf.set(entry.getKey().toString(), entry.getValue().toString());
    }/*from w  w  w.  java2  s.  c om*/

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepName());

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapperClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(conf);

    initFromSink(conf);

    initFromTraps(conf);

    if (sink.getScheme().getNumSinkParts() != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(sink.getScheme().getNumSinkParts());
        else
            conf.setNumMapTasks(sink.getScheme().getNumSinkParts());
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    conf.setInt("cascading.flow.step.id", id);
    conf.set("cascading.flow.step", Util.serializeBase64(this));

    return conf;
}

From source file:cascading.flow.hadoop.HadoopFlowStep.java

License:Open Source License

public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);/* w ww.j  a  v a2  s.c  o m*/

    initFromSink(flowProcess, conf);

    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                    "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this,
                    getGroup());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    if (isHadoopLocalMode(conf) || length < maxSize) // seems safe
    {
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                    HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}

From source file:cn.edu.xmu.dm.mapreduce.Sort.java

License:Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job./*w w  w  .  j  av a2  s . co  m*/
 * 
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:com.alexholmes.hadooputils.sort.Sort.java

License:Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */// www  .  j a  v  a 2s  .  co m
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}

From source file:com.benchmark.mapred.SleepJob.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJob.class);
    job.setNumMapTasks(numMapper);//from   www . j  av a  2  s  .c  om
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJob.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJob.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJob.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:com.benchmark.mapred.Sort.java

License:Apache License

/**
 * The main driver for sort program./* w  w  w  .  jav  a  2 s  . co m*/
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the 
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}