Example usage for org.apache.hadoop.mapreduce Job setJarByClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setJarByClass.

Prototype

public void setJarByClass(Class<?> cls) 

Document

Set the Jar by finding where a given class came from.
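
In a typical driver, setJarByClass is called once, right after Job.getInstance, and is passed the driver class itself; Hadoop then searches the classpath for the jar containing that class and submits it with the job. A minimal, self-contained sketch (the class name and the use of the identity mapper/reducer are illustrative, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PassThroughDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "pass-through job");
        // locate the jar that contains this class and ship it with the job
        job.setJarByClass(PassThroughDriver.class);

        // no mapper or reducer set: Hadoop's identity classes copy each
        // (offset, line) record from TextInputFormat straight through
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}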

Usage

From source file: clustering.inverted_index.Driver.java

License: Apache License
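
This driver chains two ControlledJobs under a JobControl: job1 normalizes the vector length of each document and job2 builds the inverted index from the normalized output; both jobs call setJarByClass(Driver.class) so that Hadoop ships the jar containing the driver.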

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s tf_idf_result_dir output_dir" + "[decimal_number] [pruning_threshold]\n",
                getClass().getSimpleName());
        System.exit(1);/* ww  w  . j a  v a  2s  .c o m*/
    }

    Path normDir = new Path(args[1] + "/normed");
    Path resultDir = new Path(args[1] + "/result");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 2) {
        conf.setInt("deci.number", Integer.valueOf(args[2]));
    } else {
        conf.setInt("deci.number", 4);
    }

    if (args.length > 3) {
        conf.setBoolean("pruning", true);
        conf.setDouble("pruning.threshold", Double.valueOf(args[3]));
    } else {
        conf.setBoolean("pruning", false);
    }

    JobControl jobControl = new JobControl("inverted-index jobs");

    /* step 1, normalize the vector length of each document */

    Job job1 = Job.getInstance(conf, "tf idf normalizer job");
    job1.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job1, new Path(args[0]));
    job1.setInputFormatClass(KeyValueTextInputFormat.class);

    // Mapper.class is Hadoop's identity mapper: each (key, value) pair is
    // passed through to the reducer unchanged
    job1.setMapperClass(Mapper.class);

    job1.setReducerClass(NormalizerReducer.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job1, normDir);

    ControlledJob controlledJob1 = new ControlledJob(conf);
    controlledJob1.setJob(job1);
    jobControl.addJob(controlledJob1);

    /* step 2, calculate inverted index */

    Job job2 = Job.getInstance(conf, "inverted index job");
    job2.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job2, normDir);

    job2.setInputFormatClass(KeyValueTextInputFormat.class);

    // the identity Mapper again: normalized records pass straight to the reducer
    job2.setMapperClass(Mapper.class);

    job2.setReducerClass(InvertedIndexReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job2, resultDir);

    ControlledJob controlledJob2 = new ControlledJob(conf);
    controlledJob2.setJob(job2);
    controlledJob2.addDependingJob(controlledJob1);
    jobControl.addJob(controlledJob2);

    MapReduceUtils.runJobs(jobControl);

    return job2.waitForCompletion(true) ? 0 : 1;
}

From source file: clustering.link_back.pre.Driver.java

License: Apache License
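
A single-job configurator for the link-back pre step; setJarByClass(Driver.class) resolves the job jar from the enclosing driver class.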

public Job configJob(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s input_dir output_dir\n", getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "linkback pre step");
    job.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(AttachMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job;
}

From source file: clustering.link_back.step1.Driver.java

License: Apache License
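
A reduce-side join over two input directories, using a custom partitioner and grouping comparator; setJarByClass(Driver.class) follows the usual pattern of passing the driver class.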

public Job configJob(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s mst_result_dir simhash_result_file output_dir\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "link back step 1 job");
    job.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));

    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(SetKeyMapper.class);
    job.setMapOutputKeyClass(Step1KeyWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(JoinPartitioner.class);
    job.setGroupingComparatorClass(Step1GroupComparator.class);

    job.setReducerClass(JoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    return job;
}

From source file: clustering.link_back.step2.Driver.java

License: Apache License
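
The second link-back join step, configured like step 1 but with its own key type and grouping comparator; again setJarByClass(Driver.class) locates the jar via the driver.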

public Job configJob(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s pre_step_result_dir step1_result_dir output_dir\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "link back step 2 job");
    job.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));

    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(clustering.link_back.step2.SetKeyMapper.class);
    job.setMapOutputKeyClass(Step2KeyWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(JoinPartitioner.class);
    job.setGroupingComparatorClass(Step2GroupComparator.class);

    job.setReducerClass(clustering.link_back.step2.JoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    return job;
}

From source file: clustering.mst.Driver.java

License: Apache License
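
Two chained jobs compute per-split child MSTs and then merge them into the final MST. Note that the final job passes FinalReducer.class to setJarByClass: any class packaged in the same jar works, because the method only uses the class to locate the jar.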

@Override
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s similarity_result_dir document_count_file output_dir "
                + "[cluster_threshold] [reduce_number] [compression]\n", getClass().getSimpleName());
        System.exit(1);
    }

    Path step1_OutputDir = new Path(args[2] + "/step1");
    Path resultDir = new Path(args[2] + "/result");

    URI docCntFile = new URI(args[1] + "/part-r-00000#docCnt");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 3) {
        conf.setDouble("final.threshold", Double.valueOf(args[3]));
    } else {
        conf.setDouble("final.threshold", 0.2d);
    }
    if (args.length > 4) {
        conf.setInt("reduce.task.num", Integer.valueOf(args[4]));
    } else {
        conf.setInt("reduce.task.num", 5);
    }

    JobControl jobControl = new JobControl("mst jobs");

    /* step 1, split and calculate the child msts */

    Job childJob = Job.getInstance(conf, "mst child job");
    childJob.setJarByClass(Driver.class);

    childJob.addCacheFile(docCntFile);

    if (args.length > 5 && args[5].equals("0")) {
        FileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(KeyValueTextInputFormat.class);
    } else {
        SequenceFileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(SequenceFileAsTextInputFormat.class);
    }

    FileOutputFormat.setOutputPath(childJob, step1_OutputDir);

    childJob.setMapperClass(ChildMapper.class);
    childJob.setMapOutputKeyClass(DoubleWritable.class);
    childJob.setMapOutputValueClass(Text.class);

    childJob.setPartitionerClass(ChildPartitioner.class);

    childJob.setReducerClass(ChildReducer.class);
    childJob.setNumReduceTasks(conf.getInt("reduce.task.num", 1));
    childJob.setOutputKeyClass(DoubleWritable.class);
    childJob.setOutputValueClass(Text.class);

    ControlledJob controlledChildJob = new ControlledJob(conf);
    controlledChildJob.setJob(childJob);
    jobControl.addJob(controlledChildJob);

    /* step 2, merge step 1's output and calculate final mst */

    Job finalJob = Job.getInstance(conf, "mst final job");
    finalJob.setJarByClass(FinalReducer.class);

    finalJob.addCacheFile(docCntFile);

    FileInputFormat.addInputPath(finalJob, step1_OutputDir);
    finalJob.setInputFormatClass(KeyValueTextInputFormat.class);

    finalJob.setMapperClass(FinalMapper.class);
    finalJob.setMapOutputKeyClass(DoubleWritable.class);
    finalJob.setMapOutputValueClass(Text.class);

    finalJob.setReducerClass(FinalReducer.class);
    finalJob.setOutputKeyClass(IntWritable.class);
    finalJob.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(finalJob, resultDir);

    ControlledJob finalControlledJob = new ControlledJob(conf);
    finalControlledJob.setJob(finalJob);
    finalControlledJob.addDependingJob(controlledChildJob);
    jobControl.addJob(finalControlledJob);

    // run jobs

    MapReduceUtils.runJobs(jobControl);

    return finalJob.waitForCompletion(true) ? 0 : 1;
}

From source file: clustering.simhash.Driver.java

License: Apache License
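
A two-step simhash pipeline run under a JobControl, with both jobs calling setJarByClass(Driver.class); the driver also times the complete run.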

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s init_result_dir output_dir [simhash_threshold]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Path step1_outputDir = new Path(args[1] + "/step1");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 2) {
        conf.setInt("simhash.threshold", Integer.valueOf(args[2]));
    } else {
        conf.setInt("simhash.threshold", 3);
    }

    JobControl jobControl = new JobControl("simhash jobs");

    Job job1 = Job.getInstance(conf, "simhash step1 job");
    job1.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job1, new Path(args[0]));
    job1.setInputFormatClass(KeyValueTextInputFormat.class);

    job1.setMapperClass(Step1Mapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(Text.class);

    job1.setReducerClass(Step1Reducer.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job1, step1_outputDir);

    ControlledJob controlledJob1 = new ControlledJob(conf);
    controlledJob1.setJob(job1);
    jobControl.addJob(controlledJob1);

    Job job2 = Job.getInstance(conf, "simhash step2 job");
    job2.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job2, step1_outputDir);
    job2.setInputFormatClass(KeyValueTextInputFormat.class);

    job2.setMapperClass(Step2Mapper.class);
    job2.setMapOutputKeyClass(IntWritable.class);
    job2.setMapOutputValueClass(Text.class);

    job2.setReducerClass(Step2Reducer.class);
    job2.setOutputKeyClass(IntWritable.class);
    job2.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job2, new Path(args[1] + "/result"));

    ControlledJob controlledJob2 = new ControlledJob(conf);
    controlledJob2.setJob(job2);
    controlledJob2.addDependingJob(controlledJob1);
    jobControl.addJob(controlledJob2);

    long starttime = System.currentTimeMillis();
    clustering.Utils.MapReduceUtils.runJobs(jobControl);

    boolean complete = job2.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("simhash job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file: clustering.similarity.ISimDriver.java

License: Apache License
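
A single similarity job that reads either plain text or compressed sequence files, selected by a command-line flag; setJarByClass(ISimDriver.class) passes the driver class itself.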

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));

    job.setReducerClass(ISimReducer.class);
    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file: clustering.similarity.PreDriver.java

License: Apache License
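
The similarity pre-step, with a custom partitioner, a configurable reducer count, and gzip-compressed sequence-file output by default; setJarByClass(PreDriver.class) again points at the driver.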

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf(
                "usage: %s inverted_index_result_dir output_dir"
                        + " [compress_or_not] [reducer_number] [deci_number]\n",
                this.getClass().getSimpleName());
        System.exit(1);
    }
    Configuration conf = getConf();

    conf = MapReduceUtils.initConf(conf);
    conf.set("mapreduce.reduce.speculative", "false");

    // TODO: 17-4-24 calculate split number from reducer number
    conf.setInt("split.num", 8);

    if (args.length > 3) {
        conf.setInt("reducer.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reducer.num", 29);
    }
    if (args.length > 4) {
        conf.setInt("deci.number", Integer.valueOf(args[4]));
    } else {
        conf.setInt("deci.number", 3);
    }

    Job job = Job.getInstance(conf, "pre job");
    job.setJarByClass(PreDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(PreMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(PrePartitioner.class);

    job.setNumReduceTasks(conf.getInt("reducer.num", 29));
    job.setReducerClass(PreReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // set default compression
    if (args.length > 2 && args[2].equals("0")) {
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity pre job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}

From source file: clustering.tf_idf.DocCntDriver.java

License: Apache License
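
Here setJarByClass receives WorkflowDriver.class rather than DocCntDriver itself; this is valid as long as both classes are packaged in the same jar, since only the jar's location matters.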

Job configJob(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simhash_result_dir pre_step_output_dir\n", getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = initConf(conf);

    Job job = Job.getInstance(conf, "tf idf pre job");
    job.setJarByClass(WorkflowDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DocCntMapper.class);
    job.setCombinerClass(DocCntReducer.class);

    job.setReducerClass(DocCntReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(IntWritable.class);

    return job;
}

From source file: clustering.tf_idf.TermCntDriver.java

License: Apache License
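
A word-count-style term-count job that reuses its reducer as a combiner; setJarByClass(TermCntDriver.class) follows the standard pattern of passing the driver class.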

Job configJob(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simhash_result_dir step_1_output_dir\n", getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = initConf(conf);

    Job job = Job.getInstance(conf, "tf idf step1 job");
    job.setJarByClass(TermCntDriver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(TermCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setCombinerClass(TermCountReducer.class);

    job.setReducerClass(TermCountReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job;
}