Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
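
The class passed to setPartitionerClass must extend org.apache.hadoop.mapreduce.Partitioner and decide, for each map-output key, which reduce task receives the record. Below is a minimal sketch of such a partitioner; the Text/IntWritable type parameters and the hash-modulo scheme are illustrative assumptions, not taken from the examples that follow.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative custom partitioner: routes each key to a reducer by its
// hash code. Swap the type parameters to match the job's map output
// key/value classes.
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the partition index is non-negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

A job registers it with job.setPartitionerClass(MyPartitioner.class); as the prototype above indicates, the call throws IllegalStateException if the job has already been submitted.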

Usage

From source file:biglayer.AutoCoder.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    /*if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
       System.out.println("args: " + Arrays.toString(args));
       HelpFormatter formatter = new HelpFormatter();
       formatter.setWidth(120);
       formatter.printHelp(this.getClass().getName(), options);
       ToolRunner.printGenericCommandUsage(System.out);
       return -1;
    }*/

    //String inputPath = cmdline.getOptionValue(INPUT);
    //String outputPath = cmdline.getOptionValue(OUTPUT);

    String inputPath = "qiwang321/MNIST-mingled-key/part*";
    String outputPath = "shangfu/layeroutput";

    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();

    conf.setInt("num_reduce_task", reduceTasks);
    conf.set("sidepath", outputPath + "_side/");

    Job job0 = Job.getInstance(conf);
    job0.setJobName(AutoCoder.class.getSimpleName());
    job0.setJarByClass(AutoCoder.class);
    job0.setNumReduceTasks(reduceTasks);

    job0.getConfiguration().setInt("layer_ind", 0);

    FileInputFormat.setInputPaths(job0, new Path(inputPath));
    FileOutputFormat.setOutputPath(job0, new Path(outputPath + "_0"));

    job0.setInputFormatClass(KeyValueTextInputFormat.class);
    job0.setOutputFormatClass(SequenceFileOutputFormat.class);

    job0.setMapOutputKeyClass(PairOfInts.class);
    job0.setMapOutputValueClass(ModelNode.class);
    job0.setOutputKeyClass(PairOfInts.class);
    job0.setOutputValueClass(ModelNode.class);

    job0.setMapperClass(MyMapper0.class);
    job0.setReducerClass(MyReducer0.class);
    job0.setPartitionerClass(MyPartitioner.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath + "_0");
    FileSystem.get(getConf()).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    long codeStart = System.currentTimeMillis();
    double codeTimeSum = 0;
    job0.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    codeTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;

    for (int iterations = 1; iterations < GlobalUtil.NUM_LAYER + 1; iterations++) {
        Job job1 = Job.getInstance(conf);
        job1.setJobName(AutoCoder.class.getSimpleName());
        job1.setJarByClass(AutoCoder.class);
        job1.setNumReduceTasks(reduceTasks);
        job1.getConfiguration().setInt("layer_ind", iterations);
        FileInputFormat.setInputPaths(job1, new Path(outputPath + "_" + (iterations - 1)));
        FileOutputFormat.setOutputPath(job1, new Path(outputPath + "_" + iterations + "_train"));

        LOG.info("Tool: " + AutoCoder.class.getSimpleName());
        LOG.info(" - input path: " + outputPath + "_" + (iterations - 1));
        LOG.info(" - output path: " + outputPath + "_" + iterations + "_train");
        LOG.info(" - number of reducers: " + reduceTasks);

        job1.setInputFormatClass(SequenceFileInputFormat.class);
        job1.setOutputFormatClass(SequenceFileOutputFormat.class);

        job1.setMapOutputKeyClass(PairOfInts.class);
        job1.setMapOutputValueClass(ModelNode.class);
        job1.setOutputKeyClass(PairOfInts.class);
        job1.setOutputValueClass(ModelNode.class);

        job1.setMapperClass(MyMapper.class);
        job1.setReducerClass(MyReducer_Train.class);
        job1.setPartitionerClass(MyPartitioner.class);
        // Delete the output directory if it exists already.
        outputDir = new Path(outputPath + "_" + iterations + "_train");
        FileSystem.get(getConf()).delete(outputDir, true);
        startTime = System.currentTimeMillis();
        job1.waitForCompletion(true);
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        codeTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;

        Job job2 = Job.getInstance(conf);
        job2.setJobName(AutoCoder.class.getSimpleName());
        job2.setJarByClass(AutoCoder.class);
        job2.setNumReduceTasks(reduceTasks);
        job2.getConfiguration().setInt("layer_ind", iterations);
        FileInputFormat.setInputPaths(job2, new Path(outputPath + "_" + (iterations + "_train")));
        FileOutputFormat.setOutputPath(job2, new Path(outputPath + "_" + iterations));

        LOG.info("Tool: " + AutoCoder.class.getSimpleName());
        LOG.info(" - input path: " + outputPath + "_" + iterations + "_train");
        LOG.info(" - output path: " + outputPath + "_" + iterations);
        LOG.info(" - number of reducers: " + reduceTasks);

        job2.setInputFormatClass(SequenceFileInputFormat.class);
        job2.setOutputFormatClass(SequenceFileOutputFormat.class);

        job2.setMapOutputKeyClass(PairOfInts.class);
        job2.setMapOutputValueClass(ModelNode.class);
        job2.setOutputKeyClass(PairOfInts.class);
        job2.setOutputValueClass(ModelNode.class);

        job2.setMapperClass(MyMapper.class);
        job2.setReducerClass(MyReducer_GenData.class);
        job2.setPartitionerClass(MyPartitioner.class);
        // Delete the output directory if it exists already.
        outputDir = new Path(outputPath + "_" + iterations);
        FileSystem.get(getConf()).delete(outputDir, true);
        startTime = System.currentTimeMillis();
        job2.waitForCompletion(true);
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        codeTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;

    }

    LOG.info(" - input path: " + outputPath + "_" + GlobalUtil.NUM_LAYER);
    LOG.info(" - output path: " + outputPath);
    reduceTasks = 1;
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job_super = Job.getInstance(conf);
    job_super.setJobName(AutoCoder.class.getSimpleName());
    job_super.setJarByClass(AutoCoder.class);
    job_super.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job_super, new Path(outputPath + "_" + GlobalUtil.NUM_LAYER));
    FileOutputFormat.setOutputPath(job_super, new Path(outputPath));

    job_super.setInputFormatClass(SequenceFileInputFormat.class);
    job_super.setOutputFormatClass(SequenceFileOutputFormat.class);

    job_super.setMapOutputKeyClass(PairOfInts.class);
    job_super.setMapOutputValueClass(ModelNode.class);
    job_super.setOutputKeyClass(NullWritable.class);
    job_super.setOutputValueClass(NullWritable.class);

    job_super.setMapperClass(MyMapper_Super.class);
    job_super.setReducerClass(MyReducer_Super.class);
    job_super.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    startTime = System.currentTimeMillis();
    job_super.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    codeTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;

    Log.info("Final Time: " + ((System.currentTimeMillis() - codeStart) / 1000.0) + " seconds,  " + codeTimeSum
            + " seconds.");
    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);

    return 0;
}

From source file:bigmodel.AutoCoderLocal.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT) + "/part-r-00000";
    String outputPath = cmdline.getOptionValue(OUTPUT);
    String dataPath = cmdline.getOptionValue(INPUT) + "/common";
    //String inputPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/part*";
    //String outputPath = "output";
    //String dataPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/common";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoderLocal.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();
    initialParameters(conf);

    conf.set("dataPath", dataPath);

    Job job = Job.getInstance(conf);
    job.setJobName(AutoCoderLocal.class.getSimpleName());
    job.setJarByClass(AutoCoderLocal.class);
    // set the path of the information of k clusters in this iteration
    job.getConfiguration().set("sidepath", inputPath + "/side_output");
    job.setNumReduceTasks(reduceTasks);

    dataShuffle();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024);
    FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ModelNode.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SuperModel.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);

    return 0;
}

From source file:bigsidemodel.AutoCoder.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    /*if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
       System.out.println("args: " + Arrays.toString(args));
       HelpFormatter formatter = new HelpFormatter();
       formatter.setWidth(120);
       formatter.printHelp(this.getClass().getName(), options);
       ToolRunner.printGenericCommandUsage(System.out);
       return -1;
    }*/

    //String inputPath = cmdline.getOptionValue(INPUT);
    //String outputPath = cmdline.getOptionValue(OUTPUT);

    String inputPath = "qiwang321/best5-mingled-key-56x56/part*";
    String outputPath = "shangfu/bigoutput";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath + "0");
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();
    conf.setInt("num_reduce_task", reduceTasks);
    conf.set("sidepath", outputPath + "_side/");

    Job job0 = Job.getInstance(conf);
    job0.setJobName(AutoCoder.class.getSimpleName());
    job0.setJarByClass(AutoCoder.class);
    // set the path of the information of k clusters in this iteration
    job0.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job0, new Path(inputPath));
    FileOutputFormat.setOutputPath(job0, new Path(outputPath + "0"));

    job0.setInputFormatClass(KeyValueTextInputFormat.class);
    job0.setOutputFormatClass(SequenceFileOutputFormat.class);

    job0.setMapOutputKeyClass(PairOfInts.class);
    job0.setMapOutputValueClass(DataNode.class);
    job0.setOutputKeyClass(PairOfInts.class);
    job0.setOutputValueClass(DataNode.class);

    job0.setMapperClass(MyMapper0.class);
    job0.setReducerClass(MyReducer0.class);
    job0.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath + "0");
    FileSystem.get(getConf()).delete(outputDir, true);

    long codeStart = System.currentTimeMillis();
    double jobTimeSum = 0;

    long startTime = System.currentTimeMillis();
    job0.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    jobTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;

    //======= Job 1
    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + outputPath + "0");
    LOG.info(" - output path: " + outputPath + "1");
    LOG.info(" - number of reducers: " + 1);
    int nModel = reduceTasks;
    reduceTasks = 1;

    Job job1 = Job.getInstance(conf);
    job1.setJobName(AutoCoder.class.getSimpleName());
    job1.setJarByClass(AutoCoder.class);
    // set the path of the information of k clusters in this iteration
    job1.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job1, new Path(outputPath + "0"));
    FileOutputFormat.setOutputPath(job1, new Path(outputPath + "1"));

    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(PairOfInts.class);
    job1.setMapOutputValueClass(DataNode.class);
    job1.setOutputKeyClass(NullWritable.class);
    job1.setOutputValueClass(NullWritable.class);

    job1.setMapperClass(MyMapper1.class);
    job1.setReducerClass(MyReducer1.class);
    job1.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    outputDir = new Path(outputPath + "1");
    FileSystem.get(getConf()).delete(outputDir, true);

    startTime = System.currentTimeMillis();
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    jobTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;
    LOG.info("Final Time: " + ((System.currentTimeMillis() - codeStart) / 1000.0) + " seconds,  " + jobTimeSum
            + " seconds.");

    return 0;
}

From source file:boa.runtime.BoaRunner.java

License:Apache License

/**
 * Create a {@link Job} describing the work to be done by this Boa job.
 *
 * @param ins
 *            An array of {@link Path} containing the locations of the input
 *            files
 * 
 * @param out
 *            A {@link Path} containing the location of the output file
 * 
 * @param robust
 *            A boolean representing whether the job should ignore most
 *            exceptions
 * 
 * @return A {@link Job} describing the work to be done by this Boa job
 * @throws IOException
 */
public Job job(final Path[] ins, final Path out, final boolean robust) throws IOException {
    final Configuration configuration = getConf();

    configuration.setBoolean("boa.runtime.robust", robust);

    // faster local reads
    configuration.setBoolean("dfs.client.read.shortcircuit", true);
    configuration.setBoolean("dfs.client.read.shortcircuit.skip.checksum", true);

    // by default our MapFiles index every key, which takes up
    // a lot of memory - this lets you skip keys in the index and
    // control the memory requirements (as a tradeoff of slower gets)
    //configuration.setLong("io.map.index.skip", 128);

    // map output compression
    configuration.setBoolean("mapred.compress.map.output", true);
    configuration.set("mapred.map.output.compression.type", "BLOCK");
    configuration.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    configuration.setBoolean("mapred.map.tasks.speculative.execution", false);
    configuration.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    configuration.setLong("mapred.job.reuse.jvm.num.tasks", -1);

    final Job job = new Job(configuration);

    if (ins != null)
        for (final Path in : ins)
            FileInputFormat.addInputPath(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setPartitionerClass(BoaPartitioner.class);

    job.setMapOutputKeyClass(EmitKey.class);
    job.setMapOutputValueClass(EmitValue.class);

    job.setOutputFormatClass(BoaOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    return job;
}

From source file:byte_import.HexastoreBulkImport.java

License:Open Source License

public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        job = new Job(new Configuration(), NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(sampler.TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        Configuration conf = new Configuration();
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
        job.getConfiguration().setInt("mapred.tasktracker.map.tasks.maximum", 5);
        job.getConfiguration().setInt("mapred.tasktracker.reduce.tasks.maximum", 5);
        //job.getConfiguration().setInt("io.sort.mb", 100);

    } catch (IOException e2) {
        e2.printStackTrace();
    }

    return job;
}

From source file:cityhubpartitioningcountry.CityHubPartitioning.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "partitioner");
    job.setJarByClass(CityHubPartitioning.class);
    job.setMapperClass(PartitionMonthMapper.class);
    job.setReducerClass(countryReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(PartitionCountryPartitioner.class);

    job.setNumReduceTasks(27);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:clustering.link_back.step1.Driver.java

License:Apache License

public Job configJob(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s mst_result_dir simhash_result_file output_dir\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "link back step 1 job");
    job.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));

    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(SetKeyMapper.class);
    job.setMapOutputKeyClass(Step1KeyWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(JoinPartitioner.class);
    job.setGroupingComparatorClass(Step1GroupComparator.class);

    job.setReducerClass(JoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    return job;
}

From source file:clustering.link_back.step2.Driver.java

License:Apache License

public Job configJob(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s pre_step_result_dir step1_result_dir output_dir\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "link back step 2 job");
    job.setJarByClass(Driver.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));

    job.setInputFormatClass(KeyValueTextInputFormat.class);

    job.setMapperClass(clustering.link_back.step2.SetKeyMapper.class);
    job.setMapOutputKeyClass(Step2KeyWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(JoinPartitioner.class);
    job.setGroupingComparatorClass(Step2GroupComparator.class);

    job.setReducerClass(clustering.link_back.step2.JoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    return job;
}

From source file:clustering.mst.Driver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.printf("usage: %s similarity_result_dir document_count_file output_dir "
                + "[cluster_threshold] [reduce_number] [compression]\n", getClass().getSimpleName());
        System.exit(1);
    }

    Path step1_OutputDir = new Path(args[2] + "/step1");
    Path resultDir = new Path(args[2] + "/result");

    URI docCntFile = new URI(args[1] + "/part-r-00000#docCnt");

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    if (args.length > 3) {
        conf.setDouble("final.threshold", Double.valueOf(args[3]));
    } else {
        conf.setDouble("final.threshold", 0.2d);
    }
    if (args.length > 4) {
        conf.setInt("reduce.task.num", Integer.valueOf(args[4]));
    } else {
        conf.setInt("reduce.task.num", 5);
    }

    JobControl jobControl = new JobControl("mst jobs");

    /* step 1, split and calculate the child msts */

    Job childJob = Job.getInstance(conf, "mst child job");
    childJob.setJarByClass(Driver.class);

    childJob.addCacheFile(docCntFile);

    if (args.length > 5 && args[5].equals("0")) {
        FileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(KeyValueTextInputFormat.class);
    } else {
        SequenceFileInputFormat.addInputPath(childJob, new Path(args[0]));
        childJob.setInputFormatClass(SequenceFileAsTextInputFormat.class);
    }

    FileOutputFormat.setOutputPath(childJob, step1_OutputDir);

    childJob.setMapperClass(ChildMapper.class);
    childJob.setMapOutputKeyClass(DoubleWritable.class);
    childJob.setMapOutputValueClass(Text.class);

    childJob.setPartitionerClass(ChildPartitioner.class);

    childJob.setReducerClass(ChildReducer.class);
    childJob.setNumReduceTasks(conf.getInt("reduce.task.num", 1));
    childJob.setOutputKeyClass(DoubleWritable.class);
    childJob.setOutputValueClass(Text.class);

    ControlledJob controlledChildJob = new ControlledJob(conf);
    controlledChildJob.setJob(childJob);
    jobControl.addJob(controlledChildJob);

    /* step 2, merge step 1's output and calculate final mst */

    Job finalJob = Job.getInstance(conf, "mst final job");
    finalJob.setJarByClass(FinalReducer.class);

    finalJob.addCacheFile(docCntFile);

    FileInputFormat.addInputPath(finalJob, step1_OutputDir);
    finalJob.setInputFormatClass(KeyValueTextInputFormat.class);

    finalJob.setMapperClass(FinalMapper.class);
    finalJob.setMapOutputKeyClass(DoubleWritable.class);
    finalJob.setMapOutputValueClass(Text.class);

    finalJob.setReducerClass(FinalReducer.class);
    finalJob.setOutputKeyClass(IntWritable.class);
    finalJob.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(finalJob, resultDir);

    ControlledJob finalControlledJob = new ControlledJob(conf);
    finalControlledJob.setJob(finalJob);
    finalControlledJob.addDependingJob(controlledChildJob);
    jobControl.addJob(finalControlledJob);

    // run jobs

    MapReduceUtils.runJobs(jobControl);

    return finalJob.waitForCompletion(true) ? 0 : 1;
}

From source file:clustering.similarity.ISimDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));

    job.setReducerClass(ISimReducer.class);
    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}