Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
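
As a quick orientation before the full examples, here is a minimal sketch of the typical call pattern. TokenizerMapper and IntSumReducer are hypothetical stand-ins for the classic word-count mapper/reducer pair; the essential constraint is that the combiner reads and writes the map output types, and that its logic is commutative and associative, because Hadoop may apply it zero or more times per map task.

// Minimal usage sketch (hypothetical classes, not taken from the examples below).
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class); // local pre-aggregation of map output
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);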

Usage

From source file: edu.umd.honghongie.BuildInvertedIndexCompressed.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + BuildInvertedIndexCompressed.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(BuildInvertedIndexCompressed.class.getSimpleName());
    job.setJarByClass(BuildInvertedIndexCompressed.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStringLong.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfWritables.class);
    job.setOutputFormatClass(MapFileOutputFormat.class); // MapFileOutputFormat writes sorted, indexed MapFiles that support random-access lookup
    //    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
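
A note on the type contract: because the combiner runs on map output, MyCombiner must be a Reducer over the map output types (PairOfStringLong, IntWritable) and must emit those same types. The class itself is not shown in this listing; a hypothetical sketch, assuming it sums partial postings counts:

// Hypothetical shape of MyCombiner: same key/value types in and out, so the
// real reducer sees uniform input whether or not the combiner ran.
private static class MyCombiner
        extends Reducer<PairOfStringLong, IntWritable, PairOfStringLong, IntWritable> {
    private static final IntWritable SUM = new IntWritable();

    @Override
    public void reduce(PairOfStringLong key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        SUM.set(sum);
        context.write(key, SUM);
    }
}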

From source file: edu.umd.honghongie.PairsPMI.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    // options.addOption(OptionBuilder.withArgName("num").hasArg()
    //     .withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;
    //    int window = cmdline.hasOption(WINDOW) ? 
    //        Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;

    LOG.info("Tool: " + PairsPMI.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    //    LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    //JobConf conf = new JobConf(PairsPMI.class);
    // first job
    //Job job1 = new Job (conf,"join1");
    Configuration conf1 = getConf();
    Job job1 = Job.getInstance(conf1);
    job1.setJobName(PairsPMI.class.getSimpleName());
    job1.setJarByClass(PairsPMI.class);

    job1.setNumReduceTasks(1); // a single reducer ensures one output file

    // File paths for job1.
    // Delete the output directory if it exists.
    Path dir = new Path("temp");
    FileSystem.get(getConf()).delete(dir, true);

    FileInputFormat.setInputPaths(job1, new Path(inputPath));
    FileOutputFormat.setOutputPath(job1, new Path("temp"));

    job1.setMapperClass(Map_First.class);
    job1.setCombinerClass(MyCombiner.class);
    job1.setReducerClass(Reduce_First.class);

    job1.setMapOutputKeyClass(Text.class);//map output key   
    job1.setMapOutputValueClass(IntWritable.class);//map output value   

    job1.setOutputKeyClass(Text.class);//reduce output key   
    job1.setOutputValueClass(IntWritable.class);//reduce output value   

    // ControlledJob ctrljob1=new  ControlledJob(conf);   
    // ctrljob1.setJob(job1);

    long startTime1 = System.currentTimeMillis();
    job1.waitForCompletion(true);
    System.out.println(
            "First Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");

    //begin job2
    //Configuration conf2 = getConf();
    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PairsPMI.class.getSimpleName());
    job2.setJarByClass(PairsPMI.class);

    job2.setNumReduceTasks(reduceTasks);

    //delete the output directory if it exists.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    //file path of job2  
    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));
    job2.addCacheFile(new URI("temp/part-r-00000"));

    job2.setMapperClass(Map_Second.class);
    job2.setCombinerClass(MyCombiner_Second.class);
    job2.setReducerClass(Reduce_Second.class);

    job2.setMapOutputKeyClass(PairOfStrings.class);//map output key   
    job2.setMapOutputValueClass(FloatWritable.class);//map output value   

    job2.setOutputKeyClass(PairOfStrings.class);//reduce output key   
    job2.setOutputValueClass(FloatWritable.class);//reduce output value   

    long startTime2 = System.currentTimeMillis();
    job2.waitForCompletion(true);
    System.out.println(
            "Second Job Finished in " + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds");
    System.out.println(
            "Total Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");
    System.out.println("Total number of lines:" + lines);
    return 0;
}
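
The second job consumes the first job's single output file through the distributed cache (job2.addCacheFile above). Map_Second is not shown here; a hedged sketch of how its setup() might load the cached counts, assuming job1's default TextOutputFormat (key TAB value) output. The field name and parsing are assumptions:

// Hypothetical setup() for Map_Second: load word totals from job1's output
// before any map() calls run.
private final Map<String, Integer> totals = new HashMap<String, Integer>();

@Override
protected void setup(Context context) throws IOException {
    URI[] cacheFiles = context.getCacheFiles(); // registered via addCacheFile
    FileSystem fs = FileSystem.get(context.getConfiguration());
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(fs.open(new Path(cacheFiles[0].toString()))));
    String line;
    while ((line = reader.readLine()) != null) {
        String[] parts = line.split("\t"); // TextOutputFormat writes key \t value
        totals.put(parts[0], Integer.parseInt(parts[1]));
    }
    reader.close();
}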

From source file: edu.umd.honghongie.RunPersonalizedPageRankBasic.java

License: Apache License

private ArrayListOfFloats phase1(int i, int j, String basePath, int numNodes, ArrayListOfInts sourceids,
        boolean useCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPersonalizedPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    //*********************** reducer uses the source nodes
    job.getConfiguration().set("SourceNode", sourceids.toString());

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapClass.class);

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    System.out.println("********** 1 *********");

    ArrayListOfFloats mass = new ArrayListOfFloats();
    int length = sourceids.size();

    System.out.println("*********** 1 **********" + length);
    float test = Float.NEGATIVE_INFINITY;
    for (int k = 0; k < length; k++) {
        mass.add(Float.NEGATIVE_INFINITY); //use add to initialize
    }
    System.out.println("********** test ********" + test);
    System.out.println("******** 1 ********" + mass);

    //****************************************** how to resolve datastream
    FileSystem fs = FileSystem.get(getConf());
    ArrayListOfFloatsWritable invalue = new ArrayListOfFloatsWritable();
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());

        //**************************************  get all values from fin?
        invalue.readFields(fin);
        System.out.println("************** 1 ************" + invalue);
        for (int k = 0; k < invalue.size(); k++) {
            mass.set(k, sumLogProbs(mass.get(k), invalue.get(k)));
        }
        fin.close();
    }

    System.out.println("******** 1 ********" + mass.toString());
    return mass;

}
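
The per-source masses above are merged with sumLogProbs, which adds probabilities stored in log space (hence the Float.NEGATIVE_INFINITY initialization, representing log 0). The helper is not shown in this listing; a standard numerically stable formulation, offered as an assumption about the actual implementation:

// log(exp(a) + exp(b)) computed without underflow, with
// Float.NEGATIVE_INFINITY acting as the identity element (log 0).
private static float sumLogProbs(float a, float b) {
    if (a == Float.NEGATIVE_INFINITY) {
        return b;
    }
    if (b == Float.NEGATIVE_INFINITY) {
        return a;
    }
    if (a < b) {
        return (float) (b + StrictMath.log1p(StrictMath.exp(a - b)));
    }
    return (float) (a + StrictMath.log1p(StrictMath.exp(b - a)));
}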

From source file: edu.umd.honghongie.StripesPMI.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    // options.addOption(OptionBuilder.withArgName("num").hasArg()
    //     .withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;
    //    int window = cmdline.hasOption(WINDOW) ? 
    //        Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;

    LOG.info("Tool: " + StripesPMI.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    //    LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    //JobConf conf = new JobConf(PairsPMI.class);
    // first job
    //Job job1 = new Job (conf,"join1");
    Configuration conf1 = getConf();
    Job job1 = Job.getInstance(conf1);
    job1.setJobName(StripesPMI.class.getSimpleName());
    job1.setJarByClass(StripesPMI.class);

    job1.setNumReduceTasks(1);

    // File paths for job1.
    // Delete the output directory if it exists.
    Path dir = new Path("temp");
    FileSystem.get(getConf()).delete(dir, true);

    FileInputFormat.setInputPaths(job1, new Path(inputPath));
    FileOutputFormat.setOutputPath(job1, new Path("temp"));

    job1.setMapperClass(Map_First.class);
    job1.setCombinerClass(MyCombiner.class);
    job1.setReducerClass(Reduce_First.class);

    job1.setMapOutputKeyClass(Text.class);//map output key   
    job1.setMapOutputValueClass(IntWritable.class);//map output value   

    job1.setOutputKeyClass(Text.class);//reduce output key   
    job1.setOutputValueClass(IntWritable.class);//reduce output value   

    // ControlledJob ctrljob1=new  ControlledJob(conf);   
    // ctrljob1.setJob(job1);

    long startTime1 = System.currentTimeMillis();
    job1.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");

    //begin job2
    //Configuration conf2 = getConf();
    Job job2 = Job.getInstance(getConf());
    job2.setJobName(StripesPMI.class.getSimpleName());
    job2.setJarByClass(StripesPMI.class);

    job2.setNumReduceTasks(reduceTasks);

    //delete the output directory if it exists.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    //file path of job2  
    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));
    job2.addCacheFile(new URI("temp/part-r-00000"));

    job2.setMapperClass(Map_Second.class);
    job2.setReducerClass(Reduce_Second.class);

    job2.setMapOutputKeyClass(Text.class);//map output key   
    job2.setMapOutputValueClass(HMapStIW.class);//map output value   

    job2.setOutputKeyClass(PairOfStrings.class);//reduce output key   
    job2.setOutputValueClass(FloatWritable.class);//reduce output value   

    long startTime2 = System.currentTimeMillis();
    job2.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds");
    System.out
            .println("Total Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");
    System.out.println("Total number of lines: " + lines);
    return 0;
}
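
Unlike the pairs version above, job2 here registers no combiner. One could be added for the stripes representation by merging partial stripes instead of summing integers; a hypothetical sketch, assuming HMapStIW.plus() performs element-wise addition of counts as in the Cloud9 library these examples build on:

// Hypothetical combiner for job2: merges partial stripes emitted by Map_Second.
private static class StripesCombiner extends Reducer<Text, HMapStIW, Text, HMapStIW> {
    @Override
    public void reduce(Text key, Iterable<HMapStIW> stripes, Context context)
            throws IOException, InterruptedException {
        HMapStIW merged = new HMapStIW();
        for (HMapStIW stripe : stripes) {
            merged.plus(stripe); // element-wise sum of the partial counts
        }
        context.write(key, merged);
    }
}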

From source file: edu.umd.shrawanraina.RunPageRankBasic.java

License: Apache License

private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number
    // of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    // job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
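
This variant can replace the combiner with in-mapper combining (MapWithInMapperCombiningClass). The trade-off: a combiner is applied zero or more times at Hadoop's discretion, while in-mapper combining buffers partial aggregates in memory and flushes them exactly once in cleanup(). A generic sketch of the pattern, not the actual class used here:

// Generic in-mapper combining (hypothetical): partial sums are held in a map
// and emitted once per task in cleanup(), guaranteeing local aggregation.
public static class InMapperSumMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Map<String, Integer> counts = new HashMap<String, Integer>();

    @Override
    public void map(LongWritable key, Text value, Context context) {
        for (String word : value.toString().split("\\s+")) {
            Integer current = counts.get(word);
            counts.put(word, current == null ? 1 : current + 1);
        }
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        Text word = new Text();
        IntWritable count = new IntWritable();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            word.set(entry.getKey());
            count.set(entry.getValue());
            context.write(word, count);
        }
    }
}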

From source file: edu.umd.windmemory.PMIPairs.java

License: Apache License

/**
* Runs this tool.
*/
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairs.class.getSimpleName());
    job.setJarByClass(PMIPairs.class);
    // Delete the output directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairs.class.getSimpleName());
    job2.setJarByClass(PMIPairs.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", "temp");
    // job2.getConfiguration().setInt("num", reduceTasks);

    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(PairOfStrings.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
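
Note that the first job reuses MyFirstReducer as its combiner. That is safe only when the reduce function is commutative and associative and its output types equal its input types; a count-summing reducer, sketched hypothetically below, meets both conditions.

// Hypothetical shape of MyFirstReducer: (Text, IntWritable) in and out, and
// summation is commutative and associative, so the same class can serve as
// both combiner and reducer.
public static class MyFirstReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}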

From source file: edu.umd.windmemory.PMIStripes.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairs.class.getSimpleName());
    job.setJarByClass(PMIPairs.class);
    // Delete the output directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairs.class.getSimpleName());
    job2.setJarByClass(PMIPairs.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", interDir.toString());
    // job2.getConfiguration().setInt("num", reduceTasks);

    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(HMapStIW.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java

License: Open Source License

/**
 * Generates a single level using a MapReduce job and returns the created job.
 * @param inFiles
 * @param outFile
 * @param plotterClass
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException 
 * @throws ClassNotFoundException 
 */
public static Job plotMapReduce(Path[] inFiles, Path outFile, Class<? extends Plotter> plotterClass,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    }

    Job job = new Job(params, "SingleLevelPlot");
    job.setJarByClass(SingleLevelPlot.class);
    job.setJobName("SingleLevelPlot");
    // Set plotter
    Configuration conf = job.getConfiguration();
    Plotter.setPlotter(conf, plotterClass);
    // Set input file MBR
    Rectangle inputMBR = (Rectangle) params.getShape("mbr");
    Rectangle drawRect = (Rectangle) params.getShape("rect");
    if (inputMBR == null)
        inputMBR = drawRect != null ? drawRect : FileMBR.fileMBR(inFiles, params);
    OperationsParams.setShape(conf, InputMBR, inputMBR);
    if (drawRect != null)
        OperationsParams.setShape(conf, SpatialInputFormat3.InputQueryRange, drawRect);

    // Adjust width and height if aspect ratio is to be kept
    int imageWidth = conf.getInt("width", 1000);
    int imageHeight = conf.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) imageWidth / imageHeight) {
            // Fix width and change height
            imageHeight = (int) (inputMBR.getHeight() * imageWidth / inputMBR.getWidth());
            // Make divisible by two for compatibility with ffmpeg
            if (imageHeight % 2 == 1)
                imageHeight--;
            conf.setInt("height", imageHeight);
        } else {
            imageWidth = (int) (inputMBR.getWidth() * imageHeight / inputMBR.getHeight());
            conf.setInt("width", imageWidth);
        }
    }

    boolean merge = conf.getBoolean("merge", true);
    // Set input and output
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    if (conf.getBoolean("output", true)) {
        if (merge) {
            job.setOutputFormatClass(CanvasOutputFormat.class);
            conf.setClass("mapred.output.committer.class", CanvasOutputFormat.ImageWriterOld.class,
                    org.apache.hadoop.mapred.OutputCommitter.class);
        } else {
            job.setOutputFormatClass(ImageOutputFormat.class);
        }
        CanvasOutputFormat.setOutputPath(job, outFile);
    } else {
        job.setOutputFormatClass(NullOutputFormat.class);
    }

    // Set mapper and reducer based on the partitioning scheme
    String partition = conf.get("partition", "none");
    ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
    if (partition.equalsIgnoreCase("none")) {
        LOG.info("Using no-partition plot");
        job.setMapperClass(NoPartitionPlotMap.class);
        job.setCombinerClass(NoPartitionPlotCombine.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(plotter.getCanvasClass());
        if (merge) {
            int numSplits = new SpatialInputFormat3().getSplits(job).size();
            job.setReducerClass(NoPartitionPlotReduce.class);
            // Set number of reduce tasks according to cluster status
            int maxReduce = Math.max(1, clusterStatus.getMaxReduceTasks() * 7 / 8);
            job.setNumReduceTasks(Math.max(1, Math.min(maxReduce, numSplits / maxReduce)));
        } else {
            job.setNumReduceTasks(0);
        }
    } else {
        LOG.info("Using repartition plot");
        Partitioner partitioner;
        if (partition.equals("pixel")) {
            // Special case for pixel level partitioning as it depends on the
            // visualization parameters
            partitioner = new GridPartitioner(inputMBR, imageWidth, imageHeight);
        } else if (partition.equals("grid")) {
            int numBlocks = 0;
            for (Path in : inFiles) {
                FileSystem fs = in.getFileSystem(params);
                long size = FileUtil.getPathSize(fs, in);
                long blockSize = fs.getDefaultBlockSize(in);
                numBlocks += Math.ceil(size / (double) blockSize);
            }
            int numPartitions = numBlocks * 1000;
            int gridSize = (int) Math.ceil(Math.sqrt(numPartitions));
            partitioner = new GridPartitioner(inputMBR, gridSize, gridSize);
        } else {
            // Use a standard partitioner as created by the indexer
            partitioner = Indexer.createPartitioner(inFiles, outFile, conf, partition);
        }
        Shape shape = params.getShape("shape");
        job.setMapperClass(RepartitionPlotMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(RepartitionPlotReduce.class);
        // Set number of reducers according to cluster size
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));
        Partitioner.setPartitioner(conf, partitioner);
    }

    // Use multithreading in case the job is running locally
    conf.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

    // Start the job
    if (params.getBoolean("background", false)) {
        // Run in background
        job.submit();
    } else {
        job.waitForCompletion(params.getBoolean("verbose", false));
    }
    return job;
}

From source file: eu.scape_project.tb.wc.archd.hadoop.HadoopArcReaderJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = new Configuration();
    GenericOptionsParser gop = new GenericOptionsParser(conf, args);
    HadoopJobCliConfig pc = new HadoopJobCliConfig();
    CommandLineParser cmdParser = new PosixParser();
    CommandLine cmd = cmdParser.parse(HadoopJobOptions.OPTIONS, gop.getRemainingArgs());
    if ((args.length == 0) || (cmd.hasOption(HadoopJobOptions.HELP_OPT))) {
        HadoopJobOptions.exit("Usage", 0);
    } else {
        HadoopJobOptions.initOptions(cmd, pc);
    }
    String dir = pc.getDirStr();

    String name = pc.getHadoopJobName();
    if (name == null || name.equals("")) {
        name = "webarc_reader"; // default job name
    }

    Job job = new Job(conf);

    //**********************************************************
    // for debugging in local mode
    // comment out the 2 lines below before switching to pseudo-distributed or fully-distributed mode
    // job.getConfiguration().set("mapred.job.tracker", "local");
    // job.getConfiguration().set("fs.default.name", "local");
    //**********************************************************

    FileInputFormat.setInputPaths(job, new Path(dir));
    String outpath = "output/" + System.currentTimeMillis() + "wcr";
    logger.info("Output directory: " + outpath);
    FileOutputFormat.setOutputPath(job, new Path(outpath));

    job.setJarByClass(HadoopArcReaderJob.class);

    job.setJobName(name);

    //*** Set interface data types
    // We are using LONG because this value can become very large on huge archives.
    // In order to use the combiner function, also the map output needs to be a LONG.
    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    //*** Set up the mapper, combiner and reducer
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    //*** Set the MAP output compression
    //job.getConfiguration().set("mapred.compress.map.output", "true");

    //*** Set input / output format
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //*** Start the job and wait for it
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
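
The comment about LONG above is the key constraint in this example: because Reduce is registered as both combiner and reducer, its input and output value types must match the map output value type, which forces the mapper to emit LongWritable. A hypothetical sketch of the shape Reduce must take:

// Hypothetical shape of Reduce: (Text, LongWritable) in and out, so it can
// run as a combiner on map output and again as the final reducer.
public static class Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}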

From source file: eu.scape_project.tb.wc.archd.mapreduce.FileCharacterisation.java

License: Apache License

public int run(String[] args) throws Exception {

    Job job = Job.getInstance(getConf());
    System.out.println(getConf().get("mapreduce.job.user.classpath.first"));

    for (int i = 0; i < args.length; i++) {
        System.out.println("Arg" + i + ": " + args[i]);
    }

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(FileCharacterisation.class);
    job.setJobName(name);

    //*** Set interface data types
    // We are using LONG because this value can become very large on huge archives.
    // In order to use the combiner function, also the map output needs to be a LONG.
    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    //*** Set up the mapper, combiner and reducer
    job.setMapperClass(TikaMap.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    //*** Set the MAP output compression
    //job.getConfiguration().set("mapred.compress.map.output", "true");

    //*** Set input / output format
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //*** Start the job and wait for it
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}