Example usage for org.apache.hadoop.mapreduce Job setMapperClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setMapperClass.

Prototype

public void setMapperClass(Class<? extends Mapper> cls) throws IllegalStateException

Source Link

Document

Set the Mapper for the job.

Usage

From source file:ca.uwaterloo.cs.bigdata2017w.assignment4.BuildPersonalizedPageRankRecords.java

License:Apache License

/**
 * Runs this tool./*from   w  w  w .j a  v  a  2 s. c o  m*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("sources").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sourcesString = cmdline.getOptionValue(SOURCES);
    String[] sources = sourcesString.split(",");
    for (int i = 0; i < sources.length; i++) {
        sources[i] = sources[i].trim();
    }

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - use sources: " + sourcesString);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setStrings(SOURCES, sources);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.meterperfile.ThreelMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.meterperfile.ThreelMain <input> <output>");
        System.exit(2);/*from  www . j  ava  2s  .  c  o m*/
    }

    conf.set("mapreduce.input.fileinputformat.split.maxsize", "100");
    Job job = new Job(conf, "ThreelMain");
    job.setJarByClass(ThreelMain.class);

    job.setInputFormatClass(UnsplitableTextInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);
    // job.setOutputKeyClass(LongWritable.class);
    //job.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ConsineMain <input> <output>");
        System.exit(2);/*from  w  w w  .j a  v  a2 s . c o m*/
    }
    Job job1 = new Job(conf, "ConsineMain");
    job1.setJarByClass(CosineMain.class);

    job1.setMapperClass(AggregateReadingsMapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(DoubleWritable.class);

    job1.setReducerClass(AggregateReadingsReducer.class);
    job1.setOutputKeyClass(LongWritable.class);
    job1.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job1, true);
    FileInputFormat.setInputPaths(job1, new Path(otherArgs[0]));
    int lastIdx = otherArgs[0].lastIndexOf("/");
    String tempOutput = otherArgs[0].substring(0, lastIdx) + "/temp";
    FileOutputFormat.setOutputPath(job1, new Path(tempOutput));

    System.out.println("\nStarting Job-1 ...");
    final long startTime = System.currentTimeMillis();
    try {
        final long startTimeJob1 = System.currentTimeMillis();
        if (!job1.waitForCompletion(true)) {
            System.out.println("Job-1 failed.");
        } else {
            System.out.println("Duration of Job1 " + ((System.currentTimeMillis() - startTimeJob1) / 1000.0)
                    + " seconds.");
            final Job job2 = new Job(conf, "ConsineMain Aggregate");
            job2.setJarByClass(CosineMain.class);
            job2.setInputFormatClass(CartesianInputFormat.class);
            CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, tempOutput);
            CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, tempOutput);
            FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1]));

            job2.setMapperClass(CartesianProductMapper.class);
            job2.setMapOutputKeyClass(DoubleWritable.class);
            job2.setMapOutputValueClass(Text.class);

            job2.setSortComparatorClass(DescendingKeyComparator.class);

            job2.setReducerClass(CartesianProductReducer.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(DoubleWritable.class);

            job2.setNumReduceTasks(10);
            final long startTimeJob2 = System.currentTimeMillis();
            System.out.println("\nStarting Job-2 ...");
            if (!job2.waitForCompletion(true)) {
                System.out.println("Job-2 failed.");
            } else {
                System.out.println("Duration of Job2: "
                        + ((System.currentTimeMillis() - startTimeJob2) / 1000.0) + " seconds.");
            }

        }
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(tempOutput), true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Total Duration: " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java

License:Open Source License

public int run1(String[] args) throws IOException {
    if (args.length != 3) {
        System.err.println("Usage: java " + getClass().getName() + " <inputDir> <outDir> <ntasks>");
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }//from w  ww .  j  a  va2  s.c  om
    Configuration conf = getConf();
    final Job job2 = new Job(conf, "ConsineMain cartesian product");
    job2.setJarByClass(CosineMain.class);

    job2.setInputFormatClass(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, args[0]);
    FileOutputFormat.setOutputPath(job2, new Path(args[1]));

    job2.setMapperClass(CartesianProductMapper.class);
    job2.setMapOutputKeyClass(DoubleWritable.class);
    job2.setMapOutputValueClass(Text.class);

    job2.setSortComparatorClass(DescendingKeyComparator.class);

    job2.setReducerClass(CartesianProductReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(DoubleWritable.class);
    job2.setNumReduceTasks(Integer.parseInt(args[2]));

    System.out.println("\nStarting Job-2 ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job2.waitForCompletion(true)) {
            System.out.println("Job-2 failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain <input> <output>");
        System.exit(2);/* w  w w .  j a  v a2s .c o  m*/
    }
    Job job = new Job(conf, "HistogramMain");
    job.setJarByClass(HistogramMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain <input> <output>");
        System.exit(2);/* w  w  w  .j  ava  2 s.co m*/
    }
    Job job = new Job(conf, "PARMain");
    job.setJarByClass(PARMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain <input> <output>");
        System.exit(2);/*from ww  w .j a  v  a  2 s.co m*/
    }

    Job job = new Job(conf, "ThreelMain");
    job.setJarByClass(ThreelMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ArrayPrimitiveWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:CalculateSentiment.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Path tempDir = new Path("wordcount-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: wordcount <in> <out> <category>");
        System.exit(2);//www .j a  v a  2 s .  c om
    }
    conf.set("category", otherArgs[2]);

    // try {
    //     String filePath = otherArgs[0];
    //     BufferedReader br = new BufferedReader(new FileReader(filePath));
    //     String line = br.readLine();
    //     conf.set("category", line);
    // } catch (Exception e) {
    //     e.printStackTrace();
    // }
    // conf.set("category", WordCount.read(otherArgs[2]));

    DistributedCache.createSymlink(conf);
    String path = "CalculateSentiment.obj";
    Path filePath = new Path(path);
    String uriWithLink = filePath.toUri().toString() + "#" + "object";
    DistributedCache.addCacheFile(new URI(uriWithLink), conf);

    // DistributedCache.addCacheFile(new URI("/CalculateSentiment.obj"), conf);
    Job job = new Job(conf, "Test");

    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(DoubleSumReducer.class);
    job.setReducerClass(DoubleSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:capturer.valueMerge.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);/*from w  ww. j  a  v a2  s .  co m*/
    }
    Job job = new Job(conf, "so fast");
    job.setJarByClass(valueMerge.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1,
            new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2,
            new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
            new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();/*from   w w  w . j  a  va 2s . c  o m*/

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}