Example usage for org.apache.hadoop.mapreduce Job setMapperClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#setMapperClass.

Prototype

public void setMapperClass(Class<? extends Mapper> cls) throws IllegalStateException 

Document

Set the Mapper for the job.
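The mapper must be set before the job is submitted; calling setMapperClass afterwards throws IllegalStateException. Below is a minimal sketch of a typical call site, using a hypothetical MyMapper that is not taken from the examples that follow:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper, shown only to illustrate the call site.
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit each whitespace-separated token with a count of one.
        for (String token : value.toString().split("\\s+")) {
            if (token.isEmpty()) continue;
            word.set(token);
            context.write(word, ONE);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapperClass example");
        job.setJarByClass(MyMapper.class);
        job.setMapperClass(MyMapper.class); // legal only before the job is submitted
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
    }
}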

Usage

From source file: com.cloudera.crunch.impl.mr.plan.JobPrototype.java

License: Open Source License

private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }

        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else { // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));

    return new CrunchJob(job, workingPath, outputHandler);
}

From source file: com.cloudera.crunch.io.hbase.HBaseSourceTarget.java

License: Open Source License

@Override
public void configureSource(Job job, int inputId) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setInputFormatClass(TableInputFormat.class);
    job.setMapperClass(CrunchMapper.class);
    HBaseConfiguration.addHbaseResources(conf);
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    TableMapReduceUtil.addDependencyJars(job);
}

From source file: com.cloudera.hbase.WordCount.java

License: Open Source License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 2;
    }

    Configuration conf = getConf();

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
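The Map and Reduce inner classes referenced by this driver are not part of the snippet. A sketch of a compatible pair, assuming the standard word-count logic implied by the Text/IntWritable output types (with Reduce doubling as the combiner):

// Inside WordCount.java; assumes these imports:
// import java.io.IOException;
// import java.util.StringTokenizer;
// import org.apache.hadoop.io.IntWritable;
// import org.apache.hadoop.io.Text;
// import org.apache.hadoop.mapreduce.Mapper;
// import org.apache.hadoop.mapreduce.Reducer;

public static class Map extends Mapper<Object, Text, Text, IntWritable> {
    private static final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Tokenize the line and emit each word with a count of one.
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}

public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the partial counts for each word; safe to reuse as the combiner.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}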

From source file: com.cloudera.recordservice.avro.mapreduce.ColorCount.java

License: Apache License

/**
 * Run the MR2 color count with generic records, and return a map of favorite colors to
 * the number of users.
 */
public static java.util.Map<String, Integer> countColors()
        throws IOException, ClassNotFoundException, InterruptedException {
    String output = TestUtil.getTempDirectory();
    Path outputPath = new Path(output);
    JobConf conf = new JobConf(ColorCount.class);
    conf.setInt("mapreduce.job.reduces", 1);

    Job job = Job.getInstance(conf);
    job.setJarByClass(ColorCount.class);
    job.setJobName("MR2 Color Count With Generic Records");

    RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users");
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(Reduce.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    job.waitForCompletion(false);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
    for (GenericRecord datum : fileReader) {
        colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
    }
    return colorMap;
}

From source file: com.cloudera.recordservice.examples.mapreduce.MapReduceAgeCount.java

License: Apache License

public int run(String[] args) throws Exception {
    org.apache.log4j.BasicConfigurator.configure();

    if (args.length != 2) {
        System.err.println("Usage: MapReduceAgeCount <input path> <output path>");
        return -1;
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(MapReduceAgeCount.class);
    job.setJobName("Age Count");

    // RECORDSERVICE:
    // To read from a table instead of a path, replace the usual
    //   FileInputFormat.setInputPaths(job, new Path(args[0]));
    // with:
    RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);

    // RECORDSERVICE:
    // Use the RecordService version of the AvroKeyValueInputFormat
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AgeCountMapper.class);
    // Set schema for input key and value.
    AvroJob.setInputKeySchema(job, UserKey.getClassSchema());
    AvroJob.setInputValueSchema(job, UserValue.getClassSchema());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(AgeCountReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    return (job.waitForCompletion(true) ? 0 : 1);
}
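AgeCountMapper is likewise outside the snippet. Given the AvroKeyValueInputFormat configured with the UserKey/UserValue schemas and the Text/IntWritable map output types, its shape is presumably along these lines; getName() and getAge() are hypothetical stand-ins for whatever fields the generated UserKey/UserValue classes actually expose:

// Sketch only; assumes org.apache.avro.mapred.AvroKey and
// org.apache.avro.mapred.AvroValue are imported.
public static class AgeCountMapper
        extends Mapper<AvroKey<UserKey>, AvroValue<UserValue>, Text, IntWritable> {
    @Override
    public void map(AvroKey<UserKey> key, AvroValue<UserValue> value, Context context)
            throws IOException, InterruptedException {
        // Hypothetical accessors; the real generated UserKey/UserValue fields
        // are not shown in this example.
        context.write(new Text(key.datum().getName().toString()),
                new IntWritable(value.datum().getAge()));
    }
}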

From source file: com.cloudera.recordservice.examples.mapreduce.MapReduceColorCount.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    org.apache.log4j.BasicConfigurator.configure();

    if (args.length != 2) {
        System.err.println("Usage: MapReduceColorCount <input path> <output path>");
        return -1;
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(MapReduceColorCount.class);
    job.setJobName("Color Count");

    // RECORDSERVICE:
    // To read from a table instead of a path, replace the usual
    //   FileInputFormat.setInputPaths(job, new Path(args[0]));
    // with:
    RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users");

    // RECORDSERVICE:
    // Use the RecordService version of the AvroKeyInputFormat
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class);
    //job.setInputFormatClass(AvroKeyInputFormat.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(ColorCountMapper.class);
    AvroJob.setInputKeySchema(job, User.getClassSchema());
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(ColorCountReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.cloudera.recordservice.examples.mapreduce.RecordCount.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: RecordCount <input_query> <output_path>");
        System.exit(1);
    }
    String inputQuery = args[0];
    String output = args[1];

    Job job = Job.getInstance(getConf());
    job.setJobName("recordcount");
    job.setJarByClass(RecordCount.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(LongWritable.class);

    RecordServiceConfig.setInputQuery(job.getConfiguration(), inputQuery);
    job.setInputFormatClass(RecordServiceInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path outputPath = new Path(output);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraChecksum.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    Job job = Job.getInstance(getConf());
    if (args.length != 2 && args.length != 3) {
        usage();
        return 2;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSum");
    job.setJarByClass(TeraChecksum.class);
    job.setMapperClass(ChecksumMapper.class);
    job.setReducerClass(ChecksumReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Unsigned16.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraGen.java

License: Apache License

/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 2;
    }
    setNumberOfRows(job, parseHumanLong(args[0]));
    Path outputDir = new Path(args[1]);
    if (outputDir.getFileSystem(getConf()).exists(outputDir)) {
        throw new IOException("Output directory " + outputDir + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraValidate.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    Job job = Job.getInstance(getConf());
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}