Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job#setMapOutputKeyClass, drawn from open-source projects.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Sets the key class for the map output data, allowing it to differ from the job's final output key class; if it is never called, the final output key class is also used for the intermediate data. Throws IllegalStateException if the job has already been submitted.
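Before the project examples below, here is a minimal, self-contained driver sketch (TokenMapper and LongSumReducer are illustrative names, not taken from any project on this page) showing where setMapOutputKeyClass fits. The intermediate classes must be declared whenever they differ from the final output classes; here the mapper emits IntWritable counts while the reducer writes LongWritable totals, so the map output value class must be set, and the key class is set alongside it for clarity.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputKeyClassExample {

    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    public static class LongSumReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputKeyClass example");
        job.setJarByClass(SetMapOutputKeyClassExample.class);

        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(LongSumReducer.class);

        // Intermediate (shuffle) types. These must be set before the job is
        // submitted; afterwards the call throws IllegalStateException.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final output types, consumed by the reducer and the OutputFormat.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

If the mapper's output classes matched the final output classes, both setMapOutput* calls could be omitted: Hadoop falls back to the setOutputKeyClass/setOutputValueClass values for the intermediate data.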

Usage

From source file: com.cloudera.accumulo.upgrade.compatibility.DataCompatibilityLoad.java

License: Open Source License

@Override
public int run(String[] args) throws Exception {
    final String jobName = this.getClass().getName();
    options.parseArgs(jobName, args);
    final Job job = new Job(getConf(), jobName);

    if (-1 == options.test.numRows) {
        options.test.numRows = job.getConfiguration().getInt("mapred.map.tasks",
                DataCompatibilityTestCli.DEFAULT_NUM_ROWS);
    }

    job.setJarByClass(this.getClass());

    job.setInputFormatClass(DataLoadInputFormat.class);
    DataLoadInputFormat.setTabletServers(job,
            options.connection.getConnector().instanceOperations().getTabletServers());
    DataLoadInputFormat.setNumRows(job, options.test.numRows);
    DataLoadInputFormat.setNumQualifiersPerFamily(job, options.test.qualifiers);

    job.getConfiguration().set(VISIBILITY, new String(options.visibility.visibility.getExpression(), "UTF-8"));

    final TableOperations ops = options.connection.getConnector().tableOperations();

    final List<String> names = options.test.getTableNamesAndConfigureThem(ops);
    for (String name : names) {
        final int numSplits = ops.getSplits(name, options.test.numRows).size();
        if (options.test.numRows > numSplits) {
            log.info("adding splits to table '" + name + "', to bring it from " + numSplits + " to "
                    + options.test.numRows + ".");
            final SortedSet<Text> splits = new TreeSet<Text>();
            // for cases where we're adding way more splits than there are currently possible servers to handle them, do a pre-pre-split
            //   N.B. If we've just created this table, there will be 0 splits because we'll just have the initial tablet.
            if (0 == numSplits || options.test.numRows / numSplits > 10) {
                log.info("splitting in two waves due to the number of splits we need to add.");
                // TODO turtles all the way down.
                final int prepre = options.test.numRows / (0 == numSplits ? 10 : numSplits * 10);
                for (int i = 0; i < prepre; i++) {
                    splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
                }
                ops.addSplits(name, splits);
                log.debug("delay 30s for splits to get assigned off host.");
                try {
                    Thread.sleep(30 * 1000);
                } catch (InterruptedException exception) {
                    log.warn("interrupted from sleep early.");
                }
                splits.clear();
            }
            for (int i = 0; i < options.test.numRows; i++) {
                splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
            }
            ops.addSplits(name, splits);
        }
    }
    log.debug("delay 30s for splits to get assigned off host.");
    try {
        Thread.sleep(30 * 1000);
    } catch (InterruptedException exception) {
        log.warn("interrupted from sleep early.");
    }

    job.getConfiguration().setStrings(OUTPUT_TABLE_NAMES, names.toArray(new String[0]));

    job.setMapperClass(DataLoadMapper.class);
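    // Map-only job: with no reduce phase, these are the exact types the mapper
    // hands to the Accumulo output format.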
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);

    job.setNumReduceTasks(0);

    log.info("launching map-only job to insert " + options.test.numRows + " rows of "
            + (FAMILIES.length * options.test.qualifiers) + " cells each into each of the tables " + names);
    options.output.useAccumuloOutputFormat(job);

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}

From source file: com.cloudera.avro.MapReduceAvroWordCount.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");

    // We call AvroJob.setOutputKeySchema first so we can override the
    // configuration parameters it sets
    AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);

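    // The job's final output is Avro (see setOutputKeySchema above), but the
    // intermediate shuffle data is still plain Writables, so the map output
    // classes are declared explicitly.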
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.avro.MapReduceColorCount.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: MapReduceColorCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceColorCount.class);
    job.setJobName("Color Count");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapperClass(ColorCountMapper.class);
    AvroJob.setInputKeySchema(job, User.getClassSchema());
    AvroJob.setMapOutputValueSchema(job, User.getClassSchema());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(ColorCountReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.cloudera.ByteCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(new Configuration());

    // Trim off the hadoop-specific args
    String[] remArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Pull in properties
    Options options = new Options();

    Option property = OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
            .withDescription("use value for given property").create("D");
    options.addOption(property);

    Option skipChecksums = new Option("skipChecksums", "skip checksums");
    options.addOption(skipChecksums);

    Option profile = new Option("profile", "profile tasks");
    options.addOption(profile);

    CommandLineParser parser = new BasicParser();
    CommandLine line = parser.parse(options, remArgs);

    Properties properties = line.getOptionProperties("D");
    for (Entry<Object, Object> prop : properties.entrySet()) {
        conf.set(prop.getKey().toString(), prop.getValue().toString());
        System.out.println("Set config key " + prop.getKey() + " to " + prop.getValue());
    }

    if (line.hasOption("skipChecksums")) {
        conf.setBoolean("bytecount.skipChecksums", true);
        System.out.println("Skipping checksums");
    }

    if (line.hasOption("profile")) {
        conf.setBoolean("mapred.task.profile", true);
        conf.set("mapred.task.profile.params",
                "-agentlib:hprof=cpu=samples,depth=100,interval=1ms,lineno=y,thread=y,file=%s");
        conf.set(MRJobConfig.NUM_MAP_PROFILES, "0");
        conf.set("mapred.task.profile.maps", "1");
        System.out.println("Profiling map tasks");
    }

    // Get the positional arguments out
    remArgs = line.getArgs();
    if (remArgs.length != 2) {
        System.err.println("Usage: ByteCount <inputBase> <outputBase>");
        System.exit(1);
    }
    String inputBase = remArgs[0];
    String outputBase = remArgs[1];

    Job job = Job.getInstance(conf);

    job.setInputFormatClass(ByteBufferInputFormat.class);

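    // Map output types match the final output types here, which is also what
    // lets the reducer double as the combiner below.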
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(ByteCountMapper.class);
    job.setReducerClass(ByteCountReducer.class);
    job.setCombinerClass(ByteCountReducer.class);

    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(inputBase));
    FileOutputFormat.setOutputPath(job, new Path(outputBase));

    job.setJarByClass(ByteCount.class);

    boolean success = job.waitForCompletion(true);

    Counters counters = job.getCounters();
    System.out.println("\tRead counters");
    printCounter(counters, READ_COUNTER.BYTES_READ);
    printCounter(counters, READ_COUNTER.LOCAL_BYTES_READ);
    printCounter(counters, READ_COUNTER.SCR_BYTES_READ);
    printCounter(counters, READ_COUNTER.ZCR_BYTES_READ);

    System.exit(success ? 0 : 1);
}

From source file: com.cloudera.castagna.logparser.mr.StatusCodesStats.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);

    if (useCompression) {
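        // Old pre-YARN property names; current Hadoop releases still honour
        // them, mapping them to mapreduce.map.output.compress and friends via
        // the configuration deprecation table.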
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(StatusCodesStatsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

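    // A combiner consumes and emits the map output types, so
    // StatusCodesStatsCombiner must read and write the Text/Text classes
    // declared above.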
    job.setCombinerClass(StatusCodesStatsCombiner.class);

    job.setReducerClass(StatusCodesStatsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.castagna.logparser.mr.TranscodeLogs.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TranscodeLogsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: com.cloudera.crunch.type.avro.AvroGroupedTableType.java

License: Open Source License

@Override
public void configureShuffle(Job job, GroupingOptions options) {
    AvroTableType<K, V> att = (AvroTableType<K, V>) tableType;
    String schemaJson = att.getSchema().toString();
    job.getConfiguration().set(AvroJob.MAP_OUTPUT_SCHEMA, schemaJson);
    job.setSortComparatorClass(AvroKeyComparator.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(AvroValue.class);
    if (options != null) {
        options.configure(job);
    }

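    // Register AvroSerialization so the shuffle can (de)serialize the
    // AvroKey/AvroValue wrappers configured above.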
    Collection<String> serializations = job.getConfiguration().getStringCollection("io.serializations");
    if (!serializations.contains(AvroSerialization.class.getName())) {
        serializations.add(AvroSerialization.class.getName());
        job.getConfiguration().setStrings("io.serializations", serializations.toArray(new String[0]));
    }
}

From source file: com.cloudera.crunch.type.writable.WritableGroupedTableType.java

License: Open Source License

@Override
public void configureShuffle(Job job, GroupingOptions options) {
    if (options != null) {
        options.configure(job);
    }
    job.setMapOutputKeyClass(handler.getKeyClass());
    job.setMapOutputValueClass(handler.getValueClass());
}

From source file: com.cloudera.recordservice.avro.mapreduce.ColorCount.java

License: Apache License

/**
 * Run the MR2 color count with generic records, and return a map of favorite colors to
 * the number of users.
 */
public static java.util.Map<String, Integer> countColors()
        throws IOException, ClassNotFoundException, InterruptedException {
    String output = TestUtil.getTempDirectory();
    Path outputPath = new Path(output);
    JobConf conf = new JobConf(ColorCount.class);
    conf.setInt("mapreduce.job.reduces", 1);

    Job job = Job.getInstance(conf);
    job.setJarByClass(ColorCount.class);
    job.setJobName("MR2 Color Count With Generic Records");

    RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users");
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(Reduce.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    job.waitForCompletion(false);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
    for (GenericRecord datum : fileReader) {
        colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
    }
    return colorMap;
}

From source file: com.cloudera.recordservice.examples.mapreduce.MapReduceAgeCount.java

License: Apache License

public int run(String[] args) throws Exception {
    org.apache.log4j.BasicConfigurator.configure();

    if (args.length != 2) {
        System.err.println("Usage: MapReduceAgeCount <input path> <output path>");
        return -1;
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(MapReduceAgeCount.class);
    job.setJobName("Age Count");

    // RECORDSERVICE:
    // This job reads its input from a table. To read from a path instead,
    // replace the setInputTable() call below with:
    // FileInputFormat.setInputPaths(job, new Path(args[0]));
    RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);

    // RECORDSERVICE:
    // Use the RecordService version of the AvroKeyValueInputFormat
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AgeCountMapper.class);
    // Set schema for input key and value.
    AvroJob.setInputKeySchema(job, UserKey.getClassSchema());
    AvroJob.setInputValueSchema(job, UserValue.getClassSchema());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(AgeCountReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    return (job.waitForCompletion(true) ? 0 : 1);
}