Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setInputFormatClass from open-source projects.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
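
Before the examples, a minimal, self-contained sketch (not taken from any of the sources below; the class name is ours) of where setInputFormatClass fits into job setup. With no mapper or reducer set, Hadoop falls back to the identity Mapper and Reducer, so this compiles and runs as-is:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetInputFormatSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "set-input-format-sketch");
        job.setJarByClass(SetInputFormatSketch.class);

        // Must be called before the job is submitted; afterwards it throws
        // IllegalStateException. TextInputFormat (also the default) reads
        // the input as lines of text.
        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}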

Usage

From source file:com.cloudera.accumulo.upgrade.util.MapreduceInputCli.java

License:Open Source License
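
Configures AccumuloInputFormat for a job: in offline mode the table is cloned under a randomized name and the clone is taken offline for scanning, and when maxMaps is positive the input is pre-split into ranges by tablets.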

/**
 * Iff you use offline mode, you have to call close() when you're done or manually delete the cloned table yourself.
 */
public void useAccumuloInputFormat(Job job, String table, boolean offline) throws IOException,
        AccumuloException, AccumuloSecurityException, TableExistsException, TableNotFoundException {
    job.setInputFormatClass(AccumuloInputFormat.class);
    /* XXX Need to use a method that exists in 1.4 and 1.5  :( */
    Configuration configuration = job.getConfiguration();
    AccumuloInputFormat.setZooKeeperInstance(job, connection.instance, connection.zookeepers);
    final TableOperations ops = connection.getConnector().tableOperations();

    String scan = table;
    if (offline) {
        Random random = new Random();
        scan = table + "_" + String.format("%016x", Math.abs(random.nextLong()));
        ops.clone(table, scan, true, Collections.<String, String>emptyMap(), Collections.<String>emptySet());
        try {
            ops.offline(scan);
        } finally {
            clones.add(scan);
        }
        AccumuloInputFormat.setOfflineTableScan(job, true);
    }

    PasswordToken token = new PasswordToken(connection.password);

    AccumuloInputFormat.setConnectorInfo(job, connection.principal, token);
    AccumuloInputFormat.setInputTableName(job, scan);
    AccumuloInputFormat.setScanAuthorizations(job, connection.auths);

    if (0 < maxMaps) {
        // set up ranges
        try {
            Set<Range> ranges = ops.splitRangeByTablets(table, new Range(), maxMaps);
            AccumuloInputFormat.setRanges(job, ranges);
            AccumuloInputFormat.setAutoAdjustRanges(job, false);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }
}

From source file:com.cloudera.avro.MapReduceAvroWordCount.java

License:Apache License
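
A word-count driver that reads plain text through TextInputFormat and writes Avro output, setting the output key schema to an Avro Pair of (STRING, INT).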

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");

    // We call setOutputKeySchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);

    return 0;
}

From source file:com.cloudera.avro.MapReduceColorCount.java

License:Apache License
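
Reads Avro records through AvroKeyInputFormat with the User schema and writes Avro key/value output, with the output key and value schemas set to STRING and INT.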

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: MapReduceColorCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceColorCount.class);
    job.setJobName("Color Count");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapperClass(ColorCountMapper.class);
    AvroJob.setInputKeySchema(job, User.getClassSchema());
    AvroJob.setMapOutputValueSchema(job, User.getClassSchema());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(ColorCountReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    return (job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.cloudera.ByteCount.java

License:Apache License
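
A driver that parses -D properties, checksum-skipping, and profiling options from the command line, then runs a byte-counting job over a custom ByteBufferInputFormat and prints the read counters.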

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(new Configuration());

    // Trim off the hadoop-specific args
    String[] remArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Pull in properties
    Options options = new Options();

    Option property = OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
            .withDescription("use value for given property").create("D");
    options.addOption(property);

    Option skipChecksums = new Option("skipChecksums", "skip checksums");
    options.addOption(skipChecksums);

    Option profile = new Option("profile", "profile tasks");
    options.addOption(profile);

    CommandLineParser parser = new BasicParser();
    CommandLine line = parser.parse(options, remArgs);

    Properties properties = line.getOptionProperties("D");
    for (Entry<Object, Object> prop : properties.entrySet()) {
        conf.set(prop.getKey().toString(), prop.getValue().toString());
        System.out.println("Set config key " + prop.getKey() + " to " + prop.getValue());
    }

    if (line.hasOption("skipChecksums")) {
        conf.setBoolean("bytecount.skipChecksums", true);
        System.out.println("Skipping checksums");
    }

    if (line.hasOption("profile")) {
        conf.setBoolean("mapred.task.profile", true);
        conf.set("mapred.task.profile.params",
                "-agentlib:hprof=cpu=samples,depth=100,interval=1ms,lineno=y,thread=y,file=%s");
        conf.set(MRJobConfig.NUM_MAP_PROFILES, "0");
        conf.set("mapred.task.profile.maps", "1");
        System.out.println("Profiling map tasks");
    }

    // Get the positional arguments out
    remArgs = line.getArgs();
    if (remArgs.length != 2) {
        System.err.println("Usage: ByteCount <inputBase> <outputBase>");
        System.exit(1);
    }
    String inputBase = remArgs[0];
    String outputBase = remArgs[1];

    Job job = Job.getInstance(conf);

    job.setInputFormatClass(ByteBufferInputFormat.class);

    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(ByteCountMapper.class);
    job.setReducerClass(ByteCountReducer.class);
    job.setCombinerClass(ByteCountReducer.class);

    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(inputBase));
    FileOutputFormat.setOutputPath(job, new Path(outputBase));

    job.setJarByClass(ByteCount.class);

    boolean success = job.waitForCompletion(true);

    Counters counters = job.getCounters();
    System.out.println("\tRead counters");
    printCounter(counters, READ_COUNTER.BYTES_READ);
    printCounter(counters, READ_COUNTER.LOCAL_BYTES_READ);
    printCounter(counters, READ_COUNTER.SCR_BYTES_READ);
    printCounter(counters, READ_COUNTER.ZCR_BYTES_READ);

    System.exit(success ? 0 : 1);
}

From source file:com.cloudera.castagna.logparser.mr.StatusCodesStats.java

License:Apache License
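
A log-parsing job over TextInputFormat that optionally compresses map output, optionally deletes a pre-existing output directory, and wires a mapper, combiner, and reducer for status-code statistics.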

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);

    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(StatusCodesStatsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(StatusCodesStatsCombiner.class);

    job.setReducerClass(StatusCodesStatsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.cloudera.castagna.logparser.mr.TranscodeLogs.java

License:Apache License
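
A map-only job (zero reduce tasks) over TextInputFormat that transcodes log lines, again deleting any pre-existing output directory when overwriting is enabled.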

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERWRITE_OUTPUT,
            Constants.OPTION_OVERWRITE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = Job.getInstance(configuration);
    job.setJobName(Constants.STATUS_CODES_STATS);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TranscodeLogsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.cloudera.crunch.impl.mr.plan.JobPrototype.java

License:Open Source License
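
Builds a Crunch MapReduce job from a plan; note that setInputFormatClass(CrunchInputFormat.class) is only called when the job has more than one input source, since a single source configures its own input format.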

private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }

        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else { // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));

    return new CrunchJob(job, workingPath, outputHandler);
}

From source file:com.cloudera.crunch.io.hbase.HBaseSourceTarget.java

License:Open Source License
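
Configures an HBase source by setting TableInputFormat and passing the table name and serialized Scan through the job configuration.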

@Override
public void configureSource(Job job, int inputId) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setInputFormatClass(TableInputFormat.class);
    job.setMapperClass(CrunchMapper.class);
    HBaseConfiguration.addHbaseResources(conf);
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    TableMapReduceUtil.addDependencyJars(job);
}

From source file:com.cloudera.crunch.io.impl.FileSourceImpl.java

License:Open Source License
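
Sets the source's input format directly on the job when it is the only input (inputId of -1); otherwise the path is registered with CrunchInputs for multi-source input.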

@Override
public void configureSource(Job job, int inputId) throws IOException {
    if (inputId == -1) {
        FileInputFormat.addInputPath(job, path);
        job.setInputFormatClass(inputFormatClass);
    } else {
        CrunchInputs.addInputPath(job, path, inputFormatClass, inputId);
    }
}

From source file:com.cloudera.crunch.io.SourceTargetHelper.java

License:Open Source License
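
A static helper with the same pattern: a lone source (sourceId of -1) sets the input format on the job itself, while additional sources are registered through CrunchInputs.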

public static void configureSource(Job job, int sourceId, Class<? extends InputFormat> inputFormatClass,
        Path path) throws IOException {
    if (sourceId == -1) {
        FileInputFormat.addInputPath(job, path);
        job.setInputFormatClass(inputFormatClass);
    } else {
        CrunchInputs.addInputPath(job, path, inputFormatClass, sourceId);
    }
}