Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass from open source projects.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
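
A minimal sketch of the call in context (job name and input path are placeholders, not taken from the examples below). Note that the method must be called while the job is still being defined; once the job has been submitted, it throws IllegalStateException:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

Configuration conf = new Configuration();
Job job = new Job(conf, "example");
// Choose how input files are split and turned into key/value records.
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // placeholder path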

Usage

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.BloomFilterJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, BloomFilterJob.class.getSimpleName());
    job.setJarByClass(BloomFilterJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample BloomFilter Job");
    job.setMapperClass(BloomFilterMapper.class);
    job.setReducerClass(BloomFilterReducer.class);
    job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);

    /*
     * We want our reducer to output the final BloomFilter as a binary file. I think
     * Hadoop doesn't have this format [check later], so we use NullOutputFormat.class.
     * 
     * In general life gets a little more dangerous when you deviate from MapReduce's input/output 
     * framework and start working with your own files. Your tasks are no longer guaranteed to be idempotent 
     * and you'll need to understand how various failure scenarios can affect your tasks. For example, your files 
     * may only be partially written when some tasks are restarted. Our example here is safe(r) because all the file 
     * operations take place together only once in the close() method and in only one reducer. A more 
     * careful/paranoid implementation would check each individual file operation more closely.
     */
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BloomFilter.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
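
The reducer itself is not shown on this page. As a rough sketch of the pattern the comment above describes (all file operations done together, once, in one reducer; cleanup() is the new-API counterpart of the old API's close()) — class name, filter sizing, and output path here are illustrative, not the project's actual code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.hash.Hash;

// Illustrative only: ORs the partial filters together and writes the
// result exactly once, at the end, in the single reduce task.
public class ExampleBloomFilterReducer extends Reducer<Text, BloomFilter, Text, BloomFilter> {

    private final BloomFilter merged = new BloomFilter(1000000, 10, Hash.MURMUR_HASH);

    @Override
    protected void reduce(Text key, Iterable<BloomFilter> values, Context context) {
        for (BloomFilter bf : values) {
            merged.or(bf); // union of the mappers' partial filters
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException {
        // One write, in one reducer, at the very end: this is what keeps
        // the job "safe(r)" with respect to partially written files.
        Configuration conf = context.getConfiguration();
        Path out = new Path("/tmp/bloomfilter.bin"); // placeholder path
        try (FSDataOutputStream os = FileSystem.get(conf).create(out, true)) {
            merged.write(os);
        }
    }
}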

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.ChainJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, ChainJob.class.getSimpleName());
    job.setJobName("Sample Chain Job");
    job.setJarByClass(ChainJob.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    ChainMapper.addMapper(job, ReverseMapper.class, Text.class, Text.class, Text.class, Text.class,
            new Configuration(false));
    ChainMapper.addMapper(job, AverageMapper.class, Text.class, Text.class, Text.class, AverageWritable.class,
            new Configuration(false));
    ChainReducer.setReducer(job, AverageReducer.class, Text.class, AverageWritable.class, Text.class,
            DoubleWritable.class, new Configuration(false));

    return job.waitForCompletion(true) ? 0 : 1;
}
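
Reading the chain calls above: each addMapper appends one mapper to the pipeline, its input key/value types must match the previous link's output types, and the trailing new Configuration(false) is a private, defaults-free configuration for that one link. Annotated, the second call reads:

ChainMapper.addMapper(job,
        AverageMapper.class,               // mapper appended to the chain
        Text.class, Text.class,            // input key/value (= previous link's output)
        Text.class, AverageWritable.class, // output key/value (= next link's input)
        new Configuration(false));         // per-link conf; false = skip default resources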

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.CountJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, CountJob.class.getSimpleName());
    job.setJarByClass(CountJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample Count Job");
    job.setMapperClass(CountMapper.class);
    job.setReducerClass(CountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.DistributedCacheJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, DistributedCacheJob.class.getSimpleName());
    job.setJarByClass(DistributedCacheJob.class);

    /*
     * The following will disseminate the file to all the nodes and the file defaults to HDFS.
     * The second and third arguments denote the input and output paths of the standard Hadoop 
     * job. Note that we've limited the number of data sources to two. This is not an inherent 
     * limitation of the technique, but a simplification that makes our code easier to follow.
     */
    //job.addCacheFile(new Path(args[0]).toUri());

    Path in = new Path(args[1]);
    Path out = new Path(args[2]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample DistributedCache Job");
    job.setMapperClass(DistributedCacheMapper.class);

    /*
     * Took out the Reduce class as the plan is performing the joining in the map phase and will 
     * configure the job to have no reduce.
     */
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
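
The read side of the distributed cache is not shown above. A sketch of what the mapper's setup() might do with the disseminated file, assuming a tab-separated lookup table for the map-side join (names and the file layout are assumptions, not the project's code):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only: load the cached file once per task, then join
// against it record-by-record in map().
public class ExampleCacheJoinMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Map<String, String> joinTable = new HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException {
        URI[] cached = context.getCacheFiles(); // files added via job.addCacheFile(...)
        if (cached != null && cached.length > 0) {
            FileSystem fs = FileSystem.get(context.getConfiguration());
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(cached[0]))));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] parts = line.split("\t", 2);
                    if (parts.length == 2) {
                        joinTable.put(parts[0], parts[1]);
                    }
                }
            } finally {
                reader.close();
            }
        }
    }
}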

From source file:com.jumptap.h2redis.RedisDriver.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 5) {
        usage();
        return 1;
    }

    Map<String, String> argMap = new HashMap<String, String>();
    String[] kv;

    for (String arg : args) {
        kv = arg.split("=");
        if (kv.length != 2) {
            usage();
            return 1;
        }
        argMap.put(kv[0].trim(), kv[1]);
    }

    Configuration conf = getConf();
    String[] hostPort = argMap.get(REDIS_CMD).split(":");
    conf.set(REDIS_HOST, hostPort[0].trim());
    conf.setInt(REDIS_PORT, Integer.valueOf(hostPort[1].trim()));
    conf.setInt(REDIS_KEY_FIELD, Integer.valueOf(argMap.get(KEY_CMD).trim()));
    conf.setInt(REDIS_HASHKEY_FIELD, Integer.valueOf(argMap.get(HASH_KEY_CMD).trim()));
    conf.setInt(REDIS_HASHVAL_FIELD, Integer.valueOf(argMap.get(HASH_VAL_CMD).trim()));

    if (argMap.containsKey(REDIS_DB_CMD)) {
        conf.set(REDIS_DB, argMap.get(REDIS_DB_CMD).trim());
    }
    if (argMap.containsKey(REDIS_PW_CMD)) {
        conf.set(REDIS_PW, argMap.get(REDIS_PW_CMD).trim());
    }
    if (argMap.containsKey(KEY_PFX_CMD)) {
        conf.set(REDIS_KEY_PREFIX, argMap.get(KEY_PFX_CMD).trim());
    }
    if (argMap.containsKey(HASH_KEY_PFX_CMD)) {
        conf.set(REDIS_HASHKEY_PREFIX, argMap.get(HASH_KEY_PFX_CMD).trim());
    }
    if (argMap.containsKey(KEY_PFX_DELIM_CMD)) {
        conf.set(REDIS_KEY_PREFIX_DELIM, argMap.get(KEY_PFX_DELIM_CMD).trim());
    }
    if (argMap.containsKey(KEY_FILTER_CMD)) {
        conf.setPattern(REDIS_KEY_FILTER, Pattern.compile(argMap.get(KEY_FILTER_CMD).trim()));
    }
    if (argMap.containsKey(HASH_FILTER_CMD)) {
        conf.setPattern(REDIS_HASH_FILTER, Pattern.compile(argMap.get(HASH_FILTER_CMD).trim()));
    }
    if (argMap.containsKey(VAL_FILTER_CMD)) {
        conf.setPattern(REDIS_VAL_FILTER, Pattern.compile(argMap.get(VAL_FILTER_CMD).trim()));
    }
    if (argMap.containsKey(TTL_CMD)) {
        conf.setInt(REDIS_KEY_TTL, Integer.valueOf(argMap.get(TTL_CMD).trim()));
    }
    if (argMap.containsKey(TS_KEY_CMD)) {
        conf.set(REDIS_KEY_TS, argMap.get(TS_KEY_CMD).trim());
    } else {
        conf.set(REDIS_KEY_TS, "redis.lastupdate");
    }

    Job job = new Job(conf, "RedisDriver");
    FileInputFormat.addInputPath(job, new Path(argMap.get(INPUT_CMD)));
    job.setJarByClass(RedisDriver.class);
    job.setMapperClass(RedisOutputMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(RedisOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.juniarto.secondsorter.SsJob.java

public int run(String[] allArgs) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(TextDsi.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);
    job.setNumReduceTasks(2);

    String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    //job.submit();

    long time1 = System.nanoTime();
    boolean status = job.waitForCompletion(true);
    long time2 = System.nanoTime();
    long timeSpent = time2 - time1;
    LOG.info("TIME: " + timeSpent);
    return status ? 0 : 1;

}
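
The partitioner, grouping comparator, and sort comparator are what make this a secondary sort: partition and group on the natural key, sort on the composite key. A minimal sketch of the partitioner, assuming TextDsi exposes its natural key through a hypothetical getText() accessor (not the project's actual API):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Sketch only: partition on the natural key alone, so every record for a
// key reaches the same reducer regardless of the secondary-sort field.
public class NaturalKeyPartitionerSketch extends Partitioner<TextDsi, IntWritable> {
    @Override
    public int getPartition(TextDsi key, IntWritable value, int numPartitions) {
        return (key.getText().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}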

From source file:com.kangfoo.study.hadoop1.mp.typeformat.TestMapreduceSequenceInputFormat.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: TestMapreduceSequenceInputFormat <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "TestMapreduceSequenceInputFormat");
    job.setJarByClass(TestMapreduceSequenceInputFormat.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class); // SequenceFileInputFormat
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
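
A job wired this way expects binary SequenceFile input, and the stored key/value types must match what the mapper reads. One way to produce suitable input (path and types are assumptions for the sketch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

Configuration conf = new Configuration();
Path file = new Path("/tmp/seq-input/part-0"); // placeholder path
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(Text.class))) {
    // Each append becomes one key/value record delivered to the mapper.
    writer.append(new Text("k1"), new Text("some line of text"));
}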

From source file:com.kasabi.labs.freebase.mr.Freebase2RDFDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (log.isDebugEnabled()) {
        log.debug("run({})", Utils.toString(args));
    }

    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);

    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName("Freebase2RDFDriver");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(Freebase2RDFMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(Freebase2RDFReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
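
The mapred.* property names used above are the pre-2.x ones; Hadoop's deprecation table maps them to newer keys, so they still take effect. Written against the current names, that block would look roughly like:

configuration.setBoolean("mapreduce.map.output.compress", true);
configuration.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
configuration.set("mapreduce.map.output.compress.codec",
        "org.apache.hadoop.io.compress.GzipCodec");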

From source file:com.knewton.mapreduce.example.SSTableMRExample.java

License:Apache License

public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException, ParseException {

    long startTime = System.currentTimeMillis();
    Options options = buildOptions();

    CommandLineParser cliParser = new BasicParser();
    CommandLine cli = cliParser.parse(options, args);
    if (cli.getArgs().length < 2 || cli.hasOption('h')) {
        printUsage(options);
    }
    Job job = getJobConf(cli);

    job.setJarByClass(SSTableMRExample.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(StudentEventWritable.class);

    job.setMapperClass(StudentEventMapper.class);
    job.setReducerClass(StudentEventReducer.class);

    job.setInputFormatClass(SSTableColumnInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // input arg
    String inputPaths = cli.getArgs()[0];
    LOG.info("Setting initial input paths to {}", inputPaths);
    SSTableInputFormat.addInputPaths(job, inputPaths);
    // output arg
    FileOutputFormat.setOutputPath(job, new Path(cli.getArgs()[1]));
    if (cli.hasOption('c')) {
        LOG.info("Using compression for output.");
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        FileOutputFormat.setCompressOutput(job, true);
    }
    job.waitForCompletion(true);
    LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000);
}

From source file:com.knewton.mrtool.example.JsonMRExample.java

License:Apache License

/**
 * Runs a simple job that reads recommendation data with a custom input format.
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(new Configuration());

    job.setInputFormatClass(RecommendationsInputFormat.class);
    RecommendationsInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(RecommendationWritable.class);

    job.waitForCompletion(true);
}
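
RecommendationsInputFormat is a custom input format, which is ultimately what setInputFormatClass exists for. A generic sketch of the shape such a class takes (not Knewton's actual implementation): subclass FileInputFormat and return a suitable RecordReader.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class JsonLinesInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // Reuse the stock line reader; a real implementation would wrap it
        // and deserialize each JSON line into a domain Writable such as
        // RecommendationWritable.
        return new LineRecordReader();
    }
}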