Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass from open source projects.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
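
A minimal sketch of the call in context (job name and input path are placeholders, not taken from the examples below). Note that the method must be called while the job is still being defined; once the job has been submitted, it throws IllegalStateException:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

Configuration conf = new Configuration();
Job job = new Job(conf, "example");
// Choose how input files are split and turned into key/value records.
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // placeholder path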

Usage

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.BloomFilterJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, BloomFilterJob.class.getSimpleName());
    job.setJarByClass(BloomFilterJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample BloomFilter Job");
    job.setMapperClass(BloomFilterMapper.class);
    job.setReducerClass(BloomFilterReducer.class);
    job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);

    /*
     * We want our reducer to output the final BloomFilter as a binary file. I think
     * Hadoop doesn't have this format [check later], so we use NullOutputFormat.class.
     * 
     * In general life gets a little more dangerous when you deviate from MapReduce's input/output 
     * framework and start working with your own files. Your tasks are no longer guaranteed to be idempotent 
     * and you'll need to understand how various failure scenarios can affect your tasks. For example, your files 
     * may only be partially written when some tasks are restarted. Our example here is safe(r) because all the file 
     * operations take place together only once in the close() method and in only one reducer. A more 
     * careful/paranoid implementation would check each individual file operation more closely.
     */
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BloomFilter.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
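
The reducer itself is not shown on this page. As a rough sketch of the pattern the comment above describes (all file operations done together, once, in one reducer; cleanup() is the new-API counterpart of the old API's close()) — class name, filter sizing, and output path here are illustrative, not the project's actual code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.hash.Hash;

// Illustrative only: ORs the partial filters together and writes the
// result exactly once, at the end, in the single reduce task.
public class ExampleBloomFilterReducer extends Reducer<Text, BloomFilter, Text, BloomFilter> {

    private final BloomFilter merged = new BloomFilter(1000000, 10, Hash.MURMUR_HASH);

    @Override
    protected void reduce(Text key, Iterable<BloomFilter> values, Context context) {
        for (BloomFilter bf : values) {
            merged.or(bf); // union of the mappers' partial filters
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException {
        // One write, in one reducer, at the very end: this is what keeps
        // the job "safe(r)" with respect to partially written files.
        Configuration conf = context.getConfiguration();
        Path out = new Path("/tmp/bloomfilter.bin"); // placeholder path
        try (FSDataOutputStream os = FileSystem.get(conf).create(out, true)) {
            merged.write(os);
        }
    }
}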

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.ChainJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, ChainJob.class.getSimpleName());
    job.setJobName("Sample Chain Job");
    job.setJarByClass(ChainJob.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    ChainMapper.addMapper(job, ReverseMapper.class, Text.class, Text.class, Text.class, Text.class,
            new Configuration(false));
    ChainMapper.addMapper(job, AverageMapper.class, Text.class, Text.class, Text.class, AverageWritable.class,
            new Configuration(false));
    ChainReducer.setReducer(job, AverageReducer.class, Text.class, AverageWritable.class, Text.class,
            DoubleWritable.class, new Configuration(false));

    return job.waitForCompletion(true) ? 0 : 1;
}
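
Reading the chain calls above: each addMapper appends one mapper to the pipeline, its input key/value types must match the previous link's output types, and the trailing new Configuration(false) is a private, defaults-free configuration for that one link. Annotated, the second call reads:

ChainMapper.addMapper(job,
        AverageMapper.class,               // mapper appended to the chain
        Text.class, Text.class,            // input key/value (= previous link's output)
        Text.class, AverageWritable.class, // output key/value (= next link's input)
        new Configuration(false));         // per-link conf; false = skip default resources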

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.CountJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, CountJob.class.getSimpleName());
    job.setJarByClass(CountJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample Count Job");
    job.setMapperClass(CountMapper.class);
    job.setReducerClass(CountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.jhkt.playgroundArena.hadoop.tasks.jobs.DistributedCacheJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, DistributedCacheJob.class.getSimpleName());
    job.setJarByClass(DistributedCacheJob.class);

    /*
     * The following will disseminate the file to all the nodes and the file defaults to HDFS.
     * The second and third arguments denote the input and output paths of the standard Hadoop 
     * job. Note that we've limited the number of data sources to two. This is not an inherent 
     * limitation of the technique, but a simplification that makes our code easier to follow.
     */
    //job.addCacheFile(new Path(args[0]).toUri());

    Path in = new Path(args[1]);
    Path out = new Path(args[2]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample DistributedCache Job");
    job.setMapperClass(DistributedCacheMapper.class);

    /*
     * Took out the Reduce class as the plan is performing the joining in the map phase and will 
     * configure the job to have no reduce.
     */
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
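
The read side of the distributed cache is not shown above. A sketch of what the mapper's setup() might do with the disseminated file, assuming a tab-separated lookup table for the map-side join (names and the file layout are assumptions, not the project's code):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only: load the cached file once per task, then join
// against it record-by-record in map().
public class ExampleCacheJoinMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Map<String, String> joinTable = new HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException {
        URI[] cached = context.getCacheFiles(); // files added via job.addCacheFile(...)
        if (cached != null && cached.length > 0) {
            FileSystem fs = FileSystem.get(context.getConfiguration());
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(cached[0]))));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] parts = line.split("\t", 2);
                    if (parts.length == 2) {
                        joinTable.put(parts[0], parts[1]);
                    }
                }
            } finally {
                reader.close();
            }
        }
    }
}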

From source file:com.jumptap.h2redis.RedisDriver.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 5) {
        usage();
        return 1;
    }

    Map<String, String> argMap = new HashMap<String, String>();
    String[] kv;

    for (String arg : args) {
        kv = arg.split("=");
        if (kv.length != 2) {
            usage();
            return 1;
        }
        argMap.put(kv[0].trim(), kv[1]);
    }

    Configuration conf = getConf();
    String[] hostPort = argMap.get(REDIS_CMD).split(":");
    conf.set(REDIS_HOST, hostPort[0].trim());
    conf.setInt(REDIS_PORT, Integer.valueOf(hostPort[1].trim()));
    conf.setInt(REDIS_KEY_FIELD, Integer.valueOf(argMap.get(KEY_CMD).trim()));
    conf.setInt(REDIS_HASHKEY_FIELD, Integer.valueOf(argMap.get(HASH_KEY_CMD).trim()));
    conf.setInt(REDIS_HASHVAL_FIELD, Integer.valueOf(argMap.get(HASH_VAL_CMD).trim()));

    if (argMap.containsKey(REDIS_DB_CMD)) {
        conf.set(REDIS_DB, argMap.get(REDIS_DB_CMD).trim());
    }
    if (argMap.containsKey(REDIS_PW_CMD)) {
        conf.set(REDIS_PW, argMap.get(REDIS_PW_CMD).trim());
    }
    if (argMap.containsKey(KEY_PFX_CMD)) {
        conf.set(REDIS_KEY_PREFIX, argMap.get(KEY_PFX_CMD).trim());
    }
    if (argMap.containsKey(HASH_KEY_PFX_CMD)) {
        conf.set(REDIS_HASHKEY_PREFIX, argMap.get(HASH_KEY_PFX_CMD).trim());
    }
    if (argMap.containsKey(KEY_PFX_DELIM_CMD)) {
        conf.set(REDIS_KEY_PREFIX_DELIM, argMap.get(KEY_PFX_DELIM_CMD).trim());
    }
    if (argMap.containsKey(KEY_FILTER_CMD)) {
        conf.setPattern(REDIS_KEY_FILTER, Pattern.compile(argMap.get(KEY_FILTER_CMD).trim()));
    }
    if (argMap.containsKey(HASH_FILTER_CMD)) {
        conf.setPattern(REDIS_HASH_FILTER, Pattern.compile(argMap.get(HASH_FILTER_CMD).trim()));
    }
    if (argMap.containsKey(VAL_FILTER_CMD)) {
        conf.setPattern(REDIS_VAL_FILTER, Pattern.compile(argMap.get(VAL_FILTER_CMD).trim()));
    }
    if (argMap.containsKey(TTL_CMD)) {
        conf.setInt(REDIS_KEY_TTL, Integer.valueOf(argMap.get(TTL_CMD).trim()));
    }
    if (argMap.containsKey(TS_KEY_CMD)) {
        conf.set(REDIS_KEY_TS, argMap.get(TS_KEY_CMD).trim());
    } else {
        conf.set(REDIS_KEY_TS, "redis.lastupdate");
    }

    Job job = new Job(conf, "RedisDriver");
    FileInputFormat.addInputPath(job, new Path(argMap.get(INPUT_CMD)));
    job.setJarByClass(RedisDriver.class);
    job.setMapperClass(RedisOutputMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(RedisOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.juniarto.secondsorter.SsJob.java

public int run(String[] allArgs) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(TextDsi.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);
    job.setNumReduceTasks(2);

    String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    //job.submit();

    long time1 = System.nanoTime();
    boolean status = job.waitForCompletion(true);
    long time2 = System.nanoTime();
    long timeSpent = time2 - time1;
    LOG.info("TIME: " + timeSpent);
    return status ? 0 : 1;

}
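
The partitioner, grouping comparator, and sort comparator are what make this a secondary sort: partition and group on the natural key, sort on the composite key. A minimal sketch of the partitioner, assuming TextDsi exposes its natural key through a hypothetical getText() accessor (not the project's actual API):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Sketch only: partition on the natural key alone, so every record for a
// key reaches the same reducer regardless of the secondary-sort field.
public class NaturalKeyPartitionerSketch extends Partitioner<TextDsi, IntWritable> {
    @Override
    public int getPartition(TextDsi key, IntWritable value, int numPartitions) {
        return (key.getText().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}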

From source file:com.kangfoo.study.hadoop1.mp.typeformat.TestMapreduceSequenceInputFormat.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: TestMapreduceSequenceInputFormat <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "TestMapreduceSequenceInputFormat");
    job.setJarByClass(TestMapreduceSequenceInputFormat.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class); // SequenceFileInputFormat
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
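
A job wired this way expects binary SequenceFile input, and the stored key/value types must match what the mapper reads. One way to produce suitable input (path and types are assumptions for the sketch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

Configuration conf = new Configuration();
Path file = new Path("/tmp/seq-input/part-0"); // placeholder path
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(Text.class))) {
    // Each append becomes one key/value record delivered to the mapper.
    writer.append(new Text("k1"), new Text("some line of text"));
}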

From source file:com.kasabi.labs.freebase.mr.Freebase2RDFDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (log.isDebugEnabled()) {
        log.debug("run({})", Utils.toString(args));
    }

    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);

    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName("Freebase2RDFDriver");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(Freebase2RDFMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(Freebase2RDFReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
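
The mapred.* property names used above are the pre-2.x ones; Hadoop's deprecation table maps them to newer keys, so they still take effect. Written against the current names, that block would look roughly like:

configuration.setBoolean("mapreduce.map.output.compress", true);
configuration.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
configuration.set("mapreduce.map.output.compress.codec",
        "org.apache.hadoop.io.compress.GzipCodec");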

From source file:com.knewton.mapreduce.example.SSTableMRExample.java

License:Apache License

public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException, ParseException {

    long startTime = System.currentTimeMillis();
    Options options = buildOptions();

    CommandLineParser cliParser = new BasicParser();
    CommandLine cli = cliParser.parse(options, args);
    if (cli.getArgs().length < 2 || cli.hasOption('h')) {
        printUsage(options);
    }
    Job job = getJobConf(cli);

    job.setJarByClass(SSTableMRExample.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(StudentEventWritable.class);

    job.setMapperClass(StudentEventMapper.class);
    job.setReducerClass(StudentEventReducer.class);

    job.setInputFormatClass(SSTableColumnInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // input arg
    String inputPaths = cli.getArgs()[0];
    LOG.info("Setting initial input paths to {}", inputPaths);
    SSTableInputFormat.addInputPaths(job, inputPaths);
    // output arg
    FileOutputFormat.setOutputPath(job, new Path(cli.getArgs()[1]));
    if (cli.hasOption('c')) {
        LOG.info("Using compression for output.");
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        FileOutputFormat.setCompressOutput(job, true);
    }
    job.waitForCompletion(true);
    LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000);
}

From source file:com.knewton.mrtool.example.JsonMRExample.java

License:Apache License

/**
 * Runs a simple job that reads recommendation data with a custom input format.
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(new Configuration());

    job.setInputFormatClass(RecommendationsInputFormat.class);
    RecommendationsInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(RecommendationWritable.class);

    job.waitForCompletion(true);
}
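
RecommendationsInputFormat is a custom input format, which is ultimately what setInputFormatClass exists for. A generic sketch of the shape such a class takes (not Knewton's actual implementation): subclass FileInputFormat and return a suitable RecordReader.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class JsonLinesInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // Reuse the stock line reader; a real implementation would wrap it
        // and deserialize each JSON line into a domain Writable such as
        // RecommendationWritable.
        return new LineRecordReader();
    }
}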