Example usage for the org.apache.hadoop.mapreduce Job constructor

A list of usage examples for the org.apache.hadoop.mapreduce Job constructor, collected from open-source projects.

Introduction

On this page you can find example usage for the org.apache.hadoop.mapreduce Job constructor.

Prototype

Job(Configuration conf, String jobName) throws IOException
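
Every example below constructs the job with this (Configuration, String) overload. As a quick orientation before the project-specific listings, here is a minimal, self-contained sketch of the same call; the class name and the argument-based paths are placeholders, and newer Hadoop releases deprecate this constructor in favor of Job.getInstance(Configuration, String).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalJobExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Construct the job with a configuration and a human-readable name.
        // On recent Hadoop versions prefer Job.getInstance(conf, "minimal-example").
        Job job = new Job(conf, "minimal-example");
        job.setJarByClass(MinimalJobExample.class);

        // No mapper or reducer is set, so Hadoop uses the identity classes;
        // the output types therefore match what TextInputFormat produces.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}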

Usage

From source file: com.igalia.metamail.jobs.MessagesByTimePeriod.java

License: Open Source License

private static Job setupJob() throws IOException, InterruptedException, ClassNotFoundException {
    Configuration config = HBaseConfiguration.create();
    Job job = new Job(config, "MessagesByTimePeriod");
    job.setJarByClass(MessagesByTimePeriod.class);

    Scan scan = new Scan();
    scan.setCaching(500);
    scan.setCacheBlocks(false); // don't set to true for MR jobs

    // Mapper
    TableMapReduceUtil.initTableMapperJob(mailsTable, // input HBase table name
            scan, // Scan instance to control CF and attribute selection
            MessagesByTimePeriod.MessagesByTimePeriodMapper.class, Text.class, IntWritable.class, job);

    // Reducer
    job.setReducerClass(MessagesByTimePeriod.MessagesByTimePeriodReducer.class);
    job.setNumReduceTasks(1);

    FileOutputFormat.setOutputPath(job, new Path(MessagesByTimePeriod.MAIL_OUT));

    return job;
}
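
The helper above only builds the Job; a driver still has to submit it and wait for the result. A minimal sketch of such a driver, assuming it lives in the same class (the main method is an assumption and not part of the original MessagesByTimePeriod source):

// Hypothetical driver for the setupJob() helper above; not part of the original source.
public static void main(String[] args) throws Exception {
    Job job = setupJob();
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}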

From source file: com.impetus.code.examples.hadoop.cassandra.wordcount.WordCount.java

License: Apache License

public int run(String[] args) throws Exception {
    String outputReducerType = "cassandra";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "text" + i;
        getConf().set(CONF_COLUMN_NAME, columnName);

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);

        ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, INPUT_COLUMN_FAMILY);
        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
    }
    return 0;
}
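
A run(String[]) method like this is normally launched through ToolRunner. A short sketch of a possible entry point, assuming WordCount implements org.apache.hadoop.util.Tool (the rest of the original class is not shown here):

// Hypothetical entry point; assumes WordCount implements the Tool interface.
public static void main(String[] args) throws Exception {
    System.exit(org.apache.hadoop.util.ToolRunner.run(new WordCount(), args));
}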

From source file: com.impetus.code.examples.hadoop.cassandra.wordcount.WordCountCounters.java

License: Apache License

public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), "wordcountcounters");
    job.setJarByClass(WordCountCounters.class);
    job.setMapperClass(SumMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));

    job.setInputFormatClass(ColumnFamilyInputFormat.class);

    ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
    ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
    ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), WordCount.KEYSPACE,
            WordCountCounters.COUNTER_COLUMN_FAMILY);
    SlicePredicate predicate = new SlicePredicate()
            .setSlice_range(new SliceRange().setStart(ByteBufferUtil.EMPTY_BYTE_BUFFER)
                    .setFinish(ByteBufferUtil.EMPTY_BYTE_BUFFER).setCount(100));
    ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

    job.waitForCompletion(true);
    return 0;
}

From source file: com.inmobi.conduit.distcp.tools.DistCp.java

License: Apache License

/**
 * Creates the Job object to be submitted, with all of its configuration.
 *
 * @return Reference to the Job object.
 * @throws IOException - Exception if any
 */
protected Job createJob() throws IOException {
    String jobName = "distcp";
    String userChosenName = getConf().get("mapred.job.name");
    if (userChosenName != null)
        jobName += ": " + userChosenName;
    Job job = new Job(getConf(), jobName);
    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
    job.setJarByClass(CopyMapper.class);
    configureOutputFormat(job);

    job.setMapperClass(CopyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(CopyOutputFormat.class);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_NUM_MAPS, String.valueOf(inputOptions.getMaxMaps()));

    if (inputOptions.getSslConfigurationFile() != null) {
        setupSSLConfig(job.getConfiguration());
    }

    inputOptions.appendToConf(job.getConfiguration());
    return job;
}

From source file: com.intel.hadoop.hbase.dot.mapreduce.DotImportTsv.java

License: Apache License

/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
        throws IOException, ClassNotFoundException {

    // Support non-XML supported characters
    // by re-encoding the passed separator as a Base64 string.
    String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
    if (actualSeparator != null) {
        conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
    }

    // See if a non-default Mapper was set
    String mapperClassName = conf.get(MAPPER_CONF_KEY);
    Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

    String tableName = args[0];
    Path inputDir = new Path(args[1]);
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(mapperClass);
    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(mapperClass);

    String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
    if (hfileOutPath != null) {
        if (!doesTableExist(tableName)) {
            createTable(conf, tableName);
        }
        HTable table = new HTable(conf, tableName);
        job.setReducerClass(PutSortReducer.class);
        Path outputDir = new Path(hfileOutPath);
        FileOutputFormat.setOutputPath(job, outputDir);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        HFileOutputFormat.configureIncrementalLoad(job, table);
    } else {
        // No reducers.  Just write straight to table.  Call initTableReducerJob
        // to set up the TableOutputFormat.
        TableMapReduceUtil.initTableReducerJob(tableName, null, job);
        job.setNumReduceTasks(0);
    }

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
            com.google.common.base.Function.class /* Guava used by TsvParser */);
    return job;
}
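
createSubmittableJob only configures the Job; a driver still has to parse the command line and wait for completion. A sketch of what such a main method could look like, assuming the usual ImportTsv-style argument handling (the usage string and argument checks are assumptions; GenericOptionsParser and HBaseConfiguration are standard Hadoop/HBase classes):

// Hypothetical driver; the original DotImportTsv main method is not shown above.
public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: DotImportTsv <tablename> <inputdir>");
        System.exit(-1);
    }
    Job job = createSubmittableJob(conf, otherArgs);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}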

From source file: com.javiertordable.mrif.MapReduceQuadraticSieve.java

License: Apache License

/**
 * Sets up the MapReduce parameters and runs the job.
 *
 * Tool parses the command line arguments for us.
 */
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // Check the arguments. We need the integer to attempt to factor.
    if (args.length < 1) {
        System.out.println("Please indicate the integer to factor");
        LOGGER.severe("No integer to factor. Exit.");
        System.exit(1);
    }

    // Parse N and add it to the job configuration, so that the workers can
    // access it as well.
    BigInteger N = new BigInteger(args[0]);
    LOGGER.info("Attempting factorization of: " + N.toString());
    conf.set(INTEGER_TO_FACTOR_NAME, N.toString());

    // Obtain the factor base for the integer N.
    FactorBaseArray factorBase = SieveInput.factorBase(N);
    LOGGER.info("Factor base of size: " + factorBase.size());
    conf.set(FACTOR_BASE_NAME, factorBase.toString());

    // Prepare the input of the mapreduce.
    LOGGER.info("Sieve of size: " + SieveInput.fullSieveIntervalSize(N));
    try {
        // Write the full sieve interval to disk.
        SieveInput.writeFullSieveInterval(N, "input/" + INPUT_FILE_NAME);
    } catch (FileNotFoundException e) {
        System.out.println("Unable to open the file for writing.");
    } catch (IOException e) {
        System.out.println("Unable to write to the output file.");
    }

    // Configure the classes of the MapReduce job
    Job job = new Job(conf, "QuadraticSieve");
    job.setJarByClass(MapReduceQuadraticSieve.class);
    job.setMapperClass(SieveMapper.class);
    job.setReducerClass(FindSquaresReducer.class);

    // Output will be two pairs of strings:
    // <"Factor1", "59">
    // <"Factor2", "101">
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path("input/"));
    FileOutputFormat.setOutputPath(job, new Path("output/"));

    // Submit the job.
    job.waitForCompletion(true);

    return 0;
}

From source file: com.jhkt.playgroundArena.hadoop.tasks.jobs.AverageJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, AverageJob.class.getSimpleName());
    job.setJarByClass(AverageJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample Average Job");
    job.setMapperClass(AverageMapper.class);
    job.setCombinerClass(AverageCombiner.class);
    job.setReducerClass(AverageReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);

    return 0;
}
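
Because the job writes Gzip-compressed SequenceFile output with IntWritable keys and values, the result is not directly readable as text. A small, self-contained sketch of reading it back with the standard SequenceFile.Reader (the output directory comes from the first argument; the part-file name follows the usual MapReduce convention and may need adjusting):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

public class ReadAverageOutput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // The AverageJob above declares IntWritable for both key and value.
        Path partFile = new Path(args[0], "part-r-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
        try {
            IntWritable key = new IntWritable();
            IntWritable value = new IntWritable();
            // The reader decompresses the Gzip-coded records transparently.
            while (reader.next(key, value)) {
                System.out.println(key.get() + "\t" + value.get());
            }
        } finally {
            reader.close();
        }
    }
}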

From source file: com.jhkt.playgroundArena.hadoop.tasks.jobs.AverageMultipleOutputJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, AverageMultipleOutputJob.class.getSimpleName());
    job.setJarByClass(AverageMultipleOutputJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample Multiple Output Job");
    job.setMapperClass(AverageMapper.class);
    job.setReducerClass(AverageMultipleOutputReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    MultipleOutputs.addNamedOutput(job, "greaterThan1000", TextOutputFormat.class, Text.class,
            DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "lessThan1000", TextOutputFormat.class, Text.class,
            DoubleWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);

    return 0;
}
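
MultipleOutputs.addNamedOutput only registers the two named outputs; the reducer must create a MultipleOutputs instance and write to them explicitly. The original AverageMultipleOutputReducer is not shown, so the following is only a sketch of the API pattern, with the key/value types taken from the named-output declarations and the threshold logic assumed:

import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

// Hypothetical reducer showing the MultipleOutputs pattern only; the real
// AverageMultipleOutputReducer and its key/value types are not shown above.
public class NamedOutputReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    private MultipleOutputs<Text, DoubleWritable> multipleOutputs;

    @Override
    protected void setup(Context context) {
        multipleOutputs = new MultipleOutputs<Text, DoubleWritable>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        double sum = 0;
        long count = 0;
        for (DoubleWritable value : values) {
            sum += value.get();
            count++;
        }
        double average = sum / count;

        // Route each record to one of the named outputs registered in the driver.
        String namedOutput = average > 1000 ? "greaterThan1000" : "lessThan1000";
        multipleOutputs.write(namedOutput, key, new DoubleWritable(average));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        multipleOutputs.close();
    }
}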

From source file: com.jhkt.playgroundArena.hadoop.tasks.jobs.BloomFilterJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, BloomFilterJob.class.getSimpleName());
    job.setJarByClass(BloomFilterJob.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Sample BloomFilter Job");
    job.setMapperClass(BloomFilterMapper.class);
    job.setReducerClass(BloomFilterReducer.class);
    job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);

    /*
     * We want our reducer to output the final BloomFilter as a binary file. I think 
     * Hadoop doesn't have this format [check later], so we use NullOutputFormat.class.
     * 
     * In general life gets a little more dangerous when you deviate from MapReduce's input/output 
     * framework and start working with your own files. Your tasks are no longer guaranteed to be idempotent 
     * and you'll need to understand how various failure scenarios can affect your tasks. For example, your files 
     * may only be partially written when some tasks are restarted. Our example here is safe(r) because all the file 
     * operations take place together only once in the close() method and in only one reducer. A more 
     * careful/paranoid implementation would check each individual file operation more closely.
     */
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BloomFilter.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);

    return 0;
}
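
The comment above describes writing the final BloomFilter as a binary file from a single reducer. The original BloomFilterReducer is not shown, so this is only a sketch of that pattern, with the filter parameters and the output path as placeholder assumptions:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.hash.Hash;

// Hypothetical reducer illustrating the pattern described in the comment above;
// the real BloomFilterReducer is not part of this listing.
public class BloomFilterWriterReducer extends Reducer<Text, BloomFilter, Text, BloomFilter> {

    // Placeholder sizing: vector size, number of hash functions, hash type.
    private final BloomFilter filter = new BloomFilter(1000000, 6, Hash.MURMUR_HASH);

    @Override
    protected void reduce(Text key, Iterable<BloomFilter> values, Context context) {
        // Union the partial filters produced by the mappers.
        for (BloomFilter partial : values) {
            filter.or(partial);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Write the final filter once, as a binary file on HDFS, since the job
        // itself uses NullOutputFormat and emits nothing through the framework.
        Configuration conf = context.getConfiguration();
        Path outputPath = new Path("/tmp/bloomfilter.bin"); // placeholder path
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream out = fs.create(outputPath);
        try {
            filter.write(out);
        } finally {
            out.close();
        }
    }
}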

From source file: com.jhkt.playgroundArena.hadoop.tasks.jobs.ChainJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf, ChainJob.class.getSimpleName());
    job.setJobName("Sample Chain Job");
    job.setJarByClass(ChainJob.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    ChainMapper.addMapper(job, ReverseMapper.class, Text.class, Text.class, Text.class, Text.class,
            new Configuration(false));
    ChainMapper.addMapper(job, AverageMapper.class, Text.class, Text.class, Text.class, AverageWritable.class,
            new Configuration(false));
    ChainReducer.setReducer(job, AverageReducer.class, Text.class, AverageWritable.class, Text.class,
            DoubleWritable.class, new Configuration(false));

    System.exit(job.waitForCompletion(true) ? 0 : 1);

    return 0;
}
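
The chain passes a custom AverageWritable value from AverageMapper to AverageReducer. That class is not part of this listing; a minimal custom Writable of that kind might look like the following (the field names and methods are assumptions):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Hypothetical stand-in for the AverageWritable used in the chain above;
// the real class is not shown in this listing.
public class AverageWritable implements Writable {

    private long count;
    private double sum;

    public void set(long count, double sum) {
        this.count = count;
        this.sum = sum;
    }

    public double average() {
        return count == 0 ? 0.0 : sum / count;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(count);
        out.writeDouble(sum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        count = in.readLong();
        sum = in.readDouble();
    }
}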