List of usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
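getConfiguration() returns the live Configuration backing the job; anything set on it before submission is serialized with the job and becomes visible to every map and reduce task. A minimal sketch of the basic pattern, using placeholder property names ("my.custom.property", "my.custom.window" are not from any example below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        // Set custom properties through the job's Configuration before
        // submission; tasks read them back via context.getConfiguration().
        Job job = Job.getInstance(new Configuration(), "example-job");
        job.getConfiguration().set("my.custom.property", "value");
        job.getConfiguration().setInt("my.custom.window", 2);
    }
}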
From source file:cloud9.ComputeCooccurrenceMatrixStripesOOM.java
License:Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int window = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: ComputeCooccurrenceMatrixStripes");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - window: " + window);
    sLogger.info(" - number of reducers: " + reduceTasks);

    Job job = new Job(getConf(), "CooccurrenceMatrixStripes");

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.getConfiguration().setInt("window", window);

    job.setJarByClass(ComputeCooccurrenceMatrixStripesOOM.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(String2IntOpenHashMapWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.getConfiguration().setInt("io.sort.mb", 400);
    job.getConfiguration().set("mapred.child.java.opts",
            "-Xmx1000m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp "
                    + "-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps");
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx1000m");

    job.getConfiguration().setInt("child.monitor.jstat.seconds", 2);

    job.getConfiguration().set("fs.default.name", "hdfs://master:9000");
    job.getConfiguration().set("mapred.job.tracker", "master:9001");
    //conf.set("user.name", "xulijie");

    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", 1);

    //job.getConfiguration().setFloat("io.sort.record.percent", 0.2f);
    //job.getConfiguration().setFloat("io.sort.spill.percent", 0.95f);
    //conf.setFloat("mapred.job.shuffle.input.buffer.percent", 0.9f);
    //conf.setFloat("mapred.job.shuffle.merge.percent", 0.9f);
    //conf.setFloat("mapred.job.reduce.input.buffer.percent", 0.4f);
    //conf.set("mapred.job.tracker", "local");
    //conf.set("fs.default.name", "file:///");

    job.getConfiguration().setLong("mapred.min.split.size", 512 * 1024 * 1024L);
    job.getConfiguration().setLong("mapred.max.split.size", 512 * 1024 * 1024L);

    job.getConfiguration().setInt("mapred.map.max.attempts", 0);
    job.getConfiguration().setInt("mapred.reduce.max.attempts", 0);

    //job.getConfiguration().set("heapdump.reduce.input.groups", "3,897,853[5]");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407734;8407737;8407740;8407743;8407746;8407749;8407750");
    //job.getConfiguration().set("omit.reduce.input.records", "8407733;8407750");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407751");
    //job.getConfiguration().set("heapdump.reduce.output.records", "3897853");
    job.getConfiguration().set("heapdump.task.attempt.ids", "attempt_201404281552_0001_r_000000_0");

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
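Note that keys such as io.sort.mb, mapred.child.java.opts, and mapred.min.split.size in this example are Hadoop 1.x names; on Hadoop 2.x and later they are still honored as deprecated aliases, but new code would use the current keys. A minimal sketch of the same tuning under the newer names (values are illustrative, and the mapping is as of Hadoop 2.x):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ModernTuningSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "CooccurrenceMatrixStripes");
        Configuration conf = job.getConfiguration();
        conf.setInt("mapreduce.task.io.sort.mb", 400);       // was io.sort.mb
        conf.set("mapreduce.map.java.opts", "-Xmx1000m");    // was mapred.child.java.opts (map side)
        conf.set("mapreduce.reduce.java.opts", "-Xmx1000m"); // was mapred.child.java.opts (reduce side)
        conf.setLong("mapreduce.input.fileinputformat.split.minsize", 512 * 1024 * 1024L); // was mapred.min.split.size
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 512 * 1024 * 1024L); // was mapred.max.split.size
        conf.setInt("mapreduce.map.maxattempts", 1);         // was mapred.map.max.attempts
    }
}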
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException {
    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");

    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Create and setup a job */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Run a map/reduce job to compute Pi. */
private static void compute(int startDigit, int nDigits, int nMaps, String workingDir, Configuration conf,
        PrintStream out) throws IOException {
    final String name = startDigit + "_" + nDigits;

    // setup working directory
    out.println("Working Directory = " + workingDir);
    out.println();
    // final FileSystem fs = FileSystem.get(conf);
    final FileSystem fs = new Path(workingDir, "part-r-00000").getFileSystem(conf);
    final Path dir = fs.makeQualified(new Path(workingDir));
    if (fs.exists(dir)) {
        throw new IOException("Working directory " + dir + " already exists. Please remove it first.");
    } else if (!fs.mkdirs(dir)) {
        throw new IOException("Cannot create working directory " + dir);
    }

    out.println("Start Digit = " + startDigit);
    out.println("Number of Digits = " + nDigits);
    out.println("Number of Maps = " + nMaps);

    // setup a job
    final Job job = createJob(name, conf);
    final Path hexfile = new Path(dir, "pi_" + name + ".hex");
    FileOutputFormat.setOutputPath(job, new Path(dir, "out"));

    // setup custom properties
    job.getConfiguration().set(WORKING_DIR_PROPERTY, dir.toString());
    job.getConfiguration().set(HEX_FILE_PROPERTY, hexfile.toString());

    job.getConfiguration().setInt(DIGIT_START_PROPERTY, startDigit);
    job.getConfiguration().setInt(DIGIT_SIZE_PROPERTY, nDigits);
    job.getConfiguration().setInt(DIGIT_PARTS_PROPERTY, nMaps);

    // start a map/reduce job
    out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        out.println("Duration is " + duration + " seconds.");
    }
    out.println("Output file: " + hexfile);
}
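The custom properties set above are read back on the task side through the task context. A minimal sketch of the read side, assuming hypothetical literal keys ("pi.digit.start", "pi.digit.size") standing in for DIGIT_START_PROPERTY and DIGIT_SIZE_PROPERTY, whose actual values are not shown in this listing:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;

public class DigitRangeMapper extends Mapper<LongWritable, BytesWritable, LongWritable, BytesWritable> {
    private int digitStart;
    private int digitSize;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read back the values the driver stored with job.getConfiguration().setInt(...).
        // "pi.digit.start" / "pi.digit.size" are placeholder keys, not the real constants.
        Configuration conf = context.getConfiguration();
        digitStart = conf.getInt("pi.digit.start", 0);
        digitSize = conf.getInt("pi.digit.size", 0);
    }
}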
From source file:co.cask.cdap.data.stream.StreamInputFormatTest.java
License:Apache License
private void runMR(File inputDir, File outputDir, long startTime, long endTime, long splitSize, long ttl)
        throws Exception {

    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    StreamInputFormat.setTTL(conf, ttl);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    StreamInputFormat.setTimeRange(conf, startTime, endTime);
    StreamInputFormat.setMaxSplitSize(conf, splitSize);
    job.setInputFormatClass(TestStreamInputFormat.class);

    TextOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setJarByClass(StreamInputFormatTest.class);
    job.setMapperClass(TokenizeMapper.class);
    job.setReducerClass(AggregateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.waitForCompletion(true);
}
From source file:co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();

    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();
    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), sourceName, context.getRuntimeArguments());
    batchSource.prepareRun(sourceContext);
    runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
    finishers.add(batchSource, sourceContext);

    Map<String, SinkOutput> sinkOutputs = new HashMap<>();

    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }

        BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
        batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
        MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), sinkName, context.getRuntimeArguments());
        batchSink.prepareRun(sinkContext);
        runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
        finishers.add(batchSink, sinkContext);

        sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    }
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
                    Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);

    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    if (!aggregators.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String aggregatorName = aggregators.iterator().next().getName();
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
        MapReduceAggregatorContext aggregatorContext = new MapReduceAggregatorContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), aggregatorName, context.getRuntimeArguments());
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);

        if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }

        // if the plugin sets the output key and value class directly, trust them
        Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
        Class<?> outputValClass = aggregatorContext.getGroupValueClass();
        // otherwise, derive it from the plugin's parameters
        if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(GROUP_VAL_CLASS, outputValClass.getName());

        // in case the classes are not a WritableComparable, but are some common type we support,
        // for example, a String or a StructuredRecord
        WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
        // if the conversion is null, it means the user is using their own object
        if (writableConversion != null) {
            outputKeyClass = writableConversion.getWritableClass();
        }
        writableConversion = WritableConversions.getConversion(outputValClass.getName());
        if (writableConversion != null) {
            outputValClass = writableConversion.getWritableClass();
        }

        // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
        if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
                    aggregatorName, outputKeyClass));
        }
        if (!Writable.class.isAssignableFrom(outputValClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
                    aggregatorName, outputValClass));
        }

        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValClass);
    } else {
        job.setNumReduceTasks(0);
    }

    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
From source file:co.cask.cdap.examples.datacleansing.DataCleansingMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    partitionCommitter = PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS,
            new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));

    // Each run writes its output to a partition for the league
    Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
    PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();

    Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

    // set up two outputs - one for invalid records and one for valid records
    Map<String, String> invalidRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
    PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));

    Map<String, String> cleanRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
    PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));

    Job job = context.getHadoopJob();
    job.setMapperClass(SchemaMatchingFilter.class);
    job.setNumReduceTasks(0);

    // simply propagate the schema (if any) to be used by the mapper
    String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
    if (schemaJson != null) {
        job.getConfiguration().set(SCHEMA_KEY, schemaJson);
    }
}
From source file:co.cask.cdap.hbase.wd.RowKeyDistributorTestBase.java
License:Apache License
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = new Configuration(testingUtility.getConfiguration());
    conf.set("fs.defaultFS", "file:///");
    conf.set("fs.default.name", "file:///");
    conf.setInt("mapreduce.local.map.tasks.maximum", 16);
    conf.setInt("mapreduce.local.reduce.tasks.maximum", 16);
    Job job = Job.getInstance(conf, "testMapReduceInternal()-Job");
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);

    // Need to kill the job after completion, as it could leave MRAppMaster running not terminated.
    // Not sure what is causing this, but it may be a problem in MiniYarnCluster
    job.killJob();
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.DataSetInputFormat.java
License:Apache License
public static void setInput(Job job, String inputDatasetName) {
    job.setInputFormatClass(DataSetInputFormat.class);
    job.getConfiguration().set(DataSetInputFormat.HCONF_ATTR_INPUT_DATASET, inputDatasetName);
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.DataSetOutputFormat.java
License:Apache License
public static void setOutput(Job job, String outputDatasetName) {
    job.setOutputFormatClass(DataSetOutputFormat.class);
    job.getConfiguration().set(HCONF_ATTR_OUTPUT_DATASET, outputDatasetName);
}
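Both helpers follow the same pattern: pair setInputFormatClass/setOutputFormatClass with a getConfiguration().set(...) that records which dataset the format should bind to at runtime. A minimal sketch of a driver wiring the two together, assuming the CDAP classes above are on the classpath; the job name and dataset names are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class DatasetJobDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "dataset-copy");

        // Hypothetical dataset names; each helper stores its name in the job's
        // Configuration so the input/output format can look it up at runtime.
        DataSetInputFormat.setInput(job, "rawRecords");
        DataSetOutputFormat.setOutput(job, "cleanRecords");

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}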