List of usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
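A minimal usage sketch before the sourced examples (not taken from any file below; the class name and property key are made up for illustration). The Configuration returned by getConfiguration() is the job's live configuration, so properties set on it before submission are visible to the job's tasks:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // getConfiguration() returns the job's live Configuration:
        // anything set here before submission is seen by the tasks.
        Configuration conf = job.getConfiguration();
        conf.set("example.property", "value"); // made-up key, for illustration
        System.out.println(conf.get("example.property"));
    }
}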
From source file:com.cloudera.crunch.impl.mr.plan.JobPrototype.java
License:Open Source License
private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);
    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }
    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);
        group.configureShuffle(job);
        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }
        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else {
        // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }
    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));
    return new CrunchJob(job, workingPath, outputHandler);
}
From source file:com.cloudera.crunch.impl.mr.run.CrunchInputs.java
License:Apache License
public static void addInputPath(Job job, Path path, Class<? extends InputFormat> inputFormatClass,
        int nodeIndex) {
    Configuration conf = job.getConfiguration();
    String inputs = JOINER.join(inputFormatClass.getName(), nodeIndex, path.toString());
    String existing = conf.get(RuntimeParameters.MULTI_INPUTS);
    conf.set(RuntimeParameters.MULTI_INPUTS, existing == null ? inputs : existing + RECORD_SEP + inputs);
}
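A hedged usage sketch of the helper above (the paths and node indices are hypothetical): each call appends one inputFormat/nodeIndex/path record to the single MULTI_INPUTS property instead of overwriting it, so several inputs can be registered on one job:

// Hypothetical caller; assumes the standard Hadoop TextInputFormat and
// SequenceFileInputFormat classes, with made-up paths and indices.
Job job = Job.getInstance(new Configuration());
CrunchInputs.addInputPath(job, new Path("/data/events"), TextInputFormat.class, 0);
CrunchInputs.addInputPath(job, new Path("/data/users"), SequenceFileInputFormat.class, 1);
// Both records are now stored, separator-delimited, under
// RuntimeParameters.MULTI_INPUTS in job.getConfiguration().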
From source file:com.cloudera.crunch.io.avro.AvroFileSourceTarget.java
License:Open Source License
@Override
public void configureSource(Job job, int inputId) throws IOException {
    SourceTargetHelper.configureSource(job, inputId, AvroInputFormat.class, path);
    Configuration conf = job.getConfiguration();
    String inputSchema = conf.get("avro.input.schema");
    if (inputSchema == null) {
        conf.set("avro.input.schema", ptype.getSchema().toString());
    } else if (!inputSchema.equals(ptype.getSchema().toString())) {
        throw new IllegalStateException("Multiple Avro sources must use the same schema");
    }
}
From source file:com.cloudera.crunch.io.avro.AvroFileTarget.java
License:Open Source License
@Override
public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
    AvroType<?> atype = (AvroType<?>) ptype;
    Configuration conf = job.getConfiguration();
    String outputSchema = conf.get("avro.output.schema");
    if (outputSchema == null) {
        conf.set("avro.output.schema", atype.getSchema().toString());
    } else if (!outputSchema.equals(atype.getSchema().toString())) {
        throw new IllegalStateException("Avro targets must use the same output schema");
    }
    SourceTargetHelper.configureTarget(job, AvroOutputFormat.class, ptype.getDataBridge(), outputPath, name);
}
From source file:com.cloudera.crunch.io.hbase.HBaseSourceTarget.java
License:Open Source License
@Override
public void configureSource(Job job, int inputId) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setInputFormatClass(TableInputFormat.class);
    job.setMapperClass(CrunchMapper.class);
    HBaseConfiguration.addHbaseResources(conf);
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    TableMapReduceUtil.addDependencyJars(job);
}
From source file:com.cloudera.crunch.io.hbase.HBaseTarget.java
License:Open Source License
@Override
public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.addHbaseResources(conf);
    job.setOutputFormatClass(TableOutputFormat.class);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    try {
        TableMapReduceUtil.addDependencyJars(job);
    } catch (IOException e) {
        throw new CrunchRuntimeException(e);
    }
}
From source file:com.cloudera.crunch.type.avro.AvroGroupedTableType.java
License:Open Source License
@Override
public void configureShuffle(Job job, GroupingOptions options) {
    AvroTableType<K, V> att = (AvroTableType<K, V>) tableType;
    String schemaJson = att.getSchema().toString();
    job.getConfiguration().set(AvroJob.MAP_OUTPUT_SCHEMA, schemaJson);
    job.setSortComparatorClass(AvroKeyComparator.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(AvroValue.class);
    if (options != null) {
        options.configure(job);
    }
    Collection<String> serializations = job.getConfiguration().getStringCollection("io.serializations");
    if (!serializations.contains(AvroSerialization.class.getName())) {
        serializations.add(AvroSerialization.class.getName());
        job.getConfiguration().setStrings("io.serializations", serializations.toArray(new String[0]));
    }
}
From source file:com.cloudera.dataflow.spark.TransformTranslator.java
License:Open Source License
private static <T> TransformEvaluator<AvroIO.Write.Bound<T>> writeAvro() {
    return new TransformEvaluator<AvroIO.Write.Bound<T>>() {
        @Override
        public void evaluate(AvroIO.Write.Bound<T> transform, EvaluationContext context) {
            Job job;
            try {
                job = Job.getInstance();
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            AvroJob.setOutputKeySchema(job, transform.getSchema());
            @SuppressWarnings("unchecked")
            JavaPairRDD<AvroKey<T>, NullWritable> last =
                ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform))
                    .map(WindowingHelpers.<T>unwindowFunction())
                    .mapToPair(new PairFunction<T, AvroKey<T>, NullWritable>() {
                        @Override
                        public Tuple2<AvroKey<T>, NullWritable> call(T t) throws Exception {
                            return new Tuple2<>(new AvroKey<>(t), NullWritable.get());
                        }
                    });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
                transform.getNumShards(), transform.getShardTemplate(),
                transform.getFilenamePrefix(), transform.getFilenameSuffix());
            writeHadoopFile(last, job.getConfiguration(), shardTemplateInfo,
                AvroKey.class, NullWritable.class, TemplatedAvroKeyOutputFormat.class);
        }
    };
}
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/**
 * Creates a new {@link MRPipeline} instance that contains common configuration
 * settings.
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    Configuration conf = OryxConfiguration.get(getConf());
    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");
    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);
    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);
    Config appConfig = ConfigUtils.getDefaultConfig();
    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));
    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
            conf.get(MRJobConfig.MAP_JAVA_OPTS));
    }
    conf.set(MRJobConfig.MAP_JAVA_OPTS,
        "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);
    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    }
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
            conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    }
    conf.set(MRJobConfig.REDUCE_JAVA_OPTS,
        "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);
    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);
    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());
    // Make sure to set any args to conf above this line!
    setConf(conf);
    Job job = Job.getInstance(conf);
    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    log.info("Created pipeline configuration {}", job.getConfiguration());
    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());
}
From source file:com.cloudera.recordservice.avro.AvroJob.java
License:Apache License
public static void setInputFormatClass(org.apache.hadoop.mapreduce.Job job,
        Class<? extends org.apache.hadoop.mapreduce.InputFormat> c) {
    if (job.getConfiguration().getBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, false)) {
        if (c.getName().equals(org.apache.avro.mapreduce.AvroKeyInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class;
        } else if (c.getName().equals(org.apache.avro.mapreduce.AvroKeyValueInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class;
        } else {
            throw new RuntimeException("Class '" + c.getName() + "' is not supported by "
                + "the RecordService. Use AvroKeyValueInputFormat or "
                + "AvroKeyInputFormat or disable RecordService.");
        }
    }
    LOG.debug("Using input format: " + c.getName());
    job.setInputFormatClass(c);
}
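A hedged usage sketch of the method above (it assumes USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY is accessible to callers, which this listing does not confirm): the RecordService substitution only happens when that boolean key is set, so an opted-in caller might look like:

// Hypothetical caller; the constant's visibility is an assumption here.
org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance();
job.getConfiguration().setBoolean(AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
// Pass the stock Avro input format; setInputFormatClass swaps in the
// RecordService-backed equivalent before registering it on the job.
AvroJob.setInputFormatClass(job, org.apache.avro.mapreduce.AvroKeyInputFormat.class);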