List of usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
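A minimal usage sketch before the sourced examples (not taken from any file below; the class name and property key are made up for illustration). The Configuration returned by getConfiguration() is the job's live configuration, so properties set on it before submission are visible to the job's tasks:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // getConfiguration() returns the job's live Configuration:
        // anything set here before submission is seen by the tasks.
        Configuration conf = job.getConfiguration();
        conf.set("example.property", "value"); // made-up key, for illustration
        System.out.println(conf.get("example.property"));
    }
}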
From source file:com.cloudera.crunch.impl.mr.plan.JobPrototype.java
License:Open Source License
private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);
    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }
    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);
        group.configureShuffle(job);
        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }
        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else {
        // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }
    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));
    return new CrunchJob(job, workingPath, outputHandler);
}
From source file:com.cloudera.crunch.impl.mr.run.CrunchInputs.java
License:Apache License
public static void addInputPath(Job job, Path path, Class<? extends InputFormat> inputFormatClass,
        int nodeIndex) {
    Configuration conf = job.getConfiguration();
    String inputs = JOINER.join(inputFormatClass.getName(), nodeIndex, path.toString());
    String existing = conf.get(RuntimeParameters.MULTI_INPUTS);
    conf.set(RuntimeParameters.MULTI_INPUTS, existing == null ? inputs : existing + RECORD_SEP + inputs);
}
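A hedged usage sketch of the helper above (the paths and node indices are hypothetical): each call appends one inputFormat/nodeIndex/path record to the single MULTI_INPUTS property instead of overwriting it, so several inputs can be registered on one job:

// Hypothetical caller; assumes the standard Hadoop TextInputFormat and
// SequenceFileInputFormat classes, with made-up paths and indices.
Job job = Job.getInstance(new Configuration());
CrunchInputs.addInputPath(job, new Path("/data/events"), TextInputFormat.class, 0);
CrunchInputs.addInputPath(job, new Path("/data/users"), SequenceFileInputFormat.class, 1);
// Both records are now stored, separator-delimited, under
// RuntimeParameters.MULTI_INPUTS in job.getConfiguration().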
From source file:com.cloudera.crunch.io.avro.AvroFileSourceTarget.java
License:Open Source License
@Override
public void configureSource(Job job, int inputId) throws IOException {
    SourceTargetHelper.configureSource(job, inputId, AvroInputFormat.class, path);
    Configuration conf = job.getConfiguration();
    String inputSchema = conf.get("avro.input.schema");
    if (inputSchema == null) {
        conf.set("avro.input.schema", ptype.getSchema().toString());
    } else if (!inputSchema.equals(ptype.getSchema().toString())) {
        throw new IllegalStateException("Multiple Avro sources must use the same schema");
    }
}
From source file:com.cloudera.crunch.io.avro.AvroFileTarget.java
License:Open Source License
@Override
public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
    AvroType<?> atype = (AvroType<?>) ptype;
    Configuration conf = job.getConfiguration();
    String outputSchema = conf.get("avro.output.schema");
    if (outputSchema == null) {
        conf.set("avro.output.schema", atype.getSchema().toString());
    } else if (!outputSchema.equals(atype.getSchema().toString())) {
        throw new IllegalStateException("Avro targets must use the same output schema");
    }
    SourceTargetHelper.configureTarget(job, AvroOutputFormat.class, ptype.getDataBridge(), outputPath, name);
}
From source file:com.cloudera.crunch.io.hbase.HBaseSourceTarget.java
License:Open Source License
@Override
public void configureSource(Job job, int inputId) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setInputFormatClass(TableInputFormat.class);
    job.setMapperClass(CrunchMapper.class);
    HBaseConfiguration.addHbaseResources(conf);
    conf.set(TableInputFormat.INPUT_TABLE, table);
    conf.set(TableInputFormat.SCAN, convertScanToString(scan));
    TableMapReduceUtil.addDependencyJars(job);
}
From source file:com.cloudera.crunch.io.hbase.HBaseTarget.java
License:Open Source License
@Override
public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.addHbaseResources(conf);
    job.setOutputFormatClass(TableOutputFormat.class);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    try {
        TableMapReduceUtil.addDependencyJars(job);
    } catch (IOException e) {
        throw new CrunchRuntimeException(e);
    }
}
From source file:com.cloudera.crunch.type.avro.AvroGroupedTableType.java
License:Open Source License
@Override
public void configureShuffle(Job job, GroupingOptions options) {
    AvroTableType<K, V> att = (AvroTableType<K, V>) tableType;
    String schemaJson = att.getSchema().toString();
    job.getConfiguration().set(AvroJob.MAP_OUTPUT_SCHEMA, schemaJson);
    job.setSortComparatorClass(AvroKeyComparator.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(AvroValue.class);
    if (options != null) {
        options.configure(job);
    }
    Collection<String> serializations = job.getConfiguration().getStringCollection("io.serializations");
    if (!serializations.contains(AvroSerialization.class.getName())) {
        serializations.add(AvroSerialization.class.getName());
        job.getConfiguration().setStrings("io.serializations", serializations.toArray(new String[0]));
    }
}
From source file:com.cloudera.dataflow.spark.TransformTranslator.java
License:Open Source License
private static <T> TransformEvaluator<AvroIO.Write.Bound<T>> writeAvro() {
    return new TransformEvaluator<AvroIO.Write.Bound<T>>() {
        @Override
        public void evaluate(AvroIO.Write.Bound<T> transform, EvaluationContext context) {
            Job job;
            try {
                job = Job.getInstance();
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            AvroJob.setOutputKeySchema(job, transform.getSchema());
            @SuppressWarnings("unchecked")
            JavaPairRDD<AvroKey<T>, NullWritable> last =
                ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform))
                    .map(WindowingHelpers.<T>unwindowFunction())
                    .mapToPair(new PairFunction<T, AvroKey<T>, NullWritable>() {
                        @Override
                        public Tuple2<AvroKey<T>, NullWritable> call(T t) throws Exception {
                            return new Tuple2<>(new AvroKey<>(t), NullWritable.get());
                        }
                    });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
                transform.getNumShards(), transform.getShardTemplate(),
                transform.getFilenamePrefix(), transform.getFilenameSuffix());
            writeHadoopFile(last, job.getConfiguration(), shardTemplateInfo,
                AvroKey.class, NullWritable.class, TemplatedAvroKeyOutputFormat.class);
        }
    };
}
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/**
 * Creates a new {@link MRPipeline} instance that contains common configuration
 * settings.
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    Configuration conf = OryxConfiguration.get(getConf());
    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");
    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);
    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);
    Config appConfig = ConfigUtils.getDefaultConfig();
    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));
    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
            conf.get(MRJobConfig.MAP_JAVA_OPTS));
    }
    conf.set(MRJobConfig.MAP_JAVA_OPTS,
        "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);
    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    }
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
            conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    }
    conf.set(MRJobConfig.REDUCE_JAVA_OPTS,
        "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);
    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);
    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());
    // Make sure to set any args to conf above this line!
    setConf(conf);
    Job job = Job.getInstance(conf);
    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    log.info("Created pipeline configuration {}", job.getConfiguration());
    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());
}
From source file:com.cloudera.recordservice.avro.AvroJob.java
License:Apache License
public static void setInputFormatClass(org.apache.hadoop.mapreduce.Job job,
        Class<? extends org.apache.hadoop.mapreduce.InputFormat> c) {
    if (job.getConfiguration().getBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, false)) {
        if (c.getName().equals(org.apache.avro.mapreduce.AvroKeyInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class;
        } else if (c.getName().equals(org.apache.avro.mapreduce.AvroKeyValueInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapreduce.AvroKeyValueInputFormat.class;
        } else {
            throw new RuntimeException("Class '" + c.getName() + "' is not supported by "
                + "the RecordService. Use AvroKeyValueInputFormat or "
                + "AvroKeyInputFormat or disable RecordService.");
        }
    }
    LOG.debug("Using input format: " + c.getName());
    job.setInputFormatClass(c);
}
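A hedged usage sketch of the method above (it assumes USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY is accessible to callers, which this listing does not confirm): the RecordService substitution only happens when that boolean key is set, so an opted-in caller might look like:

// Hypothetical caller; the constant's visibility is an assumption here.
org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance();
job.getConfiguration().setBoolean(AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
// Pass the stock Avro input format; setInputFormatClass swaps in the
// RecordService-backed equivalent before registering it on the job.
AvroJob.setInputFormatClass(job, org.apache.avro.mapreduce.AvroKeyInputFormat.class);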