List of usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
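getConfiguration() returns the live Configuration backing the job; anything set on it before submission is serialized with the job and becomes visible to every map and reduce task. A minimal sketch of the basic pattern, using placeholder property names ("my.custom.property", "my.custom.window" are not from any example below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        // Set custom properties through the job's Configuration before
        // submission; tasks read them back via context.getConfiguration().
        Job job = Job.getInstance(new Configuration(), "example-job");
        job.getConfiguration().set("my.custom.property", "value");
        job.getConfiguration().setInt("my.custom.window", 2);
    }
}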
From source file:cloud9.ComputeCooccurrenceMatrixStripesOOM.java
License:Apache License
/** Runs this tool. */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int window = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: ComputeCooccurrenceMatrixStripes");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - window: " + window);
    sLogger.info(" - number of reducers: " + reduceTasks);

    Job job = new Job(getConf(), "CooccurrenceMatrixStripes");

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.getConfiguration().setInt("window", window);

    job.setJarByClass(ComputeCooccurrenceMatrixStripesOOM.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(String2IntOpenHashMapWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.getConfiguration().setInt("io.sort.mb", 400);
    job.getConfiguration().set("mapred.child.java.opts",
            "-Xmx1000m -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp "
                    + "-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps");
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx1000m");

    job.getConfiguration().setInt("child.monitor.jstat.seconds", 2);

    job.getConfiguration().set("fs.default.name", "hdfs://master:9000");
    job.getConfiguration().set("mapred.job.tracker", "master:9001");
    //conf.set("user.name", "xulijie");

    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", 1);

    //job.getConfiguration().setFloat("io.sort.record.percent", 0.2f);
    //job.getConfiguration().setFloat("io.sort.spill.percent", 0.95f);
    //conf.setFloat("mapred.job.shuffle.input.buffer.percent", 0.9f);
    //conf.setFloat("mapred.job.shuffle.merge.percent", 0.9f);
    //conf.setFloat("mapred.job.reduce.input.buffer.percent", 0.4f);
    //conf.set("mapred.job.tracker", "local");
    //conf.set("fs.default.name", "file:///");

    job.getConfiguration().setLong("mapred.min.split.size", 512 * 1024 * 1024L);
    job.getConfiguration().setLong("mapred.max.split.size", 512 * 1024 * 1024L);

    job.getConfiguration().setInt("mapred.map.max.attempts", 0);
    job.getConfiguration().setInt("mapred.reduce.max.attempts", 0);

    //job.getConfiguration().set("heapdump.reduce.input.groups", "3,897,853[5]");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407734;8407737;8407740;8407743;8407746;8407749;8407750");
    //job.getConfiguration().set("omit.reduce.input.records", "8407733;8407750");
    //job.getConfiguration().set("heapdump.reduce.input.records", "8407751");
    //job.getConfiguration().set("heapdump.reduce.output.records", "3897853");
    job.getConfiguration().set("heapdump.task.attempt.ids", "attempt_201404281552_0001_r_000000_0");

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
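Note that keys such as io.sort.mb, mapred.child.java.opts, and mapred.min.split.size in this example are Hadoop 1.x names; on Hadoop 2.x and later they are still honored as deprecated aliases, but new code would use the current keys. A minimal sketch of the same tuning under the newer names (values are illustrative, and the mapping is as of Hadoop 2.x):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ModernTuningSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "CooccurrenceMatrixStripes");
        Configuration conf = job.getConfiguration();
        conf.setInt("mapreduce.task.io.sort.mb", 400);       // was io.sort.mb
        conf.set("mapreduce.map.java.opts", "-Xmx1000m");    // was mapred.child.java.opts (map side)
        conf.set("mapreduce.reduce.java.opts", "-Xmx1000m"); // was mapred.child.java.opts (reduce side)
        conf.setLong("mapreduce.input.fileinputformat.split.minsize", 512 * 1024 * 1024L); // was mapred.min.split.size
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 512 * 1024 * 1024L); // was mapred.max.split.size
        conf.setInt("mapreduce.map.maxattempts", 1);         // was mapred.map.max.attempts
    }
}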
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException {
    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");

    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Create and setup a job */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Run a map/reduce job to compute Pi. */
private static void compute(int startDigit, int nDigits, int nMaps, String workingDir, Configuration conf,
        PrintStream out) throws IOException {
    final String name = startDigit + "_" + nDigits;

    // setup working directory
    out.println("Working Directory = " + workingDir);
    out.println();
    // final FileSystem fs = FileSystem.get(conf);
    final FileSystem fs = new Path(workingDir, "part-r-00000").getFileSystem(conf);
    final Path dir = fs.makeQualified(new Path(workingDir));
    if (fs.exists(dir)) {
        throw new IOException("Working directory " + dir + " already exists. Please remove it first.");
    } else if (!fs.mkdirs(dir)) {
        throw new IOException("Cannot create working directory " + dir);
    }

    out.println("Start Digit = " + startDigit);
    out.println("Number of Digits = " + nDigits);
    out.println("Number of Maps = " + nMaps);

    // setup a job
    final Job job = createJob(name, conf);
    final Path hexfile = new Path(dir, "pi_" + name + ".hex");
    FileOutputFormat.setOutputPath(job, new Path(dir, "out"));

    // setup custom properties
    job.getConfiguration().set(WORKING_DIR_PROPERTY, dir.toString());
    job.getConfiguration().set(HEX_FILE_PROPERTY, hexfile.toString());

    job.getConfiguration().setInt(DIGIT_START_PROPERTY, startDigit);
    job.getConfiguration().setInt(DIGIT_SIZE_PROPERTY, nDigits);
    job.getConfiguration().setInt(DIGIT_PARTS_PROPERTY, nMaps);

    // start a map/reduce job
    out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        out.println("Duration is " + duration + " seconds.");
    }
    out.println("Output file: " + hexfile);
}
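The custom properties set above are read back on the task side through the task context. A minimal sketch of the read side, assuming hypothetical literal keys ("pi.digit.start", "pi.digit.size") standing in for DIGIT_START_PROPERTY and DIGIT_SIZE_PROPERTY, whose actual values are not shown in this listing:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;

public class DigitRangeMapper extends Mapper<LongWritable, BytesWritable, LongWritable, BytesWritable> {
    private int digitStart;
    private int digitSize;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read back the values the driver stored with job.getConfiguration().setInt(...).
        // "pi.digit.start" / "pi.digit.size" are placeholder keys, not the real constants.
        Configuration conf = context.getConfiguration();
        digitStart = conf.getInt("pi.digit.start", 0);
        digitSize = conf.getInt("pi.digit.size", 0);
    }
}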
From source file:co.cask.cdap.data.stream.StreamInputFormatTest.java
License:Apache License
private void runMR(File inputDir, File outputDir, long startTime, long endTime, long splitSize, long ttl)
        throws Exception {

    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    StreamInputFormat.setTTL(conf, ttl);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    StreamInputFormat.setTimeRange(conf, startTime, endTime);
    StreamInputFormat.setMaxSplitSize(conf, splitSize);
    job.setInputFormatClass(TestStreamInputFormat.class);

    TextOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setJarByClass(StreamInputFormatTest.class);
    job.setMapperClass(TokenizeMapper.class);
    job.setReducerClass(AggregateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.waitForCompletion(true);
}
From source file:co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();

    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();
    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), sourceName, context.getRuntimeArguments());
    batchSource.prepareRun(sourceContext);
    runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
    finishers.add(batchSource, sourceContext);

    Map<String, SinkOutput> sinkOutputs = new HashMap<>();

    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }

        BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
        batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
        MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), sinkName, context.getRuntimeArguments());
        batchSink.prepareRun(sinkContext);
        runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
        finishers.add(batchSink, sinkContext);

        sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    }
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
                    Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);

    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    if (!aggregators.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String aggregatorName = aggregators.iterator().next().getName();
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
        MapReduceAggregatorContext aggregatorContext = new MapReduceAggregatorContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), aggregatorName, context.getRuntimeArguments());
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);

        if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }

        // if the plugin sets the output key and value class directly, trust them
        Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
        Class<?> outputValClass = aggregatorContext.getGroupValueClass();
        // otherwise, derive it from the plugin's parameters
        if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(GROUP_VAL_CLASS, outputValClass.getName());

        // in case the classes are not a WritableComparable, but are some common type we support,
        // for example, a String or a StructuredRecord
        WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
        // if the conversion is null, it means the user is using their own object
        if (writableConversion != null) {
            outputKeyClass = writableConversion.getWritableClass();
        }
        writableConversion = WritableConversions.getConversion(outputValClass.getName());
        if (writableConversion != null) {
            outputValClass = writableConversion.getWritableClass();
        }

        // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
        if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
                    aggregatorName, outputKeyClass));
        }
        if (!Writable.class.isAssignableFrom(outputValClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
                    aggregatorName, outputValClass));
        }

        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValClass);
    } else {
        job.setNumReduceTasks(0);
    }

    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
From source file:co.cask.cdap.examples.datacleansing.DataCleansingMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    partitionCommitter = PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS,
            new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));

    // Each run writes its output to a partition for the league
    Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
    PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();

    Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

    // set up two outputs - one for invalid records and one for valid records
    Map<String, String> invalidRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
    PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));

    Map<String, String> cleanRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
    PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));

    Job job = context.getHadoopJob();
    job.setMapperClass(SchemaMatchingFilter.class);
    job.setNumReduceTasks(0);

    // simply propagate the schema (if any) to be used by the mapper
    String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
    if (schemaJson != null) {
        job.getConfiguration().set(SCHEMA_KEY, schemaJson);
    }
}
From source file:co.cask.cdap.hbase.wd.RowKeyDistributorTestBase.java
License:Apache License
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = new Configuration(testingUtility.getConfiguration());
    conf.set("fs.defaultFS", "file:///");
    conf.set("fs.default.name", "file:///");
    conf.setInt("mapreduce.local.map.tasks.maximum", 16);
    conf.setInt("mapreduce.local.reduce.tasks.maximum", 16);
    Job job = Job.getInstance(conf, "testMapReduceInternal()-Job");
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);

    // Need to kill the job after completion, as it could leave MRAppMaster running not terminated.
    // Not sure what is causing this, but it may be a problem in MiniYarnCluster
    job.killJob();
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.DataSetInputFormat.java
License:Apache License
public static void setInput(Job job, String inputDatasetName) {
    job.setInputFormatClass(DataSetInputFormat.class);
    job.getConfiguration().set(DataSetInputFormat.HCONF_ATTR_INPUT_DATASET, inputDatasetName);
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.DataSetOutputFormat.java
License:Apache License
public static void setOutput(Job job, String outputDatasetName) {
    job.setOutputFormatClass(DataSetOutputFormat.class);
    job.getConfiguration().set(HCONF_ATTR_OUTPUT_DATASET, outputDatasetName);
}
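Both helpers follow the same pattern: pair setInputFormatClass/setOutputFormatClass with a getConfiguration().set(...) that records which dataset the format should bind to at runtime. A minimal sketch of a driver wiring the two together, assuming the CDAP classes above are on the classpath; the job name and dataset names are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class DatasetJobDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "dataset-copy");

        // Hypothetical dataset names; each helper stores its name in the job's
        // Configuration so the input/output format can look it up at runtime.
        DataSetInputFormat.setInput(job, "rawRecords");
        DataSetOutputFormat.setOutput(job, "cleanRecords");

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}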