List of usage examples for org.apache.hadoop.mapreduce.Job.setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
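Before the project-specific examples below, here is a minimal, self-contained sketch of how setNumReduceTasks is typically called on a plain Hadoop job. The class name, input/output paths, and the reducer count of 4 are illustrative assumptions, not taken from any of the source files listed on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "num-reduce-tasks-example");
        job.setJarByClass(NumReduceTasksExample.class);

        // Identity mapper and reducer keep the sketch self-contained.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Must be called before the job is submitted; once the job is running
        // the call throws IllegalStateException. A value of 0 makes the job
        // map-only (no shuffle or reduce phase); a positive value controls how
        // many reduce tasks, and therefore output files, are produced.
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Calling setNumReduceTasks with 0 is the common pattern for map-only jobs; because the method fails after submission, every example below sets it during job preparation, before the job is submitted.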
From source file:co.cask.cdap.examples.sportresults.ScoreCounter.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(ResultsMapper.class);
    job.setReducerClass(TeamCounter.class);
    job.setNumReduceTasks(1);

    String league = context.getRuntimeArguments().get("league");
    Preconditions.checkNotNull(league);

    // Configure the input to read all seasons for the league
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs,
            PartitionFilter.builder().addValueCondition("league", league).build());
    context.addInput(Input.ofDataset("results", inputArgs));

    // Each run writes its output to a partition for the league
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    context.addOutput(Output.ofDataset("totals", outputArgs));

    // used only for logging:
    PartitionedFileSet input = context.getDataset("results", inputArgs);
    PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
    String outputPath = FileSetArguments
            .getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
    LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
From source file:co.cask.cdap.examples.streamconversion.StreamConversionMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(StreamConversionMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, SCHEMA);

    // read 5 minutes of events from the stream, ending at the logical start time of this run
    long logicalTime = context.getLogicalStartTime();
    context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));

    // each run writes its output to a partition with the logical start time.
    TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
    context.addOutput(Output.ofDataset("converted", dsArguments));

    TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
    LOG.info("Output location for new partition is: {}",
            partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}
From source file:co.cask.cdap.examples.wikipedia.StreamToDataset.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setNumReduceTasks(0);

    WorkflowToken workflowToken = context.getWorkflowToken();
    Class<? extends Mapper> mapper = PageTitleToDatasetMapper.class;
    String inputStream = WikipediaPipelineApp.PAGE_TITLES_STREAM;
    String outputDataset = WikipediaPipelineApp.PAGE_TITLES_DATASET;
    if (workflowToken != null) {
        Value likesToDatasetResult = workflowToken.get("result",
                WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
        if (likesToDatasetResult != null && likesToDatasetResult.getAsBoolean()) {
            // The "likes" stream to the dataset has already run and has been successful in this run so far.
            // Now run raw wikipedia stream to dataset.
            mapper = RawWikiDataToDatasetMapper.class;
            inputStream = WikipediaPipelineApp.RAW_WIKIPEDIA_STREAM;
            outputDataset = WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET;
        }
    }
    LOG.info("Using '{}' as the input stream and '{}' as the output dataset.", inputStream, outputDataset);
    job.setMapperClass(mapper);
    StreamBatchReadable.useStreamInput(context, inputStream);
    context.addOutput(outputDataset);
}
From source file:co.cask.cdap.examples.wikipedia.TopNMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Map<String, String> runtimeArguments = context.getRuntimeArguments();
    Job job = context.getHadoopJob();
    WorkflowToken workflowToken = context.getWorkflowToken();

    int topNRank = 10;
    if (runtimeArguments.containsKey("topn.rank")) {
        topNRank = Integer.parseInt(runtimeArguments.get("topn.rank"));
    }
    if (workflowToken != null) {
        workflowToken.put("topn.rank", Value.of(topNRank));
    }

    int numReduceTasks = 1;
    if (runtimeArguments.containsKey("num.reduce.tasks")) {
        numReduceTasks = Integer.parseInt(runtimeArguments.get("num.reduce.tasks"));
    }
    job.setNumReduceTasks(numReduceTasks);

    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(TopNReducer.class);
    context.addInput(Input.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.MAPREDUCE_TOPN_OUTPUT));
}
From source file:co.cask.cdap.examples.wikipedia.WikiContentValidatorAndNormalizer.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(FilterNormalizerMapper.class);
    job.setNumReduceTasks(0);
    context.addInput(Input.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET));
}
From source file:co.cask.cdap.examples.wikipedia.WikipediaDataDownloader.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(WikipediaDataDownloaderMapper.class);
    job.setNumReduceTasks(0);
    context.addInput(Input.ofDataset(WikipediaPipelineApp.PAGE_TITLES_DATASET));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET));
}
From source file:co.cask.cdap.hbase.wd.RowKeyDistributorTestBase.java
License:Apache License
private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = new Configuration(testingUtility.getConfiguration());
    conf.set("fs.defaultFS", "file:///");
    conf.set("fs.default.name", "file:///");
    conf.setInt("mapreduce.local.map.tasks.maximum", 16);
    conf.setInt("mapreduce.local.reduce.tasks.maximum", 16);
    Job job = Job.getInstance(conf, "testMapReduceInternal()-Job");
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substitute the standard TableInputFormat that was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);

    // Need to kill the job after completion; otherwise it could leave the MRAppMaster running, not terminated.
    // Not sure what is causing this, but it may be a problem in MiniYarnCluster.
    job.killJob();
}
From source file:co.cask.cdap.longrunning.datacleansing.DataCleansingMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    partitionCommitter = PartitionBatchInput.setInput(context, DataCleansingApp.RAW_RECORDS,
            new KVTableStatePersistor(DataCleansingApp.CONSUMING_STATE, "state.key"));

    // Each run writes its output to a partition keyed by the time passed in the runtime arguments
    Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
    PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();
    Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

    // set up two outputs - one for invalid records and one for valid records
    Map<String, String> invalidRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
    PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
    context.addOutput(DataCleansingApp.INVALID_RECORDS, invalidRecordsArgs);

    Map<String, String> cleanRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
    PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
    context.addOutput(DataCleansingApp.CLEAN_RECORDS, cleanRecordsArgs);

    Job job = context.getHadoopJob();
    job.setMapperClass(SchemaMatchingFilter.class);
    job.setNumReduceTasks(0);

    // simply propagate the schema (if any) to be used by the mapper
    String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
    if (schemaJson != null) {
        job.getConfiguration().set(SCHEMA_KEY, schemaJson);
    }
}
From source file:co.cask.cdap.template.etl.batch.ETLMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    Map<String, String> runtimeArgs = context.getRuntimeArguments();
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.ADAPTER_NAME));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.CONFIG_KEY));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.Source.PLUGINID));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.Sink.PLUGINID));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.Transform.PLUGINIDS));

    ETLBatchConfig etlBatchConfig = GSON.fromJson(runtimeArgs.get(Constants.CONFIG_KEY), ETLBatchConfig.class);
    prepareSource(context, etlBatchConfig.getSource());
    prepareSink(context, etlBatchConfig.getSink());

    if (etlBatchConfig.getResources() != null) {
        context.setMapperResources(etlBatchConfig.getResources());
    }
    job.setMapperClass(ETLMapper.class);
    job.setNumReduceTasks(0);
}
From source file:co.nubetech.hiho.job.DBQueryInputJob.java
License:Apache License
public void runJobs(Configuration conf, int jobCounter) throws IOException {
    try {
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new IOException(e1);
    }

    Job job = new Job(conf);
    for (Entry<String, String> entry : conf) {
        logger.warn("key, value " + entry.getKey() + "=" + entry.getValue());
    }

    // logger.debug("Number of maps " + conf.getInt("mapred.map.tasks", 1));
    // conf.setInt(JobContext.NUM_MAPS, conf.getInt("mapreduce.job.maps", 1));
    // job.getConfiguration().setInt("mapred.map.tasks", 4);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    logger.warn("Number of maps " + conf.getInt(MRJobConfig.NUM_MAPS, 1));

    job.setJobName("Import job");
    job.setJarByClass(DBQueryInputJob.class);

    String strategy = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY);
    OutputStrategyEnum os = OutputStrategyEnum.value(strategy);
    if (os == null) {
        throw new IllegalArgumentException("Wrong value of output strategy. Please correct");
    }

    if (os != OutputStrategyEnum.AVRO) {
        switch (os) {
        case DUMP: {
            // job.setMapperClass(DBImportMapper.class);
            break;
        }
        /*
         * case AVRO: { job.setMapperClass(DBInputAvroMapper.class); // need avro in cp
         * // job.setJarByClass(Schema.class); // need jackson which is needed by avro - ugly!
         * // job.setJarByClass(ObjectMapper.class);
         * job.setMapOutputKeyClass(NullWritable.class);
         * job.setMapOutputValueClass(AvroValue.class);
         * job.setOutputKeyClass(NullWritable.class);
         * job.setOutputValueClass(AvroValue.class);
         * job.setOutputFormatClass(AvroOutputFormat.class);
         *
         * AvroOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
         * break; }
         */
        case DELIMITED: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);
            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            // note: no break here, so execution falls through to the JSON case below
        }
        case JSON: {
            // job.setMapperClass(DBImportJsonMapper.class);
            // job.setJarByClass(ObjectMapper.class);
            break;
        }
        default: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);
            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        }

        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        DBQueryInputFormat.setInput(job, inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        job.setNumReduceTasks(0);

        try {
            // job.setJarByClass(Class.forName(conf.get(
            //     org.apache.hadoop.mapred.lib.db.DBConfiguration.DRIVER_CLASS_PROPERTY)));
            logger.debug("OUTPUT format class is " + job.getOutputFormatClass());
            /*
             * org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
             * ReflectionUtils.newInstance(job.getOutputFormatClass(), job.getConfiguration());
             * output.checkOutputSpecs(job);
             */
            logger.debug("Class is " + ReflectionUtils
                    .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
            job.waitForCompletion(false);
            if (conf.get(HIHOConf.INPUT_OUTPUT_LOADTO) != null) {
                generateHiveScript(conf, job, jobCounter);
                generatePigScript(conf, job);
            }
        }
        /*
         * catch (HIHOException h) { h.printStackTrace(); }
         */
        catch (HIHOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // avro to be handled differently, thanks to all the incompatibilities
    // in the apis.
    else {
        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        // co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(job,
        //     inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        JobConf jobConf = new JobConf(conf);
        try {
            GenericDBWritable queryWritable = getDBWritable(jobConf);
            Schema pair = DBMapper.getPairSchema(queryWritable.getColumns());
            AvroJob.setMapOutputSchema(jobConf, pair);
            GenericRecordAvroOutputFormat.setOutputPath(jobConf,
                    new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(jobConf, inputQuery,
                    inputBoundingQuery, params);
            jobConf.setInputFormat(co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.class);
            jobConf.setMapperClass(DBInputAvroMapper.class);
            jobConf.setMapOutputKeyClass(NullWritable.class);
            jobConf.setMapOutputValueClass(AvroValue.class);
            jobConf.setOutputKeyClass(NullWritable.class);
            jobConf.setOutputValueClass(Text.class);
            jobConf.setOutputFormat(GenericRecordAvroOutputFormat.class);
            jobConf.setJarByClass(DBQueryInputJob.class);
            jobConf.setStrings("io.serializations",
                    "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization,org.apache.avro.mapred.AvroSerialization");
            jobConf.setNumReduceTasks(0);
            /*
             * jobConf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);
             * org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(jobConf,
             *     new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
             */
            JobClient.runJob(jobConf);
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}