List of usage examples for org.apache.hadoop.mapreduce.Job.setMapperClass
public void setMapperClass(Class<? extends Mapper> cls) throws IllegalStateException
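For orientation, here is a minimal sketch of a standalone Hadoop driver that calls setMapperClass directly. The class names SetMapperClassExample, WordMapper, and WordReducer are placeholders for illustration and do not appear in the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapperClassExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-mapper-class example");
        job.setJarByClass(SetMapperClassExample.class);
        // setMapperClass must be called before the job is submitted;
        // it throws IllegalStateException once the job state is no longer DEFINE.
        job.setMapperClass(WordMapper.class);    // hypothetical Mapper subclass
        job.setReducerClass(WordReducer.class);  // hypothetical Reducer subclass
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}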
From source file:co.cask.cdap.examples.loganalysis.HitCounterProgram.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(Emitter.class);
    job.setReducerClass(Counter.class);
    context.addInput(Input.ofStream(LogAnalysisApp.LOG_STREAM));
    context.addOutput(Output.ofDataset(LogAnalysisApp.HIT_COUNT_STORE));
}
From source file:co.cask.cdap.examples.sportresults.ScoreCounter.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(ResultsMapper.class);
    job.setReducerClass(TeamCounter.class);
    job.setNumReduceTasks(1);

    String league = context.getRuntimeArguments().get("league");
    Preconditions.checkNotNull(league);

    // Configure the input to read all seasons for the league
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs,
            PartitionFilter.builder().addValueCondition("league", league).build());
    context.addInput(Input.ofDataset("results", inputArgs));

    // Each run writes its output to a partition for the league
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    context.addOutput(Output.ofDataset("totals", outputArgs));

    // used only for logging:
    PartitionedFileSet input = context.getDataset("results", inputArgs);
    PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
    String outputPath = FileSetArguments
            .getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
    LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
From source file:co.cask.cdap.examples.streamconversion.StreamConversionMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(StreamConversionMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, SCHEMA);

    // read 5 minutes of events from the stream, ending at the logical start time of this run
    long logicalTime = context.getLogicalStartTime();
    context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));

    // each run writes its output to a partition with the logical start time.
    TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
    context.addOutput(Output.ofDataset("converted", dsArguments));

    TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
    LOG.info("Output location for new partition is: {}",
            partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}
From source file:co.cask.cdap.examples.wikipedia.StreamToDataset.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setNumReduceTasks(0);

    WorkflowToken workflowToken = context.getWorkflowToken();
    Class<? extends Mapper> mapper = PageTitleToDatasetMapper.class;
    String inputStream = WikipediaPipelineApp.PAGE_TITLES_STREAM;
    String outputDataset = WikipediaPipelineApp.PAGE_TITLES_DATASET;
    if (workflowToken != null) {
        Value likesToDatasetResult = workflowToken.get("result",
                WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
        if (likesToDatasetResult != null && likesToDatasetResult.getAsBoolean()) {
            // The "likes" stream to the dataset has already run and has been successful in this run so far.
            // Now run raw wikipedia stream to dataset.
            mapper = RawWikiDataToDatasetMapper.class;
            inputStream = WikipediaPipelineApp.RAW_WIKIPEDIA_STREAM;
            outputDataset = WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET;
        }
    }
    LOG.info("Using '{}' as the input stream and '{}' as the output dataset.", inputStream, outputDataset);
    job.setMapperClass(mapper);
    StreamBatchReadable.useStreamInput(context, inputStream);
    context.addOutput(outputDataset);
}
From source file:co.cask.cdap.examples.wikipedia.TopNMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Map<String, String> runtimeArguments = context.getRuntimeArguments();
    Job job = context.getHadoopJob();
    WorkflowToken workflowToken = context.getWorkflowToken();

    int topNRank = 10;
    if (runtimeArguments.containsKey("topn.rank")) {
        topNRank = Integer.parseInt(runtimeArguments.get("topn.rank"));
    }
    if (workflowToken != null) {
        workflowToken.put("topn.rank", Value.of(topNRank));
    }

    int numReduceTasks = 1;
    if (runtimeArguments.containsKey("num.reduce.tasks")) {
        numReduceTasks = Integer.parseInt(runtimeArguments.get("num.reduce.tasks"));
    }
    job.setNumReduceTasks(numReduceTasks);

    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(TopNReducer.class);

    context.addInput(Input.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.MAPREDUCE_TOPN_OUTPUT));
}
From source file:co.cask.cdap.examples.wikipedia.WikiContentValidatorAndNormalizer.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(FilterNormalizerMapper.class);
    job.setNumReduceTasks(0);
    context.addInput(Input.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET));
}
From source file:co.cask.cdap.examples.wikipedia.WikipediaDataDownloader.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(WikipediaDataDownloaderMapper.class);
    job.setNumReduceTasks(0);
    context.addInput(Input.ofDataset(WikipediaPipelineApp.PAGE_TITLES_DATASET));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET));
}
From source file:co.cask.cdap.internal.app.runtime.batch.AggregateMetricsByTag.java
License:Apache License
static void configureJob(Job job) throws IOException {
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(Reduce.class);
}
From source file:co.cask.cdap.internal.app.runtime.batch.MapperWrapper.java
License:Apache License
/**
 * Wraps the mapper defined in the job with this {@link MapperWrapper} if it is defined.
 * @param job The MapReduce job
 */
public static void wrap(Job job) {
    // NOTE: we don't use job.getMapperClass() as we don't need to load user class here
    Configuration conf = job.getConfiguration();
    String mapClass = conf.get(MRJobConfig.MAP_CLASS_ATTR, Mapper.class.getName());
    conf.set(MapperWrapper.ATTR_MAPPER_CLASS, mapClass);
    job.setMapperClass(MapperWrapper.class);
}
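The snippet above only records the original mapper class under the ATTR_MAPPER_CLASS key and installs the wrapper via setMapperClass. The delegation side is not shown; the following is a minimal sketch (an assumption, not CDAP's actual MapperWrapper code) of how a wrapper could recover and run the original mapper, reusing the same configuration key.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.ReflectionUtils;

// Hypothetical delegating wrapper, sketched for illustration only.
public class DelegatingMapperSketch extends Mapper {
    @Override
    public void run(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Recover the user's mapper class that wrap(Job) stored in the configuration
        // (assumes the ATTR_MAPPER_CLASS key is accessible here).
        Class<? extends Mapper> delegateClass =
                conf.getClass(MapperWrapper.ATTR_MAPPER_CLASS, Mapper.class, Mapper.class);
        Mapper delegate = ReflectionUtils.newInstance(delegateClass, conf);
        delegate.run(context);  // hand the context to the wrapped mapper
    }
}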
From source file:co.cask.cdap.internal.app.runtime.batch.WordCount.java
License:Apache License
public static void configureJob(Job job, String inputPath, String outputPath) throws IOException {
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}