co.cask.cdap.internal.app.runtime.batch.dataset.output.AppWithMapReduceUsingMultipleOutputs.java Source code

Introduction

Here is the source code for co.cask.cdap.internal.app.runtime.batch.dataset.output.AppWithMapReduceUsingMultipleOutputs.java, a CDAP test application that verifies a MapReduce program can write to multiple outputs, including the same dataset added twice under different aliases and runtime arguments.
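
Before the full listing, here is a minimal sketch of the pattern the app exercises, reduced to the calls that matter. The dataset name "reportFileSet" and the alias/path strings are illustrative placeholders; the CDAP calls themselves (FileSetArguments.setOutputPath, Output.ofDataset, alias, addOutput) are the same ones used in the source below.

import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.mapreduce.MapReduceContext;

import java.util.HashMap;
import java.util.Map;

final class MultipleOutputsSketch {

    // Register the same FileSet dataset twice, under two different aliases,
    // each with its own output path passed as runtime arguments.
    static void wireOutputs(MapReduceContext context) {
        Map<String, String> smallArgs = new HashMap<>();
        FileSetArguments.setOutputPath(smallArgs, "small");
        context.addOutput(Output.ofDataset("reportFileSet", smallArgs).alias("small"));

        Map<String, String> largeArgs = new HashMap<>();
        FileSetArguments.setOutputPath(largeArgs, "large");
        context.addOutput(Output.ofDataset("reportFileSet", largeArgs).alias("large"));
    }
}

In the mapper, MapReduceTaskContext.write(alias, key, value) then routes each record to whichever registered output carries that alias, as FileMapper does at the bottom of the listing.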

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.internal.app.runtime.batch.dataset.output;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * App used to test whether M/R can write to multiple outputs. Tests writing to the same dataset, with different runtime
 * arguments, as two different outputs.
 */
public class AppWithMapReduceUsingMultipleOutputs extends AbstractApplication {

    public static final String PURCHASES = "purchases";
    public static final String SEPARATED_PURCHASES = "smallPurchases";

    @Override
    public void configure() {
        setName("AppWithMapReduceUsingMultipleOutputs");
        setDescription("Application with MapReduce job using multiple outputs");
        createDataset(PURCHASES, "fileSet",
                FileSetProperties.builder().setInputFormat(TextInputFormat.class).build());
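        // note: SEPERATOR below is Hadoop's own constant name (misspelled in
        // Hadoop itself) for the text output key/value separator property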
        createDataset(SEPARATED_PURCHASES, "fileSet",
                FileSetProperties.builder().setOutputFormat(TextOutputFormat.class)
                        .setOutputProperty(TextOutputFormat.SEPERATOR, " ").build());
        addMapReduce(new SeparatePurchases());
        addMapReduce(new InvalidMapReduce());
    }

    /**
     * Map-only MR that writes each record to one of two outputs, depending on the spend amount.
     */
    public static class SeparatePurchases extends AbstractMapReduce {

        @Override
        public void beforeSubmit(MapReduceContext context) throws Exception {
            Map<String, String> inputArgs = new HashMap<>();
            FileSetArguments.setInputPath(inputArgs, "inputFile");

            // read the "purchases" FileSet from the input path set above, using FileMapper for this input
            context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);

            Map<String, String> output1Args = new HashMap<>();
            FileSetArguments.setOutputPath(output1Args, "small_purchases");
            context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output1Args).alias("small_purchases"));

            Map<String, String> output2Args = new HashMap<>();
            FileSetArguments.setOutputPath(output2Args, "large_purchases");
            context.addOutput(Output.ofDataset(SEPARATED_PURCHASES, output2Args).alias("large_purchases"));

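            // map-only job: each mapper record is routed directly to one of the aliased outputs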
            Job job = context.getHadoopJob();
            job.setMapperClass(FileMapper.class);
            job.setNumReduceTasks(0);
        }
    }

    /**
     * This is an invalid MR because it adds an output a second time, with the same alias.
     */
    public static class InvalidMapReduce extends SeparatePurchases {
        @Override
        public void beforeSubmit(MapReduceContext context) throws Exception {
            super.beforeSubmit(context);
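            // adding the alias "small_purchases" a second time must be rejected at submission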
            context.addOutput(Output.ofDataset(SEPARATED_PURCHASES).alias("small_purchases"));
        }
    }

    public static class FileMapper extends Mapper<LongWritable, Text, LongWritable, Text>
            implements ProgramLifecycle<MapReduceTaskContext<NullWritable, Text>> {

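        // CDAP task context; its write(alias, key, value) routes a record to the
        // output registered under that alias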
        private MapReduceTaskContext<NullWritable, Text> mapReduceTaskContext;

        @Override
        public void initialize(MapReduceTaskContext<NullWritable, Text> context) throws Exception {
            this.mapReduceTaskContext = context;
        }

        @Override
        public void map(LongWritable key, Text data, Context context) throws IOException, InterruptedException {
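            // the second space-separated token of each line is treated as the spend amount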
            String spend = data.toString().split(" ")[1];
            String output = Integer.parseInt(spend) > 50 ? "large_purchases" : "small_purchases";
            mapReduceTaskContext.write(output, NullWritable.get(), data);
        }

        @Override
        public void destroy() {
        }
    }

}
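
For reference, FileMapper splits each input line on a single space and treats the second token as the spend amount, so an input file would contain lines such as (illustrative values):

    widget 12
    gadget 73

The first line would be routed to the output aliased "small_purchases" and the second to "large_purchases", since the threshold is a spend greater than 50. Both aliases resolve to the same "smallPurchases" FileSet; they differ only in the output paths supplied through the runtime arguments built in beforeSubmit().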