co.cask.cdap.internal.app.runtime.batch.dataset.input.AppWithMapReduceUsingMultipleInputs.java Source code

Introduction

Here is the source code for co.cask.cdap.internal.app.runtime.batch.dataset.input.AppWithMapReduceUsingMultipleInputs.java, a CDAP test application whose MapReduce job reads from multiple inputs (a stream and two FileSet datasets) and joins them into a single output FileSet.

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.internal.app.runtime.batch.dataset.input;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.FileSetArguments;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * App used to test whether a MapReduce job can read multiple inputs and perform a join across them.
 */
public class AppWithMapReduceUsingMultipleInputs extends AbstractApplication {

    public static final String PURCHASES = "purchases";
    public static final String CUSTOMERS = "customers";
    public static final String OUTPUT_DATASET = "saturatedRecords";

    @Override
    public void configure() {
        setName("AppWithMapReduceUsingMultipleInputs");
        setDescription("Application with MapReduce job using multiple inputs");
        addStream(PURCHASES);
        createDataset(PURCHASES, "fileSet",
                FileSetProperties.builder().setInputFormat(TextInputFormat.class).build());
        createDataset(CUSTOMERS, "fileSet",
                FileSetProperties.builder().setInputFormat(TextInputFormat.class).build());
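        // SEPERATOR (sic) is the actual spelling of the constant defined by Hadoop's TextOutputFormat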
        createDataset(OUTPUT_DATASET, "fileSet", FileSetProperties.builder().setOutputFormat(TextOutputFormat.class)
                .setOutputProperty(TextOutputFormat.SEPERATOR, " ").build());
        addMapReduce(new ComputeSum());
        addMapReduce(new InvalidMapReduce());
    }

    /**
     * Computes the sum of each customer's spending, joining against a customer lookup table to enrich the records.
     */
    public static class ComputeSum extends AbstractMapReduce {

        @Override
        public void beforeSubmit(MapReduceContext context) throws Exception {
            Map<String, String> inputArgs = new HashMap<>();
            FileSetArguments.setInputPath(inputArgs, "inputFile");

            // test using a stream with the same name, but aliased differently (so the mapper sees the aliased name)
            context.addInput(Input.ofStream(PURCHASES).alias("streamPurchases"), StreamTestBatchMapper.class);
            context.addInput(Input.ofDataset(PURCHASES, inputArgs), FileMapper.class);
            // since we set a Mapper class on the job itself, omitting the mapper in the addInput call will default to that
            context.addInput(Input.ofDataset(CUSTOMERS, inputArgs));

            Map<String, String> outputArgs = new HashMap<>();
            FileSetArguments.setOutputPath(outputArgs, "output");
            context.addOutput(Output.ofDataset(OUTPUT_DATASET, outputArgs));

            Job job = context.getHadoopJob();
            job.setMapperClass(FileMapper.class);
            job.setReducerClass(FileReducer.class);
        }
    }

    /**
     * This is an invalid MapReduce because it adds the same input a second time, under the same (default) alias.
     */
    public static final class InvalidMapReduce extends ComputeSum {
        @Override
        public void beforeSubmit(MapReduceContext context) throws Exception {
            super.beforeSubmit(context);
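            // PURCHASES was already added by the parent without an alias, so this second
            // addInput reuses the default alias "purchases" and collides with it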
            context.addInput(Input.ofDataset(PURCHASES, ImmutableMap.of("key", "value")));
        }
    }

    public static class StreamTestBatchMapper extends Mapper<LongWritable, BytesWritable, LongWritable, Text>
            implements ProgramLifecycle<MapReduceTaskContext> {

        @Override
        protected void map(LongWritable key, BytesWritable value, Context context)
                throws IOException, InterruptedException {
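            // stream event bodies are expected to be "<customerId> <value>" pairs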
            String[] split = Bytes.toString(value.copyBytes()).split(" ");
            // tag each record with the source as 'purchases', because we know this mapper is only used for that input
            context.write(new LongWritable(Long.valueOf(split[0])), new Text("purchases " + split[1]));
        }

        @Override
        public void initialize(MapReduceTaskContext context) throws Exception {
            // we aliased the stream 'purchases' as 'streamPurchases'
            Preconditions.checkArgument("streamPurchases".equals(context.getInputName()));
        }

        @Override
        public void destroy() {
            // no-op
        }
    }

    public static class FileMapper extends Mapper<LongWritable, Text, LongWritable, Text>
            implements ProgramLifecycle<MapReduceTaskContext> {

        private String source;

        @Override
        public void initialize(MapReduceTaskContext context) throws Exception {
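            // record lifecycle invocations in system properties so a test harness can verify them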
            System.setProperty("mapper.initialized", "true");
            source = context.getInputName();
            Preconditions.checkNotNull(source);
            Preconditions.checkArgument(PURCHASES.equals(source) || CUSTOMERS.equals(source));
        }

        @Override
        public void destroy() {
            System.setProperty("mapper.destroyed", "true");
        }

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // assert that the user gets a FileSplit (as opposed to the TaggedInputSplit) from the context
            Preconditions.checkArgument(context.getInputSplit() instanceof FileSplit);
            try {
                // assert that the user gets the TextInputFormat (as opposed to the DelegatingInputFormat) from the context
                Preconditions.checkArgument(context.getInputFormatClass() == TextInputFormat.class);
            } catch (ClassNotFoundException e) {
                throw Throwables.propagate(e);
            }
        }

        @Override
        public void map(LongWritable key, Text data, Context context) throws IOException, InterruptedException {
            String[] split = data.toString().split(" ");
            // tag each record with the source, so the reducer is simpler
            context.write(new LongWritable(Long.valueOf(split[0])), new Text(source + " " + split[1]));
        }
    }

    public static class FileReducer extends Reducer<LongWritable, Text, String, String> {

        @Override
        public void reduce(LongWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String name = null;
            int totalSpend = 0;

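            // each tagged value is either "purchases <amount>" or "customers <name>";
            // accumulate the spend and pick up the name to produce the joined record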
            for (Text value : values) {
                String[] split = value.toString().split(" ");
                String source = split[0];
                String data = split[1];

                if (PURCHASES.equals(source)) {
                    totalSpend += Integer.valueOf(data);
                } else if (CUSTOMERS.equals(source)) {
                    name = data;
                }
            }
            Preconditions.checkNotNull(name);
            context.write(key.toString(), name + " " + totalSpend);
        }
    }

}
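
Example

The join this app performs can be illustrated without CDAP or Hadoop. The following standalone sketch mimics the tag-and-join flow of FileMapper and FileReducer on a few sample records; the "<customerId> <value>" record format matches what the mappers above parse, while the class name and sample data are illustrative assumptions, not part of the app.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class MultipleInputsJoinSketch {

    public static void main(String[] args) {
        // sample records in the "<customerId> <value>" format the app's mappers expect
        List<String> purchases = Arrays.asList("1001 500", "1001 600", "1002 40");
        List<String> customers = Arrays.asList("1001 samuel", "1002 jackson");

        // "map" phase: tag each record with its source and group by customer id,
        // standing in for the shuffle that Hadoop performs between map and reduce
        Map<Long, List<String>> grouped = new LinkedHashMap<>();
        for (String line : purchases) {
            String[] split = line.split(" ");
            tag(grouped, Long.parseLong(split[0]), "purchases " + split[1]);
        }
        for (String line : customers) {
            String[] split = line.split(" ");
            tag(grouped, Long.parseLong(split[0]), "customers " + split[1]);
        }

        // "reduce" phase: join the tagged records per key, mirroring FileReducer
        for (Map.Entry<Long, List<String>> entry : grouped.entrySet()) {
            String name = null;
            int totalSpend = 0;
            for (String value : entry.getValue()) {
                String[] split = value.split(" ");
                if ("purchases".equals(split[0])) {
                    totalSpend += Integer.parseInt(split[1]);
                } else if ("customers".equals(split[0])) {
                    name = split[1];
                }
            }
            System.out.println(entry.getKey() + " " + name + " " + totalSpend);
        }
    }

    private static void tag(Map<Long, List<String>> grouped, long key, String taggedValue) {
        List<String> values = grouped.get(key);
        if (values == null) {
            values = new ArrayList<>();
            grouped.put(key, values);
        }
        values.add(taggedValue);
    }
}

Running the sketch prints "1001 samuel 1100" and "1002 jackson 40", the same "<key> <name> <totalSpend>" lines that FileReducer emits through the TextOutputFormat configured on the saturatedRecords FileSet.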