Java tutorial
/*
 * Copyright (C) 2017 Republic Wireless
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.rw.legion;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import java.io.IOException;

/**
 * Default Mapper class used by Legion. Takes a <code>NullWritable</code> key
 * and a <code>LegionRecord</code> value, loops through all output tables and
 * columns specified by the current <code>LegionObjective</code>, validates and
 * cleans the data, and yields a <code>NullWritable</code> key and a
 * CSV-formatted <code>Text</code> value, which will be written to a file by
 * the <code>TextOutputFormat</code>.
 */
public class DefaultMapper
        extends Mapper<NullWritable, LegionRecord, NullWritable, Text> {
    protected LegionObjective objective;
    protected MultipleOutputs<NullWritable, Text> outputWriters;
    private Text outputLine = new Text();
    private NullWritable nothing = NullWritable.get();

    /**
     * Do standard Hadoop setup, de-serialize the <code>LegionObjective</code>,
     * and prepare for writing to multiple output files.
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public void setup(Context context) {
        Configuration config = context.getConfiguration();

        this.objective = ObjectiveDeserializer
                .deserialize(config.get("legion_objective"));

        outputWriters = new MultipleOutputs(context);
    }

    /**
     * Loops through all output tables and columns specified by the current
     * <code>LegionObjective</code>, validates and cleans the data, and yields
     * a <code>NullWritable</code> key and a CSV-formatted <code>Text</code>
     * value, which will be written to a file by the
     * <code>TextOutputFormat</code>.
     *
     * @param key  A <code>NullWritable</code>.
     * @param value  The current <code>LegionRecord</code> being worked on.
     * @param context  The Hadoop <code>Context</code>.
     */
    public void map(NullWritable key, LegionRecord value, Context context)
            throws IOException, InterruptedException {
        for (OutputTable outputTable : objective.getOutputTables()) {
            if (outputTable.hasIndexes()) {
                IndexComboEnumerator enumerator =
                        value.findIndexValues(outputTable);

                // No need to output this table if there were no index values.
                if (enumerator.getSize() > 0) {
                    for (IndexCombo indexCombo : enumerator) {
                        /*
                         * Generate a list of keys to extract from the
                         * LegionRecord by replacing index names with current
                         * index values. Then try to extract data for those
                         * keys and output.
                         */
                        String[] modifiedKeys =
                                outputTable.getColumnKeys().clone();

                        for (int i = 0; i < modifiedKeys.length; i++) {
                            for (String name : outputTable.getIndexNames()) {
                                name = "<" + name + ">";
                                modifiedKeys[i] = modifiedKeys[i].replace(name,
                                        indexCombo.getValue(name));
                            }
                        }

                        tryOutput(outputTable, value, modifiedKeys);
                    }
                }
            } else {
                /*
                 * We don't have any indexes, so just try to extract data and
                 * output!
                 */
                tryOutput(outputTable, value);
            }
        }
    }

    /**
     * Validates the data flowing to each output column, and writes output.
     *
     * @param output  The current <code>OutputTable</code>.
     * @param value  The current <code>LegionRecord</code>.
     */
    private void tryOutput(OutputTable output, LegionRecord value)
            throws IOException, InterruptedException {
        // We're not overriding column keys, so just use the standard ones.
        tryOutput(output, value, output.getColumnKeys());
    }

    /**
     * Validates the data flowing to each output column, and writes output.
     *
     * @param outputTable  The current <code>OutputTable</code>.
     * @param value  The current <code>LegionRecord</code>.
     * @param keyList  A list of data keys to extract from the
     *     <code>LegionRecord</code>.
     */
    private void tryOutput(OutputTable outputTable, LegionRecord value,
            String[] keyList) throws IOException, InterruptedException {
        int i = 0;
        boolean validates = true;
        String[] dataToWrite = new String[outputTable.getColumns().size()];

        for (OutputColumn column : outputTable.getColumns()) {
            if (column.validates(keyList[i], value)) {
                column.transform(keyList[i], value);
                dataToWrite[i] =
                        StringEscapeUtils.escapeCsv(value.getData(keyList[i]));
            } else {
                // Validation failed: record where and why, then skip the row.
                dataToWrite = new String[4];
                dataToWrite[0] = value.getData("file_name");
                dataToWrite[1] = value.getData("file_line");
                dataToWrite[2] = column.getKey();
                dataToWrite[3] = column.getFailureReason();
                validates = false;

                break;
            }

            i++;
        }

        outputLine.set(StringUtils.join(dataToWrite, ","));

        if (validates) {
            outputWriters.write(outputTable.getTitle(), nothing, outputLine,
                    outputTable.getTitle());
        } else {
            outputWriters.write("skipped", nothing, outputLine, "skipped");
        }
    }

    /**
     * Standard Hadoop cleanup.
     */
    public void cleanup(Context context)
            throws IOException, InterruptedException {
        outputWriters.close();
    }
}
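
For context, a map-only driver could wire this mapper into a Hadoop job roughly as sketched below. This is a minimal sketch, not part of the Legion source: the DefaultMapperDriver class, the LegionInputFormat placeholder, and the way the objective JSON reaches the driver (here, a command-line argument) are assumptions. The only requirements actually implied by the mapper above are that the serialized objective is stored under the "legion_objective" configuration key and that every named output it writes to (each output table title, plus "skipped") is registered with MultipleOutputs before the job starts.

// Hypothetical driver sketch; class and argument layout are assumptions.
package com.rw.legion;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class DefaultMapperDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // DefaultMapper.setup() deserializes the objective from this key, so
        // the driver must place the objective JSON in the Configuration.
        // Assumption: the JSON is passed in as the third argument.
        conf.set("legion_objective", args[2]);

        Job job = Job.getInstance(conf, "legion-default-mapper");
        job.setJarByClass(DefaultMapperDriver.class);
        job.setMapperClass(DefaultMapper.class);
        job.setNumReduceTasks(0);  // map-only: the mapper emits finished CSV lines

        // Assumption: an input format that yields NullWritable / LegionRecord
        // pairs, e.g. job.setInputFormatClass(LegionInputFormat.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // Register the named outputs the mapper writes to: one per output
        // table title in the objective, plus "skipped" for rejected rows.
        MultipleOutputs.addNamedOutput(job, "skipped", TextOutputFormat.class,
                NullWritable.class, Text.class);
        // for (OutputTable table : objective.getOutputTables()) {
        //     MultipleOutputs.addNamedOutput(job, table.getTitle(),
        //             TextOutputFormat.class, NullWritable.class, Text.class);
        // }

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the mapper writes its own CSV lines through MultipleOutputs, the job runs with zero reducers, and valid rows land in files named after each output table's title while rejected rows are collected under "skipped".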