com.ibm.bi.dml.runtime.matrix.mapred.GMRMapper.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.runtime.matrix.mapred.GMRMapper.java
Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.matrix.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import com.ibm.bi.dml.runtime.matrix.data.MatrixCell;
import com.ibm.bi.dml.runtime.matrix.data.MatrixPackedCell;
import com.ibm.bi.dml.runtime.matrix.data.MatrixValue;
import com.ibm.bi.dml.runtime.matrix.data.TaggedMatrixPackedCell;
import com.ibm.bi.dml.runtime.matrix.data.TaggedMatrixValue;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

/**
 * Mapper that applies the configured mapper instructions to each input and
 * then either forwards the tagged results to the reducers or, if the job has
 * no reduce tasks (map-only job), collects them directly as final outputs.
 *
 * <p>For map-only jobs, {@link #close()} additionally reports the number of
 * non-zero cells per result and, if any result has unknown dimensions, writes
 * a per-task dimensions file.
 */
public class GMRMapper extends MapperBase implements Mapper<Writable, Writable, Writable, Writable> {

    //whether this is a map only job
    private boolean mapOnlyJob = false;

    //the final result indexes that need to be output for a map-only job
    protected byte[] resultIndexes = null;
    protected byte[] resultDimsUnknown = null;

    //output converters for map-only job
    protected CollectMultipleConvertedOutputs collectFinalMultipleOutputs;

    //the counters to record how many nonZero cells have been produced for each output
    // for map-only job
    protected long[] resultsNonZeros = null;
    protected long[] resultsMaxRowDims = null;
    protected long[] resultsMaxColDims = null;
    protected String dimsUnknownFilePrefix;

    //cached reporter, used in close() to report the number of nonZeros of this task
    protected Reporter cachedReporter = null;

    protected String mapperID;

    //temporary variables
    private TaggedMatrixValue taggedValueBuffer = null;
    private HashMap<Byte, ArrayList<Integer>> tagMapping;

    //empty block filter flags
    private boolean _filterEmptyInputBlocks = false;

    /**
     * Processes a single input key/value pair.
     *
     * <p>The reporter is cached so that {@link #close()} can report counters.
     * If empty-block filtering is enabled, empty input values are dropped
     * before any further processing.
     *
     * @param rawKey input key
     * @param rawValue input value; cast to {@code MatrixValue} when empty-block filtering is enabled
     * @param out collector for the map output
     * @param reporter reporter of the current task
     * @throws IOException if the common map runtime fails
     */
    @Override
    public void map(Writable rawKey, Writable rawValue, OutputCollector<Writable, Writable> out, Reporter reporter)
            throws IOException {
        //cache reporter for counters in close
        cachedReporter = reporter;

        //empty block input filter
        if (_filterEmptyInputBlocks && ((MatrixValue) rawValue).isEmpty()) {
            return;
        }

        //default map runtime (input converters, call to overwritten special operations)
        commonMap(rawKey, rawValue, out, reporter);
    }

    /**
     * Applies all mapper instructions for the given input and emits the results,
     * either as final outputs (map-only job) or as tagged values for the reducers.
     *
     * @param index index of the input representation being processed
     * @param out collector for the map output (used only if the job has reducers)
     * @param reporter reporter of the current task (used only for map-only jobs)
     * @throws IOException if emitting the results fails
     */
    @Override
    protected void specialOperationsForActualMap(int index, OutputCollector<Writable, Writable> out,
            Reporter reporter) throws IOException {
        //apply all instructions
        processMapperInstructionsForMatrix(index);

        //output the results, either as final outputs or to the reducers
        if (mapOnlyJob) {
            processMapFinalOutput(index, taggedValueBuffer, collectFinalMultipleOutputs, reporter, tagMapping);
        } else {
            processMapOutputToReducerForGMR(index, taggedValueBuffer, out);
        }
    }

    /**
     * Emits all cached results of the outputs registered for the given input
     * to the reducers, each tagged with its output index. Outputs without
     * cached results and {@code null} results are skipped.
     *
     * <p>The given buffer is reused for every emitted value.
     *
     * @param index index of the input whose outputs are emitted
     * @param taggedValueBuffer reusable buffer that carries value and tag
     * @param out collector receiving the (indexes, tagged value) pairs
     * @throws IOException if collecting an output fails
     */
    protected void processMapOutputToReducerForGMR(int index, TaggedMatrixValue taggedValueBuffer,
            OutputCollector<Writable, Writable> out) throws IOException {
        for (byte output : outputIndexes.get(index)) {
            ArrayList<IndexedMatrixValue> results = cachedValues.get(output);
            if (results == null) {
                continue;
            }
            for (IndexedMatrixValue result : results) {
                if (result == null) {
                    continue;
                }

                //prepare tagged output value
                //(special case for conversion from matrixcell to taggedmatrixpackedcell, e.g., ctable:
                // the result is copied into the buffer's own base object instead of being referenced)
                if (valueClass.equals(MatrixCell.class)) {
                    taggedValueBuffer.getBaseObject().copy(result.getValue());
                } else {
                    taggedValueBuffer.setBaseObject(result.getValue());
                }
                taggedValueBuffer.setTag(output);

                //collect output (exactly once)
                out.collect(result.getIndexes(), taggedValueBuffer);
            }
        }
    }

    /**
     * Collects all cached results of the outputs registered for the given input
     * as final outputs of a map-only job. Each result is passed to
     * {@code collectOutput_N_Increase_Counter} once for every result position
     * that {@code tagMapping} lists for its output index. Outputs without
     * cached results and {@code null} results are skipped.
     *
     * @param index index of the input whose outputs are collected
     * @param taggedValueBuffer reusable buffer that carries value and tag
     * @param collectFinalMultipleOutputs output converters of the map-only job
     * @param reporter reporter of the current task
     * @param tagMapping mapping from output index to the positions in the result indexes
     * @throws IOException if collecting an output fails
     */
    protected void processMapFinalOutput(int index, TaggedMatrixValue taggedValueBuffer,
            CollectMultipleConvertedOutputs collectFinalMultipleOutputs, Reporter reporter,
            HashMap<Byte, ArrayList<Integer>> tagMapping) throws IOException {
        for (byte output : outputIndexes.get(index)) {
            ArrayList<IndexedMatrixValue> results = cachedValues.get(output);
            if (results == null) {
                continue;
            }
            for (IndexedMatrixValue result : results) {
                if (result == null) {
                    continue;
                }

                //prepare tagged output value
                taggedValueBuffer.setBaseObject(result.getValue());
                taggedValueBuffer.setTag(output);

                //collect output (for all result indexes)
                for (int outputIndex : tagMapping.get(output)) {
                    collectOutput_N_Increase_Counter(result.getIndexes(), taggedValueBuffer.getBaseObject(),
                            outputIndex, reporter, collectFinalMultipleOutputs, resultDimsUnknown, resultsNonZeros,
                            resultsMaxRowDims, resultsMaxColDims);
                }
            }
        }
    }

    /**
     * Configures the mapper from the job configuration.
     *
     * <p>Reads the task id ({@code mapred.task.id}) and the dims file prefix
     * ({@code dims.unknown.file.prefix}), creates the reusable tagged value
     * buffer, and decides whether this is a map-only job (no reduce tasks).
     * Only for map-only jobs, the result indexes, the counters, the mapping from
     * output index to result positions, and the output converters are set up.
     *
     * @param job job configuration
     * @throws RuntimeException if the tagged value buffer cannot be created
     */
    @Override
    public void configure(JobConf job) {
        super.configure(job);

        mapperID = job.get("mapred.task.id");
        dimsUnknownFilePrefix = job.get("dims.unknown.file.prefix");

        _filterEmptyInputBlocks = allowsFilterEmptyInputBlocks();

        //assign the temporary variables
        try {
            if (job.getMapOutputValueClass().equals(TaggedMatrixPackedCell.class)) {
                taggedValueBuffer = TaggedMatrixValue.createObject(MatrixPackedCell.class);
            } else {
                taggedValueBuffer = TaggedMatrixValue.createObject(valueClass);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        //decide whether it is a map-only job; everything below is only needed for map-only jobs
        mapOnlyJob = (job.getNumReduceTasks() <= 0);
        if (!mapOnlyJob) {
            return;
        }

        //get the indexes of the final output matrices
        resultIndexes = MRJobConfiguration.getResultIndexes(job);
        resultDimsUnknown = MRJobConfiguration.getResultDimsUnknown(job);

        //initialize SystemML Counters (defined in MRJobConfiguration)
        resultsNonZeros = new long[resultIndexes.length];
        resultsMaxRowDims = new long[resultIndexes.length];
        resultsMaxColDims = new long[resultIndexes.length];

        //map each output index to all positions at which it occurs in the result indexes
        tagMapping = new HashMap<Byte, ArrayList<Integer>>();
        for (int i = 0; i < resultIndexes.length; i++) {
            byte output = resultIndexes[i];
            ArrayList<Integer> positions = tagMapping.get(output);
            if (positions == null) {
                positions = new ArrayList<Integer>();
                tagMapping.put(output, positions);
            }
            positions.add(i);
        }

        //for map-only job, get the map output converters
        collectFinalMultipleOutputs = MRJobConfiguration.getMultipleConvertedOutputs(job);
    }

    /**
     * Finishes the task. For a map-only job that processed at least one record
     * (i.e., a reporter has been cached), the non-zero counters are reported and,
     * if required, the dimensions file is written. The output converters are
     * closed in any case, even if the reporting fails.
     *
     * @throws IOException if writing the dimensions file or closing the outputs fails
     * @throws RuntimeException if a dimensions file is required but the task id has an unrecognized format
     */
    @Override
    public void close() throws IOException {
        try {
            if (cachedReporter != null && mapOnlyJob) {
                reportMapOnlyStatistics();
            }
        } finally {
            //always release the output converters, even if reporting failed
            if (collectFinalMultipleOutputs != null) {
                collectFinalMultipleOutputs.close();
            }
        }
    }

    /**
     * Reports the number of non-zero cells of every result and, if at least one
     * result has unknown dimensions (e.g., ctable), writes the dimensions file
     * of this task.
     */
    private void reportMapOnlyStatistics() throws IOException {
        boolean dimsUnknown = false;
        for (int i = 0; i < resultIndexes.length; i++) {
            cachedReporter.incrCounter(MRJobConfiguration.NUM_NONZERO_CELLS, Integer.toString(i),
                    resultsNonZeros[i]);

            if (resultDimsUnknown != null && resultDimsUnknown[i] != (byte) 0) {
                dimsUnknown = true;
            }
        }

        if (dimsUnknown) {
            // every task creates a file with max_row and max_col dimensions found in that task
            MapReduceTool.writeDimsFile(buildDimsFileName(), resultDimsUnknown, resultsMaxRowDims,
                    resultsMaxColDims);
        }
    }

    /**
     * Builds the name of the dimensions file of this task from the task id.
     * The second and third '_'-separated parts of the id form the job id; the
     * task number is the last part of a "task" id and the second to last part
     * of an "attempt" id.
     */
    private String buildDimsFileName() {
        if (mapperID == null) {
            throw new RuntimeException("Unrecognized format for mapperID: " + mapperID);
        }

        String[] parts = mapperID.split("_");
        if (parts.length < 3) {
            //not enough parts to construct the job id
            throw new RuntimeException("Unrecognized format for mapperID: " + mapperID);
        }

        int taskid;
        if (parts[0].equalsIgnoreCase("task")) {
            taskid = Integer.parseInt(parts[parts.length - 1]);
        } else if (parts[0].equalsIgnoreCase("attempt")) {
            taskid = Integer.parseInt(parts[parts.length - 2]);
        } else {
            throw new RuntimeException("Unrecognized format for mapperID: " + mapperID);
        }

        String jobID = "job_" + parts[1] + "_" + parts[2];
        return dimsUnknownFilePrefix + "/" + jobID + "_dimsFile/" + "m_" + taskid;
    }
}