org.apache.sysml.runtime.matrix.MMRJMR.java Source code

Introduction

Here is the source code for org.apache.sysml.runtime.matrix.MMRJMR.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.matrix;

import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixCell;
import org.apache.sysml.runtime.matrix.data.TripleIndexes;
import org.apache.sysml.runtime.matrix.mapred.MMRJMRMapper;
import org.apache.sysml.runtime.matrix.mapred.MMRJMRReducer;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups;
import org.apache.sysml.yarn.DMLAppMasterUtils;

/*
 * inBlockRepresentation: indicate whether to use block representation or cell representation
 * inputs: input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * inputInfos: the input format information for the input matrices
 * rlen: the number of rows for each matrix
 * clen: the number of columns for each matrix
 * brlen: the number of rows per block
 * bclen: the number of columns per block
 * instructionsInMapper: in Mapper, the set of unary operations that need to be performed on each input matrix
 * aggInstructionsInReducer: in Reducer, right after sorting, the set of aggreagte operations that need 
 *                      to be performed on each input matrix, 
 * aggBinInstrction: the aggregate binary instruction for the MMCJ operation
 * otherInstructionsInReducer: the mixed operations that need to be performed on matrices after the aggregate operations
 * numReducers: the number of reducers
 * replication: the replication factor for the output
 * resulltIndexes: the indexes of the result matrices that needs to be outputted.
 * outputs: the names for the output directories, one for each result index
 * outputInfos: output format information for the output matrices
 */
public class MMRJMR {
    private static final Log LOG = LogFactory.getLog(MMRJMR.class.getName());

    private MMRJMR() {
        //prevent instantiation via private constructor
    }

    public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
            long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer,
            String aggBinInstrctions, String otherInstructionsInReducer, int numReducers, int replication,
            byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
        JobConf job = new JobConf(MMRJMR.class);
        job.setJobName("MMRJ-MR");

        if (numReducers <= 0)
            throw new Exception("MMRJ-MR has to have at least one reduce task!");

        // TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
        boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

        //whether use block representation or cell representation
        MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

        byte[] realIndexes = new byte[inputs.length];
        for (byte b = 0; b < realIndexes.length; b++)
            realIndexes[b] = b;

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true,
                inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the aggregate binary operation for the mmcj job
        MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrctions);

        //set up the instructions that will happen in the reducer, after the aggregation instrucions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        // byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, otherInstructionsInReducer,
                resultIndexes, mapoutputIndexes, false);

        MatrixCharacteristics[] stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // Print the complete instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        byte[] dimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                dimsUnknown[i] = (byte) 1;
            } else {
                dimsUnknown[i] = (byte) 0;
            }
        }

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos,
                inBlockRepresentation);

        // configure mapper
        job.setMapperClass(MMRJMRMapper.class);
        job.setMapOutputKeyClass(TripleIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixCell.class);
        job.setOutputKeyComparatorClass(TripleIndexes.Comparator.class);
        job.setPartitionerClass(TripleIndexes.FirstTwoIndexesPartitioner.class);

        //configure combiner
        //TODO: cannot set up combiner, because it will destroy the stable numerical algorithms 
        // for sum or for central moments 

        //   if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
        //      job.setCombinerClass(MMCJMRCombiner.class);

        //configure reducer
        job.setReducerClass(MMRJMRReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        RunningJob runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        return new JobReturn(stats, outputInfos, runjob.isSuccessful());
    }

}