com.ibm.bi.dml.runtime.matrix.mapred.PartialAggregator.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.runtime.matrix.mapred.PartialAggregator.java

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.matrix.mapred;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.MatrixValue;
import com.ibm.bi.dml.runtime.matrix.operators.AggregateBinaryOperator;
import com.ibm.bi.dml.runtime.util.LocalFileUtils;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

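/**
 * Buffers partial aggregation results of matrix blocks in memory and, when the
 * result does not fit into a single buffer, pages the buffer to local files.
 * Finished blocks are eventually written to HDFS via outputToHadoop.
 */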
public class PartialAggregator extends MMCJMRCache {

    private long rlen = 0;
    private long clen = 0;
    private int brlen = 0;
    private int bclen = 0;
    private long numBlocksInRow = 0;
    private long numBlocksInColumn = 0;
    private AggregateBinaryOperator operation;

    //local file management
    private boolean memOnly = false;
    private boolean rowMajor = true;

    /**
     * Creates a partial aggregator for matrix result blocks.
     * 
     * @param conf job configuration, used for local file system access when spilling
     * @param memSize memory budget for the in-memory block buffer, in bytes
     * @param resultRlen number of rows of the result matrix
     * @param resultClen number of columns of the result matrix
     * @param blockRlen number of rows per block
     * @param blockClen number of columns per block
     * @param inRowMajor true to partition result blocks across files in row-major order, false for column-major
     * @param op aggregation operator applied when merging blocks with equal indexes
     * @param vCls concrete MatrixValue class used to allocate the buffer
     * @throws InstantiationException if vCls cannot be instantiated
     * @throws IllegalAccessException if the vCls constructor is not accessible
     * @throws IOException if the local file system cannot be initialized
     */
    public PartialAggregator(JobConf conf, long memSize, long resultRlen, long resultClen, int blockRlen,
            int blockClen, boolean inRowMajor, AggregateBinaryOperator op, Class<? extends MatrixValue> vCls)
            throws InstantiationException, IllegalAccessException, IOException {
        rlen = resultRlen;
        clen = resultClen;
        brlen = blockRlen;
        bclen = blockClen;
        numBlocksInRow = (long) Math.ceil((double) clen / (double) bclen);
        numBlocksInColumn = (long) Math.ceil((double) rlen / (double) brlen);
        operation = op;

        //allocate space for the block buffer; if the buffer could already hold the
        //entire result, there is no need for more capacity than the number of blocks
        long elementSize = 77 + 8 * Math.min(rlen, brlen) * Math.min(clen, bclen) + 20 + 12 + 12 + 4; //estimated bytes per buffer entry: matrix block, matrix indexes, pair, and linked-list integer
        int buffCapacity = (int) Math.max(Math.min((memSize / elementSize), (numBlocksInRow * numBlocksInColumn)),
                1);
        super.allocateBuffer(buffCapacity, vCls, true);

        //local file management (only needed if the result spans more than one buffer page)
        int n = (int) Math.ceil((double) (numBlocksInRow * numBlocksInColumn) / (double) _bufferCapacity);
        memOnly = (n == 1);
        if (!memOnly) {
            _job = conf;
            _fs = FileSystem.getLocal(_job);
            rowMajor = inRowMajor;
            _fileN = n;
            super.constructLocalFilePrefix("_partial_aggregator_");
            super.deleteAllWorkingFiles();
        }
    }

    /**
     * Aggregates a partial result block into the buffer, paging the buffer to and
     * from local files when the target block belongs to a different file.
     * 
     * @param indexes block indexes within the result matrix
     * @param value partial block to aggregate
     * @param leftcached true if the left input is the cached matrix (not used by the aggregation itself)
     * @throws IOException if the target file index exceeds the file limit or paging fails
     * @throws DMLUnsupportedOperationException if the aggregation operator is not supported
     * @throws DMLRuntimeException if the in-place aggregation fails
     */
    public void aggregateToBuffer(MatrixIndexes indexes, MatrixValue value, boolean leftcached)
            throws IOException, DMLUnsupportedOperationException, DMLRuntimeException {
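        //when spilling is enabled, page the buffer over to the file that covers this block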
        if (!memOnly) {
            int newFileCursor = getFileCursor(indexes);
            if (newFileCursor >= _fileN)
                throw new IOException("indexes: " + indexes + " needs to be put in file #" + newFileCursor
                        + " which exceeds the limit: " + _fileN);

            if (_fileCursor != newFileCursor) {
                super.writeBuffer();
                _fileCursor = newFileCursor;
                super.loadBuffer();
            }
        }

        aggregateToBufferHelp(indexes, value, leftcached);
    }

    /**
     * Writes all buffered and spilled blocks to the given Hadoop outputs.
     * 
     * @param outputs collector for the converted outputs
     * @param j output index to write to
     * @param reporter MapReduce progress reporter
     * @return total number of non-zero values written
     * @throws IOException if reading a spilled file or writing the output fails
     */
    public long outputToHadoop(CollectMultipleConvertedOutputs outputs, int j, Reporter reporter)
            throws IOException {
        long nonZeros = 0;

        //write out all blocks currently held in the in-memory buffer
        for (Integer ix : _bufferMap.values()) {
            outputs.collectOutput(_buffer[ix].getKey(), _buffer[ix].getValue(), j, reporter);
            nonZeros += _buffer[ix].getValue().getNonZeros();
        }

        if (!memOnly) {
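            //the page under the current cursor is held in memory, so its local file is stale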
            Path path = getFilePath(_fileCursor);
            MapReduceTool.deleteFileIfExistOnHDFS(path, _job);
        }

        //flush local fs buffer pages to hdfs
        if (!memOnly)
            for (int i = 0; i < _fileN; i++)
                if (i != _fileCursor) { //current cursor already flushed
                    Path path = getFilePath(i);
                    nonZeros += copyFileContentAndDelete(path, outputs, j, reporter);
                }

        return nonZeros;
    }

    /**
     * Removes all local working files if spilling was used.
     * 
     * @throws IOException if deleting the working files fails
     */
    public void close() throws IOException {
        if (!memOnly)
            super.deleteAllWorkingFiles();
    }

    /**
     * Aggregates the value into an existing buffered block with the same indexes,
     * or appends it to the buffer as a new block.
     * 
     * @param indexes block indexes within the result matrix
     * @param value partial block to aggregate
     * @param leftcached true if the left input is the cached matrix (not used in this implementation)
     * @throws DMLUnsupportedOperationException if the aggregation operator is not supported
     * @throws DMLRuntimeException if the in-place aggregation fails
     */
    private void aggregateToBufferHelp(MatrixIndexes indexes, MatrixValue value, boolean leftcached)
            throws DMLUnsupportedOperationException, DMLRuntimeException {
        Integer ix = _bufferMap.get(indexes);
        if (ix != null) //agg into existing block 
        {
            _buffer[ix].getValue().binaryOperationsInPlace(operation.aggOp.increOp, value);
        } else //add as new block
        {
            addToBuffer(indexes, value);
        }
    }

    /**
     * Appends the block to the end of the buffer and registers it in the buffer map.
     * 
     * @param indexes block indexes within the result matrix
     * @param value block to copy into the buffer
     */
    private void addToBuffer(MatrixIndexes indexes, MatrixValue value) {
        if (_bufferSize >= _buffer.length)
            throw new RuntimeException("indexes: " + indexes + " needs to be put in position: " + _bufferSize
                    + " which exceeds the buffer size: " + _buffer.length);

        //add to the end
        _buffer[_bufferSize].getKey().setIndexes(indexes);
        _buffer[_bufferSize].getValue().copy(value);
        _bufferMap.put(_buffer[_bufferSize].getKey(), _bufferSize);
        _bufferSize++;
    }

    /**
     * Maps block indexes to the local file that holds the buffer page for that block.
     * 
     * @param indexes block indexes within the result matrix
     * @return index of the local file responsible for the given block
     */
    private int getFileCursor(MatrixIndexes indexes) {
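        //linearize the 1-based block indexes (row- or column-major) and map ranges of _bufferCapacity blocks to one file each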
        if (rowMajor)
            return (int) (((indexes.getRowIndex() - 1) * numBlocksInRow + indexes.getColumnIndex() - 1)
                    / _bufferCapacity);
        else
            return (int) (((indexes.getColumnIndex() - 1) * numBlocksInColumn + indexes.getRowIndex() - 1)
                    / _bufferCapacity);
    }

    /**
     * Reads a spilled buffer page from the local file system, emits its blocks to
     * the given outputs, and deletes the local file.
     * 
     * @param path local file holding the spilled buffer page
     * @param outputs collector for the converted outputs
     * @param j output index to write to
     * @param reporter MapReduce progress reporter
     * @return number of non-zero values in the blocks written
     * @throws IOException if reading the file or writing the output fails
     */
    private long copyFileContentAndDelete(Path path, CollectMultipleConvertedOutputs outputs, int j,
            Reporter reporter) throws IOException {
        long nonZeros = 0;
        if (_fs.exists(path)) {
            _bufferSize = LocalFileUtils.readBlockSequenceFromLocal(path.toString(), _buffer, _bufferMap);
            for (int i = 0; i < _bufferSize; i++) {
                outputs.collectOutput(_buffer[i].getKey(), _buffer[i].getValue(), j, reporter);
                nonZeros += _buffer[i].getValue().getNonZeros();
            }
            MapReduceTool.deleteFileIfExistOnHDFS(path, _job);
        }
        return nonZeros;
    }
}
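
Example Usage

For orientation, here is a minimal, hypothetical sketch of how a reduce task might drive PartialAggregator. The class name PartialAggregatorSketch, the memory budget, and the matrix and block dimensions are illustrative assumptions, not values from SystemML itself; a real MMCJ reducer would call aggregateToBuffer once per incoming partial block rather than once.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.mapred.CollectMultipleConvertedOutputs;
import com.ibm.bi.dml.runtime.matrix.mapred.PartialAggregator;
import com.ibm.bi.dml.runtime.matrix.operators.AggregateBinaryOperator;

public class PartialAggregatorSketch {

    //hypothetical driver: aggregates one partial block and flushes all results
    public static long aggregateAndFlush(JobConf conf, AggregateBinaryOperator aggBinOp,
            MatrixIndexes blockIndexes, MatrixBlock partialBlock,
            CollectMultipleConvertedOutputs outputs, Reporter reporter) throws Exception {
        //sizes below are illustrative assumptions, not values from a real job
        PartialAggregator aggregator = new PartialAggregator(conf,
                64 * 1024 * 1024,   //memory budget for the block buffer, in bytes
                10000, 10000,       //result matrix dimensions
                1000, 1000,         //block dimensions
                true,               //row-major traversal of result blocks
                aggBinOp, MatrixBlock.class);
        try {
            //in a real reducer this call runs once per partial product block
            aggregator.aggregateToBuffer(blockIndexes, partialBlock, true);
            //emit all buffered and spilled blocks to output index 0
            return aggregator.outputToHadoop(outputs, 0, reporter);
        } finally {
            aggregator.close(); //removes any remaining local working files
        }
    }
}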