// Java tutorial
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.io; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapred.JobConf; import com.ibm.bi.dml.conf.ConfigurationManager; import com.ibm.bi.dml.hops.OptimizerUtils; import com.ibm.bi.dml.runtime.DMLRuntimeException; import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock; import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes; import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration; public class ReaderBinaryBlockParallel extends ReaderBinaryBlock { private static int _numThreads = 1; public ReaderBinaryBlockParallel(boolean localFS) { super(localFS); _numThreads = OptimizerUtils.getParallelBinaryReadParallelism(); } @Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException { //allocate output matrix block (incl block allocation for parallel) MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, true); //prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = _localFS ? 
FileSystem.getLocal(job) : FileSystem.get(job); Path path = new Path((_localFS ? "file:///" : "") + fname); //check existence and non-empty file checkValidInputFile(fs, path); //core read readBinaryBlockMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen); //finally check if change of sparse/dense block representation required if (!AGGREGATE_BLOCK_NNZ) ret.recomputeNonZeros(); ret.examSparsity(); return ret; } /** * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException * @throws DMLRuntimeException */ private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); try { //create read tasks for all files ExecutorService pool = Executors.newFixedThreadPool(_numThreads); ArrayList<ReadFileTask> tasks = new ArrayList<ReadFileTask>(); for (Path lpath : getSequenceFilePaths(fs, path)) { ReadFileTask t = new ReadFileTask(lpath, job, fs, dest, rlen, clen, brlen, bclen); tasks.add(t); } //wait until all tasks have been executed List<Future<Object>> rt = pool.invokeAll(tasks); pool.shutdown(); //check for exceptions and aggregate nnz long lnnz = 0; for (Future<Object> task : rt) lnnz += (Long) task.get(); //post-processing dest.setNonZeros(lnnz); if (dest.isInSparseFormat() && clen > bclen) { //no need to sort if 1 column block since always sorted dest.sortSparseRows(); } } catch (Exception e) { throw new IOException("Failed parallel read of binary block input.", e); } } /** * */ private static class ReadFileTask implements Callable<Object> { private Path _path = null; private JobConf _job = null; private 
FileSystem _fs = null; private MatrixBlock _dest = null; private long _rlen = -1; private long _clen = -1; private int _brlen = -1; private int _bclen = -1; public ReadFileTask(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) { _path = path; _fs = fs; _job = job; _dest = dest; _rlen = rlen; _clen = clen; _brlen = brlen; _bclen = bclen; } @Override @SuppressWarnings({ "deprecation", "resource" }) public Object call() throws Exception { boolean sparse = _dest.isInSparseFormat(); MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); long lnnz = 0; //aggregate block nnz //directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(_fs, _path, _job); try { //note: next(key, value) does not yet exploit the given serialization classes, record reader does but is generally slower. while (reader.next(key, value)) { //empty block filter (skip entire block) if (value.isEmptyBlock(false)) continue; int row_offset = (int) (key.getRowIndex() - 1) * _brlen; int col_offset = (int) (key.getColumnIndex() - 1) * _bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); //bound check per block if (row_offset + rows < 0 || row_offset + rows > _rlen || col_offset + cols < 0 || col_offset + cols > _clen) { throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + _rlen + ",1:" + _clen + "]."); } //copy block to result if (sparse) { //note: append requires final sort if (cols < _clen) { synchronized (_dest) { //sparse requires lock, when matrix is wider than one block _dest.appendToSparse(value, row_offset, col_offset); } } else _dest.appendToSparse(value, row_offset, col_offset); } else { _dest.copy(row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false); } //aggregate nnz lnnz += 
value.getNonZeros(); } } finally { IOUtilFunctions.closeSilently(reader); } return lnnz; } } }