com.ibm.bi.dml.runtime.io.ReaderTextCellParallel.java Source code

Introduction

Here is the source code for com.ibm.bi.dml.runtime.io.ReaderTextCellParallel.java
Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.util.FastStringTokenizer;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

/**
 * Parallel version of ReaderTextCell.java. To summarize, we create read tasks per split
 * and use a fixed-size thread pool, to executed these tasks. If the target matrix is dense,
 * the inserts are done lock-free. If the matrix is sparse, we use a buffer to collect
 * unordered input cells, lock the the target sparse matrix once, and append all buffered values.
 * 
 * Note MatrixMarket:
 * 1) For matrix market files each read task probes for comments until it finds data because
 *    for very small tasks or large comments, any split might encounter % or %%. Hence,
 *    the parallel reader does not do the validity check for.
 * 2) In extreme scenarios, the last comment might be in one split, and the following meta data
 *    in the subsequent split. This would create incorrect results or errors. However, this
 *    scenario is extremely unlikely (num threads > num lines if 1 comment line) and hence ignored 
 *    similar to our parallel MR setting (but there we have a 128MB guarantee).     
 * 3) However, we use MIN_FILESIZE_MM (8KB) to give guarantees for the common case of small headers
 *    in order the issue described in (2).
 * 
 */
public class ReaderTextCellParallel extends MatrixReader {

    private static final long MIN_FILESIZE_MM = 8L * 1024; //8KB

    private boolean _isMMFile = false;
    private int _numThreads = 1;

    public ReaderTextCellParallel(InputInfo info) {
        _isMMFile = (info == InputInfo.MatrixMarketInputInfo);
        _numThreads = OptimizerUtils.getParallelTextReadParallelism();
    }

    @Override
    public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
            throws IOException, DMLRuntimeException {
        //prepare file access
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fname);

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //allocate output matrix block
        MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

        //core read 
        readTextCellMatrixFromHDFS(path, job, ret, rlen, clen, brlen, bclen, _isMMFile);

        //post-processing (representation-specific, change of sparse/dense block representation)
        if (ret.isInSparseFormat())
            ret.sortSparseRows();
        else
            ret.recomputeNonZeros();
        ret.examSparsity();

        return ret;
    }

    /**
     * 
     * @param path
     * @param job
     * @param dest
     * @param rlen
     * @param clen
     * @param brlen
     * @param bclen
     * @throws IOException
     * @throws IllegalAccessException
     * @throws InstantiationException
     */
    private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
            int brlen, int bclen, boolean matrixMarket) throws IOException {
        int par = _numThreads;

        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);

        //check for min file size for matrix market (adjust num splits if necessary)
        if (_isMMFile) {
            long len = MapReduceTool.getFilesizeOnHDFS(path);
            par = (len < MIN_FILESIZE_MM) ? 1 : par;
        }

        ExecutorService pool = Executors.newFixedThreadPool(par);
        InputSplit[] splits = informat.getSplits(job, par);

        try {
            //create read tasks for all splits
            ArrayList<ReadTask> tasks = new ArrayList<ReadTask>();
            for (InputSplit split : splits) {
                ReadTask t = new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket);
                tasks.add(t);
            }

            //wait until all tasks have been executed
            pool.invokeAll(tasks);
            pool.shutdown();

            //early error notify in case not all tasks successful
            for (ReadTask rt : tasks) {
                if (!rt.getReturnCode()) {
                    throw new IOException("Read task for text input failed: " + rt.getErrMsg());
                }
            }

        } catch (Exception e) {
            throw new IOException("Threadpool issue, while parallel read.", e);
        }

    }

    /**
     * 
     * 
     */
    public static class ReadTask implements Callable<Object> {
        private InputSplit _split = null;
        private boolean _sparse = false;
        private TextInputFormat _informat = null;
        private JobConf _job = null;
        private MatrixBlock _dest = null;
        private long _rlen = -1;
        private long _clen = -1;
        private boolean _matrixMarket = false;

        private boolean _rc = true;
        private String _errMsg = null;

        public ReadTask(InputSplit split, TextInputFormat informat, JobConf job, MatrixBlock dest, long rlen,
                long clen, boolean matrixMarket) {
            _split = split;
            _sparse = dest.isInSparseFormat();
            _informat = informat;
            _job = job;
            _dest = dest;
            _rlen = rlen;
            _clen = clen;
            _matrixMarket = matrixMarket;
        }

        public boolean getReturnCode() {
            return _rc;
        }

        public String getErrMsg() {
            return _errMsg;
        }

        @Override
        public Object call() throws Exception {
            //writables for reuse during read
            LongWritable key = new LongWritable();
            Text value = new Text();

            //required for error handling
            int row = -1;
            int col = -1;

            try {
                FastStringTokenizer st = new FastStringTokenizer(' ');
                RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);

                // Read the header lines, if reading from a matrixMarket file
                if (_matrixMarket) {

                    // skip until end-of-comments (%% or %)
                    boolean foundComment = false;
                    while (reader.next(key, value) && value.toString().charAt(0) == '%') {
                        //do nothing just skip comments
                        foundComment = true;
                    }

                    //process current value (otherwise ignore following meta data)
                    if (!foundComment) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDoubleForParallel();
                        synchronized (_dest) { //sparse requires lock   
                            _dest.appendValue(row, col, lvalue);
                        }
                    }
                }

                try {
                    if (_sparse) //SPARSE<-value
                    {
                        CellBuffer buff = new CellBuffer();

                        while (reader.next(key, value)) {
                            st.reset(value.toString()); //reinit tokenizer
                            row = st.nextInt() - 1;
                            col = st.nextInt() - 1;
                            double lvalue = st.nextDoubleForParallel();

                            buff.addCell(row, col, lvalue);
                            //capacity buffer flush on demand
                            if (buff.size() >= CellBuffer.CAPACITY)
                                synchronized (_dest) { //sparse requires lock
                                    buff.flushCellBufferToMatrixBlock(_dest);
                                }
                        }

                        //final buffer flush 
                        synchronized (_dest) { //sparse requires lock
                            buff.flushCellBufferToMatrixBlock(_dest);
                        }
                    } else //DENSE<-value
                    {
                        while (reader.next(key, value)) {
                            st.reset(value.toString()); //reinit tokenizer
                            row = st.nextInt() - 1;
                            col = st.nextInt() - 1;
                            double lvalue = st.nextDoubleForParallel();
                            _dest.setValueDenseUnsafe(row, col, lvalue);
                        }
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            } catch (Exception ex) {
                //central error handling (return code, message) 
                _rc = false;
                _errMsg = ex.getMessage();

                //post-mortem error handling and bounds checking
                if (row < 0 || row + 1 > _rlen || col < 0 || col + 1 > _clen) {
                    _errMsg = "Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                            + "out of overall matrix range [1:" + _rlen + ",1:" + _clen + "]. " + ex.getMessage();
                    throw new RuntimeException(_errMsg, ex);
                } else {
                    _errMsg = "Unable to read matrix in text cell format. " + ex.getMessage();
                    throw new RuntimeException(_errMsg, ex);
                }
            }

            return null;
        }
    }

    /**
     * Useful class for buffering unordered cells before locking target onces and
     * appending all buffered cells.
     * 
     */
    public static class CellBuffer {
        public static final int CAPACITY = 102400; //100K elements 

        private int[] _rlen;
        private int[] _clen;
        private double[] _vals;
        private int _pos;

        public CellBuffer() {
            _rlen = new int[CAPACITY];
            _clen = new int[CAPACITY];
            _vals = new double[CAPACITY];
            _pos = -1;
        }

        public void addCell(int rlen, int clen, double val) {
            _pos++;
            _rlen[_pos] = rlen;
            _clen[_pos] = clen;
            _vals[_pos] = val;
        }

        public void flushCellBufferToMatrixBlock(MatrixBlock dest) {
            for (int i = 0; i <= _pos; i++)
                dest.appendValue(_rlen[i], _clen[i], _vals[i]);

            reset();
        }

        public int size() {
            return _pos + 1;
        }

        public void reset() {
            _pos = -1;
        }
    }
}