Java tutorial
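
This tutorial walks through WriterBinaryBlockParallel, a class from the IBM DML (SystemML) runtime (package com.ibm.bi.dml.runtime.io) that writes a MatrixBlock to HDFS in binary block format. The writer first estimates the serialized output size, derives a degree of parallelism from it, and then either falls back to the sequential single-file write of its parent class or splits the rows into horizontal partitions and writes each partition to its own sequence file on a thread pool. The full listing follows; a hedged usage sketch appears after it.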
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.DMLConfig;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

public class WriterBinaryBlockParallel extends WriterBinaryBlock
{
    public WriterBinaryBlockParallel(int replication) {
        super(replication);
    }

    /**
     * Writes a matrix block to HDFS in binary block format, using multiple
     * threads whenever the estimated output spans more than one HDFS block.
     *
     * @param path        output path on HDFS
     * @param job         Hadoop job configuration
     * @param src         source matrix block
     * @param rlen        number of rows
     * @param clen        number of columns
     * @param brlen       number of rows per block
     * @param bclen       number of columns per block
     * @param replication HDFS replication factor
     * @throws IOException
     * @throws DMLRuntimeException
     * @throws DMLUnsupportedOperationException
     */
    @Override
    protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src,
            long rlen, long clen, int brlen, int bclen, int replication)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException
    {
        //estimate output size and number of output blocks (min 1)
        int numPartFiles = (int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, src.getNonZeros())
                / InfrastructureAnalyzer.getHDFSBlockSize());
        numPartFiles = Math.max(numPartFiles, 1);

        //determine degree of parallelism
        int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
        numThreads = Math.min(numThreads, numPartFiles);

        //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
        if (numThreads <= 1) {
            super.writeBinaryBlockMatrixToHDFS(path, job, src, rlen, clen, brlen, bclen, replication);
            return;
        }

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //create directory for concurrent tasks
        MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
        FileSystem fs = FileSystem.get(job);

        //create and execute write tasks
        try {
            ExecutorService pool = Executors.newFixedThreadPool(numThreads);
            ArrayList<WriteFileTask> tasks = new ArrayList<WriteFileTask>();
            int blklen = (int) Math.ceil((double) rlen / brlen / numThreads) * brlen;
            for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
                Path newPath = new Path(path, String.format("0-m-%05d", i));
                tasks.add(new WriteFileTask(newPath, job, fs, src,
                        i * blklen, Math.min((i + 1) * blklen, rlen), brlen, bclen, _replication));
            }

            //wait until all tasks have been executed
            List<Future<Object>> rt = pool.invokeAll(tasks);
            pool.shutdown();

            //check for exceptions
            for (Future<Object> task : rt)
                task.get();
        }
        catch (Exception e) {
            throw new IOException("Failed parallel write of binary block input.", e);
        }
    }

    /**
     * Writes one horizontal partition of the source matrix (rows [_rl, _ru))
     * to its own sequence file.
     */
    private static class WriteFileTask implements Callable<Object>
    {
        private Path _path = null;
        private JobConf _job = null;
        private FileSystem _fs = null;
        private MatrixBlock _src = null;
        private long _rl = -1;
        private long _ru = -1;
        private int _brlen = -1;
        private int _bclen = -1;
        private int _replication = 1;

        public WriteFileTask(Path path, JobConf job, FileSystem fs, MatrixBlock src,
                long rl, long ru, int brlen, int bclen, int rep)
        {
            _path = path;
            _fs = fs;
            _job = job;
            _src = src;
            _rl = rl;
            _ru = ru;
            _brlen = brlen;
            _bclen = bclen;
            _replication = rep;
        }

        @Override
        @SuppressWarnings("deprecation")
        public Object call() throws Exception
        {
            //1) create sequence file writer, with right replication factor
            //(config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
            SequenceFile.Writer writer = null;
            if (_replication > 0) //if replication specified (otherwise default)
            {
                //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication
                writer = new SequenceFile.Writer(_fs, _job, _path, MatrixIndexes.class, MatrixBlock.class,
                        _job.getInt("io.file.buffer.size", 4096), (short) _replication,
                        _fs.getDefaultBlockSize(), null, new SequenceFile.Metadata());
            }
            else {
                writer = new SequenceFile.Writer(_fs, _job, _path, MatrixIndexes.class, MatrixBlock.class);
            }

            try {
                //2) reblock and write
                MatrixIndexes indexes = new MatrixIndexes();

                //initialize blocks for reuse (at most 4 different blocks required)
                MatrixBlock[] blocks = createMatrixBlocksForReuse(_src.getNumRows(), _src.getNumColumns(),
                        _brlen, _bclen, _src.isInSparseFormat(), _src.getNonZeros());

                //create and write subblocks of matrix
                for (int blockRow = (int) _rl / _brlen; blockRow < (int) Math.ceil(_ru / (double) _brlen); blockRow++)
                    for (int blockCol = 0; blockCol < (int) Math.ceil(_src.getNumColumns() / (double) _bclen); blockCol++)
                    {
                        int maxRow = (blockRow * _brlen + _brlen < _src.getNumRows()) ?
                                _brlen : _src.getNumRows() - blockRow * _brlen;
                        int maxCol = (blockCol * _bclen + _bclen < _src.getNumColumns()) ?
                                _bclen : _src.getNumColumns() - blockCol * _bclen;

                        int row_offset = blockRow * _brlen;
                        int col_offset = blockCol * _bclen;

                        //get reuse matrix block
                        MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, _brlen, _bclen);

                        //copy submatrix to block
                        _src.sliceOperations(row_offset, row_offset + maxRow - 1,
                                col_offset, col_offset + maxCol - 1, block);

                        //append block to sequence file
                        indexes.setIndexes(blockRow + 1, blockCol + 1);
                        writer.append(indexes, block);

                        //reset block for later reuse
                        block.reset();
                    }
            }
            finally {
                IOUtilFunctions.closeSilently(writer);
            }

            return null;
        }
    }
}
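
To see the writer in action, here is a minimal usage sketch. It is not part of the original listing: it assumes a public entry point writeMatrixToHDFS(MatrixBlock, String, long, long, int, int, long) inherited from the MatrixWriter base class, as well as the MatrixBlock constructor and reset overloads shown below; verify both against your SystemML/DML version before running.

    import com.ibm.bi.dml.runtime.io.WriterBinaryBlockParallel;
    import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;

    public class ParallelWriteExample
    {
        public static void main(String[] args) throws Exception
        {
            //build a small dense source matrix; the (rlen, clen, sparse)
            //constructor and the reset-with-value overload are assumptions
            //about the MatrixBlock API, not shown in the listing above
            MatrixBlock src = new MatrixBlock(2000, 1000, false);
            src.reset(2000, 1000, 1.0d);

            //replication factor 1; the writer itself decides between the
            //sequential single-file path and the parallel part-file path
            WriterBinaryBlockParallel writer = new WriterBinaryBlockParallel(1);

            //hypothetical HDFS path; 1000x1000 block sizes; nnz = all cells
            writer.writeMatrixToHDFS(src, "hdfs:/tmp/A.binary",
                    2000, 1000, 1000, 1000, 2000L * 1000L);
        }
    }

Note that with these dimensions (2,000 x 1,000 doubles, roughly 16 MB) the estimated output fits in a single default 128 MB HDFS block, so numThreads drops to 1 and the writer takes the sequential single-file path; only larger matrices produce a directory of "0-m-%05d" part files written concurrently.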