Java tutorial
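
This tutorial walks through WriterBinaryBlockParallel, a class from the IBM DML (SystemML) runtime (package com.ibm.bi.dml.runtime.io) that writes a MatrixBlock to HDFS in binary block format. The writer first estimates the serialized output size, derives a degree of parallelism from it, and then either falls back to the sequential single-file write of its parent class or splits the rows into horizontal partitions and writes each partition to its own sequence file on a thread pool. The full listing follows; a hedged usage sketch appears after it.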
/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.DMLConfig;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

public class WriterBinaryBlockParallel extends WriterBinaryBlock
{
    public WriterBinaryBlockParallel(int replication) {
        super(replication);
    }

    /**
     * Writes a matrix block to HDFS in binary block format, using multiple
     * threads whenever the estimated output spans more than one HDFS block.
     *
     * @param path        output path on HDFS
     * @param job         Hadoop job configuration
     * @param src         source matrix block
     * @param rlen        number of rows
     * @param clen        number of columns
     * @param brlen       number of rows per block
     * @param bclen       number of columns per block
     * @param replication HDFS replication factor
     * @throws IOException
     * @throws DMLRuntimeException
     * @throws DMLUnsupportedOperationException
     */
    @Override
    protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src,
            long rlen, long clen, int brlen, int bclen, int replication)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException
    {
        //estimate output size and number of output blocks (min 1)
        int numPartFiles = (int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, src.getNonZeros())
                / InfrastructureAnalyzer.getHDFSBlockSize());
        numPartFiles = Math.max(numPartFiles, 1);

        //determine degree of parallelism
        int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
        numThreads = Math.min(numThreads, numPartFiles);

        //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
        if (numThreads <= 1) {
            super.writeBinaryBlockMatrixToHDFS(path, job, src, rlen, clen, brlen, bclen, replication);
            return;
        }

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //create directory for concurrent tasks
        MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
        FileSystem fs = FileSystem.get(job);

        //create and execute write tasks
        try {
            ExecutorService pool = Executors.newFixedThreadPool(numThreads);
            ArrayList<WriteFileTask> tasks = new ArrayList<WriteFileTask>();
            int blklen = (int) Math.ceil((double) rlen / brlen / numThreads) * brlen;
            for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
                Path newPath = new Path(path, String.format("0-m-%05d", i));
                tasks.add(new WriteFileTask(newPath, job, fs, src,
                        i * blklen, Math.min((i + 1) * blklen, rlen), brlen, bclen, _replication));
            }

            //wait until all tasks have been executed
            List<Future<Object>> rt = pool.invokeAll(tasks);
            pool.shutdown();

            //check for exceptions
            for (Future<Object> task : rt)
                task.get();
        }
        catch (Exception e) {
            throw new IOException("Failed parallel write of binary block input.", e);
        }
    }

    /**
     * Writes one horizontal partition of the source matrix (rows [_rl, _ru))
     * to its own sequence file.
     */
    private static class WriteFileTask implements Callable<Object>
    {
        private Path _path = null;
        private JobConf _job = null;
        private FileSystem _fs = null;
        private MatrixBlock _src = null;
        private long _rl = -1;
        private long _ru = -1;
        private int _brlen = -1;
        private int _bclen = -1;
        private int _replication = 1;

        public WriteFileTask(Path path, JobConf job, FileSystem fs, MatrixBlock src,
                long rl, long ru, int brlen, int bclen, int rep)
        {
            _path = path;
            _fs = fs;
            _job = job;
            _src = src;
            _rl = rl;
            _ru = ru;
            _brlen = brlen;
            _bclen = bclen;
            _replication = rep;
        }

        @Override
        @SuppressWarnings("deprecation")
        public Object call() throws Exception
        {
            //1) create sequence file writer, with right replication factor
            //(config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
            SequenceFile.Writer writer = null;
            if (_replication > 0) //if replication specified (otherwise default)
            {
                //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication
                writer = new SequenceFile.Writer(_fs, _job, _path, MatrixIndexes.class, MatrixBlock.class,
                        _job.getInt("io.file.buffer.size", 4096), (short) _replication,
                        _fs.getDefaultBlockSize(), null, new SequenceFile.Metadata());
            }
            else {
                writer = new SequenceFile.Writer(_fs, _job, _path, MatrixIndexes.class, MatrixBlock.class);
            }

            try {
                //2) reblock and write
                MatrixIndexes indexes = new MatrixIndexes();

                //initialize blocks for reuse (at most 4 different blocks required)
                MatrixBlock[] blocks = createMatrixBlocksForReuse(_src.getNumRows(), _src.getNumColumns(),
                        _brlen, _bclen, _src.isInSparseFormat(), _src.getNonZeros());

                //create and write subblocks of matrix
                for (int blockRow = (int) _rl / _brlen; blockRow < (int) Math.ceil(_ru / (double) _brlen); blockRow++)
                    for (int blockCol = 0; blockCol < (int) Math.ceil(_src.getNumColumns() / (double) _bclen); blockCol++)
                    {
                        int maxRow = (blockRow * _brlen + _brlen < _src.getNumRows()) ?
                                _brlen : _src.getNumRows() - blockRow * _brlen;
                        int maxCol = (blockCol * _bclen + _bclen < _src.getNumColumns()) ?
                                _bclen : _src.getNumColumns() - blockCol * _bclen;

                        int row_offset = blockRow * _brlen;
                        int col_offset = blockCol * _bclen;

                        //get reuse matrix block
                        MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, _brlen, _bclen);

                        //copy submatrix to block
                        _src.sliceOperations(row_offset, row_offset + maxRow - 1,
                                col_offset, col_offset + maxCol - 1, block);

                        //append block to sequence file
                        indexes.setIndexes(blockRow + 1, blockCol + 1);
                        writer.append(indexes, block);

                        //reset block for later reuse
                        block.reset();
                    }
            }
            finally {
                IOUtilFunctions.closeSilently(writer);
            }

            return null;
        }
    }
}
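
To see the writer in action, here is a minimal usage sketch. It is not part of the original listing: it assumes a public entry point writeMatrixToHDFS(MatrixBlock, String, long, long, int, int, long) inherited from the MatrixWriter base class, as well as the MatrixBlock constructor and reset overloads shown below; verify both against your SystemML/DML version before running.

    import com.ibm.bi.dml.runtime.io.WriterBinaryBlockParallel;
    import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;

    public class ParallelWriteExample
    {
        public static void main(String[] args) throws Exception
        {
            //build a small dense source matrix; the (rlen, clen, sparse)
            //constructor and the reset-with-value overload are assumptions
            //about the MatrixBlock API, not shown in the listing above
            MatrixBlock src = new MatrixBlock(2000, 1000, false);
            src.reset(2000, 1000, 1.0d);

            //replication factor 1; the writer itself decides between the
            //sequential single-file path and the parallel part-file path
            WriterBinaryBlockParallel writer = new WriterBinaryBlockParallel(1);

            //hypothetical HDFS path; 1000x1000 block sizes; nnz = all cells
            writer.writeMatrixToHDFS(src, "hdfs:/tmp/A.binary",
                    2000, 1000, 1000, 1000, 2000L * 1000L);
        }
    }

Note that with these dimensions (2,000 x 1,000 doubles, roughly 16 MB) the estimated output fits in a single default 128 MB HDFS block, so numThreads drops to 1 and the writer takes the sequential single-file path; only larger matrices produce a directory of "0-m-%05d" part files written concurrently.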