// Java tutorial sample: parallel reader for binary-block matrices (see class below)
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.io; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.JobConf; import com.ibm.bi.dml.conf.ConfigurationManager; import com.ibm.bi.dml.hops.OptimizerUtils; import com.ibm.bi.dml.runtime.DMLRuntimeException; import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock; import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes; import com.ibm.bi.dml.runtime.matrix.mapred.IndexedMatrixValue; import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration; public class ReaderBinaryBlockParFiles extends MatrixReader { private boolean _localFS = false; private static int _numThreads = 1; public ReaderBinaryBlockParFiles(boolean localFS) { _localFS = localFS; _numThreads = OptimizerUtils.getParallelTextReadParallelism(); } public void setLocalFS(boolean flag) { _localFS = flag; } @Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException { //allocate output matrix block MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, 
false, false); //prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job); Path path = new Path((_localFS ? "file:///" : "") + fname); //check existence and non-empty file checkValidInputFile(fs, path); //core read readBinaryBlockMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen); //finally check if change of sparse/dense block representation required ret.recomputeNonZeros(); ret.examSparsity(); return ret; } /** * * @param fname * @param rlen * @param clen * @param brlen * @param bclen * @param estnnz * @return * @throws IOException * @throws DMLRuntimeException */ public ArrayList<IndexedMatrixValue> readIndexedMatrixBlocksFromHDFS(String fname, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { //allocate output matrix block collection ArrayList<IndexedMatrixValue> ret = new ArrayList<IndexedMatrixValue>(); //prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job); Path path = new Path((_localFS ? 
"file:///" : "") + fname); //check existence and non-empty file checkValidInputFile(fs, path); //core read readBinaryBlockMatrixBlocksFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen); return ret; } /** * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException * @throws DMLRuntimeException */ private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); FileInputFormat.addInputPath(job, path); ExecutorService pool = Executors.newFixedThreadPool(_numThreads); try { //create read tasks for all splits ArrayList<ReadMatrixPerPartfileTask> tasks = new ArrayList<ReadMatrixPerPartfileTask>(); for (Path lpath : getSequenceFilePaths(fs, path)) { ReadMatrixPerPartfileTask t = new ReadMatrixPerPartfileTask(lpath, job, fs, dest, rlen, clen, brlen, bclen); tasks.add(t); } //wait until all tasks have been executed pool.invokeAll(tasks); pool.shutdown(); //early error notify in case not all tasks successful for (ReadMatrixPerPartfileTask rt : tasks) { if (!rt.getReturnCode()) { throw new IOException("Read task for text input failed: " + rt.getErrMsg()); } } } catch (Exception e) { throw new IOException("Threadpool issue, while parallel read.", e); } } /** * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException */ private void readBinaryBlockMatrixBlocksFromHDFS(Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int 
bclen) throws IOException { //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); ExecutorService pool = Executors.newFixedThreadPool(_numThreads); try { //create read tasks for all splits ArrayList<ReadMatrixBlockTask> tasks = new ArrayList<ReadMatrixBlockTask>(); for (Path lpath : getSequenceFilePaths(fs, path)) { ReadMatrixBlockTask t = new ReadMatrixBlockTask(lpath, job, fs, dest, rlen, clen, brlen, bclen); tasks.add(t); } //wait until all tasks have been executed pool.invokeAll(tasks); pool.shutdown(); //early error notify in case not all tasks successful for (ReadMatrixBlockTask rt : tasks) { if (!rt.getReturnCode()) { throw new IOException("Read task for text input failed: " + rt.getErrMsg()); } } } catch (Exception e) { throw new IOException("Threadpool issue, while parallel read.", e); } } /** * * * Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat- * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is * advantageous if the actual sequence files are larger than the file splits created by * informat.getSplits (which is usually aligned to the HDFS block size) because then there is * overhead for finding the actual split between our 1k-1k blocks. This case happens * if the read matrix was create by CP or when jobs directly write to large output files * (e.g., parfor matrix partitioning). 
* */ private static class ReadMatrixPerPartfileTask implements Callable<Object> { private boolean _sparse = false; private Path _path = null; private JobConf _job = null; private FileSystem _fs = null; private MatrixBlock _dest = null; private long _rlen = -1; private long _clen = -1; private int _brlen = -1; private int _bclen = -1; private boolean _rc = true; private String _errMsg = null; public ReadMatrixPerPartfileTask(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) { _path = path; _fs = fs; _sparse = dest.isInSparseFormat(); _job = job; _dest = dest; _rlen = rlen; _clen = clen; _brlen = brlen; _bclen = bclen; } public boolean getReturnCode() { return _rc; } public String getErrMsg() { return _errMsg; } @Override public Object call() throws Exception { MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); //directly read from sequence files (individual partfiles) @SuppressWarnings("deprecation") SequenceFile.Reader reader = new SequenceFile.Reader(_fs, _path, _job); try { //note: next(key, value) does not yet exploit the given serialization classes, record reader does but is generally slower. 
while (reader.next(key, value)) { //empty block filter (skip entire block) if (value.isEmptyBlock(false)) continue; int row_offset = (int) (key.getRowIndex() - 1) * _brlen; int col_offset = (int) (key.getColumnIndex() - 1) * _bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); //bound check per block if (row_offset + rows < 0 || row_offset + rows > _rlen || col_offset + cols < 0 || col_offset + cols > _clen) { throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + _rlen + ",1:" + _clen + "]."); } //copy block to result if (_sparse) { if (cols < _clen) { synchronized (_dest) { //sparse requires lock, when matrix is wider than one block-width _dest.appendToSparse(value, row_offset, col_offset); //note: append requires final sort } } else { _dest.copy(row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false); } } else { _dest.copy(row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false); } } } finally { if (reader != null) reader.close(); } return null; } } /** * * */ private static class ReadMatrixBlockTask implements Callable<Object> { private Path _path = null; private JobConf _job = null; private FileSystem _fs = null; private Collection<IndexedMatrixValue> _dest = null; private long _rlen = -1; private long _clen = -1; private int _brlen = -1; private int _bclen = -1; private boolean _rc = true; private String _errMsg = null; public ReadMatrixBlockTask(Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen) { _path = path; _fs = fs; _job = job; _dest = dest; _rlen = rlen; _clen = clen; _brlen = brlen; _bclen = bclen; } public boolean getReturnCode() { return _rc; } public String getErrMsg() { return _errMsg; } @Override public Object call() throws Exception { MatrixIndexes key = new MatrixIndexes(); 
MatrixBlock value = new MatrixBlock(); //directly read from sequence files (individual partfiles) @SuppressWarnings("deprecation") SequenceFile.Reader reader = new SequenceFile.Reader(_fs, _path, _job); try { while (reader.next(key, value)) { int row_offset = (int) (key.getRowIndex() - 1) * _brlen; int col_offset = (int) (key.getColumnIndex() - 1) * _bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); //bound check per block if (row_offset + rows < 0 || row_offset + rows > _rlen || col_offset + cols < 0 || col_offset + cols > _clen) { throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + _rlen + ",1:" + _clen + "]."); } //copy block to result _dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value))); } } finally { if (reader != null) reader.close(); } return null; } } }