Java tutorial
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.io; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapred.JobConf; import com.ibm.bi.dml.conf.ConfigurationManager; import com.ibm.bi.dml.runtime.DMLRuntimeException; import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock; import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes; import com.ibm.bi.dml.runtime.matrix.mapred.IndexedMatrixValue; import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration; public class ReaderBinaryBlock extends MatrixReader { protected boolean _localFS = false; public ReaderBinaryBlock(boolean localFS) { _localFS = localFS; } public void setLocalFS(boolean flag) { _localFS = flag; } @Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) throws IOException, DMLRuntimeException { //allocate output matrix block MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, false, false); //prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job); Path path = new Path((_localFS ? "file:///" : "") + fname); //check existence and non-empty file checkValidInputFile(fs, path); //core read readBinaryBlockMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen); //finally check if change of sparse/dense block representation required if (!AGGREGATE_BLOCK_NNZ) ret.recomputeNonZeros(); ret.examSparsity(); return ret; } /** * * @param fname * @param rlen * @param clen * @param brlen * @param bclen * @param estnnz * @return * @throws IOException * @throws DMLRuntimeException */ public ArrayList<IndexedMatrixValue> readIndexedMatrixBlocksFromHDFS(String fname, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { //allocate output matrix block collection ArrayList<IndexedMatrixValue> ret = new ArrayList<IndexedMatrixValue>(); //prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job); Path path = new Path((_localFS ? "file:///" : "") + fname); //check existence and non-empty file checkValidInputFile(fs, path); //core read readBinaryBlockMatrixBlocksFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen); return ret; } /** * Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat- * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is * advantageous if the actual sequence files are larger than the file splits created by * informat.getSplits (which is usually aligned to the HDFS block size) because then there is * overhead for finding the actual split between our 1k-1k blocks. This case happens * if the read matrix was create by CP or when jobs directly write to large output files * (e.g., parfor matrix partitioning). * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException * @throws DMLRuntimeException */ @SuppressWarnings("deprecation") private static void readBinaryBlockMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { boolean sparse = dest.isInSparseFormat(); MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); long lnnz = 0; //aggregate block nnz //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) //1..N files { //directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { //note: next(key, value) does not yet exploit the given serialization classes, record reader does but is generally slower. while (reader.next(key, value)) { //empty block filter (skip entire block) if (value.isEmptyBlock(false)) continue; int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); //bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } //copy block to result if (sparse) { //note: append requires final sort (but prevents repeated shifting) dest.appendToSparse(value, row_offset, col_offset); } else { dest.copy(row_offset, row_offset + rows - 1, col_offset, col_offset + cols - 1, value, false); } //maintain nnz as aggregate of block nnz lnnz += value.getNonZeros(); } } finally { IOUtilFunctions.closeSilently(reader); } } //post-processing dest.setNonZeros(lnnz); if (sparse && clen > bclen) { //no need to sort if 1 column block since always sorted dest.sortSparseRows(); } } /** * * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException */ @SuppressWarnings("deprecation") private void readBinaryBlockMatrixBlocksFromHDFS(Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen) throws IOException { MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) //1..N files { //directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { while (reader.next(key, value)) { int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); //bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } //copy block to result dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value))); } } finally { IOUtilFunctions.closeSilently(reader); } } } }