Java tutorial: Apache SystemML's MRJobConfiguration (org.apache.sysml.runtime.matrix.mapred)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.matrix.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.instructions.MRInstructionParser;
import org.apache.sysml.runtime.instructions.mr.AggregateBinaryInstruction;
import org.apache.sysml.runtime.instructions.mr.AggregateInstruction;
import org.apache.sysml.runtime.instructions.mr.AppendGInstruction;
import org.apache.sysml.runtime.instructions.mr.AppendMInstruction;
import org.apache.sysml.runtime.instructions.mr.BinaryMInstruction;
import org.apache.sysml.runtime.instructions.mr.CM_N_COVInstruction;
import org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction;
import org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction;
import org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction;
import org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.MapMultChainInstruction;
import org.apache.sysml.runtime.instructions.mr.PMMJMRInstruction;
import org.apache.sysml.runtime.instructions.mr.ReblockInstruction;
import org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction;
import org.apache.sysml.runtime.instructions.mr.UnaryMRInstructionBase;
import org.apache.sysml.runtime.io.BinaryBlockSerialization;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.AddDummyWeightConverter;
import org.apache.sysml.runtime.matrix.data.BinaryBlockToBinaryCellConverter;
import org.apache.sysml.runtime.matrix.data.BinaryBlockToRowBlockConverter;
import org.apache.sysml.runtime.matrix.data.BinaryBlockToTextCellConverter;
import org.apache.sysml.runtime.matrix.data.BinaryCellToRowBlockConverter;
import org.apache.sysml.runtime.matrix.data.BinaryCellToTextConverter;
import org.apache.sysml.runtime.matrix.data.Converter;
import org.apache.sysml.runtime.matrix.data.IdenticalConverter;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixValue;
import org.apache.sysml.runtime.matrix.data.MultipleOutputCommitter;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TextCellToRowBlockConverter;
import org.apache.sysml.runtime.matrix.data.TextToBinaryCellConverter;
import org.apache.sysml.runtime.matrix.data.WeightedCellToSortInputConverter;
import org.apache.sysml.runtime.matrix.data.WeightedPair;
import org.apache.sysml.runtime.matrix.data.hadoopfix.MultipleInputs;
import org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;

@SuppressWarnings({ "rawtypes", "deprecation" })
public class MRJobConfiguration {

  //internal param: custom deserializer/serializer (usually 30% faster than WritableSerialization)
  public static final boolean USE_BINARYBLOCK_SERIALIZATION = true;

  //Job configurations
  public static IDSequence seq = new IDSequence();

  //input matrices
  private static final String INPUT_MATRICIES_DIRS_CONFIG = "input.matrices.dirs";
  //this is here to handle record reader instructions
  private static final String MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG = "mapfuc.input.matrices.indexes";

  //about the formats of inputs
  private static final String BLOCK_REPRESENTATION_CONFIG = "in.block.representation";
  private static final String WEIGHTEDCELL_REPRESENTATION_CONFIG = "in.weighted.cell.representation";
  private static final String INPUT_CONVERTER_CLASS_PREFIX_CONFIG = "input.converter.class.for.";
  private static final String INPUT_KEY_CLASS_PREFIX_CONFIG = "input.key.class.for.";
  private static final String INPUT_VALUE_CLASS_PREFIX_CONFIG = "input.value.class.for.";

  //characteristics about input matrices
  private static final String INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG = "input.matrix.num.row.";
  private static final String INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG = "input.matrix.num.column.";
  private static final String INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG = "input.block.num.row.";
  private static final String INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG = "input.block.num.column.";
  private static final String INPUT_MATRIX_NUM_NNZ_PREFIX_CONFIG = "input.matrix.num.nnz.";

  //characteristics about the matrices of the map outputs
  private static final String MAPOUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG = "map.output.matrix.num.row.";
  private static final String MAPOUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG = "map.output.matrix.num.column.";
  private static final String MAPOUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG = "map.output.block.num.row.";
  private static final String MAPOUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG = "map.output.block.num.column.";

  //operations performed in the mapper
  private static final String INSTRUCTIONS_IN_MAPPER_CONFIG = "instructions.in.mapper";
  private static final String RAND_INSTRUCTIONS_CONFIG = "rand.instructions";
  //matrix indexes to be output to the reducer
  private static final String OUTPUT_INDEXES_IN_MAPPER_CONFIG = "output.indexes.in.mapper";

  //parfor serialized program
  private static final String PARFOR_PROGRAMBLOCKS_CONFIG = "parfor.programblocks.in.mr";
  private static final String PARFOR_CACHING_CONFIG = "parfor.cp.caching";

  //partitioning input/output info
  private static final String PARTITIONING_INPUT_MATRIX_NUM_ROW_CONFIG = "partitioning.input.matrix.num.row";
  private static final String PARTITIONING_INPUT_MATRIX_NUM_COLUMN_CONFIG = "partitioning.input.matrix.num.column";
  private static final String PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG = "partitioning.input.block.num.row";
  private static final String PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG = "partitioning.input.block.num.column";
  private static final String PARTITIONING_INPUT_INFO_CONFIG = "partitioning.input.inputinfo";
  private static final String PARTITIONING_OUTPUT_INFO_CONFIG = "partitioning.output.outputinfo";
  private static final String PARTITIONING_OUTPUT_FORMAT_CONFIG = "partitioning.output.format";
  private static final String PARTITIONING_OUTPUT_N_CONFIG = "partitioning.output.n";
  private static final String PARTITIONING_OUTPUT_FILENAME_CONFIG = "partitioning.output.filename";
  private static final String PARTITIONING_ITERVAR_CONFIG = "partitioning.itervar";
  private static final String PARTITIONING_MATRIXVAR_CONFIG = "partitioning.matrixvar";
  private static final String PARTITIONING_TRANSPOSE_COL_CONFIG = "partitioning.transposed.col";
  private static final String PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG = "partitioning.output.keep.indexes";

  //result merge info
  private static final String RESULTMERGE_INPUT_INFO_CONFIG = "resultmerge.input.inputinfo";
  private static final String RESULTMERGE_COMPARE_FILENAME_CONFIG = "resultmerge.compare.filename";
  private static final String RESULTMERGE_STAGING_DIR_CONFIG = "resultmerge.staging.dir";
  private static final String RESULTMERGE_MATRIX_NUM_ROW_CONFIG = "resultmerge.matrix.num.row";
  private static final String RESULTMERGE_MATRIX_NUM_COLUMN_CONFIG = "resultmerge.matrix.num.column";
  private static final String RESULTMERGE_BLOCK_NUM_ROW_CONFIG = "resultmerge.block.num.row";
  private static final String RESULTMERGE_BLOCK_NUM_COLUMN_CONFIG = "resultmerge.block.num.column";

  private static final String SORT_PARTITION_FILENAME = "sort.partition.filename";

  //operations performed in the reducer
  private static final String AGGREGATE_INSTRUCTIONS_CONFIG = "aggregate.instructions.after.groupby.at";
  private static final String INSTRUCTIONS_IN_REDUCER_CONFIG = "instructions.in.reducer";
  private static final String AGGREGATE_BINARY_INSTRUCTIONS_CONFIG = "aggregate.binary.instructions";
  private static final String REBLOCK_INSTRUCTIONS_CONFIG = "reblock.instructions";
  private static final String CSV_REBLOCK_INSTRUCTIONS_CONFIG = "csv.reblock.instructions";
  private static final String CSV_WRITE_INSTRUCTIONS_CONFIG = "csv.write.instructions";
  private static final String COMBINE_INSTRUCTIONS_CONFIG = "combine.instructions";
  private static final String CM_N_COV_INSTRUCTIONS_CONFIG = "cm_n_com.instructions";
  private static final String GROUPEDAGG_INSTRUCTIONS_CONFIG = "groupedagg.instructions";

  //characteristics about the matrices of aggregate binary instructions
  private static final String AGGBIN_MATRIX_NUM_ROW_PREFIX_CONFIG = "aggbin.matrix.num.row.";
  private static final String AGGBIN_MATRIX_NUM_COLUMN_PREFIX_CONFIG = "aggbin.matrix.num.column.";
  private static final String AGGBIN_BLOCK_NUM_ROW_PREFIX_CONFIG = "aggbin.block.num.row.";
  private static final String AGGBIN_BLOCK_NUM_COLUMN_PREFIX_CONFIG = "aggbin.block.num.column.";

  //characteristics about the output matrices
  private static final String OUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG = "output.matrix.num.row.";
  private static final String OUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG = "output.matrix.num.column.";
  private static final String OUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG = "output.block.num.row.";
  private static final String OUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG = "output.block.num.column.";

  //characteristics about the matrices of reblock instructions
  private static final String REBLOCK_MATRIX_NUM_ROW_PREFIX_CONFIG = "reblock.matrix.num.row.";
  private static final String REBLOCK_MATRIX_NUM_COLUMN_PREFIX_CONFIG = "reblock.matrix.num.column.";
  private static final String REBLOCK_BLOCK_NUM_ROW_PREFIX_CONFIG = "reblock.block.num.row.";
  private static final String REBLOCK_BLOCK_NUM_COLUMN_PREFIX_CONFIG = "reblock.block.num.column.";
  private static final String REBLOCK_MATRIX_NUM_NNZ_PREFIX_CONFIG = "reblock.matrix.num.nnz.";

  //characteristics about the matrices of matrixdiag instructions
  private static final String INTERMEDIATE_MATRIX_NUM_ROW_PREFIX_CONFIG = "rdiag.matrix.num.row.";
  private static final String INTERMEDIATE_MATRIX_NUM_COLUMN_PREFIX_CONFIG = "rdiag.matrix.num.column.";
  private static final String INTERMEDIATE_BLOCK_NUM_ROW_PREFIX_CONFIG = "rdiag.block.num.row.";
  private static final String INTERMEDIATE_BLOCK_NUM_COLUMN_PREFIX_CONFIG = "rdiag.block.num.column.";

  //matrix indexes to be output as final results
  private static final String RESULT_INDEXES_CONFIG = "results.indexes";
  private static final String RESULT_DIMS_UNKNOWN_CONFIG = "results.dims.unknown";
  private static final String INTERMEDIATE_INDEXES_CONFIG = "rdiag.indexes";

  //output matrices and their formats
  public static final String OUTPUT_MATRICES_DIRS_CONFIG = "output.matrices.dirs";
  private static final String OUTPUT_CONVERTER_CLASS_PREFIX_CONFIG = "output.converter.class.for.";
  private static final String DIMS_UNKNOWN_FILE_PREFIX = "dims.unknown.file.prefix";
  private static final String MMCJ_CACHE_SIZE = "mmcj.cache.size";
  private static final String DISTCACHE_INPUT_INDICES = "distcache.input.indices";
  private static final String DISTCACHE_INPUT_PATHS = "distcache.input.paths";
  private static final String SYSTEMML_LOCAL_TMP_DIR = "systemml.local.tmp.dir";

  /*
   * SystemML Counter Group names
   *
   * group name for the counters on number of output nonZeros
   */
  public static final String NUM_NONZERO_CELLS = "nonzeros";

  public static final int getMiscMemRequired(JobConf job) {
    return job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096);
  }

  public static void setMMCJCacheSize(JobConf job, long size) { job.setLong(MMCJ_CACHE_SIZE, size); }

  public static long getMMCJCacheSize(JobConf job) { return job.getLong(MMCJ_CACHE_SIZE, 0); }

  public static void setMatrixValueClass(JobConf job, boolean blockRepresentation) {
    job.setBoolean(BLOCK_REPRESENTATION_CONFIG, blockRepresentation);
  }

  public static void setMatrixValueClassForCM_N_COM(JobConf job, boolean weightedCellRepresentation) {
    job.setBoolean(WEIGHTEDCELL_REPRESENTATION_CONFIG, weightedCellRepresentation);
  }
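  // Illustrative usage sketch (not part of the original class): the setters above
  // simply round-trip typed key/value pairs through the JobConf, e.g.
  //   JobConf job = new JobConf();
  //   MRJobConfiguration.setMatrixValueClass(job, true);      //block representation
  //   MRJobConfiguration.setMMCJCacheSize(job, 1024 * 1024);  //1MB MMCJ cache
  //   //getMatrixValueClass(job) now returns MatrixBlock.class,
  //   //and getMMCJCacheSize(job) returns 1048576.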
  public static Class<? extends MatrixValue> getMatrixValueClass(JobConf job) {
    if (job.getBoolean(WEIGHTEDCELL_REPRESENTATION_CONFIG, false))
      return WeightedPair.class;
    if (job.getBoolean(BLOCK_REPRESENTATION_CONFIG, true))
      return MatrixBlock.class;
    else
      return MatrixCell.class;
  }

  public static enum ConvertTarget { CELL, BLOCK, WEIGHTEDCELL, CSVWRITE }

  public static Class<? extends Converter> getConverterClass(InputInfo inputinfo, int brlen, int bclen, ConvertTarget target) {
    Class<? extends Converter> converterClass = IdenticalConverter.class;
    if (inputinfo.inputValueClass.equals(MatrixCell.class)) {
      switch (target) {
        case CELL:         converterClass = IdenticalConverter.class; break;
        case BLOCK:        throw new RuntimeException("cannot convert binary cell to binary block representation implicitly");
        case WEIGHTEDCELL: converterClass = AddDummyWeightConverter.class; break;
        case CSVWRITE:     converterClass = BinaryCellToRowBlockConverter.class; break;
      }
    }
    else if (inputinfo.inputValueClass.equals(MatrixBlock.class)) {
      switch (target) {
        case CELL:         converterClass = BinaryBlockToBinaryCellConverter.class; break;
        case BLOCK:        converterClass = IdenticalConverter.class; break;
        case WEIGHTEDCELL: converterClass = AddDummyWeightConverter.class; break;
        case CSVWRITE:     converterClass = BinaryBlockToRowBlockConverter.class; break;
      }
    }
    else if (inputinfo.inputValueClass.equals(Text.class)) {
      switch (target) {
        case CELL:         converterClass = TextToBinaryCellConverter.class; break;
        case BLOCK:
          if (brlen > 1 || bclen > 1)
            throw new RuntimeException("cannot convert text cell to binary block representation implicitly");
          else
            converterClass = TextToBinaryCellConverter.class;
          break;
        case WEIGHTEDCELL: converterClass = AddDummyWeightConverter.class; break;
        case CSVWRITE:     converterClass = TextCellToRowBlockConverter.class; break;
      }
    }
    return converterClass;
  }

  /**
   * Unique working dirs required for thread-safe submission of parallel jobs;
   * otherwise job.xml and other files might be overridden (in local mode).
   *
   * @param job job configuration
   */
  public static void setUniqueWorkingDir(JobConf job) {
    if (InfrastructureAnalyzer.isLocalMode(job)) {
      StringBuilder tmp = new StringBuilder();
      tmp.append(Lop.FILE_SEPARATOR);
      tmp.append(Lop.PROCESS_PREFIX);
      tmp.append(DMLScript.getUUID());
      tmp.append(Lop.FILE_SEPARATOR);
      tmp.append(seq.getNextID());
      String uniqueSubdir = tmp.toString();

      //unique local dir
      String[] dirlist = job.get(MRConfigurationNames.MR_CLUSTER_LOCAL_DIR, "/tmp").split(",");
      StringBuilder sb2 = new StringBuilder();
      for (String dir : dirlist) {
        if (sb2.length() > 0)
          sb2.append(",");
        sb2.append(dir);
        sb2.append(uniqueSubdir);
      }
      job.set(MRConfigurationNames.MR_CLUSTER_LOCAL_DIR, sb2.toString());

      //unique system dir
      job.set(MRConfigurationNames.MR_JOBTRACKER_SYSTEM_DIR,
        job.get(MRConfigurationNames.MR_JOBTRACKER_SYSTEM_DIR) + uniqueSubdir);

      //unique staging dir
      job.set(MRConfigurationNames.MR_JOBTRACKER_STAGING_ROOT_DIR,
        job.get(MRConfigurationNames.MR_JOBTRACKER_STAGING_ROOT_DIR) + uniqueSubdir);
    }
  }

  public static String getLocalWorkingDirPrefix(JobConf job) {
    return job.get(MRConfigurationNames.MR_CLUSTER_LOCAL_DIR);
  }

  public static String getSystemWorkingDirPrefix(JobConf job) {
    return job.get(MRConfigurationNames.MR_JOBTRACKER_SYSTEM_DIR);
  }

  public static String getStagingWorkingDirPrefix(JobConf job) {
    return job.get(MRConfigurationNames.MR_JOBTRACKER_STAGING_ROOT_DIR);
  }

  public static void setInputInfo(JobConf job, byte input, InputInfo inputinfo, int brlen, int bclen, ConvertTarget target) {
    Class<? extends Converter> converterClass = getConverterClass(inputinfo, brlen, bclen, target);
    job.setClass(INPUT_CONVERTER_CLASS_PREFIX_CONFIG + input, converterClass, Converter.class);
    job.setClass(INPUT_KEY_CLASS_PREFIX_CONFIG + input, inputinfo.inputKeyClass, Writable.class);
    job.setClass(INPUT_VALUE_CLASS_PREFIX_CONFIG + input, inputinfo.inputValueClass, Writable.class);
  }

  public static void setOutputInfo(JobConf job, int i, OutputInfo outputinfo, boolean sourceInBlock) throws DMLRuntimeException {
    Class<? extends Converter> converterClass;
    if (sourceInBlock) {
      if (outputinfo.outputValueClass.equals(MatrixCell.class))
        converterClass = BinaryBlockToBinaryCellConverter.class;
      else if (outputinfo.outputValueClass.equals(Text.class))
        converterClass = BinaryBlockToTextCellConverter.class;
      else if (outputinfo.outputValueClass.equals(MatrixBlock.class))
        converterClass = IdenticalConverter.class;
      else if (outputinfo.outputValueClass.equals(IntWritable.class))
        converterClass = WeightedCellToSortInputConverter.class;
      else if (outputinfo.outputValueClass.equals(WeightedPair.class))
        converterClass = IdenticalConverter.class;
      else
        converterClass = IdenticalConverter.class;
    }
    else {
      if (outputinfo.outputValueClass.equals(MatrixCell.class))
        converterClass = IdenticalConverter.class;
      else if (outputinfo.outputValueClass.equals(Text.class))
        converterClass = BinaryCellToTextConverter.class;
      else if (outputinfo.outputValueClass.equals(IntWritable.class))
        converterClass = WeightedCellToSortInputConverter.class;
      else if (outputinfo.outputValueClass.equals(WeightedPair.class))
        converterClass = IdenticalConverter.class;
      else
        throw new DMLRuntimeException("unsupported conversion: " + outputinfo.outputValueClass);
      // converterClass=IdenticalConverter.class;
    }
    job.setClass(OUTPUT_CONVERTER_CLASS_PREFIX_CONFIG + i, converterClass, Converter.class);
  }

  public static Converter getInputConverter(JobConf job, byte input) {
    Converter inputConverter;
    try {
      inputConverter = (Converter) job
        .getClass(INPUT_CONVERTER_CLASS_PREFIX_CONFIG + input, IdenticalConverter.class).newInstance();
    }
    catch (Exception e) {
      throw new RuntimeException(e);
    }
    return inputConverter;
  }

  public static Converter getOuputConverter(JobConf job, int i) {
    Converter outputConverter;
    try {
      outputConverter = (Converter) job
        .getClass(OUTPUT_CONVERTER_CLASS_PREFIX_CONFIG + i, IdenticalConverter.class).newInstance();
    }
    catch (Exception e) {
      throw new RuntimeException(e);
    }
    return outputConverter;
  }

  public static MRInstruction[] getInstructionsInReducer(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseMixedInstructions(job.get(INSTRUCTIONS_IN_REDUCER_CONFIG));
  }

  public static ReblockInstruction[] getReblockInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseReblockInstructions(job.get(REBLOCK_INSTRUCTIONS_CONFIG));
  }

  public static CSVReblockInstruction[] getCSVReblockInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseCSVReblockInstructions(job.get(CSV_REBLOCK_INSTRUCTIONS_CONFIG));
  }

  public static CSVWriteInstruction[] getCSVWriteInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseCSVWriteInstructions(job.get(CSV_WRITE_INSTRUCTIONS_CONFIG));
  }
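  // Illustrative sketch (assumed inputs, not in the original file): getConverterClass
  // selects the Converter from the input value class and the ConvertTarget, e.g.
  //   Class<? extends Converter> c = MRJobConfiguration.getConverterClass(
  //     InputInfo.TextCellInputInfo, 1, 1, ConvertTarget.CELL);
  //   //c == TextToBinaryCellConverter.class; requesting ConvertTarget.BLOCK with
  //   //brlen/bclen > 1 would throw, since text cells cannot be blocked implicitly.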
  public static AggregateInstruction[] getAggregateInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseAggregateInstructions(job.get(AGGREGATE_INSTRUCTIONS_CONFIG));
  }

  public static MRInstruction[] getCombineInstruction(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseCombineInstructions(job.get(COMBINE_INSTRUCTIONS_CONFIG));
  }

  public static MRInstruction[] getInstructionsInMapper(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseMixedInstructions(job.get(INSTRUCTIONS_IN_MAPPER_CONFIG));
  }

  //parfor configurations
  public static void setProgramBlocks(JobConf job, String sProgramBlocks) {
    job.set(PARFOR_PROGRAMBLOCKS_CONFIG, sProgramBlocks);
  }

  public static String getProgramBlocks(JobConf job) {
    return job.get(PARFOR_PROGRAMBLOCKS_CONFIG);
  }

  public static void setParforCachingConfig(JobConf job, boolean flag) {
    job.setBoolean(PARFOR_CACHING_CONFIG, flag);
  }

  public static boolean getParforCachingConfig(JobConf job) {
    return job.getBoolean(PARFOR_CACHING_CONFIG, true);
  }

  //partitioning configurations
  public static void setPartitioningInfo(JobConf job, long rlen, long clen, int brlen, int bclen, InputInfo ii,
      OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew) throws DMLRuntimeException {
    job.set(PARTITIONING_INPUT_MATRIX_NUM_ROW_CONFIG, String.valueOf(rlen));
    job.set(PARTITIONING_INPUT_MATRIX_NUM_COLUMN_CONFIG, String.valueOf(clen));
    job.set(PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG, String.valueOf(brlen));
    job.set(PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG, String.valueOf(bclen));
    job.set(PARTITIONING_INPUT_INFO_CONFIG, InputInfo.inputInfoToString(ii));
    job.set(PARTITIONING_OUTPUT_INFO_CONFIG, OutputInfo.outputInfoToString(oi));
    job.set(PARTITIONING_OUTPUT_FORMAT_CONFIG, dpf.toString());
    job.set(PARTITIONING_OUTPUT_N_CONFIG, String.valueOf(n));
    job.set(PARTITIONING_OUTPUT_FILENAME_CONFIG, fnameNew);
  }

  public static void setPartitioningInfo(JobConf job, long rlen, long clen, int brlen, int bclen, InputInfo ii,
      OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew, String itervar, String matrixvar,
      boolean tSparseCol) throws DMLRuntimeException {
    //set basic partitioning information
    setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, dpf, n, fnameNew);

    //set iteration variable name (used for ParFor-DPE)
    job.set(PARTITIONING_ITERVAR_CONFIG, itervar);

    //set matrix variable name (used for ParFor-DPE)
    job.set(PARTITIONING_MATRIXVAR_CONFIG, matrixvar);

    //set transpose sparse column vector
    job.setBoolean(PARTITIONING_TRANSPOSE_COL_CONFIG, tSparseCol);
  }

  public static void setPartitioningInfo(JobConf job, long rlen, long clen, int brlen, int bclen, InputInfo ii,
      OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew, boolean keepIndexes) throws DMLRuntimeException {
    //set basic partitioning information
    setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, dpf, n, fnameNew);

    //set keep indexes flag
    job.setBoolean(PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG, keepIndexes);
  }

  public static MatrixCharacteristics getPartitionedMatrixSize(JobConf job) {
    return new MatrixCharacteristics(
      Long.parseLong(job.get(PARTITIONING_INPUT_MATRIX_NUM_ROW_CONFIG)),
      Long.parseLong(job.get(PARTITIONING_INPUT_MATRIX_NUM_COLUMN_CONFIG)),
      Integer.parseInt(job.get(PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG)),
      Integer.parseInt(job.get(PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG)));
  }

  public static void setPartitioningBlockNumRows(JobConf job, int brlen) {
    job.set(PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG, String.valueOf(brlen));
  }

  public static void setPartitioningBlockNumCols(JobConf job, int bclen) {
    job.set(PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG, String.valueOf(bclen));
  }

  public static InputInfo getPartitioningInputInfo(JobConf job) {
    return InputInfo.stringToInputInfo(job.get(PARTITIONING_INPUT_INFO_CONFIG));
  }

  public static OutputInfo getPartitioningOutputInfo(JobConf job) {
    return OutputInfo.stringToOutputInfo(job.get(PARTITIONING_OUTPUT_INFO_CONFIG));
  }

  public static void setPartitioningFormat(JobConf job, PDataPartitionFormat dpf) {
    job.set(PARTITIONING_OUTPUT_FORMAT_CONFIG, dpf.toString());
  }

  public static PDataPartitionFormat getPartitioningFormat(JobConf job) {
    return PDataPartitionFormat.valueOf(job.get(PARTITIONING_OUTPUT_FORMAT_CONFIG));
  }

  public static int getPartitioningSizeN(JobConf job) {
    return Integer.parseInt(job.get(PARTITIONING_OUTPUT_N_CONFIG));
  }

  public static boolean getPartitioningIndexFlag(JobConf job) {
    return Boolean.parseBoolean(job.get(PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG));
  }

  public static void setPartitioningFilename(JobConf job, String fname) {
    job.set(PARTITIONING_OUTPUT_FILENAME_CONFIG, fname);
  }

  public static String getPartitioningFilename(JobConf job) {
    return job.get(PARTITIONING_OUTPUT_FILENAME_CONFIG);
  }

  public static String getPartitioningItervar(JobConf job) {
    return job.get(PARTITIONING_ITERVAR_CONFIG);
  }

  public static String getPartitioningMatrixvar(JobConf job) {
    return job.get(PARTITIONING_MATRIXVAR_CONFIG);
  }

  public static boolean getPartitioningTransposedCol(JobConf job) {
    return job.getBoolean(PARTITIONING_TRANSPOSE_COL_CONFIG, false);
  }

  public static void setResultMergeInfo(JobConf job, String fnameNew, InputInfo ii, String stagingDir, long rlen,
      long clen, int brlen, int bclen) throws DMLRuntimeException {
    job.set(RESULTMERGE_COMPARE_FILENAME_CONFIG, fnameNew);
    job.set(RESULTMERGE_INPUT_INFO_CONFIG, InputInfo.inputInfoToString(ii));
    job.set(RESULTMERGE_STAGING_DIR_CONFIG, stagingDir);
    job.set(RESULTMERGE_MATRIX_NUM_ROW_CONFIG, String.valueOf(rlen));
    job.set(RESULTMERGE_MATRIX_NUM_COLUMN_CONFIG, String.valueOf(clen));
    job.set(RESULTMERGE_BLOCK_NUM_ROW_CONFIG, String.valueOf(brlen));
    job.set(RESULTMERGE_BLOCK_NUM_COLUMN_CONFIG, String.valueOf(bclen));
  }

  public static String getResultMergeInfoCompareFilename(JobConf job) {
    return job.get(RESULTMERGE_COMPARE_FILENAME_CONFIG);
  }

  public static InputInfo getResultMergeInputInfo(JobConf job) {
    return InputInfo.stringToInputInfo(job.get(RESULTMERGE_INPUT_INFO_CONFIG));
  }

  public static long[] getResultMergeMatrixCharacteristics(JobConf job) {
    long[] ret = new long[4];
    ret[0] = Long.parseLong(job.get(RESULTMERGE_MATRIX_NUM_ROW_CONFIG));
    ret[1] = Long.parseLong(job.get(RESULTMERGE_MATRIX_NUM_COLUMN_CONFIG));
    ret[2] = Long.parseLong(job.get(RESULTMERGE_BLOCK_NUM_ROW_CONFIG));
    ret[3] = Long.parseLong(job.get(RESULTMERGE_BLOCK_NUM_COLUMN_CONFIG));
    return ret;
  }

  public static byte[] getInputIndexesInMapper(JobConf job) {
    String[] istrs = job.get(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG).split(Instruction.INSTRUCTION_DELIM);
    return stringArrayToByteArray(istrs);
  }

  public static byte[] getOutputIndexesInMapper(JobConf job) {
    String[] istrs = job.get(OUTPUT_INDEXES_IN_MAPPER_CONFIG).split(Instruction.INSTRUCTION_DELIM);
    return stringArrayToByteArray(istrs);
  }
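  // Illustrative round-trip (assumed sizes, not in the original file): setPartitioningInfo
  // serializes matrix and block dimensions as strings, getPartitionedMatrixSize parses them back:
  //   MRJobConfiguration.setPartitioningInfo(job, 10000, 100, 1000, 1000,
  //     InputInfo.BinaryBlockInputInfo, OutputInfo.BinaryBlockOutputInfo,
  //     PDataPartitionFormat.ROW_BLOCK_WISE_N, 10, "out");
  //   MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
  //   //mc: 10000 x 100 matrix in 1000 x 1000 blocks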
  //get the indexes that this matrix file represents,
  //since one matrix file can occur multiple times in a statement
  public static ArrayList<Byte> getInputMatrixIndexesInMapper(JobConf job) throws IOException {
    String[] matrices = job.getStrings(INPUT_MATRICIES_DIRS_CONFIG);
    String str = job.get(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG);
    byte[] indexes;
    if (str == null || str.isEmpty()) {
      indexes = new byte[matrices.length];
      for (int i = 0; i < indexes.length; i++)
        indexes[i] = (byte) i;
    }
    else {
      String[] strs = str.split(Instruction.INSTRUCTION_DELIM);
      indexes = new byte[strs.length];
      for (int i = 0; i < strs.length; i++)
        indexes[i] = Byte.parseByte(strs[i]);
    }

    int numMatrices = matrices.length;
    if (numMatrices > Byte.MAX_VALUE)
      throw new RuntimeException("number of matrices is too large > " + Byte.MAX_VALUE);
    for (int i = 0; i < matrices.length; i++)
      matrices[i] = new Path(matrices[i]).toString();

    Path thisFile = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
    FileSystem fs = IOUtilFunctions.getFileSystem(thisFile, job);
    thisFile = thisFile.makeQualified(fs);
    Path thisDir = thisFile.getParent().makeQualified(fs);
    ArrayList<Byte> representativeMatrixes = new ArrayList<>();
    for (int i = 0; i < matrices.length; i++) {
      Path p = new Path(matrices[i]).makeQualified(fs);
      if (thisFile.toUri().equals(p.toUri()) || thisDir.toUri().equals(p.toUri()))
        representativeMatrixes.add(indexes[i]);
    }
    return representativeMatrixes;
  }

  /*public static void setMatrixToCacheInMMCJ(JobConf job, boolean left) {
    job.setBoolean(CACHE_LEFT_MATRIX_FOR_MMCJ_CONFIG, left);
  }

  public static boolean getMatrixToCacheInMMCJ(JobConf job) {
    return job.getBoolean(CACHE_LEFT_MATRIX_FOR_MMCJ_CONFIG, true);
  }*/

  public static void setInstructionsInMapper(JobConf job, String instructionsInMapper) {
    job.set(INSTRUCTIONS_IN_MAPPER_CONFIG, instructionsInMapper);
  }

  public static void setAggregateInstructions(JobConf job, String aggInstructionsInReducer) {
    job.set(AGGREGATE_INSTRUCTIONS_CONFIG, aggInstructionsInReducer);
  }

  public static void setReblockInstructions(JobConf job, String reblockInstructions) {
    job.set(REBLOCK_INSTRUCTIONS_CONFIG, reblockInstructions);
  }

  public static void setCSVReblockInstructions(JobConf job, String reblockInstructions) {
    job.set(CSV_REBLOCK_INSTRUCTIONS_CONFIG, reblockInstructions);
  }

  public static void setCSVWriteInstructions(JobConf job, String csvWriteInstructions) {
    job.set(CSV_WRITE_INSTRUCTIONS_CONFIG, csvWriteInstructions);
  }

  public static void setCombineInstructions(JobConf job, String combineInstructions) {
    job.set(COMBINE_INSTRUCTIONS_CONFIG, combineInstructions);
  }

  public static void setInstructionsInReducer(JobConf job, String instructionsInReducer) {
    if (instructionsInReducer != null)
      job.set(INSTRUCTIONS_IN_REDUCER_CONFIG, instructionsInReducer);
  }

  public static void setAggregateBinaryInstructions(JobConf job, String aggBinInstrctions) {
    job.set(AGGREGATE_BINARY_INSTRUCTIONS_CONFIG, aggBinInstrctions);
  }

  public static void setCM_N_COMInstructions(JobConf job, String cmInstrctions) {
    job.set(CM_N_COV_INSTRUCTIONS_CONFIG, cmInstrctions);
  }

  public static void setGroupedAggInstructions(JobConf job, String grpaggInstructions) {
    job.set(GROUPEDAGG_INSTRUCTIONS_CONFIG, grpaggInstructions);
  }

  public static void setRandInstructions(JobConf job, String randInstrctions) {
    job.set(RAND_INSTRUCTIONS_CONFIG, randInstrctions);
  }

  // TODO: check Rand
  public static DataGenMRInstruction[] getDataGenInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseDataGenInstructions(job.get(RAND_INSTRUCTIONS_CONFIG));
  }

  public static AggregateBinaryInstruction[] getAggregateBinaryInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseAggregateBinaryInstructions(job.get(AGGREGATE_BINARY_INSTRUCTIONS_CONFIG));
  }

  public static CM_N_COVInstruction[] getCM_N_COVInstructions(JobConf job) throws DMLRuntimeException {
    return MRInstructionParser.parseCM_N_COVInstructions(job.get(CM_N_COV_INSTRUCTIONS_CONFIG));
  }

  public static GroupedAggregateInstruction[] getGroupedAggregateInstructions(JobConf job) throws DMLRuntimeException {
    //parse all grouped aggregate instructions
    String str = job.get(GROUPEDAGG_INSTRUCTIONS_CONFIG);
    GroupedAggregateInstruction[] tmp = MRInstructionParser.parseGroupedAggInstructions(str);

    //obtain bclen for all instructions
    for (int i = 0; i < tmp.length; i++) {
      byte tag = tmp[i].input;
      tmp[i].setBclen(getMatrixCharacteristicsForInput(job, tag).getColsPerBlock());
    }
    return tmp;
  }

  public static String[] getOutputs(JobConf job) {
    return job.getStrings(OUTPUT_MATRICES_DIRS_CONFIG);
  }

  private static byte[] stringArrayToByteArray(String[] istrs) {
    byte[] ret = new byte[istrs.length];
    for (int i = 0; i < istrs.length; i++)
      ret[i] = Byte.parseByte(istrs[i]);
    return ret;
  }

  public static byte[] getResultIndexes(JobConf job) {
    String[] istrs = job.get(RESULT_INDEXES_CONFIG).split(Instruction.INSTRUCTION_DELIM);
    return stringArrayToByteArray(istrs);
  }

  public static byte[] getResultDimsUnknown(JobConf job) {
    String str = job.get(RESULT_DIMS_UNKNOWN_CONFIG);
    if (str == null || str.isEmpty())
      return null;
    String[] istrs = str.split(Instruction.INSTRUCTION_DELIM);
    return stringArrayToByteArray(istrs);
  }

  public static byte[] getIntermediateMatrixIndexes(JobConf job) {
    String str = job.get(INTERMEDIATE_INDEXES_CONFIG);
    if (str == null || str.isEmpty())
      return null;
    String[] istrs = str.split(Instruction.INSTRUCTION_DELIM);
    return stringArrayToByteArray(istrs);
  }

  public static void setIntermediateMatrixIndexes(JobConf job, HashSet<Byte> indexes) {
    job.set(INTERMEDIATE_INDEXES_CONFIG, getIndexesString(indexes));
  }

  public static void setDimsUnknownFilePrefix(JobConf job, String prefix) {
    job.setStrings(DIMS_UNKNOWN_FILE_PREFIX, prefix);
  }

  public static void setMatricesDimensions(JobConf job, byte[] inputIndexes, long[] rlens, long[] clens) {
    if (rlens.length != clens.length)
      throw new RuntimeException("rlens.length should be clens.length");
    for (int i = 0; i < rlens.length; i++)
      setMatrixDimension(job, inputIndexes[i], rlens[i], clens[i]);
  }

  public static void setMatricesDimensions(JobConf job, byte[] inputIndexes, long[] rlens, long[] clens, long[] nnz) {
    if (rlens.length != clens.length)
      throw new RuntimeException("rlens.length should be clens.length");
    for (int i = 0; i < rlens.length; i++)
      setMatrixDimension(job, inputIndexes[i], rlens[i], clens[i], nnz[i]);
  }

  public static void setMatrixDimension(JobConf job, byte matrixIndex, long rlen, long clen) {
    job.setLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + matrixIndex, rlen);
    job.setLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + matrixIndex, clen);
  }

  public static void setMatrixDimension(JobConf job, byte matrixIndex, long rlen, long clen, long nnz) {
    job.setLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + matrixIndex, rlen);
    job.setLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + matrixIndex, clen);
    job.setLong(INPUT_MATRIX_NUM_NNZ_PREFIX_CONFIG + matrixIndex, nnz);
  }

  public static String[] getInputPaths(JobConf job) {
    return job.getStrings(INPUT_MATRICIES_DIRS_CONFIG);
  }

  public static long getNumRows(JobConf job, byte matrixIndex) {
    return job.getLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + matrixIndex, 0);
  }

  public static long getNumColumns(JobConf job, byte matrixIndex) {
    return job.getLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + matrixIndex, 0);
  }

  public static void setBlocksSizes(JobConf job, byte[] inputIndexes, int[] brlens, int[] bclens) {
    if (brlens.length != bclens.length)
      throw new RuntimeException("brlens.length should be bclens.length");
    for (int i = 0; i < brlens.length; i++)
      setBlockSize(job, inputIndexes[i], brlens[i], bclens[i]);
  }

  public static void setBlockSize(JobConf job, byte matrixIndex, int brlen, int bclen) {
    job.setInt(INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + matrixIndex, brlen);
    job.setInt(INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + matrixIndex, bclen);
  }

  public static int getNumRowsPerBlock(JobConf job, byte matrixIndex) {
    return job.getInt(INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + matrixIndex, 1);
  }

  public static int getNumColumnsPerBlock(JobConf job, byte matrixIndex) {
    return job.getInt(INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + matrixIndex, 1);
  }

  public static long getNumNonZero(JobConf job, byte matrixIndex) {
    return job.getLong(INPUT_MATRIX_NUM_NNZ_PREFIX_CONFIG + matrixIndex, 1);
  }

  public static void setupDistCacheInputs(JobConf job, String indices, String pathsString, ArrayList<String> paths) {
    job.set(DISTCACHE_INPUT_INDICES, indices);
    job.set(DISTCACHE_INPUT_PATHS, pathsString);
    Path p = null;
    for (String spath : paths) {
      p = new Path(spath);
      DistributedCache.addCacheFile(p.toUri(), job);
      DistributedCache.createSymlink(job);
    }
  }

  public static String getDistCacheInputIndices(JobConf job) {
    return job.get(DISTCACHE_INPUT_INDICES);
  }

  private static String getCSVString(PDataPartitionFormat[] formats) {
    if (formats == null || formats.length == 0)
      return "";
    StringBuilder s = new StringBuilder();
    s.append(formats[0]);
    for (int i = 1; i < formats.length; i++) {
      s.append(",");
      s.append(formats[i]);
    }
    return s.toString();
  }

  public static void setInputPartitioningInfo(JobConf job, PDataPartitionFormat[] pformats) {
    job.set(PARTITIONING_OUTPUT_FORMAT_CONFIG, MRJobConfiguration.getCSVString(pformats));
  }

  private static PDataPartitionFormat[] csv2PFormat(String s) {
    String[] parts = s.split(",");
    PDataPartitionFormat[] pformats = new PDataPartitionFormat[parts.length];
    for (int i = 0; i < parts.length; i++) {
      pformats[i] = PDataPartitionFormat.parsePDataPartitionFormat(parts[i]);
    }
    return pformats;
  }

  public static PDataPartitionFormat[] getInputPartitionFormats(JobConf job) {
    return MRJobConfiguration.csv2PFormat(job.get(PARTITIONING_OUTPUT_FORMAT_CONFIG));
  }
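  // Illustrative round-trip (not part of the original file): partition formats are
  // stored as a comma-separated list and parsed back element-wise, e.g.
  //   MRJobConfiguration.setInputPartitioningInfo(job, new PDataPartitionFormat[] {
  //     PDataPartitionFormat.ROW_WISE, PDataPartitionFormat.COLUMN_WISE });
  //   PDataPartitionFormat[] pf = MRJobConfiguration.getInputPartitionFormats(job);
  //   //pf == [ROW_WISE, COLUMN_WISE]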
  public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos,
      int[] brlens, int[] bclens, boolean setConverter, ConvertTarget target) throws Exception {
    //conservative initialize (all jobs except GMR)
    boolean[] distCacheOnly = new boolean[inputIndexes.length];
    Arrays.fill(distCacheOnly, false);
    setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, distCacheOnly, setConverter, target);
  }

  public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos,
      int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target) throws Exception {
    if (inputs.length != inputInfos.length)
      throw new Exception("number of inputs and inputInfos does not match");

    //set up names of the input matrices and their inputformat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);

    //set up converter infos (converter determined implicitly)
    if (setConverter) {
      for (int i = 0; i < inputs.length; i++)
        setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }

    //remove redundant inputs and pure broadcast variables
    ArrayList<Path> lpaths = new ArrayList<>();
    ArrayList<InputInfo> liinfos = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
      Path p = new Path(inputs[i]);

      //check and skip redundant inputs
      if (lpaths.contains(p)   //path already included
        || distCacheOnly[i]) { //input only required in dist cache
        continue;
      }

      lpaths.add(p);
      liinfos.add(inputInfos[i]);
    }

    boolean combineInputFormat = false;
    if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
      //determine total input sizes
      double totalInputSize = 0;
      for (int i = 0; i < inputs.length; i++)
        totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));

      //set max split size (default blocksize) to 2x blocksize if (1) sort buffer large enough,
      //(2) degree of parallelism not hurt, and only a single input (except broadcasts)
      //(the sort buffer size is relevant for pass-through of, potentially modified, inputs to the reducers)
      //(the single input constraint stems from internal runtime assumptions used to relate meta data to inputs)
      long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
      long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
      long newSplitSize = sizeHDFSBlk * 2;
      //use generic config api for backwards compatibility
      double spillPercent = Double.parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
      int numPMap = OptimizerUtils.getNumMappers();
      if (numPMap < totalInputSize / newSplitSize && sizeSortBuff * spillPercent >= newSplitSize
        && lpaths.size() == 1) {
        job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
        combineInputFormat = true;
      }
    }

    //add inputs to jobs input (incl input format configuration)
    for (int i = 0; i < lpaths.size(); i++) {
      //add input to job inputs (for binaryblock we use CombineSequenceFileInputFormat to reduce task latency)
      if (combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo)
        MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
      else
        MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
    }
  }

  /**
   * Specific method because we need to set the input converter class according to the
   * input infos. Note that any mapper instruction before reblock can work on binary block
   * if it can work on binary cell as well.
   *
   * @param job job configuration
   * @param inputIndexes array of byte indexes
   * @param inputs array of input string
   * @param inputInfos array of input infos
   * @param brlens array of block row lengths
   * @param bclens array of block column lengths
   * @throws Exception if Exception occurs
   */
  public static void setUpMultipleInputsReblock(JobConf job, byte[] inputIndexes, String[] inputs,
      InputInfo[] inputInfos, int[] brlens, int[] bclens) throws Exception {
    if (inputs.length != inputInfos.length)
      throw new Exception("number of inputs and inputInfos does not match");

    //set up names of the input matrices and their inputformat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);
    for (int i = 0; i < inputs.length; i++) {
      ConvertTarget target = ConvertTarget.CELL;
      if (inputInfos[i] == InputInfo.BinaryBlockInputInfo)
        target = ConvertTarget.BLOCK;
      setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }

    //remove redundant input files
    ArrayList<Path> paths = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
      String name = inputs[i];
      Path p = new Path(name);
      boolean redundant = false;
      for (Path ep : paths)
        if (ep.equals(p)) {
          redundant = true;
          break;
        }
      if (redundant)
        continue;
      MultipleInputs.addInputPath(job, p, inputInfos[i].inputFormatClass);
      paths.add(p);
    }
  }

  public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknown, String[] outputs,
      OutputInfo[] outputInfos, boolean inBlockRepresentation, boolean mayContainCtable) throws Exception {
    if (resultIndexes.length != outputs.length)
      throw new Exception("number of outputs and result indexes does not match");
    if (outputs.length != outputInfos.length)
      throw new Exception("number of outputs and outputInfos indexes does not match");

    job.set(RESULT_INDEXES_CONFIG, MRJobConfiguration.getIndexesString(resultIndexes));
    job.set(RESULT_DIMS_UNKNOWN_CONFIG, MRJobConfiguration.getIndexesString(resultDimsUnknown));
    job.setStrings(OUTPUT_MATRICES_DIRS_CONFIG, outputs);
    job.setOutputCommitter(MultipleOutputCommitter.class);

    for (int i = 0; i < outputs.length; i++) {
      MapReduceTool.deleteFileIfExistOnHDFS(new Path(outputs[i]), job);
      if (mayContainCtable && resultDimsUnknown[i] == (byte) 1) {
        setOutputInfo(job, i, outputInfos[i], false);
      } else {
        setOutputInfo(job, i, outputInfos[i], inBlockRepresentation);
      }
      MultipleOutputs.addNamedOutput(job, Integer.toString(i), outputInfos[i].outputFormatClass,
        outputInfos[i].outputKeyClass, outputInfos[i].outputValueClass);
    }
    job.setOutputFormat(NullOutputFormat.class);

    //configure temp output
    Path tempOutputPath = new Path(constructTempOutputFilename());
    FileOutputFormat.setOutputPath(job, tempOutputPath);
    MapReduceTool.deleteFileIfExistOnHDFS(tempOutputPath, job);
  }

  public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknown,
      String[] outputs, OutputInfo[] outputInfos, boolean inBlockRepresentation) throws Exception {
    setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, inBlockRepresentation, false);
  }

  public static String setUpSortPartitionFilename(JobConf job) {
    String pfname = constructPartitionFilename();
    job.set(SORT_PARTITION_FILENAME, pfname);
    return pfname;
  }

  public static String getSortPartitionFilename(JobConf job) {
    return job.get(SORT_PARTITION_FILENAME);
  }
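  // Worked example (numbers assumed, not from the original file) for the combine-input-format
  // decision in setUpMultipleInputs above: with a 128MB HDFS block size, newSplitSize is 256MB;
  // for a single 10GB input (ca. 40 such splits), 20 configured mappers, and a sort buffer that
  // holds a full split (sizeSortBuff * spillPercent >= 256MB), the condition holds and
  // CombineSequenceFileInputFormat is used to reduce per-task latency without hurting parallelism.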
  public static MatrixChar_N_ReducerGroups computeMatrixCharacteristics(JobConf job, byte[] inputIndexes,
      String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstructions,
      String otherInstructionsInReducer, byte[] resultIndexes, HashSet<Byte> mapOutputIndexes, boolean forMMCJ)
      throws DMLRuntimeException {
    return computeMatrixCharacteristics(job, inputIndexes, null, instructionsInMapper, null,
      aggInstructionsInReducer, aggBinInstructions, otherInstructionsInReducer, resultIndexes,
      mapOutputIndexes, forMMCJ);
  }

  public static MatrixChar_N_ReducerGroups computeMatrixCharacteristics(JobConf job, byte[] inputIndexes,
      String instructionsInMapper, String reblockInstructions, String aggInstructionsInReducer,
      String aggBinInstructions, String otherInstructionsInReducer, byte[] resultIndexes,
      HashSet<Byte> mapOutputIndexes, boolean forMMCJ) throws DMLRuntimeException {
    return computeMatrixCharacteristics(job, inputIndexes, null, instructionsInMapper, reblockInstructions,
      aggInstructionsInReducer, aggBinInstructions, otherInstructionsInReducer, resultIndexes,
      mapOutputIndexes, forMMCJ);
  }

  public static void setNumReducers(JobConf job, long numReducerGroups, int numFromCompiler) throws IOException {
    JobClient client = new JobClient(job);
    int n = client.getClusterStatus().getMaxReduceTasks();
    //correction max number of reducers on yarn clusters
    if (InfrastructureAnalyzer.isYarnEnabled())
      n = (int) Math.max(n, YarnClusterAnalyzer.getNumCores() / 2);
    n = Math.min(n, ConfigurationManager.getNumReducers());
    n = Math.min(n, numFromCompiler);
    if (numReducerGroups > 0)
      n = (int) Math.min(n, numReducerGroups);
    job.setNumReduceTasks(n);
  }

  public static class MatrixChar_N_ReducerGroups {
    public MatrixCharacteristics[] stats;
    public long numReducerGroups = 0;

    public MatrixChar_N_ReducerGroups(MatrixCharacteristics[] sts, long ng) {
      stats = sts;
      numReducerGroups = ng;
    }
  }

  /**
   * NOTE: this method needs to be in-sync with MRBaseForCommonInstructions.processOneInstruction,
   * otherwise, the latter will potentially fail with missing dimension information.
   *
   * @param job job configuration
   * @param inputIndexes array of byte indexes
   * @param dataGenInstructions data gen instructions as a string
   * @param instructionsInMapper instruction in mapper as a string
   * @param reblockInstructions reblock instructions as a string
   * @param aggInstructionsInReducer aggregate instructions in reducer as a string
   * @param aggBinInstructions binary aggregate instructions as a string
   * @param otherInstructionsInReducer other instructions in reducer as a string
   * @param resultIndexes array of byte result indexes
   * @param mapOutputIndexes set of map output indexes
   * @param forMMCJ true if the job performs an MMCJ (which has a single aggregate binary operation)
   * @return reducer groups
   * @throws DMLRuntimeException if DMLRuntimeException occurs
   */
  public static MatrixChar_N_ReducerGroups computeMatrixCharacteristics(JobConf job, byte[] inputIndexes,
      String dataGenInstructions, String instructionsInMapper, String reblockInstructions,
      String aggInstructionsInReducer, String aggBinInstructions, String otherInstructionsInReducer,
      byte[] resultIndexes, HashSet<Byte> mapOutputIndexes, boolean forMMCJ) throws DMLRuntimeException {
    HashSet<Byte> intermediateMatrixIndexes = new HashSet<>();
    HashMap<Byte, MatrixCharacteristics> dims = new HashMap<>();
    for (byte i : inputIndexes) {
      MatrixCharacteristics dim = new MatrixCharacteristics(getNumRows(job, i), getNumColumns(job, i),
        getNumRowsPerBlock(job, i), getNumColumnsPerBlock(job, i), getNumNonZero(job, i));
      dims.put(i, dim);
    }

    DataGenMRInstruction[] dataGenIns = MRInstructionParser.parseDataGenInstructions(dataGenInstructions);
    if (dataGenIns != null) {
      for (DataGenMRInstruction ins : dataGenIns) {
        MatrixCharacteristics.computeDimension(dims, ins);
      }
    }

    MRInstruction[] insMapper = MRInstructionParser.parseMixedInstructions(instructionsInMapper);
    if (insMapper != null) {
      for (MRInstruction ins : insMapper) {
        MatrixCharacteristics.computeDimension(dims, ins);
        if (ins instanceof UnaryMRInstructionBase) {
          UnaryMRInstructionBase tempIns = (UnaryMRInstructionBase) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input, dims.get(tempIns.input));
          intermediateMatrixIndexes.add(tempIns.input);
        }
        else if (ins instanceof AppendMInstruction) {
          AppendMInstruction tempIns = (AppendMInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
          intermediateMatrixIndexes.add(tempIns.input1);
        }
        else if (ins instanceof AppendGInstruction) {
          AppendGInstruction tempIns = (AppendGInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
          intermediateMatrixIndexes.add(tempIns.input1);
        }
        else if (ins instanceof BinaryMInstruction) {
          BinaryMInstruction tempIns = (BinaryMInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
          intermediateMatrixIndexes.add(tempIns.input1);
        }
        else if (ins instanceof AggregateBinaryInstruction) {
          AggregateBinaryInstruction tempIns = (AggregateBinaryInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
          intermediateMatrixIndexes.add(tempIns.input1); //TODO
        }
        else if (ins instanceof MapMultChainInstruction) {
          MapMultChainInstruction tempIns = (MapMultChainInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.getInput1(), dims.get(tempIns.getInput2()));
          intermediateMatrixIndexes.add(tempIns.getInput1());
        }
        else if (ins instanceof PMMJMRInstruction) {
          PMMJMRInstruction tempIns = (PMMJMRInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input2, dims.get(tempIns.input2));
          intermediateMatrixIndexes.add(tempIns.input2);
        }
      }
    }

    ReblockInstruction[] reblockIns = MRInstructionParser.parseReblockInstructions(reblockInstructions);
    if (reblockIns != null) {
      for (ReblockInstruction ins : reblockIns) {
        MatrixCharacteristics.computeDimension(dims, ins);
        setMatrixCharactristicsForReblock(job, ins.output, dims.get(ins.output));
      }
    }

    Instruction[] aggIns = MRInstructionParser.parseAggregateInstructions(aggInstructionsInReducer);
    if (aggIns != null) {
      for (Instruction ins : aggIns) {
        MatrixCharacteristics.computeDimension(dims, (MRInstruction) ins);

        //if the instruction's output is not in resultIndexes, then add its dimensions to the jobconf
        MRInstruction mrins = (MRInstruction) ins;
        boolean found = false;
        for (byte b : resultIndexes) {
          if (b == mrins.output) {
            found = true;
            break;
          }
        }
        if (!found) {
          setIntermediateMatrixCharactristics(job, mrins.output, dims.get(mrins.output));
          intermediateMatrixIndexes.add(mrins.output);
        }
      }
    }

    long numReduceGroups = 0;
    AggregateBinaryInstruction[] aggBinIns = getAggregateBinaryInstructions(job);
    if (aggBinIns != null) {
      for (AggregateBinaryInstruction ins : aggBinIns) {
        MatrixCharacteristics dim1 = dims.get(ins.input1);
        MatrixCharacteristics dim2 = dims.get(ins.input2);
        setMatrixCharactristicsForBinAgg(job, ins.input1, dim1);
        setMatrixCharactristicsForBinAgg(job, ins.input2, dim2);
        MatrixCharacteristics.computeDimension(dims, ins);
        if (forMMCJ) //there will be only one aggbin operation for MMCJ
          numReduceGroups = (long) Math.ceil((double) dim1.getCols() / (double) dim1.getColsPerBlock());
      }
    }

    if (!forMMCJ) {
      //store the skylines
      ArrayList<Long> xs = new ArrayList<>(mapOutputIndexes.size());
      ArrayList<Long> ys = new ArrayList<>(mapOutputIndexes.size());
      for (byte idx : mapOutputIndexes) {
        MatrixCharacteristics dim = dims.get(idx);
        long x = (long) Math.ceil((double) dim.getRows() / (double) dim.getRowsPerBlock());
        long y = (long) Math.ceil((double) dim.getCols() / (double) dim.getColsPerBlock());

        int i = 0;
        boolean toadd = true;
        while (i < xs.size()) {
          if ((x >= xs.get(i) && y > ys.get(i)) || (x > xs.get(i) && y >= ys.get(i))) {
            //remove any included x's and y's
            xs.remove(i);
            ys.remove(i);
          }
          else if (x <= xs.get(i) && y <= ys.get(i)) { //if included in others, stop
            toadd = false;
            break;
          }
          else
            i++;
        }
        if (toadd) {
          xs.add(x);
          ys.add(y);
        }
      }
      //sort by x
      TreeMap<Long, Long> map = new TreeMap<>();
      for (int i = 0; i < xs.size(); i++)
        map.put(xs.get(i), ys.get(i));
      numReduceGroups = 0;
      //compute area
      long prev = 0;
      for (Entry<Long, Long> e : map.entrySet()) {
        numReduceGroups += (e.getKey() - prev) * e.getValue();
        prev = e.getKey();
      }
    }

    MRInstruction[] insReducer = MRInstructionParser.parseMixedInstructions(otherInstructionsInReducer);
    if (insReducer != null) {
      for (MRInstruction ins : insReducer) {
        MatrixCharacteristics.computeDimension(dims, ins);
        if (ins instanceof UnaryMRInstructionBase) {
          UnaryMRInstructionBase tempIns = (UnaryMRInstructionBase) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input, dims.get(tempIns.input));
          intermediateMatrixIndexes.add(tempIns.input);
        }
        else if (ins instanceof RemoveEmptyMRInstruction) {
          RemoveEmptyMRInstruction tempIns = (RemoveEmptyMRInstruction) ins;
          setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
          intermediateMatrixIndexes.add(tempIns.input1);
        }

        //if the instruction's output is not in resultIndexes, then add its dimensions to the jobconf
        boolean found = false;
        for (byte b : resultIndexes) {
          if (b == ins.output) {
            found = true;
            break;
          }
        }
        if (!found) {
          setIntermediateMatrixCharactristics(job, ins.output, dims.get(ins.output));
          intermediateMatrixIndexes.add(ins.output);
        }
      }
    }

    setIntermediateMatrixIndexes(job, intermediateMatrixIndexes);

    for (byte tag : mapOutputIndexes)
      setMatrixCharactristicsForMapperOutput(job, tag, dims.get(tag));

    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
      MatrixCharacteristics resultDims = dims.get(resultIndexes[i]);
      stats[i] = resultDims;
      setMatrixCharactristicsForOutput(job, resultIndexes[i], stats[i]);
    }

    return new MatrixChar_N_ReducerGroups(stats, numReduceGroups);
  }
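  // Worked example (illustrative) of the skyline-based estimate above: map outputs with
  // block grids (x, y) of (4, 2) and (2, 3) do not dominate each other, so both stay on
  // the skyline; sorted by x, the covered area and hence the number of reducer groups is
  // 2*3 + (4-2)*2 = 10.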
static void setIntermediateMatrixCharactristics(JobConf job, byte tag, MatrixCharacteristics dim) { job.setLong(INTERMEDIATE_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, dim.getRows()); job.setLong(INTERMEDIATE_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getCols()); job.setInt(INTERMEDIATE_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, dim.getRowsPerBlock()); job.setInt(INTERMEDIATE_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getColsPerBlock()); } public static MatrixCharacteristics getIntermediateMatrixCharactristics(JobConf job, byte tag) { MatrixCharacteristics dim = new MatrixCharacteristics(); dim.setDimension(job.getLong(INTERMEDIATE_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, 0), job.getLong(INTERMEDIATE_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, 0)); dim.setBlockSize(job.getInt(INTERMEDIATE_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, 1), job.getInt(INTERMEDIATE_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, 1)); return dim; } public static void setMatrixCharactristicsForOutput(JobConf job, byte tag, MatrixCharacteristics dim) { job.setLong(OUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, dim.getRows()); job.setLong(OUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getCols()); job.setInt(OUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, dim.getRowsPerBlock()); job.setInt(OUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getColsPerBlock()); } public static MatrixCharacteristics getMatrixCharacteristicsForOutput(JobConf job, byte tag) { MatrixCharacteristics dim = new MatrixCharacteristics(); dim.setDimension(job.getLong(OUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, 0), job.getLong(OUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, 0)); dim.setBlockSize(job.getInt(OUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, 1), job.getInt(OUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, 1)); return dim; } public static MatrixCharacteristics getMatrixCharacteristicsForInput(JobConf job, byte tag) { MatrixCharacteristics dim = new MatrixCharacteristics(); dim.setDimension(job.getLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, 0), job.getLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, 0)); dim.setBlockSize(job.getInt(INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, 1), job.getInt(INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, 1)); return dim; } public static void setMatrixCharactristicsForMapperOutput(JobConf job, byte tag, MatrixCharacteristics dim) { job.setLong(MAPOUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, dim.getRows()); job.setLong(MAPOUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getCols()); job.setInt(MAPOUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, dim.getRowsPerBlock()); job.setInt(MAPOUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getColsPerBlock()); } public static MatrixCharacteristics getMatrixCharacteristicsForMapOutput(JobConf job, byte tag) { MatrixCharacteristics dim = new MatrixCharacteristics(); dim.setDimension(job.getLong(MAPOUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, 0), job.getLong(MAPOUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, 0)); dim.setBlockSize(job.getInt(MAPOUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, 1), job.getInt(MAPOUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, 1)); return dim; } public static void setMatrixCharactristicsForReblock(JobConf job, byte tag, MatrixCharacteristics dim) { job.setLong(REBLOCK_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, dim.getRows()); job.setLong(REBLOCK_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getCols()); job.setInt(REBLOCK_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, dim.getRowsPerBlock()); job.setInt(REBLOCK_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getColsPerBlock()); job.setLong(REBLOCK_MATRIX_NUM_NNZ_PREFIX_CONFIG + tag, dim.getNonZeros()); } public 
public static MatrixCharacteristics getMatrixCharactristicsForReblock(JobConf job, byte tag) {
    MatrixCharacteristics dim = new MatrixCharacteristics();
    dim.setDimension(job.getLong(REBLOCK_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, 0),
        job.getLong(REBLOCK_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, 0));
    dim.setBlockSize(job.getInt(REBLOCK_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, 1),
        job.getInt(REBLOCK_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, 1));
    //nnz is optional; -1 signals "unknown" and leaves the default untouched
    long nnz = job.getLong(REBLOCK_MATRIX_NUM_NNZ_PREFIX_CONFIG + tag, -1);
    if (nnz >= 0)
        dim.setNonZeros(nnz);
    return dim;
}

public static void setMatrixCharactristicsForBinAgg(JobConf job, byte tag, MatrixCharacteristics dim) {
    job.setLong(AGGBIN_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, dim.getRows());
    job.setLong(AGGBIN_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getCols());
    job.setInt(AGGBIN_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, dim.getRowsPerBlock());
    job.setInt(AGGBIN_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, dim.getColsPerBlock());
}

public static MatrixCharacteristics getMatrixCharactristicsForBinAgg(JobConf job, byte tag) {
    MatrixCharacteristics dim = new MatrixCharacteristics();
    dim.setDimension(job.getLong(AGGBIN_MATRIX_NUM_ROW_PREFIX_CONFIG + tag, 0),
        job.getLong(AGGBIN_MATRIX_NUM_COLUMN_PREFIX_CONFIG + tag, 0));
    dim.setBlockSize(job.getInt(AGGBIN_BLOCK_NUM_ROW_PREFIX_CONFIG + tag, 1),
        job.getInt(AGGBIN_BLOCK_NUM_COLUMN_PREFIX_CONFIG + tag, 1));
    return dim;
}

public static HashSet<Byte> setUpOutputIndexesForMapper(JobConf job, byte[] inputIndexes,
        String instructionsInMapper, String aggInstructionsInReducer,
        String otherInstructionsInReducer, byte[] resultIndexes) throws DMLRuntimeException {
    return setUpOutputIndexesForMapper(job, inputIndexes, null, instructionsInMapper, null,
        aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
}

public static HashSet<Byte> setUpOutputIndexesForMapper(JobConf job, byte[] inputIndexes,
        String instructionsInMapper, String reblockInstructions, String aggInstructionsInReducer,
        String otherInstructionsInReducer, byte[] resultIndexes) throws DMLRuntimeException {
    return setUpOutputIndexesForMapper(job, inputIndexes, null, instructionsInMapper,
        reblockInstructions, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
}

public static HashSet<Byte> setUpOutputIndexesForMapper(JobConf job, byte[] inputIndexes,
        String randInstructions, String instructionsInMapper, String reblockInstructions,
        String aggInstructionsInReducer, String otherInstructionsInReducer, byte[] resultIndexes)
        throws DMLRuntimeException {
    //collect all indexes produced on the map side (inputs, datagen, map, reblock)
    HashSet<Byte> indexesInMapper = new HashSet<>();
    for (byte b : inputIndexes)
        indexesInMapper.add(b);
    DataGenMRInstruction[] dataGenIns = MRInstructionParser.parseDataGenInstructions(randInstructions);
    getIndexes(dataGenIns, indexesInMapper);
    MRInstruction[] insMapper = MRInstructionParser.parseMixedInstructions(instructionsInMapper);
    getIndexes(insMapper, indexesInMapper);
    ReblockInstruction[] reblockIns = MRInstructionParser.parseReblockInstructions(reblockInstructions);
    getIndexes(reblockIns, indexesInMapper);
    //collect all indexes consumed on the reduce side (agg, other, final results)
    MRInstruction[] insReducer = MRInstructionParser.parseAggregateInstructions(aggInstructionsInReducer);
    HashSet<Byte> indexesInReducer = new HashSet<>();
    getIndexes(insReducer, indexesInReducer);
    insReducer = MRInstructionParser.parseMixedInstructions(otherInstructionsInReducer);
    getIndexes(insReducer, indexesInReducer);
    for (byte ind : resultIndexes)
        indexesInReducer.add(ind);
    //keep only map-side outputs that are actually needed by the reducers
    indexesInMapper.retainAll(indexesInReducer);
    job.set(OUTPUT_INDEXES_IN_MAPPER_CONFIG, getIndexesString(indexesInMapper));
    return indexesInMapper;
}
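//--- Usage sketch (illustrative, NOT part of the original class; the indexes
//are assumptions, and passing null instruction strings assumes the parser
//treats null as "no instructions"). The returned set is the intersection of
//map-side outputs and reduce-side inputs, i.e. exactly what the mappers must
//shuffle to the reducers:
@SuppressWarnings("unused")
private static void exampleOutputIndexes(JobConf job) throws DMLRuntimeException {
    byte[] inputIndexes = new byte[]{0, 1};
    byte[] resultIndexes = new byte[]{2};
    HashSet<Byte> toShuffle = setUpOutputIndexesForMapper(job, inputIndexes,
        null /*instructionsInMapper*/, null /*aggInstructionsInReducer*/,
        null /*otherInstructionsInReducer*/, resultIndexes);
    //with no instructions, nothing produced in the mappers feeds index 2,
    //so toShuffle is empty in this hypothetical setup
}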
public static CollectMultipleConvertedOutputs getMultipleConvertedOutputs(JobConf job) {
    byte[] resultIndexes = MRJobConfiguration.getResultIndexes(job);
    Converter[] outputConverters = new Converter[resultIndexes.length];
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    HashMap<Byte, ArrayList<Integer>> tagMapping = new HashMap<>();
    for (int i = 0; i < resultIndexes.length; i++) {
        byte output = resultIndexes[i];
        ArrayList<Integer> vec = tagMapping.get(output);
        if (vec == null) {
            vec = new ArrayList<>();
            tagMapping.put(output, vec);
        }
        vec.add(i);
        outputConverters[i] = getOuputConverter(job, i);
        stats[i] = MRJobConfiguration.getMatrixCharacteristicsForOutput(job, output);
    }
    MultipleOutputs multipleOutputs = new MultipleOutputs(job);
    return new CollectMultipleConvertedOutputs(outputConverters, stats, multipleOutputs);
}

private static void getIndexes(MRInstruction[] instructions, HashSet<Byte> indexes) throws DMLRuntimeException {
    if (instructions == null)
        return;
    for (MRInstruction ins : instructions)
        for (byte i : ins.getAllIndexes())
            indexes.add(i);
}

private static String getIndexesString(HashSet<Byte> indexes) {
    if (indexes == null || indexes.isEmpty())
        return "";
    StringBuilder sb = new StringBuilder();
    for (Byte ind : indexes) {
        sb.append(ind);
        sb.append(Instruction.INSTRUCTION_DELIM);
    }
    //return string without last delimiter character
    return sb.substring(0, sb.length() - 1);
}

private static String getIndexesString(byte[] indexes) {
    if (indexes == null || indexes.length == 0)
        return "";
    StringBuilder sb = new StringBuilder();
    for (byte ind : indexes) {
        sb.append(ind);
        sb.append(Instruction.INSTRUCTION_DELIM);
    }
    //return string without last delimiter character
    return sb.substring(0, sb.length() - 1);
}

public static void setMapFunctionInputMatrixIndexes(JobConf job, byte[] realIndexes) {
    job.set(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG, getIndexesString(realIndexes));
}

public static boolean deriveRepresentation(InputInfo[] inputInfos) {
    for (InputInfo input : inputInfos)
        if (input.inputValueClass != MatrixBlock.class)
            return false;
    return true;
}

public static String constructTempOutputFilename() {
    StringBuilder sb = new StringBuilder();
    sb.append(ConfigurationManager.getScratchSpace());
    sb.append(Lop.FILE_SEPARATOR);
    sb.append(Lop.PROCESS_PREFIX);
    sb.append(DMLScript.getUUID());
    sb.append(Lop.FILE_SEPARATOR);
    sb.append("TmpOutput" + seq.getNextID());
    //old unique dir (no guarantees):
    //sb.append(Integer.toHexString(new Random().nextInt(Integer.MAX_VALUE)));
    return sb.toString();
}

private static String constructPartitionFilename() {
    StringBuilder sb = new StringBuilder();
    sb.append(ConfigurationManager.getScratchSpace());
    sb.append(Lop.FILE_SEPARATOR);
    sb.append(Lop.PROCESS_PREFIX);
    sb.append(DMLScript.getUUID());
    sb.append(Lop.FILE_SEPARATOR);
    sb.append(SamplingSortMRInputFormat.PARTITION_FILENAME + seq.getNextID());
    //old unique dir (no guarantees):
    //sb.append(Integer.toHexString(new Random().nextInt(Integer.MAX_VALUE)));
    return sb.toString();
}

public static void setSystemMLLocalTmpDir(JobConf job, String dir) {
    job.set(SYSTEMML_LOCAL_TMP_DIR, dir);
}

public static String getSystemMLLocalTmpDir(JobConf job) {
    return job.get(SYSTEMML_LOCAL_TMP_DIR);
}
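//--- Usage sketch (illustrative, NOT part of the original class; the local dir
//is an assumption). constructTempOutputFilename() yields a unique scratch-space
//path of the form <scratch>/<process-prefix><uuid>/TmpOutput<n>, where the
//IDSequence keeps <n> unique within this JVM:
@SuppressWarnings("unused")
private static void exampleTempOutput(JobConf job) {
    String tmp = constructTempOutputFilename();
    FileOutputFormat.setOutputPath(job, new Path(tmp));
    setSystemMLLocalTmpDir(job, "/tmp/systemml"); //hypothetical local tmp dir
}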
public static void addBinaryBlockSerializationFramework(Configuration job) {
    //prepend the custom serializer so it takes precedence over WritableSerialization
    String frameworkList = job.get(MRConfigurationNames.IO_SERIALIZATIONS);
    String frameworkClassBB = BinaryBlockSerialization.class.getCanonicalName();
    job.set(MRConfigurationNames.IO_SERIALIZATIONS, frameworkClassBB + "," + frameworkList);
}

/**
 * Sets all configuration properties with prefix mapred or mapreduce that exist
 * in the given DMLConfig on the given JobConf.
 *
 * @param job job configuration
 * @param config dml configuration
 */
public static void setupCustomMRConfigurations(JobConf job, DMLConfig config) {
    Map<String, String> map = config.getCustomMRConfig();
    for (Entry<String, String> e : map.entrySet())
        job.set(e.getKey(), e.getValue());
}
}
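As a rough end-to-end sketch (hypothetical driver code, not part of the file above; obtaining the DMLConfig via ConfigurationManager is an assumption), a job driver would typically register the custom binary-block serializer and then apply any user-supplied mapred/mapreduce overrides before submitting the job:

    JobConf job = new JobConf();
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    MRJobConfiguration.setupCustomMRConfigurations(job, ConfigurationManager.getDMLConfig());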