List of usage examples for org.apache.hadoop.mapred JobConf setStrings
public void setStrings(String name, String... values)
Sets the array of string values for the name property as comma-delimited values.
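Before the project examples, a minimal self-contained sketch of the set/get round trip (the property name "example.input.dirs" is made up for illustration): the values are stored as a single comma-delimited string and split back into an array by getStrings().

import org.apache.hadoop.mapred.JobConf;

public class SetStringsExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // store an array; internally the values are joined into one comma-delimited string
        job.setStrings("example.input.dirs", "hdfs:///data/a", "hdfs:///data/b");

        // the raw property value is the joined string ...
        String joined = job.get("example.input.dirs"); // "hdfs:///data/a,hdfs:///data/b"

        // ... and getStrings() splits it back into an array
        String[] dirs = job.getStrings("example.input.dirs");
        for (String d : dirs)
            System.out.println(d);
    }
}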
From source file:org.apache.sysml.runtime.matrix.CSVReblockMR.java
License:Apache License
private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
        long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS),
            ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //    inst.printCompelteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);

    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        // System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
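The job above stores the per-input smallest file names under SMALLEST_FILE_NAME_PER_INPUT. A simplified sketch of the matching task-side read via getStrings() (illustrative only, not the actual SystemML CSVReblockMapper; the constant is assumed to be accessible from the mapper):

// Hypothetical mapper-side read of the value stored with setStrings() above.
public void configure(JobConf job) {
    String[] smallestFiles = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT);
    // one entry per input, as computed by the driver before job submission
    for (String f : smallestFiles)
        System.out.println("smallest file: " + f);
}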
From source file:org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java
License:Apache License
public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos,
        int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target)
        throws Exception {
    if (inputs.length != inputInfos.length)
        throw new Exception("number of inputs and inputInfos does not match");

    //set up names of the input matrices and their inputformat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);

    //set up converter infos (converter determined implicitly)
    if (setConverter) {
        for (int i = 0; i < inputs.length; i++)
            setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }

    //remove redundant inputs and pure broadcast variables
    ArrayList<Path> lpaths = new ArrayList<>();
    ArrayList<InputInfo> liinfos = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);

        //check and skip redundant inputs
        if (lpaths.contains(p) //path already included
                || distCacheOnly[i]) //input only required in dist cache
        {
            continue;
        }

        lpaths.add(p);
        liinfos.add(inputInfos[i]);
    }

    boolean combineInputFormat = false;
    if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
        //determine total input sizes
        double totalInputSize = 0;
        for (int i = 0; i < inputs.length; i++)
            totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));

        //set max split size (default blocksize) to 2x blocksize if (1) sort buffer large enough,
        //(2) degree of parallelism not hurt, and only a single input (except broadcasts)
        //(the sort buffer size is relevant for pass-through of, potentially modified, inputs to the reducers)
        //(the single input constraint stems from internal runtime assumptions used to relate meta data to inputs)
        long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
        long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
        long newSplitSize = sizeHDFSBlk * 2;
        //use generic config api for backwards compatibility
        double spillPercent = Double
                .parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
        int numPMap = OptimizerUtils.getNumMappers();
        if (numPMap < totalInputSize / newSplitSize && sizeSortBuff * spillPercent >= newSplitSize
                && lpaths.size() == 1) {
            job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
            combineInputFormat = true;
        }
    }

    //add inputs to jobs input (incl input format configuration)
    for (int i = 0; i < lpaths.size(); i++) {
        //add input to job inputs (for binaryblock we use CombineSequenceFileInputFormat to reduce task latency)
        if (combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo)
            MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
        else
            MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
    }
}
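One caveat worth noting about the setStrings() call above, as far as stock Hadoop behaves: the values are simply joined with commas and not escaped, so a value that itself contains a comma comes back as several entries from getStrings(). A tiny illustrative sketch (property name made up):

// Caveat sketch: setStrings() joins values with commas without escaping them,
// so getStrings() splits comma-containing values apart again.
JobConf demo = new JobConf();
demo.setStrings("demo.dirs", "hdfs:///data/a", "hdfs:///data/b,with,comma");
String[] back = demo.getStrings("demo.dirs");
System.out.println(back.length); // 4 entries, not 2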
From source file:org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java
License:Apache License
/**
 * Specific method because we need to set the input converter class according to the
 * input infos. Note that any mapper instruction before reblock can work on binary block
 * if it can work on binary cell as well.
 *
 * @param job job configuration
 * @param inputIndexes array of byte indexes
 * @param inputs array of input strings
 * @param inputInfos array of input infos
 * @param brlens array of block row lengths
 * @param bclens array of block column lengths
 * @throws Exception if Exception occurs
 */
public static void setUpMultipleInputsReblock(JobConf job, byte[] inputIndexes, String[] inputs,
        InputInfo[] inputInfos, int[] brlens, int[] bclens) throws Exception {
    if (inputs.length != inputInfos.length)
        throw new Exception("number of inputs and inputInfos does not match");

    //set up names of the input matrices and their inputformat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);
    for (int i = 0; i < inputs.length; i++) {
        ConvertTarget target = ConvertTarget.CELL;
        if (inputInfos[i] == InputInfo.BinaryBlockInputInfo)
            target = ConvertTarget.BLOCK;
        setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }

    //remove redundant input files
    ArrayList<Path> paths = new ArrayList<>();
    for (int i = 0; i < inputs.length; i++) {
        String name = inputs[i];
        Path p = new Path(name);
        boolean redundant = false;
        for (Path ep : paths)
            if (ep.equals(p)) {
                redundant = true;
                break;
            }
        if (redundant)
            continue;
        MultipleInputs.addInputPath(job, p, inputInfos[i].inputFormatClass);
        paths.add(p);
    }
}
From source file:org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java
License:Apache License
public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknown,
        String[] outputs, OutputInfo[] outputInfos, boolean inBlockRepresentation, boolean mayContainCtable)
        throws Exception {
    if (resultIndexes.length != outputs.length)
        throw new Exception("number of outputs and result indexes does not match");
    if (outputs.length != outputInfos.length)
        throw new Exception("number of outputs and outputInfos indexes does not match");

    job.set(RESULT_INDEXES_CONFIG, MRJobConfiguration.getIndexesString(resultIndexes));
    job.set(RESULT_DIMS_UNKNOWN_CONFIG, MRJobConfiguration.getIndexesString(resultDimsUnknown));
    job.setStrings(OUTPUT_MATRICES_DIRS_CONFIG, outputs);
    job.setOutputCommitter(MultipleOutputCommitter.class);

    for (int i = 0; i < outputs.length; i++) {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(outputs[i]), job);
        if (mayContainCtable && resultDimsUnknown[i] == (byte) 1) {
            setOutputInfo(job, i, outputInfos[i], false);
        } else {
            setOutputInfo(job, i, outputInfos[i], inBlockRepresentation);
        }
        MultipleOutputs.addNamedOutput(job, Integer.toString(i), outputInfos[i].outputFormatClass,
                outputInfos[i].outputKeyClass, outputInfos[i].outputValueClass);
    }
    job.setOutputFormat(NullOutputFormat.class);

    // configure temp output
    Path tempOutputPath = new Path(constructTempOutputFilename());
    FileOutputFormat.setOutputPath(job, tempOutputPath);
    MapReduceTool.deleteFileIfExistOnHDFS(tempOutputPath, job);
}
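The output directory list stored above under OUTPUT_MATRICES_DIRS_CONFIG is read back later on the task side (e.g. by the output committer). A defensive read pattern, sketched here under the assumption that the constant is accessible and if I recall the Configuration getters correctly: getStrings() returns null when the property is missing, while getTrimmedStrings() returns an empty array and trims whitespace around each value.

// Illustrative defensive read of the output directory list stored via setStrings() above.
String[] outputDirs = job.getTrimmedStrings(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG);
if (outputDirs.length == 0)
    throw new IllegalStateException("no output directories configured");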
From source file:org.lobid.lodmill.hadoop.IntegrationTestCollectSubjects.java
License:Open Source License
private Job createJob() throws IOException {
    final JobConf conf = createJobConf();
    conf.setStrings("mapred.textoutputformat.separator", " ");
    conf.setStrings(CollectSubjects.PREFIX_KEY, "http://lobid.org/organisation");
    final Job job = new Job(conf);
    job.setJobName("CollectSubjects");
    FileInputFormat.addInputPaths(job, HDFS_IN_1 + "," + HDFS_IN_2);
    FileOutputFormat.setOutputPath(job, new Path(HDFS_OUT));
    job.setMapperClass(CollectSubjectsMapper.class);
    job.setReducerClass(CollectSubjectsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    return job;
}
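Both setStrings() calls above pass a single value, so there is no comma-joining and the effect is the same as storing a plain string with set(). An equivalent sketch, reusing the property names from the example above:

// Equivalent single-value configuration with set(); with one value,
// set() and setStrings() store exactly the same property string.
conf.set("mapred.textoutputformat.separator", " ");
conf.set(CollectSubjects.PREFIX_KEY, "http://lobid.org/organisation");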
From source file:org.lobid.lodmill.hadoop.IntegrationTestLobidNTriplesToJsonLd.java
License:Open Source License
private Job createJob() throws IOException {
    final JobConf conf = createJobConf();
    conf.setStrings("mapred.textoutputformat.separator", " ");
    conf.setStrings(CollectSubjects.PREFIX_KEY, "http://lobid.org/organisation");
    final URI zippedMapFile = CollectSubjects.asZippedMapFile(hdfs, new Path(HDFS_IN_SUBJECTS),
            new Path(HDFS_OUT_ZIP + "/" + CollectSubjects.MAP_FILE_ZIP));
    DistributedCache.addCacheFile(zippedMapFile, conf);
    final Job job = new Job(conf);
    job.setJobName("IntegrationTestLobidNTriplesToJsonLd");
    FileInputFormat.addInputPaths(job, HDFS_IN_TRIPLES_1 + "," + HDFS_IN_TRIPLES_2);
    FileOutputFormat.setOutputPath(job, new Path(HDFS_OUT));
    job.setMapperClass(NTriplesToJsonLdMapper.class);
    job.setReducerClass(NTriplesToJsonLdReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    return job;
}
From source file:voldemort.store.readonly.mr.azkaban.VoldemortBatchIndexJob.java
License:Apache License
/**
 * Method to allow this process to be an instance call from another Job.
 *
 * @storeName to dump the value
 * @inputFile to generate the VFILE
 */
public void execute(String voldemortClusterLocalFile, String storeName, String inputPath, String outputPath,
        int voldemortCheckDataPercent) throws IOException, URISyntaxException {
    JobConf conf = createJobConf(VoldemortBatchIndexMapper.class, VoldemortBatchIndexReducer.class);

    try {
        // get the voldemort cluster definition
        // We need to use cluster.xml here where it is not yet localized by TaskRunner
        _cluster = HadoopUtils.readCluster(voldemortClusterLocalFile, conf);
    } catch (Exception e) {
        logger.error("Failed to read Voldemort cluster details", e);
        throw new RuntimeException("", e);
    }

    // set the partitioner
    conf.setPartitionerClass(VoldemortBatchIndexPartitoner.class);
    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    // Blow away the output if force.output.overwrite is set
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (getProps().getBoolean("force.output.overwrite", false)) {
        FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
        fs.delete(FileOutputFormat.getOutputPath(conf), true);
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);
    conf.setNumReduceTasks(_cluster.getNumberOfNodes());

    // get the store information
    conf.setStrings("voldemort.index.filename", storeName + ".index");
    conf.setStrings("voldemort.data.filename", storeName + ".data");
    conf.setInt("input.data.check.percent", voldemortCheckDataPercent);
    conf.setStrings("voldemort.store.name", storeName);

    // run(conf);
    JobClient.runJob(conf);
}
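Each setStrings() call above stores a single value, so on the task side the properties come back as plain strings. A hypothetical configure() method reading them (illustrative only, not the actual Voldemort mapper/reducer code; property names taken from the example above):

// Hypothetical task-side read of the store information set above; because each
// setStrings() call stored one value, a plain get() (or getStrings()[0]) works.
private String indexFileName;
private String dataFileName;

public void configure(JobConf conf) {
    indexFileName = conf.get("voldemort.index.filename");
    dataFileName = conf.get("voldemort.data.filename");
    int checkPercent = conf.getInt("input.data.check.percent", 0);
    System.out.println("index=" + indexFileName + ", data=" + dataFileName + ", check%=" + checkPercent);
}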