List of usage examples for org.apache.hadoop.mapred.JobConf.getNumReduceTasks()
public int getNumReduceTasks()
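getNumReduceTasks() returns the number of reduce tasks currently configured on a JobConf (1 unless set explicitly), and a value of 0 or less indicates a map-only job. Before the per-project examples below, here is a minimal driver-style sketch of that behavior; it is not taken from any of the listed projects, and the class name and input/output paths are illustrative assumptions only.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(NumReduceTasksExample.class);
        conf.setJobName("num-reduce-tasks-example");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // Unless overridden, a fresh JobConf reports one reduce task.
        System.out.println("default reduces: " + conf.getNumReduceTasks());

        conf.setNumReduceTasks(4);
        System.out.println("configured reduces: " + conf.getNumReduceTasks());

        // A check mirrored by several examples below: 0 reducers means a map-only job.
        boolean mapOnly = (conf.getNumReduceTasks() <= 0);
        System.out.println("map-only job? " + mapOnly);

        // Illustrative paths; a real driver would also set mapper/reducer classes.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        // JobClient.runJob(conf);
    }
}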
From source file: com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License: Apache License
@Override
protected synchronized void submit() {
    JobConf jobConf = this.getJobConf();
    boolean isLocalHadoop = jobConf.get("mapred.job.tracker", "local").equals("local");

    // the default partitioner is {@link com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner}
    // which is hash based.
    //
    // If user choose to use even partitioner, Mobius will use
    // {@link com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner} which
    // is sampling based partitioner of attempting to balance the load
    // for each reducer.
    String partitioner = jobConf.get("mobius.partitioner", "default");

    if (!isLocalHadoop && jobConf.getNumReduceTasks() != 0 && partitioner.equals("even")) {
        // this job needs reducer, perform sampling on the keys to
        // make load on reducers are almost evenly distributed.
        double freq = jobConf.getFloat("mobius.sampler.freq", 0.1F);
        int numSamples = jobConf.getInt("mobius.sampler.num.samples", 50000);
        int maxSplits = jobConf.getInt("mobius.sampler.max.slipts.sampled", 5);

        // log sampling parameters so that user knows.
        LOGGER.info("Sampling parameters { " + "mobius.sampler.freq:" + format.format(freq) + ", "
                + "mobius.sampler.num.samples:" + numSamples + ", "
                + "mobius.sampler.max.slipts.sampled:" + maxSplits + "}");

        InputSampler.Sampler<?, ?> sampler = new MobiusInputSampler(freq, numSamples, maxSplits);

        writePartitionFile(jobConf, sampler);

        // add to distributed cache
        try {
            URI partitionUri = new URI(TotalOrderPartitioner.getPartitionFile(jobConf) + "#_partitions");
            LOGGER.info("Adding partition uri to distributed cache:" + partitionUri.toString());

            DistributedCache.addCacheFile(partitionUri, jobConf);
            DistributedCache.createSymlink(jobConf);
            jobConf.setPartitionerClass(EvenlyPartitioner.class);

            LOGGER.info("Using " + EvenlyPartitioner.class.getCanonicalName()
                    + " to partiton the keys evenly among reducers.");
        } catch (URISyntaxException e) {
            LOGGER.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }

        // adding -XX:-UseParallelOldGC, this will automatically set -XX:-UseParallelGC
        // according to Oracle's specification
        String jvmOpts = jobConf.get("mapred.child.java.opts", "");
        if (jvmOpts.isEmpty()) {
            jvmOpts = "-XX:-UseParallelOldGC";
        } else {
            if (jvmOpts.indexOf("-XX:-UseParallelOldGC") < 0) {
                // remove "
                jvmOpts = jvmOpts.replaceAll("\"", "");
                jvmOpts = jvmOpts.concat(" -XX:-UseParallelOldGC");
            }
        }
        jobConf.set("mapred.child.java.opts", jvmOpts);

        this.setJobConf(jobConf);
    }
    LOGGER.info("Submiting job:" + jobConf.getJobName());
    super.submit();
}
From source file: com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License: Apache License
private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////
        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////
        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load for every reducer evenly,
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries, objects that selected from the
            // <code>samples</code> array, and each blocks should have
            // about the same size.

            // find the last index of element that equals to samples[i], as
            // such element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i; //Util.findLowerBound(samples, samples[i], comparator);

            // the repeat time of samples[i], if the key itself is too big
            // select it as boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times of average reducer size
            {
                // the current element is too big, greater than
                // two times of the <code>avgReduceSize</code>,
                // put itself as boundary
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediate put the next element to the boundary,
                // the next element starts at <code> upperBound+1
                // </code>, to prevent the current one consume even
                // more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move on to the next element of <code>samples[upperBound+1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // current element is small enough to be consider
                // with previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of wrote samples doesn't equals to number of
        // reducer minus one, then it means the key spaces is too small
        // hence TotalOrderPartitioner won't work, it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the wrote samples define boundary, ex, if
            // the sample size is two with two element [300, 1000], then
            // there should be 3 reducers, one for handling i<300, one
            // for n300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
From source file: com.hdfs.concat.crush.CrushPartitioner.java
License: Apache License
@Override
public void configure(JobConf job) {
    String path = job.get("crush.partition.map");
    int expPartitions = job.getNumReduceTasks();

    bucketToPartition = new HashMap<Text, Integer>(100);

    try {
        FileSystem fs = FileSystem.get(job);
        Reader reader = new Reader(fs, new Path(path), job);

        Text bucket = new Text();
        IntWritable partNum = new IntWritable();

        while (reader.next(bucket, partNum)) {
            int partNumValue = partNum.get();

            if (partNumValue < 0 || partNumValue >= expPartitions) {
                throw new IllegalArgumentException(
                        "Partition " + partNumValue + " not allowed with " + expPartitions + " reduce tasks");
            }

            Integer prev = bucketToPartition.put(new Text(bucket), partNumValue);

            if (null != prev) {
                throw new IllegalArgumentException("Bucket " + bucket + " appears more than once in " + path);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not read partition map from " + path, e);
    }

    if (new HashSet<Integer>(bucketToPartition.values()).size() > expPartitions) {
        throw new IllegalArgumentException(
                path + " contains more than " + expPartitions + " distinct partitions");
    }
}
From source file: com.hp.hplc.mr.driver.WordCount.java
License: Apache License
/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                System.out.println("# of reduces: " + conf.getNumReduceTasks());
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);

    return 0;
}
From source file: com.ibm.bi.dml.runtime.matrix.mapred.CSVWriteReducer.java
License: Open Source License
@Override
public void configure(JobConf job) {
    super.configure(job);

    byte maxIndex = 0;
    HashMap<Byte, CSVWriteInstruction> out2Ins = new HashMap<Byte, CSVWriteInstruction>();
    try {
        CSVWriteInstruction[] ins = MRJobConfiguration.getCSVWriteInstructions(job);
        for (CSVWriteInstruction in : ins) {
            out2Ins.put(in.output, in);
            if (in.output > maxIndex)
                maxIndex = in.output;
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    int numParitions = job.getNumReduceTasks();
    int taskID = MapReduceTool.getUniqueTaskId(job);
    //LOG.info("## taks id: "+taskID);

    //for efficiency only, the arrays may have missing values
    rowIndexes = new long[maxIndex + 1];
    colIndexes = new long[maxIndex + 1];
    maxRowIndexes = new long[maxIndex + 1];
    minRowIndexes = new long[maxIndex + 1];
    numColBlocks = new long[maxIndex + 1];
    lastBlockNCols = new int[maxIndex + 1];
    colsPerBlock = new int[maxIndex + 1];
    delims = new String[maxIndex + 1];
    sparses = new boolean[maxIndex + 1];
    tagToResultIndex = new int[maxIndex + 1];

    for (int i = 0; i < resultIndexes.length; i++) {
        byte ri = resultIndexes[i];
        tagToResultIndex[ri] = i;
        CSVWriteInstruction in = out2Ins.get(ri);
        MatrixCharacteristics dim = MRJobConfiguration.getMatrixCharacteristicsForInput(job, in.input);
        delims[ri] = in.delim;
        sparses[ri] = in.sparse;
        numColBlocks[ri] = (long) Math.ceil((double) dim.getCols() / (double) dim.getColsPerBlock());
        lastBlockNCols[ri] = (int) (dim.getCols() % dim.getColsPerBlock());
        colsPerBlock[ri] = dim.getColsPerBlock();
        long rstep = (long) Math.ceil((double) dim.getRows() / (double) numParitions);
        minRowIndexes[ri] = rowIndexes[ri] = rstep * taskID;
        maxRowIndexes[ri] = Math.min(rstep * (taskID + 1), dim.getRows());
        colIndexes[ri] = 0;
    }

    zeroBlock.setData(new MatrixBlock());
}
From source file: com.ibm.bi.dml.runtime.matrix.mapred.GMRMapper.java
License: Open Source License
public void configure(JobConf job) {
    super.configure(job);

    mapperID = job.get("mapred.task.id");
    dimsUnknownFilePrefix = job.get("dims.unknown.file.prefix");

    _filterEmptyInputBlocks = allowsFilterEmptyInputBlocks();

    //assign the temporay vairables
    try {
        //  System.out.println(valueClass.getName());
        //  System.out.println(MatrixCell.class.getName());
        if (job.getMapOutputValueClass().equals(TaggedMatrixPackedCell.class))
            taggedValueBuffer = TaggedMatrixValue.createObject(MatrixPackedCell.class);
        else
            taggedValueBuffer = TaggedMatrixValue.createObject(valueClass);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    //decide whether it is a maponly job
    mapOnlyJob = (job.getNumReduceTasks() <= 0);
    if (!mapOnlyJob)
        return;

    //get the indexes of the final output matrices
    resultIndexes = MRJobConfiguration.getResultIndexes(job);
    resultDimsUnknown = MRJobConfiguration.getResultDimsUnknown(job);

    //initialize SystemML Counters (defined in MRJobConfiguration)
    resultsNonZeros = new long[resultIndexes.length];
    resultsMaxRowDims = new long[resultIndexes.length];
    resultsMaxColDims = new long[resultIndexes.length];

    tagMapping = new HashMap<Byte, ArrayList<Integer>>();
    for (int i = 0; i < resultIndexes.length; i++) {
        byte output = resultIndexes[i];
        ArrayList<Integer> vec = tagMapping.get(output);
        if (vec == null) {
            vec = new ArrayList<Integer>();
            tagMapping.put(output, vec);
        }
        vec.add(i);
    }

    //for map only job, get the map output converters
    collectFinalMultipleOutputs = MRJobConfiguration.getMultipleConvertedOutputs(job);
}
From source file: com.ibm.bi.dml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License: Open Source License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();

    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
            WritableComparable.class);

    //get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

    //indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();

    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;

    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; ++i) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
                .getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
                            + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }

    if (totalcount == 0) //empty input files
        sampler.addValue(new DoubleWritable(0));

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    //note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();

    int index0 = -1, i = 0;
    boolean lessthan0 = true;
    for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
        writer.append(splitValue, nullValue);
        if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
            index0 = i;
            lessthan0 = false;
        }
        i++;
    }
    if (lessthan0)
        index0 = partitions - 1;
    writer.close();

    return index0;
}
From source file: com.ibm.bi.dml.runtime.matrix.SortMR.java
License: Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception { boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes; String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output; JobConf job = new JobConf(SortMR.class); job.setJobName("SortMR"); //setup partition file String pfname = MRJobConfiguration.setUpSortPartitionFilename(job); Path partitionFile = new Path(pfname); URI partitionUri = new URI(partitionFile.toString()); //setup input/output paths Path inputDir = new Path(input); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); SamplingSortMRInputFormat.setInputPaths(job, inputDir); Path outpath = new Path(tmpOutput); FileOutputFormat.setOutputPath(job, outpath); MapReduceTool.deleteFileIfExistOnHDFS(outpath, job); //set number of reducers (1 if local mode) if (InfrastructureAnalyzer.isLocalMode(job)) job.setNumReduceTasks(1);/* w w w .j a v a 2s . com*/ else MRJobConfiguration.setNumReducers(job, numReducers, numReducers); //setup input/output format job.setInputFormat(SamplingSortMRInputFormat.class); SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass); //setup instructions and meta information if (combineInst != null && !combineInst.trim().isEmpty()) job.set(COMBINE_INSTRUCTION, combineInst); job.set(SORT_INSTRUCTION, sortInst); job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight); boolean desc = getSortInstructionDescending(sortInst); job.setBoolean(SORT_DECREASING, desc); MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen); MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile); //setup mapper/reducer/partitioner/output classes if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) { MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass); job.setMapperClass(IndexSortMapper.class); job.setReducerClass(IndexSortReducer.class); job.setMapOutputKeyClass(!desc ? 
IndexSortComparable.class : IndexSortComparableDesc.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } else { //default case: SORT w/wo weights MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(CompactOutputFormat.class); job.setMapperClass(ValueSortMapper.class); job.setReducerClass(ValueSortReducer.class); job.setOutputKeyClass(outputInfo.outputKeyClass); //double job.setOutputValueClass(outputInfo.outputValueClass); //int } job.setPartitionerClass(TotalOrderPartitioner.class); //setup distributed cache DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); //setup replication factor job.setInt("dfs.replication", replication); MatrixCharacteristics[] s = new MatrixCharacteristics[1]; s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(s); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); //run mr job RunningJob runjob = JobClient.runJob(job); Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX); numReducers = job.getNumReduceTasks(); //process final meta data long[] counts = new long[numReducers]; long total = 0; for (int i = 0; i < numReducers; i++) { counts[i] = group.getCounter(Integer.toString(i)); total += counts[i]; } //add missing 0s back to the results long missing0s = 0; if (total < rlen * clen) { if (partitionWith0 < 0) throw new RuntimeException("no partition contains 0, which is wrong!"); missing0s = rlen * clen - total; counts[partitionWith0] += missing0s; } else partitionWith0 = -1; if (sortIndexes) { //run builtin job for shifting partially sorted blocks according to global offsets //we do this in this custom form since it would not fit into the current structure //of systemml to output two intermediates (partially sorted data, offsets) out of a //single SortKeys lop boolean success = runjob.isSuccessful(); if (success) { success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output); } MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput); MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success); } else { MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful()); } }
From source file: com.linkedin.mlease.regression.jobs.ItemModelTest.java
License: Open Source License
@Override
public void run() throws Exception {
    JobConfig props = super.getJobConfig();
    List<String> lambdastr = props.getStringList(LAMBDA, ",");
    String outBasePath = props.getString(OUTPUT_BASE_PATH);
    for (String lambda : lambdastr) {
        String outPath = outBasePath + "/lambda-" + lambda;
        props.put("output.path", outPath);
        JobConf conf = createJobConf(PerItemTestMapper.class, PerItemTestReducer.class);
        AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(props.get(MODEL_PATH)), MODEL_PATH);
        conf.set(ITEM_KEY, props.getString(ITEM_KEY));
        conf.setFloat(LAMBDA, Float.parseFloat(lambda));
        conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
        conf.setPartitionerClass(PerItemTestPartitioner.class);
        conf.setInt(NUM_REDUCERS, conf.getNumReduceTasks());
        AvroUtils.runAvroJob(conf);
    }
}
From source file: com.scaleoutsoftware.soss.hserver.JobScheduler.java
License: Apache License
/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param jobID          the id of the job
 * @param jobConf        the job to run
 * @param isNewApi       if the job uses the new MapReduce APIs
 * @param splitType      the type of the split
 * @param inputSplits    the list of input splits
 * @param splitLocations the locations of the splits
 * @param grid           the invocation grid to run the job
 * @throws IOException            if errors occurred during the job
 * @throws InterruptedException   if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runPredefinedJob(JobID jobID, JobConf jobConf, boolean isNewApi, Class splitType,
        List<?> inputSplits, Map<Object, String[]> splitLocations, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);
    String hadoopVersion = VersionInfo.getVersion();

    int appID = 0xFFFFFFF & BitConverter.hashStringOneInt(jobID.toString());

    try {
        org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = createOutputCommitter(isNewApi, jobID,
                jobConf);

        HadoopVersionSpecificCode hadoopVersionSpecificCode = HadoopVersionSpecificCode
                .getInstance(hadoopVersion, jobConf);

        org.apache.hadoop.mapred.JobContext jobContext = hadoopVersionSpecificCode.createJobContext(jobConf,
                jobID);
        outputCommitter.setupJob(jobContext);

        //clear all temporary objects
        DataAccessor.clearObjects(appID);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Generating split to hostname map
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(inputSplits, hostAddresses,
                splitLocations);

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(jobConf.getNumReduceTasks());

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(jobConf, jobID, !isNewApi);
        HServerInvocationParameters parameters = new HServerInvocationParameters(hadoopParameters, appID,
                partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, inputSplits,
                splitToHostAddress, false,
                HServerParameters.getBooleanSetting(HServerParameters.SORT_KEYS, jobConf), hadoopVersion, null,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != inputSplits.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + inputSplits.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    appID, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(appID); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jobContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }
}