List of usage examples for org.apache.hadoop.mapred JobConf getNumReduceTasks
public int getNumReduceTasks()
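Before the collected project examples below, here is a minimal, self-contained sketch of the call itself. It assumes a plain old-API (org.apache.hadoop.mapred) driver; the job name, paths, and the commented-out mapper/reducer classes are hypothetical placeholders, not taken from any of the projects listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(NumReduceTasksExample.class);
        conf.setJobName("num-reduce-tasks-example");

        // Hypothetical mapper/reducer classes; substitute your own.
        // conf.setMapperClass(MyMapper.class);
        // conf.setReducerClass(MyReducer.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(conf, new Path("/tmp/example-input"));
        FileOutputFormat.setOutputPath(conf, new Path("/tmp/example-output"));

        // getNumReduceTasks() returns 1 unless mapred.reduce.tasks or
        // setNumReduceTasks() has been used to change it.
        conf.setNumReduceTasks(4);
        int reducers = conf.getNumReduceTasks();

        // A common idiom (see the Avro and Hyracks examples below):
        // zero reducers means a map-only job.
        boolean isMapOnly = reducers == 0;
        System.out.println("Configured reduce tasks: " + reducers + ", map-only: " + isMapOnly);
    }
}

Many of the snippets below follow this pattern: they read the configured reduce count to size downstream resources (writers, partitions, record groups) or to detect a map-only job.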
From source file:com.scaleoutsoftware.soss.hserver.JobScheduler.java
License:Apache License
/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job the job to run
 * @param jobId the id of the job
 * @param sortEnabled if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid the invocation grid to run the job
 * @throws IOException if errors occurred during the job
 * @throws InterruptedException if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition =
                com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping.getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math.max(
                grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);
        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters =
                new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(hadoopParameters, jobAppId,
                        partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType, splitList,
                        splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                        SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }
}
From source file:com.tomslabs.grid.avro.TextTypedBytesToAvroOutputFormat.java
License:Apache License
public RecordWriter<TypedBytesWritable, TypedBytesWritable> getRecordWriter(FileSystem ignore, JobConf job,
        String name, Progressable prog) throws IOException {
    boolean isMapOnly = job.getNumReduceTasks() == 0;
    Schema schema = isMapOnly ? AvroJob.getMapOutputSchema(job) : AvroJob.getOutputSchema(job);

    final DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>(schema));

    if (FileOutputFormat.getCompressOutput(job)) {
        int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        writer.setCodec(CodecFactory.deflateCodec(level));
    }

    // copy metadata from job
    for (Map.Entry<String, String> e : job) {
        if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
        if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                    URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
    }

    Path path = FileOutputFormat.getTaskOutputPath(job, name + EXT);
    writer.create(schema, path.getFileSystem(job).create(path));

    return new AvroRecordWriter(writer, schema);
}
From source file:edu.brown.cs.mapreduce.BenchmarkBase.java
License:Open Source License
public void runJob(JobConf _conf) throws Exception {
    String ret = "BenchmarkBase(" + _conf.getJobName() + ")\n" + "\tInput Path: {";
    Path inputs[] = FileInputFormat.getInputPaths(_conf);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0)
            ret += ", ";
        ret += inputs[ctr].toString();
    }
    ret += "}\n";

    ret += "\tOutput Path: " + FileOutputFormat.getOutputPath(_conf) + "\n" + "\tMap Jobs: "
            + _conf.getNumMapTasks() + "\n" + "\tReduce Jobs: " + _conf.getNumReduceTasks() + "\n"
            + "\tProperties: " + this.options;
    System.out.println(ret);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(_conf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");

    this.last_job = _conf;
    return;
}
From source file:edu.stolaf.cs.wmrserver.HadoopEngine.java
License:Apache License
public JobInfo getInfo(Submission submission, RunningJob job, JobConf conf)
        throws NotFoundException, InternalException {
    JobInfo info = new JobInfo();

    info.setNativeID(submission.getHadoopID());
    info.setName(job.getJobName());
    info.setTest(false);

    if (conf == null) // Can't proceed any further if configuration is unavailable
        return info;

    info.setRequestedMapTasks(conf.getNumMapTasks());
    info.setRequestedReduceTasks(conf.getNumReduceTasks());
    info.setMapper(conf.get(CONF_MAPPER));
    info.setReducer(conf.get(CONF_REDUCER));
    info.setNumericSort(conf.getBoolean(CONF_NUMERIC, false));
    info.setInputPath(
            JobServiceHandler.relativizePath(_homeDir, FileInputFormat.getInputPaths(conf)[0]).toString());
    info.setOutputPath(
            JobServiceHandler.relativizePath(_homeDir, FileOutputFormat.getOutputPath(conf)).toString());

    return info;
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.FuzzyJoinDriver.java
License:Apache License
public static void run(JobConf job) throws IOException {
    job.setJarByClass(FuzzyJoinDriver.class);
    //
    // print info
    //
    String ret = "FuzzyJoinDriver(" + job.getJobName() + ")\n" + " Input Path: {";
    Path inputs[] = FileInputFormat.getInputPaths(job);
    for (int ctr = 0; ctr < inputs.length; ctr++) {
        if (ctr > 0) {
            ret += "\n ";
        }
        ret += inputs[ctr].toString();
    }
    ret += "}\n";
    ret += " Output Path: " + FileOutputFormat.getOutputPath(job) + "\n" + " Map Jobs: "
            + job.getNumMapTasks() + "\n" + " Reduce Jobs: " + job.getNumReduceTasks() + "\n"
            + " Properties: {";
    String[][] properties = new String[][] {
            new String[] { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
            new String[] { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                    "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
            new String[] { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
            new String[] { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
            new String[] { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
            new String[] { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
            new String[] { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
            new String[] { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
            new String[] { DATA_JOININDEX_PROPERTY, "" }, };
    for (int crt = 0; crt < properties.length; crt++) {
        if (crt > 0) {
            ret += "\n ";
        }
        ret += properties[crt][0] + "=" + job.get(properties[crt][0], properties[crt][1]);
    }
    ret += "}";
    System.out.println(ret);
    //
    // run job
    //
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println(
            "The job took " + (end_time.getTime() - startTime.getTime()) / (float) 1000.0 + " seconds.");
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns[0] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    dataColumns[1] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE));
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapSelfJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.ppjoin.MapJoin.java
License:Apache License
@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns[0] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    dataColumns[1] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE));
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
}
From source file:edu.uci.ics.hyracks.dataflow.hadoop.HadoopWriteOperatorDescriptor.java
License:Apache License
private static FileSplit[] getOutputSplits(JobConf conf, int noOfMappers) throws ClassNotFoundException {
    int numOutputters = conf.getNumReduceTasks() != 0 ? conf.getNumReduceTasks() : noOfMappers;
    Object outputFormat = null;
    if (conf.getUseNewMapper()) {
        outputFormat = ReflectionUtils
                .newInstance(new ContextFactory().createJobContext(conf).getOutputFormatClass(), conf);
    } else {
        outputFormat = conf.getOutputFormat();
    }
    if (outputFormat instanceof NullOutputFormat) {
        FileSplit[] outputFileSplits = new FileSplit[numOutputters];
        for (int i = 0; i < numOutputters; i++) {
            String outputPath = "/tmp/" + System.currentTimeMillis() + i;
            outputFileSplits[i] = new FileSplit("localhost", new FileReference(new File(outputPath)));
        }
        return outputFileSplits;
    } else {
        FileSplit[] outputFileSplits = new FileSplit[numOutputters];
        String absolutePath = FileOutputFormat.getOutputPath(conf).toString();
        for (int index = 0; index < numOutputters; index++) {
            String suffix = new String("part-00000");
            suffix = new String(suffix.substring(0, suffix.length() - ("" + index).length()));
            suffix = suffix + index;
            String outputPath = absolutePath + "/" + suffix;
            outputFileSplits[index] = new FileSplit("localhost", outputPath);
        }
        return outputFileSplits;
    }
}
From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java
License:Apache License
private IOperatorDescriptor configureOutput(IOperatorDescriptor previousOperator, JobConf conf,
        JobSpecification spec) throws Exception {
    int instanceCountPreviousOperator = operatorInstanceCount.get(previousOperator.getOperatorId());
    int numOutputters = conf.getNumReduceTasks() != 0 ? conf.getNumReduceTasks()
            : instanceCountPreviousOperator;
    HadoopWriteOperatorDescriptor writer = null;
    writer = new HadoopWriteOperatorDescriptor(spec, conf, numOutputters);
    configurePartitionCountConstraint(spec, writer, numOutputters);
    spec.connect(new OneToOneConnectorDescriptor(spec), previousOperator, 0, writer, 0);
    return writer;
}