List of usage examples for org.apache.hadoop.mapred.JobConf.setNumMapTasks
public void setNumMapTasks(int n)
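
Before the project examples below, here is a minimal, self-contained sketch of typical usage (the class name, input/output paths, and the use of IdentityMapper are placeholders for illustration, not taken from any of the projects listed). Note that setNumMapTasks is only a hint to the framework: the actual number of map tasks is ultimately driven by the number of input splits produced by the InputFormat.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class SetNumMapTasksSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetNumMapTasksSketch.class);
        job.setJobName("set-num-map-tasks-sketch");

        // plain text in, plain text out; keys/values as produced by TextInputFormat
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(IdentityMapper.class);
        job.setNumReduceTasks(0); // map-only job

        // Request 4 map tasks. This is only a hint: the InputFormat may still
        // create more or fewer splits, so it is not a hard limit.
        job.setNumMapTasks(4);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}
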
From source file: org.apache.mahout.df.mapred.partial.PartialBuilder.java
License: Apache License

@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(job));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // if we are in 'local' mode, correct the number of maps
    // or the mappers won't be able to compute the right indexes
    String tracker = job.get("mapred.job.tracker", "local");
    if ("local".equals(tracker)) {
        log.warn("Hadoop running in 'local' mode, only one map task will be launched");
        job.setNumMapTasks(1);
    }
}

From source file: org.apache.mahout.df.mapred.partial.PartialBuilderTest.java
License: Apache License

public void testProcessOutput() throws Exception {
    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    Random rng = RandomUtils.getRandom();

    // prepare the output
    TreeID[] keys = new TreeID[numTrees];
    MapredOutput[] values = new MapredOutput[numTrees];
    int[] firstIds = new int[numMaps];
    randomKeyValues(rng, keys, values, firstIds);

    // store the output in a sequence file
    Path base = getTestTempDirPath("testdata");
    FileSystem fs = base.getFileSystem(job);
    Path outputFile = new Path(base, "PartialBuilderTest.seq");

    Writer writer = SequenceFile.createWriter(fs, job, outputFile, TreeID.class, MapredOutput.class);

    for (int index = 0; index < numTrees; index++) {
        writer.append(keys[index], values[index]);
    }
    writer.close();

    // load the output and make sure its valid
    TreeID[] newKeys = new TreeID[numTrees];
    Node[] newTrees = new Node[numTrees];

    PartialBuilder.processOutput(job, base, firstIds, newKeys, newTrees, new TestCallback(keys, values));

    // check the forest
    for (int tree = 0; tree < numTrees; tree++) {
        assertEquals(values[tree].getTree(), newTrees[tree]);
    }

    assertTrue("keys not equal", Arrays.deepEquals(keys, newKeys));
}

From source file: org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License: Apache License

@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    int numMaps = job.getNumMapTasks();

    super.configureJob(job, nbTrees, oobEstimate);

    // PartialBuilder sets the number of maps to 1 if we are running in 'local'
    job.setNumMapTasks(numMaps);
}

From source file: org.apache.mahout.df.mapred.partial.PartitionBugTest.java
License: Apache License

/**
 * Make sure that the correct instance ids are being computed
 *
 * @throws Exception
 */
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();
    //long seed = rng.nextLong();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
        source[index][labelId] = index;
    }

    // store the data into a file
    String[] sData = Utils.double2String(source);
    Path dataPath = Utils.writeDataToTestFile(sData);

    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    Data data = DataLoader.loadData(dataset, sData);

    JobConf jobConf = new JobConf();
    jobConf.setNumMapTasks(numMaps);

    // prepare a custom TreeBuilder that will classify each
    // instance with its own label (in this case its index in the dataset)
    TreeBuilder treeBuilder = new MockTreeBuilder();

    // disable the second step because we can test without it
    // and we won't be able to serialize the MockNode
    PartialBuilder.setStep2(jobConf, false);

    long seed = 1L;

    PartialSequentialBuilder builder = new PartialSequentialBuilder(treeBuilder, dataPath, dataset, seed, jobConf);

    // remove the output path (its only used for testing)
    Path outputPath = builder.getOutputPath(jobConf);
    FileSystem fs = outputPath.getFileSystem(jobConf);
    HadoopUtil.overwriteOutput(outputPath);

    builder.build(numTrees, new MockCallback(data));
}

From source file: org.apache.mahout.df.mapred.partial.Step0JobTest.java
License: Apache License

public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, value, collector, reporter);

            size++;
        }

        mapper.close();

        // validate the mapper's output
        assertEquals(p, collector.keys[p]);
        assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
        assertEquals(size, collector.values[p].getSize());
    }
}

From source file: org.apache.mahout.df.mapred.partial.Step0JobTest.java
License: Apache License

public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];
    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).label;
            }

            size++;
        }

        keys[p] = p;
        values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}

From source file: org.apache.oozie.action.hadoop.LauncherMapperHelper.java
License: Apache License

public static void setupLauncherInfo(JobConf launcherConf, String jobId, String actionId, Path actionDir,
        String recoveryId, Configuration actionConf, String prepareXML)
        throws IOException, HadoopAccessorException {

    launcherConf.setMapperClass(LauncherMapper.class);
    launcherConf.setSpeculativeExecution(false);
    launcherConf.setNumMapTasks(1);
    launcherConf.setNumReduceTasks(0);

    launcherConf.set(LauncherMapper.OOZIE_JOB_ID, jobId);
    launcherConf.set(LauncherMapper.OOZIE_ACTION_ID, actionId);
    launcherConf.set(LauncherMapper.OOZIE_ACTION_DIR_PATH, actionDir.toString());
    launcherConf.set(LauncherMapper.OOZIE_ACTION_RECOVERY_ID, recoveryId);
    launcherConf.set(LauncherMapper.ACTION_PREPARE_XML, prepareXML);

    actionConf.set(LauncherMapper.OOZIE_JOB_ID, jobId);
    actionConf.set(LauncherMapper.OOZIE_ACTION_ID, actionId);

    if (Services.get().getConf().getBoolean("oozie.hadoop-2.0.2-alpha.workaround.for.distributed.cache", false)) {
        List<String> purgedEntries = new ArrayList<String>();
        Collection<String> entries = actionConf.getStringCollection("mapreduce.job.cache.files");
        for (String entry : entries) {
            if (entry.contains("#")) {
                purgedEntries.add(entry);
            }
        }
        actionConf.setStrings("mapreduce.job.cache.files",
                purgedEntries.toArray(new String[purgedEntries.size()]));
        launcherConf.setBoolean("oozie.hadoop-2.0.2-alpha.workaround.for.distributed.cache", true);
    }

    FileSystem fs = Services.get().get(HadoopAccessorService.class)
            .createFileSystem(launcherConf.get("user.name"), actionDir.toUri(), launcherConf);
    fs.mkdirs(actionDir);

    OutputStream os = fs.create(new Path(actionDir, LauncherMapper.ACTION_CONF_XML));
    try {
        actionConf.writeXml(os);
    } finally {
        IOUtils.closeSafely(os);
    }

    launcherConf.setInputFormat(OozieLauncherInputFormat.class);
    launcherConf.set("mapred.output.dir", new Path(actionDir, "output").toString());
}

From source file: org.apache.oozie.example.SampleOozieActionConfigurator.java
License: Apache License

@Override
public void configure(JobConf actionConf) throws OozieActionConfiguratorException {
    if (actionConf.getUser() == null) {
        throw new OozieActionConfiguratorException("No user set");
    }
    if (actionConf.get("examples.root") == null) {
        throw new OozieActionConfiguratorException("examples.root not set");
    }
    if (actionConf.get("output.dir.name") == null) {
        throw new OozieActionConfiguratorException("output.dir.name not set");
    }

    actionConf.setMapperClass(SampleMapper.class);
    actionConf.setReducerClass(SampleReducer.class);
    actionConf.setNumMapTasks(1);
    FileInputFormat.setInputPaths(actionConf, new Path(
            "/user/" + actionConf.getUser() + "/" + actionConf.get("examples.root") + "/input-data/text"));
    FileOutputFormat.setOutputPath(actionConf, new Path("/user/" + actionConf.getUser() + "/"
            + actionConf.get("examples.root") + "/output-data/" + actionConf.get("output.dir.name")));
}

From source file: org.apache.pig.test.utils.datagen.HadoopRunner.java
License: Apache License

public void generate() throws IOException {
    // Configuration processed by ToolRunner
    // Create a JobConf using the processed conf
    JobConf job;
    if (conf != null) { // TODO: conf could be null, check when and why
        job = new JobConf(conf);
    } else {
        job = new JobConf(new Configuration());
    }
    fs = FileSystem.get(job);

    tmpHome = createTempDir(null);

    String config = genMapFiles().toUri().getRawPath();
    // set config properties into job conf
    job.set(COLUMN_CONF_FILE_PATH, config);
    job.set(COLUMN_OUTPUT_SEPARATOR, String.valueOf((int) dgConf.getSeparator()));

    job.setJobName("data-gen");
    job.setNumMapTasks(dgConf.getNumMappers());
    job.setNumReduceTasks(0);
    job.setMapperClass(DataGenMapper.class);
    job.setJarByClass(DataGenMapper.class);

    // if inFile is specified, use it as input
    if (dgConf.getInFile() != null) {
        FileInputFormat.setInputPaths(job, dgConf.getInFile());
        job.set(HAS_USER_INPUT, "true");
    } else {
        job.set(HAS_USER_INPUT, "false");
        Path input = genInputFiles();
        FileInputFormat.setInputPaths(job, input);
    }
    FileOutputFormat.setOutputPath(job, new Path(dgConf.getOutputFile()));

    // Submit the job, then poll for progress until the job is complete
    System.out.println("Submit hadoop job...");
    RunningJob j = JobClient.runJob(job);
    if (!j.isSuccessful()) {
        throw new IOException("Job failed");
    }

    if (fs.exists(tmpHome)) {
        fs.delete(tmpHome, true);
    }
}

From source file: org.apache.sysml.runtime.controlprogram.parfor.RemoteParForMR.java
License: Apache License

public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry,
        long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else { //default case
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //set jvm memory size (if require)
        String memKey = MRConfigurationNames.MR_CHILD_JAVA_OPTS;
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); //8MB

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set the max number of retries per map task
        // disabled job-level configuration to respect cluster configuration
        // note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}