List of usage examples for org.apache.hadoop.mapreduce.Job#setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
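Before the project-specific examples below, here is a minimal, self-contained sketch (not taken from any of these sources) of where setNumReduceTasks typically sits in a driver. It uses Hadoop's built-in identity Mapper and Reducer so it compiles without project classes; the input and output paths are assumed to come from the command line.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class NumReduceTasksSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "identity pass-through");
        job.setJarByClass(NumReduceTasksSketch.class);

        // Identity mapper/reducer: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class); // TextInputFormat keys are byte offsets
        job.setOutputValueClass(Text.class);

        // Must be called before the job is submitted (IllegalStateException otherwise);
        // 0 makes the job map-only, larger values control how many reducers (and output partitions) run.
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

In the examples that follow, setNumReduceTasks(1) is used to force a single reducer (one output file), and setNumReduceTasks(0) produces a map-only job.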
From source file:edu.american.student.foreman.HadoopForeman.java
License:Open Source License
/**
 * Creates a native Hadoop job from a HadoopJobConfiguration.
 * @param conf the job configuration to translate
 * @return the configured Job
 * @throws HadoopException if the job cannot be configured
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public Job getHadoopJob(HadoopJobConfiguration conf) throws HadoopException {
    Job job;
    try {
        job = new Job();
        job.setJobName(conf.getJobName());
        job.setMapperClass(conf.getMapperClass());
        job.setInputFormatClass((Class<? extends InputFormat>) conf.getInputFormatClass());
        if (conf.getOutputFormatClass() != null) {
            job.setOutputFormatClass((Class<? extends OutputFormat>) conf.getOutputFormatClass());
        }
        if (conf.getOutputKeyClass() != null) {
            job.setOutputKeyClass(conf.getOutputKeyClass());
        }
        if (conf.getOutputValueClass() != null) {
            job.setOutputValueClass(conf.getOutputValueClass());
        }
        if (conf.getReducerClass() != null) {
            job.setReducerClass(conf.getReducerClass());
        }
        job.setNumReduceTasks(conf.getNumReduceTasks());
        Configuration conf1 = job.getConfiguration();
        if (conf.getInputFormatClass() == AccumuloInputFormat.class) {
            AccumuloInputFormat.setInputInfo(conf1, Constants.getAccumuloUser(),
                    Constants.getAccumuloPassword().getBytes(), conf.getDefaultTable(), conf.getDefaultAuths());
            AccumuloInputFormat.setZooKeeperInstance(conf1, Constants.getZookeeperInstanceName(),
                    Constants.getZookeeperInstance());
        }
        if (conf.getFetchColumns() != null) {
            AccumuloInputFormat.fetchColumns(conf1, conf.getFetchColumns());
        } else if (conf.getInputFormatClass() == TextInputFormat.class) {
            if (conf.getPathToProcess() != null) {
                FileInputFormat.setInputPaths(job, conf.getPathToProcess());
            }
        }
        if (conf.getOutputFormatClass() == AccumuloOutputFormat.class) {
            AccumuloOutputFormat.setOutputInfo(conf1, Constants.getAccumuloUser(),
                    Constants.getAccumuloPassword().getBytes(), true, conf.getDefaultTable());
            AccumuloOutputFormat.setZooKeeperInstance(conf1, Constants.getZookeeperInstanceName(),
                    Constants.getZookeeperInstance());
        }
        return job;
    } catch (IOException e) {
        String gripe = "Could not configure a Hadoop job";
        log.log(Level.SEVERE, gripe, e);
        throw new HadoopException(gripe, e);
    }
}
From source file:edu.american.student.mnemosyne.core.util.foreman.HadoopForeman.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public Job getHadoopJob(HadoopJobConfiguration conf) throws HadoopException { Job job; try {// ww w .j a v a 2 s.c om job = new Job(); DistributedCache.setCacheArchives( new URI[] { new URI("/cache/accumulo-core-1.4.1.jar"), new URI("/cache/accumulo-server-1.4.1.jar"), new URI("/cache/accumulo-start-1.4.1.jar"), new URI("/cache/cloudtrace-1.4.1.jar"), new URI("/cache/commons-collections-3.2.jar"), new URI("/cache/commons-configuration-1.5.jar"), new URI("/cache/commons-io-1.4.jar"), new URI("/cache/commons-jci-core-1.0.jar"), new URI("/cache/commons-jci-fam-1.0.jar"), new URI("/cache/commons-lang-2.4.jar"), new URI("/cache/commons-logging-1.0.4.jar"), new URI("/cache/commons-logging-api-1.0.4.jar"), new URI("/cache/jline-0.9.94.jar"), new URI("/cache/libthrift-0.6.1.jar"), new URI("/cache/log4j-1.2.16.jar") }, job.getConfiguration()); job.setJobName(conf.getJobName()); System.out.println("Setting jar class " + conf.getJarClass()); ((JobConf) job.getConfiguration()).setJar("/opt/mnemosyne.jar"); job.setJarByClass(conf.getJarClass()); job.setMapperClass(conf.getMapperClass()); job.setInputFormatClass((Class<? extends InputFormat>) conf.getInputFormatClass()); if (conf.getOutputFormatClass() != null) { job.setOutputFormatClass((Class<? extends OutputFormat>) conf.getOutputFormatClass()); } if (conf.getOutputKeyClass() != null) { job.setOutputKeyClass(conf.getOutputKeyClass()); } if (conf.getOutputValueClass() != null) { job.setOutputValueClass(conf.getOutputValueClass()); } if (conf.getReducerClass() != null) { job.setReducerClass(conf.getReducerClass()); } job.setNumReduceTasks(conf.getNumReduceTasks()); Configuration conf1 = job.getConfiguration(); if (conf.getInputFormatClass() == AccumuloInputFormat.class) { AccumuloInputFormat.setInputInfo(conf1, MnemosyneConstants.getAccumuloUser(), MnemosyneConstants.getAccumuloPassword().getBytes(), conf.getDefaultTable(), conf.getDefaultAuths()); AccumuloInputFormat.setZooKeeperInstance(conf1, MnemosyneConstants.getZookeeperInstanceName(), MnemosyneConstants.getZookeeperInstance()); } if (conf.getFetchColumns() != null) { AccumuloInputFormat.fetchColumns(conf1, conf.getFetchColumns()); } else if (conf.getInputFormatClass() == TextInputFormat.class) { if (conf.getPathToProcess() != null) { FileInputFormat.setInputPaths(job, conf.getPathToProcess()); } } if (conf.getOutputFormatClass() == AccumuloOutputFormat.class) { AccumuloOutputFormat.setOutputInfo(conf1, MnemosyneConstants.getAccumuloUser(), MnemosyneConstants.getAccumuloPassword().getBytes(), true, conf.getDefaultTable()); AccumuloOutputFormat.setZooKeeperInstance(conf1, MnemosyneConstants.getZookeeperInstanceName(), MnemosyneConstants.getZookeeperInstance()); } return job; } catch (IOException e) { String gripe = "Could not configure a Hadoop job"; log.log(Level.SEVERE, gripe, e); throw new HadoopException(gripe, e); } catch (URISyntaxException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; }
From source file:edu.berkeley.amplab.adam.modules.ConvertFilesMR.java
License:Apache License
@Override
public int moduleRun() throws Exception {
    Configuration configuration = new Configuration();
    configuration.set(CONFIG_KEY_OUTPUT, outputPath);

    Job job = new Job(configuration);
    job.setNumReduceTasks(1);
    job.setMapperClass(ConvertFileMapper.class);
    job.setReducerClass(ConvertFileReducer.class);

    BAMInputFormat.setInputPaths(job, new Path(inputPath));
    job.setInputFormatClass(BAMInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath).getParent());
    AvroJob.setMapOutputValueSchema(job, ADAMRecord.SCHEMA$);

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.bigdata.training.serialization.UserHistory.java
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "postHistory");
    job.setJarByClass(UserHistory.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    MultipleInputs.addInputPath(job, new Path("input/posts/user_info.txt"), TextInputFormat.class,
            UserCityMapper.class);
    MultipleInputs.addInputPath(job, new Path("input/posts/user_posts.txt"), TextInputFormat.class,
            UserPostsMapper.class);

    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, UserPostSummary.getClassSchema());
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);

    Path outPath = new Path("output/user/posts");
    FileOutputFormat.setOutputPath(job, outPath);
    job.setReducerClass(UserPostHistory.class);
    //outPath.getFileSystem(job.getConfiguration()).delete(outPath, true);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:edu.cuhk.hccl.hadoop.HadoopApp.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args == null || args.length < 6) {
        System.out.println(
                "Please specify parameters: input, output, domain, num-reducers, similarity, range [, business-file]!");
        System.exit(-1);
    }
    String input = args[0];
    String output = args[1];
    String domain = args[2];
    int numReducers = Integer.parseInt(args[3]);
    float similarity = Float.parseFloat(args[4]);
    int range = Integer.parseInt(args[5]);

    Job job = new Job(new Configuration(), this.getClass().getSimpleName());
    // Must come after job creation, since the Job copies its Configuration
    Configuration conf = job.getConfiguration();

    // Reuse the JVM across tasks
    conf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
    conf.setFloat("SIM_THRESHOLD", similarity);
    conf.setInt("SEARCH_RANGE", range);

    if (domain.equalsIgnoreCase("restaurant")) {
        conf.setStrings("ASPECTS", Constant.RESTAURANT_ASPECTS);
        job.setMapperClass(YelpMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        // args[6] is the business file used to select matching business_ids for restaurants
        String busiFile = args[6];
        DistributedCache.addCacheFile(new URI(busiFile), conf);
    } else if (domain.equalsIgnoreCase("hotel")) {
        conf.setStrings("ASPECTS", Constant.TRIPADVISOR_ASPECTS);
        job.setMapperClass(TripAdvisorMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
        System.out.println("Wrong domain type!");
        System.exit(-1);
    }

    job.setJarByClass(HadoopApp.class);
    job.setReducerClass(ReviewReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(UserItemPair.class);
    job.setOutputValueClass(NounPhrase.class);

    // Delete the output directory if it already exists
    Path outputDir = new Path(output);
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.indiana.cs.b649.HadoopBlast.java
License:Open Source License
/**
 * Launch the MapReduce computation.
 * This method first removes any previous working directories and creates a new one,
 * then copies the data (file names) to this new directory and launches the
 * MapReduce (map-only, though) computation.
 * @param numReduceTasks - Number of reduce tasks = 0.
 * @param binAndDbArchive - the uploaded database archive filename on HDFS
 * @param execName - Name of the binary executable.
 * @param workingDir - the local disk working directory used when processing the *.fa files downloaded from HDFS
 * @param databaseArchiveDir - The directory where the Blast+/Cap3 program is after unzipping the distributed cached archive.
 * @param databaseName - the Blast+ database name, normally "nr"
 * @param inputDir - Directory where the input data set is located on HDFS.
 * @param outputDir - Output directory to place the output on HDFS.
 * @param cmdArgs - These are the command line arguments to the Blast+ program.
 * @throws Exception - Throws any exception that occurs in this program.
 *
 * You are free to change this launch function to support your own program.
 */
void launch(int numReduceTasks, String binAndDbArchive, String execName, String workingDir,
        String databaseArchiveDir, String databaseName, String inputDir, String outputDir, String cmdArgs)
        throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, execName);

    Path hdMainDir = new Path(outputDir);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);
    Path hdOutDir = new Path(hdMainDir, "out");

    Configuration jc = job.getConfiguration();
    jc.set(Bin_DB_Archive, binAndDbArchive); // the name of the executable archive
    jc.set(EXECUTABLE, execName);
    jc.set(WORKING_DIR, workingDir);
    jc.set(DB_ARCHIVE_DIR, databaseArchiveDir);
    jc.set(DB_NAME, databaseName);
    jc.set(OUTPUT_DIR, outputDir);
    jc.set(PARAMETERS, cmdArgs);

    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    DistributedCache.addCacheArchive(new URI(Bin_DB_Archive), jc);

    /*
     * Your code here
     */
    System.out.println("so far so good");
    job.setJarByClass(HadoopBlast.class);
    job.setMapperClass(RunnerMap.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(DataFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReduceTasks);

    int exitStatus = job.waitForCompletion(true) ? 0 : 1;

    // clean the cache
    System.exit(exitStatus);
}
From source file:edu.indiana.d2i.htrc.corpus.analysis.LDAAnalysisDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /*
     * The following generic arguments should be specified on the command line:
     *
     *   -D user.args.mapping.table.filename=<mappingtable_filename>
     *   -D user.args.topics.filename=<topics_filename>
     *   -D user.args.topdoctable.capacity.stepsize=<stepSize>
     *   -D user.args.lda.state.filepath=</hdfs/path/to/lda/state/file> (set automatically)
     *   -files </local/path/to/mapping/table/file>,</local/path/to/topics/file>
     *   -libjars <dependent jars> (if any)
     */
    if (args.length != 2) {
        System.err.printf(
                "Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory/prefix> <path/to/property/file>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC LDA Analysis");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(LDAAnalysisDriver.class);
    job.setMapperClass(LDAAnalysisMapper.class);
    job.setReducerClass(LDAAnalysisReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LDAState.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    /* set the number of reduce tasks to 1 */
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.corpus.wordset.ComposeWordsetDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf(
                "Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC Composing Wordset");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(ComposeWordsetDriver.class);
    job.setMapperClass(ComposeWordsetMapper.class);
    job.setReducerClass(ComposeWordsetReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    /* Only one reducer */
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();
    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();

    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    // FileSystem fs = FileSystem.get(getConf());
    // Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    // BufferedWriter writer = new BufferedWriter(
    //         new OutputStreamWriter(fs.create(dictionaryPath, true)));
    // BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    // String line = null;
    // while ((line = reader.readLine()) != null) {
    //     writer.write(line + "\n");
    // }
    // writer.close();

    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage parallely.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    // Path dictionaryPath = new Path(dictionaryFile);
    // DistributedCache.setCacheFiles(new URI[] { dictionaryPath.toUri() }, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}