Usage examples for org.apache.hadoop.mapred.JobConf#setNumReduceTasks

public void setNumReduceTasks(int n)

Sets the requested number of reduce tasks for the job. A value of 0 makes the job map-only: the shuffle/sort phase is skipped and map output goes straight to the configured OutputFormat.
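Before the project examples, here is a minimal, self-contained sketch of the two common settings: 0 for a map-only job and an explicit positive count for a parallel reduce phase. The NumReduceTasksDemo class name is hypothetical and the identity mapper/reducer stand in for real job logic; treat it as an illustration of the old mapred API, not code from any of the source files listed below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

// Hypothetical demo class, not taken from any of the projects below.
public class NumReduceTasksDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(NumReduceTasksDemo.class);
        conf.setJobName("setNumReduceTasks-demo");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapperClass(IdentityMapper.class);

        // Map-only job: with 0 reduce tasks the shuffle/sort phase is
        // skipped and map output goes straight to the OutputFormat.
        conf.setNumReduceTasks(0);

        // Alternative: run a reduce phase with a fixed degree of parallelism.
        // conf.setReducerClass(IdentityReducer.class);
        // conf.setNumReduceTasks(8);

        JobClient.runJob(conf);
    }
}

Run with an input and an output path: the map-only form writes one part file per map task, while the commented-out variant would instead produce eight sorted partitions (part-00000 through part-00007).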
From source file: org.cloudata.examples.weblink.UploadJob.java
License: Apache License
public void run(String[] args) throws IOException {
    if (args.length < 3) {
        System.out.println("Usage: java UploadJob <input path> <table name> <distributed cache dir>");
        System.exit(0);
    }
    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("url");
        tableSchema.addColumn("page");
        tableSchema.addColumn("title");
        tableSchema.addColumn("outlink");
        CTable.createTable(nconf, tableSchema);
    }

    JobConf jobConf = new JobConf(UploadJob.class);
    jobConf.set("mapred.child.java.opts", "-Xss4096K");
    jobConf.setJobName("CloudataExamles.weblink.UploadJob_" + new Date());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/htmllexer.jar"), jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/htmlparser.jar"), jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/jdom.jar"), jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadJobMapper.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    // Map only: the mappers write directly to the Cloudata table,
    // so no reduce phase is needed.
    FileOutputFormat.setOutputPath(jobConf,
            new Path("CloudataExamles_WebUploadJob_" + System.currentTimeMillis()));
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.util.matrix.AbstractMatrix.java
License: Apache License
public void mutiply(AbstractMatrix targetMatrix, AbstractMatrix resultMatrix) throws IOException {
    Path tempOutputPath = new Path("temp/Matrix_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(AbstractMatrix.class);
    jobConf.setJobName("Matrix_Mutiply_Job" + "(" + new Date() + ")");

    // <MAP>
    jobConf.setMapperClass(MatrixMutiplyMap.class);
    jobConf.setInputFormat(MatrixInputFormat.class);
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_TABLE, ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_COLUMN, columnName);
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_TABLE, targetMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_COLUMN, targetMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, targetMatrix.isSparse());
    jobConf.setMapOutputKeyClass(MatrixItem.class);
    jobConf.setMapOutputValueClass(Text.class);
    // </MAP>

    // <REDUCE>
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.setReducerClass(MatrixMutiplyReduce.class);
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_COLUMN, resultMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, resultMatrix.isSparse());
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    // One reduce task per tablet of the result table.
    TabletInfo[] tabletInfos = resultMatrix.ctable.listTabletInfos();
    jobConf.setNumReduceTasks(tabletInfos.length);
    jobConf.setMaxReduceAttempts(0);

    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    // </REDUCE>

    // Run Job
    JobClient.runJob(jobConf);

    // delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file: org.cloudata.util.upload.UploadUtil.java
License: Apache License
private void doHadoopUpload(CloudataConf conf) throws IOException {
    if (!CTable.existsTable(conf, tableName)) {
        throw new IOException("No table:" + tableName);
    }

    JobConf jobConf = new JobConf(UploadUtil.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);
    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // KeyRangePartitioner reads AbstractTabletInputFormat.OUTPUT_TABLE
    // to locate the target table.
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    // <Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set("uploadJob.delim", delim);

    String columnStr = "";
    for (String eachColumn : columns) {
        columnStr += eachColumn + ",";
    }
    jobConf.set("uploadJob.columns", columnStr);

    String fieldNumStr = "";
    for (int eachField : fieldNums) {
        fieldNumStr += eachField + ",";
    }
    jobConf.set("uploadJob.fieldNums", fieldNumStr);
    jobConf.setBoolean("uploadJob.keyValuePair", keyValuePair);

    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    // </Map>

    // <Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(0);
    // </Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.clueweb.clueweb09.app.CountWarcRecordsOld.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName());
    LOG.info(" - input: " + input);

    JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class);
    conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input);

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);

    conf.setInputFormat(ClueWeb09InputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}
From source file: org.clueweb.clueweb12.app.CountClueWarcRecords.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - input: " + input);

    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(CountClueWarcRecords.class.getSimpleName() + ":" + input);

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);

    conf.setInputFormat(ClueWarcInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}
From source file: org.clueweb.clueweb12.app.CountWarcRecordsOld.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName());
    LOG.info(" - input: " + input);

    JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class);
    conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input);

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);

    conf.setInputFormat(ClueWeb12InputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}
From source file: org.clueweb.clueweb12.app.DumpClueWarcRecordsToPlainText.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output path").create(OUTPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + DumpClueWarcRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    JobConf conf = new JobConf(getConf(), DumpClueWarcRecordsToPlainText.class);
    conf.setJobName(DumpClueWarcRecordsToPlainText.class.getSimpleName() + ":" + input);

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, new Path(output));

    conf.setInputFormat(ClueWarcInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}
From source file: org.commoncrawl.mapred.segmenter.Segmenter.java
License: Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray,
        Path bundleInputPath, Path finalOutputPath) {
    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        final Path tempOutputDir = new Path(
                CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string ...
        String crawlers = "";
        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0)
                crawlers += ",";
            crawlers += crawlerArray[i];
        }

        LOG.info("Segment Generator: crawlers:" + crawlers);
        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        // one reduce task per (crawler, bucket) pair
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir
                + " Final Output Dir:" + finalOutputPath);

        fs.rename(tempOutputDir, finalOutputPath);

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}
From source file: org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java
License: Apache License
@Parameters
public static Collection<Object[]> configs() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(SplittableTextInputFormat.class);
    conf.setOutputFormat(EsOutputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumMapTasks(2);
    conf.setInt("actual.splits", 2);
    conf.setNumReduceTasks(0);

    JobConf standard = new JobConf(conf);
    standard.setMapperClass(TabMapper.class);
    standard.setMapOutputValueClass(LinkedMapWritable.class);
    standard.set(ConfigurationOptions.ES_INPUT_JSON, "false");
    FileInputFormat.setInputPaths(standard, new Path(TestUtils.gibberishDat(conf)));

    JobConf json = new JobConf(conf);
    json.setMapperClass(IdentityMapper.class);
    json.setMapOutputValueClass(Text.class);
    json.set(ConfigurationOptions.ES_INPUT_JSON, "true");
    FileInputFormat.setInputPaths(json, new Path(TestUtils.gibberishJson(conf)));

    return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
From source file: org.elasticsearch.hadoop.integration.mr.AbstractExtraMRTests.java
License: Apache License
private JobConf createReadJobConf() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(EsInputFormat.class);
    conf.setOutputFormat(PrintStreamOutputFormat.class);
    conf.setOutputKeyClass(Text.class);

    // Randomly exercise both map value types across test runs.
    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);
    conf.setOutputValueClass(mapType);

    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumReduceTasks(0);

    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(random.nextBoolean()));
    conf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, "true");

    FileInputFormat.setInputPaths(conf, new Path(TestUtils.gibberishDat(conf)));
    return conf;
}