List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
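setNumReduceTasks sets the number of reduce tasks for the job. Passing 0 makes the job map-only, which is how most of the examples below use it: mapper output is written directly by the output format. Below is a minimal sketch of a map-only job with the old mapred API; the class name, paths, and argument handling are illustrative placeholders, not taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical driver class for illustration only.
public class MapOnlyExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(MapOnlyExample.class);
        job.setJobName("map-only-example");
        job.setMapperClass(IdentityMapper.class);   // pass input records through unchanged
        job.setNumReduceTasks(0);                   // 0 reducers: map output goes straight to the output format
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);  // key/value types produced by TextInputFormat
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);                      // submits the job and waits for completion
    }
}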
From source file: org.apache.hive.hcatalog.hbase.TestHiveHBaseTableOutputFormat.java
License: Apache License
@Test
public void directOutputFormatTest() throws IOException, ClassNotFoundException, InterruptedException {
    String testName = "directOutputFormatTest";
    Path methodTestDir = new Path(getTestDir(), testName);
    String tableName = newTableName(testName).toLowerCase();
    String familyName = "my_family";
    byte[] familyNameBytes = Bytes.toBytes(familyName);

    // include hbase config in conf file
    Configuration conf = new Configuration(allConf);
    conf.set(HCatConstants.HCAT_KEY_HIVE_CONF, HCatUtil.serialize(allConf.getAllProperties()));

    // create table
    createTable(tableName, new String[] { familyName });

    String data[] = { "1,english:ONE,spanish:UNO", "2,english:TWO,spanish:DOS", "3,english:THREE,spanish:TRES" };

    // input/output settings
    Path inputPath = new Path(methodTestDir, "mr_input");
    getFileSystem().mkdirs(inputPath);
    FSDataOutputStream os = getFileSystem().create(new Path(inputPath, "inputFile.txt"));
    for (String line : data)
        os.write(Bytes.toBytes(line + "\n"));
    os.close();

    // create job
    JobConf job = new JobConf(conf);
    job.setJobName(testName);
    job.setWorkingDirectory(new Path(methodTestDir, "mr_work"));
    job.setJarByClass(this.getClass());

    job.setMapperClass(MapWrite.class);

    job.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    org.apache.hadoop.mapred.TextInputFormat.setInputPaths(job, inputPath);

    // why we need to set all the 3 properties??
    job.setOutputFormat(HiveHBaseTableOutputFormat.class);
    job.set(HBaseSerDe.HBASE_TABLE_NAME, tableName);
    job.set(TableOutputFormat.OUTPUT_TABLE, tableName);
    job.set(HCatConstants.HCAT_DEFAULT_TOPIC_PREFIX + ".hbase.mapreduce.outputTableName", tableName);

    try {
        OutputJobInfo outputJobInfo = OutputJobInfo.create("default", tableName, null);
        job.set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(outputJobInfo));
    } catch (Exception ex) {
        throw new IOException("Serialization error " + ex.getMessage(), ex);
    }

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(HCatRecord.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(HCatRecord.class);
    job.setNumReduceTasks(0);

    System.getProperty("java.classpath");
    RunningJob runJob = JobClient.runJob(job);
    runJob.waitForCompletion();
    assertTrue(runJob.isSuccessful());

    // verify
    HTable table = new HTable(conf, tableName);
    Scan scan = new Scan();
    scan.addFamily(familyNameBytes);
    ResultScanner scanner = table.getScanner(scan);
    int index = 0;
    for (Result result : scanner) {
        String vals[] = data[index].toString().split(",");
        for (int i = 1; i < vals.length; i++) {
            String pair[] = vals[i].split(":");
            assertTrue(result.containsColumn(familyNameBytes, Bytes.toBytes(pair[0])));
            assertEquals(pair[1], Bytes.toString(result.getValue(familyNameBytes, Bytes.toBytes(pair[0]))));
        }
        index++;
    }
    assertEquals(data.length, index);
}
From source file: org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.java
License: Apache License
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setInputFormat(SequenceFileInputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    conf.setMapperClass(OutputMapper.class);
    conf.setReducerClass(Reducer.class);
    conf.setNumReduceTasks(0);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file: org.apache.mahout.df.mapred.inmem.InMemBuilder.java
License: Apache License
@Override
protected void configureJob(JobConf conf, int nbTrees, boolean oobEstimate) throws IOException {
    FileOutputFormat.setOutputPath(conf, getOutputPath(conf));

    // put the data in the DistributedCache
    DistributedCache.addCacheFile(getDataPath().toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(MapredOutput.class);

    conf.setMapperClass(InMemMapper.class);
    conf.setNumReduceTasks(0); // no reducers

    conf.setInputFormat(InMemInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
}
From source file: org.apache.mahout.df.mapred.partial.PartialBuilder.java
License: Apache License
@Override
protected void configureJob(JobConf job, int nbTrees, boolean oobEstimate) throws IOException {
    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(job));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // if we are in 'local' mode, correct the number of maps
    // or the mappers won't be able to compute the right indexes
    String tracker = job.get("mapred.job.tracker", "local");
    if ("local".equals(tracker)) {
        log.warn("Hadoop running in 'local' mode, only one map task will be launched");
        job.setNumMapTasks(1);
    }
}
From source file: org.apache.mahout.df.mapred.partial.Step0Job.java
License: Apache License
/**
 * Computes the partitions' first ids in Hadoop's order
 *
 * @param conf configuration
 * @return first ids for all the partitions
 * @throws IOException
 */
public Step0Output[] run(Configuration conf) throws IOException {
    JobConf job = new JobConf(conf, Step0Job.class);

    // check the output
    if (outputPath.getFileSystem(job).exists(outputPath)) {
        throw new IOException("Output path already exists : " + outputPath);
    }

    // put the dataset into the DistributedCache
    // use setCacheFiles() to overwrite the first-step cache files
    URI[] files = { datasetPath.toUri() };
    DistributedCache.setCacheFiles(files, job);

    FileInputFormat.setInputPaths(job, dataPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Step0Output.class);

    job.setMapperClass(Step0Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // run the job
    JobClient.runJob(job);

    return parseOutput(job);
}
From source file: org.apache.mahout.df.mapred.partial.Step2Job.java
License: Apache License
/**
 * Run the second step.
 *
 * @param conf configuration
 * @param keys keys returned by the first step
 * @param trees trees returned by the first step
 * @param callback
 * @throws IOException
 */
public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback)
        throws IOException {
    if (callback == null) {
        // no need to launch the job
        return;
    }

    int numTrees = keys.length;

    JobConf job = new JobConf(conf, Step2Job.class);

    // check the output
    if (outputPath.getFileSystem(job).exists(outputPath)) {
        throw new IOException("Output path already exists : " + outputPath);
    }

    int[] sizes = Step0Output.extractSizes(partitions);

    InterResults.store(forestPath.getFileSystem(job), forestPath, keys, trees, sizes);

    // needed by the mapper
    Builder.setNbTrees(job, numTrees);

    // put the dataset and the forest into the DistributedCache
    // use setCacheFiles() to overwrite the first-step cache files
    URI[] files = { datasetPath.toUri(), forestPath.toUri() };
    DistributedCache.setCacheFiles(files, job);

    FileInputFormat.setInputPaths(job, dataPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step2Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // run the job
    JobClient.runJob(job);

    parseOutput(job, callback);
}
From source file: org.apache.nutch.crawl.CrawlDbReader.java
License: Apache License
public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
        throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: done");
    }
}
From source file: org.apache.nutch.scoring.webgraph.LinkRank.java
License: Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}
From source file: org.apache.nutch.scoring.webgraph.NodeDumper.java
License: Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
        NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.tools.FreeGenerator.java
License: Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
        return -1;
    }

    boolean filter = false;
    boolean normalize = false;
    if (args.length > 2) {
        for (int i = 2; i < args.length; i++) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: " + args[i] + ", exiting ...");
                return -1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        LOG.error("FAILED: " + StringUtils.stringifyException(e));
        return -1;
    }
    long end = System.currentTimeMillis();
    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    return 0;
}