List of usage examples for the org.apache.hadoop.mapred.lib.HashPartitioner constructor HashPartitioner()
HashPartitioner
From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java
License:Apache License
/**
 * Configures and runs phase 1 of one Schimmy PageRank iteration, then reads back the
 * mass values the job wrote.
 *
 * <p>Input is read from {@code path + "/iter" + sFormat.format(i)}; output is written to
 * {@code path + "/iter" + sFormat.format(j) + "t"} and the mass files to that output path
 * with a {@code "-mass"} suffix. Both output locations are deleted before the job starts.
 *
 * <p>Before submitting the job, the first record of every {@code part-} file in the input
 * directory is read and run through the partitioner so that a
 * {@code partition=path} mapping can be handed to the job via the
 * {@code "PartitionMapping"} configuration key.
 *
 * @param path base directory holding the per-iteration directories
 * @param i index of the iteration to read from
 * @param j index of the iteration to write to
 * @param n node count, stored in the job configuration under {@code "NodeCount"}
 * @param useCombiner whether to install {@code CombineClass} as the combiner
 * @param useInmapCombiner whether to use {@code MapWithInMapperCombiningClass} instead of
 *        {@code MapClass}
 * @param useRange whether to partition with {@code RangePartitioner} instead of hashing
 * @return the values read from the mass files, folded together with {@code sumLogProbs}
 *         starting from {@code Float.NEGATIVE_INFINITY}
 * @throws IOException if the input directory cannot be listed, contains no {@code part-}
 *         files, or any file system or job operation fails
 */
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    // NOTE(review): the job jar is located via RunPageRankBasic although this method lives in
    // RunPageRankSchimmy -- confirm this is intended.
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // List the input directory once; the same listing is used for counting and for peeking.
    FileStatus[] status = fs.listStatus(new Path(in));
    if (status == null) {
        throw new IOException("Unable to list input directory: " + in);
    }

    // We need to actually count the number of part files to get the number of partitions
    // (because the directory might contain _logs or other non-data entries).
    int numPartitions = 0;
    for (FileStatus s : status) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }
    if (numPartitions == 0) {
        throw new IOException("No part- files found in input directory: " + in);
    }

    // Must be set before the partitioner is configured from this conf.
    conf.setInt("NodeCount", n);

    Partitioner p = null;
    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on disk (i.e.,
    // part-XXXX) and what partition the file contains (i.e., key.hash % #reducer) is
    // arbitrary... so this means that we need to open up each partition and peek inside to
    // find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        // Use the same filter as the count above so the mapping and numPartitions agree.
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        boolean hasRecord;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        try {
            hasRecord = reader.next(key, value);
        } finally {
            reader.close();
        }

        if (!hasRecord) {
            // An empty file has no key to derive a partition from; reusing the previous
            // file's key would record a wrong mapping, so leave this file unmapped.
            sLogger.info(f.getPath() + "\t(empty, no partition mapping recorded)");
            continue;
        }

        int np = p.getPartition(key, value, numPartitions);
        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    String partitionMapping = sb.toString().trim();
    sLogger.info(partitionMapping);

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    // NOTE(review): presumably the 1 GB minimum split size keeps each part file in a single
    // split -- confirm against the cluster's block size.
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", partitionMapping);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    // No partitioner class is set in the non-range case, so the job's default applies.
    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    // Clear any output left over from a previous run of this iteration.
    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    JobClient.runJob(conf);

    // Fold the single float stored in each mass file into the running total.
    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        try {
            mass = sumLogProbs(mass, fin.readFloat());
        } finally {
            fin.close();
        }
    }

    return mass;
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException { Text key = new Text(url); CrawlDatum val = new CrawlDatum(); openReaders(crawlDb, config);//from w w w. ja v a 2 s. c o m CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, CrawlDatum>(), key, val); return res; }
From source file:org.apache.nutch.scoring.webgraph.LoopReader.java
License:Apache License
/** * Prints loopset for a single url. The loopset information will show any * outlink url the eventually forms a link cycle. * /*from w w w . j a va 2s . c o m*/ * @param webGraphDb The WebGraph to check for loops * @param url The url to check. * * @throws IOException If an error occurs while printing loopset information. */ public void dumpUrl(Path webGraphDb, String url) throws IOException { // open the readers fs = FileSystem.get(getConf()); loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, Loops.LOOPS_DIR), getConf()); // get the loopset for a given url, if any Text key = new Text(url); LoopSet loop = new LoopSet(); MapFileOutputFormat.getEntry(loopReaders, new HashPartitioner<Text, LoopSet>(), key, loop); // print out each loop url in the set System.out.println(url + ":"); for (String loopUrl : loop.getLoopSet()) { System.out.println(" " + loopUrl); } // close the readers FSUtils.closeReaders(loopReaders); }
From source file:org.apache.nutch.scoring.webgraph.NodeReader.java
License:Apache License
/** * Prints the content of the Node represented by the url to system out. * //from ww w. ja va 2 s .com * @param webGraphDb The webgraph from which to get the node. * @param url The url of the node. * * @throws IOException If an error occurs while getting the node. */ public void dumpUrl(Path webGraphDb, String url) throws IOException { fs = FileSystem.get(getConf()); nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, WebGraph.NODE_DIR), getConf()); // open the readers, get the node, print out the info, and close the readers Text key = new Text(url); Node node = new Node(); MapFileOutputFormat.getEntry(nodeReaders, new HashPartitioner<Text, Node>(), key, node); System.out.println(url + ":"); System.out.println(" inlink score: " + node.getInlinkScore()); System.out.println(" outlink score: " + node.getOutlinkScore()); System.out.println(" num inlinks: " + node.getNumInlinks()); System.out.println(" num outlinks: " + node.getNumOutlinks()); FSUtils.closeReaders(nodeReaders); }