List of usage examples for org.apache.hadoop.mapreduce.lib.partition.HashPartitioner
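A common thread in the examples below: because HashPartitioner is stateless and deterministic (it returns (key.hashCode() & Integer.MAX_VALUE) % numPartitions), it can be instantiated outside a running job to recompute which partition, and therefore which output part file, holds a given key. A minimal sketch of that idea, assuming standard Hadoop types (the class name and values here are illustrative only, not taken from the examples):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class PartitionLookupSketch {
    public static void main(String[] args) {
        // HashPartitioner needs no configuration, so it can be created directly.
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();

        IntWritable key = new IntWritable(42);   // illustrative key
        int numPartitions = 8;                   // e.g. the number of part-r-* files

        // Same computation a job with 8 reducers would have used at write time,
        // so the result identifies the part file that contains this key.
        int partition = partitioner.getPartition(key, new Text(), numPartitions);
        System.out.println("key " + key + " -> partition " + partition);
    }
}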
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);

    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:com.github.ygf.pagerank.InLinksTopNReducer.java
License:Apache License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("inlinks.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    int[] inLinks = new int[topN.size()];
    String[] titles = new String[topN.size()];

    for (int i = inLinks.length - 1; i >= 0; i--) {
        Map.Entry<Integer, Integer> entry = topN.poll();
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        inLinks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < inLinks.length; i++) {
        context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
    }
}
From source file:com.github.ygf.pagerank.PageRankIterationMapper.java
License:Apache License
@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue, Context context)
        throws IOException, InterruptedException {

    // This task gets each block M_{i,j}, loads the corresponding stripe j
    // of the vector v_{k-1} and produces the partial result of the stripe i
    // of the vector v_k.

    Configuration conf = context.getConfiguration();
    int iter = Integer.parseInt(conf.get("pagerank.iteration"));
    int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
    short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

    Writable[] blockIndexes = inKey.get();
    short i = ((ShortWritable) blockIndexes[0]).get();
    short j = ((ShortWritable) blockIndexes[1]).get();

    int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vj = new FloatWritable[vjSize];

    if (iter == 1) {
        // Initial PageRank vector with 1/n for all pages.
        for (int k = 0; k < vj.length; k++) {
            vj[k] = new FloatWritable(1.0f / numPages);
        }
    } else {
        // Load the stripe j of the vector v_{k-1} from the MapFiles.
        Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
        Path vjDir = new Path(outputDir, "v" + (iter - 1));
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
        Partitioner<ShortWritable, FloatArrayWritable> partitioner =
                new HashPartitioner<ShortWritable, FloatArrayWritable>();
        ShortWritable key = new ShortWritable(j);
        FloatArrayWritable value = new FloatArrayWritable();
        MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        Writable[] writables = value.get();
        for (int k = 0; k < vj.length; k++) {
            vj[k] = (FloatWritable) writables[k];
        }
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }

    // Initialize the partial result i of the vector v_k.
    int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vi = new FloatWritable[viSize];
    for (int k = 0; k < vi.length; k++) {
        vi[k] = new FloatWritable(0);
    }

    // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
    // partial result i of the vector v_k.
    Writable[][] blockColumns = inValue.get();
    for (int k = 0; k < blockColumns.length; k++) {
        Writable[] blockColumn = blockColumns[k];
        if (blockColumn.length > 0) {
            int vDegree = ((ShortWritable) blockColumn[0]).get();
            for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
                int l = ((ShortWritable) blockColumn[columnIndex]).get();
                vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
            }
        }
    }

    context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}
From source file:com.github.ygf.pagerank.PageRankTopNReducer.java
License:Apache License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("pagerank.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    float[] pageRanks = new float[topN.size()];
    String[] titles = new String[topN.size()];

    // The order of the entries is reversed. The priority queue is in
    // non-decreasing order and we want the highest PageRank first.
    for (int i = pageRanks.length - 1; i >= 0; i--) {
        Map.Entry<Float, Integer> entry = topN.poll();
        // Get the title of the page from the title index.
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        pageRanks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < pageRanks.length; i++) {
        context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
    }
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
        BinaryExpression filterCondition, long splitMaxSize) throws IOException {

    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();

    if (filterCondition.getOpType() == OpType.OP_EQ) {
        // "leaf node"
        // handle cases like 'abcd' == column, column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }

        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);

        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue, searchedValue,
                part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);

        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
                getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
                context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }

    List<LongPairWritable> blocksLeft = getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs,
            splitMaxSize);
    List<LongPairWritable> blocksRight = getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs,
            splitMaxSize);

    if (filterCondition.getOpType() == OpType.OP_AND) {
        return andFilter(blocksLeft, blocksRight);
    } else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else {
        throw new IOException("not supported filter condition:" + filterCondition);
    }
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    parser.parse(val.toString());
    System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    FileSystem fs = path.getFileSystem(getConf());
    Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    parser.parse(val.toString());
    System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    FileSystem fs = path.getFileSystem(getConf());
    Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:org.apache.blur.utils.TableShardCountCollapserTest.java
License:Apache License
private void assertData(int totalShardCount) throws IOException {
    Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
    for (int i = 0; i < totalShardCount; i++) {
        HdfsDirectory directory = new HdfsDirectory(configuration, new Path(path, ShardUtil.getShardName(i)));
        DirectoryReader reader = DirectoryReader.open(directory);
        int numDocs = reader.numDocs();
        for (int d = 0; d < numDocs; d++) {
            Document document = reader.document(d);
            IndexableField field = document.getField("id");
            Integer id = (Integer) field.numericValue();
            int partition = partitioner.getPartition(new IntWritable(id), null, totalShardCount);
            assertEquals(i, partition);
        }
        reader.close();
    }
}