List of usage examples for org.apache.hadoop.mapreduce.Partitioner.getPartition
public abstract int getPartition(KEY key, VALUE value, int numPartitions);
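For context, Hadoop's default HashPartitioner implements this method as (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks, which guarantees a result in [0, numPartitions). Below is a minimal sketch of a custom implementation; FirstCharPartitioner and its keying rule are illustrative only and do not appear in the examples that follow.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route keys by their first character so that
// keys sharing a first character always go to the same reduce task.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.getLength() == 0) {
            return 0;
        }
        // Mask the sign bit so the result always lies in [0, numPartitions).
        return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
}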
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner,
        boolean useInmapCombiner, boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(f.getPath()));
        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:com.asakusafw.compiler.flow.stage.ShufflePartitionerEmitterTest.java
License:Apache License
/**
 * Simple case.
 * @throws Exception if an error occurs while emitting output
 */
@Test
public void simple() throws Exception {
    ShuffleModel analyzed = shuffle(CoGroupStage.class);
    ShufflePartitionerEmitter emitter = new ShufflePartitionerEmitter(environment);
    Name key = emitKey(analyzed);
    Name value = emitValue(analyzed);
    Name name = emitter.emit(analyzed, key, value);

    ClassLoader loader = start();
    @SuppressWarnings("unchecked")
    Partitioner<Object, Object> part = (Partitioner<Object, Object>) create(loader, name);
    SegmentedWritable k = (SegmentedWritable) create(loader, key);
    SegmentedWritable v = (SegmentedWritable) create(loader, value);

    List<Segment> segments = analyzed.getSegments();
    assertThat(segments.size(), is(2));
    Segment seg1 = segments.get(0);
    Segment seg2 = segments.get(1);
    assertThat(seg1.getTerms().size(), is(2));
    assertThat(seg2.getTerms().size(), is(2));

    Ex1 ex1 = new Ex1();
    ex1.setSid(1);
    ex1.setValue(100);
    ex1.setStringAsString("ex1");

    Ex2 ex2 = new Ex2();
    ex2.setSid(2);
    ex2.setValue(100);
    ex2.setStringAsString("ex2");

    int p01, p02;

    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    ex1.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));

    ex1.setValue(100);
    ex1.setSid(2);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex2.setStringAsString("ex3");
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex1.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));

    ex2.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex2.setValue(102);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));
}
From source file:com.asakusafw.runtime.io.util.ShuffleKeyTest.java
License:Apache License
/**
 * Partition testing.
 * @throws Exception if failed
 */
@SuppressWarnings("rawtypes")
@Test
public void partition() throws Exception {
    Partitioner<ShuffleKey, ?> part = new ShuffleKey.Partitioner();
    Mock o11 = new Mock("1", "1");
    Mock o12 = new Mock("1", "2");
    Mock o21 = new Mock("2", "1");
    Mock o22 = new Mock("2", "2");

    assertThat(part.getPartition(o11, null, 10), equalTo(part.getPartition(o11, null, 10)));
    assertThat(part.getPartition(o11, null, 10), equalTo(part.getPartition(o12, null, 10)));
    assertThat(part.getPartition(o21, null, 10), equalTo(part.getPartition(o22, null, 10)));

    Random random = new Random(12345);
    for (int i = 0; i < 100000; i++) {
        Mock mock = new Mock(String.valueOf(random.nextInt()), "left");
        int value = part.getPartition(mock, null, 10000);
        assertThat(value, is(greaterThanOrEqualTo(0)));
    }

    boolean found = false;
    for (int i = 0; i < 100000; i++) {
        Mock left = new Mock(String.valueOf(random.nextInt()), "left");
        Mock right = new Mock(String.valueOf(random.nextInt()), "right");
        if (left.getGroupObject().equals(right.getGroupObject())) {
            continue;
        }
        if (part.getPartition(left, null, 10000) != part.getPartition(right, null, 10000)) {
            found = true;
            break;
        }
    }
    assertThat(found, is(true));
}
From source file:com.pinterest.terrapin.hadoop.BaseUploader.java
License:Apache License
/**
 * Validates that the first non-empty partition hfile uses the right partitioning function.
 * It reads several keys, then calculates the partition according to the partitioning function
 * the client offers. If the calculated partition number differs from the actual partition
 * number, an exception is thrown. If all partition hfiles are empty, an exception is thrown.
 *
 * @param parts full absolute path for all partitions
 * @param partitionerType type of partitioning function
 * @param numShards total number of partitions
 * @throws IOException if something goes wrong when reading the hfiles
 * @throws IllegalArgumentException if the partitioner type is wrong or all partitions are empty
 */
public void validate(List<Path> parts, PartitionerType partitionerType, int numShards)
        throws IOException {
    boolean hasNonEmptyPartition = false;
    HColumnDescriptor columnDescriptor = new HColumnDescriptor();
    // Disable block cache to ensure it reads the actual file content.
    columnDescriptor.setBlockCacheEnabled(false);
    for (int shardIndex = 0; shardIndex < parts.size(); shardIndex++) {
        Path fileToBeValidated = parts.get(shardIndex);
        HFile.Reader reader = null;
        try {
            FileSystem fs = FileSystem.newInstance(fileToBeValidated.toUri(), conf);
            CacheConfig cc = new CacheConfig(conf, columnDescriptor);
            reader = HFile.createReader(fs, fileToBeValidated, cc);
            Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
            byte[] rowKey = reader.getFirstRowKey();
            if (rowKey == null) {
                LOG.warn(String.format("empty partition %s", fileToBeValidated.toString()));
                reader.close();
                continue;
            }
            hasNonEmptyPartition = true;
            BytesWritable key = new BytesWritable(rowKey);
            int partition = partitioner.getPartition(key, null, numShards);
            if (partition != shardIndex) {
                throw new IllegalArgumentException(
                        String.format("wrong partition type %s for key %s in partition %d, expected %d",
                                partitionerType.toString(), new String(key.getBytes()),
                                shardIndex, partition));
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
    if (!hasNonEmptyPartition) {
        throw new IllegalArgumentException("all partitions are empty");
    }
}
From source file:com.pinterest.terrapin.TerrapinUtil.java
License:Apache License
public static String getPartitionName(ByteBuffer key, PartitionerType partitionerType,
        int numPartitions) {
    Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
    return Integer.toString(partitioner.getPartition(
            new BytesWritable(BytesUtil.readBytesFromByteBufferWithoutConsume(key)),
            null, numPartitions));
}
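A hypothetical call site; the PartitionerType constant below is illustrative and may not match Terrapin's actual enum values:

// Map a row key to its partition name within a 16-shard fileset.
ByteBuffer key = ByteBuffer.wrap("row0001".getBytes());
String partitionName = TerrapinUtil.getPartitionName(key, PartitionerType.MODULUS, 16);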
From source file:com.pinterest.terrapin.tools.HFileGenerator.java
License:Apache License
/**
 * Generates hfiles for testing purposes.
 *
 * @param sourceFileSystem source file system
 * @param conf configuration for hfile
 * @param outputFolder output folder for generated hfiles
 * @param partitionerType partitioner type
 * @param numOfPartitions number of partitions
 * @param numOfKeys number of keys
 * @return list of generated hfiles
 * @throws IOException if hfile creation goes wrong
 */
public static List<Path> generateHFiles(FileSystem sourceFileSystem, Configuration conf,
        File outputFolder, PartitionerType partitionerType, int numOfPartitions, int numOfKeys)
        throws IOException {
    StoreFile.Writer[] writers = new StoreFile.Writer[numOfPartitions];
    for (int i = 0; i < numOfPartitions; i++) {
        writers[i] = new StoreFile.WriterBuilder(conf, new CacheConfig(conf), sourceFileSystem, 4096)
                .withFilePath(new Path(String.format("%s/%s", outputFolder.getAbsoluteFile(),
                        TerrapinUtil.formatPartitionName(i))))
                .withCompression(Compression.Algorithm.NONE)
                .build();
    }
    Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
    for (int i = 0; i < numOfKeys; i++) {
        byte[] key = String.format("%06d", i).getBytes();
        byte[] value;
        if (i <= 1) {
            value = "".getBytes();
        } else {
            value = ("v" + (i + 1)).getBytes();
        }
        KeyValue kv = new KeyValue(key, Bytes.toBytes("cf"), Bytes.toBytes(""), value);
        int partition = partitioner.getPartition(new BytesWritable(key),
                new BytesWritable(value), numOfPartitions);
        writers[partition].append(kv);
    }
    for (int i = 0; i < numOfPartitions; i++) {
        writers[i].close();
    }
    return Lists.transform(Lists.newArrayList(writers), new Function<StoreFile.Writer, Path>() {
        @Override
        public Path apply(StoreFile.Writer writer) {
            return writer.getPath();
        }
    });
}
From source file:com.skp.experiment.common.mapreduce.MapFileOutputFormat.java
License:Apache License
/** Gets an entry from output generated by this class. */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
        MapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value)
        throws IOException {
    int part = partitioner.getPartition(key, value, readers.length);
    return readers[part].get(key, value);
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }
    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    // vv LookupRecordsByTemperature-ReaderFragment
    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    // ^^ LookupRecordsByTemperature-ReaderFragment
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }
    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));
    FileSystem fs = path.getFileSystem(getConf());

    Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:kogiri.common.hadoop.io.format.map.BloomMapFileOutputFormat.java
License:Apache License
/**
 * Gets an entry from output generated by this class.
 */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
        BloomMapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value)
        throws IOException {
    int part = partitioner.getPartition(key, value, readers.length);
    return readers[part].get(key, value);
}
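A hypothetical lookup using this helper, in the spirit of the crunch.MaxTemperature examples above. The output path and key/value types are assumed, as is the existence of a getReaders helper analogous to MapFileOutputFormat's:

// Assumed setup: one BloomMapFile reader per part file under "out".
Configuration conf = new Configuration();
BloomMapFile.Reader[] readers = BloomMapFileOutputFormat.getReaders(new Path("out"), conf);
Partitioner<Text, IntWritable> partitioner = new HashPartitioner<Text, IntWritable>();
Text key = new Text("some-key");
IntWritable value = new IntWritable();
Writable entry = BloomMapFileOutputFormat.getEntry(readers, partitioner, key, value);
System.out.println(entry == null ? "not found" : key + " -> " + value);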