List of usage examples for org.apache.hadoop.mapreduce.Partitioner.getPartition
public abstract int getPartition(KEY key, VALUE value, int numPartitions);
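For context, Hadoop's default HashPartitioner implements this method as (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks, which guarantees a result in [0, numPartitions). Below is a minimal sketch of a custom implementation; FirstCharPartitioner and its keying rule are illustrative only and do not appear in the examples that follow.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route keys by their first character so that
// keys sharing a first character always go to the same reduce task.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.getLength() == 0) {
            return 0;
        }
        // Mask the sign bit so the result always lies in [0, numPartitions).
        return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
}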
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner,
        boolean useInmapCombiner, boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(f.getPath()));
        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:com.asakusafw.compiler.flow.stage.ShufflePartitionerEmitterTest.java
License:Apache License
/**
 * Simple case.
 * @throws Exception if an error occurs while emitting output
 */
@Test
public void simple() throws Exception {
    ShuffleModel analyzed = shuffle(CoGroupStage.class);
    ShufflePartitionerEmitter emitter = new ShufflePartitionerEmitter(environment);
    Name key = emitKey(analyzed);
    Name value = emitValue(analyzed);
    Name name = emitter.emit(analyzed, key, value);

    ClassLoader loader = start();
    @SuppressWarnings("unchecked")
    Partitioner<Object, Object> part = (Partitioner<Object, Object>) create(loader, name);
    SegmentedWritable k = (SegmentedWritable) create(loader, key);
    SegmentedWritable v = (SegmentedWritable) create(loader, value);

    List<Segment> segments = analyzed.getSegments();
    assertThat(segments.size(), is(2));
    Segment seg1 = segments.get(0);
    Segment seg2 = segments.get(1);
    assertThat(seg1.getTerms().size(), is(2));
    assertThat(seg2.getTerms().size(), is(2));

    Ex1 ex1 = new Ex1();
    ex1.setSid(1);
    ex1.setValue(100);
    ex1.setStringAsString("ex1");

    Ex2 ex2 = new Ex2();
    ex2.setSid(2);
    ex2.setValue(100);
    ex2.setStringAsString("ex2");

    int p01, p02;

    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    ex1.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));

    ex1.setValue(100);
    ex1.setSid(2);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex2.setStringAsString("ex3");
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex1.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));

    ex2.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex2.setValue(102);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));
}
From source file:com.asakusafw.runtime.io.util.ShuffleKeyTest.java
License:Apache License
/**
 * Partition testing.
 * @throws Exception if failed
 */
@SuppressWarnings("rawtypes")
@Test
public void partition() throws Exception {
    Partitioner<ShuffleKey, ?> part = new ShuffleKey.Partitioner();
    Mock o11 = new Mock("1", "1");
    Mock o12 = new Mock("1", "2");
    Mock o21 = new Mock("2", "1");
    Mock o22 = new Mock("2", "2");

    assertThat(part.getPartition(o11, null, 10), equalTo(part.getPartition(o11, null, 10)));
    assertThat(part.getPartition(o11, null, 10), equalTo(part.getPartition(o12, null, 10)));
    assertThat(part.getPartition(o21, null, 10), equalTo(part.getPartition(o22, null, 10)));

    Random random = new Random(12345);
    for (int i = 0; i < 100000; i++) {
        Mock mock = new Mock(String.valueOf(random.nextInt()), "left");
        int value = part.getPartition(mock, null, 10000);
        assertThat(value, is(greaterThanOrEqualTo(0)));
    }

    boolean found = false;
    for (int i = 0; i < 100000; i++) {
        Mock left = new Mock(String.valueOf(random.nextInt()), "left");
        Mock right = new Mock(String.valueOf(random.nextInt()), "right");
        if (left.getGroupObject().equals(right.getGroupObject())) {
            continue;
        }
        if (part.getPartition(left, null, 10000) != part.getPartition(right, null, 10000)) {
            found = true;
            break;
        }
    }
    assertThat(found, is(true));
}
From source file:com.pinterest.terrapin.hadoop.BaseUploader.java
License:Apache License
/**
 * Validates that the first non-empty partition hfile uses the right partitioning function.
 * It reads several keys, then calculates the partition according to the partitioning function
 * the client offers. If the calculated partition number differs from the actual partition
 * number, an exception is thrown. If all partition hfiles are empty, an exception is thrown.
 *
 * @param parts full absolute path for all partitions
 * @param partitionerType type of partitioning function
 * @param numShards total number of partitions
 * @throws IOException if something goes wrong when reading the hfiles
 * @throws IllegalArgumentException if the partitioner type is wrong or all partitions are empty
 */
public void validate(List<Path> parts, PartitionerType partitionerType, int numShards)
        throws IOException {
    boolean hasNonEmptyPartition = false;
    HColumnDescriptor columnDescriptor = new HColumnDescriptor();
    // Disable block cache to ensure it reads the actual file content.
    columnDescriptor.setBlockCacheEnabled(false);
    for (int shardIndex = 0; shardIndex < parts.size(); shardIndex++) {
        Path fileToBeValidated = parts.get(shardIndex);
        HFile.Reader reader = null;
        try {
            FileSystem fs = FileSystem.newInstance(fileToBeValidated.toUri(), conf);
            CacheConfig cc = new CacheConfig(conf, columnDescriptor);
            reader = HFile.createReader(fs, fileToBeValidated, cc);
            Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
            byte[] rowKey = reader.getFirstRowKey();
            if (rowKey == null) {
                LOG.warn(String.format("empty partition %s", fileToBeValidated.toString()));
                reader.close();
                continue;
            }
            hasNonEmptyPartition = true;
            BytesWritable key = new BytesWritable(rowKey);
            int partition = partitioner.getPartition(key, null, numShards);
            if (partition != shardIndex) {
                throw new IllegalArgumentException(
                        String.format("wrong partition type %s for key %s in partition %d, expected %d",
                                partitionerType.toString(), new String(key.getBytes()),
                                shardIndex, partition));
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
    if (!hasNonEmptyPartition) {
        throw new IllegalArgumentException("all partitions are empty");
    }
}
From source file:com.pinterest.terrapin.TerrapinUtil.java
License:Apache License
public static String getPartitionName(ByteBuffer key, PartitionerType partitionerType,
        int numPartitions) {
    Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
    return Integer.toString(partitioner.getPartition(
            new BytesWritable(BytesUtil.readBytesFromByteBufferWithoutConsume(key)),
            null, numPartitions));
}
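A hypothetical call site; the PartitionerType constant below is illustrative and may not match Terrapin's actual enum values:

// Map a row key to its partition name within a 16-shard fileset.
ByteBuffer key = ByteBuffer.wrap("row0001".getBytes());
String partitionName = TerrapinUtil.getPartitionName(key, PartitionerType.MODULUS, 16);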
From source file:com.pinterest.terrapin.tools.HFileGenerator.java
License:Apache License
/**
 * Generates hfiles for testing purposes.
 *
 * @param sourceFileSystem source file system
 * @param conf configuration for hfile
 * @param outputFolder output folder for generated hfiles
 * @param partitionerType partitioner type
 * @param numOfPartitions number of partitions
 * @param numOfKeys number of keys
 * @return list of generated hfiles
 * @throws IOException if hfile creation goes wrong
 */
public static List<Path> generateHFiles(FileSystem sourceFileSystem, Configuration conf,
        File outputFolder, PartitionerType partitionerType, int numOfPartitions, int numOfKeys)
        throws IOException {
    StoreFile.Writer[] writers = new StoreFile.Writer[numOfPartitions];
    for (int i = 0; i < numOfPartitions; i++) {
        writers[i] = new StoreFile.WriterBuilder(conf, new CacheConfig(conf), sourceFileSystem, 4096)
                .withFilePath(new Path(String.format("%s/%s", outputFolder.getAbsoluteFile(),
                        TerrapinUtil.formatPartitionName(i))))
                .withCompression(Compression.Algorithm.NONE)
                .build();
    }
    Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
    for (int i = 0; i < numOfKeys; i++) {
        byte[] key = String.format("%06d", i).getBytes();
        byte[] value;
        if (i <= 1) {
            value = "".getBytes();
        } else {
            value = ("v" + (i + 1)).getBytes();
        }
        KeyValue kv = new KeyValue(key, Bytes.toBytes("cf"), Bytes.toBytes(""), value);
        int partition = partitioner.getPartition(new BytesWritable(key),
                new BytesWritable(value), numOfPartitions);
        writers[partition].append(kv);
    }
    for (int i = 0; i < numOfPartitions; i++) {
        writers[i].close();
    }
    return Lists.transform(Lists.newArrayList(writers), new Function<StoreFile.Writer, Path>() {
        @Override
        public Path apply(StoreFile.Writer writer) {
            return writer.getPath();
        }
    });
}
From source file:com.skp.experiment.common.mapreduce.MapFileOutputFormat.java
License:Apache License
/** Gets an entry from output generated by this class. */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
        MapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value)
        throws IOException {
    int part = partitioner.getPartition(key, value, readers.length);
    return readers[part].get(key, value);
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }
    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    // vv LookupRecordsByTemperature-ReaderFragment
    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    // ^^ LookupRecordsByTemperature-ReaderFragment
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }
    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));
    FileSystem fs = path.getFileSystem(getConf());

    Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:kogiri.common.hadoop.io.format.map.BloomMapFileOutputFormat.java
License:Apache License
/**
 * Gets an entry from output generated by this class.
 */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
        BloomMapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value)
        throws IOException {
    int part = partitioner.getPartition(key, value, readers.length);
    return readers[part].get(key, value);
}
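A hypothetical lookup using this helper, in the spirit of the crunch.MaxTemperature examples above. The output path and key/value types are assumed, as is the existence of a getReaders helper analogous to MapFileOutputFormat's:

// Assumed setup: one BloomMapFile reader per part file under "out".
Configuration conf = new Configuration();
BloomMapFile.Reader[] readers = BloomMapFileOutputFormat.getReaders(new Path("out"), conf);
Partitioner<Text, IntWritable> partitioner = new HashPartitioner<Text, IntWritable>();
Text key = new Text("some-key");
IntWritable value = new IntWritable();
Writable entry = BloomMapFileOutputFormat.getEntry(readers, partitioner, key, value);
System.out.println(entry == null ? "not found" : key + " -> " + value);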