Example usage for org.apache.hadoop.mapreduce Partitioner getPartition

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Partitioner getPartition.

Prototype

public abstract int getPartition(KEY key, VALUE value, int numPartitions);

Document

Get the partition number for a given key (hence record) given the total number of partitions, i.e. the number of reduce tasks for the job. Typically a hash function on all or a subset of the key.
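
A minimal sketch of a custom Partitioner is shown below, assuming an IntWritable key and a Text value; the class name and the modulo-on-the-key strategy are illustrative and not taken from any of the examples that follow.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes each record to a reduce task based on the key alone, so records
// with equal keys always end up in the same partition.
public class ModuloPartitioner extends Partitioner<IntWritable, Text> {
    @Override
    public int getPartition(IntWritable key, Text value, int numPartitions) {
        // Mask the sign bit so the result always falls in [0, numPartitions).
        return (key.get() & Integer.MAX_VALUE) % numPartitions;
    }
}

Such a class would typically be registered on a job with job.setPartitionerClass(ModuloPartitioner.class), as the first example below does for its RangePartitioner.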

Usage

From source file:RunPageRankSchimmy.java

License:Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;

    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file:com.asakusafw.compiler.flow.stage.ShufflePartitionerEmitterTest.java

License:Apache License

/**
 * simple case.
 * @throws Exception if an error occurred during output
 */
@Test
public void simple() throws Exception {
    ShuffleModel analyzed = shuffle(CoGroupStage.class);
    ShufflePartitionerEmitter emitter = new ShufflePartitionerEmitter(environment);
    Name key = emitKey(analyzed);
    Name value = emitValue(analyzed);
    Name name = emitter.emit(analyzed, key, value);

    ClassLoader loader = start();
    @SuppressWarnings("unchecked")
    Partitioner<Object, Object> part = (Partitioner<Object, Object>) create(loader, name);

    SegmentedWritable k = (SegmentedWritable) create(loader, key);
    SegmentedWritable v = (SegmentedWritable) create(loader, value);

    List<Segment> segments = analyzed.getSegments();
    assertThat(segments.size(), is(2));

    Segment seg1 = segments.get(0);
    Segment seg2 = segments.get(1);
    assertThat(seg1.getTerms().size(), is(2));
    assertThat(seg2.getTerms().size(), is(2));

    Ex1 ex1 = new Ex1();
    ex1.setSid(1);
    ex1.setValue(100);
    ex1.setStringAsString("ex1");

    Ex2 ex2 = new Ex2();
    ex2.setSid(2);
    ex2.setValue(100);
    ex2.setStringAsString("ex2");

    int p01, p02;

    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    ex1.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));

    ex1.setValue(100);
    ex1.setSid(2);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex2.setStringAsString("ex3");
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex1.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));

    ex2.setValue(101);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, is(p02));

    ex2.setValue(102);
    setShuffleKeyValue(seg1, k, v, ex1);
    p01 = part.getPartition(k, v, 100);
    setShuffleKeyValue(seg2, k, v, ex2);
    p02 = part.getPartition(k, v, 100);
    assertThat(p01, not(p02));
}

From source file:com.asakusafw.runtime.io.util.ShuffleKeyTest.java

License:Apache License

/**
 * partition testing.
 * @throws Exception if failed
 */
@SuppressWarnings("rawtypes")
@Test
public void partition() throws Exception {
    Partitioner<ShuffleKey, ?> part = new ShuffleKey.Partitioner();
    Mock o11 = new Mock("1", "1");
    Mock o12 = new Mock("1", "2");
    Mock o21 = new Mock("2", "1");
    Mock o22 = new Mock("2", "2");

    assertThat(part.getPartition(o11, null, 10), equalTo(part.getPartition(o11, null, 10)));
    assertThat(part.getPartition(o11, null, 10), equalTo(part.getPartition(o12, null, 10)));
    assertThat(part.getPartition(o21, null, 10), equalTo(part.getPartition(o22, null, 10)));

    Random random = new Random(12345);
    for (int i = 0; i < 100000; i++) {
        Mock mock = new Mock(String.valueOf(random.nextInt()), "left");
        int value = part.getPartition(mock, null, 10000);
        assertThat(value, is(greaterThanOrEqualTo(0)));
    }
    boolean found = false;
    for (int i = 0; i < 100000; i++) {
        Mock left = new Mock(String.valueOf(random.nextInt()), "left");
        Mock right = new Mock(String.valueOf(random.nextInt()), "right");
        if (left.getGroupObject().equals(right.getGroupObject())) {
            continue;
        }
        if (part.getPartition(left, null, 10000) != part.getPartition(right, null, 10000)) {
            found = true;
            break;
        }
    }
    assertThat(found, is(true));
}

From source file:com.pinterest.terrapin.hadoop.BaseUploader.java

License:Apache License

/**
 * Validates that the first non-empty partition hfile uses the right partitioning function.
 * It reads several keys, then calculates the partition according to the partitioning function
 * the client offers. If the calculated partition number is different from the actual partition
 * number, an exception is thrown. If all partition hfiles are empty, an exception is thrown.
 *
 * @param parts full absolute path for all partitions
 * @param partitionerType type of partitioning function
 * @param numShards total number of partitions
 * @throws IOException if something goes wrong when reading the hfiles
 * @throws IllegalArgumentException if the partitioner type is wrong or all partitions are empty
 */
public void validate(List<Path> parts, PartitionerType partitionerType, int numShards) throws IOException {
    boolean hasNonEmptyPartition = false;
    HColumnDescriptor columnDescriptor = new HColumnDescriptor();
    // Disable block cache to ensure it reads the actual file content.
    columnDescriptor.setBlockCacheEnabled(false);
    for (int shardIndex = 0; shardIndex < parts.size(); shardIndex++) {
        Path fileToBeValidated = parts.get(shardIndex);
        HFile.Reader reader = null;
        try {
            FileSystem fs = FileSystem.newInstance(fileToBeValidated.toUri(), conf);
            CacheConfig cc = new CacheConfig(conf, columnDescriptor);
            reader = HFile.createReader(fs, fileToBeValidated, cc);
            Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
            byte[] rowKey = reader.getFirstRowKey();
            if (rowKey == null) {
                LOG.warn(String.format("empty partition %s", fileToBeValidated.toString()));
                reader.close();
                continue;
            }
            hasNonEmptyPartition = true;
            BytesWritable key = new BytesWritable(rowKey);
            int partition = partitioner.getPartition(key, null, numShards);
            if (partition != shardIndex) {
                throw new IllegalArgumentException(
                        String.format("wrong partition type %s for key %s in partition %d, expected %d",
                                partitionerType.toString(), new String(key.getBytes()), shardIndex, partition));
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
    if (!hasNonEmptyPartition) {
        throw new IllegalArgumentException("all partitions are empty");
    }
}

From source file:com.pinterest.terrapin.TerrapinUtil.java

License:Apache License

public static String getPartitionName(ByteBuffer key, PartitionerType partitionerType, int numPartitions) {
    Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
    return Integer.toString(partitioner.getPartition(
            new BytesWritable(BytesUtil.readBytesFromByteBufferWithoutConsume(key)), null, numPartitions));
}

From source file:com.pinterest.terrapin.tools.HFileGenerator.java

License:Apache License

/**
 * Generate hfiles for testing purposes.
 *
 * @param sourceFileSystem source file system
 * @param conf configuration for hfile
 * @param outputFolder output folder for generated hfiles
 * @param partitionerType partitioner type
 * @param numOfPartitions number of partitions
 * @param numOfKeys number of keys
 * @return list of generated hfiles
 * @throws IOException if hfile creation goes wrong
 */
public static List<Path> generateHFiles(FileSystem sourceFileSystem, Configuration conf, File outputFolder,
        PartitionerType partitionerType, int numOfPartitions, int numOfKeys) throws IOException {
    StoreFile.Writer[] writers = new StoreFile.Writer[numOfPartitions];
    for (int i = 0; i < numOfPartitions; i++) {
        writers[i] = new StoreFile.WriterBuilder(conf, new CacheConfig(conf), sourceFileSystem, 4096)
                .withFilePath(new Path(String.format("%s/%s", outputFolder.getAbsoluteFile(),
                        TerrapinUtil.formatPartitionName(i))))
                .withCompression(Compression.Algorithm.NONE).build();
    }
    Partitioner partitioner = PartitionerFactory.getPartitioner(partitionerType);
    for (int i = 0; i < numOfKeys; i++) {
        byte[] key = String.format("%06d", i).getBytes();
        byte[] value;
        if (i <= 1) {
            value = "".getBytes();
        } else {
            value = ("v" + (i + 1)).getBytes();
        }
        KeyValue kv = new KeyValue(key, Bytes.toBytes("cf"), Bytes.toBytes(""), value);
        int partition = partitioner.getPartition(new BytesWritable(key), new BytesWritable(value),
                numOfPartitions);
        writers[partition].append(kv);
    }
    for (int i = 0; i < numOfPartitions; i++) {
        writers[i].close();
    }
    return Lists.transform(Lists.newArrayList(writers), new Function<StoreFile.Writer, Path>() {
        @Override
        public Path apply(StoreFile.Writer writer) {
            return writer.getPath();
        }
    });
}

From source file:com.skp.experiment.common.mapreduce.MapFileOutputFormat.java

License:Apache License

/** Get an entry from output generated by this class. */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(MapFile.Reader[] readers,
        Partitioner<K, V> partitioner, K key, V value) throws IOException {
    int part = partitioner.getPartition(key, value, readers.length);
    return readers[part].get(key, value);
}
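
A possible call site for the helper above, as a sketch: the output path and key are assumptions, and it presumes this MapFileOutputFormat also provides a getReaders(Path, Configuration) helper like the Hadoop lib class used in the crunch.MaxTemperature examples below.

    Configuration conf = new Configuration();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(new Path("/data/output"), conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable key = new IntWritable(42);
    Text value = new Text();
    // Picks the reader for the partition the key hashes to, then looks the key up;
    // getEntry returns null when the key is not present in that map file.
    Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, value);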

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));

        Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();

        // vv LookupRecordsByTemperature-ReaderFragment
        Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
        // ^^ LookupRecordsByTemperature-ReaderFragment
        Writable entry = reader.get(key, val);
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        IntWritable nextKey = new IntWritable();
        do {
            parser.parse(val.toString());
            System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        } while (reader.next(nextKey, val) && key.equals(nextKey));
        return 0;
    }

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));
        FileSystem fs = path.getFileSystem(getConf());

        Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();

        Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
        Writable entry = reader.get(key, val);
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        IntWritable nextKey = new IntWritable();
        do {
            parser.parse(val.toString());
            System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        } while (reader.next(nextKey, val) && key.equals(nextKey));
        return 0;
    }

From source file:kogiri.common.hadoop.io.format.map.BloomMapFileOutputFormat.java

License:Apache License

/**
 * Get an entry from output generated by this class.
 */
public static <K extends WritableComparable<?>, V extends Writable> Writable getEntry(
        BloomMapFile.Reader[] readers, Partitioner<K, V> partitioner, K key, V value) throws IOException {
    int part = partitioner.getPartition(key, value, readers.length);
    return readers[part].get(key, value);
}