List of usage examples for org.apache.hadoop.mapreduce.lib.partition.HashPartitioner
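A common thread in the examples below: because HashPartitioner is stateless and deterministic (it returns (key.hashCode() & Integer.MAX_VALUE) % numPartitions), it can be instantiated outside a running job to recompute which partition, and therefore which output part file, holds a given key. A minimal sketch of that idea, assuming standard Hadoop types (the class name and values here are illustrative only, not taken from the examples):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class PartitionLookupSketch {
    public static void main(String[] args) {
        // HashPartitioner needs no configuration, so it can be created directly.
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();

        IntWritable key = new IntWritable(42);   // illustrative key
        int numPartitions = 8;                   // e.g. the number of part-r-* files

        // Same computation a job with 8 reducers would have used at write time,
        // so the result identifies the part file that contains this key.
        int partition = partitioner.getPartition(key, new Text(), numPartitions);
        System.out.println("key " + key + " -> partition " + partition);
    }
}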
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);

    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:com.github.ygf.pagerank.InLinksTopNReducer.java
License:Apache License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("inlinks.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    int[] inLinks = new int[topN.size()];
    String[] titles = new String[topN.size()];

    for (int i = inLinks.length - 1; i >= 0; i--) {
        Map.Entry<Integer, Integer> entry = topN.poll();
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        inLinks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < inLinks.length; i++) {
        context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
    }
}
From source file:com.github.ygf.pagerank.PageRankIterationMapper.java
License:Apache License
@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue, Context context)
        throws IOException, InterruptedException {

    // This task gets each block M_{i,j}, loads the corresponding stripe j
    // of the vector v_{k-1} and produces the partial result of the stripe i
    // of the vector v_k.

    Configuration conf = context.getConfiguration();
    int iter = Integer.parseInt(conf.get("pagerank.iteration"));
    int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
    short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

    Writable[] blockIndexes = inKey.get();
    short i = ((ShortWritable) blockIndexes[0]).get();
    short j = ((ShortWritable) blockIndexes[1]).get();

    int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vj = new FloatWritable[vjSize];

    if (iter == 1) {
        // Initial PageRank vector with 1/n for all pages.
        for (int k = 0; k < vj.length; k++) {
            vj[k] = new FloatWritable(1.0f / numPages);
        }
    } else {
        // Load the stripe j of the vector v_{k-1} from the MapFiles.
        Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
        Path vjDir = new Path(outputDir, "v" + (iter - 1));
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
        Partitioner<ShortWritable, FloatArrayWritable> partitioner =
                new HashPartitioner<ShortWritable, FloatArrayWritable>();
        ShortWritable key = new ShortWritable(j);
        FloatArrayWritable value = new FloatArrayWritable();
        MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        Writable[] writables = value.get();
        for (int k = 0; k < vj.length; k++) {
            vj[k] = (FloatWritable) writables[k];
        }
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }

    // Initialize the partial result i of the vector v_k.
    int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vi = new FloatWritable[viSize];
    for (int k = 0; k < vi.length; k++) {
        vi[k] = new FloatWritable(0);
    }

    // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
    // partial result i of the vector v_k.
    Writable[][] blockColumns = inValue.get();
    for (int k = 0; k < blockColumns.length; k++) {
        Writable[] blockColumn = blockColumns[k];
        if (blockColumn.length > 0) {
            int vDegree = ((ShortWritable) blockColumn[0]).get();
            for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
                int l = ((ShortWritable) blockColumn[columnIndex]).get();
                vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
            }
        }
    }

    context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}
From source file:com.github.ygf.pagerank.PageRankTopNReducer.java
License:Apache License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("pagerank.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    float[] pageRanks = new float[topN.size()];
    String[] titles = new String[topN.size()];

    // The order of the entries is reversed. The priority queue is in
    // non-decreasing order and we want the highest PageRank first.
    for (int i = pageRanks.length - 1; i >= 0; i--) {
        Map.Entry<Float, Integer> entry = topN.poll();
        // Get the title of the page from the title index.
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        pageRanks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < pageRanks.length; i++) {
        context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
    }
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
        BinaryExpression filterCondition, long splitMaxSize) throws IOException {

    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();

    if (filterCondition.getOpType() == OpType.OP_EQ) {
        // "leaf node"
        // handle cases like 'abcd' == column, column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }

        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);

        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue, searchedValue,
                part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);

        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
                getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
                context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }

    List<LongPairWritable> blocksLeft = getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs,
            splitMaxSize);
    List<LongPairWritable> blocksRight = getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs,
            splitMaxSize);

    if (filterCondition.getOpType() == OpType.OP_AND) {
        return andFilter(blocksLeft, blocksRight);
    } else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else {
        throw new IOException("not supported filter condition:" + filterCondition);
    }
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    parser.parse(val.toString());
    System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    FileSystem fs = path.getFileSystem(getConf());
    Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    parser.parse(val.toString());
    System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    return 0;
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        JobBuilder.printUsage(this, "<path> <key>");
        return -1;
    }

    Path path = new Path(args[0]);
    IntWritable key = new IntWritable(Integer.parseInt(args[1]));

    FileSystem fs = path.getFileSystem(getConf());
    Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    Text val = new Text();

    Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
    Writable entry = reader.get(key, val);
    if (entry == null) {
        System.err.println("Key not found: " + key);
        return -1;
    }
    NcdcRecordParser parser = new NcdcRecordParser();
    IntWritable nextKey = new IntWritable();
    do {
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
    } while (reader.next(nextKey, val) && key.equals(nextKey));
    return 0;
}
From source file:org.apache.blur.utils.TableShardCountCollapserTest.java
License:Apache License
private void assertData(int totalShardCount) throws IOException {
    Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
    for (int i = 0; i < totalShardCount; i++) {
        HdfsDirectory directory = new HdfsDirectory(configuration, new Path(path, ShardUtil.getShardName(i)));
        DirectoryReader reader = DirectoryReader.open(directory);
        int numDocs = reader.numDocs();
        for (int d = 0; d < numDocs; d++) {
            Document document = reader.document(d);
            IndexableField field = document.getField("id");
            Integer id = (Integer) field.numericValue();
            int partition = partitioner.getPartition(new IntWritable(id), null, totalShardCount);
            assertEquals(i, partition);
        }
        reader.close();
    }
}