Example usage for org.apache.hadoop.mapreduce.lib.partition.HashPartitioner: the HashPartitioner() constructor

Introduction

On this page you can find example usage for the org.apache.hadoop.mapreduce.lib.partition.HashPartitioner constructor, HashPartitioner().
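
Before the examples, here is a minimal, self-contained sketch of the constructor in use. It is not taken from any of the source files listed below (the class name HashPartitionerSketch and the sample key are made up for illustration); it simply constructs a HashPartitioner and asks it which reduce partition a key would be sent to:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class HashPartitionerSketch {
    public static void main(String[] args) {
        // The no-argument constructor is the only one; the partitioner keeps no state.
        HashPartitioner<Text, IntWritable> partitioner = new HashPartitioner<Text, IntWritable>();

        // getPartition(key, value, numReduceTasks) returns an index in [0, numReduceTasks),
        // derived from the key's hashCode(); the value is not used.
        Text key = new Text("some-key");
        IntWritable value = new IntWritable(42);
        int partition = partitioner.getPartition(key, value, 8);

        System.out.println(key + " -> partition " + partition);
    }
}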

Prototype

HashPartitioner()

Usage

From source file:RunPageRankSchimmy.java

License:Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;

    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and the partition a file actually contains (i.e.,
    // key.hash % #reducers) is arbitrary, so we have to open each partition
    // and peek inside to find out which one it holds.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
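
Note that in this phase the HashPartitioner is never installed on the job itself: when useRange is false the job simply relies on the framework default, which is hash partitioning, and the instance created here is used on the client to peek at the first key of each part file and work out which logical partition that file holds. The resulting partition-to-file mapping is passed to the tasks through the PartitionMapping property.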

From source file:com.github.ygf.pagerank.InLinksTopNReducer.java

License:Apache License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("inlinks.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    int[] inLinks = new int[topN.size()];
    String[] titles = new String[topN.size()];

    for (int i = inLinks.length - 1; i >= 0; i--) {
        Map.Entry<Integer, Integer> entry = topN.poll();
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        inLinks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < inLinks.length; i++) {
        context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
    }
}
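
The partitioner here mirrors the one used when the MapFile output was written: MapFileOutputFormat.getEntry calls partitioner.getPartition(page, title, readers.length) to pick the single part file that can contain the key, and only that reader is consulted for the lookup.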

From source file:com.github.ygf.pagerank.PageRankIterationMapper.java

License:Apache License

@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue, Context context)
        throws IOException, InterruptedException {

    // This task gets each block M_{i,j}, loads the corresponding stripe j
    // of the vector v_{k-1} and produces the partial result of the stripe i
    // of the vector v_k.

    Configuration conf = context.getConfiguration();
    int iter = Integer.parseInt(conf.get("pagerank.iteration"));
    int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
    short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

    Writable[] blockIndexes = inKey.get();
    short i = ((ShortWritable) blockIndexes[0]).get();
    short j = ((ShortWritable) blockIndexes[1]).get();

    int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vj = new FloatWritable[vjSize];

    if (iter == 1) {
        // Initial PageRank vector with 1/n for all pages.
        for (int k = 0; k < vj.length; k++) {
            vj[k] = new FloatWritable(1.0f / numPages);
        }
    } else {
        // Load the stripe j of the vector v_{k-1} from the MapFiles.
        Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
        Path vjDir = new Path(outputDir, "v" + (iter - 1));
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
        Partitioner<ShortWritable, FloatArrayWritable> partitioner = new HashPartitioner<ShortWritable, FloatArrayWritable>();
        ShortWritable key = new ShortWritable(j);
        FloatArrayWritable value = new FloatArrayWritable();
        MapFileOutputFormat.getEntry(readers, partitioner, key, value);
        Writable[] writables = value.get();
        for (int k = 0; k < vj.length; k++) {
            vj[k] = (FloatWritable) writables[k];
        }
        for (MapFile.Reader reader : readers) {
            reader.close();
        }
    }

    // Initialize the partial result i of the vector v_k.
    int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
    FloatWritable[] vi = new FloatWritable[viSize];
    for (int k = 0; k < vi.length; k++) {
        vi[k] = new FloatWritable(0);
    }

    // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
    // partial result i of the vector v_k.
    Writable[][] blockColumns = inValue.get();
    for (int k = 0; k < blockColumns.length; k++) {
        Writable[] blockColumn = blockColumns[k];
        if (blockColumn.length > 0) {
            int vDegree = ((ShortWritable) blockColumn[0]).get();
            for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
                int l = ((ShortWritable) blockColumn[columnIndex]).get();
                vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
            }
        }
    }

    context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}

From source file:com.github.ygf.pagerank.PageRankTopNReducer.java

License:Apache License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    Path titlesDir = new Path(conf.get("pagerank.titles_dir"));

    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
    Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
    IntWritable page = new IntWritable();
    Text title = new Text();

    float[] pageRanks = new float[topN.size()];
    String[] titles = new String[topN.size()];

    // The order of the entries is reversed. The priority queue is in
    // non-decreasing order and we want the highest PageRank first.
    for (int i = pageRanks.length - 1; i >= 0; i--) {
        Map.Entry<Float, Integer> entry = topN.poll();
        // Get the title of the page from the title index.
        page.set(entry.getValue());
        MapFileOutputFormat.getEntry(readers, partitioner, page, title);
        pageRanks[i] = entry.getKey();
        titles[i] = title.toString();
    }

    for (MapFile.Reader reader : readers) {
        reader.close();
    }

    for (int i = 0; i < pageRanks.length; i++) {
        context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
    }
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
        BinaryExpression filterCondition, long splitMaxSize) throws IOException {

    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();

    if (filterCondition.getOpType() == OpType.OP_EQ) { // "leaf node"
        // handle both 'abcd' == column and column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }
        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);

        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue, searchedValue,
                part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
                getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
                context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }

    List<LongPairWritable> blocksLeft = getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs,
            splitMaxSize);
    List<LongPairWritable> blocksRight = getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs,
            splitMaxSize);

    if (filterCondition.getOpType() == OpType.OP_AND)
        return andFilter(blocksLeft, blocksRight);
    else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else
        throw new IOException("not supported filter condition:" + filterCondition);
}
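
This example bypasses MapFileOutputFormat.getEntry and reconstructs the reducer output file name itself: a HashPartitioner maps the searched value to a partition number, which is formatted as part-r-%05d and opened as a MapFile. This only finds the right index file if the job that wrote the index partitioned its keys the same way, i.e. with the default hash partitioner.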

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));

        Reader[] readers = /*[*/MapFileOutputFormat.getReaders(path, getConf())/*]*/;
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();
        Writable entry = /*[*/MapFileOutputFormat.getEntry(readers, partitioner, key, val)/*]*/;
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        return 0;
    }

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));

        Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();

        // vv LookupRecordsByTemperature-ReaderFragment
        Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
        // ^^ LookupRecordsByTemperature-ReaderFragment
        Writable entry = reader.get(key, val);
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        IntWritable nextKey = new IntWritable();
        do {
            parser.parse(val.toString());
            System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        } while (reader.next(nextKey, val) && key.equals(nextKey));
        return 0;
    }
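
Unlike the getEntry variant above, this version indexes the reader array directly with partitioner.getPartition and then keeps calling reader.next, so it prints every record stored under the requested key rather than just the first one.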

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));
        FileSystem fs = path.getFileSystem(getConf());

        Reader[] readers = /*[*/MapFileOutputFormat.getReaders(fs, path, getConf())/*]*/;
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();
        Writable entry = /*[*/MapFileOutputFormat.getEntry(readers, partitioner, key, val)/*]*/;
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        parser.parse(val.toString());
        System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        return 0;
    }

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            JobBuilder.printUsage(this, "<path> <key>");
            return -1;
        }
        Path path = new Path(args[0]);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));
        FileSystem fs = path.getFileSystem(getConf());

        Reader[] readers = MapFileOutputFormat.getReaders(fs, path, getConf());
        Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
        Text val = new Text();

        Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
        Writable entry = reader.get(key, val);
        if (entry == null) {
            System.err.println("Key not found: " + key);
            return -1;
        }
        NcdcRecordParser parser = new NcdcRecordParser();
        IntWritable nextKey = new IntWritable();
        do {
            parser.parse(val.toString());
            System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
        } while (reader.next(nextKey, val) && key.equals(nextKey));
        return 0;
    }

From source file:org.apache.blur.utils.TableShardCountCollapserTest.java

License:Apache License

private void assertData(int totalShardCount) throws IOException {
    Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
    for (int i = 0; i < totalShardCount; i++) {
        HdfsDirectory directory = new HdfsDirectory(configuration, new Path(path, ShardUtil.getShardName(i)));
        DirectoryReader reader = DirectoryReader.open(directory);
        int numDocs = reader.numDocs();
        for (int d = 0; d < numDocs; d++) {
            Document document = reader.document(d);
            IndexableField field = document.getField("id");
            Integer id = (Integer) field.numericValue();
            int partition = partitioner.getPartition(new IntWritable(id), null, totalShardCount);
            assertEquals(i, partition);
        }
        reader.close();
    }
}
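
The value argument to getPartition is null here, which is safe because HashPartitioner derives the partition purely from the key's hashCode() and never looks at the value; the test simply checks that every document landed in the shard its id hashes to.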