com.github.ygf.pagerank.PageRankTopNMapper.java Source code

Java tutorial

Introduction

Here is the source code for com.github.ygf.pagerank.PageRankTopNMapper.java

Source

/*
 * Copyright 2014 Yasser Gonzalez Fernandez <ygonzalezfernandez@gmail.com>.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. 
 */

package com.github.ygf.pagerank;

import java.io.IOException;
import java.util.AbstractMap;
import java.util.Map;
import java.util.PriorityQueue;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that selects the top N pages by PageRank from the stripes of the
 * PageRank vector it receives.
 *
 * <p>Each input record is one stripe (block) of the PageRank vector: the key
 * is the block number and the value holds the PageRank of every page in that
 * block. Nothing is written while records are being mapped; candidates are
 * accumulated in a bounded priority queue and emitted from
 * {@link #cleanup(Context)} as (PageRank, page number) pairs.
 *
 * <p>Configuration properties read in {@link #setup(Context)}:
 * <ul>
 *   <li>{@code pagerank.top_results} - the number N of pages to keep
 *       (must be at least 1);</li>
 *   <li>{@code pagerank.block_size} - the number of pages per block, used to
 *       turn (block number, offset) into a page number.</li>
 * </ul>
 */
public class PageRankTopNMapper extends Mapper<ShortWritable, FloatArrayWritable, FloatWritable, IntWritable> {

    // TODO: Create base classes TopN{Mapper,Reducer} to avoid duplicate 
    // code in {PageRank,InLinks}TopN{Mapper,Reducer}.

    /** Candidate (PageRank, page number) entries; never holds more than {@link #topResults}. */
    private PriorityQueue<Map.Entry<Float, Integer>> topN;

    /** Value of {@code pagerank.top_results}, parsed once per task in {@link #setup(Context)}. */
    private int topResults;

    /** Value of {@code pagerank.block_size}, parsed once per task in {@link #setup(Context)}. */
    private short blockSize;

    /**
     * Reads the job settings and creates the empty top-N queue.
     *
     * <p>The settings are parsed here, once, rather than on every call to
     * {@code map}, because they do not change during the task.
     *
     * @param context the task context providing the job configuration
     * @throws NumberFormatException if either property is missing or is not a
     *         valid number of the expected type
     * @throws IllegalArgumentException if {@code pagerank.top_results} is
     *         smaller than 1
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        Configuration conf = context.getConfiguration();
        topResults = Integer.parseInt(conf.get("pagerank.top_results"));
        blockSize = Short.parseShort(conf.get("pagerank.block_size"));

        // PriorityQueue rejects an initial capacity below 1 with a bare
        // IllegalArgumentException; fail with the same exception type but
        // name the offending setting.
        if (topResults < 1) {
            throw new IllegalArgumentException(
                    "pagerank.top_results must be at least 1, got " + topResults);
        }

        // This queue keeps the top N elements by PageRank.
        // NOTE(review): relies on MapEntryKeyComparator (defined elsewhere)
        // ordering entries by ascending key, so that the head of the queue is
        // the entry with the lowest PageRank - confirm.
        topN = new PriorityQueue<Map.Entry<Float, Integer>>(topResults,
                new MapEntryKeyComparator<Float, Integer>());
    }

    /**
     * Offers every page of one PageRank stripe to the top-N queue.
     *
     * <p>The page number of element {@code i} of the stripe is
     * {@code 1 + (inKey - 1) * blockSize + i}.
     *
     * @param inKey the block number of the stripe
     * @param inValue the PageRank values of the pages in the stripe; every
     *        element is cast to {@link FloatWritable}
     * @param context the task context (not written to by this method)
     */
    @Override
    public void map(ShortWritable inKey, FloatArrayWritable inValue, Context context)
            throws IOException, InterruptedException {

        Writable[] vStripe = inValue.get();

        // Page number of the first element of this stripe; the operands are
        // promoted to int, so the arithmetic is done in int.
        int firstPage = 1 + (inKey.get() - 1) * blockSize;

        for (int i = 0; i < vStripe.length; i++) {
            float pageRank = ((FloatWritable) vStripe[i]).get();

            // The queue is filled up until it contains topResults elements.
            // After that, a new element is added only if its PageRank is
            // greater than or equal to the PageRank at the head of the queue
            // (expected to be the lowest one), and the head is then removed
            // so the queue goes back to topResults elements.
            if (topN.size() < topResults || pageRank >= topN.peek().getKey()) {
                topN.add(new AbstractMap.SimpleEntry<Float, Integer>(pageRank, firstPage + i));
                if (topN.size() > topResults) {
                    topN.poll();
                }
            }
        }
    }

    /**
     * Emits the retained entries as (PageRank, page number) pairs.
     *
     * <p>The entries are written in the queue's iteration order, which
     * {@link PriorityQueue} does not guarantee to be sorted.
     *
     * @param context the task context the pairs are written to
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {

        // The mapper outputs the top N pages by PageRank in the partition.
        for (Map.Entry<Float, Integer> entry : topN) {
            context.write(new FloatWritable(entry.getKey()), new IntWritable(entry.getValue()));
        }
    }
}