Example usage for org.apache.spark.api.java.function.PairFlatMapFunction

Introduction

This page collects example usages of org.apache.spark.api.java.function.PairFlatMapFunction.

Prototype

public interface PairFlatMapFunction<T, K, V> extends Serializable {
    Iterable<Tuple2<K, V>> call(T t) throws Exception;
}

Usage
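
A minimal, self-contained sketch of the pattern before the collected examples. It assumes a local master and the Spark 1.x Java API, in which call returns an Iterable; the class name and data below are illustrative, not from any of the source files on this page:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("PairFlatMapFunctionSketch").setMaster("local"));
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c"));
        // Each input line fans out to zero or more (token, length) pairs.
        JavaPairRDD<String, Integer> pairs = lines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterable<Tuple2<String, Integer>> call(String line) {
                        List<Tuple2<String, Integer>> out = new ArrayList<>();
                        for (String token : line.split(" ")) {
                            out.add(new Tuple2<>(token, token.length()));
                        }
                        return out;
                    }
                });
        System.out.println(pairs.collect()); // [(a,1), (b,1), (c,1)]
        sc.stop();
    }
}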

From source file:com.cloudera.oryx.app.mllib.als.Evaluation.java

License:Open Source License

/**
 * Computes AUC (area under the ROC curve) as a recommender evaluation metric.
 * Really, it computes what might be described as "Mean AUC", as it computes AUC per
 * user and averages them.
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because that
    // implementation is intended to operate on one large set of (score,label) pairs,
    // whereas the computation here is really many small per-user AUC problems, for
    // which a much faster direct computation is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Rating rating) {
                    return new Tuple2<>(rating.user(), rating.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {
                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) {
                    // AUC is also the probability that a random positive example
                    // ranks higher than a random negative example. Here we compare all
                    // sampled negative examples to all positive examples and report the
                    // fraction as an alternative computation of AUC.
                    long correct = 0;
                    long total = 0;
                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }
                    return (double) correct / total;
                }
            }).mean();
}
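
The predictAll helper is not shown on this page. A plausible sketch, assuming MLlib's Java-friendly MatrixFactorizationModel.predict(JavaPairRDD<Integer, Integer>) overload and a grouping of the predicted Ratings by user; the name and signature come from the call sites above, but the body is an assumption:

static JavaPairRDD<Integer, Iterable<Rating>> predictAll(MatrixFactorizationModel mfModel,
        JavaRDD<Rating> data, // present at the call sites; unused in this sketch
        JavaPairRDD<Integer, Integer> userProducts) {
    // Score every (user, product) pair with the factorization model,
    // then group the predicted Ratings by user ID.
    return mfModel.predict(userProducts).mapToPair(new PairFunction<Rating, Integer, Rating>() {
        @Override
        public Tuple2<Integer, Rating> call(Rating rating) {
            return new Tuple2<>(rating.user(), rating);
        }
    }).groupByKey();
}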

From source file:com.cloudera.oryx.lazarus.batch.ExampleBatchLayerUpdate.java

License:Open Source License

public static Map<String, Integer> countDistinctOtherWords(JavaPairRDD<String, String> data) {
    return data.values().flatMapToPair(new PairFlatMapFunction<String, String, String>() {
        @Override
        public Iterable<Tuple2<String, String>> call(String line) {
            List<Tuple2<String, String>> result = new ArrayList<>();
            Set<String> distinctTokens = new HashSet<>(Arrays.asList(line.split(" ")));
            for (String a : distinctTokens) {
                for (String b : distinctTokens) {
                    if (!a.equals(b)) {
                        result.add(new Tuple2<>(a, b));
                    }
                }
            }
            return result;
        }
    }).distinct().groupByKey().mapValues(new Function<Iterable<String>, Integer>() {
        @Override
        public Integer call(Iterable<String> values) {
            int count = 0;
            for (String v : values) {
                count++;
            }
            return count;
        }
    }).collectAsMap();
}
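
A minimal driver for this method might look like the following; the JavaSparkContext setup and sample data are hypothetical:

JavaSparkContext sc = new JavaSparkContext(
        new SparkConf().setAppName("countDistinctOtherWords-demo").setMaster("local"));
JavaPairRDD<String, String> data = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>("key1", "a b c"),
        new Tuple2<>("key2", "a b")));
// "a" co-occurs with the distinct words "b" and "c", so its count is 2.
Map<String, Integer> counts = ExampleBatchLayerUpdate.countDistinctOtherWords(data);
System.out.println(counts); // {a=2, b=2, c=2}
sc.stop();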

From source file:com.cloudera.oryx.ml.mllib.als.AUC.java

License:Open Source License

static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because that
    // implementation is intended to operate on one large set of (score,label) pairs,
    // whereas the computation here is really many small per-user AUC problems, for
    // which a much faster direct computation is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Rating rating) {
                    return new Tuple2<>(rating.user(), rating.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {
                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) {
                    // AUC is also the probability that a random positive example
                    // ranks higher than a random negative example. Here we compare all
                    // sampled negative examples to all positive examples and report the
                    // fraction as an alternative computation of AUC.
                    long correct = 0;
                    long total = 0;
                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }
                    return (double) correct / total;
                }
            }).mean();
}

From source file:com.hxr.bigdata.spark.example141.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from the input file and initializes their neighbor lists.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {

        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Initializes the rank of every URL that links out to other URLs to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {

        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {

                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {

            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
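
The Sum class passed to reduceByKey is not reproduced on this page; in the stock Spark JavaPageRank example it is a trivial Function2 along these lines (requires org.apache.spark.api.java.function.Function2):

private static class Sum implements Function2<Double, Double, Double> {
    @Override
    public Double call(Double a, Double b) {
        return a + b;
    }
}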

From source file:com.jyz.study.hadoop.spark.examples.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from the input file and initializes their neighbor lists.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Initializes the rank of every URL that links out to other URLs to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}

From source file:com.sdw.dream.spark.examples.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from the input file and initializes their neighbor lists.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Initializes the rank of every URL that links out to other URLs to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}

From source file:com.spark.cis833.extra.SparkPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: SparkPageRank <input> <output>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("SparkPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    // Loads all URLs from the input file and initializes their neighbor lists.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();
    // Initializes the rank of every URL that links out to other URLs to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });
    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < 10; current++) { // iteration count hardcoded to 10
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                }).sortByKey(false); // sorts by the URL key (descending); not needed for the rank computation
        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        }).sortByKey(false);
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    JavaPairRDD<Double, String> swap1 = ranks
            .mapToPair(new PairFunction<Tuple2<String, Double>, Double, String>() {
                @Override
                public Tuple2<Double, String> call(Tuple2<String, Double> item) throws Exception {
                    return item.swap();
                }

            }).sortByKey(false);

    swap1.saveAsTextFile(args[1]);

    ctx.stop();
}

From source file:com.springdeveloper.spark.SparkHashtags.java

License:Apache License

public static void main(String[] args) {
    System.out.println("Spark Hashtags:");

    String fileName = "";
    if (args.length > 0) {
        fileName = args[0];
        System.out.println("processing: " + fileName);
    }

    SparkConf conf = new SparkConf().setAppName("spark-hashtags");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> tweetData = sc.textFile(fileName).cache();

    JavaRDD<Map<String, Object>> tweets = tweetData.map(new Function<String, Map<String, Object>>() {
        public Map<String, Object> call(String s) throws Exception {
            return jsonMapper.readValue(s, new TypeReference<HashMap<String, Object>>() {
            });
        }
    });

    JavaPairRDD<String, Integer> hashTags = tweets
            .flatMapToPair(new PairFlatMapFunction<Map<String, Object>, String, Integer>() {
                public Iterable<Tuple2<String, Integer>> call(Map<String, Object> tweet) throws Exception {

                    Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
                    List<Map<String, Object>> hashTagEntries = null;
                    if (entities != null) {
                        hashTagEntries = (List<Map<String, Object>>) entities.get("hashtags");
                    }
                    List<Tuple2<String, Integer>> hashTags = new ArrayList<Tuple2<String, Integer>>();
                    if (hashTagEntries != null && hashTagEntries.size() > 0) {
                        for (Map<String, Object> hashTagEntry : hashTagEntries) {
                            String hashTag = hashTagEntry.get("text").toString();
                            hashTags.add(new Tuple2<String, Integer>(hashTag, 1));
                        }
                    }
                    return hashTags;
                }
            });

    JavaPairRDD<String, Integer> hashTagCounts = hashTags
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer int1, Integer int2) throws Exception {
                    return int1 + int2;
                }
            });

    JavaPairRDD<String, Integer> hashTagCountsSorted = hashTagCounts
            .mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                public Tuple2<Integer, String> call(Tuple2<String, Integer> in) throws Exception {
                    return new Tuple2<Integer, String>(in._2, in._1);
                }
            }).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                public Tuple2<String, Integer> call(Tuple2<Integer, String> in) throws Exception {
                    return new Tuple2<String, Integer>(in._2, in._1);
                }
            });

    List<Tuple2<String, Integer>> top10 = hashTagCountsSorted.take(10);

    System.out.println("Tweets: " + tweets.count());
    System.out.println("HashTags: " + top10);

    sc.stop();
}
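
The expected input is one tweet JSON object per line, carrying the standard Twitter entities.hashtags structure. A hypothetical input line:

{"text": "Counting tags with #spark", "entities": {"hashtags": [{"text": "spark"}]}}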

From source file:com.streamsets.spark.GetCreditCardType.java

License:Apache License

@Override
public TransformResult transform(JavaRDD<Record> records) {
    // Validate incoming records
    JavaPairRDD<Record, String> errors = records
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Record>, Record, String>() {
                public Iterable<Tuple2<Record, String>> call(Iterator<Record> recordIterator) throws Exception {
                    List<Tuple2<Record, String>> errors = new LinkedList<>();
                    // Iterate through incoming records
                    while (recordIterator.hasNext()) {
                        Record record = recordIterator.next();
                        // Validate each record
                        if (!validateRecord(record)) {
                            // We have a problem - flag the record as an error
                            errors.add(new Tuple2<>(record, "Credit card number is missing"));
                        }
                    }
                    return errors;
                }
            });

    // Filter out invalid records before applying the map
    JavaRDD<Record> result = records.filter(new Function<Record, Boolean>() {
        // Only operate on valid records
        public Boolean call(Record record) throws Exception {
            return validateRecord(record);
        }
    }).map(new Function<Record, Record>() {
        public Record call(Record record) throws Exception {
            // Get the credit card number from the record
            String creditCard = record.get(VALUE_PATH).getValueAsString();

            // Look through the map of credit card types
            for (Map.Entry<String, String[]> entry : ccTypes.entrySet()) {
                // Find the first matching prefix
                for (String prefix : entry.getValue()) {
                    if (creditCard.startsWith(prefix)) {
                        // Set the credit card type
                        record.set(RESULT_PATH, Field.create(entry.getKey()));
                        return record;
                    }
                }
            }

            return record;
        }
    });
    return new TransformResult(result, errors);
}
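
The validateRecord method and the VALUE_PATH and ccTypes fields are defined elsewhere in the class and not shown here. A minimal sketch of what the validation might check, assuming the StreamSets Record API used above; the exact logic is an assumption:

private static boolean validateRecord(Record record) {
    // Hypothetical check: the record is valid when the credit card field
    // exists and holds a non-empty string.
    return record.has(VALUE_PATH) && record.get(VALUE_PATH).getValueAsString() != null
            && !record.get(VALUE_PATH).getValueAsString().isEmpty();
}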

From source file:common.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.println("Usage: JavaPageRank <master> <file> <number_of_iterations>");
        System.exit(1);
    }

    JavaSparkContext ctx = new JavaSparkContext(args[0], "JavaPageRank", System.getenv("SPARK_HOME"),
            System.getenv("SPARK_EXAMPLES_JAR"));

    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[1], 1);

    // Loads all URLs from the input file and initializes their neighbor lists.
    // Note: this example targets the pre-Spark-1.0 Java API, where map(PairFunction)
    // returned a JavaPairRDD and groupByKey produced List values; on Spark 1.x use
    // mapToPair instead.
    JavaPairRDD<String, List<String>> links = lines.map(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = s.split("\\s+");
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Initializes the rank of every URL that links out to other URLs to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<List<String>, Double>() {
        @Override
        public Double call(List<String> rs) throws Exception {
            return 1.0;
        }
    });

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[2]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        // Pre-Spark-1.0 API: flatMap(PairFlatMapFunction); on Spark 1.x use flatMapToPair.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMap(new PairFlatMapFunction<Tuple2<List<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<List<String>, Double> s) {
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2 / s._1.size()));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) throws Exception {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1 + " has rank: " + tuple._2 + ".");
    }

    System.exit(0);
}
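
All of the examples above target the Spark 1.x Java API, in which PairFlatMapFunction.call returns Iterable<Tuple2<K, V>>. From Spark 2.0 on, call returns Iterator<Tuple2<K, V>> instead, so porting a 1.x implementation typically only requires changing the declared return type and the return statement:

return results.iterator(); // Spark 2.x: Iterator<Tuple2<K, V>> call(T t) throws Exception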