Example usage for org.apache.spark.api.java.function PairFlatMapFunction PairFlatMapFunction

Introduction

This page collects example usages of the org.apache.spark.api.java.function.PairFlatMapFunction interface.

Prototype

PairFlatMapFunction
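
The snippets below implement PairFlatMapFunction<T, K, V> either as an anonymous inner class or as a lambda. Note that the signature of call changed between releases: Spark 1.x expects an Iterable<Tuple2<K, V>> return value, while Spark 2.x and later expect an Iterator<Tuple2<K, V>>, which is why both return types appear on this page. Here is a minimal sketch of the older anonymous-class form; the word-to-pair mapping and the input RDD name are illustrative only, not taken from any of the source files below.

    // Assumes an existing JavaRDD<String> named lines (e.g. from sc.textFile(...))
    JavaPairRDD<String, Integer> pairs = lines
            .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                @Override
                public Iterable<Tuple2<String, Integer>> call(String line) {
                    // Emit one (word, 1) pair per whitespace-separated token
                    List<Tuple2<String, Integer>> out = new ArrayList<>();
                    for (String word : line.split("\\s+")) {
                        out.add(new Tuple2<>(word, 1));
                    }
                    return out;
                }
            });

On Spark 2.x and later, the same function would instead declare Iterator<Tuple2<String, Integer>> call(String line) and end with return out.iterator();.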

Usage

From source file:$package.SparkPageRankProgram.java

License:Apache License

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();

    LOG.info("Processing backlinkURLs data");
    JavaPairRDD<Long, String> backlinkURLs = sec.fromStream("backlinkURLStream", String.class);
    int iterationCount = getIterationCount(sec);

    LOG.info("Grouping data by key");
    // Grouping backlinks by unique URL in key
    JavaPairRDD<String, Iterable<String>> links = backlinkURLs.values()
            .mapToPair(new PairFunction<String, String, String>() {
                @Override
                public Tuple2<String, String> call(String s) {
                    String[] parts = SPACES.split(s);
                    return new Tuple2<>(parts[0], parts[1]);
                }
            }).distinct().groupByKey().cache();

    // Initialize default rank for each key URL
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });
    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < iterationCount; current++) {
        LOG.debug("Processing data with PageRank algorithm. Iteration {}/{}", current + 1, (iterationCount));
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        LOG.debug("Processing {} with rank {}", s._1(), s._2());
                        int urlCount = Iterables.size(s._1());
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1()) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });
        // Re-calculates URL ranks based on backlink contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    LOG.info("Writing ranks data");

    final ServiceDiscoverer discoveryServiceContext = sec.getServiceDiscoverer();
    final Metrics sparkMetrics = sec.getMetrics();
    JavaPairRDD<byte[], Integer> ranksRaw = ranks
            .mapToPair(new PairFunction<Tuple2<String, Double>, byte[], Integer>() {
                @Override
                public Tuple2<byte[], Integer> call(Tuple2<String, Double> tuple) throws Exception {
                    LOG.debug("URL {} has rank {}", Arrays.toString(tuple._1().getBytes(Charsets.UTF_8)),
                            tuple._2());
                    URL serviceURL = discoveryServiceContext.getServiceURL(SparkPageRankApp.SERVICE_HANDLERS);
                    if (serviceURL == null) {
                        throw new RuntimeException(
                                "Failed to discover service: " + SparkPageRankApp.SERVICE_HANDLERS);
                    }
                    try {
                        URLConnection connection = new URL(serviceURL,
                                String.format("%s/%s",
                                        SparkPageRankApp.SparkPageRankServiceHandler.TRANSFORM_PATH,
                                        tuple._2().toString())).openConnection();
                        try (BufferedReader reader = new BufferedReader(
                                new InputStreamReader(connection.getInputStream(), Charsets.UTF_8))) {
                            int pr = Integer.parseInt(reader.readLine());
                            if (pr == POPULAR_PAGE_THRESHOLD) {
                                sparkMetrics.count(POPULAR_PAGES, 1);
                            } else if (pr <= UNPOPULAR_PAGE_THRESHOLD) {
                                sparkMetrics.count(UNPOPULAR_PAGES, 1);
                            } else {
                                sparkMetrics.count(REGULAR_PAGES, 1);
                            }
                            return new Tuple2<>(tuple._1().getBytes(Charsets.UTF_8), pr);
                        }
                    } catch (Exception e) {
                        LOG.warn("Failed to read the Stream for service {}", SparkPageRankApp.SERVICE_HANDLERS,
                                e);
                        throw Throwables.propagate(e);
                    }
                }
            });

    // Store calculated results in output Dataset.
    // All calculated results are stored in one row.
    // Each result, the calculated URL rank based on backlink contributions, is an entry of the row.
    // The value of the entry is the URL rank.
    sec.saveAsDataset(ranksRaw, "ranks");

    LOG.info("PageRanks successfuly computed and written to \"ranks\" dataset");
}

From source file:biz.hangyang.knnspark.spark.KNNClassifySpark.java

public static JavaPairRDD<Entity, Object> calKDistance(final String trainingDataPath, String testingDataPath,
        final int k, final Map<Object, Double> weightMap, JavaSparkContext sc, int partition,
        final Accumulator<Integer> accum) {
    JavaRDD<String> testingDataRDD = sc.textFile(testingDataPath, partition);
    // Parse each input line into a GeneEntity
    JavaRDD<Entity> testingEntityRDD = testingDataRDD.map(new Function<String, Entity>() {
        @Override
        public Entity call(String line) throws Exception {
            return new GeneEntity(line);
        }
    });
    // For each test entity, compute the K nearest training distances as (Entity, KDistance) key-value pairs
    JavaPairRDD<Entity, KDistance> ekRDD = testingEntityRDD
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Entity>, Entity, KDistance>() {
                @Override
                public Iterable<Tuple2<Entity, KDistance>> call(Iterator<Entity> t) throws Exception {
                    // Materialize all entities in this partition
                    List<Entity> entityList = new ArrayList<>();
                    while (t.hasNext()) {
                        entityList.add(t.next());
                    }
                    // One KDistance accumulator per entity in this partition
                    List<KDistance> kDistanceList = new ArrayList<>();
                    for (int i = 0; i < entityList.size(); i++) {
                        kDistanceList.add(new KDistance(k));
                    }

                    // Stream the training data directly from HDFS
                    Configuration conf = new Configuration();
                    FileSystem fs = FileSystem.get(URI.create(trainingDataPath), conf);
                    FSDataInputStream in = fs.open(new Path(trainingDataPath));
                    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                    String line;
                    while ((line = br.readLine()) != null) {
                        Entity lineEntity = new GeneEntity(line);
                        for (int i = 0; i < entityList.size(); i++) {
                            kDistanceList.get(i).add(new DemoDistanceCatagory(
                                    lineEntity.distance(entityList.get(i)), lineEntity.category));
                        }
                    }
                    br.close();

                    List<Tuple2<Entity, KDistance>> tList = new ArrayList<>();
                    for (int i = 0; i < entityList.size(); i++) {
                        tList.add(new Tuple2<>(entityList.get(i), kDistanceList.get(i)));
                    }
                    return tList;
                }
            });

    JavaPairRDD<Entity, Object> eoRDD = ekRDD
            .mapToPair(new PairFunction<Tuple2<Entity, KDistance>, Entity, Object>() {
                @Override
                public Tuple2<Entity, Object> call(Tuple2<Entity, KDistance> t) throws Exception {
                    KDistance kDistance = t._2();
                    // Predict the category by weighted vote over the K nearest neighbors
                    Object catagory = KDistance.getCatagory(kDistance.get(), weightMap);
                    if (t._1().category.equals(catagory)) {
                        accum.add(1);
                    }
                    return new Tuple2<>(t._1(), catagory);
                }
            });

    return eoRDD;
}

From source file:cn.com.bsfit.frms.spark.PageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkSession spark = SparkSession.builder().appName("JavaPageRank").getOrCreate();

    // Loads in input file. It should be in format of:
    // URL neighbor URL
    // URL neighbor URL
    // URL neighbor URL
    // ...
    JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Initializes the rank of each URL that has outgoing links to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank
    // algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results.iterator();
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    spark.stop();
}

From source file:co.cask.cdap.etl.batch.spark.ETLSparkProgram.java

License:Apache License

@Override
public void run(DatasetContext datasetContext) throws Exception {

    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID),
            BatchPhaseSpec.class);
    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    String aggregatorName = null;
    if (!aggregators.isEmpty()) {
        aggregatorName = aggregators.iterator().next().getName();
    }

    SparkBatchSourceFactory sourceFactory;
    SparkBatchSinkFactory sinkFactory;
    Integer numPartitions;
    try (InputStream is = new FileInputStream(sec.getLocalizationContext().getLocalFile("ETLSpark.config"))) {
        sourceFactory = SparkBatchSourceFactory.deserialize(is);
        sinkFactory = SparkBatchSinkFactory.deserialize(is);
        numPartitions = new DataInputStream(is).readInt();
    }

    JavaPairRDD<Object, Object> rdd = sourceFactory.createRDD(sec, jsc, Object.class, Object.class);
    JavaPairRDD<String, Object> resultRDD = doTransform(sec, jsc, datasetContext, phaseSpec, rdd,
            aggregatorName, numPartitions);

    Set<StageInfo> stagesOfTypeSparkSink = phaseSpec.getPhase().getStagesOfType(SparkSink.PLUGIN_TYPE);
    Set<String> namesOfTypeSparkSink = new HashSet<>();

    for (StageInfo stageInfo : stagesOfTypeSparkSink) {
        namesOfTypeSparkSink.add(stageInfo.getName());
    }

    for (final String sinkName : phaseSpec.getPhase().getSinks()) {

        JavaPairRDD<String, Object> filteredResultRDD = resultRDD
                .filter(new Function<Tuple2<String, Object>, Boolean>() {
                    @Override
                    public Boolean call(Tuple2<String, Object> v1) throws Exception {
                        return v1._1().equals(sinkName);
                    }
                });

        if (namesOfTypeSparkSink.contains(sinkName)) {
            SparkSink sparkSink = sec.getPluginContext().newPluginInstance(sinkName);
            sparkSink.run(new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, sinkName),
                    filteredResultRDD.values());
        } else {

            JavaPairRDD<Object, Object> sinkRDD = filteredResultRDD
                    .flatMapToPair(new PairFlatMapFunction<Tuple2<String, Object>, Object, Object>() {
                        @Override
                        public Iterable<Tuple2<Object, Object>> call(Tuple2<String, Object> input)
                                throws Exception {
                            List<Tuple2<Object, Object>> result = new ArrayList<>();
                            KeyValue<Object, Object> keyValue = (KeyValue<Object, Object>) input._2();
                            result.add(new Tuple2<>(keyValue.getKey(), keyValue.getValue()));
                            return result;
                        }
                    });
            sinkFactory.writeFromRDD(sinkRDD, sec, sinkName, Object.class, Object.class);
        }
    }
}

From source file:co.cask.cdap.spark.app.SparkLogParser.java

License:Apache License

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();

    Map<String, String> runtimeArguments = sec.getRuntimeArguments();
    String inputFileSet = runtimeArguments.get("input");
    final String outputTable = runtimeArguments.get("output");

    JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);

    final JavaPairRDD<String, String> aggregated = input
            .mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {
                @Override
                public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
                    return SparkAppUsingGetDataset.parse(input._2());
                }
            }).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {
                @Override
                public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
                    return stats1.aggregate(stats2);
                }
            })
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {
                @Override
                public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor)
                        throws Exception {
                    final Gson gson = new Gson();
                    return Lists.newArrayList(Iterators.transform(itor,
                            new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {
                                @Override
                                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                                    return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                                }
                            }));
                }
            });

    // Collect all data to the driver and write to the dataset directly. That's the intent of the test.
    sec.execute(new TxRunnable() {
        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable kvTable = context.getDataset(outputTable);
            for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
                kvTable.write(entry.getKey(), entry.getValue());
            }
        }
    });
}

From source file:com.andado.spark.examples.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkSession spark = SparkSession.builder().appName("JavaPageRank").getOrCreate();

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Initializes the rank of each URL that has outgoing links to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results.iterator();
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    spark.stop();
}

From source file:com.anhth12.lambda.app.ml.als.Evaluation.java

/**
 * Compute AUC (area under the ROC curve) as a recommender evaluation metric.
 *
 * @param sparkContext
 * @param mfModel
 * @param positiveData
 * @return
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {

                @Override
                public Tuple2<Integer, Integer> call(Rating t) throws Exception {
                    return new Tuple2<>(t.user(), t.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) throws Exception {
                    Integer userID = userIDsAndItemIDs._1;
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();

                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);

                    List<Integer> allItemIDs = allItemIDsBC.value();

                    int numItems = allItemIDs.size();

                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }

                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {

                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) throws Exception {
                    // AUC is also the probability that random positive examples
                    // rank higher than random examples at large. Here we compare all random negative
                    // examples to all positive examples and report the totals as an alternative
                    // computation for AUC
                    long correct = 0;
                    long total = 0;

                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }

                    // Guard against an empty comparison set to avoid returning NaN
                    if (total == 0) {
                        return 0.0;
                    }
                    return (double) correct / total;
                }
            }).mean();

}

From source file:com.audaque.instancematch.match.GenerateSignature.java

/**
 * For the given q and hashNum, hashes every line of the source file with each seed
 * and keeps the minimum hash value per seed, returning the (seed, minHash) pairs as a
 * sorted list.
 * @param srcFile the source data file
 * @param seedFile the file containing the hash seeds
 * @param q the q-gram length; a negative value hashes the whole line
 * @param hashNum the number of hash seeds
 * @param sc the Spark context
 * @return a sorted list of (seed, minHash) tuples
 * @throws IOException
 */
public static List<Tuple2<String, String>> generateSignature(String srcFile, String seedFile, final int q,
        int hashNum, JavaSparkContext sc) throws IOException {
    //        final int[] seeds = Hash.loadSeeds("res/seed0.txt", hashNum);

    JavaRDD<String> srcRDD = sc.textFile(srcFile, 40);
    JavaRDD<String> seedRDD = sc.textFile(seedFile);
    final List<String> seedList = seedRDD.collect();

    JavaPairRDD<String, String> seeds_hashRDD = srcRDD
            .flatMapToPair(new PairFlatMapFunction<String, String, String>() {
                @Override
                public Iterable<Tuple2<String, String>> call(String line) throws Exception {
                    List<Tuple2<String, String>> list = new ArrayList<Tuple2<String, String>>();
                    for (int i = 0; i < seedList.size(); i++) {
                        int hash;
                        if (q < 0) {
                            hash = Hash.RSHash(line, Integer.valueOf(seedList.get(i)));
                        } else {
                            hash = QGramHash.RSHash(line, Integer.valueOf(seedList.get(i)), q);
                        }
                        list.add(new Tuple2<String, String>(seedList.get(i), String.valueOf(hash)));
                    }
                    return list;
                }
            });

    JavaPairRDD<String, String> seed_hashRDD = seeds_hashRDD
            .reduceByKey(new Function2<String, String, String>() {
                @Override
                public String call(String v1, String v2) throws Exception {
                    if (Integer.valueOf(v1) < Integer.valueOf(v2)) {
                        return v1;
                    } else {
                        return v2;
                    }
                }
            });
    //        seeds_hashRDD.sortByKey().saveAsTextFile("hdfs://172.16.1.101:8020/user/ALGO/result");
    return seed_hashRDD.sortByKey().collect();
}

From source file:com.audaque.instancematch.match.GenerateSignature2.java

public static List<Tuple2<String, String>> generateSignature(final String srcFile, String seedFile, final int q,
        int hashNum, JavaSparkContext sc) {
    JavaRDD<String> seedRDD = sc.textFile(seedFile, 40);

    JavaPairRDD<String, String> seeds_hashRDD = seedRDD
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, String>() {
                @Override
                public Iterable<Tuple2<String, String>> call(Iterator<String> seed) throws Exception {
                    List<Integer> seedList = new ArrayList<Integer>();
                    while (seed.hasNext()) {
                        seedList.add(Integer.valueOf(seed.next()));
                    }
                    int[] minHash = new int[seedList.size()];
                    for (int i = 0; i < minHash.length; i++) {
                        minHash[i] = Integer.MAX_VALUE;
                    }
                    // Stream the source file directly from HDFS
                    Configuration conf = new Configuration();
                    FileSystem fs = FileSystem.get(URI.create(srcFile), conf);
                    FSDataInputStream in = fs.open(new Path(srcFile));
                    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
                    String line;
                    while ((line = br.readLine()) != null) {
                        for (int i = 0; i < seedList.size(); i++) {
                            int hash;
                            if (q < 0) {
                                hash = Hash.RSHash(line, seedList.get(i));
                            } else {
                                hash = QGramHash.RSHash(line, seedList.get(i), q);
                            }
                            if (hash < minHash[i]) {
                                minHash[i] = hash;
                            }
                        }

                    }
                    br.close();
                    List<Tuple2<String, String>> tList = new ArrayList<Tuple2<String, String>>();
                    for (int i = 0; i < seedList.size(); i++) {
                        tList.add(new Tuple2<String, String>(String.valueOf(seedList.get(i)),
                                String.valueOf(minHash[i])));
                    }

                    return tList;
                }
            });

    return seeds_hashRDD.sortByKey().collect();
}

From source file:com.cloudera.oryx.app.batch.mllib.als.Evaluation.java

License:Open Source License

/**
 * Computes AUC (area under the ROC curve) as a recommender evaluation metric.
 * Really, it computes what might be described as "Mean AUC", as it computes AUC per
 * user and averages them.
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it
    // is intended to operate on one large set of (score,label) pairs. The computation
    // here is really many small AUC problems, for which a much faster direct computation
    // is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(rating -> new Tuple2<>(rating.user(), rating.product()));

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterator<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative.iterator();
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values().mapToDouble(t -> {
        // AUC is also the probability that random positive examples
        // rank higher than random examples at large. Here we compare all random negative
        // examples to all positive examples and report the totals as an alternative
        // computation for AUC
        long correct = 0;
        long total = 0;
        for (Rating positive : t._1()) {
            for (Rating negative : t._2()) {
                if (positive.rating() > negative.rating()) {
                    correct++;
                }
                total++;
            }
        }
        if (total == 0) {
            return 0.0;
        }
        return (double) correct / total;
    }).mean();
}