List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction
PairFlatMapFunction
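PairFlatMapFunction<T, K, V> is the functional interface the Spark Java API uses wherever one input element may expand into zero or more key-value pairs, most notably in JavaRDD.flatMapToPair and JavaRDD.mapPartitionsToPair. As a minimal sketch of the Spark 1.x form of the interface (the word-splitting logic and the wordPairs name are illustrative, not taken from the sources below):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

// Emits one (token, 1) pair for every whitespace-separated token of a line.
PairFlatMapFunction<String, String, Integer> wordPairs =
        new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterable<Tuple2<String, Integer>> call(String line) {
                List<Tuple2<String, Integer>> pairs = new ArrayList<>();
                for (String token : line.split("\\s+")) {
                    pairs.add(new Tuple2<>(token, 1));
                }
                return pairs;
            }
        };

Applied as lines.flatMapToPair(wordPairs), this turns a JavaRDD<String> into a JavaPairRDD<String, Integer>. The examples below show the same pattern at full scale.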
From source file: com.cloudera.oryx.app.mllib.als.Evaluation.java
License: Open Source License
/**
 * Computes AUC (area under the ROC curve) as a recommender evaluation metric.
 * Really, it computes what might be described as "Mean AUC", as it computes AUC per
 * user and averages them.
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {
    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it
    // is intended to operate on one large set of (score,label) pairs. The computation
    // here is really many small AUC problems, for which a much faster direct computation
    // is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Rating rating) {
                    return new Tuple2<>(rating.user(), rating.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {
                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) {
                    // AUC is also the probability that random positive examples
                    // rank higher than random examples at large. Here we compare all random negative
                    // examples to all positive examples and report the totals as an alternative
                    // computation for AUC
                    long correct = 0;
                    long total = 0;
                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }
                    return (double) correct / total;
                }
            }).mean();
}
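Note: this and the other examples on this page are written against the Spark 1.x Java API, in which PairFlatMapFunction.call returns an Iterable<Tuple2<K, V>>. Since Spark 2.0 the method returns Iterator<Tuple2<K, V>> instead, so under 2.x the negative-sampling function above would end with return negative.iterator(); rather than return negative; (an adaptation sketch, not part of the source file).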
From source file: com.cloudera.oryx.lazarus.batch.ExampleBatchLayerUpdate.java
License: Open Source License
public static Map<String, Integer> countDistinctOtherWords(JavaPairRDD<String, String> data) {
    return data.values().flatMapToPair(new PairFlatMapFunction<String, String, String>() {
        @Override
        public Iterable<Tuple2<String, String>> call(String line) {
            List<Tuple2<String, String>> result = new ArrayList<>();
            Set<String> distinctTokens = new HashSet<>(Arrays.asList(line.split(" ")));
            for (String a : distinctTokens) {
                for (String b : distinctTokens) {
                    if (!a.equals(b)) {
                        result.add(new Tuple2<>(a, b));
                    }
                }
            }
            return result;
        }
    }).distinct().groupByKey().mapValues(new Function<Iterable<String>, Integer>() {
        @Override
        public Integer call(Iterable<String> values) {
            int count = 0;
            for (String v : values) {
                count++;
            }
            return count;
        }
    }).collectAsMap();
}
From source file: com.cloudera.oryx.ml.mllib.als.AUC.java
License: Open Source License
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {
    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it
    // is intended to operate on one large set of (score,label) pairs. The computation
    // here is really many small AUC problems, for which a much faster direct computation
    // is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Rating rating) {
                    return new Tuple2<>(rating.user(), rating.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {
                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) {
                    // AUC is also the probability that random positive examples
                    // rank higher than random examples at large. Here we compare all random negative
                    // examples to all positive examples and report the totals as an alternative
                    // computation for AUC
                    long correct = 0;
                    long total = 0;
                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }
                    return (double) correct / total;
                }
            }).mean();
}
From source file: com.hxr.bigdata.spark.example141.JavaPageRank.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
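For reference, the per-iteration update here is the damped PageRank formula: rank(p) = 0.15 + 0.85 · Σ rank(q) / outdegree(q), summed over all pages q that link to p, with damping factor 0.85. Note this example variant does not divide the damping term by the total page count, so ranks sum to roughly the number of pages rather than to 1.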
From source file: com.jyz.study.hadoop.spark.examples.JavaPageRank.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
From source file: com.sdw.dream.spark.examples.JavaPageRank.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
From source file: com.spark.cis833.extra.SparkPageRank.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: SparkPageRank <input> <output>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("SparkPageRank");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt("10"); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                }).sortByKey(false);

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        }).sortByKey(false);
    }

    // Collects all URL ranks and dumps them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    JavaPairRDD<Double, String> swap1 = ranks
            .mapToPair(new PairFunction<Tuple2<String, Double>, Double, String>() {
                @Override
                public Tuple2<Double, String> call(Tuple2<String, Double> item) throws Exception {
                    return item.swap();
                }
            }).sortByKey(false);

    swap1.saveAsTextFile(args[1]);

    ctx.stop();
}
From source file: com.springdeveloper.spark.SparkHashtags.java
License: Apache License
public static void main(String[] args) {
    System.out.println("Spark Hashtags:");
    String fileName = "";
    if (args.length > 0) {
        fileName = args[0];
        System.out.println("processing: " + fileName);
    }

    SparkConf conf = new SparkConf().setAppName("spark-hashtags");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> tweetData = sc.textFile(fileName).cache();

    JavaRDD<Map<String, Object>> tweets = tweetData.map(new Function<String, Map<String, Object>>() {
        public Map<String, Object> call(String s) throws Exception {
            return jsonMapper.readValue(s.toString(), new TypeReference<HashMap<String, Object>>() {
            });
        }
    });

    JavaPairRDD<String, Integer> hashTags = tweets
            .flatMapToPair(new PairFlatMapFunction<Map<String, Object>, String, Integer>() {
                public Iterable<Tuple2<String, Integer>> call(Map<String, Object> tweet) throws Exception {
                    Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
                    List<Map<String, Object>> hashTagEntries = null;
                    if (entities != null) {
                        hashTagEntries = (List<Map<String, Object>>) entities.get("hashtags");
                    }
                    List<Tuple2<String, Integer>> hashTags = new ArrayList<Tuple2<String, Integer>>();
                    if (hashTagEntries != null && hashTagEntries.size() > 0) {
                        for (Map<String, Object> hashTagEntry : hashTagEntries) {
                            String hashTag = hashTagEntry.get("text").toString();
                            hashTags.add(new Tuple2<String, Integer>(hashTag, 1));
                        }
                    }
                    return hashTags;
                }
            });

    JavaPairRDD<String, Integer> hashTagCounts = hashTags
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer int1, Integer int2) throws Exception {
                    return int1 + int2;
                }
            });

    JavaPairRDD<String, Integer> hashTagCountsSorted = hashTagCounts
            .mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                public Tuple2<Integer, String> call(Tuple2<String, Integer> in) throws Exception {
                    return new Tuple2<Integer, String>(in._2, in._1);
                }
            }).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                public Tuple2<String, Integer> call(Tuple2<Integer, String> in) throws Exception {
                    return new Tuple2<String, Integer>(in._2, in._1);
                }
            });

    List<Tuple2<String, Integer>> top10 = hashTagCountsSorted.take(10);

    System.out.println("Tweets: " + tweets.count());
    System.out.println("HashTags: " + top10);

    sc.stop();
}
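Rather than sorting the pair RDD by value directly, this example uses the common swap idiom: each (hashtag, count) pair is swapped to (count, hashtag), ranked with sortByKey(false) so the largest counts come first, then swapped back before taking the top ten.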
From source file: com.streamsets.spark.GetCreditCardType.java
License: Apache License
@Override
public TransformResult transform(JavaRDD<Record> records) {
    // Validate incoming records
    JavaPairRDD<Record, String> errors = records
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Record>, Record, String>() {
                public Iterable<Tuple2<Record, String>> call(Iterator<Record> recordIterator) throws Exception {
                    List<Tuple2<Record, String>> errors = new LinkedList<>();
                    // Iterate through incoming records
                    while (recordIterator.hasNext()) {
                        Record record = recordIterator.next();
                        // Validate each record
                        if (!validateRecord(record)) {
                            // We have a problem - flag the record as an error
                            errors.add(new Tuple2<>(record, "Credit card number is missing"));
                        }
                    }
                    return errors;
                }
            });

    // Filter out invalid records before applying the map
    JavaRDD<Record> result = records.filter(new Function<Record, Boolean>() {
        // Only operate on valid records
        public Boolean call(Record record) throws Exception {
            return validateRecord(record);
        }
    }).map(new Function<Record, Record>() {
        public Record call(Record record) throws Exception {
            // Get the credit card number from the record
            String creditCard = record.get(VALUE_PATH).getValueAsString();
            // Look through the map of credit card types
            for (Map.Entry<String, String[]> entry : ccTypes.entrySet()) {
                // Find the first matching prefix
                for (String prefix : entry.getValue()) {
                    if (creditCard.startsWith(prefix)) {
                        // Set the credit card type
                        record.set(RESULT_PATH, Field.create(entry.getKey()));
                        return record;
                    }
                }
            }
            return record;
        }
    });

    return new TransformResult(result, errors);
}
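Unlike flatMapToPair, which invokes the function once per element, mapPartitionsToPair hands the PairFlatMapFunction a single Iterator over an entire partition, as above. This lets the function build one error list per partition instead of allocating state for every record.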
From source file: common.JavaPageRank.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.println("Usage: JavaPageRank <master> <file> <number_of_iterations>");
        System.exit(1);
    }

    JavaSparkContext ctx = new JavaSparkContext(args[0], "JavaPageRank", System.getenv("SPARK_HOME"),
            System.getenv("SPARK_EXAMPLES_JAR"));

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = ctx.textFile(args[1], 1);

    // Loads all URLs from input file and initialize their neighbors.
    // Note: this listing targets the pre-1.0 Spark Java API, where map(PairFunction) and
    // flatMap(PairFlatMapFunction) returned a JavaPairRDD and groupByKey produced List
    // values; on Spark 1.x these become mapToPair/flatMapToPair with Iterable values.
    JavaPairRDD<String, List<String>> links = lines.map(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = s.split("\\s+");
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<List<String>, Double>() {
        @Override
        public Double call(List<String> rs) throws Exception {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[2]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMap(new PairFlatMapFunction<Tuple2<List<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<List<String>, Double> s) {
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2 / s._1.size()));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) throws Exception {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2 tuple : output) {
        System.out.println(tuple._1 + " has rank: " + tuple._2 + ".");
    }

    System.exit(0);
}