Usage examples for org.apache.spark.api.java.function.DoubleFunction
From source file:com.anhth12.lambda.app.ml.als.Evaluation.java
/** * Computes root mean squared error/*from w ww . ja v a 2s . co m*/ * * @param mfModel * @param testData * @return */ static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) { JavaPairRDD<Tuple2<Integer, Integer>, Double> testUserProductValues = testData .mapToPair(new RatingToTupleDouble()); RDD<Tuple2<Object, Object>> testUserProducts = (RDD<Tuple2<Object, Object>>) (RDD<?>) testUserProductValues .keys().rdd(); JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts)); double mse = predictions.mapToPair(new RatingToTupleDouble()).join(testUserProductValues).values() .mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() { @Override public double call(Tuple2<Double, Double> valuePrediction) throws Exception { double diff = valuePrediction._1() - valuePrediction._2(); return diff * diff; } }).mean(); return Math.sqrt(mse); }
From source file:com.anhth12.lambda.app.ml.als.Evaluation.java
/** * Compute AUC (area under the ROC curve) as a recommender evaluation * * @param sparkContext/*from w ww .jav a 2 s . c om*/ * @param mfModel * @param positiveData * @return */ static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel, JavaRDD<Rating> positiveData) { JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData .mapToPair(new PairFunction<Rating, Integer, Integer>() { @Override public Tuple2<Integer, Integer> call(Rating t) throws Exception { return new Tuple2<>(t.user(), t.product()); } }); JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData, positiveUserProducts); final Broadcast<List<Integer>> allItemIDsBC = sparkContext .broadcast(positiveUserProducts.values().distinct().collect()); JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey() .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() { private final RandomGenerator random = RandomManager.getRandom(); @Override public Iterable<Tuple2<Integer, Integer>> call( Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) throws Exception { Integer userID = userIDsAndItemIDs._1; Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2()); int numPositive = positiveItemIDs.size(); Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive); List<Integer> allItemIDs = allItemIDsBC.value(); int numItems = allItemIDs.size(); for (int i = 0; i < numItems && negative.size() < numPositive; i++) { Integer itemID = allItemIDs.get(random.nextInt(numItems)); if (!positiveItemIDs.contains(itemID)) { negative.add(new Tuple2<>(userID, itemID)); } } return negative; } }); JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData, negativeUserProducts); return positivePredictions.join(negativePredictions).values() .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() { 
@Override public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) throws Exception { //AUC is also the probability that random positive examples //ranking higher than random examples at large. Heare wer compare all random negative //examples to all positive exampls and rapost the totals as an alternative //computatioin for AUC long correct = 0; long total = 0; for (Rating positive : t._1()) { for (Rating negative : t._2()) { if (positive.rating() > negative.rating()) { correct++; } total++; } } return (double) correct / total; } }).mean(); }
From source file:com.cloudera.oryx.app.mllib.als.ALSUpdate.java
License:Open Source License
/** * Implementation which splits based solely on time. It will return approximately * the earliest {@link #getTestFraction()} of input, ordered by timestamp, as new training * data and the rest as test data./*from w w w.j a va 2s .c om*/ */ @Override protected Pair<JavaRDD<String>, JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) { // Rough approximation; assumes timestamps are fairly evenly distributed StatCounter maxMin = newData.mapToDouble(new DoubleFunction<String>() { @Override public double call(String line) throws Exception { return MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue(); } }).stats(); long minTime = (long) maxMin.min(); long maxTime = (long) maxMin.max(); log.info("New data timestamp range: {} - {}", minTime, maxTime); final long approxTestTrainBoundary = minTime + (long) (getTestFraction() * (maxTime - minTime)); log.info("Splitting at timestamp {}", approxTestTrainBoundary); JavaRDD<String> newTrainData = newData.filter(new Function<String, Boolean>() { @Override public Boolean call(String line) throws Exception { return MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary; } }); JavaRDD<String> testData = newData.filter(new Function<String, Boolean>() { @Override public Boolean call(String line) throws Exception { return MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary; } }); return new Pair<>(newTrainData, testData); }
From source file:com.cloudera.oryx.app.mllib.als.Evaluation.java
License:Open Source License
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 *
 * @param mfModel  factorization model used to predict ratings
 * @param testData held-out ratings to score against
 * @return RMSE over all test (user,product) pairs
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
    // Key each test rating by its (user, product) pair
    JavaPairRDD<Tuple2<Integer, Integer>, Double> testUserProductValues =
        testData.mapToPair(new RatingToTupleDouble());
    // Cast is safe: Integer keys are Objects
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object, Object>> testUserProducts =
        (RDD<Tuple2<Object, Object>>) (RDD<?>) testUserProductValues.keys().rdd();
    JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
    // Join predictions with actuals on (user, product) and average the squared differences
    double mse = predictions.mapToPair(new RatingToTupleDouble())
            .join(testUserProductValues)
            .values()
            .mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() {
                @Override
                public double call(Tuple2<Double, Double> valuePrediction) {
                    double diff = valuePrediction._1() - valuePrediction._2();
                    return diff * diff;
                }
            }).mean();
    return Math.sqrt(mse);
}
From source file:com.cloudera.oryx.app.mllib.als.Evaluation.java
License:Open Source License
/** * Computes AUC (area under the ROC curve) as a recommender evaluation metric. * Really, it computes what might be described as "Mean AUC", as it computes AUC per * user and averages them.//from www . j ava2 s .c o m */ static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel, JavaRDD<Rating> positiveData) { // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it // is intended to operate on one large set of (score,label) pairs. The computation // here is really many small AUC problems, for which a much faster direct computation // is available. // Extract all positive (user,product) pairs JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData .mapToPair(new PairFunction<Rating, Integer, Integer>() { @Override public Tuple2<Integer, Integer> call(Rating rating) { return new Tuple2<>(rating.user(), rating.product()); } }); JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData, positiveUserProducts); // All distinct item IDs, to be broadcast final Broadcast<List<Integer>> allItemIDsBC = sparkContext .broadcast(positiveUserProducts.values().distinct().collect()); JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey() .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() { private final RandomGenerator random = RandomManager.getRandom(); @Override public Iterable<Tuple2<Integer, Integer>> call( Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) { Integer userID = userIDsAndItemIDs._1(); Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2()); int numPositive = positiveItemIDs.size(); Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive); List<Integer> allItemIDs = allItemIDsBC.value(); int numItems = allItemIDs.size(); // Sample about as many negative examples as positive for (int i = 0; i < numItems && negative.size() < numPositive; 
i++) { Integer itemID = allItemIDs.get(random.nextInt(numItems)); if (!positiveItemIDs.contains(itemID)) { negative.add(new Tuple2<>(userID, itemID)); } } return negative; } }); JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData, negativeUserProducts); return positivePredictions.join(negativePredictions).values() .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() { @Override public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) { // AUC is also the probability that random positive examples // rank higher than random examples at large. Here we compare all random negative // examples to all positive examples and report the totals as an alternative // computation for AUC long correct = 0; long total = 0; for (Rating positive : t._1()) { for (Rating negative : t._2()) { if (positive.rating() > negative.rating()) { correct++; } total++; } } return (double) correct / total; } }).mean(); }
From source file:com.cloudera.oryx.app.mllib.rdf.Evaluation.java
License:Open Source License
/**
 * Computes root mean squared error of the forest's numeric predictions over the examples.
 *
 * @param forest   decision forest producing {@code NumericPrediction}s
 * @param examples labeled examples with numeric targets
 * @return RMSE of prediction versus target value
 */
static double rmse(final DecisionForest forest, JavaRDD<Example> examples) {
    double mse = examples.mapToDouble(new DoubleFunction<Example>() {
        @Override
        public double call(Example example) {
            NumericPrediction prediction = (NumericPrediction) forest.predict(example);
            NumericFeature target = (NumericFeature) example.getTarget();
            // Squared error of the forest's prediction against the actual target
            double diff = prediction.getPrediction() - target.getValue();
            return diff * diff;
        }
    }).mean();
    return Math.sqrt(mse);
}
From source file:com.cloudera.oryx.ml.mllib.als.AUC.java
License:Open Source License
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel, JavaRDD<Rating> positiveData) { // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it // is intended to operate on one large set of (score,label) pairs. The computation // here is really many small AUC problems, for which a much faster direct computation // is available. // Extract all positive (user,product) pairs JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData .mapToPair(new PairFunction<Rating, Integer, Integer>() { @Override//w ww.ja v a 2s .c o m public Tuple2<Integer, Integer> call(Rating rating) { return new Tuple2<>(rating.user(), rating.product()); } }); JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData, positiveUserProducts); // All distinct item IDs, to be broadcast final Broadcast<List<Integer>> allItemIDsBC = sparkContext .broadcast(positiveUserProducts.values().distinct().collect()); JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey() .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() { private final RandomGenerator random = RandomManager.getRandom(); @Override public Iterable<Tuple2<Integer, Integer>> call( Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) { Integer userID = userIDsAndItemIDs._1(); Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2()); int numPositive = positiveItemIDs.size(); Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive); List<Integer> allItemIDs = allItemIDsBC.value(); int numItems = allItemIDs.size(); // Sample about as many negative examples as positive for (int i = 0; i < numItems && negative.size() < numPositive; i++) { Integer itemID = allItemIDs.get(random.nextInt(numItems)); if (!positiveItemIDs.contains(itemID)) { negative.add(new Tuple2<>(userID, itemID)); } } return negative; } }); JavaPairRDD<Integer, 
Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData, negativeUserProducts); return positivePredictions.join(negativePredictions).values() .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() { @Override public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) { // AUC is also the probability that random positive examples // rank higher than random examples at large. Here we compare all random negative // examples to all positive examples and report the totals as an alternative // computation for AUC long correct = 0; long total = 0; for (Rating positive : t._1()) { for (Rating negative : t._2()) { if (positive.rating() > negative.rating()) { correct++; } total++; } } return (double) correct / total; } }).mean(); }
From source file:com.cloudera.oryx.ml.mllib.als.RMSE.java
License:Open Source License
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) { JavaPairRDD<Tuple2<Integer, Integer>, Double> testUserProductValues = testData .mapToPair(new RatingToTupleDouble()); @SuppressWarnings("unchecked") RDD<Tuple2<Object, Object>> testUserProducts = (RDD<Tuple2<Object, Object>>) (RDD<?>) testUserProductValues .keys().rdd();//from w w w . j av a2s .c o m JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts)); double mse = predictions.mapToPair(new RatingToTupleDouble()).join(testUserProductValues).values() .mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() { @Override public double call(Tuple2<Double, Double> valuePrediction) { double diff = valuePrediction._1() - valuePrediction._2(); return diff * diff; } }).mean(); return Math.sqrt(mse); }
From source file:com.cloudera.spark.movie.JavaMovieLensALS.java
License:Apache License
/** Compute RMSE (Root Mean Squared Error). */ public static double computeRmse(MatrixFactorizationModel model, JavaRDD<Rating> data) { // user product RDD JavaPairRDD<Integer, Integer> userProductRDD = data.mapToPair(new PairFunction<Rating, Integer, Integer>() { @Override/*from w ww .j a v a 2 s. co m*/ public Tuple2<Integer, Integer> call(Rating rating) throws Exception { return new Tuple2<Integer, Integer>(rating.user(), rating.product()); } }); // predict test data JavaRDD<Rating> predictions = model.predict(userProductRDD); // map test data to pair (user/product) & rating JavaPairRDD<Tuple2<Integer, Integer>, Double> dataPair = data .mapToPair(new PairFunction<Rating, Tuple2<Integer, Integer>, Double>() { @Override public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating rating) throws Exception { return new Tuple2<Tuple2<Integer, Integer>, Double>( new Tuple2<Integer, Integer>(rating.user(), rating.product()), rating.rating()); } }); // map predictions to pair (user/product) & rating JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsPair = predictions .mapToPair(new PairFunction<Rating, Tuple2<Integer, Integer>, Double>() { @Override public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating rating) throws Exception { return new Tuple2<Tuple2<Integer, Integer>, Double>( new Tuple2<Integer, Integer>(rating.user(), rating.product()), rating.rating()); } }); // join predictions pair to test data pair JavaRDD<Tuple2<Double, Double>> origPredRatingRDD = predictionsPair.join(dataPair).values(); // compute rmse JavaDoubleRDD errorRDD = origPredRatingRDD.mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() { @Override public double call(Tuple2<Double, Double> doubleDoubleTuple2) throws Exception { return (doubleDoubleTuple2._1() - doubleDoubleTuple2._2()) * (doubleDoubleTuple2._1() - doubleDoubleTuple2._2()); } }); double rmse = errorRDD.mean(); return rmse; }
From source file:com.thinkbiganalytics.spark.dataprofiler.histo.HistogramStatistics.java
License:Apache License
public void accomodate(final Integer columnIndex, JavaRDD<Row> javaRDD, StructField columnField) { try {//from w ww. ja v a 2 s . c o m if (isNumeric(columnField)) { Tuple2<double[], long[]> histogram = javaRDD.filter(new Function<Row, Boolean>() { @Override public Boolean call(Row row) throws Exception { return !row.isNullAt(columnIndex); } }).mapToDouble(new DoubleFunction<Row>() { @Override public double call(Row row) throws Exception { return Double.parseDouble(row.get(columnIndex).toString()); } }).histogram(bins); ObjectMapper mapper = new ObjectMapper(); String jsonHisto = mapper.writeValueAsString(histogram); OutputRow row = new OutputRow(columnField.name(), "HISTO", jsonHisto); this.outputRows.add(row); } } catch (Exception e) { log.warn("Histogram generation failed for column {}", columnField.name(), e); } }