Example usage for org.apache.spark.api.java.function DoubleFunction DoubleFunction

List of usage examples for org.apache.spark.api.java.function DoubleFunction DoubleFunction

Introduction

In this page you can find the example usage for org.apache.spark.api.java.function DoubleFunction DoubleFunction.

Prototype

DoubleFunction

Source Link

Usage

From source file:com.anhth12.lambda.app.ml.als.Evaluation.java

/**
 * Computes root mean squared error/*from   w ww  .  ja  v  a  2s .  co  m*/
 *
 * @param mfModel
 * @param testData
 * @return
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
    JavaPairRDD<Tuple2<Integer, Integer>, Double> testUserProductValues = testData
            .mapToPair(new RatingToTupleDouble());
    RDD<Tuple2<Object, Object>> testUserProducts = (RDD<Tuple2<Object, Object>>) (RDD<?>) testUserProductValues
            .keys().rdd();
    JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
    double mse = predictions.mapToPair(new RatingToTupleDouble()).join(testUserProductValues).values()
            .mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() {

                @Override
                public double call(Tuple2<Double, Double> valuePrediction) throws Exception {
                    double diff = valuePrediction._1() - valuePrediction._2();
                    return diff * diff;
                }
            }).mean();

    return Math.sqrt(mse);
}

From source file:com.anhth12.lambda.app.ml.als.Evaluation.java

/**
 * Computes AUC (area under the ROC curve) as a recommender evaluation metric.
 * For each user, predicted scores of known positive items are compared against
 * scores of randomly sampled negative items, and the per-user results are averaged.
 *
 * @param sparkContext context used to broadcast the set of all item IDs
 * @param mfModel factorization model whose predictions are evaluated
 * @param positiveData ratings regarded as positive (relevant) interactions
 * @return mean AUC over users
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {

                @Override
                public Tuple2<Integer, Integer> call(Rating t) throws Exception {
                    return new Tuple2<>(t.user(), t.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, broadcast to every task for negative sampling
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) throws Exception {
                    Integer userID = userIDsAndItemIDs._1;
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();

                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);

                    List<Integer> allItemIDs = allItemIDsBC.value();

                    int numItems = allItemIDs.size();

                    // Sample about as many negative examples as this user has positives;
                    // the loop may stop early when most items are already positives.
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }

                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {

                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) throws Exception {
                    // AUC is also the probability that a random positive example
                    // ranks higher than a random example at large. Here we compare all
                    // negative examples to all positive examples and report the totals
                    // as an alternative computation for AUC
                    long correct = 0;
                    long total = 0;

                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }

                    return (double) correct / total;
                }
            }).mean();

}

From source file:com.cloudera.oryx.app.mllib.als.ALSUpdate.java

License:Open Source License

/**
 * Implementation which splits based solely on time. It will return approximately
 * the earliest {@link #getTestFraction()} of input, ordered by timestamp, as new training
 * data and the rest as test data./*from w  w w.j a  va  2s  .c om*/
 */
@Override
protected Pair<JavaRDD<String>, JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
    // Rough approximation; assumes timestamps are fairly evenly distributed
    StatCounter maxMin = newData.mapToDouble(new DoubleFunction<String>() {
        @Override
        public double call(String line) throws Exception {
            return MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue();
        }
    }).stats();

    long minTime = (long) maxMin.min();
    long maxTime = (long) maxMin.max();
    log.info("New data timestamp range: {} - {}", minTime, maxTime);
    final long approxTestTrainBoundary = minTime + (long) (getTestFraction() * (maxTime - minTime));
    log.info("Splitting at timestamp {}", approxTestTrainBoundary);

    JavaRDD<String> newTrainData = newData.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String line) throws Exception {
            return MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary;
        }
    });
    JavaRDD<String> testData = newData.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String line) throws Exception {
            return MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary;
        }
    });

    return new Pair<>(newTrainData, testData);
}

From source file:com.cloudera.oryx.app.mllib.als.Evaluation.java

License:Open Source License

/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
    // Actual ratings keyed by their (user,product) pair
    JavaPairRDD<Tuple2<Integer, Integer>, Double> actualByUserProduct = testData
            .mapToPair(new RatingToTupleDouble());
    // Double cast bridges Java/Scala generics to obtain the raw key RDD
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object, Object>> userProducts = (RDD<Tuple2<Object, Object>>) (RDD<?>) actualByUserProduct
            .keys().rdd();
    JavaRDD<Rating> predicted = testData.wrapRDD(mfModel.predict(userProducts));
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictedByUserProduct = predicted
            .mapToPair(new RatingToTupleDouble());
    // Join predictions with actuals on (user,product) and average the squared errors
    double meanSquaredError = predictedByUserProduct.join(actualByUserProduct).values()
            .mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() {
                @Override
                public double call(Tuple2<Double, Double> actualAndPredicted) {
                    double error = actualAndPredicted._1() - actualAndPredicted._2();
                    return error * error;
                }
            }).mean();
    return Math.sqrt(meanSquaredError);
}

From source file:com.cloudera.oryx.app.mllib.als.Evaluation.java

License:Open Source License

/**
 * Computes AUC (area under the ROC curve) as a recommender evaluation metric.
 * Really, it computes what might be described as "Mean AUC", as it computes AUC per
 * user and averages them.
 *
 * @param sparkContext context used to broadcast the set of all item IDs
 * @param mfModel factorization model whose predictions are evaluated
 * @param positiveData ratings regarded as positive (relevant) interactions
 * @return mean AUC over users
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it
    // is intended to operate on one large set of (score,label) pairs. The computation
    // here is really many small AUC problems, for which a much faster direct computation
    // is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Rating rating) {
                    return new Tuple2<>(rating.user(), rating.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    // For each user, randomly draw items not in that user's positive set
    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {
                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) {
                    // AUC is also the probability that random positive examples
                    // rank higher than random examples at large. Here we compare all random negative
                    // examples to all positive examples and report the totals as an alternative
                    // computation for AUC
                    long correct = 0;
                    long total = 0;
                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }
                    return (double) correct / total;
                }
            }).mean();
}

From source file:com.cloudera.oryx.app.mllib.rdf.Evaluation.java

License:Open Source License

/**
 * Computes root mean squared error of the forest's numeric predictions
 * against each example's target value.
 */
static double rmse(final DecisionForest forest, JavaRDD<Example> examples) {
    // Average the squared (predicted - actual) differences across all examples
    double meanSquaredError = examples.mapToDouble(new DoubleFunction<Example>() {
        @Override
        public double call(Example example) {
            NumericPrediction predicted = (NumericPrediction) forest.predict(example);
            NumericFeature actual = (NumericFeature) example.getTarget();
            double error = predicted.getPrediction() - actual.getValue();
            return error * error;
        }
    }).mean();
    return Math.sqrt(meanSquaredError);
}

From source file:com.cloudera.oryx.ml.mllib.als.AUC.java

License:Open Source License

/**
 * Computes AUC (area under the ROC curve) as a recommender evaluation metric;
 * effectively a "mean AUC", computed per user and then averaged.
 *
 * @param sparkContext context used to broadcast the set of all item IDs
 * @param mfModel factorization model whose predictions are evaluated
 * @param positiveData ratings regarded as positive (relevant) interactions
 * @return mean AUC over users
 */
static double areaUnderCurve(JavaSparkContext sparkContext, MatrixFactorizationModel mfModel,
        JavaRDD<Rating> positiveData) {

    // This does not use Spark's BinaryClassificationMetrics.areaUnderROC because it
    // is intended to operate on one large set of (score,label) pairs. The computation
    // here is really many small AUC problems, for which a much faster direct computation
    // is available.

    // Extract all positive (user,product) pairs
    JavaPairRDD<Integer, Integer> positiveUserProducts = positiveData
            .mapToPair(new PairFunction<Rating, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Rating rating) {
                    return new Tuple2<>(rating.user(), rating.product());
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> positivePredictions = predictAll(mfModel, positiveData,
            positiveUserProducts);

    // All distinct item IDs, to be broadcast
    final Broadcast<List<Integer>> allItemIDsBC = sparkContext
            .broadcast(positiveUserProducts.values().distinct().collect());

    // For each user, randomly draw items not in that user's positive set
    JavaPairRDD<Integer, Integer> negativeUserProducts = positiveUserProducts.groupByKey()
            .flatMapToPair(new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
                private final RandomGenerator random = RandomManager.getRandom();

                @Override
                public Iterable<Tuple2<Integer, Integer>> call(
                        Tuple2<Integer, Iterable<Integer>> userIDsAndItemIDs) {
                    Integer userID = userIDsAndItemIDs._1();
                    Collection<Integer> positiveItemIDs = Sets.newHashSet(userIDsAndItemIDs._2());
                    int numPositive = positiveItemIDs.size();
                    Collection<Tuple2<Integer, Integer>> negative = new ArrayList<>(numPositive);
                    List<Integer> allItemIDs = allItemIDsBC.value();
                    int numItems = allItemIDs.size();
                    // Sample about as many negative examples as positive
                    for (int i = 0; i < numItems && negative.size() < numPositive; i++) {
                        Integer itemID = allItemIDs.get(random.nextInt(numItems));
                        if (!positiveItemIDs.contains(itemID)) {
                            negative.add(new Tuple2<>(userID, itemID));
                        }
                    }
                    return negative;
                }
            });

    JavaPairRDD<Integer, Iterable<Rating>> negativePredictions = predictAll(mfModel, positiveData,
            negativeUserProducts);

    return positivePredictions.join(negativePredictions).values()
            .mapToDouble(new DoubleFunction<Tuple2<Iterable<Rating>, Iterable<Rating>>>() {
                @Override
                public double call(Tuple2<Iterable<Rating>, Iterable<Rating>> t) {
                    // AUC is also the probability that random positive examples
                    // rank higher than random examples at large. Here we compare all random negative
                    // examples to all positive examples and report the totals as an alternative
                    // computation for AUC
                    long correct = 0;
                    long total = 0;
                    for (Rating positive : t._1()) {
                        for (Rating negative : t._2()) {
                            if (positive.rating() > negative.rating()) {
                                correct++;
                            }
                            total++;
                        }
                    }
                    return (double) correct / total;
                }
            }).mean();
}

From source file:com.cloudera.oryx.ml.mllib.als.RMSE.java

License:Open Source License

/**
 * Computes root mean squared error of predicted versus actual ratings
 * over the held-out test data.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
    // Key each actual rating by its (user,product) pair
    JavaPairRDD<Tuple2<Integer, Integer>, Double> actualRatings = testData
            .mapToPair(new RatingToTupleDouble());
    // The double cast bridges Java and Scala generics to get the raw key RDD
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object, Object>> keysToPredict = (RDD<Tuple2<Object, Object>>) (RDD<?>) actualRatings
            .keys().rdd();
    JavaRDD<Rating> predictedRatings = testData.wrapRDD(mfModel.predict(keysToPredict));
    // Join predictions to actuals and average the squared differences
    double squaredErrorMean = predictedRatings.mapToPair(new RatingToTupleDouble())
            .join(actualRatings).values()
            .mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() {
                @Override
                public double call(Tuple2<Double, Double> pair) {
                    double delta = pair._1() - pair._2();
                    return delta * delta;
                }
            }).mean();
    return Math.sqrt(squaredErrorMean);
}

From source file:com.cloudera.spark.movie.JavaMovieLensALS.java

License:Apache License

/**
 * Compute RMSE (Root Mean Squared Error) of the model's predictions
 * against the actual ratings in {@code data}.
 *
 * @param model trained factorization model
 * @param data ratings to evaluate against
 * @return root mean squared error of predicted vs. actual ratings
 */
public static double computeRmse(MatrixFactorizationModel model, JavaRDD<Rating> data) {

    // (user, product) pairs to request predictions for
    JavaPairRDD<Integer, Integer> userProductRDD = data.mapToPair(new PairFunction<Rating, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(Rating rating) throws Exception {
            return new Tuple2<Integer, Integer>(rating.user(), rating.product());
        }
    });

    // predict test data
    JavaRDD<Rating> predictions = model.predict(userProductRDD);

    // Shared mapper: Rating -> ((user, product), rating). Used for both the
    // actual and predicted ratings so the two RDDs can be joined on key;
    // previously this identical anonymous class was duplicated twice.
    PairFunction<Rating, Tuple2<Integer, Integer>, Double> toKeyedRating =
            new PairFunction<Rating, Tuple2<Integer, Integer>, Double>() {
                @Override
                public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating rating) throws Exception {
                    return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(rating.user(), rating.product()), rating.rating());
                }
            };

    // map test data to pair (user/product) & rating
    JavaPairRDD<Tuple2<Integer, Integer>, Double> dataPair = data.mapToPair(toKeyedRating);

    // map predictions to pair (user/product) & rating
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsPair = predictions.mapToPair(toKeyedRating);

    // join predictions pair to test data pair
    JavaRDD<Tuple2<Double, Double>> origPredRatingRDD = predictionsPair.join(dataPair).values();

    // squared error per joined (predicted, actual) pair
    JavaDoubleRDD errorRDD = origPredRatingRDD.mapToDouble(new DoubleFunction<Tuple2<Double, Double>>() {
        @Override
        public double call(Tuple2<Double, Double> predictedAndActual) throws Exception {
            double diff = predictedAndActual._1() - predictedAndActual._2();
            return diff * diff;
        }
    });

    // BUG FIX: the previous version returned errorRDD.mean() directly, which is
    // the mean SQUARED error (MSE), not the RMSE this method documents.
    return Math.sqrt(errorRDD.mean());
}

From source file:com.thinkbiganalytics.spark.dataprofiler.histo.HistogramStatistics.java

License:Apache License

/**
 * Computes a histogram over the values of a numeric column and records it
 * as a JSON-serialized output row. Non-numeric columns are ignored; any
 * failure is logged rather than propagated, so one bad column does not
 * abort profiling of the others.
 *
 * @param columnIndex index of the column within each row
 * @param javaRDD rows to profile
 * @param columnField schema field describing the column
 */
public void accomodate(final Integer columnIndex, JavaRDD<Row> javaRDD, StructField columnField) {
    try {
        // Only numeric columns can be histogrammed; silently skip others.
        if (!isNumeric(columnField)) {
            return;
        }

        // Drop null cells, coerce remaining values to double, and bucket them.
        Tuple2<double[], long[]> histogram = javaRDD.filter(new Function<Row, Boolean>() {
            @Override
            public Boolean call(Row row) throws Exception {
                return !row.isNullAt(columnIndex);
            }
        }).mapToDouble(new DoubleFunction<Row>() {
            @Override
            public double call(Row row) throws Exception {
                return Double.parseDouble(row.get(columnIndex).toString());
            }
        }).histogram(bins);

        // Serialize the (bucket boundaries, counts) pair as JSON for output.
        ObjectMapper jsonMapper = new ObjectMapper();
        String histogramJson = jsonMapper.writeValueAsString(histogram);

        this.outputRows.add(new OutputRow(columnField.name(), "HISTO", histogramJson));
    } catch (Exception e) {
        // Best-effort: log and continue rather than failing the whole profile.
        log.warn("Histogram generation failed for column {}", columnField.name(), e);
    }
}