Java tutorial

This walkthrough covers com.cloudera.oryx.ml.mllib.als.ALSUpdate, a specialization of Oryx's MLUpdate that builds a matrix factorization model from its input using Spark MLlib's Alternating Least Squares (ALS) implementation and serializes the result as PMML. The full source follows.
/*
 * Copyright (c) 2014, Cloudera and Intel, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.oryx.ml.mllib.als;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Preconditions;
import com.typesafe.config.Config;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import org.apache.spark.rdd.RDD;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.util.StatCounter;
import org.dmg.pmml.PMML;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.reflect.ClassTag$;

import com.cloudera.oryx.common.collection.Pair;
import com.cloudera.oryx.common.pmml.PMMLUtils;
import com.cloudera.oryx.lambda.TopicProducer;
import com.cloudera.oryx.lambda.fn.Functions;
import com.cloudera.oryx.ml.MLUpdate;
import com.cloudera.oryx.ml.common.fn.MLFunctions;
import com.cloudera.oryx.ml.param.HyperParamRange;
import com.cloudera.oryx.ml.param.HyperParamRanges;
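// Summary of the flow implemented below (this comment is an editorial addition,
// not part of the original source):
//  1. buildModel: parse input lines, aggregate scores per (user, item), run MLlib ALS,
//     and serialize the factored matrices as PMML extensions plus side files X/ and Y/.
//  2. evaluate: score a candidate model by AUC (implicit data) or 1/RMSE (explicit data).
//  3. publishAdditionalModelData: push user (X) and item (Y) feature vectors, and
//     optionally each user's known items, to the model update topic.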
/**
 * A specialization of {@link MLUpdate} that creates a matrix factorization model of its
 * input, using the Alternating Least Squares algorithm.
 */
public final class ALSUpdate extends MLUpdate<String> {

  private static final Logger log = LoggerFactory.getLogger(ALSUpdate.class);
  private static final ObjectMapper MAPPER = new ObjectMapper();

  private final int iterations;
  private final boolean implicit;
  private final List<HyperParamRange> hyperParamRanges;
  private final boolean noKnownItems;

  public ALSUpdate(Config config) {
    super(config);
    iterations = config.getInt("oryx.als.iterations");
    implicit = config.getBoolean("oryx.als.implicit");
    Preconditions.checkArgument(iterations > 0);
    hyperParamRanges = Arrays.asList(
        HyperParamRanges.fromConfig(config, "oryx.als.hyperparams.features"),
        HyperParamRanges.fromConfig(config, "oryx.als.hyperparams.lambda"),
        HyperParamRanges.fromConfig(config, "oryx.als.hyperparams.alpha"));
    noKnownItems = config.getBoolean("oryx.als.no-known-items");
  }

  @Override
  public List<HyperParamRange> getHyperParameterRanges() {
    return hyperParamRanges;
  }

  @Override
  public PMML buildModel(JavaSparkContext sparkContext,
                         JavaRDD<String> trainData,
                         List<Number> hyperParams,
                         Path candidatePath) {
    log.info("Building model with params {}", hyperParams);

    // Hyperparameters arrive in the same order as the ranges declared in the
    // constructor: features, lambda, alpha
    int features = hyperParams.get(0).intValue();
    double lambda = hyperParams.get(1).doubleValue();
    double alpha = hyperParams.get(2).doubleValue();
    Preconditions.checkArgument(features > 0);
    Preconditions.checkArgument(lambda >= 0.0);
    Preconditions.checkArgument(alpha > 0.0);

    JavaRDD<Rating> trainRatingData = parsedToRatingRDD(trainData.map(MLFunctions.PARSE_FN));
    trainRatingData = aggregateScores(trainRatingData);
    MatrixFactorizationModel model;
    if (implicit) {
      model = ALS.trainImplicit(trainRatingData.rdd(), features, iterations, lambda, alpha);
    } else {
      model = ALS.train(trainRatingData.rdd(), features, iterations, lambda);
    }
    PMML pmml = mfModelToPMML(model, features, lambda, alpha, implicit, candidatePath);
    unpersist(model);
    return pmml;
  }

  @Override
  public double evaluate(JavaSparkContext sparkContext,
                         PMML model,
                         Path modelParentPath,
                         JavaRDD<String> testData) {
    log.info("Evaluating model");
    JavaRDD<Rating> testRatingData = parsedToRatingRDD(testData.map(MLFunctions.PARSE_FN));
    testRatingData = aggregateScores(testRatingData);
    MatrixFactorizationModel mfModel = pmmlToMFModel(sparkContext, model, modelParentPath);
    double eval;
    if (implicit) {
      double auc = AUC.areaUnderCurve(sparkContext, mfModel, testRatingData);
      log.info("AUC: {}", auc);
      eval = auc;
    } else {
      double rmse = RMSE.rmse(mfModel, testRatingData);
      log.info("RMSE: {}", rmse);
      // Lower RMSE is better, so invert it so that a larger result always means a better model
      eval = 1.0 / rmse;
    }
    unpersist(mfModel);
    return eval;
  }

  /**
   * Manually unpersists the RDDs that are persisted inside a model.
   *
   * @param model model whose RDDs were persisted
   */
  private static void unpersist(MatrixFactorizationModel model) {
    model.userFeatures().unpersist(false);
    model.productFeatures().unpersist(false);
  }
  @Override
  public void publishAdditionalModelData(JavaSparkContext sparkContext,
                                         PMML pmml,
                                         JavaRDD<String> newData,
                                         JavaRDD<String> pastData,
                                         Path modelParentPath,
                                         TopicProducer<String, String> modelUpdateTopic) {
    JavaRDD<String[]> allData =
        (pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);

    log.info("Sending user / X data as model updates");
    String xPathString = PMMLUtils.getExtensionValue(pmml, "X");
    JavaPairRDD<Integer, double[]> userRDD =
        fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString)));

    if (noKnownItems) {
      userRDD.foreach(new EnqueueFeatureVecsFn("X", modelUpdateTopic));
    } else {
      log.info("Sending known item data with model updates");
      JavaPairRDD<Integer, Collection<Integer>> knownItems = knownsRDD(allData, true);
      userRDD.join(knownItems).foreach(
          new EnqueueFeatureVecsAndKnownItemsFn("X", modelUpdateTopic));
    }

    log.info("Sending item / Y data as model updates");
    String yPathString = PMMLUtils.getExtensionValue(pmml, "Y");
    JavaPairRDD<Integer, double[]> productRDD =
        fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString)));

    // For now, there is no use in sending known users for each item
    //if (noKnownItems) {
    productRDD.foreach(new EnqueueFeatureVecsFn("Y", modelUpdateTopic));
    //} else {
    //  log.info("Sending known user data with model updates");
    //  JavaPairRDD<Integer,Collection<Integer>> knownUsers = knownsRDD(allData, false);
    //  productRDD.join(knownUsers).foreach(
    //      new EnqueueFeatureVecsAndKnownItemsFn("Y", modelUpdateTopic));
    //}
  }

  /**
   * Implementation which splits based solely on time. It returns approximately the earliest
   * {@link #getTestFraction()} fraction of the input, ordered by timestamp, as new training
   * data, and the rest as test data.
   */
  @Override
  protected Pair<JavaRDD<String>, JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
    // Rough approximation; assumes timestamps are fairly evenly distributed
    StatCounter maxMin = newData.mapToDouble(new DoubleFunction<String>() {
      @Override
      public double call(String line) throws Exception {
        return MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue();
      }
    }).stats();

    long minTime = (long) maxMin.min();
    long maxTime = (long) maxMin.max();
    log.info("New data timestamp range: {} - {}", minTime, maxTime);
    final long approxTestTrainBoundary =
        minTime + (long) (getTestFraction() * (maxTime - minTime));
    log.info("Splitting at timestamp {}", approxTestTrainBoundary);

    JavaRDD<String> newTrainData = newData.filter(new Function<String, Boolean>() {
      @Override
      public Boolean call(String line) throws Exception {
        return MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary;
      }
    });
    JavaRDD<String> testData = newData.filter(new Function<String, Boolean>() {
      @Override
      public Boolean call(String line) throws Exception {
        return MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary;
      }
    });

    return new Pair<>(newTrainData, testData);
  }

  /**
   * @param parsedRDD parsed input as {@code String[]}
   * @return {@link Rating}s ordered by timestamp
   */
  private static JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD) {
    return parsedRDD.mapToPair(new PairFunction<String[], Long, Rating>() {
      @Override
      public Tuple2<Long, Rating> call(String[] tokens) {
        return new Tuple2<>(Long.valueOf(tokens[3]),
            new Rating(Integer.parseInt(tokens[0]),
                       Integer.parseInt(tokens[1]),
                       // Empty value means 'delete'; propagate as NaN
                       tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2])));
      }
    }).sortByKey().values();
  }
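  // For illustration (an editorial addition, not in the original source): each parsed input
  // record is expected to carry user, item, strength value, and timestamp at the token indices
  // used in parsedToRatingRDD above. With made-up values, assuming comma-delimited input:
  //
  //   "4,21,3.5,1416530729000"  -> new Rating(4, 21, 3.5), sorted by key 1416530729000L
  //   "4,21,,1416530729000"     -> new Rating(4, 21, Double.NaN)  (empty value means 'delete')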
  /**
   * Combines {@link Rating}s with the same user/item into one. For implicit data, the
   * resulting score is the sum of all of the scores; for explicit data, the most recent
   * score wins.
   */
  private JavaRDD<Rating> aggregateScores(JavaRDD<Rating> original) {
    JavaPairRDD<Tuple2<Integer, Integer>, Double> tuples =
        original.mapToPair(new RatingToTupleDouble());

    JavaPairRDD<Tuple2<Integer, Integer>, Double> aggregated;
    if (implicit) {
      // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
      // they don't guarantee the delete elements are properly handled
      aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
    } else {
      // For non-implicit, last wins.
      aggregated = tuples.foldByKey(Double.NaN, Functions.<Double>last());
    }

    return aggregated.filter(MLFunctions.<Tuple2<Integer, Integer>>notNaNValue())
        .map(new Function<Tuple2<Tuple2<Integer, Integer>, Double>, Rating>() {
          @Override
          public Rating call(Tuple2<Tuple2<Integer, Integer>, Double> userProductScore) {
            Tuple2<Integer, Integer> userProduct = userProductScore._1();
            return new Rating(userProduct._1(), userProduct._2(), userProductScore._2());
          }
        });
  }

  /**
   * There is no actual serialization of a massive factored matrix model into PMML.
   * Instead, we create an ad-hoc serialization where the model just contains pointers
   * to files that contain the matrix data, as Extensions.
   */
  private static PMML mfModelToPMML(MatrixFactorizationModel model,
                                    int features,
                                    double lambda,
                                    double alpha,
                                    boolean implicit,
                                    Path candidatePath) {
    saveFeaturesRDD(model.userFeatures(), new Path(candidatePath, "X"));
    saveFeaturesRDD(model.productFeatures(), new Path(candidatePath, "Y"));

    PMML pmml = PMMLUtils.buildSkeletonPMML();
    PMMLUtils.addExtension(pmml, "X", "X/");
    PMMLUtils.addExtension(pmml, "Y", "Y/");
    PMMLUtils.addExtension(pmml, "features", Integer.toString(features));
    PMMLUtils.addExtension(pmml, "lambda", Double.toString(lambda));
    PMMLUtils.addExtension(pmml, "implicit", Boolean.toString(implicit));
    if (implicit) {
      PMMLUtils.addExtension(pmml, "alpha", Double.toString(alpha));
    }
    addIDsExtension(pmml, "XIDs", model.userFeatures());
    addIDsExtension(pmml, "YIDs", model.productFeatures());
    return pmml;
  }

  private static void addIDsExtension(PMML pmml,
                                      String key,
                                      RDD<Tuple2<Object, double[]>> features) {
    List<String> ids = fromRDD(features).keys().map(Functions.toStringValue()).collect();
    PMMLUtils.addExtensionContent(pmml, key, ids);
  }

  private static void saveFeaturesRDD(RDD<Tuple2<Object, double[]>> features, Path path) {
    log.info("Saving features RDD to {}", path);
    fromRDD(features).map(new Function<Tuple2<Object, double[]>, String>() {
      @Override
      public String call(Tuple2<Object, double[]> keyAndVector) throws IOException {
        Object key = keyAndVector._1();
        double[] vector = keyAndVector._2();
        return MAPPER.writeValueAsString(Arrays.asList(key, vector));
      }
    }).saveAsTextFile(path.toString(), GzipCodec.class);
  }
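  // For illustration (an editorial addition, not in the original source): saveFeaturesRDD above
  // writes one JSON array per line, [key, featureVector], gzipped under the X/ or Y/ directory,
  // and readFeaturesRDD below parses the same layout back. A row might look like (values made up):
  //
  //   [4,[0.13,-0.52,0.07]]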
  private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
                                                        PMML pmml,
                                                        Path modelParentPath) {
    String xPathString = PMMLUtils.getExtensionValue(pmml, "X");
    String yPathString = PMMLUtils.getExtensionValue(pmml, "Y");
    RDD<Tuple2<Integer, double[]>> userRDD =
        readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
    RDD<Tuple2<Integer, double[]>> productRDD =
        readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
    // This mimics the persistence level established by ALS training methods
    userRDD.persist(StorageLevel.MEMORY_AND_DISK());
    productRDD.persist(StorageLevel.MEMORY_AND_DISK());
    // Cast is needed for some reason with this Scala API returning array
    @SuppressWarnings("unchecked")
    Tuple2<?, double[]> first = (Tuple2<?, double[]>) ((Object[]) userRDD.take(1))[0];
    int rank = first._2().length;
    return new MatrixFactorizationModel(
        rank, massageToObjectKey(userRDD), massageToObjectKey(productRDD));
  }

  private static RDD<Tuple2<Integer, double[]>> readFeaturesRDD(JavaSparkContext sparkContext,
                                                                Path path) {
    log.info("Loading features RDD from {}", path);
    JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
    return featureLines.map(new Function<String, Tuple2<Integer, double[]>>() {
      @Override
      public Tuple2<Integer, double[]> call(String line) throws IOException {
        List<?> update = MAPPER.readValue(line, List.class);
        Integer key = Integer.valueOf(update.get(0).toString());
        double[] vector = MAPPER.convertValue(update.get(1), double[].class);
        return new Tuple2<>(key, vector);
      }
    }).rdd();
  }

  private static <A, B> RDD<Tuple2<Object, B>> massageToObjectKey(RDD<Tuple2<A, B>> in) {
    // More horrible hacks to get around Scala-Java unfriendliness
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object, B>> result = (RDD<Tuple2<Object, B>>) (RDD<?>) in;
    return result;
  }

  private static JavaPairRDD<Integer, Collection<Integer>> knownsRDD(JavaRDD<String[]> allData,
                                                                     final boolean knownItems) {
    JavaRDD<String[]> sorted = allData.sortBy(new Function<String[], Long>() {
      @Override
      public Long call(String[] datum) {
        return Long.valueOf(datum[3]);
      }
    }, true, allData.partitions().size());

    JavaPairRDD<Integer, Tuple2<Integer, Boolean>> tuples = sorted
        .mapToPair(new PairFunction<String[], Integer, Tuple2<Integer, Boolean>>() {
          @Override
          public Tuple2<Integer, Tuple2<Integer, Boolean>> call(String[] datum) {
            Integer user = Integer.valueOf(datum[0]);
            Integer item = Integer.valueOf(datum[1]);
            Boolean delete = datum[2].isEmpty();
            return knownItems ?
                new Tuple2<>(user, new Tuple2<>(item, delete)) :
                new Tuple2<>(item, new Tuple2<>(user, delete));
          }
        });

    // TODO likely need to figure out a way to avoid groupByKey but collectByKey
    // won't work here -- doesn't guarantee enough about ordering
    return tuples.groupByKey()
        .mapValues(new Function<Iterable<Tuple2<Integer, Boolean>>, Collection<Integer>>() {
          @Override
          public Collection<Integer> call(Iterable<Tuple2<Integer, Boolean>> idDeletes) {
            // Replay adds and deletes in timestamp order to get the surviving set of IDs
            Collection<Integer> ids = new HashSet<>();
            for (Tuple2<Integer, Boolean> idDelete : idDeletes) {
              if (idDelete._2()) {
                ids.remove(idDelete._1());
              } else {
                ids.add(idDelete._1());
              }
            }
            return ids;
          }
        });
  }

  private static <K, V> JavaPairRDD<K, V> fromRDD(RDD<Tuple2<K, V>> rdd) {
    return JavaPairRDD.fromRDD(rdd,
        ClassTag$.MODULE$.<K>apply(Object.class),
        ClassTag$.MODULE$.<V>apply(Object.class));
  }

}
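To see how the constructor's configuration keys fit together, here is a minimal construction sketch. It is not from the original source: it assumes the Oryx defaults are on the classpath so that MLUpdate's own keys resolve through ConfigFactory.load(), that each hyperparam key accepts a single fixed value in whatever syntax HyperParamRanges.fromConfig() supports in this version, and all literal values are illustrative only.

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.cloudera.oryx.ml.mllib.als.ALSUpdate;

public class ALSUpdateExample {
  public static void main(String[] args) {
    // Hypothetical settings; the keys match those read in ALSUpdate's constructor,
    // but the values (and the hyperparam value syntax) are assumptions.
    Config config = ConfigFactory.parseString(
        "oryx.als.iterations = 10\n" +
        "oryx.als.implicit = true\n" +
        "oryx.als.no-known-items = false\n" +
        "oryx.als.hyperparams.features = 20\n" +
        "oryx.als.hyperparams.lambda = 0.001\n" +
        "oryx.als.hyperparams.alpha = 1.0\n")
        .withFallback(ConfigFactory.load());  // pull in remaining Oryx/MLUpdate defaults

    ALSUpdate update = new ALSUpdate(config);
    // The three ranges come back in the order declared: features, lambda, alpha
    System.out.println(update.getHyperParameterRanges());
  }
}

From here, the Oryx framework drives the lifecycle: it calls buildModel once per hyperparameter combination, scores each candidate with evaluate, and invokes publishAdditionalModelData for the chosen model.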