Java tutorial
/* * Copyright (c) 2015, WSO2 Inc. (http://www.wso2.org) All Rights Reserved. * * WSO2 Inc. licenses this file to you under the Apache License, * Version 2.0 (the "License"); you may not use this file except * in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.wso2.carbon.ml.core.spark.models.ext; import org.apache.commons.math3.ml.distance.EuclideanDistance; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.clustering.KMeansModel; import org.apache.spark.mllib.linalg.Vector; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * AnomalyDetectionModel model. */ public class AnomalyDetectionModel implements Serializable { private static final long serialVersionUID = 7012024887487309471L; private KMeansModel kMeansModel; private Map<Integer, List<Double>> clusterIndexToDistancesListMap; private String normalLabel; private String anomalyLabel; public KMeansModel getkMeansModel() { return kMeansModel; } public void setkMeansModel(KMeansModel kMeansModel) { this.kMeansModel = kMeansModel; } public Map<Integer, List<Double>> getClusterIndexToDistancesListMap() { return clusterIndexToDistancesListMap; } public void setClusterIndexToDistancesListMap(Map<Integer, List<Double>> clusterIndexTodistancesListMap) { this.clusterIndexToDistancesListMap = clusterIndexTodistancesListMap; } public String getNormalLabel() { return normalLabel; } public void setNormalLabel(String normalLabel) { this.normalLabel = normalLabel; } public String getAnomalyLabel() { return anomalyLabel; } public void setAnomalyLabel(String anomalyLabel) { this.anomalyLabel = anomalyLabel; } /** * This method applies a anomaly detection model to a given dataset * * @param data a single data point as a Vector * @param percentile percentile value to identify the cluster boundaries * @return prediction label as a String */ public String predict(Vector data, double percentile) { String predictions; int predictedCluster = kMeansModel.predict(data); int clusterIndex = predictedCluster; double clusterBoundary = getPercentileDistance(percentile, clusterIndex); predictions = getPredictedValue(data, clusterIndex, clusterBoundary); return predictions; } /** * This method applies a anomaly detection model to a given dataset * * @param data JavaRDD containing feature vectors * @param percentile percentile value to identify the cluster boundaries * @return prediction labels as a List of Strings */ public List<String> predict(JavaRDD<Vector> data, double percentile) { // convert data JAVARDD into a List List<Vector> dataList = data.collect(); List<Integer> predictedClusters = kMeansModel.predict(data).collect(); Map<Integer, Double> percentilesMap = getPercentileDistancesMap(percentile); List<String> predictions = getPredictionsList(dataList, predictedClusters, percentilesMap); return predictions; } /** * This method applies a anomaly detection model to a given dataset for a range of percentile values * * @param data a single data point as a Vector * @param minPercentile min percentile value of the range * @param maxPercentile max percentile value of the range * @return Map<Integer, String> key:percentile value:prediction label */ public Map<Integer, String> predict(Vector data, int minPercentile, int maxPercentile) { /* * key : percentile value * value : prediction label */ Map<Integer, String> percentileToPredictionMap = new HashMap<Integer, String>(); int predictedCluster = kMeansModel.predict(data); for (int percentile = minPercentile; percentile <= maxPercentile; percentile++) { int clusterIndex = predictedCluster; double clusterBoundary = getPercentileDistance(percentile, clusterIndex); String prediction = getPredictedValue(data, clusterIndex, clusterBoundary); percentileToPredictionMap.put(percentile, prediction); } return percentileToPredictionMap; } /** * This method applies a anomaly detection model to a given dataset for a range of percentile values * * @param data JavaRDD containing feature vectors * @param minPercentile min percentile value of the range * @param maxPercentile max percentile value of the range * @return Map<Integer, List<String>> key:percentile value:prediction labels as a List of Strings */ public Map<Integer, List<String>> predict(JavaRDD<Vector> data, int minPercentile, int maxPercentile) { /* * key : percentile value * value : predictions List */ Map<Integer, List<String>> percentileToPredictionsListMap = new HashMap<Integer, List<String>>(); // convert data JAVARDD into a List List<Vector> dataList = data.collect(); List<Integer> predictedClusters = kMeansModel.predict(data).collect(); for (int percentile = minPercentile; percentile <= maxPercentile; percentile++) { Map<Integer, Double> percentilesMap = getPercentileDistancesMap(percentile); List<String> predictionsList = getPredictionsList(dataList, predictedClusters, percentilesMap); percentileToPredictionsListMap.put(percentile, predictionsList); } return percentileToPredictionsListMap; } private List<String> getPredictionsList(List<Vector> dataList, List<Integer> predictedClusters, Map<Integer, Double> percentilesMap) { List<String> predictionsList = new ArrayList<String>(); for (int i = 0; i < dataList.size(); i++) { int clusterIndex = predictedClusters.get(i); double clusterBoundary = percentilesMap.get(clusterIndex); String prediction = getPredictedValue(dataList.get(i), clusterIndex, clusterBoundary); predictionsList.add(prediction); } return predictionsList; } /** * This method is to predict the label of a given data point */ private String getPredictedValue(Vector dataPointVector, int clusterIndex, double clusterBoundary) { String prediction; EuclideanDistance euclideanDistance = new EuclideanDistance(); Vector[] clusterCenters = kMeansModel.clusterCenters(); double[] dataPoint = dataPointVector.toArray(); double[] clusterCenter = clusterCenters[clusterIndex].toArray(); double distance = euclideanDistance.compute(clusterCenter, dataPoint); if (distance > clusterBoundary) { prediction = anomalyLabel; } else { prediction = normalLabel; } return prediction; } /** * This method is to get the percentile distances map * key : percentile value * value : distance value * This will return cluster boundary distance values with respect to each percentile */ private Map<Integer, Double> getPercentileDistancesMap(double percentileValue) { // Get a DescriptiveStatistics instance DescriptiveStatistics stats = new DescriptiveStatistics(); /* * key : percentile value * value : distance value */ Map<Integer, Double> percentilesMap = new HashMap<Integer, Double>(); // calculating percentile distance of each cluster for (int clusterIndex = 0; clusterIndex < clusterIndexToDistancesListMap.size(); clusterIndex++) { for (double distance : clusterIndexToDistancesListMap.get(clusterIndex)) { stats.addValue(distance); } double percentileDistance = stats.getPercentile(percentileValue); percentilesMap.put(clusterIndex, percentileDistance); stats.clear(); } return percentilesMap; } /** * This method is to get the percentile distance to a given cluster */ private double getPercentileDistance(double percentileValue, int clusterIndex) { // Get a DescriptiveStatistics instance DescriptiveStatistics stats = new DescriptiveStatistics(); // calculating percentile distance for (double distance : clusterIndexToDistancesListMap.get(clusterIndex)) { stats.addValue(distance); } double percentileDistance = stats.getPercentile(percentileValue); stats.clear(); return percentileDistance; } }