List of usage examples for weka.core.Instance.toDoubleArray()
public double[] toDoubleArray();
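toDoubleArray() returns a fresh double[] holding every attribute value of the instance in Weka's internal representation: numeric values as-is, nominal values as their index codes. Before the examples below, a minimal self-contained sketch of the call itself; the class, relation, and attribute names are made up for illustration:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class ToDoubleArrayDemo {
    public static void main(String[] args) {
        // Hypothetical two-attribute dataset, just to have an Instance to call.
        ArrayList<Attribute> attrs = new ArrayList<>();
        attrs.add(new Attribute("x"));
        attrs.add(new Attribute("y"));
        Instances data = new Instances("demo", attrs, 1);

        Instance inst = new DenseInstance(1.0, new double[] { 1.5, -2.0 });
        inst.setDataset(data);

        // The returned array is a copy; mutating it does not affect the instance.
        double[] values = inst.toDoubleArray();
        System.out.println(java.util.Arrays.toString(values)); // prints [1.5, -2.0]
    }
}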
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Use inclusion probability to discover the cluster "nearest" the provided instance.
 * Uses the main object's outlier container.
 * @param x instance in question
 * @return sorted array of tuples, ordered by distance
 */
protected final NearestInstanceTuple[] findNearestOutliers(Instance x) {
    NearestInstanceTuple[] ret = new NearestInstanceTuple[potentialNovels.size()];
    double[] xVals = x.toDoubleArray();
    int idx = 0;
    for (Instance n : potentialNovels) {
        double distance = VectorDistances.distance(xVals, n.toDoubleArray(), x.dataset(),
                this.distanceStrategyOption.getChosenIndex());
        NearestInstanceTuple nit = new NearestInstanceTuple(n, distance);
        ret[idx++] = nit;
    }
    Arrays.parallelSort(ret);
    return ret;
}
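A note on the sort: Arrays.parallelSort(ret) requires NearestInstanceTuple to be Comparable. That class is not shown on this page; the following is a hypothetical sketch consistent with how the snippet uses it (ordering by distance is an assumption drawn from the javadoc, and the real class in Sieve.java may differ):

// Hypothetical sketch: the snippet sorts these tuples, so they must be
// Comparable; ascending order by distance is assumed.
class NearestInstanceTuple implements Comparable<NearestInstanceTuple> {
    final Instance instance;
    final double distance;

    NearestInstanceTuple(Instance instance, double distance) {
        this.instance = instance;
        this.distance = distance;
    }

    @Override
    public int compareTo(NearestInstanceTuple other) {
        return Double.compare(this.distance, other.distance);
    }
}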
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Uses methodology from Kim et al., "A Novel Validity Index for Determination
 * of the Optimal Number of Clusters".
 * @param D warm-up data set
 */
public final void initialize(List<Instance> D) {
    String ncCSVfilePrefix = "META-" + D.get(0).dataset().relationName() + "-"
            + iso8601FormatString.format(new Date());
    final boolean doMetaLog = logMetaRecordsOption.isSet();
    if (doMetaLog) {
        try {
            File ncCSVFile = new File(ncCSVfilePrefix + ".csv");
            ncCSVwriter = new BufferedWriter(new FileWriter(ncCSVFile));
            String ncCSVHeader = "usize,urad,ctally,cpur,csize,cweight,crad,cdist,"
                    + "pout,vweight,qdmin,qdout,qnsc,novel";
            ncCSVwriter.write(ncCSVHeader);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileSetupIOException) {
            System.err.println("NC-CSV meta-data file failed to open: " + fileSetupIOException.toString());
        }
    }
    knownLabels = new int[D.get(0).numClasses()];
    Arrays.fill(knownLabels, 0);
    this.numAttributes = D.get(0).numAttributes();
    universalProbabilitySums = 0;
    bestProbabilitySums = 0;
    bestProbabilityCount = 0;
    // Set up the universal set/cluster. This is crucial for subspace selection
    // (cross-entropy checks against the null hypothesis).
    double[] universalCentroid = new double[D.get(0).numAttributes()];
    double[] universalVariance = new double[D.get(0).numAttributes()];
    Arrays.fill(universalCentroid, 0);
    Arrays.fill(universalVariance, 0);
    universalCluster = new Riffle(D.get(0));
    universalCluster.distanceStrategyOption.setChosenIndex(this.distanceStrategyOption.getChosenIndex());
    if (D.size() > 1) {
        double[] ep = new double[universalCentroid.length];
        Arrays.fill(ep, 0);
        universalCluster.setCenter(universalCentroid);    // temporary: start with a standard Gaussian, updated below
        universalCluster.setVariances(universalVariance); // temporary: start with a standard Gaussian, updated below
        universalCluster.setWeight(0);
        double N = D.size();
        for (Instance x : D) { // Pre-populate the universal cluster with data points
            int y = (int) x.classValue();
            if (y < knownLabels.length) {
                knownLabels[y]++;
            }
            universalCluster.addInstance(x);
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                universalCentroid[i] += xValues[i];
            }
        }
        for (int i = 0; i < universalCentroid.length; ++i) {
            universalCentroid[i] /= N;
        }
        // The cluster class uses an incremental heuristic, but we want to start out
        // as pure as possible, so we use the 2-pass method for computing sample
        // variance (per dimension).
        for (Instance x : D) {
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                double delta = universalCentroid[i] - xValues[i];
                ep[i] += delta;
                universalVariance[i] += delta * delta;
            }
        }
        for (int i = 0; i < universalVariance.length; ++i) {
            universalVariance[i] = (universalVariance[i] - ep[i] * ep[i] / N) / (N - 1);
        }
        universalCluster.setCenter(universalCentroid);
        universalCluster.setVariances(universalVariance);
    }
    universalCluster.recompute(); // this updates entropies and such
    int numKnownLabels = 0;
    for (int y : knownLabels) {
        if (y > 0) {
            numKnownLabels++;
        }
    }
    // Now use k-means to find the initial cluster set:
    int Cmin = this.clustersPerLabelOption.getValue() * numKnownLabels;
    int Cmax = Cmin + 1;
    if (optimizeInitialClusterNumberOption.isSet()) {
        Cmin = this.minimumNumberOfClusterSizeOption.getValue();
        Cmax = Math.max(Cmin + 1, Math.min(this.clustersPerLabelOption.getValue() * numKnownLabels,
                this.maximumNumberOfClusterSizeOption.getValue()));
    }
    ArrayList<ValIdxTupleType> valIdxSet = new ArrayList<>(Cmax);
    Set<Riffle> V;
    // Create multiple hypotheses for the best K choices:
    for (int c = Cmin; c < Cmax; c++) {
        V = batchCluster(D, c, true);
        ValIdxTupleType i = new ValIdxTupleType(V);
        valIdxSet.add(i);
        if (CVI == null) {
            CVI = i;
        } else {
            CVI.setVo_min(Math.min(i.getVo(), CVI.getVo_min()));
            CVI.setVo_max(Math.max(i.getVo(), CVI.getVo_max()));
            CVI.setVu_min(Math.min(i.getVu(), CVI.getVu_min()));
            CVI.setVu_max(Math.max(i.getVu(), CVI.getVu_max()));
        }
    }
    // Normalize all:
    for (ValIdxTupleType i : valIdxSet) {
        i.setVo_min(CVI.getVo_min());
        i.setVo_max(CVI.getVo_max());
        i.setVu_min(CVI.getVu_min());
        i.setVu_max(CVI.getVu_max());
    }
    // Find the best K by finding the minimum score:
    valIdxSet.stream().filter((i) -> (i.getValIdx() < CVI.getValIdx())).forEach((i) -> {
        CVI = i;
    });
    this.clusters.clear();
    for (Riffle c : CVI.getClustering()) {
        if (c.instances == null || c.instances.isEmpty()) {
            continue;
        }
        double[] clusterCentroid = new double[universalCentroid.length];
        double[] clusterVariance = new double[universalVariance.length];
        for (Instance x : c.instances) { // Accumulate the cluster centroid from its data points
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                clusterCentroid[i] += xValues[i] / ((double) c.instances.size());
            }
        }
        // Again use the 2-pass method for the per-dimension sample variance; for
        // singleton clusters fall back to a scaled copy of the universal variance.
        if (c.instances.size() < 2) {
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = universalCluster.getVariances()[i] * 0.85;
            }
        } else {
            double n = c.instances.size();
            double[] cep = new double[universalCentroid.length];
            Arrays.fill(cep, 0);
            for (Instance x : c.instances) {
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    double delta = clusterCentroid[i] - xValues[i];
                    cep[i] += delta;
                    clusterVariance[i] += delta * delta;
                }
            }
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
            }
        }
        c.setCenter(clusterCentroid);
        c.setVariances(clusterVariance);
        c.recompute(); // this updates entropies and such
        for (Instance x : c.instances) {
            this.hopperCache.push(new ClusterPointPair(x, c));
        }
        this.clusters.add(c);
    }
    this.newClusterCreateCalls = 0;
    System.out.println("Starting with " + this.clusters.size() + " clusters.");
    instancesSeen = D.size();
    weightsSeen = D.size();
}
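Both variance loops above implement the compensated two-pass algorithm: var = (sum of squared residuals minus (sum of residuals)^2 / n) / (n - 1), where each residual is (mean - x). A minimal standalone sketch of the same formula on a single dimension (the method name is ours, for illustration only):

// Compensated two-pass sample variance, matching the per-dimension
// formula used in initialize() above. The residual sum ep would be
// exactly zero in exact arithmetic; keeping it cancels rounding error.
static double sampleVariance(double[] xs) {
    double n = xs.length;
    double mean = 0.0;
    for (double x : xs) {
        mean += x / n;
    }
    double ep = 0.0;    // sum of residuals
    double sumSq = 0.0; // sum of squared residuals
    for (double x : xs) {
        double delta = mean - x;
        ep += delta;
        sumSq += delta * delta;
    }
    return (sumSq - ep * ep / n) / (n - 1);
}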
From source file:moa.clusterers.clustream.Clustream.java
License:Apache License
@Override
public void trainOnInstanceImpl(Instance instance) {
    int dim = instance.numValues();
    timestamp++;

    // 0. Initialize
    if (!initialized) {
        if (buffer.size() < bufferSize) {
            buffer.add(new ClustreamKernel(instance, dim, timestamp, t, m));
            return;
        }
        int k = kernels.length;
        assert (k <= bufferSize);
        ClustreamKernel[] centers = new ClustreamKernel[k];
        for (int i = 0; i < k; i++) {
            centers[i] = buffer.get(i); // TODO: make random!
        }
        Clustering kmeans_clustering = kMeans(k, centers, buffer);
        for (int i = 0; i < kmeans_clustering.size(); i++) {
            kernels[i] = new ClustreamKernel(new DenseInstance(1.0, centers[i].getCenter()), dim,
                    timestamp, t, m);
        }
        buffer.clear();
        initialized = true;
        return;
    }

    // 1. Determine the closest kernel
    ClustreamKernel closestKernel = null;
    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < kernels.length; i++) {
        double distance = distance(instance.toDoubleArray(), kernels[i].getCenter());
        if (distance < minDistance) {
            closestKernel = kernels[i];
            minDistance = distance;
        }
    }

    // 2. Check whether the instance fits into closestKernel
    double radius = 0.0;
    if (closestKernel.getWeight() == 1) {
        // Special case: estimate the radius as the distance to the next closest cluster
        radius = Double.MAX_VALUE;
        double[] center = closestKernel.getCenter();
        for (int i = 0; i < kernels.length; i++) {
            if (kernels[i] == closestKernel) {
                continue;
            }
            double distance = distance(kernels[i].getCenter(), center);
            radius = Math.min(distance, radius);
        }
    } else {
        radius = closestKernel.getRadius();
    }
    if (minDistance < radius) {
        // Data fits: put it into the kernel and be happy
        closestKernel.insert(instance, timestamp);
        return;
    }

    // 3. Data does not fit; we need to free some space to insert a new kernel
    long threshold = timestamp - timeWindow; // Kernels older than this can be forgotten

    // 3.1 Try to forget old kernels
    for (int i = 0; i < kernels.length; i++) {
        if (kernels[i].getRelevanceStamp() < threshold) {
            kernels[i] = new ClustreamKernel(instance, dim, timestamp, t, m);
            return;
        }
    }

    // 3.2 Merge the two closest kernels
    int closestA = 0;
    int closestB = 0;
    minDistance = Double.MAX_VALUE;
    for (int i = 0; i < kernels.length; i++) {
        double[] centerA = kernels[i].getCenter();
        for (int j = i + 1; j < kernels.length; j++) {
            double dist = distance(centerA, kernels[j].getCenter());
            if (dist < minDistance) {
                minDistance = dist;
                closestA = i;
                closestB = j;
            }
        }
    }
    assert (closestA != closestB);
    kernels[closestA].add(kernels[closestB]);
    kernels[closestB] = new ClustreamKernel(instance, dim, timestamp, t, m);
}
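The two-argument distance(double[], double[]) helper that this snippet (and the WithKmeans variant below) calls is not shown on this page. In MOA's Clustream it is a plain Euclidean distance; the following is a sketch consistent with how the code above uses it, not necessarily the exact implementation:

// Euclidean distance between two equal-length vectors, as assumed by
// the Clustream snippets on this page.
private static double distance(double[] a, double[] b) {
    double sum = 0.0;
    for (int i = 0; i < a.length; i++) {
        double d = a[i] - b[i];
        sum += d * d;
    }
    return Math.sqrt(sum);
}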
From source file:moa.clusterers.clustream.ClustreamKernel.java
License:Apache License
/**
 * See interface <code>Cluster</code>.
 * @param instance
 * @return inclusion probability (here 1.0 or 0.0)
 */
@Override
public double getInclusionProbability(Instance instance) {
    // Trivial cluster
    if (N == 1) {
        double distance = 0.0;
        for (int i = 0; i < LS.length; i++) {
            double d = LS[i] - instance.value(i);
            distance += d * d;
        }
        distance = Math.sqrt(distance);
        if (distance < EPSILON) {
            return 1.0;
        }
        return 0.0;
    } else {
        double dist = calcNormalizedDistance(instance.toDoubleArray());
        if (dist <= getRadius()) {
            return 1;
        } else {
            return 0;
        }
    }
}
From source file:moa.clusterers.clustream.WithKmeans.java
License:Apache License
@Override
public void trainOnInstanceImpl(Instance instance) {
    int dim = instance.numValues();
    timestamp++;

    // 0. Initialize
    if (!initialized) {
        if (buffer.size() < bufferSize) {
            buffer.add(new ClustreamKernel(instance, dim, timestamp, t, m));
            return;
        } else {
            for (int i = 0; i < buffer.size(); i++) {
                kernels[i] = new ClustreamKernel(new DenseInstance(1.0, buffer.get(i).getCenter()), dim,
                        timestamp, t, m);
            }
            buffer.clear();
            initialized = true;
            return;
        }
    }

    // 1. Determine the closest kernel
    ClustreamKernel closestKernel = null;
    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < kernels.length; i++) {
        double distance = distance(instance.toDoubleArray(), kernels[i].getCenter());
        if (distance < minDistance) {
            closestKernel = kernels[i];
            minDistance = distance;
        }
    }

    // 2. Check whether the instance fits into closestKernel
    double radius = 0.0;
    if (closestKernel.getWeight() == 1) {
        // Special case: estimate the radius as the distance to the next closest cluster
        radius = Double.MAX_VALUE;
        double[] center = closestKernel.getCenter();
        for (int i = 0; i < kernels.length; i++) {
            if (kernels[i] == closestKernel) {
                continue;
            }
            double distance = distance(kernels[i].getCenter(), center);
            radius = Math.min(distance, radius);
        }
    } else {
        radius = closestKernel.getRadius();
    }
    if (minDistance < radius) {
        // Data fits: put it into the kernel and be happy
        closestKernel.insert(instance, timestamp);
        return;
    }

    // 3. Data does not fit; we need to free some space to insert a new kernel
    long threshold = timestamp - timeWindow; // Kernels older than this can be forgotten

    // 3.1 Try to forget old kernels
    for (int i = 0; i < kernels.length; i++) {
        if (kernels[i].getRelevanceStamp() < threshold) {
            kernels[i] = new ClustreamKernel(instance, dim, timestamp, t, m);
            return;
        }
    }

    // 3.2 Merge the two closest kernels
    int closestA = 0;
    int closestB = 0;
    minDistance = Double.MAX_VALUE;
    for (int i = 0; i < kernels.length; i++) {
        double[] centerA = kernels[i].getCenter();
        for (int j = i + 1; j < kernels.length; j++) {
            double dist = distance(centerA, kernels[j].getCenter());
            if (dist < minDistance) {
                minDistance = dist;
                closestA = i;
                closestB = j;
            }
        }
    }
    assert (closestA != closestB);
    kernels[closestA].add(kernels[closestB]);
    kernels[closestB] = new ClustreamKernel(instance, dim, timestamp, t, m);
}
From source file:moa.clusterers.clustree.ClusKernel.java
License:Apache License
@Override
public double getInclusionProbability(Instance instance) {
    // Trivial cluster
    if (N == 1) {
        double distance = 0.0;
        for (int i = 0; i < LS.length; i++) {
            double d = LS[i] - instance.value(i);
            distance += d * d;
        }
        distance = Math.sqrt(distance);
        if (distance < EPSILON) {
            return 1.0;
        }
        return 0.0;
    } else {
        double dist = calcNormalizedDistance(instance.toDoubleArray());
        if (dist <= getRadius()) {
            return 1;
        } else {
            return 0;
        }
    }
}
From source file:moa.clusterers.clustree.ClusTree.java
License:Apache License
@Override
public void trainOnInstanceImpl(Instance instance) {
    timestamp++;
    // TODO: check whether the instance contains a label
    if (root == null) {
        numberDimensions = instance.numAttributes();
        root = new Node(numberDimensions, 0);
    } else if (numberDimensions != instance.numAttributes()) {
        System.out.println("Wrong dimensionality, expected: " + numberDimensions
                + ", found: " + instance.numAttributes());
    }
    ClusKernel newPointAsKernel = new ClusKernel(instance.toDoubleArray(), numberDimensions);
    insert(newPointAsKernel, new SimpleBudget(1000), timestamp);
}
From source file:moa.clusterers.denstream.MicroCluster.java
License:Apache License
public MicroCluster(Instance instance, int dimensions, long timestamp, double lambda,
        Timestamp currentTimestamp) {
    this(instance.toDoubleArray(), dimensions, timestamp, lambda, currentTimestamp);
}
From source file:moa.clusterers.outliers.AnyOut.util.DataObject.java
License:Apache License
/**
 * Standard constructor for <code>DataObject</code>.
 * @param idCounter the id for the <code>DataObject</code>
 * @param inst the underlying <code>Instance</code>; its feature values are cached as a <code>double[]</code>
 */
public DataObject(int idCounter, Instance inst) {
    this.id = idCounter;
    this.inst = inst;
    this.features = inst.toDoubleArray();
    this.classLabel = (int) inst.classValue();
    this.isOutiler = false;
}
From source file:moa.evaluation.CMM_GTAnalysis.java
License:Apache License
/**
 * Calculates the Euclidean distance between two points.
 * @param inst1 first point as an <code>Instance</code>
 * @param inst2 second point as an <code>Instance</code>
 * @return Euclidean distance
 */
private double distance(Instance inst1, Instance inst2) {
    return distance(inst1, inst2.toDoubleArray());
}