List of usage examples for weka.core.Instance.toDoubleArray()
public double[] toDoubleArray();
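Quick illustration before the sources below: toDoubleArray() flattens an instance into a double[] with one slot per attribute, and nominal (or string) values come back as the index of the value, not the label. A minimal, self-contained sketch (the dataset and attribute names are illustrative, not taken from the sources below):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class ToDoubleArrayDemo {
    public static void main(String[] args) {
        ArrayList<Attribute> attrs = new ArrayList<>();
        attrs.add(new Attribute("x1")); // numeric
        attrs.add(new Attribute("x2")); // numeric
        ArrayList<String> labels = new ArrayList<>();
        labels.add("a");
        labels.add("b");
        attrs.add(new Attribute("class", labels)); // nominal
        Instances data = new Instances("demo", attrs, 1);
        data.setClassIndex(2);

        Instance inst = new DenseInstance(3);
        inst.setDataset(data); // needed so nominal values resolve to indices
        inst.setValue(0, 1.5);
        inst.setValue(1, -2.0);
        inst.setValue(2, "b");

        double[] vals = inst.toDoubleArray(); // {1.5, -2.0, 1.0} -- "b" maps to index 1
        System.out.println(java.util.Arrays.toString(vals));
    }
}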
From source file:moa.cluster.Riffle.java
License:Apache License
/**
 * Inverse process of adding an instance. Note that the caller takes care of the
 * recompute() call; we only apply the specific incremental updates here.
 *
 * @param x instance to remove
 */
final protected void removeInstanceViaShephard(Instance x) {
    // multi-dimensional extension to Data Analysis 4th Ed Ch. 2 (Shepherd)
    if (this.numTotalPoints > 0) {
        double runningDeviation = this.getCenterDistance(x);
        double newPoint[] = x.toDoubleArray();
        for (int i = 0; i < centroid.length; ++i) {
            if (this.symbolFrequencies[i] == null) {
                // numeric attribute: reverse the incremental mean update
                double attributeDist = newPoint[i] - centroid[i];
                centroid[i] = centroid[i] - attributeDist / this.numTotalPoints;
            } else {
                // nominal attribute: decrement the symbol count and re-take the mode
                int newVal = (int) newPoint[i];
                if (newVal < this.symbolFrequencies[i].length) {
                    this.symbolFrequencies[i][newVal]--;
                }
                centroid[i] = weka.core.Utils.maxIndex(symbolFrequencies[i]);
            }
        }
        this.setCenter(centroid);
        this.runningSumOfSquares -= runningDeviation * this.getCenterDistance(x);
    }
}
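For reference, the exact inverse of the incremental-mean update removes x from a mean c over n points via c' = c - (x - c) / (n - 1); the snippet above divides by the running total n instead, a close approximation for large n. A standalone sketch on plain arrays (removeFromMean is a hypothetical helper, not part of Riffle):

static void removeFromMean(double[] centroid, double[] x, int n) {
    // n is the point count *before* removal; afterwards the centroid is
    // the exact mean of the remaining n - 1 points.
    for (int i = 0; i < centroid.length; i++) {
        centroid[i] -= (x[i] - centroid[i]) / (n - 1);
    }
}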
From source file:moa.cluster.Riffle.java
License:Apache License
/**
 * @param x instance to measure against this cluster
 * @return distance from x to the cluster centroid under the configured metric
 */
@Override
final public double getCenterDistance(Instance x) {
    if (this.distanceStrategyOption.getChosenIndex() == 13) {
        // strategy 13 treats distance as the complement of inclusion probability
        return 1.0 - this.getInclusionProbability(x);
    } else {
        double[] src = x.toDoubleArray();
        return VectorDistances.distance(src, centroid, this.instances,
                this.distanceStrategyOption.getChosenIndex());
    }
}
From source file:moa.cluster.Riffle.java
License:Apache License
/**
 * Set pre-computed information fields.
 *
 * @return radius times entropy of the recomputed cluster
 */
public final double recomputeAll() {
    if (this.instances != null) {
        Arrays.fill(this.gtLabelFrequencies, 0);
        Arrays.fill(this.labelFrequencies, 0);
        this.numTotalPoints = instances.size();
        this.numLabeledPoints = 0;
        if (!this.instances.isEmpty()) {
            // double[] clusterCentroid = this.getCenter();
            double[] clusterVariance = this.getVariances();
            for (int i = 0; i < centroid.length; ++i) {
                centroid[i] /= (double) this.instances.size() + 1.0;
            }
            for (double[] sf : this.symbolFrequencies) {
                if (sf != null) {
                    Arrays.fill(sf, 0);
                }
            }
            for (Instance x : this.instances) { // pre-populate universal cluster with data points
                if (x == null) {
                    System.out.println("Sieve::MaximizationStep() - x is NULL!");
                    continue;
                }
                this.gtLabelFrequencies[(int) x.classValue()]++;
                this.labelFrequencies[(int) x.classValue()] += x.weight();
                this.numLabeledPoints += x.weight();
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    double val = xValues[i];
                    centroid[i] += val / ((double) this.instances.size() + 1.0);
                    if ((this.symbolFrequencies[i] != null) && (val < this.symbolFrequencies[i].length)) {
                        this.symbolFrequencies[i][(int) val]++;
                    }
                }
            } // for
            // Set 'centroid' to 'mode' (most frequent symbol) for nominal data:
            for (int i = 0; i < this.symbolFrequencies.length; ++i) {
                if (this.symbolFrequencies[i] != null) {
                    centroid[i] = weka.core.Utils.maxIndex(this.symbolFrequencies[i]);
                }
            }
            setCenter(centroid); // temporary - start with standard gaussian, gets updated below
            // The cluster class uses an incremental heuristic, but we want to start out as pure
            // as possible, so we use the 2-Pass method for computing sample variance (per dimension)
            double n = instances.size();
            if (n > 1) {
                double[] cep = new double[centroid.length];
                Arrays.fill(cep, 0);
                Arrays.fill(clusterVariance, 0);
                for (Instance x : this.instances) {
                    if (x == null) {
                        System.out.println("Riffle::recompute() - x is null!");
                        continue;
                    }
                    double[] xValues = x.toDoubleArray();
                    for (int i = 0; i < xValues.length; ++i) {
                        double delta = (this.symbolFrequencies[i] == null)
                                ? centroid[i] - xValues[i]
                                : (Math.abs(centroid[i] - xValues[i]) < 1e-32) ? 1 : 1e-20;
                        cep[i] += delta;
                        clusterVariance[i] += delta * delta; // statistical variance
                    }
                }
                for (int i = 0; i < clusterVariance.length; ++i) {
                    clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
                }
                setVariances(clusterVariance);
            } // end if (enough data for variance)
        } // end if (!instances.isEmpty())
        recompute();
    } // end if (instances != null)
    return getRadius() * getEntropy();
}
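The variance step above is the compensated two-pass formula: var = (sum(delta^2) - (sum(delta))^2 / n) / (n - 1), where the correction term cancels the rounding error that accumulates when the centroid is only an estimate of the true mean (for an exact mean, sum(delta) is zero and the term vanishes). A standalone per-dimension sketch (sampleVariance is a hypothetical helper):

static double sampleVariance(double[] values, double mean) {
    double n = values.length;
    double sumDelta = 0.0; // compensation term: sums to ~0 if mean is exact
    double sumSq = 0.0;    // sum of squared deviations
    for (double v : values) {
        double d = mean - v;
        sumDelta += d;
        sumSq += d * d;
    }
    return (sumSq - sumDelta * sumDelta / n) / (n - 1);
}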
From source file:moa.cluster.Riffle.java
License:Apache License
/**
 * Sanity check and initialization of dynamic fields.
 *
 * @param x
 */
protected final void safeInit(Instance x) {
    if (this.embeddedLearnerOption.getValueAsCLIString().contains("Majority class")) {
        this.excludeOutlierVoting = true;
    }
    if (centroid == null) {
        centroid = x.toDoubleArray();
    }
    if (this.instances == null) {
        prepareEmbeddedClassifier();
        ArrayList<Attribute> attribs = new ArrayList<>();
        this.symbolFrequencies = new double[x.dataset().numAttributes()][];
        for (int i = 0; i < x.dataset().numAttributes(); ++i) {
            Attribute a = (Attribute) x.dataset().attribute(i).copy();
            if (i == x.classIndex()) {
                a.setWeight(0.0);
            } else {
                a.setWeight(1.0);
            }
            switch (a.type()) {
            case Attribute.STRING:
            case Attribute.NOMINAL:
                //UnsafeUtils.setAttributeRange(a, x.value(i), x.value(i));
                this.symbolFrequencies[i] = new double[a.numValues()];
                break;
            case Attribute.NUMERIC:
            case Attribute.RELATIONAL:
            case Attribute.DATE:
            default:
                //UnsafeUtils.setAttributeRange(a, x.value(i), x.value(i));
                this.symbolFrequencies[i] = null;
            }
            attribs.add(a);
        }
        this.instances = new Instances("ClusterData", attribs, 1);
        this.instances.setClassIndex(x.classIndex());
    }
    // else {
    //     for (int i = 0; i < x.dataset().numAttributes() && i < this.header.numAttributes(); ++i) {
    //         double val = x.value(i);
    //         Attribute a = this.header.attribute(i);
    //         // expand range as necessary
    //         if (val < a.getLowerNumericBound() || val > a.getUpperNumericBound()) {
    //             UnsafeUtils.setAttributeRange(a, Math.min(val, a.getLowerNumericBound()),
    //                     Math.max(val, a.getUpperNumericBound()));
    //         }
    //         // increase frequency counts if new string value is encountered
    //         if (a.type() == Attribute.STRING
    //                 && (val >= Math.max(this.symbolFrequencies[i].length, a.numValues()))) {
    //             double newArray[] = new double[Math.max(this.symbolFrequencies[i].length, a.numValues())];
    //             Arrays.fill(newArray, 0);
    //             for (int j = 0; j <= this.symbolFrequencies[i].length; j++) {
    //                 newArray[j] = this.symbolFrequencies[i][j];
    //             }
    //             this.symbolFrequencies[i] = newArray;
    //         }
    //     }
    // }
    if (this.variances == null) {
        this.variances = new double[x.numAttributes()];
        Arrays.fill(this.variances, 1);
    }
    if (this.entropies == null) {
        this.entropies = new double[x.numAttributes()];
        Arrays.fill(this.entropies, 0);
    }
    if (this.labelFrequencies == null) {
        this.labelFrequencies = new double[x.numClasses()];
        Arrays.fill(this.labelFrequencies, 0);
    }
    if (this.gtLabelFrequencies == null) {
        this.gtLabelFrequencies = new double[x.numClasses()];
        Arrays.fill(this.gtLabelFrequencies, 0);
    }
    if (this.rho == null) {
        this.rho = new double[x.numAttributes()];
        Arrays.fill(this.rho, 0);
    }
}
From source file:moa.cluster.SphereCluster.java
License:Apache License
public SphereCluster(List<? extends Instance> instances, int dimension) {
    this();
    if (instances == null || instances.size() <= 0)
        return;
    weight = instances.size();
    Miniball mb = new Miniball(dimension);
    mb.clear();
    for (Instance instance : instances) {
        mb.check_in(instance.toDoubleArray());
    }
    mb.build();
    center = mb.center();
    radius = mb.radius();
    mb.clear();
}
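The constructor wraps a minimal-enclosing-ball solver: each instance is flattened with toDoubleArray() and registered with Miniball, which yields the tightest sphere covering all points. A usage sketch reusing only the Miniball calls shown above (the data variable is illustrative):

// Smallest enclosing ball of a batch of instances.
Miniball mb = new Miniball(data.numAttributes());
mb.clear();
for (Instance inst : data) {
    mb.check_in(inst.toDoubleArray()); // register each point as a double[]
}
mb.build();                    // solve for the minimal enclosing ball
double[] center = mb.center(); // ball midpoint
double radius = mb.radius();   // covering radius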
From source file:moa.cluster.SphereCluster.java
License:Apache License
public double[] getDistanceVector(Instance instance) {
    return distanceVector(getCenter(), instance.toDoubleArray());
}
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * Wrapper for parallel K-Means for processing the warm-up data set.
 *
 * @param D Warm-up data set
 * @param K number of clusters
 * @param useLabels if true, seed and assign clusters per class label
 * @return the resulting cluster set
 */
protected Set<Riffle> batchCluster(List<Instance> D, int K, boolean useLabels) {
    assert K >= 2 : "Minimum number of clusters (K) is 2";
    int numAttributes = D.get(0).numAttributes();
    TreeSet<Riffle> ret = new TreeSet<>();
    TreeSet<Integer> labels = new TreeSet<>();
    TreeMap<Integer, TreeSet<Riffle>> potentialClusters = new TreeMap<>();
    // Create a potential cluster pool. Separate into distinct pools by label if useLabels is true:
    for (Instance x : D) {
        int label = (useLabels) ? (int) x.classValue() : 0;
        labels.add(label);
        TreeSet<Riffle> clusterSet = potentialClusters.get(label);
        if (clusterSet == null) {
            clusterSet = new TreeSet<>();
        }
        clusterSet.add(this.createNewCluster(x));
        potentialClusters.put(label, clusterSet);
    }
    // Initialize following the K-Means++ approach:
    Riffle C = potentialClusters.firstEntry().getValue().first();
    ret.add(C);
    potentialClusters.firstEntry().getValue().remove(C);
    Iterator<Integer> labelIter = labels.iterator();
    while ((ret.size() < K) && !potentialClusters.isEmpty()) {
        if (!labelIter.hasNext()) {
            labelIter = labels.iterator(); // loop around as needed
        }
        int pseudoLabel = labelIter.next();
        TreeSet<Riffle> clusterSet = potentialClusters.get(pseudoLabel);
        if (clusterSet.isEmpty()) {
            potentialClusters.remove(pseudoLabel);
            labelIter.remove();
            continue;
        }
        SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(clusterSet, C.toInstance());
        C = nearestClusters.last().getCluster();
        ret.add(C);
        clusterSet.remove(C);
    }
    potentialClusters.clear();
    // Iterate
    final int maxIterations = 100;
    final double minDelta = 0.0001;
    int iteration = 0;
    double valIdxDelta = 1.0;
    ValIdxTupleType lastScore = null;
    while ((iteration < maxIterations) && (valIdxDelta > minDelta)) {
        iteration++;
        ret.parallelStream().forEach((c) -> {
            c.cleanTallies();
            if (c.instances == null) {
                c.instances = c.getHeader();
            }
            c.instances.clear();
        });
        // Expectation Step
        boolean wasAdded;
        for (Instance x : D) {
            SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(ret, x);
            wasAdded = false;
            int xLabel = (int) x.classValue();
            int cLabel = 0;
            if (useLabels) {
                // Add to nearest cluster with same label
                for (NearestClusterTuple nct : nearestClusters) {
                    cLabel = (int) nct.getCluster().getGroundTruth();
                    if (cLabel == xLabel) {
                        nct.getCluster().addInstance(x);
                        nct.getCluster().instances.add(x);
                        wasAdded = true;
                        //break;
                    }
                }
            }
            // just add to the closest cluster
            if (!wasAdded) {
                nearestClusters.last().getCluster().instances.add(x);
            }
        }
        // Maximization Step
        for (Riffle c : ret) {
            if (c.instances == null || c.instances.isEmpty()) {
                continue;
            }
            double[] clusterCentroid = new double[numAttributes];
            double[] clusterVariance = new double[numAttributes];
            for (Instance x : c.instances) {
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    clusterCentroid[i] += xValues[i] / ((double) c.instances.size());
                }
            }
            // The cluster class uses an incremental heuristic, but we want to start out as pure
            // as possible, so we use the 2-Pass method for computing sample variance (per dimension)
            if (c.instances.size() < 2) {
                for (int i = 0; i < clusterVariance.length; ++i) {
                    clusterVariance[i] = universalCluster.getVariances()[i] * 0.85;
                }
            } else {
                double n = c.instances.size();
                double[] cep = new double[numAttributes];
                Arrays.fill(cep, 0);
                for (Instance x : c.instances) {
                    double[] xValues = x.toDoubleArray();
                    for (int i = 0; i < xValues.length; ++i) {
                        double delta = clusterCentroid[i] - xValues[i];
                        cep[i] += delta;
                        clusterVariance[i] += delta * delta; // statistical variance
                    }
                }
                for (int i = 0; i < clusterVariance.length; ++i) {
                    clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
                }
            }
            c.setCenter(clusterCentroid);
            c.setVariances(clusterVariance);
            c.recompute(); // this updates entropies and such
        }
        ValIdxTupleType currentScore = new ValIdxTupleType(ret);
        if (lastScore != null) {
            double diff = Math.abs(lastScore.getValIdx() - currentScore.getValIdx());
            double denominator = lastScore.getValIdx();
            valIdxDelta = (denominator == 0) ? 0.0 : Math.abs(diff / denominator);
        }
        lastScore = currentScore;
    } // end while
    return ret;
}
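The outer loop's stopping rule is worth isolating: EM passes repeat until the relative change in the validity score (new ValIdxTupleType(ret).getValIdx() above) falls below minDelta, or a hard iteration cap is hit. A minimal sketch with the scorer and EM pass abstracted as lambdas (runUntilConverged is a hypothetical helper, not part of FeS2):

static double runUntilConverged(java.util.function.DoubleSupplier scoreOf, Runnable emStep) {
    final int maxIterations = 100;
    final double minDelta = 0.0001;
    double lastScore = Double.NaN;
    for (int iter = 0; iter < maxIterations; iter++) {
        emStep.run(); // one expectation + maximization pass
        double score = scoreOf.getAsDouble();
        if (!Double.isNaN(lastScore)) {
            double delta = (lastScore == 0) ? 0.0 : Math.abs((lastScore - score) / lastScore);
            if (delta <= minDelta) {
                return score; // converged: relative change below minDelta
            }
        }
        lastScore = score;
    }
    return lastScore;
}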
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * Uses methodology from Kim et al., "A Novel Validity Index for Determination of the
 * Optimal Number of Clusters".
 *
 * @param D Warm-up data set
 */
public void initialize(List<Instance> D) {
    assert (D != null && !D.isEmpty() && D.get(0) != null) : "FeS::initialize() called with a null data list!";
    knownLabels.clear();
    universalProbabilitySums = 0;
    bestProbabilitySums = 0;
    bestProbabilityCount = 0;
    // Setup the universal set/cluster. Note that this will be crucial for subspace selection
    // (cross-entropy checks against null hypothesis)
    double[] universalCentroid = new double[D.get(0).numAttributes()];
    double[] universalVariance = new double[D.get(0).numAttributes()];
    Arrays.fill(universalCentroid, 0);
    Arrays.fill(universalVariance, 0);
    universalCluster = new Riffle(D.get(0));
    universalCluster.updateStrategyOption.setChosenIndex(this.updateStrategyOption.getChosenIndex());
    universalCluster.outlierDefinitionStrategyOption
            .setChosenIndex(this.outlierDefinitionStrategyOption.getChosenIndex());
    universalCluster.distanceStrategyOption.setChosenIndex(this.distanceStrategyOption.getChosenIndex());
    universalCluster.initialStandardDeviationOption.setValue(this.initialStandardDeviationOption.getValue());
    universalCluster.alphaAdjustmentWeightOption.setValue(this.learningRateAlphaOption.getValue());
    //universalCluster.setParentClusterer(this);
    if (D.size() > 1) {
        double[] ep = new double[universalCentroid.length];
        Arrays.fill(ep, 0);
        universalCluster.setCenter(universalCentroid);    // temporary - standard gaussian, updated below
        universalCluster.setVariances(universalVariance); // temporary - standard gaussian, updated below
        universalCluster.setWeight(0);
        double N = D.size();
        for (Instance x : D) { // pre-populate universal cluster with data points
            knownLabels.add((int) x.classValue());
            universalCluster.addInstance(x);
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                universalCentroid[i] += xValues[i];
            }
        }
        for (int i = 0; i < universalCentroid.length; ++i) {
            universalCentroid[i] /= N;
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure
        // as possible, so we use the 2-Pass method for computing sample variance (per dimension)
        for (Instance x : D) {
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                double delta = universalCentroid[i] - xValues[i];
                ep[i] += delta;
                universalVariance[i] += delta * delta;
            }
        }
        for (int i = 0; i < universalVariance.length; ++i) {
            universalVariance[i] = (universalVariance[i] - ep[i] * ep[i] / N) / (N - 1);
        }
        universalCluster.setCenter(universalCentroid);
        universalCluster.setVariances(universalVariance);
    }
    universalCluster.recompute(); // this updates entropies and such
    // Now use K-Means to find the initial cluster set
    int Cmin = this.clustersPerLabelOption.getValue() * this.knownLabels.size();
    int Cmax = Cmin + 1;
    if (optimizeInitialClusterNumberOption.isSet()) {
        Cmin = this.minimumNumberOfClusterSizeOption.getValue(); //Math.max(knownLabels.size(), 2);
        Cmax = Math.max(Cmin + 1, Math.min(this.clustersPerLabelOption.getValue() * this.knownLabels.size(),
                this.maximumNumberOfClusterSizeOption.getValue()));
    }
    ArrayList<ValIdxTupleType> valIdxSet = new ArrayList<>(Cmax);
    Set<Riffle> V;
    // Create multiple hypotheses for the best K choices:
    for (int c = Cmin; c < Cmax; c++) {
        V = batchCluster(D, c, true);
        ValIdxTupleType i = new ValIdxTupleType(V);
        valIdxSet.add(i);
        if (CVI == null) {
            CVI = i;
        } else {
            CVI.setVo_min(Math.min(i.getVo(), CVI.getVo_min()));
            CVI.setVo_max(Math.max(i.getVo(), CVI.getVo_max()));
            CVI.setVu_min(Math.min(i.getVu(), CVI.getVu_min()));
            CVI.setVu_max(Math.max(i.getVu(), CVI.getVu_max()));
        }
    }
    // Normalize all:
    valIdxSet.parallelStream().map((i) -> {
        i.setVo_min(CVI.getVo_min());
        return i;
    }).map((i) -> {
        i.setVo_max(CVI.getVo_max());
        return i;
    }).map((i) -> {
        i.setVu_min(CVI.getVu_min());
        return i;
    }).forEach((i) -> {
        i.setVu_max(CVI.getVu_max());
    });
    // Find the best K by finding the minimum score:
    valIdxSet.stream().filter((i) -> (i.getValIdx() < CVI.getValIdx())).forEach((i) -> {
        CVI = i;
    });
    BufferedWriter datawriter = null;    // DEBUG
    BufferedWriter rawdatawriter = null; // DEBUG
    BufferedWriter clusterwriter = null; // DEBUG
    String filePrefix = "DEBUG-" + iso8601FormatString.format(new Date()); // DEBUG
    try { // DEBUG
        File warmupData = new File(filePrefix + "-first" + D.size() + ".csv");  // DEBUG
        File rawwarmupData = new File(filePrefix + "-raw" + D.size() + ".csv"); // DEBUG
        File clusterData = new File(filePrefix + "-clusters.csv");              // DEBUG
        datawriter = new BufferedWriter(new FileWriter(warmupData));       // DEBUG
        rawdatawriter = new BufferedWriter(new FileWriter(rawwarmupData)); // DEBUG
        clusterwriter = new BufferedWriter(new FileWriter(clusterData));   // DEBUG
        clusterwriter.write("id,s,w,r,e,p,y,c,v"); // DEBUG
        clusterwriter.newLine(); // DEBUG
        String csv = "";  // DEBUG
        int rowCount = 0; // DEBUG
        for (Instance x : D) { // DEBUG
            double[] dataArray = x.toDoubleArray(); // DEBUG
            for (int dIdx = 0; dIdx < dataArray.length; ++dIdx) { // DEBUG
                csv += dataArray[dIdx] + ","; // DEBUG
            } // DEBUG
            csv += ++rowCount;        // DEBUG
            rawdatawriter.write(csv); // DEBUG
            rawdatawriter.newLine();  // DEBUG
            csv = ""; // DEBUG
        } // DEBUG
        for (double uvar : universalVariance) {
            csv += uvar + ",";
        }
        rawdatawriter.write(csv); // DEBUG
        rawdatawriter.newLine();  // DEBUG
        csv = "";
        for (double umean : universalCentroid) {
            csv += umean + ",";
        }
        rawdatawriter.write(csv); // DEBUG
        rawdatawriter.newLine();  // DEBUG
        csv = "";
        rawdatawriter.flush();
        this.clusters.clear();
        for (Riffle c : CVI.getClustering()) {
            if (c.instances == null || c.instances.isEmpty()) {
                continue;
            }
            double[] clusterCentroid = new double[universalCentroid.length];
            double[] clusterVariance = new double[universalVariance.length];
            for (Instance x : c.instances) {
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    clusterCentroid[i] += xValues[i] / ((double) c.instances.size());
                }
            }
            // The cluster class uses an incremental heuristic, but we want to start out as pure
            // as possible, so we use the 2-Pass method for computing sample variance (per dimension)
            if (c.instances.size() < 2) {
                for (int i = 0; i < clusterVariance.length; ++i) {
                    clusterVariance[i] = universalCluster.getVariances()[i] * 0.85;
                }
            } else {
                double n = c.instances.size();
                double[] cep = new double[universalCentroid.length];
                Arrays.fill(cep, 0);
                for (Instance x : c.instances) {
                    double[] xValues = x.toDoubleArray();
                    for (int i = 0; i < xValues.length; ++i) {
                        double delta = clusterCentroid[i] - xValues[i];
                        cep[i] += delta;
                        clusterVariance[i] += delta * delta;
                    }
                }
                for (int i = 0; i < clusterVariance.length; ++i) {
                    clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
                }
            }
            c.setCenter(clusterCentroid);
            c.setVariances(clusterVariance);
            c.recompute(); // this updates entropies and such
            // WRITE DEBUG DATA
            for (Instance x : c.instances) {
                double[] dataArray = x.toDoubleArray();
                for (int dIdx = 0; dIdx < dataArray.length; ++dIdx) {
                    csv += dataArray[dIdx] + ",";
                }
                csv += c.getId();
                datawriter.write(csv);
                datawriter.newLine();
                csv = "";
            }
            // clusterwriter.write("id,w,r,e,p,y,c,v");
            if (Double.isNaN(c.getRadius())) {
                System.out.print("Bad radius");
            }
            clusterwriter.write(c.getId() + "," + c.size() + "," + c.getWeight() + "," + c.getRadius() + ","
                    + c.getEntropy() + "," + c.getTruePurity() + ","
                    + weka.core.Utils.maxIndex(c.getVotes())
                    + ",Centroid:," + weka.core.Utils.arrayToString(c.getCenter())
                    + ",Var:," + weka.core.Utils.arrayToString(c.getVariances()));
            clusterwriter.newLine();
            // END DEBUG DATA
            this.clusters.add(c);
        }
        if (this.outlierDefinitionStrategyOption.getChosenIndex() == 1) {
            this.setupPerceptron();
            double outlierPerceptronTrainingError = this.trainPerceptron();
            System.out.println("outlier detection Perceptron training error = " + outlierPerceptronTrainingError);
        }
        this.clusters.stream().forEach((c) -> {
            c.instances.clear();
        });
        this.newClusterCreateCalls = 0;
        System.out.println("Starting with " + this.clusters.size() + " clusters and " + this.knownLabels + " labels.");
        clusterwriter.flush(); // DEBUG
        clusterwriter.close(); // DEBUG
        datawriter.flush();    // DEBUG
        datawriter.close();    // DEBUG
        rawdatawriter.flush(); // DEBUG
        rawdatawriter.close(); // DEBUG
    } catch (IOException e) {
    } // DEBUG
}
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Use inclusion probability to discover the cluster "nearest" the provided instance.
 *
 * @param C set of clusters
 * @param x instance in question
 * @return array of cluster/distance tuples, sorted by the tuple's natural order
 */
protected final NearestClusterTuple[] findMostLikelyClusters(Collection<Riffle> C, Instance x) {
    NearestClusterTuple[] ret = new NearestClusterTuple[C.size()];
    double[] xVals = x.toDoubleArray();
    int idx = 0;
    double dist = 0;
    for (Riffle c : C) {
        dist = c.getCenterDistance(xVals);
        ret[idx++] = new NearestClusterTuple(c, dist);
    } // end for
    Arrays.parallelSort(ret);
    return ret;
}
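Arrays.parallelSort on an object array requires mutually Comparable elements, so NearestClusterTuple must order itself by distance. A minimal sketch of such a tuple (DistanceTuple is a hypothetical stand-in, since NearestClusterTuple's internals aren't shown in this listing, and the real class may sort in the opposite direction):

final class DistanceTuple implements Comparable<DistanceTuple> {
    final Riffle cluster;
    final double distance;

    DistanceTuple(Riffle cluster, double distance) {
        this.cluster = cluster;
        this.distance = distance;
    }

    @Override
    public int compareTo(DistanceTuple other) {
        // ascending by distance, so the nearest tuple sorts first
        return Double.compare(this.distance, other.distance);
    }
}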
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Find the nearest neighbors of an instance under the configured distance strategy.
 *
 * @param D instance set to search
 * @param x instance in question
 * @return array of instance/distance tuples, sorted by the tuple's natural order
 */
protected final NearestInstanceTuple[] findNearestNeighbors(Instances D, Instance x) {
    NearestInstanceTuple[] ret = new NearestInstanceTuple[D.size()];
    double[] xVals = x.toDoubleArray();
    int idx = 0;
    for (Instance n : D) {
        ret[idx++] = new NearestInstanceTuple(n, VectorDistances.distance(xVals, n.toDoubleArray(), D,
                this.distanceStrategyOption.getChosenIndex()));
    } // end for
    Arrays.parallelSort(ret);
    return ret;
}
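If NearestInstanceTuple sorts ascending by distance, a k-nearest-neighbors query is just a prefix of the returned array. A usage sketch (k is illustrative; NearestInstanceTuple's accessors aren't shown in this listing):

NearestInstanceTuple[] sorted = findNearestNeighbors(D, x);
int k = Math.min(5, sorted.length);
for (int i = 0; i < k; i++) {
    // sorted[i] holds the i-th closest instance to x and its distance
}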