List of usage examples for weka.core Instance classValue
public double classValue();
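The examples below all follow the same idiom: classValue() returns the class as a double (for a nominal class attribute, the index of the chosen label), so callers cast it to int before using it as a label index. A minimal, self-contained sketch of that idiom (not taken from the examples below; assumes the weka.core 3.7+ API, where DenseInstance implements Instance, and uses made-up attribute names):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class ClassValueDemo {
    public static void main(String[] args) {
        // Two numeric features plus a nominal class attribute.
        ArrayList<String> labels = new ArrayList<>();
        labels.add("negative");
        labels.add("positive");
        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(new Attribute("x1"));
        attributes.add(new Attribute("x2"));
        attributes.add(new Attribute("class", labels));
        Instances header = new Instances("demo", attributes, 0);
        header.setClassIndex(header.numAttributes() - 1);

        // The last value (1.0) is the internal index of the "positive" label.
        Instance inst = new DenseInstance(1.0, new double[] { 0.5, 1.5, 1.0 });
        inst.setDataset(header); // classValue() needs a dataset header to locate the class attribute

        int y = (int) inst.classValue(); // the cast used throughout the examples below
        System.out.println(y + " -> " + header.classAttribute().value(y)); // prints: 1 -> positive
    }
}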
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * @param x instance to train on
 */
@Override
public void trainOnInstanceImpl(Instance x) {
    safeInit(x);
    assert (x != null) : "FeS2::trainOnInstanceImpl() Training on a null instance!";
    int classValue = (int) x.classValue();
    boolean isNewLabel = (!knownLabels.contains(classValue)) && (x.weight() > 0);
    if (x.weight() > 0) {
        this.knownLabels.add(classValue);
    }
    this.universalCluster.addInstance(x);
    // Find the nearest clusters
    final SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(this.clusters, x);
    assert !nearestClusters.isEmpty() : "Cluster set for probability matching is empty";
    // Compute some base metrics we need to know:
    double maxRadius = 0;
    double avgRadius = 0;
    boolean unanimousOutlier = true;
    double weightTotal = 0;
    double minWeight = Double.MAX_VALUE;
    for (NearestClusterTuple nct : nearestClusters) {
        unanimousOutlier = unanimousOutlier && nct.getCluster().isOutlier(x);
        maxRadius = Math.max(maxRadius, nct.getCluster().getRadius());
        avgRadius += nct.getCluster().getRadius();
    }
    avgRadius /= nearestClusters.size();
    // Update weights
    for (NearestClusterTuple nct : nearestClusters) {
        Riffle c = nct.getCluster();
        c.penalize(); // unilaterally reduce weights
        int clusterMajorityClass = weka.core.Utils.maxIndex(c.getVotes());
        // Increase weights for matches ('match' criteria defined by the strategy parameter)
        switch (this.positiveClusterFeedbackStrategyOption.getChosenIndex()) {
        case 0: // only the closest
            if (!unanimousOutlier && c == nearestClusters.last().getCluster()) {
                addToCluster(x, c);
            }
            break;
        case 1: // all label matches
            // This ternary condition is very important for results
            int hypothesisClass = (x.weight() > 0) ? classValue
                    : weka.core.Utils.maxIndex(this.getVotesForInstance(x));
            if (clusterMajorityClass == hypothesisClass) {
                addToCluster(x, c);
            }
            break;
        case 2: // all proximity matches
            if (!nct.getCluster().isOutlier(x)) {
                addToCluster(x, c);
            }
            break;
        default:
            break;
        } // end switch
        weightTotal += c.getWeight();
        minWeight = Math.min(minWeight, c.getWeight());
    }
    // Sort by (weight / sigma)
    Riffle[] sortedClusters = new Riffle[clusters.size()];
    int i = 0;
    for (Riffle c : clusters) {
        sortedClusters[i++] = c;
    }
    // Kudos to Java 8 and lambda expressions for making this a one-liner:
    Arrays.parallelSort(sortedClusters,
            (Riffle a, Riffle b) -> Double.compare(a.getWeight() / Math.max(a.getRadius(), 1e-96),
                    b.getWeight() / Math.max(b.getRadius(), 1e-96)));
    boolean atClusterCapacity = (this.clusters.size() >= Math.min(
            this.clustersPerLabelOption.getValue() * this.knownLabels.size(),
            this.maximumNumberOfClusterSizeOption.getValue()));
    // * * *
    // Results show that when average P(x|k) < Chauvenet there are no new clusters,
    // and vice versa (which is the opposite of the expected behavior)
    // * * *
    boolean universalOutlier = this.universalCluster.isOutlier(x);
    if (isNewLabel) {
        newLabelCount++;
    }
    if (universalOutlier) {
        universalOutlierCount++;
    }
    if (unanimousOutlier) {
        unanimousOutlierCount++;
    }
    // If we have no matches at all, then the weakest cluster is replaced by a new one
    // with a high variance and low weight
    //if (isNewLabel || (unanimousOutlier && universalOutlier)) {
    if (isNewLabel || unanimousOutlier) {
        Riffle weakestLink = sortedClusters[sortedClusters.length - 1]; // get last one
        Riffle novelCluster = this.createNewCluster(x);
        //novelCluster.setRadius((avgRadius + maxRadius) / 2.0); // Set to half-way between average and max radius
        novelCluster.setWeight(weightTotal / nearestClusters.size()); // <---- Validate this
        weightTotal += novelCluster.getWeight(); // update for new normalization factor
        // You are the weakest link... Goodbye
        if (atClusterCapacity) {
            weightTotal -= weakestLink.getWeight(); // update for new normalization factor
            this.clusters.remove(weakestLink);
        }
        // Everyone please welcome our newest contestant...
        clusters.add(novelCluster);
    }
    // Normalize weights and update variance estimates for singleton clusters
    double[] universeVariance = universalCluster.getVariances();
    double[] initialVariance = new double[universeVariance.length];
    for (int j = 0; j < initialVariance.length; ++j) {
        initialVariance[j] = universeVariance[j] * 0.85;
    }
    if (weightTotal <= 0) {
        weightTotal = 1;
    }
    for (Riffle c : this.clusters) {
        if (c.size() < 2) {
            c.setVariances(initialVariance);
        }
        c.setWeight(c.getWeight() / weightTotal);
    }
}
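One detail worth noting in the example above: classValue() returns a value even for unlabeled stream instances, so FeS2 trusts it only when x.weight() > 0 and otherwise substitutes the model's own prediction. A condensed sketch of that idiom (the helper name is hypothetical):

// Hypothetical helper distilling the 'hypothesisClass' idiom above: trust the
// recorded label only for weighted (labeled) instances, else use the model's guess.
static int hypothesisClass(Instance x, double[] votes) {
    return (x.weight() > 0)
            ? (int) x.classValue()             // ground-truth label index
            : weka.core.Utils.maxIndex(votes); // model's current best guess
}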
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * Find the nearest cluster, and use its most frequent label.
 * If the nearest cluster has no label, then we have a novel cluster.
 * Unless the data point is an outlier to all clusters, then it is just an outlier.
 * @param inst
 * @return
 */
@Override
public double[] getVotesForInstance(Instance inst) {
    assert (this.universalCluster != null) : "FeS2::getVotesForInstance() called without any initialization or training!";
    int novelClassLabel = inst.numClasses();
    int outlierLabel = novelClassLabel + 1;
    double[] votes = new double[inst.numClasses() + 2];
    if (this.clusters.isEmpty()) {
        return votes;
    }
    double[] cumulativeVotes = new double[inst.numClasses()];
    double[] cumulativeVotes_p = new double[inst.numClasses()];
    double[] cumulativeVotes_pw = new double[inst.numClasses()];
    double[] cumulativeVotes_n = new double[inst.numClasses()];
    double[] cumulativeVotes_np = new double[inst.numClasses()];
    double[] cumulativeVotes_npw = new double[inst.numClasses()];
    double[] cumulativeWinnerTakesAllVotes = new double[inst.numClasses()];
    Arrays.fill(votes, 0.0);
    Arrays.fill(cumulativeVotes, 0.0);
    Arrays.fill(cumulativeVotes_p, 0.0);
    Arrays.fill(cumulativeVotes_pw, 0.0);
    Arrays.fill(cumulativeVotes_n, 0.0);
    Arrays.fill(cumulativeVotes_np, 0.0);
    Arrays.fill(cumulativeVotes_npw, 0.0);
    Arrays.fill(cumulativeWinnerTakesAllVotes, 0.0);
    final int TRUE_CLASS = (int) inst.classValue(); // for debug watch windows only
    final SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(this.clusters, inst);
    boolean memberOfAtLeastOneTrueCluster = false;
    boolean universalOutlier = true;
    double bestProbability = 0;
    double universalProbability = this.universalCluster.getInclusionProbability(inst);
    NearestClusterTuple bestMatchCluster = null;
    // Gather data
    for (NearestClusterTuple nct : nearestClusters) {
        double p = nct.getDistance();
        boolean localOutlier = nct.getCluster().isOutlier(inst);
        memberOfAtLeastOneTrueCluster = memberOfAtLeastOneTrueCluster
                || (!localOutlier && nct.getCluster().size() > this.minimumClusterSizeOption.getValue());
        universalOutlier = universalOutlier && localOutlier;
        bestProbability = Math.max(p, bestProbability);
        if (p <= 0) {
            continue;
        }
        int localWinner = (int) nct.getCluster().getGroundTruth();
        cumulativeWinnerTakesAllVotes[localWinner] += p;
        double clusterVotes[] = nct.getCluster().getVotes();
        double clusterNormalizedVotes[] = nct.getCluster().getVotes().clone();
        if (weka.core.Utils.sum(clusterNormalizedVotes) > 0) {
            weka.core.Utils.normalize(clusterNormalizedVotes);
        }
        for (int i = 0; i < clusterVotes.length; ++i) {
            cumulativeVotes[i] += clusterVotes[i];
            cumulativeVotes_p[i] += clusterVotes[i] * p;
            cumulativeVotes_pw[i] += clusterVotes[i] * p * nct.getCluster().getWeight();
            cumulativeVotes_n[i] += clusterNormalizedVotes[i];
            cumulativeVotes_np[i] += clusterNormalizedVotes[i] * p;
            cumulativeVotes_npw[i] += clusterNormalizedVotes[i] * p * nct.getCluster().getWeight();
        }
        if (!localOutlier) {
            bestMatchCluster = nct;
        }
    } // end for
    universalProbabilitySums += universalProbability;
    bestProbabilitySums += bestProbability;
    bestProbabilityCount += 1;
    if (nearestClusters.isEmpty()) {
        votes[outlierLabel] = 1.0;
    } else {
        if (weka.core.Utils.sum(cumulativeVotes) > 0) {
            weka.core.Utils.normalize(cumulativeVotes);
        }
        if (weka.core.Utils.sum(cumulativeVotes_p) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_p);
        }
        if (weka.core.Utils.sum(cumulativeVotes_pw) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_pw);
        }
        if (weka.core.Utils.sum(cumulativeVotes_n) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_n);
        }
        if (weka.core.Utils.sum(cumulativeVotes_np) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_np);
        }
        if (weka.core.Utils.sum(cumulativeVotes_npw) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_npw);
        }
        if (weka.core.Utils.sum(cumulativeWinnerTakesAllVotes) > 0) {
            weka.core.Utils.normalize(cumulativeWinnerTakesAllVotes);
        }
        switch (this.votingStrategyOption.getChosenIndex()) {
        case 0: // 1-NN - usually not the strongest
            double[] nearestNeighborVotes = nearestClusters.last().getCluster().getVotes();
            for (int i = 0; i < nearestNeighborVotes.length; ++i) {
                votes[i] = nearestNeighborVotes[i];
            }
            break;
        case 1: // Global k-NN - this is a poor performer
            for (int i = 0; i < cumulativeVotes.length; ++i) {
                votes[i] = cumulativeVotes[i];
            }
            break;
        case 2: // Globally probability-weighted k-NN - good, but biased towards heavy clusters
            for (int i = 0; i < cumulativeVotes_p.length; ++i) {
                votes[i] = cumulativeVotes_p[i];
            }
            break;
        case 3: // Globally probability-utility-weighted k-NN - good, but overly complex
            for (int i = 0; i < cumulativeVotes_pw.length; ++i) {
                votes[i] = cumulativeVotes_pw[i];
            }
            break;
        case 4: // Globally normalized k-NN - usually a very poor performer; don't use it
            for (int i = 0; i < cumulativeVotes_n.length; ++i) {
                votes[i] = cumulativeVotes_n[i];
            }
            break;
        case 5: // Globally normalized probability-weighted k-NN - a safe bet
            for (int i = 0; i < cumulativeVotes_np.length; ++i) {
                votes[i] = cumulativeVotes_np[i];
            }
            break;
        case 6: // Globally normalized probability-utility-weighted k-NN - default and preferred method
            for (int i = 0; i < cumulativeVotes_npw.length; ++i) {
                votes[i] = cumulativeVotes_npw[i];
            }
            break;
        case 7: // Globally weighted k-NN, winner takes all per cluster - can avoid noise, but not usually the best
        default:
            for (int i = 0; i < cumulativeWinnerTakesAllVotes.length; ++i) {
                votes[i] = cumulativeWinnerTakesAllVotes[i];
            }
        } // end switch
        double voteAccumulator = 0;
        for (double v : votes) {
            voteAccumulator += v;
        }
        // A novel cluster is one of sufficient size but no label
        if ((bestMatchCluster != null) // it matches a cluster
                && (bestMatchCluster.getCluster().size() > this.minimumClusterSizeOption.getValue()) // that is overall large enough
                && (bestMatchCluster.getCluster().getNumLabeledPoints() < 1)) { // but without labels
            votes[novelClassLabel] = 1.0;
        }
        // Outlier detection
        if (universalOutlier) {
            int maxIdx = weka.core.Utils.maxIndex(votes);
            if (maxIdx < 0) {
                maxIdx = 0;
            }
            double outlierValue = votes[maxIdx];
            if (outlierValue <= 0) {
                votes[novelClassLabel] = 1.0; // special case of novelty when we have absolutely no clue how to label an outlier
                outlierValue = 1e-16;
            }
            votes[outlierLabel] = outlierValue / 2.0; //Math.max(Math.abs(1.0 - bestProbability), Math.abs(1.0 - universalProbability));
        }
    } // end if (nearestClusters not empty)
    return votes;
}
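The repeated sum-greater-than-zero guards above are not cosmetic: weka.core.Utils.normalize(double[]) throws an IllegalArgumentException when the array sums to zero (or NaN), so an all-zero vote vector must be left untouched. The guard in isolation (the votes array here is hypothetical):

double[] votes = { 0.0, 0.0, 0.0 };
if (weka.core.Utils.sum(votes) > 0) {
    weka.core.Utils.normalize(votes); // scales entries so they sum to 1
} // else: leave the all-zero vector as-is rather than trigger the exception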
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * @return training accuracy
 */
private double trainPerceptron() {
    // Train the perceptron from the warm-up phase clustering
    final int epochs = 20;
    final int numberOfPerceptrons = 10;
    final int MEMBER = 0;
    final int OUTLIER = 1;
    double accuracySum = 0;
    double accuracyCount = 0;
    this.outlierPerceptronTrainingSet.clear();
    Random rng = new Random(this.randomSeed);
    // Generate training set
    for (Riffle thisCluster : this.clusters) {
        for (Riffle thatCluster : this.clusters) {
            double groundTruth = (thisCluster == thatCluster) ? MEMBER : OUTLIER;
            for (Instance x : thatCluster.getHeader()) {
                Instance pseudoPt = makePerceptronInstance(thisCluster, x);
                pseudoPt.setClassValue(groundTruth);
                this.outlierPerceptronTrainingSet.add(pseudoPt);
            }
        }
    }
    this.outlierPerceptronTrainingSet.parallelStream().forEach((x) -> {
        x.setWeight(1.0 / this.outlierPerceptronTrainingSet.numInstances());
    });
    // Boost it
    this.perceptrons = new Perceptron[numberOfPerceptrons];
    this.pweights = new double[numberOfPerceptrons];
    for (int perceptronIdx = 0; perceptronIdx < numberOfPerceptrons; ++perceptronIdx) {
        // Discover a new weak learner
        Perceptron candidatePerceptron = new Perceptron();
        candidatePerceptron.prepareForUse();
        candidatePerceptron.learningRatioOption.setValue(rng.nextDouble() * 0.9 + 0.1);
        for (int epoch = 0; epoch < epochs; epoch++) {
            for (Instance x : this.outlierPerceptronTrainingSet) {
                if ((rng.nextDouble() / this.outlierPerceptronTrainingSet.numInstances()) < x.weight()) { // weighted subsampling
                    candidatePerceptron.trainOnInstance(x);
                }
            }
        } // end epochs
        // Evaluate the weak learner
        double errorFunctionSum = 0;
        double weightSum = 0;
        for (Instance x : this.outlierPerceptronTrainingSet) {
            if (!candidatePerceptron.correctlyClassifies(x)) {
                errorFunctionSum += x.weight();
            }
        }
        // Adjust training weights
        for (Instance x : this.outlierPerceptronTrainingSet) {
            double newWeight = x.weight();
            if (candidatePerceptron.correctlyClassifies(x)) {
                newWeight *= errorFunctionSum / (1.0 - errorFunctionSum);
                if (Double.isNaN(newWeight)) {
                    newWeight = weka.core.Utils.SMALL;
                }
                x.setWeight(newWeight);
            }
            weightSum += newWeight;
        }
        // Normalize
        for (Instance x : this.outlierPerceptronTrainingSet) {
            x.setWeight(x.weight() / weightSum);
        }
        // Add to the ensemble
        double newPerceptronWeight = Math.log((1 - errorFunctionSum) / errorFunctionSum);
        this.perceptrons[perceptronIdx] = candidatePerceptron;
        this.pweights[perceptronIdx] = newPerceptronWeight;
    } // end numPerceptrons
    // Check training error
    accuracySum = 0;
    accuracyCount = 0;
    for (Instance x : this.outlierPerceptronTrainingSet) {
        if (this.getPerceptronVotesForOutlierStatus(x) == x.classValue()) {
            accuracySum++;
        }
        accuracyCount++;
    }
    double trainingAccuracy = (accuracyCount > 0) ? (accuracySum / accuracyCount) : 0.0;
    this.outlierPerceptronTrainingSet.clear();
    return trainingAccuracy;
}
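The boosting loop above is the AdaBoost.M1 recipe: after each weak learner is trained, correctly classified instances are down-weighted by err/(1 - err) (with err the weighted training error), the weights are renormalized, and the learner joins the ensemble with voting weight log((1 - err)/err). A condensed sketch of just that update, with hypothetical names (trainingSet, learner, err):

// AdaBoost.M1-style reweighting, as used in trainPerceptron() above.
static double adaBoostUpdate(Instances trainingSet, Perceptron learner, double err) {
    double beta = err / (1.0 - err); // < 1 whenever err < 0.5
    double weightSum = 0;
    for (Instance x : trainingSet) {
        if (learner.correctlyClassifies(x)) {
            x.setWeight(x.weight() * beta); // shrink weights of already-easy instances
        }
        weightSum += x.weight();
    }
    for (Instance x : trainingSet) {
        x.setWeight(x.weight() / weightSum); // renormalize to a distribution
    }
    return Math.log(1.0 / beta); // = log((1 - err)/err), the learner's vote weight
}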
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * This is not your grandpa's E-M algorithm... it has multiple mini-steps,
 * but "The e1-m1-e2-m2-e3-m3-Algorithm" is a mouthful, so we just call it *-Means Clustering
 * {pronounced "Any-means (necessary) clustering"}.
 * @param D
 * @param subclusters
 * @param maxK
 * @return score at the end of the process
 */
protected final double EMStep(List<ClusterPointPair> D, Collection<Riffle> subclusters, int maxK) {
    double ret = 0;
    // Clear the palette
    for (Riffle c : subclusters) {
        if (c.instances == null) {
            c.instances = c.getHeader();
        }
        c.instances.clear();
        c.cleanTallies();
    }
    // Assign the X's to the nearest clusters (Maximization step 1)
    for (ClusterPointPair cxp : D) {
        if (this.potentialNovels.contains(cxp.x)) { // could also be if cxp.c == null, but this is safer
            continue; // ignore the outliers for a moment
        }
        final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(subclusters, cxp.x);
        // double ds[] = new double[nearestClusters.length];
        // int foo = 0;
        // for (NearestClusterTuple gnarf : nearestClusters) {
        //     ds[foo++] = gnarf.getDistance();
        // }
        cxp.c = nearestClusters[0].getCluster();
        nearestClusters[0].getCluster().instances.add(cxp.x);
        if (cxp.x.weight() > 0.99) {
            nearestClusters[0].getCluster().addLabeling((int) cxp.x.classValue(), cxp.x.weight());
        }
    }
    // Find the new radius (Expectation step)
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    // Remove empty clusters to make room for splits (Expectation-ish)
    Iterator<Riffle> cIter = subclusters.iterator();
    while (cIter.hasNext()) {
        Riffle rc = cIter.next();
        if (rc.instances.size() < 1) {
            cIter.remove();
        }
    }
    // Are we full?
    if (subclusters.size() < maxK) {
        // Fix bad clusters (Maximization step 2 - breaking up noisy clusters)
        Riffle sortedClusters[] = new Riffle[subclusters.size()];
        int tmpIdx = 0;
        for (Riffle tmpRfl : subclusters) {
            if (tmpIdx >= sortedClusters.length) {
                break;
            }
            sortedClusters[tmpIdx] = tmpRfl;
            tmpIdx++;
        }
        Arrays.sort(sortedClusters, new Comparator<Riffle>() {
            @Override
            public int compare(Riffle first, Riffle second) {
                if (first == null) {
                    return 1;
                }
                if (second == null) {
                    return -1;
                }
                double[] votes1 = first.getVotes().clone();
                double[] votes2 = second.getVotes().clone();
                double total1 = weka.core.Utils.sum(votes1);
                double total2 = weka.core.Utils.sum(votes2);
                Arrays.sort(votes1);
                Arrays.sort(votes2);
                double penultimate1 = 1e-16 + ((votes1.length > 1) ? votes1[votes1.length - 2] : 0);
                double penultimate2 = 1e-16 + ((votes2.length > 1) ? votes2[votes2.length - 2] : 0);
                // This is equivalent to purity minus margin... yes, really... gotta love math...
                double score1 = (total1 > 0) ? first.size() * penultimate1 / total1 : 0;
                double score2 = (total2 > 0) ? second.size() * penultimate2 / total2 : 0;
                return Double.compare(score2, score1);
            }
        }); // end anonymous sort
        for (int cIdx = 0; cIdx < sortedClusters.length && subclusters.size() < maxK; cIdx++) {
            Riffle splitMe = sortedClusters[cIdx];
            if (splitMe.getPurity() > 0.9) {
                continue;
            }
            double[] votes = splitMe.getVotes();
            final double totalVotes = weka.core.Utils.sum(votes);
            final double critVotes = 1.0 / (votes.length * 2);
            if (totalVotes < 2) {
                continue;
            }
            ArrayList<Riffle> splitSet = new ArrayList<>(votes.length);
            int numberOfNewClusters = 0;
            for (int lblIdx = 0; lblIdx < votes.length; ++lblIdx) {
                double labelVote = votes[lblIdx] / totalVotes;
                if (labelVote >= critVotes) {
                    splitSet.add(this.createNewCluster(splitMe.toInstance()));
                    numberOfNewClusters++;
                } else {
                    splitSet.add(null);
                }
            }
            if (numberOfNewClusters < 2) {
                continue;
            }
            Instances extras = new Instances(splitMe.getHeader());
            for (Instance x : splitMe.instances) {
                if (x.weight() > 0.999) {
                    Riffle myHopefulCluster = splitSet.get((int) x.classValue());
                    if (myHopefulCluster != null) {
                        myHopefulCluster.instances.add(x);
                        myHopefulCluster.addLabeling((int) x.classValue(), x.weight());
                    } else {
                        extras.add(x);
                    }
                } else {
                    extras.add(x);
                }
            }
            LinkedList<Riffle> goodSet = new LinkedList<>();
            for (Riffle rfc : splitSet) {
                if (rfc == null) {
                    continue;
                }
                rfc.recomputeAll();
                goodSet.add(rfc);
                subclusters.add(rfc);
            }
            for (Instance x : extras) {
                final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(goodSet, x);
                nearestClusters[0].getCluster().instances.add(x);
            }
            subclusters.remove(splitMe);
        }
    }
    // The penultimate Expectation step
    ret = 0;
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    // See if any outliers should actually be consumed by a cluster now... (Maximization step 3)
    Iterator<Instance> xIter = potentialNovels.iterator();
    while (xIter.hasNext()) {
        Instance xOut = xIter.next();
        final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(subclusters, xOut);
        if (nearestClusters == null || nearestClusters.length < 1) {
            continue;
        }
        Riffle c = nearestClusters[0].getCluster();
        double d = nearestClusters[0].getDistance();
        if (d > c.getRadius()) {
            // Welcome home, wayward tuple!
            c.instances.add(xOut);
            xIter.remove();
        }
    }
    // And the final Expectation step
    ret = 0;
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    return ret;
}
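The comparator inside EMStep() ranks split candidates by size * runnerUpVotes / totalVotes, so large clusters whose label tallies have a strong second-place label are broken up first (the "purity minus margin" remark in the comment). That score for a single cluster, pulled out of the comparator ('cluster' is a hypothetical local name):

double[] votes = cluster.getVotes().clone();
double total = weka.core.Utils.sum(votes);
java.util.Arrays.sort(votes); // ascending; the runner-up label sits second from the end
double runnerUp = 1e-16 + ((votes.length > 1) ? votes[votes.length - 2] : 0);
double splitScore = (total > 0) ? cluster.size() * runnerUp / total : 0; // higher = split sooner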
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Wrapper for parallel K-Means for processing the warm-up data set.
 *
 * @param D Warm-up data set
 * @param K number of clusters
 * @param useLabels if true, partition the seed pool by class label
 * @return the initial cluster set
 */
protected final Set<Riffle> batchCluster(List<Instance> D, int K, boolean useLabels) {
    assert K >= 2 : "Minimum number of clusters (K) is 2";
    TreeSet<Riffle> ret = new TreeSet<>();
    TreeSet<Integer> labels = new TreeSet<>();
    TreeMap<Integer, TreeSet<Riffle>> potentialClusters = new TreeMap<>();
    LinkedList<ClusterPointPair> DSet = new LinkedList<>();
    // Create a potential cluster pool. Separate into separate pools by label if useLabels is set to true:
    for (Instance x : D) {
        int label = (useLabels) ? (int) x.classValue() : 0;
        labels.add(label);
        TreeSet<Riffle> clusterSet = potentialClusters.get(label);
        if (clusterSet == null) {
            clusterSet = new TreeSet<>();
        }
        clusterSet.add(this.createNewCluster(x));
        potentialClusters.put(label, clusterSet);
        DSet.addLast(new ClusterPointPair(x, null));
    }
    // Initialize following the K-Means++ approach:
    Riffle C = potentialClusters.firstEntry().getValue().first();
    ret.add(C);
    potentialClusters.firstEntry().getValue().remove(C);
    Iterator<Integer> labelIter = labels.iterator();
    while ((ret.size() < K) && !potentialClusters.isEmpty()) {
        if (!labelIter.hasNext()) {
            labelIter = labels.iterator(); // loop around as needed
        }
        int pseudoLabel = labelIter.next();
        TreeSet<Riffle> clusterSet = potentialClusters.get(pseudoLabel);
        if (clusterSet.isEmpty()) {
            potentialClusters.remove(pseudoLabel);
            labelIter.remove();
            continue;
        }
        NearestClusterTuple[] nearestClusters = findMostLikelyClusters(clusterSet, C.toInstance());
        if (nearestClusters.length == 0) {
            continue;
        }
        if (nearestClusters.length == 1) {
            C = nearestClusters[0].getCluster();
        } else {
            C = nearestClusters[nearestClusters.length - 1].getCluster(); // WAS BACKWARDS
        }
        ret.add(C);
        clusterSet.remove(C);
    }
    potentialClusters.clear();
    // Iterate
    final int maxIterations = 100;
    final double minDelta = 0.0001;
    int iteration = 0;
    double valIdxDelta = 1.0;
    ValIdxTupleType lastScore = null;
    while ((iteration < maxIterations) && (valIdxDelta > minDelta)) {
        iteration++;
        EMStep(DSet, ret, this.maximumNumberOfClusterSizeOption.getValue()
                - (int) (this.clustersPerLabelOption.getValue() * 0.75)); // Expectation step
        ValIdxTupleType currentScore = new ValIdxTupleType(ret);
        if (lastScore != null) {
            double diff = Math.abs(lastScore.getValIdx() - currentScore.getValIdx());
            double denominator = lastScore.getValIdx();
            valIdxDelta = (denominator == 0) ? 0.0 : Math.abs(diff / denominator);
        }
        lastScore = currentScore;
    } // end while
    return ret;
}
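Two details in the seeding loop above are easy to miss: when useLabels is true, seeds are drawn round-robin across the class labels (so every known label contributes initial clusters), and each next seed is taken from the far end of findMostLikelyClusters(), i.e., the candidate least likely under the previously chosen seed, in the spirit of K-Means++ spreading; the "// WAS BACKWARDS" comment marks a correction to that ordering.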
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Uses methodology from Kim et al., "A Novel Validity Index for Determination of the Optimal Number of Clusters".
 *
 * @param D Warm-up data set
 */
public final void initialize(List<Instance> D) {
    String ncCSVfilePrefix = "META-" + D.get(0).dataset().relationName() + "-"
            + iso8601FormatString.format(new Date());
    final boolean doMetaLog = logMetaRecordsOption.isSet();
    if (doMetaLog) {
        try {
            File ncCSVFile = new File(ncCSVfilePrefix + ".csv");
            ncCSVwriter = new BufferedWriter(new FileWriter(ncCSVFile));
            String ncCSVHeader = "" + "usize" + "," + "urad" + "," + "ctally" + "," + "cpur" + "," + "csize"
                    + "," + "cweight" + "," + "crad" + "," + "cdist" + "," + "pout" + "," + "vweight" + ","
                    + "qdmin" + "," + "qdout" + "," + "qnsc" + "," + "novel";
            ncCSVwriter.write(ncCSVHeader);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileSetupIOException) {
            System.err.println("NC-CSV meta-data file failed to open: " + fileSetupIOException.toString());
        }
    }
    knownLabels = new int[D.get(0).numClasses()];
    Arrays.fill(knownLabels, 0);
    this.numAttributes = D.get(0).numAttributes();
    universalProbabilitySums = 0;
    bestProbabilitySums = 0;
    bestProbabilityCount = 0;
    // Set up the universal set/cluster. Note that this will be crucial for subspace selection
    // (cross-entropy checks against the null hypothesis)
    double[] universalCentroid = new double[D.get(0).numAttributes()];
    double[] universalVariance = new double[D.get(0).numAttributes()];
    Arrays.fill(universalCentroid, 0);
    Arrays.fill(universalVariance, 0);
    universalCluster = new Riffle(D.get(0));
    //universalCluster.updateStrategyOption.setChosenIndex(this.updateStrategyOption.getChosenIndex());
    //universalCluster.outlierDefinitionStrategyOption.setChosenIndex(this.outlierDefinitionStrategyOption.getChosenIndex());
    universalCluster.distanceStrategyOption.setChosenIndex(this.distanceStrategyOption.getChosenIndex());
    //universalCluster.initialStandardDeviationOption.setValue(this.initialStandardDeviationOption.getValue());
    //universalCluster.alphaAdjustmentWeightOption.setValue(this.learningRateAlphaOption.getValue());
    //universalCluster.setParentClusterer(this);
    if (D.size() > 1) {
        double[] ep = new double[universalCentroid.length];
        Arrays.fill(ep, 0);
        universalCluster.setCenter(universalCentroid); // temporary - start with a standard Gaussian, updated below
        universalCluster.setVariances(universalVariance); // temporary - start with a standard Gaussian, updated below
        universalCluster.setWeight(0);
        double N = D.size();
        for (Instance x : D) { // Pre-populate the universal cluster with data points
            int y = (int) x.classValue();
            if (y < knownLabels.length) {
                knownLabels[y]++;
            }
            universalCluster.addInstance(x);
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                universalCentroid[i] += xValues[i];
            }
        }
        for (int i = 0; i < universalCentroid.length; ++i) {
            universalCentroid[i] /= N;
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure as possible,
        // so we use the two-pass method for computing sample variance (per dimension)
        for (Instance x : D) {
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                double delta = universalCentroid[i] - xValues[i];
                ep[i] += delta;
                universalVariance[i] += delta * delta;
            }
        }
        for (int i = 0; i < universalVariance.length; ++i) {
            universalVariance[i] = (universalVariance[i] - ep[i] * ep[i] / N) / (N - 1);
        }
        universalCluster.setCenter(universalCentroid);
        universalCluster.setVariances(universalVariance);
    }
    universalCluster.recompute(); // this updates entropies and such
    int numKnownLabels = 0;
    for (int y : knownLabels) {
        if (y > 0) {
            numKnownLabels++;
        }
    }
    // Ok, now let's use K-Means to find the initial cluster set
    int Cmin = this.clustersPerLabelOption.getValue() * numKnownLabels;
    int Cmax = Cmin + 1;
    if (optimizeInitialClusterNumberOption.isSet()) {
        Cmin = this.minimumNumberOfClusterSizeOption.getValue(); //Math.max(knownLabels.size(), 2);
        Cmax = Math.max(Cmin + 1, Math.min(this.clustersPerLabelOption.getValue() * numKnownLabels,
                this.maximumNumberOfClusterSizeOption.getValue()));
    }
    ArrayList<ValIdxTupleType> valIdxSet = new ArrayList<>(Cmax);
    Set<Riffle> V;
    // Create multiple hypotheses for the best K choices:
    for (int c = Cmin; c < Cmax; c++) {
        V = batchCluster(D, c, true);
        ValIdxTupleType i = new ValIdxTupleType(V);
        valIdxSet.add(i);
        if (CVI == null) {
            CVI = i;
        } else {
            CVI.setVo_min(Math.min(i.getVo(), CVI.getVo_min()));
            CVI.setVo_max(Math.max(i.getVo(), CVI.getVo_max()));
            CVI.setVu_min(Math.min(i.getVu(), CVI.getVu_min()));
            CVI.setVu_max(Math.max(i.getVu(), CVI.getVu_max()));
        }
    }
    // Normalize all:
    for (ValIdxTupleType i : valIdxSet) {
        i.setVo_min(CVI.getVo_min());
        i.setVo_max(CVI.getVo_max());
        i.setVu_min(CVI.getVu_min());
        i.setVu_max(CVI.getVu_max());
    }
    // Find the best K by finding the minimum score:
    valIdxSet.stream().filter((i) -> (i.getValIdx() < CVI.getValIdx())).forEach((i) -> {
        CVI = i;
    });
    this.clusters.clear();
    for (Riffle c : CVI.getClustering()) {
        if (c.instances == null || c.instances.isEmpty()) {
            continue;
        }
        double[] clusterCentroid = new double[universalCentroid.length];
        double[] clusterVariance = new double[universalVariance.length];
        for (Instance x : c.instances) { // compute the cluster centroid from its data points
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                clusterCentroid[i] += xValues[i] / ((double) c.instances.size());
            }
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure as possible,
        // so we use the two-pass method for computing sample variance (per dimension)
        if (c.instances.size() < 2) {
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = universalCluster.getVariances()[i] * 0.85; // statistical variance
            }
        } else {
            double n = c.instances.size();
            double[] cep = new double[universalCentroid.length];
            Arrays.fill(cep, 0);
            for (Instance x : c.instances) {
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    double delta = clusterCentroid[i] - xValues[i];
                    cep[i] += delta;
                    clusterVariance[i] += delta * delta; // statistical variance
                }
            }
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
            }
        }
        c.setCenter(clusterCentroid);
        c.setVariances(clusterVariance);
        c.recompute(); // this updates entropies and such
        for (Instance x : c.instances) {
            this.hopperCache.push(new ClusterPointPair(x, c));
        }
        this.clusters.add(c);
    }
    this.newClusterCreateCalls = 0;
    System.out.println("Starting with " + this.clusters.size() + " clusters.");
    instancesSeen = D.size();
    weightsSeen = D.size();
}
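Both variance computations in initialize() use the corrected two-pass algorithm: with deltas d = mean - x accumulated as ep = sum(d) and sumSq = sum(d*d), the per-dimension sample variance is (sumSq - ep*ep/N) / (N - 1), where the ep*ep/N term compensates for rounding error in the first pass's mean. As a standalone helper (hypothetical, for illustration only):

// Corrected two-pass sample variance for one dimension, mirroring the loops above.
static double sampleVariance(double[] values, double mean) {
    double ep = 0, sumSq = 0;
    for (double v : values) {
        double delta = mean - v;
        ep += delta; // would be exactly zero with an error-free mean
        sumSq += delta * delta;
    }
    int n = values.length;
    return (sumSq - ep * ep / n) / (n - 1); // assumes n >= 2
}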
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Train on a data instance.
 *
 * @param x instance to train on
 */
@Override
public final void trainOnInstanceImpl(Instance x) {
    safeInit(x);
    assert (x != null) : "Sieve::trainOnInstanceImpl() Training on a null instance!";
    int y = (int) x.classValue();
    if ((y > 0) && (y < knownLabels.length)) {
        knownLabels[y] += x.weight();
    }
    this.instancesSeen++;
    this.weightsSeen += x.weight();
    this.universalCluster.addInstance(x);
    final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(this.clusters, x);
    if (nearestClusters.length < 1) { // handles a weird corner case
        Riffle firstCluster = this.createNewCluster(x);
        clusters.add(firstCluster);
        System.err.println("Sieve::trainOnInstanceImpl() - no other clusters found!");
    } else {
        // Everyone takes a weight hit, and we will reward the best later...
        for (NearestClusterTuple nct : nearestClusters) {
            nct.getCluster().penalize();
        }
        NearestClusterTuple ncx = nearestClusters[0]; // for code convenience
        ClusterPointPair cxp = new ClusterPointPair(x, ncx.getCluster()); // we may change this later in the function
        if (ncx.getDistance() > ncx.getCluster().getRadius()) { // outlier
            // Hang out with the outcasts and see if you can start your own clique
            cxp.c = null;
            if (!onlyCreateNewClusterAtResyncOption.isSet()) {
                cxp.c = trainOnOutlierInstance(x, ncx);
            }
            if (cxp.c == null) {
                this.potentialNovels.add(x); // or just wait patiently for a friend to sit next to you
            }
        } else { // end if (isRadialOutlier)
            // Or join an existing club if you are in the "in" crowd...
            Riffle nc = ncx.getCluster();
            nc.reward();
            nc.trainEmbeddedClassifier(x);
            nc.addInstance(x);
        } // end else (not outlier)
        // Randomly (based on distance) cross-train other models
        for (int i = 0; i < nearestClusters.length; ++i) {
            double pTrain = ((double) nearestClusters.length - i) / (2.0 * nearestClusters.length);
            if (this.clustererRandom.nextDouble() < pTrain) {
                nearestClusters[i].getCluster().trainEmbeddedClassifier(x);
            }
        } // end for (i)
        hopperCache.addLast(cxp);
    } // corner case safety
    periodicResync();
}
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Temporary function for algorithm analysis.
 */
private void debugMetrics(double qNSC, double qDout, double qDmin, double dist, double rawTally, Instance x,
        Riffle c) {
    if (this.logMetaRecordsOption.isSet()) {
        try {
            int groundTruth = (int) x.classValue();
            boolean isTrueNovel = (groundTruth > 0) && (groundTruth < knownLabels.length)
                    && (knownLabels[groundTruth] < (this.minimumClusterSizeOption.getValue()));
            String ncCSVLine = "" + universalCluster.size() + "," + universalCluster.getRadius() + ","
                    + rawTally + "," + c.getPurity() + "," + c.size() + "," + c.getWeight() + ","
                    + c.getRadius() + "," + dist + "," + (c.isOutlier(x) ? 1 : 0) + "," + x.weight() + ","
                    + qDmin + "," + qDout + "," + qNSC + "," + isTrueNovel;
            ncCSVwriter.write(ncCSVLine);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileIoExcption) {
            System.err.println("Could not write NC CSV line: " + fileIoExcption.toString());
        }
    }
}
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * @return training accuracy
 */
private double trainPerceptron() {
    // Train the perceptron from the warm-up phase clustering
    final int epochs = 20;
    final int numberOfPerceptrons = 1;
    final int MEMBER = 0;
    final int OUTLIER = 1;
    double accuracySum = 0;
    double accuracyCount = 0;
    this.outlierPerceptronTrainingSet.clear();
    Random rng = new Random(this.randomSeed);
    // Generate training set
    for (Riffle thisCluster : this.clusters) {
        for (Instance x : thisCluster.getHeader()) {
            Instance pseudoPt = makePerceptronInstance(thisCluster, x);
            for (Riffle thatCluster : this.clusters) {
                double groundTruth = (thisCluster == thatCluster) ? MEMBER : OUTLIER;
                pseudoPt.setClassValue(groundTruth);
                this.outlierPerceptronTrainingSet.add(pseudoPt);
            }
        }
    }
    for (Instance x : this.outlierPerceptronTrainingSet) {
        x.setWeight(1.0 / this.outlierPerceptronTrainingSet.numInstances());
    }
    // Boost it
    this.perceptrons = new Perceptron[numberOfPerceptrons];
    this.pweights = new double[numberOfPerceptrons];
    for (int perceptronIdx = 0; perceptronIdx < numberOfPerceptrons; ++perceptronIdx) {
        // Discover a new weak learner
        Perceptron candidatePerceptron = new Perceptron();
        candidatePerceptron.prepareForUse();
        candidatePerceptron.learningRatioOption.setValue(rng.nextDouble() * 0.9 + 0.1);
        for (int epoch = 0; epoch < epochs; epoch++) {
            for (Instance x : this.outlierPerceptronTrainingSet) {
                if ((rng.nextDouble() / this.outlierPerceptronTrainingSet.numInstances()) < x.weight()) { // weighted subsampling
                    candidatePerceptron.trainOnInstance(x);
                }
            }
        } // end epochs
        // Evaluate the weak learner
        double errorFunctionSum = 0;
        double weightSum = 0;
        for (Instance x : this.outlierPerceptronTrainingSet) {
            if (!candidatePerceptron.correctlyClassifies(x)) {
                errorFunctionSum += x.weight();
            }
        }
        // Adjust training weights
        for (Instance x : this.outlierPerceptronTrainingSet) {
            double newWeight = x.weight();
            if (candidatePerceptron.correctlyClassifies(x)) {
                newWeight *= errorFunctionSum / (1.0 - errorFunctionSum);
                if (Double.isNaN(newWeight)) {
                    newWeight = weka.core.Utils.SMALL;
                }
                x.setWeight(newWeight);
            }
            weightSum += newWeight;
        }
        // Normalize
        for (Instance x : this.outlierPerceptronTrainingSet) {
            x.setWeight(x.weight() / weightSum);
        }
        // Add to the ensemble
        double newPerceptronWeight = Math.log((1 - errorFunctionSum) / errorFunctionSum);
        this.perceptrons[perceptronIdx] = candidatePerceptron;
        this.pweights[perceptronIdx] = newPerceptronWeight;
    } // end numPerceptrons
    // Check training error
    accuracySum = 0;
    accuracyCount = 0;
    for (Instance x : this.outlierPerceptronTrainingSet) {
        if (this.getPerceptronVotesForOutlierStatus(x) == x.classValue()) {
            accuracySum++;
        }
        accuracyCount++;
    }
    double trainingAccuracy = (accuracyCount > 0) ? (accuracySum / accuracyCount) : 0.0;
    this.outlierPerceptronTrainingSet.clear();
    return trainingAccuracy;
}
From source file:moa.clusterers.outliers.AnyOut.util.DataObject.java
License:Apache License
/**
 * Standard constructor for <code>DataObject</code>.
 * @param idCounter The id for the <code>DataObject</code>.
 * @param inst The <code>Instance</code> whose features and class label back this <code>DataObject</code>.
 */
public DataObject(int idCounter, Instance inst) {
    this.id = idCounter;
    this.inst = inst;
    this.features = inst.toDoubleArray();
    this.classLabel = (int) inst.classValue();
    this.isOutiler = false;
}
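A hedged usage sketch for the constructor above, assuming a weka.core.Instances set named data with its class index already assigned (ids here are just running counters):

// Wrap each labeled instance for AnyOut processing.
for (int i = 0; i < data.numInstances(); ++i) {
    DataObject obj = new DataObject(i, data.instance(i));
}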