List of usage examples for weka.core Instance classValue
public double classValue();
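The examples below all follow the same idiom: classValue() returns the class as a double (for a nominal class attribute, the index of the chosen label), so callers cast it to int before using it as a label index. A minimal, self-contained sketch of that idiom (not taken from the examples below; assumes the weka.core 3.7+ API, where DenseInstance implements Instance, and uses made-up attribute names):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class ClassValueDemo {
    public static void main(String[] args) {
        // Two numeric features plus a nominal class attribute.
        ArrayList<String> labels = new ArrayList<>();
        labels.add("negative");
        labels.add("positive");
        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(new Attribute("x1"));
        attributes.add(new Attribute("x2"));
        attributes.add(new Attribute("class", labels));
        Instances header = new Instances("demo", attributes, 0);
        header.setClassIndex(header.numAttributes() - 1);

        // The last value (1.0) is the internal index of the "positive" label.
        Instance inst = new DenseInstance(1.0, new double[] { 0.5, 1.5, 1.0 });
        inst.setDataset(header); // classValue() needs a dataset header to locate the class attribute

        int y = (int) inst.classValue(); // the cast used throughout the examples below
        System.out.println(y + " -> " + header.classAttribute().value(y)); // prints: 1 -> positive
    }
}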
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * @param x instance to train on
 */
@Override
public void trainOnInstanceImpl(Instance x) {
    safeInit(x);
    assert (x != null) : "FeS2::trainOnInstanceImpl() Training on a null instance!";
    int classValue = (int) x.classValue();
    boolean isNewLabel = (!knownLabels.contains(classValue)) && (x.weight() > 0);
    if (x.weight() > 0) {
        this.knownLabels.add(classValue);
    }
    this.universalCluster.addInstance(x);
    // Find the nearest clusters
    final SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(this.clusters, x);
    assert !nearestClusters.isEmpty() : "Cluster set for probability matching is empty";
    // Compute some base metrics we need to know:
    double maxRadius = 0;
    double avgRadius = 0;
    boolean unanimousOutlier = true;
    double weightTotal = 0;
    double minWeight = Double.MAX_VALUE;
    for (NearestClusterTuple nct : nearestClusters) {
        unanimousOutlier = unanimousOutlier && nct.getCluster().isOutlier(x);
        maxRadius = Math.max(maxRadius, nct.getCluster().getRadius());
        avgRadius += nct.getCluster().getRadius();
    }
    avgRadius /= nearestClusters.size();
    // Update weights
    for (NearestClusterTuple nct : nearestClusters) {
        Riffle c = nct.getCluster();
        c.penalize(); // unilaterally reduce weights
        int clusterMajorityClass = weka.core.Utils.maxIndex(c.getVotes());
        // Increase weights for matches ('match' criteria defined by the strategy parameter)
        switch (this.positiveClusterFeedbackStrategyOption.getChosenIndex()) {
        case 0: // only the closest
            if (!unanimousOutlier && c == nearestClusters.last().getCluster()) {
                addToCluster(x, c);
            }
            break;
        case 1: // all label matches
            // This ternary condition is very important for results
            int hypothesisClass = (x.weight() > 0) ? classValue
                    : weka.core.Utils.maxIndex(this.getVotesForInstance(x));
            if (clusterMajorityClass == hypothesisClass) {
                addToCluster(x, c);
            }
            break;
        case 2: // all proximity matches
            if (!nct.getCluster().isOutlier(x)) {
                addToCluster(x, c);
            }
            break;
        default:
            break;
        } // end switch
        weightTotal += c.getWeight();
        minWeight = Math.min(minWeight, c.getWeight());
    }
    // Sort by (weight / sigma)
    Riffle[] sortedClusters = new Riffle[clusters.size()];
    int i = 0;
    for (Riffle c : clusters) {
        sortedClusters[i++] = c;
    }
    // Kudos to Java 8 and lambda expressions for making this a one-liner:
    Arrays.parallelSort(sortedClusters,
            (Riffle a, Riffle b) -> Double.compare(a.getWeight() / Math.max(a.getRadius(), 1e-96),
                    b.getWeight() / Math.max(b.getRadius(), 1e-96)));
    boolean atClusterCapacity = (this.clusters.size() >= Math.min(
            this.clustersPerLabelOption.getValue() * this.knownLabels.size(),
            this.maximumNumberOfClusterSizeOption.getValue()));
    // * * *
    // Results show that when average P(x|k) < Chauvenet there are no new clusters,
    // and vice versa (which is the opposite of the expected behavior)
    // * * *
    boolean universalOutlier = this.universalCluster.isOutlier(x);
    if (isNewLabel) {
        newLabelCount++;
    }
    if (universalOutlier) {
        universalOutlierCount++;
    }
    if (unanimousOutlier) {
        unanimousOutlierCount++;
    }
    // If we have no matches at all, then the weakest cluster is replaced by a new one
    // with a high variance and low weight
    //if (isNewLabel || (unanimousOutlier && universalOutlier)) {
    if (isNewLabel || unanimousOutlier) {
        Riffle weakestLink = sortedClusters[sortedClusters.length - 1]; // get last one
        Riffle novelCluster = this.createNewCluster(x);
        //novelCluster.setRadius((avgRadius + maxRadius) / 2.0); // Set to half-way between average and max radius
        novelCluster.setWeight(weightTotal / nearestClusters.size()); // <---- Validate this
        weightTotal += novelCluster.getWeight(); // update for new normalization factor
        // You are the weakest link... Goodbye
        if (atClusterCapacity) {
            weightTotal -= weakestLink.getWeight(); // update for new normalization factor
            this.clusters.remove(weakestLink);
        }
        // Everyone please welcome our newest contestant...
        clusters.add(novelCluster);
    }
    // Normalize weights and update variance estimates for singleton clusters
    double[] universeVariance = universalCluster.getVariances();
    double[] initialVariance = new double[universeVariance.length];
    for (int j = 0; j < initialVariance.length; ++j) {
        initialVariance[j] = universeVariance[j] * 0.85;
    }
    if (weightTotal <= 0) {
        weightTotal = 1;
    }
    for (Riffle c : this.clusters) {
        if (c.size() < 2) {
            c.setVariances(initialVariance);
        }
        c.setWeight(c.getWeight() / weightTotal);
    }
}
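One detail worth noting in the example above: classValue() returns a value even for unlabeled stream instances, so FeS2 trusts it only when x.weight() > 0 and otherwise substitutes the model's own prediction. A condensed sketch of that idiom (the helper name is hypothetical):

// Hypothetical helper distilling the 'hypothesisClass' idiom above: trust the
// recorded label only for weighted (labeled) instances, else use the model's guess.
static int hypothesisClass(Instance x, double[] votes) {
    return (x.weight() > 0)
            ? (int) x.classValue()             // ground-truth label index
            : weka.core.Utils.maxIndex(votes); // model's current best guess
}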
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * Find the nearest cluster, and use its most frequent label.
 * If the nearest cluster has no label, then we have a novel cluster.
 * Unless the data point is an outlier to all clusters, then it is just an outlier.
 * @param inst
 * @return
 */
@Override
public double[] getVotesForInstance(Instance inst) {
    assert (this.universalCluster != null) : "FeS2::getVotesForInstance() called without any initialization or training!";
    int novelClassLabel = inst.numClasses();
    int outlierLabel = novelClassLabel + 1;
    double[] votes = new double[inst.numClasses() + 2];
    if (this.clusters.isEmpty()) {
        return votes;
    }
    double[] cumulativeVotes = new double[inst.numClasses()];
    double[] cumulativeVotes_p = new double[inst.numClasses()];
    double[] cumulativeVotes_pw = new double[inst.numClasses()];
    double[] cumulativeVotes_n = new double[inst.numClasses()];
    double[] cumulativeVotes_np = new double[inst.numClasses()];
    double[] cumulativeVotes_npw = new double[inst.numClasses()];
    double[] cumulativeWinnerTakesAllVotes = new double[inst.numClasses()];
    Arrays.fill(votes, 0.0);
    Arrays.fill(cumulativeVotes, 0.0);
    Arrays.fill(cumulativeVotes_p, 0.0);
    Arrays.fill(cumulativeVotes_pw, 0.0);
    Arrays.fill(cumulativeVotes_n, 0.0);
    Arrays.fill(cumulativeVotes_np, 0.0);
    Arrays.fill(cumulativeVotes_npw, 0.0);
    Arrays.fill(cumulativeWinnerTakesAllVotes, 0.0);
    final int TRUE_CLASS = (int) inst.classValue(); // for debug watch windows only
    final SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(this.clusters, inst);
    boolean memberOfAtLeastOneTrueCluster = false;
    boolean universalOutlier = true;
    double bestProbability = 0;
    double universalProbability = this.universalCluster.getInclusionProbability(inst);
    NearestClusterTuple bestMatchCluster = null;
    // Gather data
    for (NearestClusterTuple nct : nearestClusters) {
        double p = nct.getDistance();
        boolean localOutlier = nct.getCluster().isOutlier(inst);
        memberOfAtLeastOneTrueCluster = memberOfAtLeastOneTrueCluster
                || (!localOutlier && nct.getCluster().size() > this.minimumClusterSizeOption.getValue());
        universalOutlier = universalOutlier && localOutlier;
        bestProbability = Math.max(p, bestProbability);
        if (p <= 0) {
            continue;
        }
        int localWinner = (int) nct.getCluster().getGroundTruth();
        cumulativeWinnerTakesAllVotes[localWinner] += p;
        double clusterVotes[] = nct.getCluster().getVotes();
        double clusterNormalizedVotes[] = nct.getCluster().getVotes().clone();
        if (weka.core.Utils.sum(clusterNormalizedVotes) > 0) {
            weka.core.Utils.normalize(clusterNormalizedVotes);
        }
        for (int i = 0; i < clusterVotes.length; ++i) {
            cumulativeVotes[i] += clusterVotes[i];
            cumulativeVotes_p[i] += clusterVotes[i] * p;
            cumulativeVotes_pw[i] += clusterVotes[i] * p * nct.getCluster().getWeight();
            cumulativeVotes_n[i] += clusterNormalizedVotes[i];
            cumulativeVotes_np[i] += clusterNormalizedVotes[i] * p;
            cumulativeVotes_npw[i] += clusterNormalizedVotes[i] * p * nct.getCluster().getWeight();
        }
        if (!localOutlier) {
            bestMatchCluster = nct;
        }
    } // end for
    universalProbabilitySums += universalProbability;
    bestProbabilitySums += bestProbability;
    bestProbabilityCount += 1;
    if (nearestClusters.isEmpty()) {
        votes[outlierLabel] = 1.0;
    } else {
        if (weka.core.Utils.sum(cumulativeVotes) > 0) {
            weka.core.Utils.normalize(cumulativeVotes);
        }
        if (weka.core.Utils.sum(cumulativeVotes_p) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_p);
        }
        if (weka.core.Utils.sum(cumulativeVotes_pw) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_pw);
        }
        if (weka.core.Utils.sum(cumulativeVotes_n) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_n);
        }
        if (weka.core.Utils.sum(cumulativeVotes_np) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_np);
        }
        if (weka.core.Utils.sum(cumulativeVotes_npw) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_npw);
        }
        if (weka.core.Utils.sum(cumulativeWinnerTakesAllVotes) > 0) {
            weka.core.Utils.normalize(cumulativeWinnerTakesAllVotes);
        }
        switch (this.votingStrategyOption.getChosenIndex()) {
        case 0: // 1-NN - usually not the strongest
            double[] nearestNeighborVotes = nearestClusters.last().getCluster().getVotes();
            for (int i = 0; i < nearestNeighborVotes.length; ++i) {
                votes[i] = nearestNeighborVotes[i];
            }
            break;
        case 1: // Global k-NN - this is a poor performer
            for (int i = 0; i < cumulativeVotes.length; ++i) {
                votes[i] = cumulativeVotes[i];
            }
            break;
        case 2: // Globally probability-weighted k-NN - good, but biased towards heavy clusters
            for (int i = 0; i < cumulativeVotes_p.length; ++i) {
                votes[i] = cumulativeVotes_p[i];
            }
            break;
        case 3: // Globally probability-utility-weighted k-NN - good, but overly complex
            for (int i = 0; i < cumulativeVotes_pw.length; ++i) {
                votes[i] = cumulativeVotes_pw[i];
            }
            break;
        case 4: // Globally normalized k-NN - usually a very poor performer; don't use it
            for (int i = 0; i < cumulativeVotes_n.length; ++i) {
                votes[i] = cumulativeVotes_n[i];
            }
            break;
        case 5: // Globally normalized probability-weighted k-NN - a safe bet
            for (int i = 0; i < cumulativeVotes_np.length; ++i) {
                votes[i] = cumulativeVotes_np[i];
            }
            break;
        case 6: // Globally normalized probability-utility-weighted k-NN - default and preferred method
            for (int i = 0; i < cumulativeVotes_npw.length; ++i) {
                votes[i] = cumulativeVotes_npw[i];
            }
            break;
        case 7: // Globally weighted k-NN, winner takes all per cluster - can avoid noise, but not usually the best
        default:
            for (int i = 0; i < cumulativeWinnerTakesAllVotes.length; ++i) {
                votes[i] = cumulativeWinnerTakesAllVotes[i];
            }
        } // end switch
        double voteAccumulator = 0;
        for (double v : votes) {
            voteAccumulator += v;
        }
        // A novel cluster is one of sufficient size but no label
        if ((bestMatchCluster != null) // it matches a cluster
                && (bestMatchCluster.getCluster().size() > this.minimumClusterSizeOption.getValue()) // that is overall large enough
                && (bestMatchCluster.getCluster().getNumLabeledPoints() < 1)) { // but without labels
            votes[novelClassLabel] = 1.0;
        }
        // Outlier detection
        if (universalOutlier) {
            int maxIdx = weka.core.Utils.maxIndex(votes);
            if (maxIdx < 0) {
                maxIdx = 0;
            }
            double outlierValue = votes[maxIdx];
            if (outlierValue <= 0) {
                votes[novelClassLabel] = 1.0; // special case of novelty when we have absolutely no clue how to label an outlier
                outlierValue = 1e-16;
            }
            votes[outlierLabel] = outlierValue / 2.0; //Math.max(Math.abs(1.0 - bestProbability), Math.abs(1.0 - universalProbability));
        }
    } // end if (nearestClusters not empty)
    return votes;
}
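The repeated sum-greater-than-zero guards above are not cosmetic: weka.core.Utils.normalize(double[]) throws an IllegalArgumentException when the array sums to zero (or NaN), so an all-zero vote vector must be left untouched. The guard in isolation (the votes array here is hypothetical):

double[] votes = { 0.0, 0.0, 0.0 };
if (weka.core.Utils.sum(votes) > 0) {
    weka.core.Utils.normalize(votes); // scales entries so they sum to 1
} // else: leave the all-zero vector as-is rather than trigger the exception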
From source file:moa.clusterer.FeS2.java
License:Apache License
/**
 * @return training accuracy
 */
private double trainPerceptron() {
    // Train the perceptron from the warm-up phase clustering
    final int epochs = 20;
    final int numberOfPerceptrons = 10;
    final int MEMBER = 0;
    final int OUTLIER = 1;
    double accuracySum = 0;
    double accuracyCount = 0;
    this.outlierPerceptronTrainingSet.clear();
    Random rng = new Random(this.randomSeed);
    // Generate training set
    for (Riffle thisCluster : this.clusters) {
        for (Riffle thatCluster : this.clusters) {
            double groundTruth = (thisCluster == thatCluster) ? MEMBER : OUTLIER;
            for (Instance x : thatCluster.getHeader()) {
                Instance pseudoPt = makePerceptronInstance(thisCluster, x);
                pseudoPt.setClassValue(groundTruth);
                this.outlierPerceptronTrainingSet.add(pseudoPt);
            }
        }
    }
    this.outlierPerceptronTrainingSet.parallelStream().forEach((x) -> {
        x.setWeight(1.0 / this.outlierPerceptronTrainingSet.numInstances());
    });
    // Boost it
    this.perceptrons = new Perceptron[numberOfPerceptrons];
    this.pweights = new double[numberOfPerceptrons];
    for (int perceptronIdx = 0; perceptronIdx < numberOfPerceptrons; ++perceptronIdx) {
        // Discover a new weak learner
        Perceptron candidatePerceptron = new Perceptron();
        candidatePerceptron.prepareForUse();
        candidatePerceptron.learningRatioOption.setValue(rng.nextDouble() * 0.9 + 0.1);
        for (int epoch = 0; epoch < epochs; epoch++) {
            for (Instance x : this.outlierPerceptronTrainingSet) {
                if ((rng.nextDouble() / this.outlierPerceptronTrainingSet.numInstances()) < x.weight()) { // weighted subsampling
                    candidatePerceptron.trainOnInstance(x);
                }
            }
        } // end epochs
        // Evaluate the weak learner
        double errorFunctionSum = 0;
        double weightSum = 0;
        for (Instance x : this.outlierPerceptronTrainingSet) {
            if (!candidatePerceptron.correctlyClassifies(x)) {
                errorFunctionSum += x.weight();
            }
        }
        // Adjust training weights
        for (Instance x : this.outlierPerceptronTrainingSet) {
            double newWeight = x.weight();
            if (candidatePerceptron.correctlyClassifies(x)) {
                newWeight *= errorFunctionSum / (1.0 - errorFunctionSum);
                if (Double.isNaN(newWeight)) {
                    newWeight = weka.core.Utils.SMALL;
                }
                x.setWeight(newWeight);
            }
            weightSum += newWeight;
        }
        // Normalize
        for (Instance x : this.outlierPerceptronTrainingSet) {
            x.setWeight(x.weight() / weightSum);
        }
        // Add to the ensemble
        double newPerceptronWeight = Math.log((1 - errorFunctionSum) / errorFunctionSum);
        this.perceptrons[perceptronIdx] = candidatePerceptron;
        this.pweights[perceptronIdx] = newPerceptronWeight;
    } // end numPerceptrons
    // Check training error
    accuracySum = 0;
    accuracyCount = 0;
    for (Instance x : this.outlierPerceptronTrainingSet) {
        if (this.getPerceptronVotesForOutlierStatus(x) == x.classValue()) {
            accuracySum++;
        }
        accuracyCount++;
    }
    double trainingAccuracy = (accuracyCount > 0) ? (accuracySum / accuracyCount) : 0.0;
    this.outlierPerceptronTrainingSet.clear();
    return trainingAccuracy;
}
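The boosting loop above is the AdaBoost.M1 recipe: after each weak learner is trained, correctly classified instances are down-weighted by err/(1 - err) (with err the weighted training error), the weights are renormalized, and the learner joins the ensemble with voting weight log((1 - err)/err). A condensed sketch of just that update, with hypothetical names (trainingSet, learner, err):

// AdaBoost.M1-style reweighting, as used in trainPerceptron() above.
static double adaBoostUpdate(Instances trainingSet, Perceptron learner, double err) {
    double beta = err / (1.0 - err); // < 1 whenever err < 0.5
    double weightSum = 0;
    for (Instance x : trainingSet) {
        if (learner.correctlyClassifies(x)) {
            x.setWeight(x.weight() * beta); // shrink weights of already-easy instances
        }
        weightSum += x.weight();
    }
    for (Instance x : trainingSet) {
        x.setWeight(x.weight() / weightSum); // renormalize to a distribution
    }
    return Math.log(1.0 / beta); // = log((1 - err)/err), the learner's vote weight
}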
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * This is not your grandpa's E-M algorithm... it has multiple mini-steps,
 * but "The e1-m1-e2-m2-e3-m3-Algorithm" is a mouthful, so we just call it *-Means Clustering
 * {pronounced "Any-means (necessary) clustering"}.
 * @param D
 * @param subclusters
 * @param maxK
 * @return score at the end of the process
 */
protected final double EMStep(List<ClusterPointPair> D, Collection<Riffle> subclusters, int maxK) {
    double ret = 0;
    // Clear the palette
    for (Riffle c : subclusters) {
        if (c.instances == null) {
            c.instances = c.getHeader();
        }
        c.instances.clear();
        c.cleanTallies();
    }
    // Assign the X's to the nearest clusters (Maximization step 1)
    for (ClusterPointPair cxp : D) {
        if (this.potentialNovels.contains(cxp.x)) { // could also be if cxp.c == null, but this is safer
            continue; // ignore the outliers for a moment
        }
        final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(subclusters, cxp.x);
        // double ds[] = new double[nearestClusters.length];
        // int foo = 0;
        // for (NearestClusterTuple gnarf : nearestClusters) {
        //     ds[foo++] = gnarf.getDistance();
        // }
        cxp.c = nearestClusters[0].getCluster();
        nearestClusters[0].getCluster().instances.add(cxp.x);
        if (cxp.x.weight() > 0.99) {
            nearestClusters[0].getCluster().addLabeling((int) cxp.x.classValue(), cxp.x.weight());
        }
    }
    // Find the new radius (Expectation step)
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    // Remove empty clusters to make room for splits (Expectation-ish)
    Iterator<Riffle> cIter = subclusters.iterator();
    while (cIter.hasNext()) {
        Riffle rc = cIter.next();
        if (rc.instances.size() < 1) {
            cIter.remove();
        }
    }
    // Are we full?
    if (subclusters.size() < maxK) {
        // Fix bad clusters (Maximization step 2 - breaking up noisy clusters)
        Riffle sortedClusters[] = new Riffle[subclusters.size()];
        int tmpIdx = 0;
        for (Riffle tmpRfl : subclusters) {
            if (tmpIdx >= sortedClusters.length) {
                break;
            }
            sortedClusters[tmpIdx] = tmpRfl;
            tmpIdx++;
        }
        Arrays.sort(sortedClusters, new Comparator<Riffle>() {
            @Override
            public int compare(Riffle first, Riffle second) {
                if (first == null) {
                    return 1;
                }
                if (second == null) {
                    return -1;
                }
                double[] votes1 = first.getVotes().clone();
                double[] votes2 = second.getVotes().clone();
                double total1 = weka.core.Utils.sum(votes1);
                double total2 = weka.core.Utils.sum(votes2);
                Arrays.sort(votes1);
                Arrays.sort(votes2);
                double penultimate1 = 1e-16 + ((votes1.length > 1) ? votes1[votes1.length - 2] : 0);
                double penultimate2 = 1e-16 + ((votes2.length > 1) ? votes2[votes2.length - 2] : 0);
                // This is equivalent to purity minus margin... yes, really... gotta love math...
                double score1 = (total1 > 0) ? first.size() * penultimate1 / total1 : 0;
                double score2 = (total2 > 0) ? second.size() * penultimate2 / total2 : 0;
                return Double.compare(score2, score1);
            }
        }); // end anonymous sort
        for (int cIdx = 0; cIdx < sortedClusters.length && subclusters.size() < maxK; cIdx++) {
            Riffle splitMe = sortedClusters[cIdx];
            if (splitMe.getPurity() > 0.9) {
                continue;
            }
            double[] votes = splitMe.getVotes();
            final double totalVotes = weka.core.Utils.sum(votes);
            final double critVotes = 1.0 / (votes.length * 2);
            if (totalVotes < 2) {
                continue;
            }
            ArrayList<Riffle> splitSet = new ArrayList<>(votes.length);
            int numberOfNewClusters = 0;
            for (int lblIdx = 0; lblIdx < votes.length; ++lblIdx) {
                double labelVote = votes[lblIdx] / totalVotes;
                if (labelVote >= critVotes) {
                    splitSet.add(this.createNewCluster(splitMe.toInstance()));
                    numberOfNewClusters++;
                } else {
                    splitSet.add(null);
                }
            }
            if (numberOfNewClusters < 2) {
                continue;
            }
            Instances extras = new Instances(splitMe.getHeader());
            for (Instance x : splitMe.instances) {
                if (x.weight() > 0.999) {
                    Riffle myHopefulCluster = splitSet.get((int) x.classValue());
                    if (myHopefulCluster != null) {
                        myHopefulCluster.instances.add(x);
                        myHopefulCluster.addLabeling((int) x.classValue(), x.weight());
                    } else {
                        extras.add(x);
                    }
                } else {
                    extras.add(x);
                }
            }
            LinkedList<Riffle> goodSet = new LinkedList<>();
            for (Riffle rfc : splitSet) {
                if (rfc == null) {
                    continue;
                }
                rfc.recomputeAll();
                goodSet.add(rfc);
                subclusters.add(rfc);
            }
            for (Instance x : extras) {
                final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(goodSet, x);
                nearestClusters[0].getCluster().instances.add(x);
            }
            subclusters.remove(splitMe);
        }
    }
    // The penultimate Expectation step
    ret = 0;
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    // See if any outliers should actually be consumed by a cluster now... (Maximization step 3)
    Iterator<Instance> xIter = potentialNovels.iterator();
    while (xIter.hasNext()) {
        Instance xOut = xIter.next();
        final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(subclusters, xOut);
        if (nearestClusters == null || nearestClusters.length < 1) {
            continue;
        }
        Riffle c = nearestClusters[0].getCluster();
        double d = nearestClusters[0].getDistance();
        if (d > c.getRadius()) {
            // Welcome home, wayward tuple!
            c.instances.add(xOut);
            xIter.remove();
        }
    }
    // And the final Expectation step
    ret = 0;
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    return ret;
}
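The comparator inside EMStep() ranks split candidates by size * runnerUpVotes / totalVotes, so large clusters whose label tallies have a strong second-place label are broken up first (the "purity minus margin" remark in the comment). That score for a single cluster, pulled out of the comparator ('cluster' is a hypothetical local name):

double[] votes = cluster.getVotes().clone();
double total = weka.core.Utils.sum(votes);
java.util.Arrays.sort(votes); // ascending; the runner-up label sits second from the end
double runnerUp = 1e-16 + ((votes.length > 1) ? votes[votes.length - 2] : 0);
double splitScore = (total > 0) ? cluster.size() * runnerUp / total : 0; // higher = split sooner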
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Wrapper for parallel K-Means for processing the warm-up data set.
 *
 * @param D Warm-up data set
 * @param K number of clusters
 * @param useLabels if true, partition the seed pool by class label
 * @return the initial cluster set
 */
protected final Set<Riffle> batchCluster(List<Instance> D, int K, boolean useLabels) {
    assert K >= 2 : "Minimum number of clusters (K) is 2";
    TreeSet<Riffle> ret = new TreeSet<>();
    TreeSet<Integer> labels = new TreeSet<>();
    TreeMap<Integer, TreeSet<Riffle>> potentialClusters = new TreeMap<>();
    LinkedList<ClusterPointPair> DSet = new LinkedList<>();
    // Create a potential cluster pool. Separate into separate pools by label if useLabels is set to true:
    for (Instance x : D) {
        int label = (useLabels) ? (int) x.classValue() : 0;
        labels.add(label);
        TreeSet<Riffle> clusterSet = potentialClusters.get(label);
        if (clusterSet == null) {
            clusterSet = new TreeSet<>();
        }
        clusterSet.add(this.createNewCluster(x));
        potentialClusters.put(label, clusterSet);
        DSet.addLast(new ClusterPointPair(x, null));
    }
    // Initialize following the K-Means++ approach:
    Riffle C = potentialClusters.firstEntry().getValue().first();
    ret.add(C);
    potentialClusters.firstEntry().getValue().remove(C);
    Iterator<Integer> labelIter = labels.iterator();
    while ((ret.size() < K) && !potentialClusters.isEmpty()) {
        if (!labelIter.hasNext()) {
            labelIter = labels.iterator(); // loop around as needed
        }
        int pseudoLabel = labelIter.next();
        TreeSet<Riffle> clusterSet = potentialClusters.get(pseudoLabel);
        if (clusterSet.isEmpty()) {
            potentialClusters.remove(pseudoLabel);
            labelIter.remove();
            continue;
        }
        NearestClusterTuple[] nearestClusters = findMostLikelyClusters(clusterSet, C.toInstance());
        if (nearestClusters.length == 0) {
            continue;
        }
        if (nearestClusters.length == 1) {
            C = nearestClusters[0].getCluster();
        } else {
            C = nearestClusters[nearestClusters.length - 1].getCluster(); // WAS BACKWARDS
        }
        ret.add(C);
        clusterSet.remove(C);
    }
    potentialClusters.clear();
    // Iterate
    final int maxIterations = 100;
    final double minDelta = 0.0001;
    int iteration = 0;
    double valIdxDelta = 1.0;
    ValIdxTupleType lastScore = null;
    while ((iteration < maxIterations) && (valIdxDelta > minDelta)) {
        iteration++;
        EMStep(DSet, ret, this.maximumNumberOfClusterSizeOption.getValue()
                - (int) (this.clustersPerLabelOption.getValue() * 0.75)); // Expectation step
        ValIdxTupleType currentScore = new ValIdxTupleType(ret);
        if (lastScore != null) {
            double diff = Math.abs(lastScore.getValIdx() - currentScore.getValIdx());
            double denominator = lastScore.getValIdx();
            valIdxDelta = (denominator == 0) ? 0.0 : Math.abs(diff / denominator);
        }
        lastScore = currentScore;
    } // end while
    return ret;
}
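Two details in the seeding loop above are easy to miss: when useLabels is true, seeds are drawn round-robin across the class labels (so every known label contributes initial clusters), and each next seed is taken from the far end of findMostLikelyClusters(), i.e., the candidate least likely under the previously chosen seed, in the spirit of K-Means++ spreading; the "// WAS BACKWARDS" comment marks a correction to that ordering.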
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Uses methodology from Kim et al., "A Novel Validity Index for Determination of the Optimal Number of Clusters".
 *
 * @param D Warm-up data set
 */
public final void initialize(List<Instance> D) {
    String ncCSVfilePrefix = "META-" + D.get(0).dataset().relationName() + "-"
            + iso8601FormatString.format(new Date());
    final boolean doMetaLog = logMetaRecordsOption.isSet();
    if (doMetaLog) {
        try {
            File ncCSVFile = new File(ncCSVfilePrefix + ".csv");
            ncCSVwriter = new BufferedWriter(new FileWriter(ncCSVFile));
            String ncCSVHeader = "" + "usize" + "," + "urad" + "," + "ctally" + "," + "cpur" + "," + "csize"
                    + "," + "cweight" + "," + "crad" + "," + "cdist" + "," + "pout" + "," + "vweight" + ","
                    + "qdmin" + "," + "qdout" + "," + "qnsc" + "," + "novel";
            ncCSVwriter.write(ncCSVHeader);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileSetupIOException) {
            System.err.println("NC-CSV meta-data file failed to open: " + fileSetupIOException.toString());
        }
    }
    knownLabels = new int[D.get(0).numClasses()];
    Arrays.fill(knownLabels, 0);
    this.numAttributes = D.get(0).numAttributes();
    universalProbabilitySums = 0;
    bestProbabilitySums = 0;
    bestProbabilityCount = 0;
    // Set up the universal set/cluster. Note that this will be crucial for subspace selection
    // (cross-entropy checks against the null hypothesis)
    double[] universalCentroid = new double[D.get(0).numAttributes()];
    double[] universalVariance = new double[D.get(0).numAttributes()];
    Arrays.fill(universalCentroid, 0);
    Arrays.fill(universalVariance, 0);
    universalCluster = new Riffle(D.get(0));
    //universalCluster.updateStrategyOption.setChosenIndex(this.updateStrategyOption.getChosenIndex());
    //universalCluster.outlierDefinitionStrategyOption.setChosenIndex(this.outlierDefinitionStrategyOption.getChosenIndex());
    universalCluster.distanceStrategyOption.setChosenIndex(this.distanceStrategyOption.getChosenIndex());
    //universalCluster.initialStandardDeviationOption.setValue(this.initialStandardDeviationOption.getValue());
    //universalCluster.alphaAdjustmentWeightOption.setValue(this.learningRateAlphaOption.getValue());
    //universalCluster.setParentClusterer(this);
    if (D.size() > 1) {
        double[] ep = new double[universalCentroid.length];
        Arrays.fill(ep, 0);
        universalCluster.setCenter(universalCentroid); // temporary - start with a standard Gaussian, updated below
        universalCluster.setVariances(universalVariance); // temporary - start with a standard Gaussian, updated below
        universalCluster.setWeight(0);
        double N = D.size();
        for (Instance x : D) { // Pre-populate the universal cluster with data points
            int y = (int) x.classValue();
            if (y < knownLabels.length) {
                knownLabels[y]++;
            }
            universalCluster.addInstance(x);
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                universalCentroid[i] += xValues[i];
            }
        }
        for (int i = 0; i < universalCentroid.length; ++i) {
            universalCentroid[i] /= N;
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure as possible,
        // so we use the two-pass method for computing sample variance (per dimension)
        for (Instance x : D) {
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                double delta = universalCentroid[i] - xValues[i];
                ep[i] += delta;
                universalVariance[i] += delta * delta;
            }
        }
        for (int i = 0; i < universalVariance.length; ++i) {
            universalVariance[i] = (universalVariance[i] - ep[i] * ep[i] / N) / (N - 1);
        }
        universalCluster.setCenter(universalCentroid);
        universalCluster.setVariances(universalVariance);
    }
    universalCluster.recompute(); // this updates entropies and such
    int numKnownLabels = 0;
    for (int y : knownLabels) {
        if (y > 0) {
            numKnownLabels++;
        }
    }
    // Ok, now let's use K-Means to find the initial cluster set
    int Cmin = this.clustersPerLabelOption.getValue() * numKnownLabels;
    int Cmax = Cmin + 1;
    if (optimizeInitialClusterNumberOption.isSet()) {
        Cmin = this.minimumNumberOfClusterSizeOption.getValue(); //Math.max(knownLabels.size(), 2);
        Cmax = Math.max(Cmin + 1, Math.min(this.clustersPerLabelOption.getValue() * numKnownLabels,
                this.maximumNumberOfClusterSizeOption.getValue()));
    }
    ArrayList<ValIdxTupleType> valIdxSet = new ArrayList<>(Cmax);
    Set<Riffle> V;
    // Create multiple hypotheses for the best K choices:
    for (int c = Cmin; c < Cmax; c++) {
        V = batchCluster(D, c, true);
        ValIdxTupleType i = new ValIdxTupleType(V);
        valIdxSet.add(i);
        if (CVI == null) {
            CVI = i;
        } else {
            CVI.setVo_min(Math.min(i.getVo(), CVI.getVo_min()));
            CVI.setVo_max(Math.max(i.getVo(), CVI.getVo_max()));
            CVI.setVu_min(Math.min(i.getVu(), CVI.getVu_min()));
            CVI.setVu_max(Math.max(i.getVu(), CVI.getVu_max()));
        }
    }
    // Normalize all:
    for (ValIdxTupleType i : valIdxSet) {
        i.setVo_min(CVI.getVo_min());
        i.setVo_max(CVI.getVo_max());
        i.setVu_min(CVI.getVu_min());
        i.setVu_max(CVI.getVu_max());
    }
    // Find the best K by finding the minimum score:
    valIdxSet.stream().filter((i) -> (i.getValIdx() < CVI.getValIdx())).forEach((i) -> {
        CVI = i;
    });
    this.clusters.clear();
    for (Riffle c : CVI.getClustering()) {
        if (c.instances == null || c.instances.isEmpty()) {
            continue;
        }
        double[] clusterCentroid = new double[universalCentroid.length];
        double[] clusterVariance = new double[universalVariance.length];
        for (Instance x : c.instances) { // compute the cluster centroid from its data points
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                clusterCentroid[i] += xValues[i] / ((double) c.instances.size());
            }
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure as possible,
        // so we use the two-pass method for computing sample variance (per dimension)
        if (c.instances.size() < 2) {
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = universalCluster.getVariances()[i] * 0.85; // statistical variance
            }
        } else {
            double n = c.instances.size();
            double[] cep = new double[universalCentroid.length];
            Arrays.fill(cep, 0);
            for (Instance x : c.instances) {
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    double delta = clusterCentroid[i] - xValues[i];
                    cep[i] += delta;
                    clusterVariance[i] += delta * delta; // statistical variance
                }
            }
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
            }
        }
        c.setCenter(clusterCentroid);
        c.setVariances(clusterVariance);
        c.recompute(); // this updates entropies and such
        for (Instance x : c.instances) {
            this.hopperCache.push(new ClusterPointPair(x, c));
        }
        this.clusters.add(c);
    }
    this.newClusterCreateCalls = 0;
    System.out.println("Starting with " + this.clusters.size() + " clusters.");
    instancesSeen = D.size();
    weightsSeen = D.size();
}
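Both variance computations in initialize() use the corrected two-pass algorithm: with deltas d = mean - x accumulated as ep = sum(d) and sumSq = sum(d*d), the per-dimension sample variance is (sumSq - ep*ep/N) / (N - 1), where the ep*ep/N term compensates for rounding error in the first pass's mean. As a standalone helper (hypothetical, for illustration only):

// Corrected two-pass sample variance for one dimension, mirroring the loops above.
static double sampleVariance(double[] values, double mean) {
    double ep = 0, sumSq = 0;
    for (double v : values) {
        double delta = mean - v;
        ep += delta; // would be exactly zero with an error-free mean
        sumSq += delta * delta;
    }
    int n = values.length;
    return (sumSq - ep * ep / n) / (n - 1); // assumes n >= 2
}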
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Train on a data instance.
 *
 * @param x instance to train on
 */
@Override
public final void trainOnInstanceImpl(Instance x) {
    safeInit(x);
    assert (x != null) : "Sieve::trainOnInstanceImpl() Training on a null instance!";
    int y = (int) x.classValue();
    if ((y > 0) && (y < knownLabels.length)) {
        knownLabels[y] += x.weight();
    }
    this.instancesSeen++;
    this.weightsSeen += x.weight();
    this.universalCluster.addInstance(x);
    final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(this.clusters, x);
    if (nearestClusters.length < 1) { // handles a weird corner case
        Riffle firstCluster = this.createNewCluster(x);
        clusters.add(firstCluster);
        System.err.println("Sieve::trainOnInstanceImpl() - no other clusters found!");
    } else {
        // Everyone takes a weight hit, and we will reward the best later...
        for (NearestClusterTuple nct : nearestClusters) {
            nct.getCluster().penalize();
        }
        NearestClusterTuple ncx = nearestClusters[0]; // for code convenience
        ClusterPointPair cxp = new ClusterPointPair(x, ncx.getCluster()); // we may change this later in the function
        if (ncx.getDistance() > ncx.getCluster().getRadius()) { // outlier
            // Hang out with the outcasts and see if you can start your own clique
            cxp.c = null;
            if (!onlyCreateNewClusterAtResyncOption.isSet()) {
                cxp.c = trainOnOutlierInstance(x, ncx);
            }
            if (cxp.c == null) {
                this.potentialNovels.add(x); // or just wait patiently for a friend to sit next to you
            }
        } else { // end if (isRadialOutlier)
            // Or join an existing club if you are in the "in" crowd...
            Riffle nc = ncx.getCluster();
            nc.reward();
            nc.trainEmbeddedClassifier(x);
            nc.addInstance(x);
        } // end else (not outlier)
        // Randomly (based on distance) cross-train other models
        for (int i = 0; i < nearestClusters.length; ++i) {
            double pTrain = ((double) nearestClusters.length - i) / (2.0 * nearestClusters.length);
            if (this.clustererRandom.nextDouble() < pTrain) {
                nearestClusters[i].getCluster().trainEmbeddedClassifier(x);
            }
        } // end for (i)
        hopperCache.addLast(cxp);
    } // corner case safety
    periodicResync();
}
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * Temporary function for algorithm analysis.
 */
private void debugMetrics(double qNSC, double qDout, double qDmin, double dist, double rawTally, Instance x,
        Riffle c) {
    if (this.logMetaRecordsOption.isSet()) {
        try {
            int groundTruth = (int) x.classValue();
            boolean isTrueNovel = (groundTruth > 0) && (groundTruth < knownLabels.length)
                    && (knownLabels[groundTruth] < (this.minimumClusterSizeOption.getValue()));
            String ncCSVLine = "" + universalCluster.size() + "," + universalCluster.getRadius() + ","
                    + rawTally + "," + c.getPurity() + "," + c.size() + "," + c.getWeight() + ","
                    + c.getRadius() + "," + dist + "," + (c.isOutlier(x) ? 1 : 0) + "," + x.weight() + ","
                    + qDmin + "," + qDout + "," + qNSC + "," + isTrueNovel;
            ncCSVwriter.write(ncCSVLine);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileIoExcption) {
            System.err.println("Could not write NC CSV line: " + fileIoExcption.toString());
        }
    }
}
From source file:moa.clusterer.outliers.Sieve.java
License:Apache License
/**
 * @return training accuracy
 */
private double trainPerceptron() {
    // Train the perceptron from the warm-up phase clustering
    final int epochs = 20;
    final int numberOfPerceptrons = 1;
    final int MEMBER = 0;
    final int OUTLIER = 1;
    double accuracySum = 0;
    double accuracyCount = 0;
    this.outlierPerceptronTrainingSet.clear();
    Random rng = new Random(this.randomSeed);
    // Generate training set
    for (Riffle thisCluster : this.clusters) {
        for (Instance x : thisCluster.getHeader()) {
            Instance pseudoPt = makePerceptronInstance(thisCluster, x);
            for (Riffle thatCluster : this.clusters) {
                double groundTruth = (thisCluster == thatCluster) ? MEMBER : OUTLIER;
                pseudoPt.setClassValue(groundTruth);
                this.outlierPerceptronTrainingSet.add(pseudoPt);
            }
        }
    }
    for (Instance x : this.outlierPerceptronTrainingSet) {
        x.setWeight(1.0 / this.outlierPerceptronTrainingSet.numInstances());
    }
    // Boost it
    this.perceptrons = new Perceptron[numberOfPerceptrons];
    this.pweights = new double[numberOfPerceptrons];
    for (int perceptronIdx = 0; perceptronIdx < numberOfPerceptrons; ++perceptronIdx) {
        // Discover a new weak learner
        Perceptron candidatePerceptron = new Perceptron();
        candidatePerceptron.prepareForUse();
        candidatePerceptron.learningRatioOption.setValue(rng.nextDouble() * 0.9 + 0.1);
        for (int epoch = 0; epoch < epochs; epoch++) {
            for (Instance x : this.outlierPerceptronTrainingSet) {
                if ((rng.nextDouble() / this.outlierPerceptronTrainingSet.numInstances()) < x.weight()) { // weighted subsampling
                    candidatePerceptron.trainOnInstance(x);
                }
            }
        } // end epochs
        // Evaluate the weak learner
        double errorFunctionSum = 0;
        double weightSum = 0;
        for (Instance x : this.outlierPerceptronTrainingSet) {
            if (!candidatePerceptron.correctlyClassifies(x)) {
                errorFunctionSum += x.weight();
            }
        }
        // Adjust training weights
        for (Instance x : this.outlierPerceptronTrainingSet) {
            double newWeight = x.weight();
            if (candidatePerceptron.correctlyClassifies(x)) {
                newWeight *= errorFunctionSum / (1.0 - errorFunctionSum);
                if (Double.isNaN(newWeight)) {
                    newWeight = weka.core.Utils.SMALL;
                }
                x.setWeight(newWeight);
            }
            weightSum += newWeight;
        }
        // Normalize
        for (Instance x : this.outlierPerceptronTrainingSet) {
            x.setWeight(x.weight() / weightSum);
        }
        // Add to the ensemble
        double newPerceptronWeight = Math.log((1 - errorFunctionSum) / errorFunctionSum);
        this.perceptrons[perceptronIdx] = candidatePerceptron;
        this.pweights[perceptronIdx] = newPerceptronWeight;
    } // end numPerceptrons
    // Check training error
    accuracySum = 0;
    accuracyCount = 0;
    for (Instance x : this.outlierPerceptronTrainingSet) {
        if (this.getPerceptronVotesForOutlierStatus(x) == x.classValue()) {
            accuracySum++;
        }
        accuracyCount++;
    }
    double trainingAccuracy = (accuracyCount > 0) ? (accuracySum / accuracyCount) : 0.0;
    this.outlierPerceptronTrainingSet.clear();
    return trainingAccuracy;
}
From source file:moa.clusterers.outliers.AnyOut.util.DataObject.java
License:Apache License
/**
 * Standard constructor for <code>DataObject</code>.
 * @param idCounter The id for the <code>DataObject</code>.
 * @param inst The <code>Instance</code> whose features and class label back this <code>DataObject</code>.
 */
public DataObject(int idCounter, Instance inst) {
    this.id = idCounter;
    this.inst = inst;
    this.features = inst.toDoubleArray();
    this.classLabel = (int) inst.classValue();
    this.isOutiler = false;
}
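A hedged usage sketch for the constructor above, assuming a weka.core.Instances set named data with its class index already assigned (ids here are just running counters):

// Wrap each labeled instance for AnyOut processing.
for (int i = 0; i < data.numInstances(); ++i) {
    DataObject obj = new DataObject(i, data.instance(i));
}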