Example usage for weka.core Instance classValue

List of usage examples for weka.core Instance classValue

Introduction

On this page you can find example usage for weka.core Instance classValue.

Prototype

public double classValue();

Document

Returns an instance's class value as a floating-point number.
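
For orientation, here is a minimal, hypothetical sketch of the call (the ARFF path and class/variable names are placeholders, not taken from the examples below). For a nominal class attribute, the returned double is the 0-based index of the class label, so it is commonly cast to int:

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ClassValueDemo {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff");        // placeholder path
        data.setClassIndex(data.numAttributes() - 1);         // use the last attribute as the class
        Instance first = data.instance(0);
        double y = first.classValue();                        // class value as a double
        String label = data.classAttribute().value((int) y);  // recover the nominal label text
        System.out.println(y + " -> " + label);
    }
}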

Usage

From source file:moa.clusterer.FeS2.java

License:Apache License

/**
 *
 * @param x instance to train on
 */
@Override
public void trainOnInstanceImpl(Instance x) {
    safeInit(x);
    assert (x != null) : "FeS2::trainOnInstanceImpl() Training on a null instance!";
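    // classValue() returns the class as a double; for a nominal class attribute this is the 0-based label index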
    int classValue = (int) x.classValue();
    boolean isNewLabel = (!knownLabels.contains(classValue)) && (x.weight() > 0);
    if (x.weight() > 0) {
        this.knownLabels.add(classValue);
    }
    this.universalCluster.addInstance(x);
    // Find nearest Cluster
    final SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(this.clusters, x);
    assert !nearestClusters.isEmpty() : "Cluster set for probability matching is empty";

    // Compute some base metrics we need to know:
    double maxRadius = 0;
    double avgRadius = 0;
    boolean unanimousOutlier = true;
    double weightTotal = 0;
    double minWeight = Double.MAX_VALUE;
    for (NearestClusterTuple nct : nearestClusters) {
        unanimousOutlier = unanimousOutlier && nct.getCluster().isOutlier(x);
        maxRadius = Math.max(maxRadius, nct.getCluster().getRadius());
        avgRadius += nct.getCluster().getRadius();
    }
    avgRadius /= nearestClusters.size();

    // Update weights
    for (NearestClusterTuple nct : nearestClusters) {
        Riffle c = nct.getCluster();
        c.penalize(); // unilaterally reduce weights
        int clusterMajorityClass = weka.core.Utils.maxIndex(c.getVotes());
        // increase weights for matches (define 'match' criteria by strategy parameter)
        switch (this.positiveClusterFeedbackStrategyOption.getChosenIndex()) {
        case 0: // only the closest
            if (!unanimousOutlier && c == nearestClusters.last().getCluster()) {
                addToCluster(x, c);
            }
            break;
        case 1: // All label matches
            // This ternary condition is very important for results
            int hypothesisClass = (x.weight() > 0) ? classValue
                    : weka.core.Utils.maxIndex(this.getVotesForInstance(x));
            if (clusterMajorityClass == hypothesisClass) {
                addToCluster(x, c);
            }
            break;
        case 2: // All proximity matches
            if (!nct.getCluster().isOutlier(x)) {
                addToCluster(x, c);
            }
            break;
        default:
            break;
        } //end switch
        weightTotal += c.getWeight();
        minWeight = Math.min(minWeight, c.getWeight());
    }

    // Sort by (weight / sigma)
    Riffle[] sortedClusters = new Riffle[clusters.size()];
    int i = 0;
    for (Riffle c : clusters) {
        sortedClusters[i++] = c;
    }
    // Kudos to Java 8 and lambda expressions for making this a one-liner:
    Arrays.parallelSort(sortedClusters,
            (Riffle a, Riffle b) -> Double.compare(a.getWeight() / Math.max(a.getRadius(), 1e-96),
                    b.getWeight() / Math.max(b.getRadius(), 1e-96)));
    boolean atClusterCapacity = (this.clusters.size() >= Math.min(
            this.clustersPerLabelOption.getValue() * this.knownLabels.size(),
            this.maximumNumberOfClusterSizeOption.getValue()));

    // * * *
    //
    // Results show that when average P(x|k) < Chauvenet, no new clusters are created, and vice versa (which is the opposite of the expected behavior)
    //
    // * * *
    boolean universalOutlier = this.universalCluster.isOutlier(x);
    if (isNewLabel) {
        newLabelCount++;
    }
    if (universalOutlier) {
        universalOutlierCount++;
    }
    if (unanimousOutlier) {
        unanimousOutlierCount++;
    }
    // If we have no matches at all, then the weakest cluster is replaced by a new one with a high variance and low weight
    //if (isNewLabel || (unanimousOutlier && universalOutlier)) {   
    if (isNewLabel || unanimousOutlier) {
        Riffle weakestLink = sortedClusters[sortedClusters.length - 1]; // get last one
        Riffle novelCluster = this.createNewCluster(x);
        //novelCluster.setRadius((avgRadius + maxRadius) / 2.0); // Set to half-way between average and max radius
        novelCluster.setWeight(weightTotal / nearestClusters.size()); // <---- Validate this ------
        weightTotal += novelCluster.getWeight(); // update for new normalization factor
        // You are the weakest link... Goodbye
        if (atClusterCapacity) {
            weightTotal -= weakestLink.getWeight(); // update for new normalization factor
            this.clusters.remove(weakestLink);
        }
        // Everyone please welcome our newest contestant...
        clusters.add(novelCluster);
    }

    // Normalize Weights and Update variance estimates for singleton clusters
    double[] universeVariance = universalCluster.getVariances();
    double[] initialVariance = new double[universeVariance.length];
    for (int j = 0; j < initialVariance.length; ++j) {
        initialVariance[j] = universeVariance[j] * 0.85;
    }
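    // guard against a zero or negative total before it is used as a normalization divisor below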
    if (weightTotal <= 0) {
        weightTotal = 1;
    }
    for (Riffle c : this.clusters) {
        if (c.size() < 2) {
            c.setVariances(initialVariance);
        }
        c.setWeight(c.getWeight() / weightTotal);
    }
}

From source file:moa.clusterer.FeS2.java

License:Apache License

/**
 * Find the nearest cluster, and use its most frequent label.
 * If the nearest cluster has no label, then we have a novel cluster;
 * unless the data point is an outlier to all clusters, in which case it is just an outlier.
 * @param inst instance to classify
 * @return vote array (per-class votes plus novel-class and outlier entries)
 */
@Override
public double[] getVotesForInstance(Instance inst) {
    assert (this.universalCluster != null) : "FeS2::getVotesForInstance() called without any initialization or training!";
    int novelClassLabel = inst.numClasses();
    int outlierLabel = novelClassLabel + 1;
    double[] votes = new double[inst.numClasses() + 2];
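    // votes layout: indices 0..numClasses-1 hold per-class votes, index numClasses flags a novel class, index numClasses+1 flags an outlier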
    if (this.clusters.isEmpty()) {
        return votes;
    }
    double[] cumulativeVotes = new double[inst.numClasses()];
    double[] cumulativeVotes_p = new double[inst.numClasses()];
    double[] cumulativeVotes_pw = new double[inst.numClasses()];
    double[] cumulativeVotes_n = new double[inst.numClasses()];
    double[] cumulativeVotes_np = new double[inst.numClasses()];
    double[] cumulativeVotes_npw = new double[inst.numClasses()];
    double[] cumulativeWinnerTakesAllVotes = new double[inst.numClasses()];
    Arrays.fill(votes, 0.0);
    Arrays.fill(cumulativeVotes, 0.0);
    Arrays.fill(cumulativeVotes_p, 0.0);
    Arrays.fill(cumulativeVotes_pw, 0.0);
    Arrays.fill(cumulativeVotes_n, 0.0);
    Arrays.fill(cumulativeVotes_np, 0.0);
    Arrays.fill(cumulativeVotes_npw, 0.0);
    Arrays.fill(cumulativeWinnerTakesAllVotes, 0.0);

    final int TRUE_CLASS = (int) inst.classValue(); // for debug watch windows only
    final SortedSet<NearestClusterTuple> nearestClusters = findMostLikelyClusters(this.clusters, inst);
    boolean memberOfAtLeastOneTrueCluster = false;
    boolean universalOutlier = true;
    double bestProbability = 0;
    double universalProbability = this.universalCluster.getInclusionProbability(inst);

    NearestClusterTuple bestMatchCluster = null;

    // Gather data
    for (NearestClusterTuple nct : nearestClusters) {
        double p = nct.getDistance();
        boolean localOutlier = nct.getCluster().isOutlier(inst);
        memberOfAtLeastOneTrueCluster = memberOfAtLeastOneTrueCluster
                || (!localOutlier && nct.getCluster().size() > this.minimumClusterSizeOption.getValue());
        universalOutlier = universalOutlier && localOutlier;
        bestProbability = Math.max(p, bestProbability);
        if (p <= 0) {
            continue;
        }
        int localWinner = (int) nct.getCluster().getGroundTruth();
        cumulativeWinnerTakesAllVotes[localWinner] += p;
        double clusterVotes[] = nct.getCluster().getVotes();
        double clusterNormalizedVotes[] = nct.getCluster().getVotes().clone();
        if (weka.core.Utils.sum(clusterNormalizedVotes) > 0) {
            weka.core.Utils.normalize(clusterNormalizedVotes);
        }
        for (int i = 0; i < clusterVotes.length; ++i) {
            cumulativeVotes[i] += clusterVotes[i];
            cumulativeVotes_p[i] += clusterVotes[i] * p;
            cumulativeVotes_pw[i] += clusterVotes[i] * p * nct.getCluster().getWeight();
            cumulativeVotes_n[i] += clusterNormalizedVotes[i];
            cumulativeVotes_np[i] += clusterNormalizedVotes[i] * p;
            cumulativeVotes_npw[i] += clusterNormalizedVotes[i] * p * nct.getCluster().getWeight();
        }
        if (!localOutlier) {
            bestMatchCluster = nct;
        }
    } // end for

    universalProbabilitySums += universalProbability;
    bestProbabilitySums += bestProbability;
    bestProbabilityCount += 1;

    if (nearestClusters.isEmpty()) {
        votes[outlierLabel] = 1.0;
    } else {
        if (weka.core.Utils.sum(cumulativeVotes) > 0) {
            weka.core.Utils.normalize(cumulativeVotes);
        }
        if (weka.core.Utils.sum(cumulativeVotes_p) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_p);
        }
        if (weka.core.Utils.sum(cumulativeVotes_pw) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_pw);
        }
        if (weka.core.Utils.sum(cumulativeVotes_n) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_n);
        }
        if (weka.core.Utils.sum(cumulativeVotes_np) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_np);
        }
        if (weka.core.Utils.sum(cumulativeVotes_npw) > 0) {
            weka.core.Utils.normalize(cumulativeVotes_npw);
        }
        if (weka.core.Utils.sum(cumulativeWinnerTakesAllVotes) > 0) {
            weka.core.Utils.normalize(cumulativeWinnerTakesAllVotes);
        }
        switch (this.votingStrategyOption.getChosenIndex()) {
        case 0: // 1-NN - usually not the strongest
            double[] nearestNeighborVotes = nearestClusters.last().getCluster().getVotes();
            for (int i = 0; i < nearestNeighborVotes.length; ++i) {
                votes[i] = nearestNeighborVotes[i];
            }
            break;
        case 1: // Global  k-NN - this is a poor performer
            for (int i = 0; i < cumulativeVotes.length; ++i) {
                votes[i] = cumulativeVotes[i];
            }
            break;
        case 2: // Globally probability-weighted k-NN - good, but biased towards heavy clusters
            for (int i = 0; i < cumulativeVotes_p.length; ++i) {
                votes[i] = cumulativeVotes_p[i];
            }
            break;
        case 3: // Globally probability-utility-weighted k-NN - good, but overly complex
            for (int i = 0; i < cumulativeVotes_pw.length; ++i) {
                votes[i] = cumulativeVotes_pw[i];
            }
            break;
        case 4: // Globally normalized k-NN - this is also usually a really really poor performer. Don't use it
            for (int i = 0; i < cumulativeVotes_n.length; ++i) {
                votes[i] = cumulativeVotes_n[i];
            }
            break;
        case 5: // Globally normalized probability-weighted k-NN - a safe bet
            for (int i = 0; i < cumulativeVotes_np.length; ++i) {
                votes[i] = cumulativeVotes_np[i];
            }
            break;
        case 6: // Globally normalized probability-utility-weighted k-NN - default and preferred method
            for (int i = 0; i < cumulativeVotes_npw.length; ++i) {
                votes[i] = cumulativeVotes_npw[i];
            }
            break;
        case 7: // Globally weighted k-NN winner take all per cluster - Can avoid noise, but not usually the best
        default:
            for (int i = 0; i < cumulativeWinnerTakesAllVotes.length; ++i) {
                votes[i] = cumulativeWinnerTakesAllVotes[i];
            }
        } // end switch
        double voteAccumulator = 0;
        for (double v : votes) {
            voteAccumulator += v;
        }
        // A novel cluster is one of sufficient size but no label
        if ((bestMatchCluster != null) // It matches a cluster
                && (bestMatchCluster.getCluster().size() > this.minimumClusterSizeOption.getValue()) // that is overall large enough
                && (bestMatchCluster.getCluster().getNumLabeledPoints() < 1)) { // but without labels
            votes[novelClassLabel] = 1.0;
        }
        // outlier detection
        if (universalOutlier) {
            int maxIdx = weka.core.Utils.maxIndex(votes);
            if (maxIdx < 0) {
                maxIdx = 0;
            }
            double outlierValue = votes[maxIdx];
            if (outlierValue <= 0) {
                votes[novelClassLabel] = 1.0; // special case of novelty when we have absolutely no clue how to label an outlier
                outlierValue = 1e-16;
            }
            votes[outlierLabel] = outlierValue / 2.0; //Math.max(Math.abs(1.0 - bestProbability), Math.abs(1.0 - universalProbability));
        }
    } // end if (nearestClusters not empty)
    return votes;
}
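
Since the returned vote vector is two entries longer than the class count, callers have to treat the trailing slots specially. A minimal, hypothetical sketch of consuming such a vector (the learner variable is illustrative, not part of the FeS2 API):

double[] votes = learner.getVotesForInstance(inst);
int prediction = weka.core.Utils.maxIndex(votes);
if (prediction == inst.numClasses()) {
    // the novel-class slot won: inst matched a sufficiently large but unlabeled cluster
} else if (prediction == inst.numClasses() + 1) {
    // the outlier slot won: inst fell outside every known cluster
} else {
    // ordinary class prediction: prediction is the nominal class index
}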

From source file:moa.clusterer.FeS2.java

License:Apache License

/**
 * @return training accuracy
 */
private double trainPerceptron() {
    // Train the perceptron from warmup phase clustering 
    final int epochs = 20;
    final int numberOfPerceptrons = 10;
    final int MEMBER = 0;
    final int OUTLIER = 1;
    double accuracySum = 0;
    double accuracyCount = 0;
    this.outlierPerceptronTrainingSet.clear();
    Random rng = new Random(this.randomSeed);

    // Generate training set
    for (Riffle thisCluster : this.clusters) {
        for (Riffle thatCluster : this.clusters) {
            double groundTruth = (thisCluster == thatCluster) ? MEMBER : OUTLIER;
            for (Instance x : thatCluster.getHeader()) {
                Instance pseudoPt = makePerceptronInstance(thisCluster, x);
                pseudoPt.setClassValue(groundTruth);
                this.outlierPerceptronTrainingSet.add(pseudoPt);
            }
        }
    }
    this.outlierPerceptronTrainingSet.parallelStream().forEach((x) -> {
        x.setWeight(1.0 / this.outlierPerceptronTrainingSet.numInstances());
    });

    // Boost it
    this.perceptrons = new Perceptron[numberOfPerceptrons];
    this.pweights = new double[numberOfPerceptrons];
    for (int perceptronIdx = 0; perceptronIdx < numberOfPerceptrons; ++perceptronIdx) {
        // Discover new weak learner
        Perceptron candidatePerceptron = new Perceptron();
        candidatePerceptron.prepareForUse();
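        // draw a random learning rate uniformly from [0.1, 1.0)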
        candidatePerceptron.learningRatioOption.setValue(rng.nextDouble() * 0.9 + 0.1);
        for (int epoch = 0; epoch < epochs; epoch++) {
            for (Instance x : this.outlierPerceptronTrainingSet) {
                if ((rng.nextDouble() / this.outlierPerceptronTrainingSet.numInstances()) < x.weight()) { // weighted subsampling
                    candidatePerceptron.trainOnInstance(x);
                }
            }
        } //end epochs
          // Evaluate weak learner
        double errorFunctionSum = 0;
        double weightSum = 0;
        for (Instance x : this.outlierPerceptronTrainingSet) {
            if (!candidatePerceptron.correctlyClassifies(x)) {
                errorFunctionSum += x.weight();
            }
        }
        // adjust training weights
        for (Instance x : this.outlierPerceptronTrainingSet) {
            double newWeight = x.weight();
            if (candidatePerceptron.correctlyClassifies(x)) {
                newWeight *= errorFunctionSum / (1.0 - errorFunctionSum);
                if (Double.isNaN(newWeight)) {
                    newWeight = weka.core.Utils.SMALL;
                }
                x.setWeight(newWeight);
            }
            weightSum += newWeight;
        }
        // Normalize
        for (Instance x : this.outlierPerceptronTrainingSet) {
            x.setWeight(x.weight() / weightSum);
        }
        // Add to ensemble
        double newPerceptronWeight = Math.log((1 - errorFunctionSum) / errorFunctionSum);

        this.perceptrons[perceptronIdx] = candidatePerceptron;
        this.pweights[perceptronIdx] = newPerceptronWeight;
    } // end numPerceptrons

    // Check training error
    accuracySum = 0;
    accuracyCount = 0;
    for (Instance x : this.outlierPerceptronTrainingSet) {
        if (this.getPerceptronVotesForOutlierStatus(x) == x.classValue()) {
            accuracySum++;
        }
        accuracyCount++;
    }
    double trainingAccuracy = (accuracyCount > 0) ? (accuracySum / accuracyCount) : 0.0;
    this.outlierPerceptronTrainingSet.clear();
    return trainingAccuracy;
}
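
For reference (a property of the boosting scheme, not a quote from the source): the loop above implements the standard AdaBoost update. With round error $\varepsilon = \sum_{i\,\text{misclassified}} w_i$, each correctly classified weight is scaled by $\varepsilon/(1-\varepsilon)$, all weights are renormalized to sum to one, and the new perceptron joins the ensemble with weight $\alpha = \ln\!\left(\frac{1-\varepsilon}{\varepsilon}\right)$.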

From source file:moa.clusterer.outliers.Sieve.java

License:Apache License

/**
 * This is not your grandpa's E-M algorithm... it has multiple mini-steps,
 * but "The e1-m1-e2-m2-e3-m3-Algorithm" is a mouthful, so we just call it *-Means Clustering
 * {Pronounced "Any-means (necessary) clustering"}
 * @param D list of point/cluster assignment pairs to process
 * @param subclusters current set of clusters to assign points to
 * @param maxK maximum number of clusters allowed
 * @return score at the end of the process
 */
protected final double EMStep(List<ClusterPointPair> D, Collection<Riffle> subclusters, int maxK) {
    double ret = 0;
    // clear the palette
    for (Riffle c : subclusters) {
        if (c.instances == null) {
            c.instances = c.getHeader();
        }
        c.instances.clear();
        c.cleanTallies();
    }

    // Assign by X's to nearest clusters (Maximization step 1)
    for (ClusterPointPair cxp : D) {
        if (this.potentialNovels.contains(cxp.x)) { // could also be if cxp.c == null, but this is safer
            continue; // ignore the outliers for a moment
        }
        final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(subclusters, cxp.x);
        //            double ds[] = new double[nearestClusters.length];
        //            int foo = 0;
        //            for(NearestClusterTuple gnarf : nearestClusters) {
        //                ds[foo++] = gnarf.getDistance();
        //            }

        cxp.c = nearestClusters[0].getCluster();

        nearestClusters[0].getCluster().instances.add(cxp.x);
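        // a weight above 0.99 is treated as a reliably labeled point, so its class is tallied for the cluster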
        if (cxp.x.weight() > 0.99) {
            nearestClusters[0].getCluster().addLabeling((int) cxp.x.classValue(), cxp.x.weight());
        }
    }

    // Find new radius (Expectation step)
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }

    // Remove empty clusters to make room for splits (Expectation-ish)
    Iterator<Riffle> cIter = subclusters.iterator();
    while (cIter.hasNext()) {
        Riffle rc = cIter.next();
        if (rc.instances.size() < 1) {
            cIter.remove();
        }
    }

    // Are we full?
    if (subclusters.size() < maxK) {
        // Fix bad clusters (Maximization step 2 - breaking up noisy clusters)
        Riffle sortedClusters[] = new Riffle[subclusters.size()];
        int tmpIdx = 0;
        for (Riffle tmpRfl : subclusters) {
            if (tmpIdx >= sortedClusters.length) {
                break;
            }
            sortedClusters[tmpIdx] = tmpRfl;
            tmpIdx++;
        }
        Arrays.sort(sortedClusters, new Comparator<Riffle>() {
            @Override
            public int compare(Riffle first, Riffle second) {
                if (first == null) {
                    return 1;
                }
                if (second == null) {
                    return -1;
                }
                double[] votes1 = first.getVotes().clone();
                double[] votes2 = second.getVotes().clone();
                double total1 = weka.core.Utils.sum(votes1);
                double total2 = weka.core.Utils.sum(votes2);
                Arrays.sort(votes1);
                Arrays.sort(votes2);
                double penultimate1 = 1e-16 + ((votes1.length > 1) ? votes1[votes1.length - 2] : 0);
                double penultimate2 = 1e-16 + ((votes2.length > 1) ? votes2[votes2.length - 2] : 0);
                // this is equivalent to purity - margin... yea... really... it's awesome... gotta love math...
                double score1 = (total1 > 0) ? first.size() * penultimate1 / total1 : 0;
                double score2 = (total2 > 0) ? second.size() * penultimate2 / total2 : 0;
                return Double.compare(score2, score1);
            }
        }); // end Anon sort
        for (int cIdx = 0; cIdx < sortedClusters.length && subclusters.size() < maxK; cIdx++) {
            Riffle splitMe = sortedClusters[cIdx];
            if (splitMe.getPurity() > 0.9) {
                continue;
            }
            double[] votes = splitMe.getVotes();
            final double totalVotes = weka.core.Utils.sum(votes);
            final double critVotes = 1.0 / (votes.length * 2);
            if (totalVotes < 2) {
                continue;
            }
            ArrayList<Riffle> splitSet = new ArrayList<>(votes.length);
            int numberOfNewClusters = 0;
            for (int lblIdx = 0; lblIdx < votes.length; ++lblIdx) {
                double labelVote = votes[lblIdx] / totalVotes;
                if (labelVote >= critVotes) {
                    splitSet.add(this.createNewCluster(splitMe.toInstance()));
                    numberOfNewClusters++;
                } else {
                    splitSet.add(null);
                }
            }
            if (numberOfNewClusters < 2) {
                continue;
            }
            Instances extras = new Instances(splitMe.getHeader());
            for (Instance x : splitMe.instances) {
                if (x.weight() > 0.999) {
                    Riffle myHopefulCluster = splitSet.get((int) x.classValue());
                    if (myHopefulCluster != null) {
                        myHopefulCluster.instances.add(x);
                        myHopefulCluster.addLabeling((int) x.classValue(), x.weight());
                    } else {
                        extras.add(x);
                    }
                } else {
                    extras.add(x);
                }
            }
            LinkedList<Riffle> goodSet = new LinkedList<>();
            for (Riffle rfc : splitSet) {
                if (rfc == null) {
                    continue;
                }
                rfc.recomputeAll();
                goodSet.add(rfc);
                subclusters.add(rfc);
            }
            for (Instance x : extras) {
                final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(goodSet, x);
                nearestClusters[0].getCluster().instances.add(x);
            }
            subclusters.remove(splitMe);
        }
    }

    // The penultimate Expectation step
    ret = 0;
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }

    // See if any outliers should actually be consumed by a cluster now... (Maximization step 3)
    Iterator<Instance> xIter = potentialNovels.iterator();
    while (xIter.hasNext()) {
        Instance xOut = xIter.next();
        final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(subclusters, xOut);
        if (nearestClusters == null || nearestClusters.length < 1) {
            continue;
        }
        Riffle c = nearestClusters[0].getCluster();
        double d = nearestClusters[0].getDistance();
        if (d > c.getRadius()) { // Welcome home wayward tuple!
            c.instances.add(xOut);
            xIter.remove();
        }
    }

    // And the final Expectation step
    ret = 0;
    for (Riffle c : subclusters) {
        ret += c.recomputeAll();
    }
    return ret;
}

From source file:moa.clusterer.outliers.Sieve.java

License:Apache License

/**
 * Wrapper for parallel K-Means for processing warm-up data set
 *
 * @param D Warm-up data set
 * @param K number of clusters
 * @param useLabels if true, use class labels to seed a separate cluster pool per label
 * @return the initial set of clusters
 */
protected final Set<Riffle> batchCluster(List<Instance> D, int K, boolean useLabels) {
    assert K >= 2 : "Minimum number of clusters (K) is 2";
    TreeSet<Riffle> ret = new TreeSet<>();
    TreeSet<Integer> labels = new TreeSet<>();
    TreeMap<Integer, TreeSet<Riffle>> potentialClusters = new TreeMap<>();
    LinkedList<ClusterPointPair> DSet = new LinkedList<>();
    //Create a potential cluster pool. Separate into separate pools by label if useLabels is set to true:
    for (Instance x : D) {
        int label = (useLabels) ? (int) x.classValue() : 0;
        labels.add(label);
        TreeSet<Riffle> clusterSet = potentialClusters.get(label);
        if (clusterSet == null) {
            clusterSet = new TreeSet<>();
        }
        clusterSet.add(this.createNewCluster(x));
        potentialClusters.put(label, clusterSet);
        DSet.addLast(new ClusterPointPair(x, null));
    }

    // Initialize following the K-Means++ approach:
    Riffle C = potentialClusters.firstEntry().getValue().first();
    ret.add(C);
    potentialClusters.firstEntry().getValue().remove(C);

    Iterator<Integer> labelIter = labels.iterator();
    while ((ret.size() < K) && !potentialClusters.isEmpty()) {
        if (!labelIter.hasNext()) {
            labelIter = labels.iterator();
        } // loop around as needed
        int pseudoLabel = labelIter.next();
        TreeSet<Riffle> clusterSet = potentialClusters.get(pseudoLabel);
        if (clusterSet.isEmpty()) {
            potentialClusters.remove(pseudoLabel);
            labelIter.remove();
            continue;
        }
        NearestClusterTuple[] nearestClusters = findMostLikelyClusters(clusterSet, C.toInstance());
        if (nearestClusters.length == 0) {
            continue;
        }
        if (nearestClusters.length == 1) {
            C = nearestClusters[0].getCluster();
        } else {
            C = nearestClusters[nearestClusters.length - 1].getCluster(); // WAS BACKWARDS
        }
        ret.add(C);
        clusterSet.remove(C);
    }
    potentialClusters.clear();

    // Iterate 
    final int maxIterations = 100;
    final double minDelta = 0.0001;
    int iteration = 0;
    double valIdxDelta = 1.0;
    ValIdxTupleType lastScore = null;
    while ((iteration < maxIterations) && (valIdxDelta > minDelta)) {
        iteration++;

        EMStep(DSet, ret, this.maximumNumberOfClusterSizeOption.getValue()
                - (int) (this.clustersPerLabelOption.getValue() * 0.75)); // Expectation Step

        ValIdxTupleType currentScore = new ValIdxTupleType(ret);
        if (lastScore != null) {
            double diff = Math.abs(lastScore.getValIdx() - currentScore.getValIdx());
            double denominator = lastScore.getValIdx();
            valIdxDelta = (denominator == 0) ? 0.0 : Math.abs(diff / denominator);
        }
        lastScore = currentScore;
    } // end while
    return ret;
}

From source file:moa.clusterer.outliers.Sieve.java

License:Apache License

/**
 * Uses methodology from Kim et al. "A Novel Validity Index for Determination of the Optimal Number of Clusters"
 *
 * @param D Warm-up data set
 */
public final void initialize(List<Instance> D) {
    String ncCSVfilePrefix = "META-" + D.get(0).dataset().relationName() + "-"
            + iso8601FormatString.format(new Date());
    final boolean doMetaLog = logMetaRecordsOption.isSet();
    if (doMetaLog) {
        try {
            File ncCSVFile = new File(ncCSVfilePrefix + ".csv");
            ncCSVwriter = new BufferedWriter(new FileWriter(ncCSVFile));
            String ncCSVHeader = "" + "usize" + "," + "urad" + "," + "ctally" + "," + "cpur" + "," + "csize"
                    + "," + "cweight" + "," + "crad" + "," + "cdist" + "," + "pout" + "," + "vweight" + ","
                    + "qdmin" + "," + "qdout" + "," + "qnsc" + "," + "novel";
            ncCSVwriter.write(ncCSVHeader);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileSetupIOException) {
            System.err.println("NC-CSV meta-data file failed to open: " + fileSetupIOException.toString());
        }
    }
    knownLabels = new int[D.get(0).numClasses()];
    Arrays.fill(knownLabels, 0);
    this.numAttributes = D.get(0).numAttributes();
    universalProbabilitySums = 0;
    bestProbabilitySums = 0;
    bestProbabilityCount = 0;
    // Setup the universal set/cluster. Note that this will be crucial for subspace selection (cross-entropy checks against null hypothesis)
    double[] universalCentroid = new double[D.get(0).numAttributes()];
    double[] universalVariance = new double[D.get(0).numAttributes()];
    Arrays.fill(universalCentroid, 0);
    Arrays.fill(universalVariance, 0);
    universalCluster = new Riffle(D.get(0));
    //universalCluster.updateStrategyOption.setChosenIndex(this.updateStrategyOption.getChosenIndex());
    //universalCluster.outlierDefinitionStrategyOption.setChosenIndex(this.outlierDefinitionStrategyOption.getChosenIndex());
    universalCluster.distanceStrategyOption.setChosenIndex(this.distanceStrategyOption.getChosenIndex());
    //universalCluster.initialStandardDeviationOption.setValue(this.initialStandardDeviationOption.getValue());
    //universalCluster.alphaAdjustmentWeightOption.setValue(this.learningRateAlphaOption.getValue());
    //universalCluster.setParentClusterer(this);
    if (D.size() > 1) {
        double[] ep = new double[universalCentroid.length];
        Arrays.fill(ep, 0);
        universalCluster.setCenter(universalCentroid); // temporary - start with standard gaussian, gets updated below
        universalCluster.setVariances(universalVariance); // temporary - start with standard gaussian, will update below
        universalCluster.setWeight(0);
        double N = D.size();
        for (Instance x : D) { // Pre-populate universal cluster with data points
            int y = (int) x.classValue();
            if (y < knownLabels.length) {
                knownLabels[y]++;
            }
            universalCluster.addInstance(x);
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                universalCentroid[i] += xValues[i];
            }
        }
        for (int i = 0; i < universalCentroid.length; ++i) {
            universalCentroid[i] /= N;
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure as possible, so
        // we use the 2-Pass method for computing sample variance (per dimension)
        for (Instance x : D) {
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                double delta = universalCentroid[i] - xValues[i];
                ep[i] += delta;
                universalVariance[i] += delta * delta;
            }
        }
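        // corrected two-pass estimator: var = (sum(delta^2) - (sum(delta))^2 / N) / (N - 1); the ep sum cancels accumulated round-off error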
        for (int i = 0; i < universalVariance.length; ++i) {
            universalVariance[i] = (universalVariance[i] - ep[i] * ep[i] / N) / (N - 1);
        }
        universalCluster.setCenter(universalCentroid); // final centroid from the first pass
        universalCluster.setVariances(universalVariance);
    }
    universalCluster.recompute(); // this updates entropies and such
    int numKnownLabels = 0;
    for (int y : knownLabels) {
        if (y > 0) {
            numKnownLabels++;
        }
    }
    // Ok, now let's use K-Means to find the initial cluster set
    int Cmin = this.clustersPerLabelOption.getValue() * numKnownLabels;
    int Cmax = Cmin + 1;
    if (optimizeInitialClusterNumberOption.isSet()) {
        Cmin = this.minimumNumberOfClusterSizeOption.getValue();//Math.max(knownLabels.size(), 2);
        Cmax = Math.max(Cmin + 1, Math.min(this.clustersPerLabelOption.getValue() * numKnownLabels,
                this.maximumNumberOfClusterSizeOption.getValue()));
    }
    ArrayList<ValIdxTupleType> valIdxSet = new ArrayList<>(Cmax);
    Set<Riffle> V;
    // Create multiple hypothesis for best K choices:
    for (int c = Cmin; c < Cmax; c++) {
        V = batchCluster(D, c, true);
        ValIdxTupleType i = new ValIdxTupleType(V);
        valIdxSet.add(i);
        if (CVI == null) {
            CVI = i;
        } else {
            CVI.setVo_min(Math.min(i.getVo(), CVI.getVo_min()));
            CVI.setVo_max(Math.max(i.getVo(), CVI.getVo_max()));
            CVI.setVu_min(Math.min(i.getVu(), CVI.getVu_min()));
            CVI.setVu_max(Math.max(i.getVu(), CVI.getVu_max()));
        }
    }

    // Normalize all:
    for (ValIdxTupleType i : valIdxSet) {
        i.setVo_min(CVI.getVo_min());
        i.setVo_max(CVI.getVo_max());
        i.setVu_min(CVI.getVu_min());
        i.setVu_max(CVI.getVu_max());
    }

    // Find the best K by finding the minimum score:
    valIdxSet.stream().filter((i) -> (i.getValIdx() < CVI.getValIdx())).forEach((i) -> {
        CVI = i;
    });

    this.clusters.clear();
    for (Riffle c : CVI.getClustering()) {
        if (c.instances == null || c.instances.isEmpty()) {
            continue;
        }
        double[] clusterCentroid = new double[universalCentroid.length];
        double[] clusterVariance = new double[universalVariance.length];
        for (Instance x : c.instances) { // accumulate this cluster's centroid from its member points
            double[] xValues = x.toDoubleArray();
            for (int i = 0; i < xValues.length; ++i) {
                clusterCentroid[i] += xValues[i] / ((double) c.instances.size());
            }
        }
        // The cluster class uses an incremental heuristic, but we want to start out as pure as possible, so
        // we use the 2-Pass method for computing sample variance (per dimension)
        if (c.instances.size() < 2) {
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = universalCluster.getVariances()[i] * 0.85; // Statistical Variance
            }
        } else {
            double n = c.instances.size();
            double[] cep = new double[universalCentroid.length];
            Arrays.fill(cep, 0);
            for (Instance x : c.instances) {
                double[] xValues = x.toDoubleArray();
                for (int i = 0; i < xValues.length; ++i) {
                    double delta = clusterCentroid[i] - xValues[i];
                    cep[i] += delta;
                    clusterVariance[i] += delta * delta; // Statistical Variance
                }
            }
            for (int i = 0; i < clusterVariance.length; ++i) {
                clusterVariance[i] = (clusterVariance[i] - cep[i] * cep[i] / n) / (n - 1);
            }
        }
        c.setCenter(clusterCentroid); // final centroid for this cluster
        c.setVariances(clusterVariance);
        c.recompute(); // this updates entropies and such
        for (Instance x : c.instances) {
            this.hopperCache.push(new ClusterPointPair(x, c));
        }
        this.clusters.add(c);
    }

    this.newClusterCreateCalls = 0;
    System.out.println("Starting with " + this.clusters.size() + " clusters.");
    instancesSeen = D.size();
    weightsSeen = D.size();
}

From source file:moa.clusterer.outliers.Sieve.java

License:Apache License

/**
 * Train on a data instance
 *
 * @param x instance to train on
 */
@Override
public final void trainOnInstanceImpl(Instance x) {
    safeInit(x);
    assert (x != null) : "Sieve::trainOnInstanceImpl() Training on a null instance!";
    int y = (int) x.classValue();
    if ((y > 0) && (y < knownLabels.length)) {
        knownLabels[y] += x.weight();
    }
    this.instancesSeen++;
    this.weightsSeen += x.weight();
    this.universalCluster.addInstance(x);
    final NearestClusterTuple[] nearestClusters = findMostLikelyClusters(this.clusters, x);
    if (nearestClusters.length < 1) { // Handles weird corner case
        Riffle firstCluster = this.createNewCluster(x);
        clusters.add(firstCluster);
        System.err.println("Sieve::trainOnInstanceImpl() - no other clusters found!");

    } else {
        // Everyone takes a weight hit, and we will reward the best later...
        for (NearestClusterTuple nct : nearestClusters) {
            nct.getCluster().penalize();
        }
        NearestClusterTuple ncx = nearestClusters[0]; // for convenience
        ClusterPointPair cxp = new ClusterPointPair(x, ncx.getCluster()); // we will change this later in the function... maybe

        if (ncx.getDistance() > ncx.getCluster().getRadius()) { // outlier
            // Hang out with the outcasts and see if you can start your own clique
            cxp.c = null;
            if (!onlyCreateNewClusterAtResyncOption.isSet()) {
                cxp.c = trainOnOutlierInstance(x, ncx);
            }
            if (cxp.c == null) {
                this.potentialNovels.add(x);// or just wait patiently for a friend to sit next to you
            }
        } else { // end if(isRadialOutlier)                 
            // Or join an existing club if you are in the "IN" crowd...
            Riffle nc = ncx.getCluster();
            nc.reward();
            nc.trainEmbeddedClassifier(x);
            nc.addInstance(x);
        } // end else (not Outlier)
          // Randomly (based on distance) cross-train other models
        for (int i = 0; i < nearestClusters.length; ++i) {
            double pTrain = ((double) nearestClusters.length - i) / (2.0 * nearestClusters.length);
            if (this.clustererRandom.nextDouble() < pTrain) {
                nearestClusters[i].getCluster().trainEmbeddedClassifier(x);
            }
        } // end for(i)
        hopperCache.addLast(cxp);
    } // corner case safety
    periodicResync();
}

From source file:moa.clusterer.outliers.Sieve.java

License:Apache License

/**
 * Temporary function for algorithm analysis
 */
private void debugMetrics(double qNSC, double qDout, double qDmin, double dist, double rawTally, Instance x,
        Riffle c) {
    if (this.logMetaRecordsOption.isSet()) {
        try {
            int groundTruth = (int) x.classValue();
            boolean isTrueNovel = (groundTruth > 0) && (groundTruth < knownLabels.length)
                    && (knownLabels[groundTruth] < (this.minimumClusterSizeOption.getValue()));
            String ncCSVLine = "" + universalCluster.size() + "," + universalCluster.getRadius() + ","
                    + rawTally + "," + c.getPurity() + "," + c.size() + "," + c.getWeight() + ","
                    + c.getRadius() + "," + dist + "," + (c.isOutlier(x) ? 1 : 0) + "," + x.weight() + ","
                    + qDmin + "," + qDout + "," + qNSC + "," + isTrueNovel;
            ncCSVwriter.write(ncCSVLine);
            ncCSVwriter.newLine();
            ncCSVwriter.flush();
        } catch (IOException fileIoExcption) {
            System.err.println("Could not write NC CSV line: " + fileIoExcption.toString());
        }
    }
}

From source file:moa.clusterer.outliers.Sieve.java

License:Apache License

/**
 * @return training accuracy
 */
private double trainPerceptron() {
    // Train the perceptron from warmup phase clustering 
    final int epochs = 20;
    final int numberOfPerceptrons = 1;
    final int MEMBER = 0;
    final int OUTLIER = 1;
    double accuracySum = 0;
    double accuracyCount = 0;
    this.outlierPerceptronTrainingSet.clear();
    Random rng = new Random(this.randomSeed);

    // Generate training set
    for (Riffle thisCluster : this.clusters) {
        for (Instance x : thisCluster.getHeader()) {
            Instance pseudoPt = makePerceptronInstance(thisCluster, x);
            for (Riffle thatCluster : this.clusters) {
                double groundTruth = (thisCluster == thatCluster) ? MEMBER : OUTLIER;
                pseudoPt.setClassValue(groundTruth);
                this.outlierPerceptronTrainingSet.add(pseudoPt);
            }
        }
    }
    for (Instance x : this.outlierPerceptronTrainingSet) {
        x.setWeight(1.0 / this.outlierPerceptronTrainingSet.numInstances());
    }

    // Boost it
    this.perceptrons = new Perceptron[numberOfPerceptrons];
    this.pweights = new double[numberOfPerceptrons];
    for (int perceptronIdx = 0; perceptronIdx < numberOfPerceptrons; ++perceptronIdx) {
        // Discover new weak learner
        Perceptron candidatePerceptron = new Perceptron();
        candidatePerceptron.prepareForUse();
        candidatePerceptron.learningRatioOption.setValue(rng.nextDouble() * 0.9 + 0.1);
        for (int epoch = 0; epoch < epochs; epoch++) {
            for (Instance x : this.outlierPerceptronTrainingSet) {
                if ((rng.nextDouble() / this.outlierPerceptronTrainingSet.numInstances()) < x.weight()) { // weighted subsampling
                    candidatePerceptron.trainOnInstance(x);
                }
            }
        } //end epochs
          // Evaluate weak learner
        double errorFunctionSum = 0;
        double weightSum = 0;
        for (Instance x : this.outlierPerceptronTrainingSet) {
            if (!candidatePerceptron.correctlyClassifies(x)) {
                errorFunctionSum += x.weight();
            }
        }
        // adjust training weights
        for (Instance x : this.outlierPerceptronTrainingSet) {
            double newWeight = x.weight();
            if (candidatePerceptron.correctlyClassifies(x)) {
                newWeight *= errorFunctionSum / (1.0 - errorFunctionSum);
                if (Double.isNaN(newWeight)) {
                    newWeight = weka.core.Utils.SMALL;
                }
                x.setWeight(newWeight);
            }
            weightSum += newWeight;
        }
        // Normalize
        for (Instance x : this.outlierPerceptronTrainingSet) {
            x.setWeight(x.weight() / weightSum);
        }
        // Add to ensemble
        double newPerceptronWeight = Math.log((1 - errorFunctionSum) / errorFunctionSum);

        this.perceptrons[perceptronIdx] = candidatePerceptron;
        this.pweights[perceptronIdx] = newPerceptronWeight;
    } // end numPerceptrons

    // Check training error
    accuracySum = 0;
    accuracyCount = 0;
    for (Instance x : this.outlierPerceptronTrainingSet) {
        if (this.getPerceptronVotesForOutlierStatus(x) == x.classValue()) {
            accuracySum++;
        }
        accuracyCount++;
    }
    double trainingAccuracy = (accuracyCount > 0) ? (accuracySum / accuracyCount) : 0.0;
    this.outlierPerceptronTrainingSet.clear();
    return trainingAccuracy;
}

From source file:moa.clusterers.outliers.AnyOut.util.DataObject.java

License:Apache License

/**
 * Standard constructor for <code>DataObject</code>.
 * @param idCounter The id for the <code>DataObject</code>.
 * @param inst The <code>Instance</code> that supplies the features and class label.
 */
public DataObject(int idCounter, Instance inst) {
    this.id = idCounter;
    this.inst = inst;
    this.features = inst.toDoubleArray();
    this.classLabel = (int) inst.classValue();
    this.isOutiler = false;
}
}