moa.streams.generators.multilabel.MetaMultilabelGenerator.java Source code

Introduction

Here is the source code for moa.streams.generators.multilabel.MetaMultilabelGenerator.java
Source

/*
 *    MetaMultilabelGenerator.java
 *    Copyright (C) 2012 University of Waikato, Hamilton, New Zealand
 *    @author Jesse Read (jesse@tsc.uc3m.es)
 * 
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program. If not, see <http://www.gnu.org/licenses/>.
 *    
 */
package moa.streams.generators.multilabel;

import java.util.*;
import moa.core.InstancesHeader;
import moa.core.MultilabelInstancesHeader;
import moa.core.ObjectRepository;
import moa.options.AbstractOptionHandler;
import moa.options.ClassOption;
import moa.options.FloatOption;
import moa.options.IntOption;
import moa.streams.InstanceStream;
import moa.tasks.TaskMonitor;
import weka.core.*;

/**
 * Stream generator for multilabel data.
 *
 * <p>The generator wraps a binary (two-class) {@link InstanceStream}. On
 * {@link #restart()} it draws a prior for each label, a boolean label-pair
 * dependency matrix and, from both, a pairwise conditional matrix. Label sets
 * are then sampled from that approximate joint distribution. Each feature
 * attribute is associated with one frequently sampled label combination: when
 * the generated label set contains that combination the attribute value is
 * copied from a class-1 instance of the binary stream, otherwise from a
 * class-0 instance.
 *
 * <p>All random decisions of the meta process are drawn from
 * {@link #m_MetaRandom}, which is seeded with {@link #metaRandomSeedOption}
 * on every restart.
 *
 * @author Jesse Read (jesse@tsc.uc3m.es)
 * @version $Revision: 7 $
 */
public class MetaMultilabelGenerator extends AbstractOptionHandler implements InstanceStream {

    private static final long serialVersionUID = 1L;

    /** Number of label sets sampled to estimate the most frequent combinations. */
    private static final int NUM_SAMPLES = 100000;

    /** Maximum number of draws from the binary stream when looking for one class. */
    private static final int MAX_DRAWS = 1000;

    /** Maximum number of instances of one class that are buffered for later use. */
    private static final int MAX_BUFFERED = 100;

    public ClassOption binaryGeneratorOption = new ClassOption("binaryGenerator", 's',
            "Binary Generator (specify the number of attributes here, but only two classes!).",
            InstanceStream.class, "generators.RandomTreeGenerator");

    public IntOption metaRandomSeedOption = new IntOption("metaRandomSeed", 'm',
            "Random seed (for the meta process). Use two streams with the same seed and r > 0.0 in the second stream if you wish to introduce drift to the label dependencies without changing the underlying concept.",
            1);

    public IntOption numLabelsOption = new IntOption("numLabels", 'c', "Number of labels.", 10, 2,
            Integer.MAX_VALUE);

    public IntOption skewOption = new IntOption("skew", 'k',
            "Skewed label distribution: 1 (default) = yes; 0 = no (relatively uniform) @NOTE: not currently implemented.",
            1, 0, 1);

    public FloatOption labelCardinalityOption = new FloatOption("labelCardinality", 'z',
            "Desired label cardinality (average number of labels per example).", 1.5, 0.0, Integer.MAX_VALUE);

    public FloatOption labelCardinalityVarOption = new FloatOption("labelCardinalityVar", 'v',
            "Desired label cardinality variance (variance of z) @NOTE: not currently implemented.", 1.0, 0.0,
            Integer.MAX_VALUE);

    public FloatOption labelCardinalityRatioOption = new FloatOption("labelDependency", 'u',
            "Specifies how much label dependency from 0 (total independence) to 1 (full dependence).", 0.25, 0.0,
            1.0);

    public FloatOption labelDependencyChangeRatioOption = new FloatOption("labelDependencyRatioChange", 'r',
            "Each label-pair dependency has a 'r' chance of being modified. Use this option on the second of two streams with the same random seed (-m) to introduce label-dependence drift.",
            0.0, 0.0, 1.0);

    /** Header of the generated multi-label stream, built in {@link #restart()}. */
    protected MultilabelInstancesHeader m_MultilabelInstancesHeader = null;

    /** The wrapped binary stream that supplies the feature values. */
    protected InstanceStream m_BinaryGenerator = null;

    /** Empty dataset describing the attributes of the generated instances. */
    protected Instances multilabelStreamTemplate = null;

    /** Random source of the meta process; re-seeded in {@link #restart()}. */
    protected Random m_MetaRandom = new Random();

    /** m_L: number of labels; m_A: number of feature attributes (class excluded). */
    protected int m_L = 0, m_A = 0;

    /** Label priors, and a copy of them normalised to sum to one. */
    protected double priors[] = null, priors_norm[] = null;

    /** Pairwise conditional matrix produced by {@link #generateConditional}. */
    protected double Conditional[][] = null;

    /** For every feature attribute, the label combination that switches it "on". */
    protected HashSet m_TopCombinations[] = null;

    /**
     * Per-class buffers (index 0 and 1) holding instances of the binary stream
     * that were drawn while looking for an instance of the other class.
     */
    LinkedList<Instance> queue[] = null;

    @Override
    public void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
        this.restart();
    }

    /**
     * Re-initialises the generator: restarts the binary stream, re-seeds the
     * meta random source and rebuilds the header, the priors, the conditional
     * matrix and the feature-label mapping.
     *
     * @throws IllegalArgumentException if the requested label cardinality is
     *         greater than the number of labels
     */
    @Override
    public void restart() {

        // The number of class labels L
        this.m_L = numLabelsOption.getValue();

        // Reject an impossible configuration with an exception rather than
        // terminating the JVM, so that an embedding application survives.
        if (this.labelCardinalityOption.getValue() > m_L) {
            throw new IllegalArgumentException(
                    "Error: Label cardinality (z) cannot be greater than the number of labels (c)!");
        }

        // Initialise the chosen binary generator
        this.m_BinaryGenerator = (InstanceStream) getPreparedClassOption(this.binaryGeneratorOption);
        this.m_BinaryGenerator.restart();

        // The number of attributes A (not including class-attributes)
        this.m_A = this.m_BinaryGenerator.getHeader().numAttributes() - 1;

        // Random seed
        this.m_MetaRandom = new Random(this.metaRandomSeedOption.getValue());

        // Set up a queue system (one buffer per binary class)
        this.queue = new LinkedList[2];
        for (int i = 0; i < this.queue.length; i++) {
            this.queue[i] = new LinkedList<Instance>();
        }

        // Generate the multi-label header
        this.m_MultilabelInstancesHeader = generateMultilabelHeader(this.m_BinaryGenerator.getHeader());

        // Generate label prior distribution
        this.priors = generatePriors(m_MetaRandom, m_L, labelCardinalityOption.getValue(),
                (skewOption.getValue() >= 1));

        // Generate the matrix marking the label-dependencies
        boolean DependencyMatrix[][] = modifyDependencyMatrix(new boolean[m_L][m_L],
                labelCardinalityRatioOption.getValue(), m_MetaRandom);

        // Modify the dependency matrix (and the priors) if they are to change in this stream
        if (labelDependencyChangeRatioOption.getValue() > 0.0) {
            priors = modifyPriorVector(priors, labelDependencyChangeRatioOption.getValue(), m_MetaRandom,
                    (skewOption.getValue() >= 1));
            modifyDependencyMatrix(DependencyMatrix, labelDependencyChangeRatioOption.getValue(), m_MetaRandom);
        }

        // Generate the conditional matrix, using this change matrix
        this.Conditional = generateConditional(priors, DependencyMatrix);

        // Make a normalised version of the priors
        this.priors_norm = Arrays.copyOf(priors, priors.length);
        Utils.normalize(this.priors_norm);

        // Create the feature-label mappings (one label combination per feature attribute)
        m_TopCombinations = getTopCombinations(m_A);
    }

    /**
     * GenerateMultilabelHeader. Builds the multi-label header from the header
     * of the binary stream: the last attribute of {@code si} is removed and
     * {@code m_L} binary attributes named "class0", "class1", ... are inserted
     * at the front. The result is also stored in
     * {@link #multilabelStreamTemplate}.
     *
     * @param   si   single-label Instances
     * @return  the header of the multi-label stream
     */
    protected MultilabelInstancesHeader generateMultilabelHeader(Instances si) {
        Instances mi = new Instances(si, 0, 0);
        mi.setClassIndex(-1);
        // drop the last attribute of the single-label header
        mi.deleteAttributeAt(mi.numAttributes() - 1);
        FastVector bfv = new FastVector();
        bfv.addElement("0");
        bfv.addElement("1");
        // the label attributes occupy indices 0 .. m_L-1
        for (int i = 0; i < this.m_L; i++) {
            mi.insertAttributeAt(new Attribute("class" + i, bfv), i);
        }
        this.multilabelStreamTemplate = mi;
        this.multilabelStreamTemplate.setRelationName("SYN_Z" + this.labelCardinalityOption.getValue() + "L"
                + this.m_L + "X" + m_A + "S" + metaRandomSeedOption.getValue() + ": -C " + this.m_L);
        this.multilabelStreamTemplate.setClassIndex(this.m_L);
        return new MultilabelInstancesHeader(multilabelStreamTemplate, m_L);
    }

    /**
     * Generate Priors. Generate the label priors: draw one uniform random
     * value per label, then rescale (capping each value at 1.0) until the sum
     * is no longer below {@code z}.
     *
     * @param   r   random number generator
     * @param   L   number of labels
     * @param   z   desired label cardinality
     * @param   skew   whether to be very skewed or not (@NOTE not currently used)
     * @return   P   label prior distribution
     */
    private double[] generatePriors(Random r, int L, double z, boolean skew) {
        double P[] = new double[L];
        for (int i = 0; i < L; i++) {
            P[i] = r.nextDouble();
        }
        // normalise to z; capping at 1.0 can leave the sum short, hence the loop
        do {
            double c = Utils.sum(P) / z;
            for (int i = 0; i < L; i++) {
                P[i] = Math.min(1.0, P[i] / c); // must be in [0,1]
            }
        } while (Utils.sum(P) < z);
        return P;
    }

    /**
     * GetNextWithBinary. Get the next instance with binary class i. A buffered
     * instance is returned if one is available; otherwise instances are drawn
     * from the binary stream (at most {@value #MAX_DRAWS}) until one of class
     * {@code i} appears, while instances of the other class are buffered (at
     * most {@value #MAX_BUFFERED}).
     *
     * @param   i   the class to generate (0,1)
     * @return  an instance of the binary stream whose class value rounds to {@code i}
     * @throws  IllegalStateException if no instance of class {@code i} was found
     *          within the draw limit
     */
    private Instance getNextWithBinary(int i) {
        if (!queue[i].isEmpty()) {
            return queue[i].remove();
        }
        for (int draw = 0; draw < MAX_DRAWS; draw++) {
            Instance tinst = this.m_BinaryGenerator.nextInstance();
            int c = (int) Math.round(tinst.classValue());
            if (i == c) {
                return tinst;
            }
            if (queue[c].size() < MAX_BUFFERED) {
                queue[c].add(tinst);
            }
        }
        // Signal the failure to the caller instead of terminating the JVM.
        throw new IllegalStateException(
                "[Overflow] The binary stream is too skewed, could not get an example of class " + i + "");
    }

    /**
     * GenerateML. Generates a multi-label example.
     *
     * @return the next multi-label instance of the stream
     */
    @Override
    public Instance nextInstance() {
        return generateMLInstance(generateSet());
    }

    /**
     * Generate Set. One label is sampled from the normalised priors and
     * switched on; the remaining labels are then visited in random order and
     * each is switched on with the probability given by {@link #joint}.
     *
     * @return   a label set Y
     */
    private HashSet<Integer> generateSet() {

        int y[] = new int[m_L]; // [0,0,0]
        int k = samplePMF(priors_norm); // y[k] ~ p(k==1)
        if (k < 0) {
            // samplePMF reports -1 when floating-point rounding leaves the
            // cumulative sum at or below the drawn value; that draw belongs
            // to the tail of the distribution, i.e. the last label.
            k = m_L - 1;
        }
        y[k] = 1; // [0,1,0]
        for (int j : getShuffledListToLWithoutK(m_L, k)) {
            //y[j] ~ p(j==1|y)
            y[j] = (joint(j, y) > m_MetaRandom.nextDouble()) ? 1 : 0;
        }
        return vector2set(y);
    }

    /**
     * Joint. Multiplies the conditional entries {@code Conditional[k][j]} of
     * every label j (other than k) that is currently switched on in y.
     *
     * @param   k   the label under consideration
     * @param   y   the current label vector
     * @return  the product, 1.0 if no other label is switched on
     */
    private double joint(int k, int y[]) {
        double p = 1.0;
        for (int j = 0; j < y.length; j++) {
            if (j != k && y[j] == 1) {
                p *= Conditional[k][j];
            }
        }
        return p;
    }

    /**
     * GenerateMLInstance.
     *
     * @param   Y   a set of label [indices]
     * @return a multi-labelled example
     */
    private Instance generateMLInstance(HashSet<Integer> Y) {

        // create a multi-label instance:
        Instance x_ml = new SparseInstance(this.multilabelStreamTemplate.numAttributes());
        x_ml.setDataset(this.multilabelStreamTemplate);

        // set classes: everything off, then the labels in Y on
        for (int j = 0; j < m_L; j++) {
            x_ml.setValue(j, 0.0);
        }
        for (int l : Y) {
            x_ml.setValue(l, 1.0);
        }

        // generate binary instances: one per class, as sources of feature values
        Instance x_0 = getNextWithBinary(0);
        Instance x_1 = getNextWithBinary(1);

        // Loop through each feature attribute @warning: assumes class is last index
        for (int a = 0; a < m_A; a++) {
            if (Y.containsAll(m_TopCombinations[a])) {
                // The combination is present: use a positive value
                x_ml.setValue(m_L + a, x_1.value(a));
            } else {
                // The combination is absent: use a negative value
                x_ml.setValue(m_L + a, x_0.value(a));
            }
        }

        return x_ml;
    }

    /**
     * samplePMF.
     *
     * @param   p   a pmf
     * @return   an index i of p with probability p[i], and -1 if the drawn
     * value is not below the sum of all entries of p
     */
    private int samplePMF(double p[]) {

        double r = m_MetaRandom.nextDouble();

        double sum = 0.0;
        for (int i = 0; i < p.length; i++) {
            sum += p[i];
            if (r < sum) {
                return i;
            }
        }
        return -1;
    }

    /**
     * ModifyPriorVector. A certain number of values will be altered. The array
     * is modified in place.
     *
     * @param   P   the prior distribution
     * @param   u   the probability of changing P[j]
     * @param   r   for random numbers
     * @param   skew   NOTE not currently used
     * @return   the modified P (the same array that was passed in)
     */
    protected double[] modifyPriorVector(double P[], double u, Random r, boolean skew) {
        for (int j = 0; j < P.length; j++) {
            if (r.nextDouble() < u) {
                P[j] = r.nextDouble();
            }
        }
        return P;
    }

    /**
     * ModifyDependencyMatrix. A certain number of values will be altered. @NOTE
     * a future improvement would be to detect cycles, since this may lead to
     * inconsistencies. However, due to the rarity of this occurring, and the
     * minimal problems it would cause (and considering that inconsistencies
     * also occur in real datasets) we don't implement this.
     *
     * <p>Only the entries above the diagonal are visited; the matrix is
     * modified in place.
     *
     * @param   M   a boolean matrix
     * @param   u   the probability of changing the relationship M[j][k]
     * @param   r   for random numbers
     * @return   the modified M (the same array that was passed in)
     */
    protected boolean[][] modifyDependencyMatrix(boolean M[][], double u, Random r) {
        for (int j = 0; j < M.length; j++) {
            for (int k = j + 1; k < M[j].length; k++) {
                if (r.nextDouble() <= u) {
                    M[j][k] ^= true; // flip the flag
                }
            }
        }
        return M;
    }

    /**
     * GenerateConditional. Given the priors distribution and a matrix flagging
     * dependencies, generate a conditional distribution matrix Q; such that:
     * P(i) = Q[i][i] P(i|j) = Q[i][j]
     *
     * @param   P   prior distribution
     * @param   M   dependency matrix (where 1 == dependency)
     * @return   Q   conditional dependency matrix
     */
    protected double[][] generateConditional(double P[], boolean M[][]) {

        int L = P.length;

        double Q[][] = new double[L][L];

        // set priors
        for (int j = 0; j < L; j++) {
            Q[j][j] = P[j];
        }

        // create conditionals
        for (int j = 0; j < Q.length; j++) {
            for (int k = j + 1; k < Q[j].length; k++) {

                if (M[j][k]) {
                    // dependence
                    // min = tending toward mutual exclusivity
                    // max = tending toward total co-occurence
                    // @NOTE it would also be an option to select in [min,max], but since
                    //       we are approximating the joint distribution, we can take
                    //       a stronger approach, and just take either min or max.
                    // NOTE(review): min() returns A+B-1 floored at 0 and max() returns
                    //       B/A capped at 1; whether these match the P(i|j) = Q[i][j]
                    //       convention documented above should be confirmed.
                    Q[j][k] = (m_MetaRandom.nextBoolean() ? min(P[j], P[k]) : max(P[j], P[k]));
                    Q[k][j] = (Q[j][k] * Q[j][j]) / Q[k][k]; // Bayes Rule
                } else {
                    // independence
                    Q[j][k] = P[j];
                    Q[k][j] = (Q[j][k] * P[k]) / P[j]; // Bayes Rule
                }
            }
        }

        return Q;
    }

    /**
     * GetTopCombinations. Calculating the full joint probability distribution
     * is too complex. - sample from the approximate joint many times - record
     * the n most commonly occurring Y and their frequencies - create a map
     * based on these frequencies
     *
     * <p>Every slot of the returned array holds a label set: slots are handed
     * out proportionally to the sampled frequencies (at least one per set),
     * any slots left over by rounding are assigned round-robin starting with
     * the most frequent set, and the array is finally shuffled with
     * {@link #m_MetaRandom} so that the mapping is reproducible for a given
     * seed.
     *
     * @param   n   the number of labelsets
     * @return   n labelsets
     */
    private HashSet[] getTopCombinations(int n) {

        if (n <= 0) {
            // no feature attributes, hence nothing to map
            return new HashSet[0];
        }

        final HashMap<HashSet<Integer>, Integer> count = new HashMap<HashSet<Integer>, Integer>();

        double lc = 0.0;
        for (int i = 0; i < NUM_SAMPLES; i++) {
            HashSet<Integer> Y = generateSet();
            lc += Y.size();
            Integer seen = count.get(Y);
            count.put(Y, seen != null ? seen + 1 : 1);
        }
        lc = lc / NUM_SAMPLES;

        // @TODO could generate closed frequent itemsets from 'count'

        List<HashSet<Integer>> top_set = new ArrayList<HashSet<Integer>>(count.keySet());
        // Sort the sets by their count, most frequent first
        Collections.sort(top_set, new Comparator<HashSet<Integer>>() {

            @Override
            public int compare(HashSet<Integer> Y1, HashSet<Integer> Y2) {
                return count.get(Y2).compareTo(count.get(Y1));
            }
        });

        System.err.println("The most common labelsets (from which we will build the map) will likely be: ");
        HashSet map_set[] = new HashSet[n];
        double weights[] = new double[n];
        int idx = 0;
        for (HashSet<Integer> Y : top_set) {
            System.err.println(" " + Y + " : " + (count.get(Y) * 100.0 / NUM_SAMPLES) + "%");
            weights[idx++] = count.get(Y);
            if (idx == weights.length) {
                break;
            }
        }

        System.err.println("Estimated Label Cardinality:  " + lc + "\n\n");
        System.err.println("Estimated % Unique Labelsets: " + (count.size() * 100.0 / NUM_SAMPLES) + "%\n\n");

        // normalize weights[]
        Utils.normalize(weights);

        // add sets to the map set, according to their weights
        int k = 0;
        for (int i = 0; i < top_set.size() && k < map_set.length; i++) { // i'th combination
            int num = (int) Math.round(Math.max(weights[i] * map_set.length, 1.0)); // i'th quota
            for (int j = 0; j < num && k < map_set.length; j++) {
                map_set[k++] = top_set.get(i);
            }
        }

        // With fewer than n distinct label sets the rounded quotas can add up
        // to less than n; fill what is left so that no slot stays null.
        for (int i = 0; k < map_set.length && !top_set.isEmpty(); i = (i + 1) % top_set.size()) {
            map_set[k++] = top_set.get(i);
        }

        // shuffle (seeded, so that the feature-label mapping is reproducible)
        Collections.shuffle(Arrays.asList(map_set), m_MetaRandom);

        // return
        return map_set;
    }

    @Override
    public InstancesHeader getHeader() {
        return m_MultilabelInstancesHeader;
    }

    @Override
    public String getPurposeString() {
        return "Generates a multi-label stream based on a binary random generator.";
    }

    /** @return -1, the value this generator reports for its unbounded stream */
    @Override
    public long estimatedRemainingInstances() {
        return -1;
    }

    /** @return always {@code true} */
    @Override
    public boolean hasMoreInstances() {
        return true;
    }

    /** @return always {@code true} */
    @Override
    public boolean isRestartable() {
        return true;
    }

    /** Intentionally empty: this generator adds nothing to the description. */
    @Override
    public void getDescription(StringBuilder sb, int indent) {
    }

    // ------- following are private utility functions -----------

    /** Converts set Y to an L-length vector y (1 at every index contained in Y). */
    private int[] set2vector(HashSet<Integer> Y, int L) {
        int y[] = new int[L];
        for (int j : Y) {
            y[j] = 1;
        }
        return y;
    }

    /** Converts L-length vector y to set Y (the indices whose value is positive). */
    private HashSet<Integer> vector2set(int y[]) {
        HashSet<Integer> Y = new HashSet<Integer>();
        for (int j = 0; j < y.length; j++) {
            if (y[j] > 0) {
                Y.add(j);
            }
        }
        return Y;
    }

    /**
     * Upper extreme used for dependent label pairs ("the highest possible
     * prob. of P(A|B) given A and B"): B/A, capped at 1.0.
     */
    private double max(double A, double B) {
        return Math.min(1.0, (B / A));
    }

    /**
     * Lower extreme used for dependent label pairs ("the lowest possible
     * prob. of P(A|B) given A and B"): A+B-1, floored at 0.0.
     */
    private double min(double A, double B) {
        return Math.max(0.0, (-1.0 + A + B));
    }

    /**
     * Returns the indices 0..L-1 without k, in random order. The order is
     * drawn from {@link #m_MetaRandom} so that it depends on the seed only.
     */
    private ArrayList<Integer> getShuffledListToLWithoutK(int L, int k) {
        ArrayList<Integer> list = new ArrayList<Integer>(L - 1);
        for (int j = 0; j < L; j++) {
            if (j != k) {
                list.add(j);
            }
        }
        Collections.shuffle(list, m_MetaRandom);
        return list;
    }

    // ------- following are private debugging functions -----------
    public static void main(String args[]) {
        // test routines
    }

    /** Prints a matrix to standard output, one row per line. */
    private void printMatrix(double M[][]) {
        System.out.println("--- MATRIX ---");
        for (int i = 0; i < M.length; i++) {
            for (int j = 0; j < M[i].length; j++) {
                System.out.print(" " + Utils.doubleToString(M[i][j], 5, 3));
            }
            System.out.println("");
        }
    }

    /** Prints a vector to standard output on a single line. */
    private void printVector(double V[]) {
        System.out.println("--- VECTOR ---");
        for (int j = 0; j < V.length; j++) {
            System.out.print(" " + Utils.doubleToString(V[j], 5, 3));
        }
        System.out.println("");
    }
}