tr.gov.ulakbim.jDenetX.streams.generators.multilabel.MetaMultilabelGenerator.java Source code

Java tutorial

Introduction

Here is the source code for tr.gov.ulakbim.jDenetX.streams.generators.multilabel.MetaMultilabelGenerator.java

Source

package tr.gov.ulakbim.jDenetX.streams.generators.multilabel;

/*
 *    MetaMultilabelGenerator.java
 *    Copyright (C) 2010 University of Waikato, Hamilton, New Zealand
 *    @author Jesse Read (jmr30@cs.waikato.ac.nz)
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import tr.gov.ulakbim.jDenetX.core.EuclideanSimilarityDiscoverer.MultilabelInstancesHeader;
import tr.gov.ulakbim.jDenetX.core.InstancesHeader;
import tr.gov.ulakbim.jDenetX.core.ObjectRepository;
import tr.gov.ulakbim.jDenetX.options.AbstractOptionHandler;
import tr.gov.ulakbim.jDenetX.options.ClassOption;
import tr.gov.ulakbim.jDenetX.options.FloatOption;
import tr.gov.ulakbim.jDenetX.options.IntOption;
import tr.gov.ulakbim.jDenetX.streams.InstanceStream;
import tr.gov.ulakbim.jDenetX.tasks.TaskMonitor;
import weka.core.*;

import java.util.*;

public class MetaMultilabelGenerator extends AbstractOptionHandler implements InstanceStream {

    private static final long serialVersionUID = 1L;

    public ClassOption binaryGeneratorOption = new ClassOption("binaryGenerator", 's',
            "Binary Generator (use thihs option to specify the number of attributes, but specify two classes only).",
            InstanceStream.class, "generators.RandomTreeGenerator");

    public IntOption metaRandomSeedOption = new IntOption("metaRandomSeed", 'm',
            "Random seed (for the meta process).", 1);

    public IntOption numLabelsOption = new IntOption("numLabels", 'c', "Number of labels.", 1);

    public IntOption skewOption = new IntOption("skew", 'k',
            "Skewed label distribution: 1 (default) = yes; 0 = no (relatively uniform).", 1, 0, 1);

    public FloatOption labelCardinalityOption = new FloatOption("labelCardinality", 'z',
            "Target label cardinality of resulting set", 1.5, 0.0, Integer.MAX_VALUE);

    protected MultilabelInstancesHeader m_MultilabelInstancesHeader = null;
    protected InstanceStream m_BinaryGenerator = null;
    protected Instances multilabelStreamTemplate = null;
    protected Random m_MetaRandom = null;
    protected int m_N = 0, m_A = 0;
    protected double m_Z = 0.0;
    protected double skew[] = null, skew_n[] = null;
    protected double matrix[][] = null;
    protected ArrayList m_FeatureEffects[] = null;

    /**
     * Prepares the stream for use by delegating to {@link #restart()}.
     * Neither argument is read by this implementation.
     *
     * @param monitor    task monitor (unused here)
     * @param repository object repository (unused here)
     */
    @Override
    public void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
        restart();
    }

    /**
     * (Re)initialises the generator from the current option values.
     * <p>
     * Restarts the binary generator, reseeds the meta random generator, empties the
     * per-class instance queues, rebuilds the multi-label header, calibrates the label
     * skew/matrix against the requested label cardinality and finally derives the
     * feature-label mappings.
     * <p>
     * The calibration is bounded: if the sampled cardinality cannot be brought within
     * 0.1 of the target (for instance when the target exceeds what the configured
     * number of labels can produce), a warning is written to standard error and the
     * last generated skew and matrix are used instead of looping forever.
     */
    @Override
    public void restart() {

        // Extract option 'c' (number of classes (labels))
        this.m_N = numLabelsOption.getValue();

        // Binary generator
        this.m_BinaryGenerator = (InstanceStream) getPreparedClassOption(this.binaryGeneratorOption);
        this.m_BinaryGenerator.restart();

        // Extract number of attributes (minus class-attribute)
        this.m_A = this.m_BinaryGenerator.getHeader().numAttributes() - 1;

        // Random seed
        this.m_MetaRandom = new Random(this.metaRandomSeedOption.getValue());

        // Setup queue system (so that generated binary instances aren't 'wasted')
        this.queue = new LinkedList[2];
        for (int i = 0; i < this.queue.length; i++) {
            this.queue[i] = new LinkedList<Instance>();
        }

        // Generate the multi-label header
        this.m_MultilabelInstancesHeader = generateMultilabelHeader(this.m_BinaryGenerator.getHeader());

        // Target label cardinality; z is the working value that calibration nudges up or down
        m_Z = labelCardinalityOption.getValue();
        double z = m_Z;

        // Check that the label sets we generate fit the label cardinality we specified.
        // z moves in steps of 0.1, so the round limit grows with the number of labels.
        final int sampleSize = 10000;
        final int maxRounds = 100 + 10 * Math.max(m_N, 0);
        boolean calibrated = false;
        for (int round = 0; round < maxRounds && !calibrated; round++) {
            // Create the label skew
            this.skew = fillSkew(m_MetaRandom, z);
            // Create a normalised version of the skew (for when we choose at least one label)
            this.skew_n = Arrays.copyOf(skew, skew.length);
            Utils.normalize(this.skew_n);
            // Create a matrix from the label skew
            this.matrix = fillMatrix(skew, m_Z / (double) m_N, m_MetaRandom);

            // Estimate the cardinality actually produced by this skew/matrix
            double total = 0.0;
            for (int i = 0; i < sampleSize; i++) {
                total += (generateSet(discreteRandomIndex(this.skew_n))).size();
            }
            total /= sampleSize;

            if (total - m_Z < -0.1) {
                z += 0.1;
            } else if (total - m_Z > 0.1) {
                z -= 0.1;
            } else {
                calibrated = true;
            }
        }
        if (!calibrated) {
            System.err.println("[Warning] Could not calibrate the label cardinality to " + m_Z + " with "
                    + m_N + " label(s) after " + maxRounds + " rounds; continuing with the last generated skew.");
        }

        // Create the feature-label mappings
        m_FeatureEffects = getTopCombinations(m_N * 2);

    }

    /**
     * GenerateMultilabelHeader.
     * Derives the multi-label dataset template from the binary generator's header:
     * the last attribute of the copy is removed and m_N nominal {0,1} attributes named
     * "class0", "class1", ... are inserted at the front. The template is stored in
     * multilabelStreamTemplate and wrapped in the returned header.
     *
     * @param si header of the binary generator
     * @return the multi-label header built around the new template
     */
    protected MultilabelInstancesHeader generateMultilabelHeader(Instances si) {
        final Instances template = new Instances(si, 0, 0);
        template.setClassIndex(-1);
        final int lastAttribute = template.numAttributes() - 1;
        template.deleteAttributeAt(lastAttribute);

        // One shared value list ("0", "1") for every label attribute
        final FastVector binaryValues = new FastVector();
        binaryValues.addElement("0");
        binaryValues.addElement("1");
        for (int label = 0; label < this.m_N; label++) {
            final Attribute labelAttribute = new Attribute("class" + label, binaryValues);
            template.insertAttributeAt(labelAttribute, label);
        }

        this.multilabelStreamTemplate = template;
        final String relationName = "SYN_Z" + this.labelCardinalityOption.getValue() + "L" + this.m_N
                + "X" + m_A + "S" + metaRandomSeedOption.getValue() + ": -C " + this.m_N;
        template.setRelationName(relationName);
        template.setClassIndex(this.m_N);
        return new MultilabelInstancesHeader(template, m_N);
    }

    /**
     * FillSkew.
     * Generate a label skew (given desired lcard z): one prior per label, scaled with
     * Utils.normalize so that the priors sum to z.
     *
     * @param r random generator used to draw the priors when the skew option is enabled
     * @param z desired label cardinality
     * @return array of m_N label priors; any NaN entry is replaced by 0.01
     */
    private double[] fillSkew(Random r, double z) {
        final boolean skewed = skewOption.getValue() >= 1;
        double[] d = new double[m_N];
        for (int i = 0; i < m_N; i++) {
            // Draw from the generator that was passed in (previously the parameter was
            // ignored and the m_MetaRandom field was used directly).
            d[i] = skewed ? r.nextDouble() : 1.0;
        }
        // Dividing by (sum / z) makes the entries add up to z
        Utils.normalize(d, Utils.sum(d) / z);
        for (int i = 0; i < m_N; i++) {
            if (Double.isNaN(d[i])) {
                d[i] = 0.01;
            }
        }
        return d;
    }

    /**
     * GetNextWithBinary.
     * Get the next instance with binary class i
     *
     * @param i    the class to generate (0,1)
     */
    LinkedList<Instance> queue[] = null;

    private Instance getNextWithBinary(int i) {
        int lim = 1000;
        if (queue[i].size() <= 0) {
            int c = -1;
            while (lim-- > 0) {
                Instance tinst = this.m_BinaryGenerator.nextInstance();
                //System.err.println("next binary : "+tinst);
                c = (int) Math.round(tinst.classValue());
                if (i == c)
                    return tinst;
                else if (queue[c].size() < 100)
                    queue[c].add(tinst);
            }
            System.err.println(
                    "[Overflow] The binary stream is too skewed, could not get an example of class " + i + "");
            System.exit(1);
            return null;
        } else
            return queue[i].remove();
    }

    /**
     * LabelCorrelation.
     * Starts every candidate label at weight 1.0, multiplies it by matrix[j][l] for each
     * label l already in the set (labels already in the set get weight 0.0), and draws an
     * index from the resulting weights.
     *
     * @param lbls existing labels (indices) in the set
     * @return a random label (index) to be associated with these labels (-1 if none)
     */
    private int labelCorrelation(ArrayList<Integer> lbls) {
        final double[] weights = new double[m_N];
        Arrays.fill(weights, 1.0);
        for (Integer member : lbls) {
            final int l = member;
            final int width = matrix[l].length;
            for (int j = 0; j < width; j++) {
                if (j == l) {
                    // a label that is already present can never be picked again
                    weights[j] = 0.0;
                } else {
                    // *= P(j|l) (probability of label 'j', given that label 'l' is in the set)
                    weights[j] *= matrix[j][l];
                }
            }
        }
        return discreteRandomIndex(weights);
    }

    /**
     * GenerateML.
     * Generates a multi-label example: draws a first label from the normalised skew,
     * grows it into a label set and builds the instance for that set. Any exception is
     * printed and the JVM is terminated with status 1.
     *
     * @return the generated instance
     */
    @Override
    public Instance nextInstance() {
        Instance generated = null;
        try {
            final int firstLabel = discreteRandomIndex(this.skew_n);
            final ArrayList<Integer> labelSet = generateSet(firstLabel);
            generated = generateMLInstance(labelSet);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
        return generated;
    }

    /**
     * GenerateSet.
     * Builds a label set starting from label l: each added label is followed by a draw
     * from labelCorrelation() until that draw yields a negative index.
     *
     * @param l index of the first label; a negative value produces an empty set
     * @return the label indices in the order they were added
     */
    private ArrayList<Integer> generateSet(int l) {
        ArrayList<Integer> lbls = new ArrayList<Integer>();
        for (int next = l; next >= 0; next = labelCorrelation(lbls)) {
            lbls.add(next);
        }
        return lbls;
    }

    /**
     * GenerateMLInstance.
     * Builds a sparse instance on the multi-label template: the first m_N values encode
     * the label set, the following m_A values are copied attribute by attribute from a
     * class-1 binary instance when the label combination tied to that attribute is fully
     * contained in lbls, and from a class-0 binary instance otherwise.
     *
     * @param lbls indices of the labels that are set for this instance
     * @return the assembled multi-label instance
     */
    private Instance generateMLInstance(ArrayList<Integer> lbls) throws Exception {

        final Instance result = new SparseInstance(this.multilabelStreamTemplate.numAttributes());
        result.setDataset(this.multilabelStreamTemplate);

        // labels: everything off first, then switch on the members of the set
        for (int label = 0; label < m_N; label++) {
            result.setValue(label, 0.0);
        }
        for (int present : lbls) {
            result.setValue(present, 1.0);
        }

        // one binary instance of each class (class 0 is requested first)
        final Instance negative = getNextWithBinary(0);
        final Instance positive = getNextWithBinary(1);

        // features  @warning: assumes class is last index
        for (int a = 0; a < m_A; a++) {
            final boolean combinationPresent = lbls.containsAll(m_FeatureEffects[a % m_FeatureEffects.length]);
            final Instance source = combinationPresent ? positive : negative;
            result.setValue(m_N + a, source.value(a));
        }

        return result;

    }

    /**
     * DiscreteRandomIndex.
     * Draws one number from the meta random generator and walks the cumulative sum of p
     * until it reaches that number, returning the index of the last entry added.
     *
     * @param p weights, one per index
     * @return the chosen index, or -1 when the total of p is NaN or not greater than the
     *         drawn number, or when the walk selects nothing
     * @note: expecting data to be normalised first
     */
    private int discreteRandomIndex(double p[]) {

        final double draw = m_MetaRandom.nextDouble();
        final double mass = Utils.sum(p);

        if (Double.isNaN(mass) || mass <= draw) {
            return -1;
        }

        double cumulative = 0.0;
        for (int idx = 0; ; idx++) {
            if (!(draw > cumulative)) {
                // idx entries have been consumed; the last one is the pick (-1 if none were)
                return idx - 1;
            }
            if (idx >= p.length) {
                // won't be selecting anything
                return -1;
            }
            cumulative += p[idx];
        }
    }

    /**
     * Evaluates L * e^(-L * i).
     *
     * @param i multiplier applied to L in the exponent
     * @param L rate; also the value returned for i = 0
     * @return L * Math.pow(Math.E, -L * i)
     */
    protected static double genE(int i, double L) {
        final double exponent = -L * i;
        final double decay = Math.pow(Math.E, exponent);
        return L * decay;
    }

    /**
     * genMatrix.
     * Allocates a square matrix with one row/column per label, stores it in this.matrix
     * and fills it: the diagonal receives the label priors and every pair (i, j) with
     * i &lt; j receives a pair of factors chosen by one of two rules.
     * P(i) = matrix[i][i]
     * P(i|j) = matrix[i][j]
     *
     * NOTE(review): the two branches below derive matrix[j][i] with inverse ratios
     * (matrix[i][i] / matrix[j][j] versus matrix[j][j] / matrix[i][i]) although both are
     * commented "Bayes Rule"; only one can agree with the P(i|j) convention stated
     * above. Confirm which is intended before changing anything, because the generated
     * streams depend on it.
     *
     * @param skew per-label priors; each is passed through Utils.roundDouble(.., 3) and
     *             written onto the diagonal
     * @param Z    a pair is treated as "dependent" when a draw from r is &lt;= 2 * Z
     *             (restart() passes the target cardinality divided by the number of labels)
     * @param r    random generator used for the dependent/exclusive decision; note that
     *             randFromRange() draws from the m_MetaRandom field, not from r
     * @return the filled matrix (the same array that is stored in this.matrix)
     */
    protected double[][] fillMatrix(double skew[], double Z, Random r) {

        this.matrix = new double[skew.length][skew.length];

        //System.out.println("skew "+Arrays.toString(skew));

        // priors on the diagonal; P(i) reads them back from here
        for (int i = 0; i < skew.length; i++) {
            matrix[i][i] = Utils.roundDouble(skew[i], 3);
        }

        // upper triangle only: each pair is visited once and both cells are written
        for (int i = 0; i < matrix.length; i++) {
            for (int j = i + 1; j < matrix[i].length; j++) {
                // label-dependence factors
                if (r.nextDouble() <= (Z * 2.0)) {
                    // value between the min() and max() bounds of the two priors
                    matrix[i][j] = randFromRange(min(P(i), P(j)), max(P(i), P(j)));
                    matrix[j][i] = (matrix[i][j] * matrix[i][i]) / matrix[j][j]; // Bayes Rule
                }
                // label-exclusivity factors
                else {
                    // pinned to the min() bound of the two priors
                    matrix[i][j] = min(P(i), P(j));
                    matrix[j][i] = (matrix[i][j] * matrix[j][j]) / matrix[i][i]; // Bayes Rule
                }
                // this is just rounding
                matrix[i][j] = Utils.roundDouble(matrix[i][j], 3);
                matrix[j][i] = Utils.roundDouble(matrix[j][i], 3);
            }
        }

        return matrix;
    }

    /**
     * Returns min plus genE(k, max - min), where k is drawn uniformly from 0..4 using
     * the meta random generator.
     *
     * @param min lower end of the range
     * @param max upper end of the range
     * @return min + genE(k, max - min)
     */
    protected double randFromRange(double min, double max) {
        final int step = m_MetaRandom.nextInt(5);
        final double span = max - min;
        return min + genE(step, span);
    }

    /**
     * P(i): the diagonal entry of the label matrix for label i.
     *
     * @param i label index
     * @return matrix[i][i]
     */
    protected double P(int i) {
        final double[] row = matrix[i];
        return row[i];
    }

    /**
     * P(i|j): the entry of the label matrix at row i, column j.
     *
     * @param i row (label) index
     * @param j column (label) index
     * @return matrix[i][j]
     */
    protected double P(int i, int j) {
        final double[] row = matrix[i];
        return row[j];
    }

    /**
     * The highest possible prob. of P(A|B) given A and B, computed as B / A capped at 1.0.
     * NOTE(review): the ratio is B / A rather than A / B; confirm this matches the
     * intended conditional before relying on the description above.
     *
     * @param A first probability
     * @param B second probability
     * @return Math.min(1.0, B / A)
     */
    protected double max(double A, double B) {
        final double ratio = B / A;
        return Math.min(1.0, ratio);
    }

    /**
     * The lowest possible prob. of P(A|B) given A and B, computed as A + B - 1 floored
     * at 0.0.
     *
     * @param A first probability
     * @param B second probability
     * @return Math.max(0.0, -1.0 + A + B)
     */
    protected double min(double A, double B) {
        // keep the original left-to-right summation order so results stay bit-identical
        final double overlap = (-1.0 + A) + B;
        return Math.max(0.0, overlap);
    }

    /**
     * GetTopCombinations.
     * Samples 10000 label sets, counts how often each distinct combination occurs and
     * returns the combinations in the order produced by getAsReverseSortedHashMap().
     * NOTE(review): n is accepted but never read, so the result is not truncated to n
     * entries; confirm whether a limit was intended.
     *
     * @param n requested number of combinations (currently unused)
     * @return one list of label indices per returned combination
     */
    private ArrayList[] getTopCombinations(int n) {

        final HashMap<String, Integer> occurrences = new HashMap<String, Integer>();

        int sample = 0;
        while (sample < 10000) {
            final String key = arrayToString(generateSet(discreteRandomIndex(this.skew_n)), m_N);
            final Integer seen = occurrences.get(key);
            if (seen != null) {
                occurrences.put(key, seen + 1);
            } else {
                occurrences.put(key, 1);
            }
            sample++;
        }

        final HashMap<String, Integer> ranked = getAsReverseSortedHashMap(occurrences);

        final ArrayList[] combinations = new ArrayList[ranked.size()];
        int slot = 0;
        for (String key : ranked.keySet()) {
            combinations[slot] = stringToArray(key);
            slot++;
        }
        return combinations;
    }

    // auxiliary functions follow

    /**
     * Returns the entries of c ordered by descending count.
     * <p>
     * Every key of c is kept. Keys that share the same count are ordered by their
     * natural String order so the result does not depend on hash iteration order.
     * (The previous implementation kept only one key per distinct count and silently
     * dropped the others.)
     *
     * @param c map from key to occurrence count; must not contain null values
     * @return a LinkedHashMap whose iteration order is highest count first
     */
    private static HashMap<String, Integer> getAsReverseSortedHashMap(HashMap<String, Integer> c) {

        List<Map.Entry<String, Integer>> entries = new ArrayList<Map.Entry<String, Integer>>(c.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                int byCount = b.getValue().compareTo(a.getValue());
                return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
            }
        });

        // LinkedHashMap preserves the sorted insertion order
        HashMap<String, Integer> sortedMap = new LinkedHashMap<String, Integer>();
        for (Map.Entry<String, Integer> entry : entries) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }

    /**
     * Decodes a combination string back into label indices: the result holds every
     * position of s whose character is '1', in ascending order.
     *
     * @param s combination string such as the ones produced by arrayToString()
     * @return the positions holding '1' (empty when there are none)
     */
    private static ArrayList<Integer> stringToArray(String s) {
        ArrayList<Integer> al = new ArrayList<Integer>();
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) == '1') {
                al.add(i);
            }
        }
        return al;
    }

    /**
     * Encodes a label set as a string of N characters: '1' at every index contained in
     * lbls and '0' everywhere else.
     *
     * @param lbls label indices, each expected to be in [0, N)
     * @param N    length of the resulting string
     * @return the encoded combination
     */
    private static String arrayToString(ArrayList<Integer> lbls, int N) {
        final StringBuilder bits = new StringBuilder(N);
        while (bits.length() < N) {
            bits.append('0');
        }
        for (Integer label : lbls) {
            bits.setCharAt(label.intValue(), '1');
        }
        return bits.toString();
    }

    /**
     * Returns the multi-label header that restart() built from the binary generator's
     * header; null until restart() has run.
     */
    @Override
    public InstancesHeader getHeader() {
        return m_MultilabelInstancesHeader;
    }

    /**
     * Returns a one-sentence, human-readable description of this generator.
     */
    @Override
    public String getPurposeString() {
        return "Generates a multi-label stream using a binary generator.";
    }

    /**
     * Always returns -1.
     * NOTE(review): presumably the framework's marker for "unknown / unbounded" since
     * hasMoreInstances() is always true; confirm against the InstanceStream contract.
     */
    @Override
    public long estimatedRemainingInstances() {
        return -1;
    }

    /**
     * Always returns true: this generator never reports the end of the stream.
     */
    @Override
    public boolean hasMoreInstances() {
        return true;
    }

    /**
     * Always returns true; restart() re-reads the options and rebuilds all generator state.
     */
    @Override
    public boolean isRestartable() {
        return true;
    }

    /**
     * Not implemented: nothing is appended to sb and indent is ignored.
     *
     * @param sb     builder that would receive the description (left untouched)
     * @param indent indentation level (unused)
     */
    @Override
    public void getDescription(StringBuilder sb, int indent) {
        // TODO Auto-generated method stub
    }

}