Example usage for the weka.filters.unsupervised.attribute.RemoveUseless constructor

Introduction

This page collects example usages of the weka.filters.unsupervised.attribute.RemoveUseless constructor, drawn from open-source projects.

Prototype

RemoveUseless()
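
Before the project examples below, here is a minimal, self-contained sketch of the usual construct/configure/apply pattern. The file name, class index, and threshold value are illustrative assumptions, not taken from any example on this page:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.RemoveUseless;

public class RemoveUselessExample {

    public static void main(String[] args) throws Exception {
        // Load a dataset; "data.arff" is a placeholder file name
        Instances data = DataSource.read("data.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // Construct and configure the filter, then set the input format
        RemoveUseless ru = new RemoveUseless();
        ru.setMaximumVariancePercentageAllowed(99.0); // Weka's default threshold
        ru.setInputFormat(data);

        // Apply the filter; constant attributes and nominal attributes
        // with too many distinct values are dropped
        Instances filtered = Filter.useFilter(data, ru);

        System.out.println("Attributes before: " + data.numAttributes()
                + ", after: " + filtered.numAttributes());
    }
}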

Usage

From source file:br.com.ufu.lsi.rebfnetwork.RBFModel.java

License:Open Source License

/**
 * Method used to pre-process the data, perform clustering, and
 * set the initial parameter vector.
 */
protected Instances initializeClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    data = new Instances(data);
    data.deleteWithMissingClass();

    // Make sure data is shuffled
    Random random = new Random(m_Seed);
    if (data.numInstances() > 2) {
        random = data.getRandomNumberGenerator(m_Seed);
    }
    data.randomize(random);

    double y0 = data.instance(0).classValue(); // Not relevant in the classification case
    int index = 1;
    while (index < data.numInstances() && data.instance(index).classValue() == y0) {
        index++;
    }
    if (index == data.numInstances()) {
        // degenerate case, all class values are equal
        // we don't want to deal with this, too much hassle
        throw new Exception("All class values are the same. At least two class values should be different");
    }
    double y1 = data.instance(index).classValue();

    // Replace missing values   
    m_ReplaceMissingValues = new ReplaceMissingValues();
    m_ReplaceMissingValues.setInputFormat(data);
    data = Filter.useFilter(data, m_ReplaceMissingValues);

    // Remove useless attributes
    m_AttFilter = new RemoveUseless();
    m_AttFilter.setInputFormat(data);
    data = Filter.useFilter(data, m_AttFilter);

    // only class? -> build ZeroR model
    if (data.numAttributes() == 1) {
        System.err.println(
                "Cannot build model (only class attribute present in data after removing useless attributes!), "
                        + "using ZeroR model instead!");
        m_ZeroR = new weka.classifiers.rules.ZeroR();
        m_ZeroR.buildClassifier(data);
        return data;
    } else {
        m_ZeroR = null;
    }

    // Transform attributes
    m_NominalToBinary = new NominalToBinary();
    m_NominalToBinary.setInputFormat(data);
    data = Filter.useFilter(data, m_NominalToBinary);

    m_Filter = new Normalize();
    ((Normalize) m_Filter).setIgnoreClass(true);
    m_Filter.setInputFormat(data);
    data = Filter.useFilter(data, m_Filter);
    double z0 = data.instance(0).classValue(); // Not relevant in the classification case
    double z1 = data.instance(index).classValue();
    m_x1 = (y0 - y1) / (z0 - z1); // no division by zero: y0 != y1, and normalization is affine with positive scale, so z0 != z1
    m_x0 = (y0 - m_x1 * z0); // = y1 - m_x1 * z1

    m_classIndex = data.classIndex();
    m_numClasses = data.numClasses();
    m_numAttributes = data.numAttributes();

    // Run k-means
    SimpleKMeans skm = new SimpleKMeans();
    skm.setMaxIterations(10000);
    skm.setNumClusters(m_numUnits);
    Remove rm = new Remove();
    data.setClassIndex(-1);
    rm.setAttributeIndices((m_classIndex + 1) + "");
    rm.setInputFormat(data);
    Instances dataRemoved = Filter.useFilter(data, rm);
    data.setClassIndex(m_classIndex);
    skm.buildClusterer(dataRemoved);
    Instances centers = skm.getClusterCentroids();

    if (centers.numInstances() < m_numUnits) {
        m_numUnits = centers.numInstances();
    }

    // Set up arrays
    OFFSET_WEIGHTS = 0;
    if (m_useAttributeWeights) {
        OFFSET_ATTRIBUTE_WEIGHTS = (m_numUnits + 1) * m_numClasses;
        OFFSET_CENTERS = OFFSET_ATTRIBUTE_WEIGHTS + m_numAttributes;
    } else {
        OFFSET_ATTRIBUTE_WEIGHTS = -1;
        OFFSET_CENTERS = (m_numUnits + 1) * m_numClasses;
    }
    OFFSET_SCALES = OFFSET_CENTERS + m_numUnits * m_numAttributes;

    switch (m_scaleOptimizationOption) {
    case USE_GLOBAL_SCALE:
        m_RBFParameters = new double[OFFSET_SCALES + 1];
        break;
    case USE_SCALE_PER_UNIT_AND_ATTRIBUTE:
        m_RBFParameters = new double[OFFSET_SCALES + m_numUnits * m_numAttributes];
        break;
    default:
        m_RBFParameters = new double[OFFSET_SCALES + m_numUnits];
        break;
    }

    // Set initial radius based on distance to nearest other basis function
    double maxMinDist = -1;
    for (int i = 0; i < centers.numInstances(); i++) {
        double minDist = Double.MAX_VALUE;
        for (int j = i + 1; j < centers.numInstances(); j++) {
            double dist = 0;
            for (int k = 0; k < centers.numAttributes(); k++) {
                if (k != centers.classIndex()) {
                    double diff = centers.instance(i).value(k) - centers.instance(j).value(k);
                    dist += diff * diff;
                }
            }
            if (dist < minDist) {
                minDist = dist;
            }
        }
        if ((minDist != Double.MAX_VALUE) && (minDist > maxMinDist)) {
            maxMinDist = minDist;
        }
    }

    // Initialize parameters
    if (m_scaleOptimizationOption == USE_GLOBAL_SCALE) {
        m_RBFParameters[OFFSET_SCALES] = Math.sqrt(maxMinDist);
    }
    for (int i = 0; i < m_numUnits; i++) {
        if (m_scaleOptimizationOption == USE_SCALE_PER_UNIT) {
            m_RBFParameters[OFFSET_SCALES + i] = Math.sqrt(maxMinDist);
        }
        int k = 0;
        for (int j = 0; j < m_numAttributes; j++) {
            if (k == centers.classIndex()) {
                k++;
            }
            if (j != data.classIndex()) {
                if (m_scaleOptimizationOption == USE_SCALE_PER_UNIT_AND_ATTRIBUTE) {
                    m_RBFParameters[OFFSET_SCALES + (i * m_numAttributes + j)] = Math.sqrt(maxMinDist);
                }
                m_RBFParameters[OFFSET_CENTERS + (i * m_numAttributes) + j] = centers.instance(i).value(k);
                k++;
            }
        }
    }

    if (m_useAttributeWeights) {
        for (int j = 0; j < m_numAttributes; j++) {
            if (j != data.classIndex()) {
                m_RBFParameters[OFFSET_ATTRIBUTE_WEIGHTS + j] = 1.0;
            }
        }
    }

    initializeOutputLayer(random);

    return data;
}
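
Worth noting in the method above: missing values are replaced before RemoveUseless is applied, so attributes are judged for usefulness on complete data, and RemoveUseless runs before NominalToBinary and Normalize, presumably so that constant attributes are dropped before being expanded into binary indicators or rescaled.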

From source file:com.deafgoat.ml.prognosticator.InstancesFilter.java

License:Apache License

/**
 * Applies the RemoveUseless filter, which removes attributes that do not
 * vary at all, as well as nominal attributes whose percentage of distinct
 * values exceeds the given maximum.
 * 
 * @param variance
 *            The maximum variance percentage allowed (the filter's -M
 *            option; Weka's default is 99)
 * @throws Exception
 *             If the filter could not be applied
 */
public void removeUselessFilter(String variance) throws Exception {
    if (_logger.isDebugEnabled()) {
        _logger.debug("Applying remove useless filter");
    }
    // A FilteredClassifier might be employed instead in production
    RemoveUseless ru = new RemoveUseless();
    String[] options = new String[2];
    options[0] = "-M";
    options[1] = variance;
    ru.setOptions(options);
    ru.setInputFormat(_instances);
    _instances = Filter.useFilter(_instances, ru);
}
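
A note on the options above: -M maps to RemoveUseless's maximumVariancePercentageAllowed parameter, so nominal attributes whose number of distinct values exceeds this percentage of the number of instances are removed, together with attributes that do not vary at all (Weka's default is 99). The options are set before setInputFormat(), the order Weka filters generally expect.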

From source file:graph.clustering.NodeClusterer.java

License:Apache License

private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) {
    String[] filterOptions = new String[10];
    filterOptions[0] = "-R"; // attribute indices
    filterOptions[1] = "first-last";
    filterOptions[2] = "-W"; // number of words to attempt to keep (per class, if a class attribute is assigned)
    filterOptions[3] = "1000";
    filterOptions[4] = "-prune-rate"; // dictionary pruning rate (-1 = no periodic pruning)
    filterOptions[5] = "-1.0";
    filterOptions[6] = "-N"; // 0 = no normalization
    filterOptions[7] = "0";
    filterOptions[8] = "-M"; // The minimum term frequency
    filterOptions[9] = "1";

    SnowballStemmer stemmer = new SnowballStemmer();
    stemmer.setStemmer("english");
    WordTokenizer tokenizer = new WordTokenizer();

    StringToWordVector s2wFilterer = new StringToWordVector();
    try {
        s2wFilterer.setOptions(filterOptions);
        s2wFilterer.setStemmer(stemmer);
        s2wFilterer.setTokenizer(tokenizer);
        s2wFilterer.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer);
    } catch (Exception e1) {
        System.out.println("Error in converting string into word vectors:");
        e1.printStackTrace();
    }

    RemoveUseless ruFilter = new RemoveUseless();
    try {
        ruFilter.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter);
    } catch (Exception e1) {
        System.out.println("Error in removing useless terms:");
        e1.printStackTrace();
    }

    return clusterTrainingSet;
}
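
In this text-clustering pipeline, RemoveUseless is applied after StringToWordVector, so word attributes that ended up constant across all nodes (for example, terms reduced to zero everywhere by pruning) are dropped before clustering, shrinking the attribute space the clusterer has to search.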

From source file:Helper.CustomFilter.java

public Instances removeAttribute(Instances structure) throws Exception {
    //NORMALIZE AND REMOVE USELESS ATTRIBUTES
    Normalize norm = new Normalize();
    norm.setInputFormat(structure);
    structure = Filter.useFilter(structure, norm);

    RemoveUseless ru = new RemoveUseless();
    ru.setInputFormat(structure);
    structure = Filter.useFilter(structure, ru);

    // NOTE: the evaluator is built here, but its ranking is never applied,
    // so rank and eval have no effect on the returned Instances
    Ranker rank = new Ranker();
    InfoGainAttributeEval eval = new InfoGainAttributeEval();
    eval.buildEvaluator(structure);
    //END OF NORMALIZATION

    return structure;
}

From source file:LogReg.Logistic.java

License:Open Source License

/**
 * Builds the classifier.
 *
 * @param train the training data to be used for generating the
 * boosted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances train) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(train);

    // remove instances with missing class
    train = new Instances(train);
    train.deleteWithMissingClass();

    // Replace missing values   
    m_ReplaceMissingValues = new ReplaceMissingValues();
    m_ReplaceMissingValues.setInputFormat(train);
    train = Filter.useFilter(train, m_ReplaceMissingValues);

    // Remove useless attributes
    m_AttFilter = new RemoveUseless();
    m_AttFilter.setInputFormat(train);
    train = Filter.useFilter(train, m_AttFilter);

    // Transform attributes
    m_NominalToBinary = new NominalToBinary();
    m_NominalToBinary.setInputFormat(train);
    train = Filter.useFilter(train, m_NominalToBinary);

    // Save the structure for printing the model
    m_structure = new Instances(train, 0);

    // Extract data
    m_ClassIndex = train.classIndex();
    m_NumClasses = train.numClasses();

    int nK = m_NumClasses - 1; // Only K-1 class labels needed 
    int nR = m_NumPredictors = train.numAttributes() - 1;
    int nC = train.numInstances();

    m_Data = new double[nC][nR + 1]; // Data values
    int[] Y = new int[nC]; // Class labels
    double[] xMean = new double[nR + 1]; // Attribute means
    xSD = new double[nR + 1]; // Attribute stddev's
    double[] sY = new double[nK + 1]; // Instance count for each class
    double[] weights = new double[nC]; // Weights of instances
    double totWeights = 0; // Total weights of the instances
    m_Par = new double[nR + 1][nK]; // Optimized parameter values

    if (m_Debug) {
        System.out.println("Extracting data...");
    }

    for (int i = 0; i < nC; i++) {
        // initialize X[][]
        Instance current = train.instance(i);
        Y[i] = (int) current.classValue(); // Class value starts from 0
        weights[i] = current.weight(); // Dealing with weights
        totWeights += weights[i];

        m_Data[i][0] = 1;
        int j = 1;
        for (int k = 0; k <= nR; k++) {
            if (k != m_ClassIndex) {
                double x = current.value(k);
                m_Data[i][j] = x;
                xMean[j] += weights[i] * x;
                xSD[j] += weights[i] * x * x;
                j++;
            }
        }

        // Class count
        sY[Y[i]]++;
    }

    if ((totWeights <= 1) && (nC > 1))
        throw new Exception("Sum of weights of instances is not greater than 1, please reweight!");

    xMean[0] = 0;
    xSD[0] = 1;
    for (int j = 1; j <= nR; j++) {
        xMean[j] = xMean[j] / totWeights;
        if (totWeights > 1)
            xSD[j] = Math.sqrt(Math.abs(xSD[j] - totWeights * xMean[j] * xMean[j]) / (totWeights - 1));
        else
            xSD[j] = 0;
    }

    if (m_Debug) {
        // Output stats about input data
        System.out.println("Descriptives...");
        for (int m = 0; m <= nK; m++)
            System.out.println(sY[m] + " cases have class " + m);
        System.out.println("\n Variable     Avg       SD    ");
        for (int j = 1; j <= nR; j++)
            System.out.println(Utils.doubleToString(j, 8, 4) + Utils.doubleToString(xMean[j], 10, 4)
                    + Utils.doubleToString(xSD[j], 10, 4));
    }

    // Normalise input data 
    for (int i = 0; i < nC; i++) {
        for (int j = 0; j <= nR; j++) {
            if (xSD[j] != 0) {
                m_Data[i][j] = (m_Data[i][j] - xMean[j]) / xSD[j];
            }
        }
    }

    if (m_Debug) {
        System.out.println("\nIteration History...");
    }

    double x[] = new double[(nR + 1) * nK];
    double[][] b = new double[2][x.length]; // Boundary constraints, N/A here

    // Initialize
    for (int p = 0; p < nK; p++) {
        int offset = p * (nR + 1);
        x[offset] = Math.log(sY[p] + 1.0) - Math.log(sY[nK] + 1.0); // Null model
        b[0][offset] = Double.NaN;
        b[1][offset] = Double.NaN;
        for (int q = 1; q <= nR; q++) {
            x[offset + q] = 0.0;
            b[0][offset + q] = Double.NaN;
            b[1][offset + q] = Double.NaN;
        }
    }

    OptEng opt = new OptEng();
    opt.setDebug(m_Debug);
    opt.setWeights(weights);
    opt.setClassLabels(Y);

    if (m_MaxIts == -1) { // Search until convergence
        x = opt.findArgmin(x, b);
        while (x == null) {
            x = opt.getVarbValues();
            if (m_Debug)
                System.out.println("200 iterations finished, not enough!");
            x = opt.findArgmin(x, b);
        }
        if (m_Debug)
            System.out.println(" -------------<Converged>--------------");
    } else {
        opt.setMaxIteration(m_MaxIts);
        x = opt.findArgmin(x, b);
        if (x == null) // Not enough, but use the current value
            x = opt.getVarbValues();
    }

    m_LL = -opt.getMinFunction(); // Log-likelihood

    // Don't need data matrix anymore
    m_Data = null;

    // Convert coefficients back to non-normalized attribute units
    for (int i = 0; i < nK; i++) {
        m_Par[0][i] = x[i * (nR + 1)];
        for (int j = 1; j <= nR; j++) {
            m_Par[j][i] = x[i * (nR + 1) + j];
            if (xSD[j] != 0) {
                m_Par[j][i] /= xSD[j];
                m_Par[0][i] -= m_Par[j][i] * xMean[j];
            }
        }
    }
}
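
The preprocessing at the top of this method (ReplaceMissingValues, then RemoveUseless, then NominalToBinary) is the same chain used in the RBFModel example at the start of this page; RemoveUseless again serves to drop attributes that cannot contribute to the model before the more expensive transformations run.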