Example usage for weka.core Utils sm

List of usage examples for weka.core Utils sm

Introduction

In this page you can find the example usage for weka.core Utils sm.

Prototype

public static boolean sm(double a, double b)

Source Link

Document

Tests if a is smaller than b.

Usage

From source file:de.uni_potsdam.hpi.bpt.promnicat.analysisModules.clustering.ProcessInstances.java

License:Open Source License

/**
 * Tests whether the given instance is structurally compatible with this
 * dataset.
 * <p>
 * The check is shallow: the total attribute count and the string attribute
 * count must match, and every non-missing value of a nominal or string
 * attribute must be a whole number that is not smaller than 0 and not
 * greater than the attribute's number of values. Values of other attribute
 * types are not inspected.
 * <p>
 * NOTE(review): a value equal to {@code numValues()} passes the range test
 * as written; confirm whether the upper bound is meant to be inclusive.
 *
 * @param instance
 *            the instance to check
 * @return true if the instance is compatible with the dataset
 */
public/* @pure@ */boolean checkInstance(ProcessInstance instance) {

    // Both attribute counts have to agree before looking at single values.
    if (instance.numAttributes() != numAttributes()
            || instance.numStrAttributes() != numStrAttributes()) {
        return false;
    }

    for (int attIndex = 0; attIndex < numAttributes(); attIndex++) {
        // Missing values are compatible with any attribute.
        if (instance.isMissing(attIndex)) {
            continue;
        }
        // Only index-coded attributes (nominal / string) are range-checked.
        if (!attribute(attIndex).isNominal() && !attribute(attIndex).isString()) {
            continue;
        }

        double value = instance.value(attIndex);

        // The stored value must be a whole number (it is used as an index).
        boolean isWholeNumber = Utils.eq(value, (double) (int) value);
        if (!isWholeNumber) {
            return false;
        }

        // ... and it must lie inside the attribute's value range.
        boolean belowRange = Utils.sm(value, 0);
        if (belowRange || Utils.gr(value, attribute(attIndex).numValues())) {
            return false;
        }
    }
    return true;
}

From source file:j48.BinC45ModelSelection.java

License:Open Source License

/**
 * Selects C4.5-type split for the given dataset.
 * <p>
 * Builds one {@code BinC45Split} candidate per non-class attribute, keeps
 * the candidates whose {@code checkModel()} succeeds, and returns the one
 * with the highest gain ratio among those whose info gain is at least the
 * average info gain (minus 1E-3). A {@code NoSplit} model is returned when
 * there are too few instances, when all instances belong to one class, or
 * when no useful split is found.
 * <p>
 * The checks that depend on {@code m_allData} are skipped when it is
 * {@code null}, instead of failing with a {@code NullPointerException} that
 * would be swallowed by the catch block below.
 *
 * @param data the instances to select a split for
 * @return the selected split model, a {@code NoSplit} model if no split is
 *         appropriate, or {@code null} if an exception occurred (its stack
 *         trace is printed)
 */
public final ClassifierSplitModel selectModel(Instances data) {

    double minResult;
    BinC45Split[] currentModel;
    BinC45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    double sumOfWeights;
    int i;

    try {

        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass()))) {
            return noSplitModel;
        }

        // Check if all attributes are nominal and have a
        // lot of values. Without the full dataset there is no reference
        // size to compare against, so multiVal simply stays true.
        if (m_allData != null) {
            Enumeration enu = data.enumerateAttributes();
            while (enu.hasMoreElements()) {
                Attribute attribute = (Attribute) enu.nextElement();
                if ((attribute.isNumeric())
                        || (Utils.sm((double) attribute.numValues(), (0.3 * (double) m_allData.numInstances())))) {
                    multiVal = false;
                    break;
                }
            }
        }
        currentModel = new BinC45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();

        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {

            // Apart from class attribute.
            if (i != (data).classIndex()) {

                // Get models for current attribute.
                currentModel[i] = new BinC45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);

                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values. When m_allData is unavailable every
                // valid model is counted.
                if (currentModel[i].checkModel()) {
                    if ((m_allData == null) || (data.attribute(i).isNumeric())
                            || (multiVal || Utils.sm((double) data.attribute(i).numValues(),
                                    (0.3 * (double) m_allData.numInstances())))) {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
                }
            } else {
                currentModel[i] = null;
            }
        }

        // Check if any useful split was found.
        if (validModels == 0) {
            return noSplitModel;
        }
        averageInfoGain = averageInfoGain / (double) validModels;

        // Find "best" attribute to split on.
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel())) {

                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
            }
        }

        // Check if useful split was found. bestModel is only assigned
        // together with a gain ratio greater than 0, so it is non-null
        // past this point.
        if (Utils.eq(minResult, 0)) {
            return noSplitModel;
        }

        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

        // Set the split point analogue to C45 if attribute numeric.
        if (m_allData != null) {
            bestModel.setSplitPoint(m_allData);
        }
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

From source file:j48.BinC45Split.java

License:Open Source License

/**
 * Creates split on numeric attribute./*w  w  w .j a  v  a2 s  .  c  o  m*/
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class values for all Instances up to next 
            // possible split point.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute
            // values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                index++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (index == 0)
        return;

    // Compute modified information gain for best split.
    m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Set instance variables' values to values for
    // best split.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distributioN for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}

From source file:j48.C45ModelSelection.java

License:Open Source License

/**
 * Selects C4.5-type split for the given dataset.
 * <p>
 * One {@code C45Split} candidate is built per non-class attribute. Among the
 * candidates that pass {@code checkModel()} and whose info gain reaches the
 * average info gain (minus 1E-3), the one with the largest gain ratio wins.
 * A {@code NoSplit} model is returned when the data is too small, pure, or
 * no useful split exists. Checks involving {@code m_allData} are skipped
 * when it is {@code null}.
 *
 * @param data the instances to select a split for
 * @return the chosen split model, a {@code NoSplit} model, or {@code null}
 *         if an exception was thrown (its stack trace is printed)
 */
public final ClassifierSplitModel selectModel(Instances data) {

    try {

        // Stop right away if there are not enough instances to split or
        // if all of them belong to the same class.
        Distribution classCounts = new Distribution(data);
        NoSplit leaf = new NoSplit(classCounts);
        boolean tooFewInstances = Utils.sm(classCounts.total(), 2 * m_minNoObj);
        if (tooFewInstances
                || Utils.eq(classCounts.total(), classCounts.perClass(classCounts.maxClass()))) {
            return leaf;
        }

        // Stays true only if every attribute is nominal with a lot of
        // values (relative to the size of the full dataset).
        boolean allManyValued = true;
        if (m_allData != null) {
            for (Enumeration atts = data.enumerateAttributes(); atts.hasMoreElements();) {
                Attribute att = (Attribute) atts.nextElement();
                if (att.isNumeric()
                        || Utils.sm((double) att.numValues(), (0.3 * (double) m_allData.numInstances()))) {
                    allManyValued = false;
                    break;
                }
            }
        }

        C45Split[] candidates = new j48.C45Split[data.numAttributes()];
        double weightSum = data.sumOfWeights();
        double infoGainSum = 0;
        int usableModels = 0;

        // Build one candidate split per attribute, skipping the class.
        for (int att = 0; att < data.numAttributes(); att++) {
            if (att == data.classIndex()) {
                candidates[att] = null;
                continue;
            }

            candidates[att] = new j48.C45Split(att, m_minNoObj, weightSum);
            candidates[att].buildClassifier(data);
            if (!candidates[att].checkModel()) {
                continue;
            }

            // Without the full dataset every valid model counts; otherwise
            // enumerated attributes with a lot of values are left out
            // unless all attributes are of that kind.
            if (m_allData == null
                    || data.attribute(att).isNumeric()
                    || allManyValued
                    || Utils.sm((double) data.attribute(att).numValues(),
                            (0.3 * (double) m_allData.numInstances()))) {
                infoGainSum = infoGainSum + candidates[att].infoGain();
                usableModels++;
            }
        }

        // No attribute produced a usable split.
        if (usableModels == 0) {
            return leaf;
        }
        double meanInfoGain = infoGainSum / (double) usableModels;

        // Pick the candidate with the highest gain ratio among those with
        // at least average info gain.
        C45Split winner = null;
        double bestRatio = 0;
        for (int att = 0; att < data.numAttributes(); att++) {
            if (att == data.classIndex() || !candidates[att].checkModel()) {
                continue;
            }
            // 1E-3 gives a closer approximation to the original
            // implementation.
            if ((candidates[att].infoGain() >= (meanInfoGain - 1E-3))
                    && Utils.gr(candidates[att].gainRatio(), bestRatio)) {
                winner = candidates[att];
                bestRatio = candidates[att].gainRatio();
            }
        }

        // A best ratio of 0 means no candidate was selected.
        if (Utils.eq(bestRatio, 0)) {
            return leaf;
        }

        // Store the complete distribution with the model by adding the
        // instances whose value for the split attribute is unknown.
        winner.distribution().addInstWithUnknown(data, winner.attIndex());

        // Set the split point analogue to C45 if attribute numeric.
        if (m_allData != null) {
            winner.setSplitPoint(m_allData);
        }
        return winner;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

From source file:j48.C45Split.java

License:Open Source License

/**
 * Creates split on numeric attribute.
 * <p>
 * Walks over the leading instances whose value for {@code m_attIndex} is
 * known and treats each boundary between two adjacent values more than 1e-5
 * apart as a candidate split. The best candidate is written to
 * {@code m_splitPoint}, {@code m_distribution}, {@code m_infoGain},
 * {@code m_gainRatio} and {@code m_numSubsets}; {@code m_index} is
 * incremented once per candidate that satisfies the subset size constraint.
 * Returns early, without setting {@code m_numSubsets}, if no acceptable
 * split exists.
 * <p>
 * NOTE(review): presumably {@code trainInstances} is sorted on the attribute
 * with missing values last, and {@code m_index} is reset by the caller
 * before this method runs; confirm against the caller.
 *
 * @param trainInstances
 *            the training instances to derive the split from
 * @exception Exception
 *                if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    final int numClasses = trainInstances.numClasses();

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, numClasses);

    // Put every instance before the first missing value into bag 1.
    int firstMiss = 0;
    for (Enumeration known = trainInstances.enumerateInstances(); known.hasMoreElements();) {
        Instance inst = (Instance) known.nextElement();
        if (inst.isMissing(m_attIndex)) {
            break;
        }
        m_distribution.add(1, inst);
        firstMiss++;
    }

    // Required subset size: 10% of the known weight per class, but never
    // below m_minNoObj and never above 25.
    double minSplit = 0.1 * (m_distribution.total()) / ((double) numClasses);
    if (Utils.smOrEq(minSplit, m_minNoObj)) {
        minSplit = m_minNoObj;
    } else if (Utils.gr(minSplit, 25)) {
        minSplit = 25;
    }

    // Enough instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit)) {
        return;
    }

    // Try every possible split index.
    double baseEntropy = infoGainCrit.oldEnt(m_distribution);
    int bestIndex = -1;
    int shiftedUpTo = 0;
    for (int next = 1; next < firstMiss; next++) {

        double previousValue = trainInstances.instance(next - 1).value(m_attIndex);
        double nextValue = trainInstances.instance(next).value(m_attIndex);
        if (previousValue + 1e-5 < nextValue) {

            // Shift all instances up to this boundary into bag 0.
            m_distribution.shiftRange(1, 0, trainInstances, shiftedUpTo, next);

            // Score the boundary only if both subsets are large enough.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                double candidateGain = infoGainCrit.splitCritValue1(m_distribution, m_sumOfWeights,
                        baseEntropy, rrrrr);
                if (Utils.gr(candidateGain, m_infoGain)) {
                    m_infoGain = candidateGain;
                    bestIndex = next - 1;
                }
                m_index++;
            }
            shiftedUpTo = next;
        }
    }

    // Was there any useful split?
    if (m_index == 0) {
        return;
    }

    // Correct the information gain for the number of candidates tried.
    m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0)) {
        return;
    }

    // Commit the best split to the instance variables.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(bestIndex + 1).value(m_attIndex)
            + trainInstances.instance(bestIndex).value(m_attIndex)) / 2;

    // Guard against a numerical precision problem: if the midpoint equals
    // the larger value, use the smaller value instead.
    boolean collapsedOntoUpper =
            m_splitPoint == trainInstances.instance(bestIndex + 1).value(m_attIndex);
    if (collapsedOntoUpper) {
        m_splitPoint = trainInstances.instance(bestIndex).value(m_attIndex);
    }

    // Restore the distribution for the best split.
    m_distribution = new Distribution(2, numClasses);
    m_distribution.addRange(0, trainInstances, 0, bestIndex + 1);
    m_distribution.addRange(1, trainInstances, bestIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = gainRatioCrit.splitCritValue1(m_distribution, m_sumOfWeights, m_infoGain, lllll);
}

From source file:j48.NBTreeModelSelection.java

License:Open Source License

/**
 * Selects NBTree-type split for the given dataset.
 * <p>
 * A global {@code NBTreeNoSplit} model is built first. It is returned
 * directly when there are fewer than 5 instances, when it makes no errors,
 * when there are too few instances or only one class, when no attribute
 * yields a valid split, or when the best split reduces the global error by
 * less than 5%. Otherwise the split with the fewest errors is returned.
 *
 * @param data the instances to select a split for
 * @return the selected split model, the no-split model, or {@code null} if
 *         an exception was thrown (its stack trace is printed)
 */
public final ClassifierSplitModel selectModel(Instances data) {

    double globalErrors = 0;

    double minResult;
    NBTreeSplit[] currentModel;
    NBTreeSplit bestModel = null;
    NBTreeNoSplit noSplitModel = null;
    int validModels = 0;
    Distribution checkDistribution;
    double sumOfWeights;
    int i;

    try {
        // build the global model at this node
        noSplitModel = new NBTreeNoSplit();
        noSplitModel.buildClassifier(data);
        if (data.numInstances() < 5) {
            return noSplitModel;
        }

        // evaluate it
        globalErrors = noSplitModel.getErrors();
        if (globalErrors == 0) {
            return noSplitModel;
        }

        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        if (Utils.sm(checkDistribution.total(), m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass()))) {
            return noSplitModel;
        }

        currentModel = new NBTreeSplit[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();

        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {

            // Apart from class attribute.
            if (i != (data).classIndex()) {

                // Get models for current attribute.
                currentModel[i] = new NBTreeSplit(i, m_minNoObj, sumOfWeights);
                currentModel[i].setGlobalModel(noSplitModel);
                currentModel[i].buildClassifier(data);

                // Check if useful split for current attribute exists.
                if (currentModel[i].checkModel()) {
                    validModels++;
                }
            } else {
                currentModel[i] = null;
            }
        }

        // Check if any useful split was found.
        if (validModels == 0) {
            return noSplitModel;
        }

        // Find "best" attribute to split on: the valid model with the
        // fewest errors, provided it beats the global model.
        minResult = globalErrors;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel())) {
                if (currentModel[i].getErrors() < minResult) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].getErrors();
                }
            }
        }

        // Check if useful split was found: require a relative error
        // reduction of at least 5%. If no model beat the global one,
        // minResult still equals globalErrors, the reduction is 0 and the
        // no-split model is returned, so bestModel is non-null below.
        if (((globalErrors - minResult) / globalErrors) < 0.05) {
            return noSplitModel;
        }

        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}