Example usage for weka.core Instance value

List of usage examples for the weka.core.Instance.value method

Introduction

On this page you can find example usages of the weka.core.Instance.value method.

Prototype

public double value(Attribute att);

Source Link

Document

Returns an instance's attribute value in internal format.

Usage

From source file:org.opentox.jaqpot3.qsar.util.WekaInstancesProcess.java

License:Open Source License

/**
 * Maps the names of the first {@code numAttributes} attributes of an instance
 * to their values in Weka's internal format.
 * <p>
 * Values that are {@code NaN} or infinite are stored as {@code 0}.
 *
 * @param inst          the instance whose attribute values are read
 * @param numAttributes how many leading attributes to read; it needs to be
 *                      set before the new attributes are added. A value
 *                      {@code <= 0} yields an empty map.
 * @return a new mutable map from attribute name to sanitized value
 */
public static Map<String, Double> getInstanceAttributeValues(Instance inst, int numAttributes) {
    Map<String, Double> featureMap = new HashMap<String, Double>();
    for (int i = 0; i < numAttributes; ++i) {
        // Read the value once instead of once for the check and once for the result.
        double value = inst.value(i);
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            value = 0;
        }
        featureMap.put(inst.attribute(i).name(), value);
    }
    return featureMap;
}

From source file:org.opentox.jaqpot3.qsar.util.WekaInstancesProcess.java

License:Open Source License

/**
 * Declares, in the given evaluation context, the value of every data field
 * whose name matches one of the first {@code numAttributes} attributes of
 * the instance. Only the first matching attribute is used for each field.
 *
 * @param inst          the instance supplying attribute names and values
 * @param numAttributes how many leading attributes to consider; it needs to
 *                      be set before the new attributes are added
 * @param context       the context that receives the declarations
 * @param dataFields    the fields to look up among the attributes
 * @return the same {@code context} instance that was passed in
 */
public static PMMLEvaluationContext getInstanceAttributeFieldRefValues(Instance inst, int numAttributes,
        PMMLEvaluationContext context, List<DataField> dataFields) {
    for (DataField field : dataFields) {
        boolean declared = false;
        int attrIdx = 0;
        // Scan the attributes until the field has been declared or none are left.
        while (!declared && attrIdx < numAttributes) {
            String attrName = inst.attribute(attrIdx).name();
            String fieldName = field.getName().toString();
            if (StringUtils.equals(attrName, fieldName)) {
                context.declare(field.getName(), inst.value(attrIdx));
                declared = true;
            }
            attrIdx++;
        }
    }
    return context;
}

From source file:org.opentox.toxotis.factory.DatasetFactory.java

License:Open Source License

/**
 * Builds a {@link DataEntry data entry} out of one single instance.
 * <p>
 * An attribute named {@code Dataset.COMPOUND_URI} or {@code "URI"} provides the
 * compound of the entry; every other attribute is turned into a feature value.
 *
 * @param instance
 *      The instance to be converted.
 * @return
 *      A Data Entry that corresponds to the provided instance.
 * @throws ToxOtisException
 *      If a {@code URISyntaxException} is raised while the entry is built.
 */
public DataEntry createDataEntry(Instance instance) throws ToxOtisException {
    final Enumeration attrEnum = instance.enumerateAttributes();
    final DataEntry entry = new DataEntry();
    try {
        while (attrEnum.hasMoreElements()) {
            final Attribute attribute = (Attribute) attrEnum.nextElement();
            final String attrName = attribute.name();

            // The URI attribute identifies the compound rather than a feature.
            if (attrName.equals(Dataset.COMPOUND_URI) || attrName.equals("URI")) {
                entry.setConformer(new Compound(new VRI(instance.stringValue(attribute))));
                continue;
            }

            FeatureValue featureValue = new FeatureValue();
            Feature feature = new Feature(new VRI(attribute.name()));
            LiteralValue literal = null;

            if (attribute.isNumeric()) {
                literal = new LiteralValue<Double>(instance.value(attribute), XSDDatatype.XSDdouble);
                feature.getOntologicalClasses().add(OTClasses.numericFeature());
            } else if (attribute.isString() || attribute.isDate()) {
                literal = new LiteralValue<String>(instance.stringValue(attribute), XSDDatatype.XSDstring);
                feature.getOntologicalClasses().add(OTClasses.stringFeature());
            } else if (attribute.isNominal()) {
                literal = new LiteralValue<String>(instance.stringValue(attribute), XSDDatatype.XSDstring);
                Enumeration admissible = attribute.enumerateValues();
                feature.getOntologicalClasses().add(OTClasses.nominalFeature());
                // Register every nominal label as an admissible value of the feature.
                while (admissible.hasMoreElements()) {
                    String label = (String) admissible.nextElement();
                    feature.getAdmissibleValues().add(new LiteralValue<String>(label, XSDDatatype.XSDstring));
                }
            }

            featureValue.setFeature(feature);
            featureValue.setValue(literal);
            entry.addFeatureValue(featureValue);
        }
    } catch (URISyntaxException ex) {
        throw new ToxOtisException(ex);
    }
    return entry;
}

From source file:org.packDataMining.SMOTE.java

License:Open Source License

/**
 * The procedure implementing the SMOTE algorithm. Every input instance is
 * copied onto the output queue, followed by the synthetic minority-class
 * instances generated from it.
 * <p>
 * The minority class is either detected (the non-empty class with the fewest
 * instances) or taken from the class-value option, which may be
 * {@code "first"}, {@code "last"} or a 1-based class index.
 *
 * @throws Exception    if provided options cannot be executed
 *          on input instances
 */
protected void doSMOTE() throws Exception {
    int minIndex = 0;
    int min = Integer.MAX_VALUE;
    if (m_DetectMinorityClass) {
        // find minority class
        int[] classCounts = getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts;
        for (int i = 0; i < classCounts.length; i++) {
            if (classCounts[i] != 0 && classCounts[i] < min) {
                min = classCounts[i];
                minIndex = i;
            }
        }
    } else {
        String classVal = getClassValue();
        if (classVal.equalsIgnoreCase("first")) {
            minIndex = 1;
        } else if (classVal.equalsIgnoreCase("last")) {
            minIndex = getInputFormat().numClasses();
        } else {
            minIndex = Integer.parseInt(classVal);
        }
        if (minIndex > getInputFormat().numClasses()) {
            throw new Exception("value index must be <= the number of classes");
        }
        minIndex--; // make it an index
    }

    // never ask for more neighbors than the minority class can provide
    int nearestNeighbors;
    if (min <= getNearestNeighbors()) {
        nearestNeighbors = min - 1;
    } else {
        nearestNeighbors = getNearestNeighbors();
    }
    if (nearestNeighbors < 1) {
        throw new Exception("Cannot use 0 neighbors!");
    }

    // compose minority class dataset
    // also push all dataset instances
    Instances sample = getInputFormat().stringFreeStructure();
    Enumeration instanceEnum = getInputFormat().enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = (Instance) instanceEnum.nextElement();
        push((Instance) instance.copy());
        if ((int) instance.classValue() == minIndex) {
            sample.add(instance);
        }
    }

    // compute Value Distance Metric matrices for nominal features
    Map vdmMap = new HashMap();
    Enumeration attrEnum = getInputFormat().enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        if (!attr.equals(getInputFormat().classAttribute())) {
            if (attr.isNominal() || attr.isString()) {
                double[][] vdm = new double[attr.numValues()][attr.numValues()];
                vdmMap.put(attr, vdm);
                int[] featureValueCounts = new int[attr.numValues()];
                int[][] featureValueCountsByClass =
                        new int[getInputFormat().classAttribute().numValues()][attr.numValues()];
                instanceEnum = getInputFormat().enumerateInstances();
                while (instanceEnum.hasMoreElements()) {
                    Instance instance = (Instance) instanceEnum.nextElement();
                    int value = (int) instance.value(attr);
                    int classValue = (int) instance.classValue();
                    featureValueCounts[value]++;
                    featureValueCountsByClass[classValue][value]++;
                }
                // vdm[a][b] = sum over classes of |P(class | a) - P(class | b)|
                for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) {
                    for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) {
                        double sum = 0;
                        for (int classValueIndex = 0; classValueIndex < getInputFormat()
                                .numClasses(); classValueIndex++) {
                            double c1i = (double) featureValueCountsByClass[classValueIndex][valueIndex1];
                            double c2i = (double) featureValueCountsByClass[classValueIndex][valueIndex2];
                            double c1 = (double) featureValueCounts[valueIndex1];
                            double c2 = (double) featureValueCounts[valueIndex2];
                            double term1 = c1i / c1;
                            double term2 = c2i / c2;
                            sum += Math.abs(term1 - term2);
                        }
                        vdm[valueIndex1][valueIndex2] = sum;
                    }
                }
            }
        }
    }

    // use this random source for all required randomness
    Random rand = new Random(getRandomSeed());

    // find the set of extra indices to use if the percentage is not evenly divisible by 100
    List extraIndices = new LinkedList();
    double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0);
    int extraIndicesCount = (int) (percentageRemainder * sample.numInstances());
    if (extraIndicesCount >= 1) {
        for (int i = 0; i < sample.numInstances(); i++) {
            extraIndices.add(i);
        }
    }
    Collections.shuffle(extraIndices, rand);
    extraIndices = extraIndices.subList(0, extraIndicesCount);
    Set extraIndexSet = new HashSet(extraIndices);

    // the main loop to handle computing nearest neighbors and generating SMOTE
    // examples from each instance in the original minority class data
    Instance[] nnArray = new Instance[nearestNeighbors];
    for (int i = 0; i < sample.numInstances(); i++) {
        Instance instanceI = sample.instance(i);
        // find k nearest neighbors for each instance
        List distanceToInstance = new LinkedList();
        for (int j = 0; j < sample.numInstances(); j++) {
            Instance instanceJ = sample.instance(j);
            if (i != j) {
                double distance = 0;
                attrEnum = getInputFormat().enumerateAttributes();
                while (attrEnum.hasMoreElements()) {
                    Attribute attr = (Attribute) attrEnum.nextElement();
                    if (!attr.equals(getInputFormat().classAttribute())) {
                        double iVal = instanceI.value(attr);
                        double jVal = instanceJ.value(attr);
                        if (attr.isNumeric()) {
                            distance += Math.pow(iVal - jVal, 2);
                        } else {
                            distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal];
                        }
                    }
                }
                distance = Math.pow(distance, .5);
                distanceToInstance.add(new Object[] { distance, instanceJ });
            }
        }

        // sort the neighbors according to distance
        Collections.sort(distanceToInstance, new Comparator() {
            @Override
            public int compare(Object o1, Object o2) {
                double distance1 = (Double) ((Object[]) o1)[0];
                double distance2 = (Double) ((Object[]) o2)[0];
                // Truncating ceil(distance1 - distance2) to int reported 0 for
                // every difference in (-1, 0] while the mirrored comparison
                // reported 1, which breaks the Comparator contract and can
                // mis-order neighbors. Double.compare yields a consistent
                // total order.
                return Double.compare(distance1, distance2);
            }
        });

        // populate the actual nearest neighbor instance array
        Iterator entryIterator = distanceToInstance.iterator();
        int j = 0;
        while (entryIterator.hasNext() && j < nearestNeighbors) {
            nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1];
            j++;
        }

        // create synthetic examples: floor(percentage / 100) per instance, plus
        // one more for the instances drawn into the extra index set
        int n = (int) Math.floor(getPercentage() / 100);
        while (n > 0 || extraIndexSet.remove(i)) {
            double[] values = new double[sample.numAttributes()];
            int nn = rand.nextInt(nearestNeighbors);
            attrEnum = getInputFormat().enumerateAttributes();
            while (attrEnum.hasMoreElements()) {
                Attribute attr = (Attribute) attrEnum.nextElement();
                if (!attr.equals(getInputFormat().classAttribute())) {
                    if (attr.isNumeric()) {
                        // random point on the segment between the instance and its neighbor
                        double dif = nnArray[nn].value(attr) - instanceI.value(attr);
                        double gap = rand.nextDouble();
                        values[attr.index()] = (double) (instanceI.value(attr) + gap * dif);
                    } else if (attr.isDate()) {
                        double dif = nnArray[nn].value(attr) - instanceI.value(attr);
                        double gap = rand.nextDouble();
                        values[attr.index()] = (long) (instanceI.value(attr) + gap * dif);
                    } else {
                        // majority vote among the instance and its neighbors
                        int[] valueCounts = new int[attr.numValues()];
                        int iVal = (int) instanceI.value(attr);
                        valueCounts[iVal]++;
                        for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) {
                            int val = (int) nnArray[nnEx].value(attr);
                            valueCounts[val]++;
                        }
                        int maxIndex = 0;
                        int max = Integer.MIN_VALUE;
                        for (int index = 0; index < attr.numValues(); index++) {
                            if (valueCounts[index] > max) {
                                max = valueCounts[index];
                                maxIndex = index;
                            }
                        }
                        values[attr.index()] = maxIndex;
                    }
                }
            }
            values[sample.classIndex()] = minIndex;
            Instance synthetic = new Instance(1.0, values);
            push(synthetic);
            n--;
        }
    }
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Collects the value/class pairs of one attribute and stores them in the
 * given map.
 * <p>
 * The instances are sorted by the attribute first. For every instance whose
 * value for that attribute is not {@code NaN}, an entry with the keys
 * {@code "value"} (the nominal label, or the numeric value) and
 * {@code "classprob"} (the class label) is recorded. If at least one entry
 * was recorded, the list is stored under {@code "dataArray"}, the nominal
 * flag under {@code "isNominal"}, and the map is handed to
 * {@code setDistributionData}.
 *
 * @param instances the instances to read; they are sorted in place
 * @param attIndex  index of the attribute to get the distribution of
 * @param distMap   map to put the distribution data into
 * @return the same {@code distMap} instance that was passed in
 * @throws Exception declared by the signature
 */
protected HashMap addDistributionData(Instances instances, int attIndex, HashMap distMap) throws Exception {
    ArrayList<Object> distData = new ArrayList<Object>();
    boolean isNominal = false;
    instances.sort(attIndex);
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance inst = instances.instance(i);
        double rawValue = inst.value(attIndex);
        if (Double.isNaN(rawValue)) {
            // NaN values are left out of the distribution
            continue;
        }
        Map<String, Comparable> entry = new HashMap<String, Comparable>();
        if (inst.attribute(attIndex).isNominal()) {
            entry.put("value", inst.attribute(attIndex).value((int) rawValue));
            isNominal = true;
        } else {
            entry.put("value", rawValue);
        }
        entry.put("classprob", inst.classAttribute().value((int) inst.classValue()));
        distData.add(entry);
    }
    if (!distData.isEmpty()) {
        distMap.put("dataArray", distData);
        distMap.put("isNominal", isNominal);
        setDistributionData(distMap);
    }
    return distMap;
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Recursively backfits data into the tree.
 * /*from  w ww . j av  a 2s . c o  m*/
 * @param data
 *            the data to work with
 * @param classProbs
 *            the class distribution
 * @throws Exception
 *             if generation fails
 */
protected void backfitData(Instances data, double[] classProbs) throws Exception {

    // Make leaf if there are no training instances
    if (data.numInstances() == 0) {
        m_Attribute = -1;
        m_ClassDistribution = null;
        m_Prop = null;
        return;
    }

    // Check if node doesn't contain enough instances or is pure
    // or maximum depth reached
    m_ClassDistribution = classProbs.clone();

    /*
     * if (Utils.sum(m_ClassDistribution) < 2 * m_MinNum ||
     * Utils.eq(m_ClassDistribution[Utils.maxIndex(m_ClassDistribution)],
     * Utils .sum(m_ClassDistribution))) {
     * 
     * // Make leaf m_Attribute = -1; m_Prop = null; return; }
     */

    // Are we at an inner node
    if (m_Attribute > -1) {

        // Compute new weights for subsets based on backfit data
        m_Prop = new double[m_Successors.length];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (!inst.isMissing(m_Attribute)) {
                if (data.attribute(m_Attribute).isNominal()) {
                    m_Prop[(int) inst.value(m_Attribute)] += inst.weight();
                } else {
                    m_Prop[(inst.value(m_Attribute) < m_SplitPoint) ? 0 : 1] += inst.weight();
                }
            }
        }

        // If we only have missing values we can make this node into a leaf
        if (Utils.sum(m_Prop) <= 0) {
            m_Attribute = -1;
            m_Prop = null;
            return;
        }

        // Otherwise normalize the proportions
        Utils.normalize(m_Prop);

        // Split data
        Instances[] subsets = splitData(data);

        // Go through subsets
        for (int i = 0; i < subsets.length; i++) {

            // Compute distribution for current subset
            double[] dist = new double[data.numClasses()];
            for (int j = 0; j < subsets[i].numInstances(); j++) {
                dist[(int) subsets[i].instance(j).classValue()] += subsets[i].instance(j).weight();
            }

            // Backfit subset
            m_Successors[i].backfitData(subsets[i], dist);
        }

        // If unclassified instances are allowed, we don't need to store the
        // class distribution
        if (getAllowUnclassifiedInstances()) {
            m_ClassDistribution = null;
            return;
        }

        // Otherwise, if all successors are non-empty, we don't need to
        // store the class distribution
        boolean emptySuccessor = false;
        for (int i = 0; i < subsets.length; i++) {
            if (m_Successors[i].m_ClassDistribution == null) {
                emptySuccessor = true;
                return;
            }
        }
        m_ClassDistribution = null;

        // If we have a least two non-empty successors, we should keep this
        // tree
        /*
         * int nonEmptySuccessors = 0; for (int i = 0; i < subsets.length;
         * i++) { if (m_Successors[i].m_ClassDistribution != null) {
         * nonEmptySuccessors++; if (nonEmptySuccessors > 1) { return; } } }
         * 
         * // Otherwise, this node is a leaf or should become a leaf
         * m_Successors = null; m_Attribute = -1; m_Prop = null; return;
         */
    }
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Recursively generates a tree.
 * 
 * @param data
 *            the data to work with
 * @param classProbs
 *            the class distribution
 * @param header
 *            the header of the data
 * @param minNum
 *            the minimum number of instances per leaf
 * @param debug
 *            whether debugging is on
 * @param attIndicesWindow
 *            the attribute window to choose attributes from
 * @param random
 *            random number generator for choosing random attributes
 * @param depth
 *            the current depth
 * @param determineStructure
 *            whether to determine structure
 * @param m_distributionData
 *            HashMap to put distribution data if getSplitData is true in
 *            any node
 * @throws Exception
 *             if generation fails
 */
protected void buildTree(Instances data, double[] classProbs, Instances header, boolean debug, int depth,
        JsonNode node, int parent_index, HashMap m_distributionData, Instances requiredInstances,
        LinkedHashMap<String, Classifier> custom_classifiers, List<CustomSet> cSList,
        CustomClassifierService ccService, Dataset ds) throws Exception {

    if (mapper == null) {
        mapper = new ObjectMapper();
    }
    // Store structure of dataset, set minimum number of instances
    m_Info = header;
    m_Debug = debug;

    // if in dead json return
    if (node == null) {
        m_Attribute = -1;
        m_ClassDistribution = null;
        m_Prop = null;
        return;
    }

    // Make leaf if there are no training instances
    if (data.numInstances() == 0) {
        m_Attribute = -1;
        m_ClassDistribution = null;
        m_Prop = null;
        return;
    }

    // Check if node doesn't contain enough instances or is pure
    // or maximum depth reached
    m_ClassDistribution = classProbs.clone();
    cSetList = cSList;
    ccSer = ccService;
    d = ds;

    // if (Utils.sum(m_ClassDistribution) < 2 * m_MinNum
    // || Utils.eq(m_ClassDistribution[Utils.maxIndex(m_ClassDistribution)],
    // Utils
    // .sum(m_ClassDistribution))
    // || ((getMaxDepth() > 0) && (depth >= getMaxDepth()))) {
    // // Make leaf
    // m_Attribute = -1;
    // m_Prop = null;
    // return;
    // }

    // Investigate the selected attribute
    int attIndex = parent_index;

    // options child added by web client developer
    // TODO work with him to make a more meaningful structure...
    JsonNode options = node.get("options");
    if (options == null) {
        return;
    }
    String kind = options.get("kind").asText();
    JsonNode att_name = options.get("attribute_name");
    Boolean getSplitData = false;
    Boolean getInstanceData = false;
    // this allows me to modify the json tree structure to add data about
    // the evaluation
    ObjectNode evalresults = (ObjectNode) options;
    ObjectNode _node = (ObjectNode) node;
    //For Roc - Node Match
    _node.set("roc_uid_0", null);
    _node.set("roc_uid_1", null);
    Map<String, JsonNode> sons = new HashMap<String, JsonNode>();
    // String name = node_name.asText();
    if (kind != null && kind.equals("split_node") && att_name != null) { //
        // attIndex = data.attribute(node_id.asText()).index();
        if (!att_name.asText().equals("") && !att_name.asText().contains("custom_classifier")
                && !att_name.asText().contains("custom_tree") && !att_name.asText().contains("custom_set")) {
            attIndex = data.attribute(att_name.asText()).index();
        } else {
            if (att_name.asText().contains("custom_set")) {
                int ctr = 0;
                for (CustomSet c : cSList) {
                    if (c.getId() == Long.valueOf(att_name.asText().replace("custom_set_", ""))) {
                        break;
                    }
                    ctr++;
                }
                attIndex = (data.numAttributes() - 1) + custom_classifiers.size() + ctr;
            } else {
                if (att_name.asText().contains("custom_classifier_new")) {
                    HashMap mp = ccSer.buildCustomClasifier(data,
                            Long.valueOf(att_name.asText().replace("custom_classifier_new_", "")));
                    Classifier fc = (Classifier) mp.get("classifier");
                    custom_classifiers.put("custom_classifier_" + mp.get("id"), fc);
                    evalresults.put("unique_id", "custom_classifier_" + mp.get("id"));
                    evalresults.put("attribute_name", "custom_classifier_" + mp.get("id"));
                    att_name = evalresults.get("attribute_name");
                }
                int ctr = 0;
                for (String key : custom_classifiers.keySet()) {
                    if (key.equals(att_name.asText())) {
                        break;
                    }
                    ctr++;
                }
                attIndex = (data.numAttributes() - 1) + ctr;
            }
        }
        if (node.get("getSplitData") != null) {
            getSplitData = node.get("getSplitData").asBoolean();
        }
        JsonNode split_values = node.get("children");
        int c = 0;
        if (split_values != null && split_values.size() > 0) {
            for (JsonNode svalue : split_values) {
                String key = svalue.get("name").asText();
                JsonNode son = svalue.get("children").get(0);
                if (key.contains("<")) {
                    key = "low";
                } else if (key.contains(">")) {
                    key = "high";
                }
                sons.put(key, son);
                c++;
            }
        }
        // LOGGER.debug("Id name "+att_name+" index "+attIndex+" type "+kind+" sons "+c);
    } else {
        // LOGGER.debug("non split node, name "+att_name+" type "+kind);
    }

    double[] vals = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()];
    double[][][] dists = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()][0][0];
    double[][] props = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()][0];
    double[] splits = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()];
    listOfFc = custom_classifiers;
    // Compute class distributions and value of splitting
    // criterion for each attribute
    HashMap<String, Double> mp = new HashMap<String, Double>();
    if (attIndex >= data.numAttributes() && attIndex < data.numAttributes() + custom_classifiers.size()) {
        mp = distribution(props, dists, attIndex, data, Double.NaN, custom_classifiers);
    } else if (attIndex >= data.numAttributes() + custom_classifiers.size() - 1) {
        mp = distribution(props, dists, attIndex, data, Double.NaN, custom_classifiers);
    } else {
        if (options.get("split_point") != null) {
            mp = distribution(props, dists, attIndex, data, options.get("split_point").asDouble(),
                    custom_classifiers);
        } else {
            mp = distribution(props, dists, attIndex, data, Double.NaN, custom_classifiers);
        }
    }

    splits[attIndex] = mp.get("split_point");
    vals[attIndex] = gain(dists[attIndex], priorVal(dists[attIndex]));

    m_Attribute = attIndex;
    double[][] distribution = dists[m_Attribute];

    // stop if input json tree does not contain any more children
    // replacing Utils.gr(vals[m_Attribute], 0)&&
    if (kind != null && kind.equals("split_node") && att_name != null) {
        //Assign Classes for custom sets(visual splits).
        m_ClassAssignment.put("Inside", Utils.maxIndex(dists[m_Attribute][1]));
        m_ClassAssignment.put("Outside", (Utils.maxIndex(dists[m_Attribute][1]) == 1) ? 0 : 1);
        // Build subtrees
        m_SplitPoint = splits[m_Attribute];
        m_Prop = props[m_Attribute];
        Instances[] subsets = splitData(data);
        m_Successors = new ManualTree[distribution.length];

        // record quantity and quality measures for node
        int quantity = 0;
        for (int i = 0; i < distribution.length; i++) {
            quantity += subsets[i].numInstances();
        }
        evalresults.put("bin_size", quantity);
        evalresults.put("infogain", vals[m_Attribute]);
        evalresults.put("majClass", m_Info.classAttribute().value(Utils.maxIndex(m_ClassDistribution)));
        evalresults.put("split_point", m_SplitPoint);
        evalresults.put("orig_split_point", mp.get("orig_split_point"));

        if (Boolean.TRUE.equals(getSplitData)) {
            addDistributionData(data, m_Attribute, m_distributionData);
        }

        int maxIndex = 0;
        double maxCount = 0;
        double errors = 0;
        double[] classDist = new double[2];
        double pct_correct = 0;
        double bin_size = 0;

        for (int i = 0; i < distribution.length; i++) {
            m_Successors[i] = new ManualTree();
            m_Successors[i].setKValue(m_KValue);
            m_Successors[i].setMaxDepth(getMaxDepth());

            //To compute class distribution for split node.
            for (int j = 0; j < distribution[i].length; j++) {
                classDist[j] += distribution[i][j];
            }
            // test an instance to see which child node to send its subset
            // down.
            // after split, should hold for all in set
            String child_name = "";
            Instances subset = subsets[i];
            if (subset == null || subset.numInstances() == 0) {
                continue;
            }
            Instance inst = subset.instance(0);
            if (m_Attribute >= data.numAttributes()
                    && m_Attribute < data.numAttributes() + custom_classifiers.size()) {
                double predictedClass = custom_classifiers.get(att_name.asText()).classifyInstance(inst);
                child_name = m_Info.classAttribute().value((int) predictedClass);

            } else if (m_Attribute >= data.numAttributes() + custom_classifiers.size() - 1) {
                CustomSet cSet = getReqCustomSet(
                        m_Attribute - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList);
                JsonNode vertices = mapper.readTree(cSet.getConstraints());
                ArrayList<double[]> attrVertices = generateVerticesList(vertices);
                List<Attribute> aList = generateAttributeList(cSet, data, ds);
                double[] testPoint = new double[2];
                testPoint[0] = inst.value(aList.get(0));
                testPoint[1] = inst.value(aList.get(1));
                int check = checkPointInPolygon(attrVertices, testPoint);
                if (check == 0) {
                    child_name = "Outside";
                } else {
                    child_name = "Inside";
                }
            } else {
                // which nominal attribute is this split linked to?
                if (subset.attribute(m_Attribute).isNominal()) {
                    child_name = inst.attribute(m_Attribute).value((int) inst.value(m_Attribute));
                }
                // otherwise, if we have a numeric attribute, are we going
                // high or low?
                else if (data.attribute(m_Attribute).isNumeric()) {
                    if (inst.value(m_Attribute) < m_SplitPoint) {
                        child_name = "low";
                    } else {
                        child_name = "high";
                    }
                }
            }
            m_Successors[i].setM_ClassAssignment((HashMap<String, Integer>) m_ClassAssignment.clone());
            JsonNode son = sons.get(child_name);
            if (son != null) {
                m_Successors[i].buildTree(subsets[i], distribution[i], header, m_Debug, depth + 1, son,
                        attIndex, m_distributionData, requiredInstances, custom_classifiers, cSList, ccService,
                        ds);
            } else {
                // if we are a split node with no input children, we need to
                // add them into the tree
                // JsonNode split_values = node.get("children");
                if (kind != null && kind.equals("split_node")) {
                    ArrayNode children = (ArrayNode) node.get("children");
                    if (children == null) {
                        children = mapper.createArrayNode();
                    }
                    ObjectNode child = mapper.createObjectNode();
                    child.put("name", child_name);
                    ObjectNode c_options = mapper.createObjectNode();
                    c_options.put("attribute_name", child_name);
                    c_options.put("kind", "split_value");
                    child.put("options", c_options);
                    children.add(child);
                    _node.put("children", children);
                    m_Successors[i].buildTree(subsets[i], distribution[i], header, m_Debug, depth + 1, child,
                            attIndex, m_distributionData, requiredInstances, custom_classifiers, cSList,
                            ccService, ds);

                } else {
                    // for leaf nodes, calling again ends the cycle and
                    // fills up the bins appropriately
                    m_Successors[i].buildTree(subsets[i], distribution[i], header, m_Debug, depth + 1, node,
                            attIndex, m_distributionData, requiredInstances, custom_classifiers, cSList,
                            ccService, ds);
                }
            }
        }

        // Compute pct_correct from distributions and send to split_node
        bin_size = Utils.sum(classDist);
        maxIndex = Utils.maxIndex(classDist);
        maxCount = classDist[maxIndex];
        String class_name = m_Info.classAttribute().value(maxIndex);
        _node.put("majClass", class_name);
        errors += bin_size - maxCount;

        pct_correct = (quantity - errors) / quantity;
        evalresults.put("pct_correct", pct_correct);
        // If all successors are non-empty, we don't need to store the class
        // distribution
        boolean emptySuccessor = false;
        for (int i = 0; i < subsets.length; i++) {
            if (m_Successors[i].m_ClassDistribution == null) {
                emptySuccessor = true;
                break;
            }
        }
        if (!emptySuccessor) {
            m_ClassDistribution = null;
        }
    } else {
        m_Attribute = -1;
        if (kind != null && kind.equals("leaf_node")) {
            double bin_size = 0, maxCount = 0;
            int maxIndex = 0;
            double errors = 0;
            double pct_correct = 0;
            if (m_ClassDistribution != null) {
                bin_size = Utils.sum(m_ClassDistribution);
                maxIndex = Utils.maxIndex(m_ClassDistribution); // this is
                // where it
                // decides
                // what
                // class the
                // leaf is..
                // takes the
                // majority.
                maxCount = m_ClassDistribution[maxIndex];
                errors = bin_size - maxCount;
                pct_correct = (bin_size - errors) / bin_size;
            }
            if (node.get("pickInst") != null) {
                getInstanceData = node.get("pickInst").asBoolean();
            }
            if (Boolean.TRUE.equals(getInstanceData)) {
                requiredInstances.delete();
                for (int k = 0; k < data.numInstances(); k++) {
                    requiredInstances.add(data.instance(k));
                }
            }
            String class_name = m_Info.classAttribute().value(maxIndex);
            _node.put("majClass", class_name);
            if (node.get("setClass") != null) {
                String setClass = node.get("setClass").asText();
                class_name = m_Info.classAttribute().value(m_ClassAssignment.get(setClass));
            }
            _node.put("name", class_name);
            evalresults.put("attribute_name", class_name);
            evalresults.put("kind", "leaf_node");
            evalresults.put("bin_size", Utils.doubleToString(bin_size, 2));
            evalresults.put("errors", Utils.doubleToString(errors, 2));
            evalresults.put("pct_correct", Utils.doubleToString(pct_correct, 2));
            this.setJsonnode(_node);
        } else {
            // Make leaf

            // add the data to the json object
            double bin_size = 0, maxCount = 0;
            int maxIndex = 0;
            double errors = 0;
            double pct_correct = 0;
            if (m_ClassDistribution != null) {
                bin_size = Utils.sum(m_ClassDistribution);
                maxIndex = Utils.maxIndex(m_ClassDistribution); // this is
                // where it
                // decides
                // what
                // class the
                // leaf is..
                // takes the
                // majority.
                maxCount = m_ClassDistribution[maxIndex];
                errors = bin_size - maxCount;
                pct_correct = (bin_size - errors) / bin_size;
            }
            ArrayNode children = (ArrayNode) node.get("children");
            if (children == null) {
                children = mapper.createArrayNode();
            }
            ObjectNode child = mapper.createObjectNode();
            String class_name = m_Info.classAttribute().value(maxIndex);
            child.put("majClass", class_name);
            String nodeName = node.get("name").asText();
            if (nodeName.equals("Inside") || nodeName.equals("Outside")) {
                child.put("setClass", nodeName);
                class_name = m_Info.classAttribute().value(m_ClassAssignment.get(nodeName));
            }
            child.put("name", class_name);
            ObjectNode c_options = mapper.createObjectNode();
            c_options.put("attribute_name", class_name);
            c_options.put("kind", "leaf_node");
            c_options.put("bin_size", Utils.doubleToString(bin_size, 2));
            c_options.put("errors", Utils.doubleToString(errors, 2));
            c_options.put("pct_correct", Utils.doubleToString(pct_correct, 2));
            child.put("options", c_options);
            children.add(child);
            _node.put("children", children);
            this.setJsonnode(child);
        }
    }
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Computes the class distribution obtained by splitting {@code data} on one
 * split candidate and stores it, together with the branch proportions, in
 * the caller-supplied arrays.
 * <p>
 * The index {@code att} selects the kind of split:
 * <ul>
 * <li>{@code att < data.numAttributes()}: a regular attribute of
 * {@code data} (nominal or numeric);</li>
 * <li>{@code data.numAttributes() <= att < data.numAttributes() + custom_classifiers.size()}:
 * a custom classifier, resolved through {@code getKeyinMap};</li>
 * <li>any larger index: a custom set (polygon constraint), resolved through
 * {@code getReqCustomSet} on the {@code cSetList} field.</li>
 * </ul>
 * For numeric attributes {@code data} is sorted in place by {@code att}.
 *
 * @param props
 *            output; {@code props[att]} is replaced by the normalized
 *            weight of each branch (uniform if the total weight is zero)
 * @param dists
 *            output; {@code dists[att]} is set to the computed
 *            branch-by-class weight matrix
 * @param att
 *            the split index (attribute, custom classifier or custom set)
 * @param data
 *            the data to work with
 * @param givenSplitPoint
 *            split point to use for a numeric attribute, or
 *            {@code Double.NaN} to search for the split point with the
 *            highest gain
 * @param custom_classifiers
 *            custom classifiers keyed by id
 * @return a map holding {@code "split_point"} (the split point actually
 *         used) and {@code "orig_split_point"} (the best split point found
 *         by the gain search, {@code 0} if none was computed)
 * @throws Exception
 *             if something goes wrong
 */
protected HashMap<String, Double> distribution(double[][] props, double[][][] dists, int att, Instances data,
        double givenSplitPoint, HashMap<String, Classifier> custom_classifiers) throws Exception {

    HashMap<String, Double> mp = new HashMap<String, Double>();
    double splitPoint = givenSplitPoint;
    double origSplitPoint = 0;
    Attribute attribute = null;
    double[][] dist = null;
    int indexOfFirstMissingValue = -1;
    String CustomClassifierId = null;
    CustomSet cSet = null;

    // Decide which kind of split the index refers to.
    if (att >= data.numAttributes() && att < data.numAttributes() + custom_classifiers.size()) {
        CustomClassifierId = getKeyinMap(custom_classifiers, att, data);
    } else if (att >= data.numAttributes() + custom_classifiers.size()) {
        cSet = getReqCustomSet(att - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList);
    } else {
        attribute = data.attribute(att);
    }

    if (CustomClassifierId == null && cSet == null) {
        if (attribute.isNominal()) {

            // For nominal attributes: one branch per attribute value
            dist = new double[attribute.numValues()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                if (inst.isMissing(att)) {

                    // Skip missing values at this stage, but remember where
                    // the first one is so they can be distributed later
                    if (indexOfFirstMissingValue < 0) {
                        indexOfFirstMissingValue = i;
                    }
                    continue;
                }
                dist[(int) inst.value(att)][(int) inst.classValue()] += inst.weight();
            }
        } else {

            // For numeric attributes: branch 0 is "low", branch 1 is "high"
            double[][] currDist = new double[2][data.numClasses()];
            dist = new double[2][data.numClasses()];

            // Sort data (the loops below stop at the first missing value)
            data.sort(att);

            // Move all instances into second subset
            for (int j = 0; j < data.numInstances(); j++) {
                Instance inst = data.instance(j);
                if (inst.isMissing(att)) {

                    // Can stop as soon as we hit a missing value
                    indexOfFirstMissingValue = j;
                    break;
                }
                currDist[1][(int) inst.classValue()] += inst.weight();
            }

            // Value before splitting
            double priorVal = priorVal(currDist);

            // Save initial distribution
            for (int j = 0; j < currDist.length; j++) {
                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
            }

            if (Double.isNaN(splitPoint)) {

                // No split point given: try all possible split points
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {

                        // Can stop as soon as we hit a missing value
                        break;
                    }

                    // Can we place a sensible split point here?
                    if (inst.value(att) > currSplit) {

                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);

                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {

                            // Store value of current point
                            bestVal = currVal;

                            // Save split point
                            splitPoint = (inst.value(att) + currSplit) / 2.0;
                            origSplitPoint = splitPoint;

                            // Save distribution
                            for (int j = 0; j < currDist.length; j++) {
                                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                            }
                        }
                    }
                    currSplit = inst.value(att);

                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                }
            } else {

                // Split point given: still run the gain search so that the
                // best computed split point can be reported alongside it
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {

                        // Can stop as soon as we hit a missing value
                        break;
                    }
                    if (inst.value(att) > currSplit) {

                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);

                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {

                            // Store value of current point
                            bestVal = currVal;

                            // Save computed split point
                            origSplitPoint = (inst.value(att) + currSplit) / 2.0;
                        }
                    }
                    currSplit = inst.value(att);

                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                    if (inst.value(att) <= splitPoint) {

                        // Save distribution since split point is specified;
                        // instances equal to the split point count as "low"
                        for (int j = 0; j < currDist.length; j++) {
                            System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                        }
                    }
                }
            }
        }
    } else if (CustomClassifierId != null) {

        // Custom classifier: one branch per predicted class
        Classifier fc = custom_classifiers.get(CustomClassifierId);
        dist = new double[data.numClasses()][data.numClasses()];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            double predictedClass = fc.classifyInstance(inst);

            // A missing prediction is NaN. NaN must be detected with isNaN:
            // a "!=" comparison against NaN is always true, which would
            // count unclassified instances in branch 0.
            if (!Double.isNaN(predictedClass)) {
                dist[(int) predictedClass][(int) inst.classValue()] += inst.weight();
            }
        }
    } else if (cSet != null) {

        // Custom set: branch index is the result of the point-in-polygon test
        dist = new double[data.numClasses()][data.numClasses()];
        JsonNode vertices = mapper.readTree(cSet.getConstraints());
        ArrayList<double[]> attrVertices = generateVerticesList(vertices);
        List<Attribute> aList = generateAttributeList(cSet, data, d);
        for (int k = 0; k < data.numInstances(); k++) {
            double[] testPoint = new double[2];
            int ctr = 0;
            // NOTE(review): coordinates are packed in order of the
            // non-missing attributes, so a missing first attribute puts the
            // second value into slot 0 - confirm this is intended.
            for (Attribute a : aList) {
                if (!data.instance(k).isMissing(a)) {
                    testPoint[ctr] = data.instance(k).value(a);
                    ctr++;
                }
            }
            int check = checkPointInPolygon(attrVertices, testPoint);
            dist[check][(int) data.instance(k).classValue()] += data.instance(k).weight();
        }
    }

    // Compute weights for subsets
    props[att] = new double[dist.length];
    for (int k = 0; k < props[att].length; k++) {
        props[att][k] = Utils.sum(dist[k]);
    }
    if (Utils.eq(Utils.sum(props[att]), 0)) {

        // No weight at all: fall back to uniform proportions
        for (int k = 0; k < props[att].length; k++) {
            props[att][k] = 1.0 / props[att].length;
        }
    } else {
        Utils.normalize(props[att]);
    }

    // Any instances with missing values ? (only set for regular attributes,
    // so "attribute" is non-null here)
    if (indexOfFirstMissingValue > -1) {

        // Distribute weights for instances with missing values
        for (int i = indexOfFirstMissingValue; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (attribute.isNominal()) {

                // Need to check if attribute value is missing
                if (inst.isMissing(att)) {
                    for (int j = 0; j < dist.length; j++) {
                        dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                    }
                }
            } else {

                // Can be sure that value is missing, so no test required
                for (int j = 0; j < dist.length; j++) {
                    dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                }
            }
        }
    }

    // Return distribution and split point
    dists[att] = dist;
    mp.put("split_point", splitPoint);
    mp.put("orig_split_point", origSplitPoint);
    return mp;
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Computes class distribution of an instance using the decision tree.
 * /*from ww  w .  ja  v  a  2  s. c  o m*/
 * @param instance
 *            the instance to compute the distribution for
 * @return the computed class distribution
 * @throws Exception
 *             if computation fails
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {

    // default model?
    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    }

    double[] returnedDist = null;

    //Set Parent Node to set m_pred in case custom set occurs.
    if (m_Successors != null) {
        for (int i = 0; i < m_Successors.length; i++) {
            m_Successors[i].setParentNode(this.parentNode);
        }
    }

    if (m_Info != null) {
        if (m_Attribute > -1 && m_Attribute < m_Info.numAttributes()) {

            // Node is not a leaf
            if (instance.isMissing(m_Attribute)) {
                LOGGER.debug("Missing attribute");
                // Value is missing
                returnedDist = new double[m_Info.numClasses()];

                // Split instance up
                for (int i = 0; i < m_Successors.length; i++) {
                    double[] help = m_Successors[i].distributionForInstance(instance);
                    if (help != null) {
                        for (int j = 0; j < help.length; j++) {
                            returnedDist[j] += m_Prop[i] * help[j];
                        }
                    }
                }
                LOGGER.debug("Missing Instance");
            } else if (m_Info.attribute(m_Attribute).isNominal()) {

                // For nominal attributes
                returnedDist = m_Successors[(int) instance.value(m_Attribute)]
                        .distributionForInstance(instance);
            } else {

                // For numeric attributes
                if (instance.value(m_Attribute) < m_SplitPoint) {
                    returnedDist = m_Successors[0].distributionForInstance(instance);
                } else {
                    returnedDist = m_Successors[1].distributionForInstance(instance);
                }
            }
        } else if (m_Attribute >= m_Info.numAttributes() - 1) {
            if (m_Attribute >= (listOfFc.size() + m_Info.numAttributes()) - 1) {
                CustomSet cSet = getReqCustomSet(m_Attribute - (listOfFc.size() - 1 + m_Info.numAttributes()),
                        cSetList);
                JsonNode vertices = mapper.readTree(cSet.getConstraints());
                ArrayList<double[]> attrVertices = generateVerticesList(vertices);
                List<Attribute> aList = generateAttributeList(cSet, m_Info, d);
                double[] testPoint = new double[2];
                testPoint[0] = instance.value(aList.get(0));
                testPoint[1] = instance.value(aList.get(1));
                int check = checkPointInPolygon(attrVertices, testPoint);
                if (m_Successors[check].getM_Attribute() == -1) {
                    parentNode.setM_pred(m_ClassAssignment.get((check == 0) ? "Outside" : "Inside"));
                }
                returnedDist = m_Successors[check].distributionForInstance(instance);

            } else {
                String classifierId = "";
                classifierId = getKeyinMap(listOfFc, m_Attribute, m_Info);
                Classifier fc = listOfFc.get(classifierId);
                double predictedClass = fc.classifyInstance(instance);
                if (predictedClass != Instance.missingValue()) {
                    returnedDist = m_Successors[(int) predictedClass].distributionForInstance(instance);
                }
            }
        }
    }

    // Node is a leaf or successor is empty?
    if ((m_Attribute == -1) || (returnedDist == null)) {

        // Is node empty?
        if (m_ClassDistribution == null) {
            if (getAllowUnclassifiedInstances()) {
                return new double[m_Info.numClasses()];
            } else {
                return null;
            }
        }

        // Else return normalized distribution
        double[] normalizedDistribution = m_ClassDistribution.clone();
        if (this.parentNode != null) {
            this.parentNode.setJsonnode(this.getJsonnode());
        }
        try {
            Utils.normalize(normalizedDistribution);
        } catch (Exception e) {
            LOGGER.error("Sum is 0. Coudln't Normalize");
        }
        return normalizedDistribution;
    } else {
        return returnedDist;
    }
}

From source file:org.scripps.branch.classifier.ManualTree.java

License:Open Source License

/**
 * Computes class distribution of an instance using the decision tree.
 * /*w ww.j a va2  s.  c  om*/
 * @param instance
 *            the instance to compute the distribution for
 * @return the computed class distribution
 * @throws Exception
 *             if computation fails
 */
public double[] predForInstance(Instance instance) throws Exception {

    // default model?
    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    }

    double[] returnedDist = null;

    if (m_Attribute > -1 && m_Attribute < m_Info.numAttributes()) {

        // Node is not a leaf
        if (instance.isMissing(m_Attribute)) {

            // Value is missing
            returnedDist = new double[m_Info.numClasses()];

            // Split instance up
            for (int i = 0; i < m_Successors.length; i++) {
                double[] help = m_Successors[i].distributionForInstance(instance);
                if (help != null) {
                    for (int j = 0; j < help.length; j++) {
                        returnedDist[j] += m_Prop[i] * help[j];
                    }
                }
            }
        } else if (m_Info.attribute(m_Attribute).isNominal()) {

            // For nominal attributes
            returnedDist = m_Successors[(int) instance.value(m_Attribute)].distributionForInstance(instance);
        } else {

            // For numeric attributes
            if (instance.value(m_Attribute) < m_SplitPoint) {
                returnedDist = m_Successors[0].distributionForInstance(instance);
            } else {
                returnedDist = m_Successors[1].distributionForInstance(instance);
            }
        }
    } else if (m_Attribute >= m_Info.numAttributes() - 1) {
        if (m_Attribute >= (listOfFc.size() + m_Info.numAttributes()) - 1) {
            CustomSet cSet = getReqCustomSet(m_Attribute - (listOfFc.size() - 1 + m_Info.numAttributes()),
                    cSetList);
            JsonNode vertices = mapper.readTree(cSet.getConstraints());
            ArrayList<double[]> attrVertices = generateVerticesList(vertices);
            List<Attribute> aList = generateAttributeList(cSet, m_Info, d);
            double[] testPoint = new double[2];
            testPoint[0] = instance.value(aList.get(0));
            testPoint[1] = instance.value(aList.get(1));
            int check = checkPointInPolygon(attrVertices, testPoint);
            returnedDist = m_Successors[check].distributionForInstance(instance);

        } else {
            String classifierId = "";
            classifierId = getKeyinMap(listOfFc, m_Attribute, m_Info);
            Classifier fc = listOfFc.get(classifierId);
            double predictedClass = fc.classifyInstance(instance);
            if (predictedClass != Instance.missingValue()) {
                returnedDist = m_Successors[(int) predictedClass].distributionForInstance(instance);
            }
        }
    }

    // Node is a leaf or successor is empty?
    if ((m_Attribute == -1) || (returnedDist == null)) {

        // Is node empty?
        if (m_ClassDistribution == null) {
            if (getAllowUnclassifiedInstances()) {
                return new double[m_Info.numClasses()];
            } else {
                return null;
            }
        }

        // Else return normalized distribution
        double[] normalizedDistribution = m_ClassDistribution.clone();
        Utils.normalize(normalizedDistribution);
        return normalizedDistribution;
    } else {
        return returnedDist;
    }
}