Example usage for weka.core Attribute equals

List of usage examples for weka.core Attribute equals

Introduction

On this page you can find example usages of weka.core.Attribute.equals.

Prototype

@Override
public final boolean equals(Object other) 

Source Link

Document

Tests if given attribute is equal to this attribute.

Usage

From source file:elh.eus.absa.Features.java

License:Open Source License

/**
 *  Adds frequency attribute +1 (or +(1/tokenNum) to an attribute in the given feature vector
 * /*ww  w .java  2 s  .  c  o m*/
 * @param Attribute att to add a value to.
 * @param double[] fVector feature vector where the value should be added
 * 
 */
private void addNumericToFeatureVector(Attribute att, double[] fVector, int sentTokNum) {
    if (!att.equals(null)) {
        int current_ind = att.index();
        //update feature value in the feature vector 
        fVector[current_ind] = fVector[current_ind] + (1 / (double) sentTokNum);
    }
}

From source file:gov.va.chir.tagline.dao.DatasetUtil.java

License:Open Source License

/**
 * Builds a dataset named "train" from the given documents, producing one instance per line.
 *
 * <p>The documents are scanned twice: the first pass collects the feature types, the distinct
 * values of nominal features, the line labels and the document names; the second pass fills
 * in the instance values. The attribute order is: document ID, line ID, every collected
 * feature (in sorted name order), and finally the label, which is set as the class attribute.
 *
 * <p>A feature that is absent from a particular line (and from its document) is recorded as
 * a missing value for that instance.
 *
 * @param documents documents whose lines become the instances of the dataset
 * @return the populated dataset with the label attribute set as class
 */
public static Instances createDataset(final Collection<Document> documents) {

    // Key = feature name | Value = number representing NUMERIC, NOMINAL, etc.
    final Map<String, Integer> featureType = new TreeMap<String, Integer>();

    // Key = feature name | Values = distinct values for NOMINAL values
    final Map<String, Set<String>> nominalFeatureMap = new HashMap<String, Set<String>>();

    final Set<String> labels = new TreeSet<String>();
    final Set<String> docIds = new TreeSet<String>();

    // First scan -- determine attribute values
    for (Document document : documents) {
        processFeatures(document.getFeatures(), featureType, nominalFeatureMap);
        docIds.add(document.getName());

        for (Line line : document.getLines()) {
            processFeatures(line.getFeatures(), featureType, nominalFeatureMap);

            labels.add(line.getLabel());
        }
    }

    final ArrayList<Attribute> attributes = new ArrayList<Attribute>();

    // Add Document and Line IDs as first two attributes. The document ID is built from the
    // full list of document names; the line ID uses the name-only constructor, the same one
    // used for NUMERIC features below.
    final Attribute docId = new Attribute(DOC_ID, new ArrayList<String>(docIds));
    final Attribute lineId = new Attribute(LINE_ID);

    attributes.add(docId);
    attributes.add(lineId);

    // Build attributes
    for (String feature : featureType.keySet()) {
        final int type = featureType.get(feature);

        if (type == Attribute.NUMERIC) {
            attributes.add(new Attribute(feature));
        } else {
            // Non-numeric features without a recorded value set are skipped entirely
            if (nominalFeatureMap.containsKey(feature)) {
                attributes.add(new Attribute(feature, new ArrayList<String>(nominalFeatureMap.get(feature))));
            }
        }
    }

    // Add class attribute
    Attribute classAttr = new Attribute(LABEL, new ArrayList<String>(labels));
    attributes.add(classAttr);

    final Instances instances = new Instances("train", attributes, documents.size());

    // Second scan -- add data
    for (Document document : documents) {
        final Map<String, Object> docFeatures = document.getFeatures();

        for (Line line : document.getLines()) {
            final Instance instance = new DenseInstance(attributes.size());

            // NOTE(review): putAll writes the document-level features into the map returned
            // by line.getFeatures(); if that is the line's own map, the line is modified as
            // a side effect -- confirm this is intended.
            final Map<String, Object> lineFeatures = line.getFeatures();
            lineFeatures.putAll(docFeatures);

            instance.setValue(docId, document.getName());
            instance.setValue(lineId, line.getLineId());
            instance.setValue(classAttr, line.getLabel());

            for (Attribute attribute : attributes) {
                if (attribute.equals(docId) || attribute.equals(lineId) || attribute.equals(classAttr)) {
                    // Already populated above
                    continue;
                }

                final Object obj = lineFeatures.get(attribute.name());

                if (obj == null) {
                    // The attribute list is the union over all lines, so this line may
                    // simply not carry the feature: record it as missing instead of
                    // failing with a NullPointerException on obj.toString().
                    instance.setMissing(attribute);
                } else if (obj instanceof Double) {
                    instance.setValue(attribute, ((Double) obj).doubleValue());
                } else if (obj instanceof Integer) {
                    instance.setValue(attribute, ((Integer) obj).doubleValue());
                } else {
                    instance.setValue(attribute, obj.toString());
                }
            }

            instances.add(instance);
        }
    }

    // Set last attribute as class
    instances.setClassIndex(attributes.size() - 1);

    return instances;
}

From source file:gov.va.chir.tagline.dao.DatasetUtil.java

License:Open Source License

/**
 * Builds a dataset with the structure of an existing header from the given documents,
 * producing one instance per line.
 *
 * <p>The header's document ID attribute is first extended (via the {@code AddValues} filter)
 * with the names of the passed-in documents. Lines without a label get a missing class
 * value, and a feature that is absent from a line (and from its document) is recorded as a
 * missing value.
 *
 * @param header dataset whose attribute structure is reused
 * @param documents documents whose lines become the instances of the dataset
 * @return the populated dataset
 * @throws Exception if configuring or applying the {@code AddValues} filter fails
 */
@SuppressWarnings("unchecked")
public static Instances createDataset(final Instances header, final Collection<Document> documents)
        throws Exception {

    // Update header to include all docIDs from the passed in documents
    // (Weka requires all values for nominal features)
    final Set<String> docIds = new TreeSet<String>();

    for (Document document : documents) {
        docIds.add(document.getName());
    }

    final AddValues avf = new AddValues();
    avf.setLabels(StringUtils.join(docIds, ","));

    // The attribute index is handed over as a string and is interpreted as 1-based
    // (Weka's SingleIndex convention), whereas Attribute.index() is 0-based -- hence + 1.
    // Calling path: AddValues.setInputFormat() -->
    //               SingleIndex.setUpper() -->
    //               SingleIndex.setValue()
    avf.setAttributeIndex(String.valueOf(header.attribute(DOC_ID).index() + 1));

    avf.setInputFormat(header);
    final Instances newHeader = Filter.useFilter(header, avf);

    final Instances instances = new Instances(newHeader, documents.size());

    // Map attributes by name
    final Map<String, Attribute> attrMap = new HashMap<String, Attribute>();

    final Enumeration<Attribute> en = newHeader.enumerateAttributes();

    while (en.hasMoreElements()) {
        final Attribute attr = en.nextElement();

        attrMap.put(attr.name(), attr);
    }

    // The class attribute is registered explicitly in addition to the enumerated ones
    attrMap.put(newHeader.classAttribute().name(), newHeader.classAttribute());

    final Attribute docId = attrMap.get(DOC_ID);
    final Attribute lineId = attrMap.get(LINE_ID);
    final Attribute classAttr = attrMap.get(LABEL);

    // Add data
    for (Document document : documents) {
        final Map<String, Object> docFeatures = document.getFeatures();

        for (Line line : document.getLines()) {
            final Instance instance = new DenseInstance(attrMap.size());

            // NOTE(review): putAll writes the document-level features into the map returned
            // by line.getFeatures(); if that is the line's own map, the line is modified as
            // a side effect -- confirm this is intended.
            final Map<String, Object> lineFeatures = line.getFeatures();
            lineFeatures.putAll(docFeatures);

            instance.setValue(docId, document.getName());
            instance.setValue(lineId, line.getLineId());

            if (line.getLabel() == null) {
                instance.setMissing(classAttr);
            } else {
                instance.setValue(classAttr, line.getLabel());
            }

            for (Attribute attribute : attrMap.values()) {
                if (attribute.equals(docId) || attribute.equals(lineId) || attribute.equals(classAttr)) {
                    // Already populated above
                    continue;
                }

                final Object obj = lineFeatures.get(attribute.name());

                if (obj == null) {
                    // The header may define features this line does not carry: record
                    // them as missing instead of failing with a NullPointerException on
                    // obj.toString().
                    instance.setMissing(attribute);
                } else if (obj instanceof Double) {
                    instance.setValue(attribute, ((Double) obj).doubleValue());
                } else if (obj instanceof Integer) {
                    instance.setValue(attribute, ((Integer) obj).doubleValue());
                } else {
                    instance.setValue(attribute, obj.toString());
                }
            }

            instances.add(instance);
        }
    }

    // Set last attribute as class
    instances.setClassIndex(attrMap.size() - 1);

    return instances;
}

From source file:org.packDataMining.SMOTE.java

License:Open Source License

/**
 * The procedure implementing the SMOTE algorithm. The output
 * instances are pushed onto the output queue for collection.
 *
 * <p>Every input instance is pushed unchanged; in addition, synthetic instances are
 * generated for the selected (minority) class by interpolating between each of its
 * instances and one of that instance's nearest neighbours of the same class.
 *
 * @throws Exception    if provided options cannot be executed
 *          on input instances (class value index larger than the number of
 *          classes, or fewer than one usable nearest neighbour)
 */
protected void doSMOTE() throws Exception {
    int minIndex = 0;
    int min = Integer.MAX_VALUE;
    if (m_DetectMinorityClass) {
        // find minority class: the non-empty class with the fewest instances
        int[] classCounts = getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts;
        for (int i = 0; i < classCounts.length; i++) {
            if (classCounts[i] != 0 && classCounts[i] < min) {
                min = classCounts[i];
                minIndex = i;
            }
        }
    } else {
        // class selected explicitly: "first", "last" or a 1-based number
        String classVal = getClassValue();
        if (classVal.equalsIgnoreCase("first")) {
            minIndex = 1;
        } else if (classVal.equalsIgnoreCase("last")) {
            minIndex = getInputFormat().numClasses();
        } else {
            minIndex = Integer.parseInt(classVal);
        }
        if (minIndex > getInputFormat().numClasses()) {
            throw new Exception("value index must be <= the number of classes");
        }
        minIndex--; // make it an index
    }

    // NOTE(review): when the class is selected explicitly, min stays Integer.MAX_VALUE, so
    // the neighbour count is not capped by the size of the selected class -- if that class
    // has fewer than getNearestNeighbors() + 1 instances, nnArray below would keep null
    // slots. Confirm whether callers can hit this.
    int nearestNeighbors;
    if (min <= getNearestNeighbors()) {
        nearestNeighbors = min - 1;
    } else {
        nearestNeighbors = getNearestNeighbors();
    }
    if (nearestNeighbors < 1)
        throw new Exception("Cannot use 0 neighbors!");

    // compose minority class dataset
    // also push all dataset instances
    Instances sample = getInputFormat().stringFreeStructure();
    Enumeration instanceEnum = getInputFormat().enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = (Instance) instanceEnum.nextElement();
        push((Instance) instance.copy());
        if ((int) instance.classValue() == minIndex) {
            sample.add(instance);
        }
    }

    // compute Value Distance Metric matrices for nominal features
    Map vdmMap = new HashMap();
    Enumeration attrEnum = getInputFormat().enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        if (!attr.equals(getInputFormat().classAttribute())) {
            if (attr.isNominal() || attr.isString()) {
                double[][] vdm = new double[attr.numValues()][attr.numValues()];
                vdmMap.put(attr, vdm);
                // occurrences of each attribute value, overall and per class
                int[] featureValueCounts = new int[attr.numValues()];
                int[][] featureValueCountsByClass = new int[getInputFormat().classAttribute().numValues()][attr
                        .numValues()];
                instanceEnum = getInputFormat().enumerateInstances();
                while (instanceEnum.hasMoreElements()) {
                    Instance instance = (Instance) instanceEnum.nextElement();
                    int value = (int) instance.value(attr);
                    int classValue = (int) instance.classValue();
                    featureValueCounts[value]++;
                    featureValueCountsByClass[classValue][value]++;
                }
                // distance between two values = sum over classes of the absolute difference
                // of their per-class relative frequencies
                // NOTE(review): a value that never occurs has a count of 0, making the
                // corresponding term 0/0 -- confirm whether that can happen in practice.
                for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) {
                    for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) {
                        double sum = 0;
                        for (int classValueIndex = 0; classValueIndex < getInputFormat()
                                .numClasses(); classValueIndex++) {
                            double c1i = (double) featureValueCountsByClass[classValueIndex][valueIndex1];
                            double c2i = (double) featureValueCountsByClass[classValueIndex][valueIndex2];
                            double c1 = (double) featureValueCounts[valueIndex1];
                            double c2 = (double) featureValueCounts[valueIndex2];
                            double term1 = c1i / c1;
                            double term2 = c2i / c2;
                            sum += Math.abs(term1 - term2);
                        }
                        vdm[valueIndex1][valueIndex2] = sum;
                    }
                }
            }
        }
    }

    // use this random source for all required randomness
    Random rand = new Random(getRandomSeed());

    // find the set of extra indices to use if the percentage is not evenly divisible by 100
    List extraIndices = new LinkedList();
    double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0);
    int extraIndicesCount = (int) (percentageRemainder * sample.numInstances());
    if (extraIndicesCount >= 1) {
        for (int i = 0; i < sample.numInstances(); i++) {
            extraIndices.add(i);
        }
    }
    Collections.shuffle(extraIndices, rand);
    extraIndices = extraIndices.subList(0, extraIndicesCount);
    Set extraIndexSet = new HashSet(extraIndices);

    // the main loop to handle computing nearest neighbors and generating SMOTE
    // examples from each instance in the original minority class data
    Instance[] nnArray = new Instance[nearestNeighbors];
    for (int i = 0; i < sample.numInstances(); i++) {
        Instance instanceI = sample.instance(i);
        // find k nearest neighbors for each instance
        List distanceToInstance = new LinkedList();
        for (int j = 0; j < sample.numInstances(); j++) {
            Instance instanceJ = sample.instance(j);
            if (i != j) {
                double distance = 0;
                attrEnum = getInputFormat().enumerateAttributes();
                while (attrEnum.hasMoreElements()) {
                    Attribute attr = (Attribute) attrEnum.nextElement();
                    if (!attr.equals(getInputFormat().classAttribute())) {
                        double iVal = instanceI.value(attr);
                        double jVal = instanceJ.value(attr);
                        if (attr.isNumeric()) {
                            distance += Math.pow(iVal - jVal, 2);
                        } else {
                            distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal];
                        }
                    }
                }
                distance = Math.pow(distance, .5);
                distanceToInstance.add(new Object[] { distance, instanceJ });
            }
        }

        // sort the neighbors according to distance
        Collections.sort(distanceToInstance, new Comparator() {
            public int compare(Object o1, Object o2) {
                double distance1 = (Double) ((Object[]) o1)[0];
                double distance2 = (Double) ((Object[]) o2)[0];
                // Double.compare keeps the ordering exact and symmetric; rounding the
                // difference to an int treated distance1 < distance2 as "equal" whenever
                // the two were less than 1.0 apart, which breaks the Comparator contract.
                return Double.compare(distance1, distance2);
            }
        });

        // populate the actual nearest neighbor instance array
        Iterator entryIterator = distanceToInstance.iterator();
        int j = 0;
        while (entryIterator.hasNext() && j < nearestNeighbors) {
            nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1];
            j++;
        }

        // create synthetic examples: floor(percentage / 100) per instance, plus one more
        // for the instances drawn into the extra index set
        int n = (int) Math.floor(getPercentage() / 100);
        while (n > 0 || extraIndexSet.remove(i)) {
            double[] values = new double[sample.numAttributes()];
            int nn = rand.nextInt(nearestNeighbors);
            attrEnum = getInputFormat().enumerateAttributes();
            while (attrEnum.hasMoreElements()) {
                Attribute attr = (Attribute) attrEnum.nextElement();
                if (!attr.equals(getInputFormat().classAttribute())) {
                    if (attr.isNumeric()) {
                        // random point on the segment between the instance and its neighbour
                        double dif = nnArray[nn].value(attr) - instanceI.value(attr);
                        double gap = rand.nextDouble();
                        values[attr.index()] = (double) (instanceI.value(attr) + gap * dif);
                    } else if (attr.isDate()) {
                        // NOTE(review): in stock Weka, Attribute.isNumeric() also returns true
                        // for date attributes, which would make this branch unreachable --
                        // confirm against the Weka version in use.
                        double dif = nnArray[nn].value(attr) - instanceI.value(attr);
                        double gap = rand.nextDouble();
                        values[attr.index()] = (long) (instanceI.value(attr) + gap * dif);
                    } else {
                        // nominal: majority vote among the instance and its neighbours
                        // (ties go to the lowest value index)
                        int[] valueCounts = new int[attr.numValues()];
                        int iVal = (int) instanceI.value(attr);
                        valueCounts[iVal]++;
                        for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) {
                            int val = (int) nnArray[nnEx].value(attr);
                            valueCounts[val]++;
                        }
                        int maxIndex = 0;
                        int max = Integer.MIN_VALUE;
                        for (int index = 0; index < attr.numValues(); index++) {
                            if (valueCounts[index] > max) {
                                max = valueCounts[index];
                                maxIndex = index;
                            }
                        }
                        values[attr.index()] = maxIndex;
                    }
                }
            }
            values[sample.classIndex()] = minIndex;
            Instance synthetic = new Instance(1.0, values);
            push(synthetic);
            n--;
        }
    }
}