org.stream_gpu.float_knn.float_search.NormalizableDistance.java Source code

Java tutorial

Introduction

Here is the source code for org.stream_gpu.float_knn.float_search.NormalizableDistance.java

Source

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    NormalizableDistance.java
 *    Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
 *
 */

package org.stream_gpu.float_knn.float_search;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.Utils;
import weka.core.neighboursearch.PerformanceStats;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Vector;

import moa.options.OptionHandler;

/**
 * Represents the abstract ancestor for normalizable distance functions, like
 * Euclidean or Manhattan distance.
 *
 * @author Fracpete (fracpete at waikato dot ac dot nz)
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- original code from weka.core.EuclideanDistance
 * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz) -- original code from weka.core.EuclideanDistance
 * @version $Revision: 1.2 $
 */
public abstract class NormalizableDistance implements Serializable, RevisionHandler {

    /** Index in ranges for MIN. */
    public static final int R_MIN = 0;

    /** Index in ranges for MAX. */

    public static final int R_MAX = 1;

    /** Index in ranges for WIDTH. */
    public static final int R_WIDTH = 2;

    /** the instances used internally. */
    protected Instances m_Data = null;

    /** True if normalization is turned off (default false).*/
    protected boolean m_DontNormalize = false;

    /** The range of the attributes. */
    protected float[][] m_Ranges;

    /** The range of attributes to use for calculating the distance. */
    protected Range m_AttributeIndices = new Range("first-last");

    /** The boolean flags, whether an attribute will be used or not. */
    protected boolean[] m_ActiveIndices;

    /** Whether all the necessary preparations have been done. */
    protected boolean m_Validated;

    /**
     * Invalidates the distance function, Instances must be still set.
     */
    public NormalizableDistance() {
        invalidate();
    }

    /**
     * Initializes the distance function and automatically initializes the
     * ranges.
     * 
     * @param data    the instances the distance function should work on
     */
    public NormalizableDistance(Instances data) {
        setInstances(data);
    }

    /**
     * Returns a string describing this object.
     * 
     * @return       a description of the evaluator suitable for
     *          displaying in the explorer/experimenter gui
     */
    public abstract String globalInfo();

    /**
     * Returns an enumeration describing the available options.
     *
     * @return       an enumeration of all the available options.
     */
    public Enumeration listOptions() {
        Vector result = new Vector();

        result.add(new Option("\tTurns off the normalization of attribute \n" + "\tvalues in distance calculation.",
                "D", 0, "-D"));

        result.addElement(new Option(
                "\tSpecifies list of columns to used in the calculation of the \n"
                        + "\tdistance. 'first' and 'last' are valid indices.\n" + "\t(default: first-last)",
                "R", 1, "-R <col1,col2-col4,...>"));

        result.addElement(new Option("\tInvert matching sense of column indices.", "V", 0, "-V"));

        return result.elements();
    }

    /**
     * Gets the current settings. Returns empty array.
     *
     * @return       an array of strings suitable for passing to setOptions()
     */
    public String[] getOptions() {
        Vector<String> result;

        result = new Vector<String>();

        if (getDontNormalize())
            result.add("-D");

        result.add("-R");
        result.add(getAttributeIndices());

        if (getInvertSelection())
            result.add("-V");

        return result.toArray(new String[result.size()]);
    }

    /**
     * Parses a given list of options.
     *
     * @param options    the list of options as an array of strings
     * @throws Exception    if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {
        String tmpStr;

        setDontNormalize(Utils.getFlag('D', options));

        tmpStr = Utils.getOption('R', options);
        if (tmpStr.length() != 0)
            setAttributeIndices(tmpStr);
        else
            setAttributeIndices("first-last");

        setInvertSelection(Utils.getFlag('V', options));
    }

    /** 
     * Returns the tip text for this property.
     * 
     * @return       tip text for this property suitable for
     *               displaying in the explorer/experimenter gui
     */
    public String dontNormalizeTipText() {
        return "Whether if the normalization of attributes should be turned off "
                + "for distance calculation (Default: false i.e. attribute values " + "are normalized). ";
    }

    /** 
     * Sets whether if the attribute values are to be normalized in distance
     * calculation.
     * 
     * @param dontNormalize   if true the values are not normalized
     */
    public void setDontNormalize(boolean dontNormalize) {
        m_DontNormalize = dontNormalize;
        invalidate();
    }

    /**
     * Gets whether if the attribute values are to be normazlied in distance
     * calculation. (default false i.e. attribute values are normalized.)
     * 
     * @return      false if values get normalized
     */
    public boolean getDontNormalize() {
        return m_DontNormalize;
    }

    /**
     * Returns the tip text for this property.
     *
     * @return       tip text for this property suitable for
     *          displaying in the explorer/experimenter gui
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. "
                + "This is a comma separated list of attribute indices, with "
                + "\"first\" and \"last\" valid values. Specify an inclusive "
                + "range with \"-\". E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * Sets the range of attributes to use in the calculation of the distance.
     * The indices start from 1, 'first' and 'last' are valid as well. 
     * E.g.: first-3,5,6-last
     * 
     * @param value   the new attribute index range
     */
    public void setAttributeIndices(String value) {
        m_AttributeIndices.setRanges(value);
        invalidate();
    }

    /**
     * Gets the range of attributes used in the calculation of the distance.
     * 
     * @return      the attribute index range
     */
    public String getAttributeIndices() {
        return m_AttributeIndices.getRanges();
    }

    /**
     * Returns the tip text for this property.
     *
     * @return       tip text for this property suitable for
     *          displaying in the explorer/experimenter gui
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. If false, only selected "
                + "attributes in the range will be used in the distance calculation; if "
                + "true, only non-selected attributes will be used for the calculation.";
    }

    /**
     * Sets whether the matching sense of attribute indices is inverted or not.
     * 
     * @param value   if true the matching sense is inverted
     */
    public void setInvertSelection(boolean value) {
        m_AttributeIndices.setInvert(value);
        invalidate();
    }

    /**
     * Gets whether the matching sense of attribute indices is inverted or not.
     * 
     * @return      true if the matching sense is inverted
     */
    public boolean getInvertSelection() {
        return m_AttributeIndices.getInvert();
    }

    /**
     * invalidates all initializations.
     */
    protected void invalidate() {
        m_Validated = false;
    }

    /**
     * performs the initializations if necessary.
     */
    protected void validate() {
        if (!m_Validated) {
            initialize();
            m_Validated = true;
        }
    }

    /**
     * initializes the ranges and the attributes being used.
     */
    protected void initialize() {
        initializeAttributeIndices();
        initializeRanges();
    }

    /**
     * initializes the attribute indices.
     */
    protected void initializeAttributeIndices() {
        m_AttributeIndices.setUpper(m_Data.numAttributes() - 1);
        m_ActiveIndices = new boolean[m_Data.numAttributes()];
        for (int i = 0; i < m_ActiveIndices.length; i++)
            m_ActiveIndices[i] = m_AttributeIndices.isInRange(i);
    }

    /**
     * Sets the instances.
     * 
     * @param insts    the instances to use
     */
    public void setInstances(Instances insts) {
        m_Data = insts;
        invalidate();
    }

    /**
     * returns the instances currently set.
     * 
     * @return       the current instances
     */
    public Instances getInstances() {
        return m_Data;
    }

    /**
     * Does nothing, derived classes may override it though.
     * 
     * @param distances   the distances to post-process
     */
    public void postProcessDistances(float[] distances) {
    }

    /**
     * Update the distance function (if necessary) for the newly added instance.
     * 
     * @param ins      the instance to add
     */
    public void update(Instance ins) {
        validate();

        m_Ranges = updateRanges(ins, m_Ranges);
    }

    /**
     * Calculates the distance between two instances.
     * 
     * @param first    the first instance
     * @param second    the second instance
     * @return       the distance between the two given instances
     */
    public float distance(Instance first, Instance second) {
        return distance(first, second, null);
    }

    /**
     * Calculates the distance between two instances.
     * 
     * @param first    the first instance
     * @param second    the second instance
     * @param stats    the performance stats object
     * @return       the distance between the two given instances
     */
    public float distance(Instance first, Instance second, PerformanceStats stats) {
        return distance(first, second, Float.POSITIVE_INFINITY, stats);
    }

    /**
     * Calculates the distance between two instances. Offers speed up (if the 
     * distance function class in use supports it) in nearest neighbour search by 
     * taking into account the cutOff or maximum distance. Depending on the 
     * distance function class, post processing of the distances by 
     * postProcessDistances(float []) may be required if this function is used.
     *
     * @param first    the first instance
     * @param second    the second instance
     * @param cutOffValue If the distance being calculated becomes larger than 
     *                    cutOffValue then the rest of the calculation is 
     *                    discarded.
     * @return       the distance between the two given instances or 
     *          float.POSITIVE_INFINITY if the distance being 
     *          calculated becomes larger than cutOffValue. 
     */
    public float distance(Instance first, Instance second, float cutOffValue) {
        return distance(first, second, cutOffValue, null);
    }

    /**
     * Calculates the distance between two instances. Offers speed up (if the 
     * distance function class in use supports it) in nearest neighbour search by 
     * taking into account the cutOff or maximum distance. Depending on the 
     * distance function class, post processing of the distances by 
     * postProcessDistances(float []) may be required if this function is used.
     *
     * @param first    the first instance
     * @param second    the second instance
     * @param cutOffValue If the distance being calculated becomes larger than 
     *                    cutOffValue then the rest of the calculation is 
     *                    discarded.
     * @param stats    the performance stats object
     * @return       the distance between the two given instances or 
     *          float.POSITIVE_INFINITY if the distance being 
     *          calculated becomes larger than cutOffValue. 
     */
    public float distance(Instance first, Instance second, float cutOffValue, PerformanceStats stats) {
        float distance = 0;
        int firstI, secondI;
        int firstNumValues = first.numValues();
        int secondNumValues = second.numValues();
        int numAttributes = m_Data.numAttributes();
        int classIndex = m_Data.classIndex();

        validate();

        for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
            if (p1 >= firstNumValues)
                firstI = numAttributes;
            else
                firstI = first.index(p1);

            if (p2 >= secondNumValues)
                secondI = numAttributes;
            else
                secondI = second.index(p2);

            if (firstI == classIndex) {
                p1++;
                continue;
            }
            if ((firstI < numAttributes) && !m_ActiveIndices[firstI]) {
                p1++;
                continue;
            }

            if (secondI == classIndex) {
                p2++;
                continue;
            }
            if ((secondI < numAttributes) && !m_ActiveIndices[secondI]) {
                p2++;
                continue;
            }

            float diff;

            if (firstI == secondI) {
                diff = difference(firstI, (float) first.valueSparse(p1), (float) second.valueSparse(p2));
                p1++;
                p2++;
            } else if (firstI > secondI) {
                diff = difference(secondI, 0, (float) second.valueSparse(p2));
                p2++;
            } else {
                diff = difference(firstI, (float) first.valueSparse(p1), 0);
                p1++;
            }
            if (stats != null)
                stats.incrCoordCount();

            distance = updateDistance(distance, diff);
            if (distance > cutOffValue)
                return Float.POSITIVE_INFINITY;
        }

        return distance;
    }

    /**
     * Updates the current distance calculated so far with the new difference
     * between two attributes. The difference between the attributes was 
     * calculated with the difference(int,float,float) method.
     * 
     * @param currDist   the current distance calculated so far
     * @param diff   the difference between two new attributes
     * @return      the update distance
     * @see      #difference(int, float, float)
     */
    protected abstract float updateDistance(float currDist, float diff);

    /**
     * Normalizes a given value of a numeric attribute.
     *
     * @param x       the value to be normalized
     * @param i       the attribute's index
     * @return      the normalized value
     */
    protected float norm(float x, int i) {
        if (Float.isNaN(m_Ranges[i][R_MIN]) || (m_Ranges[i][R_MAX] == m_Ranges[i][R_MIN]))
            return 0;
        else
            return (x - m_Ranges[i][R_MIN]) / (m_Ranges[i][R_WIDTH]);
    }

    /**
     * Computes the difference between two given attribute
     * values.
     * 
     * @param index   the attribute index
     * @param val1   the first value
     * @param val2   the second value
     * @return      the difference
     */
    protected float difference(int index, float val1, float val2) {
        switch (m_Data.attribute(index).type()) {
        case Attribute.NOMINAL:
            if (Float.isNaN(val1) || Float.isNaN(val2) || ((int) val1 != (int) val2)) {
                return 1;
            } else {
                return 0;
            }

        case Attribute.NUMERIC:
            if (Float.isNaN(val1) || Float.isNaN(val2)) {
                if (Float.isNaN(val1) && Float.isNaN(val2)) {
                    if (!m_DontNormalize) //We are doing normalization
                        return 1;
                    else
                        return (m_Ranges[index][R_MAX] - m_Ranges[index][R_MIN]);
                } else {
                    float diff;
                    if (Float.isNaN(val2)) {
                        diff = (!m_DontNormalize) ? norm(val1, index) : val1;
                    } else {
                        diff = (!m_DontNormalize) ? norm(val2, index) : val2;
                    }
                    if (!m_DontNormalize && diff < 0.5) {
                        diff = 1.0F - diff;
                    } else if (m_DontNormalize) {
                        if ((m_Ranges[index][R_MAX] - diff) > (diff - m_Ranges[index][R_MIN]))
                            return m_Ranges[index][R_MAX] - diff;
                        else
                            return diff - m_Ranges[index][R_MIN];
                    }
                    return diff;
                }
            } else {
                return (!m_DontNormalize) ? (norm(val1, index) - norm(val2, index)) : (val1 - val2);
            }

        default:
            return 0;
        }
    }

    /**
     * Initializes the ranges using all instances of the dataset.
     * Sets m_Ranges.
     * 
     * @return       the ranges
     */
    public float[][] initializeRanges() {
        if (m_Data == null) {
            m_Ranges = null;
            return m_Ranges;
        }

        int numAtt = m_Data.numAttributes();
        float[][] ranges = new float[numAtt][3];

        if (m_Data.numInstances() <= 0) {
            initializeRangesEmpty(numAtt, ranges);
            m_Ranges = ranges;
            return m_Ranges;
        } else {
            // initialize ranges using the first instance
            updateRangesFirst(m_Data.instance(0), numAtt, ranges);
        }

        // update ranges, starting from the second
        for (int i = 1; i < m_Data.numInstances(); i++)
            updateRanges(m_Data.instance(i), numAtt, ranges);

        m_Ranges = ranges;

        return m_Ranges;
    }

    /**
     * Used to initialize the ranges. For this the values of the first
     * instance is used to save time.
     * Sets low and high to the values of the first instance and
     * width to zero.
     * 
     * @param instance    the new instance
     * @param numAtt    number of attributes in the model
     * @param ranges    low, high and width values for all attributes
     */
    public void updateRangesFirst(Instance instance, int numAtt, float[][] ranges) {
        for (int j = 0; j < numAtt; j++) {
            if (!instance.isMissing(j)) {
                ranges[j][R_MIN] = (float) instance.value(j);
                ranges[j][R_MAX] = (float) instance.value(j);
                ranges[j][R_WIDTH] = 0.0F;
            } else { // if value was missing
                ranges[j][R_MIN] = Float.POSITIVE_INFINITY;
                ranges[j][R_MAX] = -Float.POSITIVE_INFINITY;
                ranges[j][R_WIDTH] = Float.POSITIVE_INFINITY;
            }
        }
    }

    /**
     * Updates the minimum and maximum and width values for all the attributes
     * based on a new instance.
     * 
     * @param instance    the new instance
     * @param numAtt    number of attributes in the model
     * @param ranges    low, high and width values for all attributes
     */
    public void updateRanges(Instance instance, int numAtt, float[][] ranges) {
        // updateRangesFirst must have been called on ranges
        for (int j = 0; j < numAtt; j++) {
            float value = (float) instance.value(j);
            if (!instance.isMissing(j)) {
                if (value < ranges[j][R_MIN]) {
                    ranges[j][R_MIN] = value;
                    ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN];
                    if (value > ranges[j][R_MAX]) { //if this is the first value that is
                        ranges[j][R_MAX] = value; //not missing. The,0
                        ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN];
                    }
                } else {
                    if (value > ranges[j][R_MAX]) {
                        ranges[j][R_MAX] = value;
                        ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN];
                    }
                }
            }
        }
    }

    /**
     * Used to initialize the ranges.
     * 
     * @param numAtt    number of attributes in the model
     * @param ranges    low, high and width values for all attributes
     */
    public void initializeRangesEmpty(int numAtt, float[][] ranges) {
        for (int j = 0; j < numAtt; j++) {
            ranges[j][R_MIN] = Float.POSITIVE_INFINITY;
            ranges[j][R_MAX] = -Float.POSITIVE_INFINITY;
            ranges[j][R_WIDTH] = Float.POSITIVE_INFINITY;
        }
    }

    /**
     * Updates the ranges given a new instance.
     * 
     * @param instance    the new instance
     * @param ranges    low, high and width values for all attributes
     * @return      the updated ranges
     */
    public float[][] updateRanges(Instance instance, float[][] ranges) {
        // updateRangesFirst must have been called on ranges
        for (int j = 0; j < ranges.length; j++) {
            float value = (float) instance.value(j);
            if (!instance.isMissing(j)) {
                if (value < ranges[j][R_MIN]) {
                    ranges[j][R_MIN] = value;
                    ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN];
                } else {
                    if (instance.value(j) > ranges[j][R_MAX]) {
                        ranges[j][R_MAX] = value;
                        ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN];
                    }
                }
            }
        }

        return ranges;
    }

    /**
     * Initializes the ranges of a subset of the instances of this dataset.
     * Therefore m_Ranges is not set.
     * 
     * @param instList    list of indexes of the subset
     * @return       the ranges
     * @throws Exception   if something goes wrong
     */
    public float[][] initializeRanges(int[] instList) throws Exception {
        if (m_Data == null)
            throw new Exception("No instances supplied.");

        int numAtt = m_Data.numAttributes();
        float[][] ranges = new float[numAtt][3];

        if (m_Data.numInstances() <= 0) {
            initializeRangesEmpty(numAtt, ranges);
            return ranges;
        } else {
            // initialize ranges using the first instance
            updateRangesFirst(m_Data.instance(instList[0]), numAtt, ranges);
            // update ranges, starting from the second
            for (int i = 1; i < instList.length; i++) {
                updateRanges(m_Data.instance(instList[i]), numAtt, ranges);
            }
        }
        return ranges;
    }

    /**
     * Initializes the ranges of a subset of the instances of this dataset.
     * Therefore m_Ranges is not set.
     * The caller of this method should ensure that the supplied start and end 
     * indices are valid (start &lt;= end, end&lt;instList.length etc) and
     * correct.
     *
     * @param instList    list of indexes of the instances
     * @param startIdx    start index of the subset of instances in the indices array
     * @param endIdx    end index of the subset of instances in the indices array
     * @return       the ranges
     * @throws Exception   if something goes wrong
     */
    public float[][] initializeRanges(int[] instList, int startIdx, int endIdx) throws Exception {
        if (m_Data == null)
            throw new Exception("No instances supplied.");

        int numAtt = m_Data.numAttributes();
        float[][] ranges = new float[numAtt][3];

        if (m_Data.numInstances() <= 0) {
            initializeRangesEmpty(numAtt, ranges);
            return ranges;
        } else {
            // initialize ranges using the first instance
            updateRangesFirst(m_Data.instance(instList[startIdx]), numAtt, ranges);
            // update ranges, starting from the second
            for (int i = startIdx + 1; i <= endIdx; i++) {
                updateRanges(m_Data.instance(instList[i]), numAtt, ranges);
            }
        }

        return ranges;
    }

    /**
     * Update the ranges if a new instance comes.
     * 
     * @param instance    the new instance
     */
    public void updateRanges(Instance instance) {
        validate();

        m_Ranges = updateRanges(instance, m_Ranges);
    }

    /**
     * Test if an instance is within the given ranges.
     * 
     * @param instance    the instance
     * @param ranges    the ranges the instance is tested to be in
     * @return true    if instance is within the ranges
     */
    public boolean inRanges(Instance instance, float[][] ranges) {
        boolean isIn = true;

        // updateRangesFirst must have been called on ranges
        for (int j = 0; isIn && (j < ranges.length); j++) {
            if (!instance.isMissing(j)) {
                float value = (float) instance.value(j);
                isIn = value <= ranges[j][R_MAX];
                if (isIn)
                    isIn = value >= ranges[j][R_MIN];
            }
        }

        return isIn;
    }

    /**
     * Check if ranges are set.
     * 
     * @return       true if ranges are set
     */
    public boolean rangesSet() {
        return (m_Ranges != null);
    }

    /**
     * Method to get the ranges.
     * 
     * @return       the ranges
     * @throws Exception   if no randes are set yet
     */
    public float[][] getRanges() throws Exception {
        validate();

        if (m_Ranges == null)
            throw new Exception("Ranges not yet set.");

        return m_Ranges;
    }

    /**
     * Returns an empty string.
     * 
     * @return      an empty string
     */
    public String toString() {
        return "";
    }
}