// Java tutorial
/** * PromniCAT - Collection and Analysis of Business Process Models * Copyright (C) 2012 Cindy Fhnrich, Tobias Hoppe, Andrina Mascher * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.uni_potsdam.hpi.bpt.promnicat.util; import weka.core.Attribute; import weka.core.EditDistance; import weka.core.Instance; import weka.core.neighboursearch.PerformanceStats; import de.uni_potsdam.hpi.bpt.promnicat.analysisModules.clustering.ProcessInstance; import de.uni_potsdam.hpi.bpt.promnicat.analysisModules.clustering.ProcessInstances; /** * Computes the Levenshtein edit distance between two strings. * Incorporates weights. * * @author Cindy Fhnrich * */ public class WeightedEditDistance extends EditDistance { private static final long serialVersionUID = 1L; public WeightedEditDistance() { } public WeightedEditDistance(ProcessInstances data) { super(data); } /** * Calculates the distance between two instances. Offers speed up (if the * distance function class in use supports it) in nearest neighbour search by * taking into account the cutOff or maximum distance. Depending on the * distance function class, post processing of the distances by * postProcessDistances(double []) may be required if this function is used. 
* * @param first the first instance * @param second the second instance * @param cutOffValue If the distance being calculated becomes larger than * cutOffValue then the rest of the calculation is * discarded. * @param stats the performance stats object * @return the distance between the two given instances or * Double.POSITIVE_INFINITY if the distance being * calculated becomes larger than cutOffValue. */ @Override public double distance(Instance first, Instance second, double cutOffValue, PerformanceStats stats) { double sqDistance = 0; int numAttributes = ((ProcessInstances) m_Data).numStrAttributes(); validate(); double diff; //incorporates weights double weights = 1; for (int i = 0; i < numAttributes; i++) { diff = 0; if (m_ActiveIndices[i]) { diff = difference(i, ((ProcessInstance) first).strValue(i), ((ProcessInstance) second).strValue(i)); //use weight diff = diff * ((ProcessInstances) m_Data).strAttribute(i).weight(); } sqDistance = updateDistance(sqDistance, diff); if (sqDistance > (cutOffValue * cutOffValue)) return Double.POSITIVE_INFINITY; } double distance = Math.sqrt(sqDistance); if (weights > 1) { return distance / (weights - 1); } return distance / weights; } /** * Calculates the distance (Levenshtein Edit Distance) between two strings * * @param stringA the first string * @param stringB the second string * @return the distance between the two given strings */ public double getStringDistance(String stringA, String stringB) { int lengthA = stringA.length(); int lengthB = stringB.length(); double[][] distanceMatrix = new double[lengthA + 1][lengthB + 1]; for (int i = 0; i <= lengthA; i++) { distanceMatrix[i][0] = i; } for (int j = 1; j <= lengthB; j++) { distanceMatrix[0][j] = j; } for (int i = 1; i <= lengthA; i++) { for (int j = 1; j <= lengthB; j++) { if (stringA.charAt(i - 1) == stringB.charAt(j - 1)) { distanceMatrix[i][j] = distanceMatrix[i - 1][j - 1]; } else { distanceMatrix[i][j] = 1 + Math.min(distanceMatrix[i - 1][j], 
Math.min(distanceMatrix[i][j - 1], distanceMatrix[i - 1][j - 1])); } } } return distanceMatrix[lengthA][lengthB]; } /** * Computes the difference between two given attribute * values. * * @param index the attribute index * @param val1 the first value * @param val2 the second value * @return the difference */ protected double difference(int index, String string1, String string2) { switch (((ProcessInstances) m_Data).strAttribute(index).type()) { case Attribute.STRING: double diff = getStringDistance(string1, string2); if (m_DontNormalize == true) { return diff; } else { if (string1.length() > string2.length()) { return diff / ((double) string1.length()); } else { return diff / ((double) string2.length()); } } default: return 0; } } /** * Updates the ranges given a new instance. * * @param instance the new instance * @param ranges low, high and width values for all attributes * @return the updated ranges */ public double[][] updateRanges(Instance instance, double[][] ranges) { // updateRangesFirst must have been called on ranges for (int j = 0; j < ranges.length; j++) { double value = instance.value(j); if (!((ProcessInstance) instance).isStringMissing(j)) { if (value < ranges[j][R_MIN]) { ranges[j][R_MIN] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } else { if (instance.value(j) > ranges[j][R_MAX]) { ranges[j][R_MAX] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } } } } return ranges; } /** * Updates the minimum and maximum and width values for all the attributes * based on a new instance. 
* * @param instance the new instance * @param numAtt number of attributes in the model * @param ranges low, high and width values for all attributes */ public void updateRanges(Instance instance, int numAtt, double[][] ranges) { // updateRangesFirst must have been called on ranges for (int j = 0; j < numAtt; j++) { double value = instance.value(j); if (!((ProcessInstance) instance).isMissing(j)) { if (value < ranges[j][R_MIN]) { ranges[j][R_MIN] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; if (value > ranges[j][R_MAX]) { //if this is the first value that is ranges[j][R_MAX] = value; //not missing. The,0 ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } } else { if (value > ranges[j][R_MAX]) { ranges[j][R_MAX] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } } } } } /** * Used to initialize the ranges. For this the values of the first * instance is used to save time. * Sets low and high to the values of the first instance and * width to zero. * * @param instance the new instance * @param numAtt number of attributes in the model * @param ranges low, high and width values for all attributes */ public void updateRangesFirst(Instance instance, int numAtt, double[][] ranges) { for (int j = 0; j < numAtt; j++) { if (!((ProcessInstance) instance).isStringMissing(j)) { ranges[j][R_MIN] = instance.value(j); ranges[j][R_MAX] = instance.value(j); ranges[j][R_WIDTH] = 0.0; } else { // if value was missing ranges[j][R_MIN] = Double.POSITIVE_INFINITY; ranges[j][R_MAX] = -Double.POSITIVE_INFINITY; ranges[j][R_WIDTH] = Double.POSITIVE_INFINITY; } } } /** * Initializes the ranges using all instances of the dataset. * Sets m_Ranges. 
* * @return the ranges */ public double[][] initializeRanges() { if (m_Data == null) { m_Ranges = null; return m_Ranges; } int numAtt = ((ProcessInstances) m_Data).numStrAttributes(); double[][] ranges = new double[numAtt][3]; if (m_Data.numInstances() <= 0) { initializeRangesEmpty(numAtt, ranges); m_Ranges = ranges; return m_Ranges; } else { // initialize ranges using the first instance updateRangesFirst(m_Data.instance(0), numAtt, ranges); } // update ranges, starting from the second for (int i = 1; i < m_Data.numInstances(); i++) updateRanges(m_Data.instance(i), numAtt, ranges); m_Ranges = ranges; return m_Ranges; } /** * Initializes the ranges of a subset of the instances of this dataset. * Therefore m_Ranges is not set. * * @param instList list of indexes of the subset * @return the ranges * @throws Exception if something goes wrong */ public double[][] initializeRanges(int[] instList) throws Exception { if (m_Data == null) throw new Exception("No instances supplied."); int numAtt = ((ProcessInstances) m_Data).numStrAttributes(); double[][] ranges = new double[numAtt][3]; if (m_Data.numInstances() <= 0) { initializeRangesEmpty(numAtt, ranges); return ranges; } else { // initialize ranges using the first instance updateRangesFirst(m_Data.instance(instList[0]), numAtt, ranges); // update ranges, starting from the second for (int i = 1; i < instList.length; i++) { updateRanges(m_Data.instance(instList[i]), numAtt, ranges); } } return ranges; } /** * initializes the attribute indices. */ protected void initializeAttributeIndices() { m_AttributeIndices.setUpper(((ProcessInstances) m_Data).numStrAttributes() - 1); m_ActiveIndices = new boolean[((ProcessInstances) m_Data).numStrAttributes()]; for (int i = 0; i < m_ActiveIndices.length; i++) m_ActiveIndices[i] = m_AttributeIndices.isInRange(i); } /** * Initializes the ranges of a subset of the instances of this dataset. * Therefore m_Ranges is not set. 
* The caller of this method should ensure that the supplied start and end * indices are valid (start <= end, end<instList.length etc) and * correct. * * @param instList list of indexes of the instances * @param startIdx start index of the subset of instances in the indices array * @param endIdx end index of the subset of instances in the indices array * @return the ranges * @throws Exception if something goes wrong */ public double[][] initializeRanges(int[] instList, int startIdx, int endIdx) throws Exception { if (m_Data == null) throw new Exception("No instances supplied."); int numAtt = ((ProcessInstances) m_Data).numStrAttributes(); double[][] ranges = new double[numAtt][3]; if (m_Data.numInstances() <= 0) { initializeRangesEmpty(numAtt, ranges); return ranges; } else { // initialize ranges using the first instance updateRangesFirst(m_Data.instance(instList[startIdx]), numAtt, ranges); // update ranges, starting from the second for (int i = startIdx + 1; i <= endIdx; i++) { updateRanges(m_Data.instance(instList[i]), numAtt, ranges); } } return ranges; } /** * Test if an instance is within the given ranges. * * @param instance the instance * @param ranges the ranges the instance is tested to be in * @return true if instance is within the ranges */ public boolean inRanges(Instance instance, double[][] ranges) { boolean isIn = true; // updateRangesFirst must have been called on ranges for (int j = 0; isIn && (j < ranges.length); j++) { if (!((ProcessInstance) instance).isStringMissing(j)) { double value = instance.value(j); isIn = value <= ranges[j][R_MAX]; if (isIn) isIn = value >= ranges[j][R_MIN]; } } return isIn; } }