List of usage examples for weka.core.Instances attribute()
public Attribute attribute(String name)
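A minimal stand-alone sketch (not taken from any of the projects listed below; attribute names are invented for illustration) showing the by-name lookup declared above, together with the companion 0-based int-index overload that most of the examples on this page use:

import java.util.ArrayList;
import java.util.Arrays;
import weka.core.Attribute;
import weka.core.Instances;

public class AttributeLookupExample {
    public static void main(String[] args) {
        // Build a tiny header: one numeric and one nominal attribute
        ArrayList<Attribute> attInfo = new ArrayList<Attribute>();
        attInfo.add(new Attribute("length"));                               // numeric
        attInfo.add(new Attribute("colour", Arrays.asList("red", "blue"))); // nominal
        Instances data = new Instances("demo", attInfo, 0);

        Attribute byName = data.attribute("length"); // returns null if no attribute has this name
        Attribute byIndex = data.attribute(1);       // 0-based attribute index
        System.out.println(byName.name() + " isNumeric=" + byName.isNumeric());
        System.out.println(byIndex.name() + " numValues=" + byIndex.numValues());
    }
}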
From source file:ml.dataprocess.CorrelationAttributeEval.java
License:Open Source License
/**
 * Initializes a correlation attribute evaluator. Replaces missing values
 * with means/modes; deletes instances with missing class values.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
@Override
public void buildEvaluator(Instances data) throws Exception {
    data = new Instances(data);
    data.deleteWithMissingClass();

    ReplaceMissingValues rmv = new ReplaceMissingValues();
    rmv.setInputFormat(data);
    data = Filter.useFilter(data, rmv);

    int numClasses = data.classAttribute().numValues();
    int classIndex = data.classIndex();
    int numInstances = data.numInstances();
    m_correlations = new double[data.numAttributes()];
    /*
     * boolean hasNominals = false; boolean hasNumerics = false;
     */
    List<Integer> numericIndexes = new ArrayList<Integer>();
    List<Integer> nominalIndexes = new ArrayList<Integer>();
    if (m_detailedOutput) {
        m_detailedOutputBuff = new StringBuffer();
    }

    // TODO for instance weights (folded into computing weighted correlations)
    // add another dimension just before the last [2] (0 for 0/1 binary vector
    // and 1 for corresponding instance weights for the 1's)
    double[][][] nomAtts = new double[data.numAttributes()][][];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (data.attribute(i).isNominal() && i != classIndex) {
            nomAtts[i] = new double[data.attribute(i).numValues()][data.numInstances()];
            Arrays.fill(nomAtts[i][0], 1.0); // set zero index for this att to all 1's
            nominalIndexes.add(i);
        } else if (data.attribute(i).isNumeric() && i != classIndex) {
            numericIndexes.add(i);
        }
    }

    // do the nominal attributes
    if (nominalIndexes.size() > 0) {
        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            for (int j = 0; j < current.numValues(); j++) {
                if (current.attribute(current.index(j)).isNominal() && current.index(j) != classIndex) {
                    // Will need to check for zero in case this isn't a sparse
                    // instance (unless we add 1 and subtract 1)
                    nomAtts[current.index(j)][(int) current.valueSparse(j)][i] += 1;
                    nomAtts[current.index(j)][0][i] -= 1;
                }
            }
        }
    }

    if (data.classAttribute().isNumeric()) {
        double[] classVals = data.attributeToDoubleArray(classIndex);

        // do the numeric attributes
        for (Integer i : numericIndexes) {
            double[] numAttVals = data.attributeToDoubleArray(i);
            m_correlations[i] = Utils.correlation(numAttVals, classVals, numAttVals.length);
            if (m_correlations[i] == 1.0) {
                // check for zero variance (useless numeric attribute)
                if (Utils.variance(numAttVals) == 0) {
                    m_correlations[i] = 0;
                }
            }
        }

        // do the nominal attributes
        if (nominalIndexes.size() > 0) {
            // now compute the correlations for the binarized nominal attributes
            for (Integer i : nominalIndexes) {
                double sum = 0;
                double corr = 0;
                double sumCorr = 0;
                double sumForValue = 0;

                if (m_detailedOutput) {
                    m_detailedOutputBuff.append("\n\n").append(data.attribute(i).name());
                }

                for (int j = 0; j < data.attribute(i).numValues(); j++) {
                    sumForValue = Utils.sum(nomAtts[i][j]);
                    corr = Utils.correlation(nomAtts[i][j], classVals, classVals.length);

                    // useless attribute - all instances have the same value
                    if (sumForValue == numInstances || sumForValue == 0) {
                        corr = 0;
                    }
                    if (corr < 0.0) {
                        corr = -corr;
                    }
                    sumCorr += sumForValue * corr;
                    sum += sumForValue;

                    if (m_detailedOutput) {
                        m_detailedOutputBuff.append("\n\t").append(data.attribute(i).value(j)).append(": ");
                        m_detailedOutputBuff.append(Utils.doubleToString(corr, 6));
                    }
                }
                m_correlations[i] = (sum > 0) ? sumCorr / sum : 0;
            }
        }
    } else {
        // class is nominal
        // TODO extra dimension for storing instance weights too
        double[][] binarizedClasses = new double[data.classAttribute().numValues()][data.numInstances()];

        // this is equal to the number of instances for all inst weights = 1
        double[] classValCounts = new double[data.classAttribute().numValues()];

        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            binarizedClasses[(int) current.classValue()][i] = 1;
        }

        for (int i = 0; i < data.classAttribute().numValues(); i++) {
            classValCounts[i] = Utils.sum(binarizedClasses[i]);
        }

        double sumClass = Utils.sum(classValCounts);

        // do numeric attributes first
        if (numericIndexes.size() > 0) {
            for (Integer i : numericIndexes) {
                double[] numAttVals = data.attributeToDoubleArray(i);
                double corr = 0;
                double sumCorr = 0;

                for (int j = 0; j < data.classAttribute().numValues(); j++) {
                    corr = Utils.correlation(numAttVals, binarizedClasses[j], numAttVals.length);
                    if (corr < 0.0) {
                        corr = -corr;
                    }
                    if (corr == 1.0) {
                        // check for zero variance (useless numeric attribute)
                        if (Utils.variance(numAttVals) == 0) {
                            corr = 0;
                        }
                    }
                    sumCorr += classValCounts[j] * corr;
                }
                m_correlations[i] = sumCorr / sumClass;
            }
        }

        if (nominalIndexes.size() > 0) {
            for (Integer i : nominalIndexes) {
                if (m_detailedOutput) {
                    m_detailedOutputBuff.append("\n\n").append(data.attribute(i).name());
                }

                double sumForAtt = 0;
                double corrForAtt = 0;

                for (int j = 0; j < data.attribute(i).numValues(); j++) {
                    double sumForValue = Utils.sum(nomAtts[i][j]);
                    double corr = 0;
                    double sumCorr = 0;
                    double avgCorrForValue = 0;
                    sumForAtt += sumForValue;

                    for (int k = 0; k < numClasses; k++) {
                        // corr between value j and class k
                        corr = Utils.correlation(nomAtts[i][j], binarizedClasses[k], binarizedClasses[k].length);

                        // useless attribute - all instances have the same value
                        if (sumForValue == numInstances || sumForValue == 0) {
                            corr = 0;
                        }
                        if (corr < 0.0) {
                            corr = -corr;
                        }
                        sumCorr += classValCounts[k] * corr;
                    }
                    avgCorrForValue = sumCorr / sumClass;
                    corrForAtt += sumForValue * avgCorrForValue;

                    if (m_detailedOutput) {
                        m_detailedOutputBuff.append("\n\t").append(data.attribute(i).value(j)).append(": ");
                        m_detailedOutputBuff.append(Utils.doubleToString(avgCorrForValue, 6));
                    }
                }
                // the weighted average corr for att i as a whole
                // (weighted by value frequencies)
                m_correlations[i] = (sumForAtt > 0) ? corrForAtt / sumForAtt : 0;
            }
        }
    }
    if (m_detailedOutputBuff != null && m_detailedOutputBuff.length() > 0) {
        m_detailedOutputBuff.append("\n");
    }
}
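The evaluator above mirrors Weka's built-in weka.attributeSelection.CorrelationAttributeEval. The sketch below (file path and class name are placeholders, not taken from the project above) shows the usual way such an evaluator is driven through AttributeSelection with a Ranker, which is what ends up calling buildEvaluator(data):

import weka.attributeSelection.AttributeSelection;
import weka.attributeSelection.CorrelationAttributeEval;
import weka.attributeSelection.Ranker;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CorrelationRankingExample {
    public static void main(String[] args) throws Exception {
        // "data.arff" is a placeholder path; any dataset with a class attribute will do
        Instances data = DataSource.read("data.arff");
        data.setClassIndex(data.numAttributes() - 1);

        AttributeSelection attsel = new AttributeSelection();
        attsel.setEvaluator(new CorrelationAttributeEval());
        attsel.setSearch(new Ranker());
        attsel.setRanking(true);
        attsel.SelectAttributes(data); // internally calls buildEvaluator(data)

        // rankedAttributes() returns {attributeIndex, merit} pairs, best first
        for (double[] rank : attsel.rankedAttributes()) {
            System.out.println(data.attribute((int) rank[0]).name() + "\t" + rank[1]);
        }
    }
}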
From source file:mlda.attributes.AvgAbsoluteCorrelationBetweenNumericAttributes.java
License:Open Source License
/**
 * Calculate metric value
 *
 * @param mlData Multi-label dataset for which to calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData) {
    Instances instances = mlData.getDataSet();
    int numInstances = mlData.getNumInstances();

    double res = 0.0;
    int count = 0;

    int[] featureIndices = mlData.getFeatureIndices();
    Vector<Integer> numericFeatureIndices = new Vector<>();
    for (int fIndex : featureIndices) {
        if (instances.attribute(fIndex).isNumeric()) {
            numericFeatureIndices.add(fIndex);
        }
    }
    if (numericFeatureIndices.size() <= 0) {
        return Double.NaN;
    }

    double[][] attributesToDoubleArray = new double[numericFeatureIndices.size()][numInstances];
    for (int fIndex : numericFeatureIndices) {
        attributesToDoubleArray[fIndex] = instances.attributeToDoubleArray(fIndex);
    }

    for (int fIndex1 : numericFeatureIndices) {
        for (int fIndex2 = fIndex1 + 1; fIndex2 < numericFeatureIndices.size(); fIndex2++) {
            count++;
            res += Utils.correlation(attributesToDoubleArray[fIndex1],
                    attributesToDoubleArray[fIndex2], numInstances);
        }
    }

    if (count > 0) {
        this.value = res / count;
    } else {
        this.value = Double.NaN;
    }
    //this.value = res/count;
    return value;
}
From source file:mlda.util.Utils.java
License:Open Source License
/**
 * Get array of ImbalancedFeature objects with the label frequencies
 *
 * @param dataset Multi-label dataset
 * @return Array of ImbalancedFeature objects with the label frequencies
 */
public static ImbalancedFeature[] getAppearancesPerLabel(MultiLabelInstances dataset) {
    int[] labelIndices = dataset.getLabelIndices();

    ImbalancedFeature[] labels = new ImbalancedFeature[labelIndices.length];

    Instances instances = dataset.getDataSet();

    int appearances = 0;
    Attribute currentAtt;

    for (int i = 0; i < labelIndices.length; i++) {
        currentAtt = instances.attribute(labelIndices[i]);
        appearances = 0;
        for (int j = 0; j < instances.size(); j++) {
            if (instances.instance(j).value(currentAtt) == 1.0) {
                appearances++;
            }
        }
        labels[i] = new ImbalancedFeature(currentAtt.name(), appearances);
    }
    return labels;
}
From source file:mlda.util.Utils.java
License:Open Source License
/**
 * Calculate IRs of the ImbalancedFeatures
 *
 * @param dataset Multi-label dataset
 * @param labels Labels of the dataset as ImbalancedFeature objects
 * @return Array of ImbalancedFeature objects with calculated IR
 */
public static ImbalancedFeature[] getImbalancedWithIR(MultiLabelInstances dataset, ImbalancedFeature[] labels) {
    int[] labelIndices = dataset.getLabelIndices();
    ImbalancedFeature[] labels_imbalanced = new ImbalancedFeature[labelIndices.length];
    Instances instances = dataset.getDataSet();

    int nOnes = 0, nZeros = 0, maxAppearance = 0;
    double IRIntraClass;
    double variance;
    double IRInterClass;
    double mean = dataset.getNumInstances() / 2;

    Attribute current;
    ImbalancedFeature currentLabel;

    for (int i = 0; i < labelIndices.length; i++) { // for each label
        nZeros = 0;
        nOnes = 0;
        current = instances.attribute(labelIndices[i]); // current label

        for (int j = 0; j < instances.size(); j++) { // for each instance
            if (instances.instance(j).value(current) == 1.0) {
                nOnes++;
            } else {
                nZeros++;
            }
        }

        try {
            if (nZeros == 0 || nOnes == 0) {
                IRIntraClass = 0;
            } else if (nZeros > nOnes) {
                IRIntraClass = (double) nZeros / nOnes;
            } else {
                IRIntraClass = (double) nOnes / nZeros;
            }
        } catch (Exception e1) {
            IRIntraClass = 0;
        }

        variance = (Math.pow((nZeros - mean), 2) + Math.pow((nOnes - mean), 2)) / 2;

        currentLabel = getLabelByName(current.name(), labels);

        maxAppearance = labels[0].getAppearances();

        if (currentLabel.getAppearances() <= 0) {
            IRInterClass = Double.NaN;
        } else {
            IRInterClass = (double) maxAppearance / currentLabel.getAppearances();
        }

        labels_imbalanced[i] = new ImbalancedFeature(current.name(), currentLabel.getAppearances(),
                IRInterClass, IRIntraClass, variance);
    }
    return labels_imbalanced;
}
From source file:mlflex.WekaInMemoryLearner.java
License:Open Source License
@Override
protected ArrayList<String> SelectOrRankFeatures(ArrayList<String> algorithmParameters,
        DataInstanceCollection trainData, DataInstanceCollection dependentVariableInstances) throws Exception {
    ArrayList<String> dataPointNames = Lists.SortStringList(trainData.GetDataPointNames());

    FastVector attVector = GetAttributeVector(dependentVariableInstances, dataPointNames, trainData);
    Instances instances = GetInstances(dependentVariableInstances, attVector, trainData);

    AttributeSelection attsel = new AttributeSelection();

    ASEvaluation eval = GetAttributeEvaluator(algorithmParameters);
    ASSearch search = GetSearchMethod(algorithmParameters);

    attsel.setEvaluator(eval);
    attsel.setSearch(search);

    boolean isRanker = algorithmParameters.get(2).equals(Ranker.class.getName());

    if (isRanker)
        attsel.setRanking(true);

    attsel.SelectAttributes(instances);

    ArrayList<String> features = new ArrayList<String>();

    if (isRanker) {
        for (double[] rank : attsel.rankedAttributes())
            features.add(instances.attribute((int) rank[0]).name());
    } else {
        for (int i : attsel.selectedAttributes())
            features.add(instances.attribute(i).name());
    }

    instances = null;

    return features;
}
From source file:moa.classifiers.macros.TACNB.java
License:Open Source License
public void initHeader(Instances dataset) {
    int numLabels = this.numOldLabelsOption.getValue();
    Attribute target = dataset.classAttribute();

    List<String> possibleValues = new ArrayList<String>();
    int n = target.numValues();
    for (int i = 0; i < n; i++) {
        possibleValues.add(target.value(i));
    }

    ArrayList<Attribute> attrs = new ArrayList<Attribute>(numLabels + dataset.numAttributes());
    for (int i = 0; i < numLabels; i++) {
        attrs.add(new Attribute(target.name() + "_" + i, possibleValues));
    }
    for (int i = 0; i < dataset.numAttributes(); i++) {
        attrs.add((Attribute) dataset.attribute(i).copy());
    }

    this.header = new Instances("extended_" + dataset.relationName(), attrs, 0);
    this.header.setClassIndex(numLabels + dataset.classIndex());
}
From source file:moa.classifiers.novelClass.AbstractNovelClassClassifier.java
License:Apache License
final public static Instances augmentInstances(Instances datum) {
    ArrayList<Attribute> attInfo = new ArrayList<>(datum.numAttributes());
    for (int aIdx = 0; aIdx < datum.numAttributes(); aIdx++) {
        Attribute a = datum.attribute(aIdx).copy(datum.attribute(aIdx).name());
        if ((aIdx == datum.classIndex()) && (a.indexOfValue(NOVEL_LABEL_STR) < 0)) {
            // only if we don't already have these
            List<String> values = new ArrayList<>(a.numValues() + 2);
            for (int i = 0; i < a.numValues(); ++i) {
                values.add(a.value(i));
            }
            values.add(OUTLIER_LABEL_STR);
            values.add(NOVEL_LABEL_STR);
            a = new Attribute(a.name(), values, a.getMetadata());
        }
        attInfo.add(a);
    }
    String relationshipName = NOVEL_CLASS_INSTANCE_RELATIONSHIP_TYPE + "-" + datum.relationName();
    Instances ret = new Instances(relationshipName, attInfo, 1);
    ret.setClassIndex(datum.classIndex());
    return ret;
}
From source file:moa.core.VectorDistances.java
License:Apache License
/**
 * Generalized Minkowski distance equation covering the entire family of distances:
 *   power &lt; 1   --&gt; minimum distance (strictly speaking, not a Minkowski distance)
 *   power = 1   --&gt; Manhattan distance
 *   power = 2   --&gt; Euclidean distance
 *   power = INF --&gt; Chebyshev (or maximum) distance
 * @param src first data point to compare from
 * @param dst second data point to compare to
 * @param header feature weights (strictly speaking, all weights should be 1 for pure Minkowski)
 * @param power power used to raise each component distance, and 1/p for the final reduction
 * @return Minkowski distance between the two points
 */
public static synchronized double distanceMinkowski(double[] src, double[] dst, Instances header, double power) {
    double ret = 0.0;
    int minSize = Math.min(src.length, Math.min(dst.length, header.numAttributes()));
    if (minSize < 1) {
        return Double.MAX_VALUE;
    }
    double minDist = Double.MAX_VALUE;
    double maxDist = Double.MIN_VALUE;
    for (int i = 0; i < minSize; i++) {
        double d = Math.abs(src[i] - dst[i]);
        double w = header.attribute(i).weight();
        ret += (d >= (epsilon * epsilon)) ? Math.abs(Math.pow(d, power)) * w : 0;
        if (w > 0) {
            minDist = Math.min(minDist, d);
        }
        if (w > 0) {
            maxDist = Math.max(maxDist, d);
        }
    }
    if (power >= Minkowski_Chebyshev) {
        ret = maxDist;
    } else if (power < 0.000000001) {
        ret = minDist;
    } else {
        ret = (ret >= (epsilon * epsilon)) ? Math.pow(ret, 1.0 / power) : 0;
    }
    // Safety...
    if (Double.isInfinite(ret)) {
        ret = Double.MAX_VALUE;
    } else if (Double.isNaN(ret)) {
        ret = 0.0;
    }
    return ret;
}
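A hypothetical usage sketch for the method above, assuming the weka.core Instances/Attribute classes shown in this listing (Attribute weights default to 1.0, so a bare two-attribute header is enough); the class and relation names are invented for illustration:

import java.util.ArrayList;
import moa.core.VectorDistances;
import weka.core.Attribute;
import weka.core.Instances;

public class MinkowskiDistanceDemo {
    public static void main(String[] args) {
        // The header only supplies per-attribute weights here
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("x"));
        atts.add(new Attribute("y"));
        Instances header = new Instances("pair", atts, 0);

        double[] a = { 0.0, 0.0 };
        double[] b = { 3.0, 4.0 };
        System.out.println(VectorDistances.distanceMinkowski(a, b, header, 1.0)); // Manhattan: 7.0
        System.out.println(VectorDistances.distanceMinkowski(a, b, header, 2.0)); // Euclidean: 5.0
        // Any power at or above the class's Chebyshev threshold falls back to the max component: 4.0
        System.out.println(VectorDistances.distanceMinkowski(a, b, header, Double.POSITIVE_INFINITY));
    }
}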
From source file:moa.core.VectorDistances.java
License:Apache License
/**
 * Average distance, which is a modification of Euclidean distance
 * @param src first data point to compare from
 * @param dst second data point to compare to
 * @param header feature weights and metadata (strictly speaking, all weights should be 1 for pure Minkowski)
 * @return component-averaged Euclidean distance
 */
public static synchronized double distanceAverage(double[] src, double[] dst, Instances header) {
    double ret = 0.0;
    int minSize = Math.min(src.length, Math.min(dst.length, header.numAttributes()));
    if (minSize < 1) {
        return Double.MAX_VALUE;
    }
    for (int i = 0; i < minSize; i++) {
        double d = Math.abs(src[i] - dst[i]);
        ret += d * d * header.attribute(i).weight();
    }
    ret = Math.sqrt(ret / minSize);
    // Safety...
    if (Double.isInfinite(ret)) {
        ret = Double.MAX_VALUE;
    } else if (Double.isNaN(ret)) {
        ret = 0.0;
    }
    return ret;
}
From source file:moa.core.VectorDistances.java
License:Apache License
/**
 * Gower distance, which handles mixed (numeric/nominal) attribute types
 * @param src first data point to compare from
 * @param dst second data point to compare to
 * @param header data set header used to determine attribute/feature type for the mixed distance
 * @return weight-normalized Gower distance
 */
public static synchronized double distanceGower(double[] src, double[] dst, Instances header) {
    double ret = 0.0;
    int minSize = Math.min(src.length, Math.min(dst.length, header.numAttributes()));
    if (minSize < 1) {
        return Double.MAX_VALUE;
    }
    double wSum = 0.0;
    for (int i = 0; i < minSize; i++) {
        Attribute att = header.attribute(i);
        double d = 0.0;
        double w = header.attribute(i).weight();
        if (att == null) {
            continue;
        }
        switch (att.type()) {
        case Attribute.NUMERIC:
            w = (src[i] == 0 || dst[i] == 0) ? 0.0 : 1.0;
            double sigma = Math.abs(
                    header.attribute(i).getUpperNumericBound() - header.attribute(i).getLowerNumericBound());
            d = (Double.isFinite(sigma) && sigma > 0) ? Math.abs(src[i] - dst[i]) / sigma
                    : Math.abs(src[i] - dst[i]) / 1; // Math.max(src[i], dst[i]);
            break;
        case Attribute.NOMINAL:
        case Attribute.STRING:
            d = (src[i] == dst[i]) ? 0.0 : 1.0;
            break;
        case Attribute.DATE:
        case Attribute.RELATIONAL:
        default:
            System.err.println("Attribute type " + Attribute.typeToString(att)
                    + " is not yet supported... ignoring feature " + i);
            d = 0.0;
            w = 0;
        }
        wSum += w;
        ret += d * d * w;
    }
    ret = (wSum > 0) ? Math.sqrt(ret / wSum) : 0.0;
    // Safety...
    if (Double.isInfinite(ret)) {
        ret = Double.MAX_VALUE;
    } else if (Double.isNaN(ret)) {
        ret = 0.0;
    }
    return ret;
}