List of usage examples for weka.core Instances attribute
public Attribute attribute(String name)
From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java
License:Apache License
/** * Loads the given decent file and tranform it from decent->arffx->arff * //from w w w . j a va 2 s. c o m * @return Instances in WEKA format */ @Override public Instances load(File file) { // Set attributeFilter setAttributeFilter(); // Register MetaModels try { registerMetaModels(); } catch (Exception e1) { Console.printerrln("Metamodels cannot be registered!"); e1.printStackTrace(); } // Set location of decent and arffx Model String decentModelLocation = file.getAbsolutePath(); String pathToDecentModelFolder = decentModelLocation.substring(0, decentModelLocation.lastIndexOf(File.separator)); String arffxModelLocation = pathToDecentModelFolder + "/model.arffx"; String logModelLocation = pathToDecentModelFolder + "/model.log"; String arffLocation = pathToDecentModelFolder + "/model.arff"; // If arff File exists, load from it! if (new File(arffLocation).exists()) { System.out.println("Loading arff File..."); BufferedReader reader; Instances data = null; try { reader = new BufferedReader(new FileReader(arffLocation)); data = new Instances(reader); reader.close(); } catch (FileNotFoundException e) { Console.printerrln("File with path: " + arffLocation + " was not found."); throw new RuntimeException(e); } catch (IOException e) { Console.printerrln("File with path: " + arffLocation + " cannot be read."); throw new RuntimeException(e); } // Set class attribute if not set if (data.classIndex() == -1) { Attribute classAttribute = data.attribute(classAttributeName); data.setClass(classAttribute); } return data; } // Location of EOL Scripts String preprocess = "./decent/epsilon/query/preprocess.eol"; String arffxToArffSource = "./decent/epsilon/query/addLabels.eol"; // Set Log Properties System.setProperty("epsilon.logLevel", logLevel); System.setProperty("epsilon.logToFile", logToFile); System.setProperty("epsilon.logFileAvailable", "false"); // Set decent2arffx Properties System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false"); 
System.setProperty("epsilon.transformation.decent2arffx.type", "code"); // Preprocess Data, transform from decent2arffx try { IEolExecutableModule preProcessModule = loadModule(preprocess); IModel preProcessDecentModel = modelHandler.getDECENTModel(decentModelLocation, true, true); IModel preProcessArffxarffxModel = modelHandler.getARFFxModel(arffxModelLocation, false, true); preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel); preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel); execute(preProcessModule, logModelLocation); preProcessDecentModel.dispose(); preProcessArffxarffxModel.dispose(); preProcessModule.reset(); } catch (URISyntaxException e) { Console.printerrln("URI Syntax for decent or arffx model is wrong."); e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } // Transform to arff, for label and confidence attributes try { IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource); IModel arffxToArffArffxModel = modelHandler.getARFFxModel(arffxModelLocation, true, true); arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel); execute(arffxToArffModule, logModelLocation); arffxToArffArffxModel.dispose(); // can be stored and retained alternatively arffxToArffModule.reset(); } catch (URISyntaxException e) { Console.printerrln("URI Syntax for arffx model is wrong."); e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } // Unregister MetaModels, otherwise cast will fail HashMap<String, Object> metaModelCache = new HashMap<>(); for (String key : EPackage.Registry.INSTANCE.keySet()) { metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); } ; for (String key : metaModelCache.keySet()) { EPackage.Registry.INSTANCE.remove(key); } ; // Workaround to gernerate a usable URI. 
Absolute path is not // possible, therefore we need to construct a relative path URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation(); String basePath = location.getFile(); // Location is the bin folder, so we need to delete the last 4 characters basePath = basePath.substring(0, basePath.length() - 4); String relativePath = new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath(); // Loard arffx file and create WEKA Instances ARFFxResourceTool tool = new ARFFxResourceTool(); Resource resource = tool.loadResourceFromXMI(relativePath, "arffx"); Instances dataSet = null; for (EObject o : resource.getContents()) { Model m = (Model) o; dataSet = createWekaDataFormat(m); for (Instance i : m.getData()) { createWekaInstance(dataSet, i); } } // Set class attribute Attribute classAttribute = dataSet.attribute(classAttributeName); dataSet.setClass(classAttribute); // Save as ARFF save(dataSet, arffLocation); return dataSet; }
From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java
License:Apache License
/** * Creates a WekaInstance from an ARFFX Model Instance * //from w w w .j a v a 2s . c om * @param dataSet * WekaInstance dataset, where the arffx model instances should be added to * @param i * arffx model instance */ private void createWekaInstance(Instances dataSet, Instance i) { double[] values = new double[dataSet.numAttributes()]; int j = 0; for (Value value : i.getValues()) { String dataValue = value.getContent(); String attributeName = value.getOfAttribute().getName(); if (attributeFilter.contains(attributeName)) { continue; } // Is value a LABEL.* attribute? if (isLabel(attributeName)) { values[j] = dataSet.attribute(j).indexOfValue(dataValue); } else if (isConfidenceLabel(attributeName)) { // Is value a CONFIDENCE.* attribute? values[j] = dataSet.attribute(j).indexOfValue(dataValue); } else if (attributeName.equals("Artifact.Name")) { // Is it the name of the artifact? artifactNames.add(dataValue); values[j] = getIndexOfArtifactName(dataValue); } else { // Is it a numeric value? values[j] = Double.parseDouble(dataValue); } j++; } DenseInstance inst = new DenseInstance(1.0, values); dataSet.add(inst); }
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
/**
 * Loads the file metrics CSV and merges it with the bug and network metric CSV files that are
 * located in the same folder.
 *
 * <p>
 * The project name is the part of the file name before the first underscore. The companion
 * files {@code <project>_bugs_per_file.csv} and {@code <project>_network_metrics.csv} are read
 * from the folder of the metrics file. Network attributes (from index 2 on) are converted to
 * numeric attributes and appended to the metrics; rows are matched through the file name
 * (attribute 0 of the metrics, attribute 1 of the other files). A nominal {@code bug}
 * attribute with the values {@code 0} and {@code 1} is appended and used as class. The file
 * name attribute and an attribute named {@code eigenvector} (if present) are removed, and all
 * missing values are set to 0.
 * </p>
 *
 * @param fileMetricsFile
 *            CSV file with the file metrics
 * @return the merged data, or {@code null} if one of the files could not be read
 */
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    // the absolute file is used because a bare relative file name has no parent, which would
    // otherwise result in a NullPointerException
    String path = fileMetricsFile.getAbsoluteFile().getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values) and string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            Attribute attribute = networkData.attribute(j);
            if (attribute.isNominal() || attribute.isString()) {
                replaceWithNumericAttribute(networkData, j);
            }
        }

        // map from file name to the index of the instance in the metrics data
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }

        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex)
                        .setValue(attributeIndex, networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);

        // remove the eigenvector attribute; deleting by index avoids modifying the attribute
        // list while it is being scanned
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            metricsData.deleteAttributeAt(eigenvector.index());
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    }
    catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

/**
 * Replaces the attribute at the given index with a numeric attribute of the same name. Values
 * that are missing or cannot be parsed as a number become 0.0.
 *
 * @param data
 *            data whose attribute is replaced
 * @param attributeIndex
 *            index of the attribute that is replaced
 */
private static void replaceWithNumericAttribute(Instances data, int attributeIndex) {
    String attributeName = data.attribute(attributeIndex).name();

    // get temporary values; entries stay 0.0 unless a number can be parsed
    double[] tmpVals = new double[data.size()];
    for (int i = 0; i < data.size(); i++) {
        Instance inst = data.instance(i);
        if (!inst.isMissing(attributeIndex)) {
            try {
                tmpVals[i] = Double.parseDouble(inst.stringValue(attributeIndex));
            }
            catch (NumberFormatException e) {
                // not a number, using 0.0;
                tmpVals[i] = 0.0;
            }
        }
    }

    // replace attribute
    data.deleteAttributeAt(attributeIndex);
    data.insertAttributeAt(new Attribute(attributeName), attributeIndex);
    for (int i = 0; i < data.size(); i++) {
        data.instance(i).setValue(attributeIndex, tmpVals[i]);
    }
}
From source file:de.unidue.langtech.grading.tc.ClusteringTask.java
License:Open Source License
@Override public void execute(TaskContext aContext) throws Exception { if (learningMode.equals(Constants.LM_MULTI_LABEL)) { throw new IllegalArgumentException("Cannot use multi-label setup in clustering."); }//from w w w .j av a2s .c o m boolean multiLabel = false; File arffFileTrain = new File( aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/" + TRAINING_DATA_FILENAME); Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel); // get number of outcomes List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel); Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0), clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0])); Instances copyTrainData = new Instances(trainData); trainData = WekaUtils.removeOutcomeId(trainData, multiLabel); // generate data for clusterer (w/o class) Remove filter = new Remove(); filter.setAttributeIndices("" + (trainData.classIndex() + 1)); filter.setInputFormat(trainData); Instances clusterTrainData = Filter.useFilter(trainData, filter); clusterer.buildClusterer(clusterTrainData); // get a mapping from clusterIDs to instance offsets in the ARFF Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer); Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext); ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>(); for (Integer clusterId : clusterMap.keySet()) { System.out.println("CLUSTER: " + clusterId); for (Integer offset : clusterMap.get(clusterId)) { // get instance ID from instance Instance instance = copyTrainData.get(offset); Double classOffset = new Double(instance.value(copyTrainData.classAttribute())); String label = (String) trainOutcomeValues.get(classOffset.intValue()); clusterAssignments.addSample(clusterId, label); String instanceId = instance 
.stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index()); System.out.println(label + "\t" + instanceId2TextMap.get(instanceId)); } System.out.println(); } System.out.println("ID\tSIZE\tPURITY\tRMSE"); for (Integer clusterId : clusterMap.keySet()) { FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId); double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN(); String purityString = String.format("%.2f", purity); double rmse = getRMSE(fd, trainOutcomeValues); String rmseString = String.format("%.2f", rmse); System.out.println( clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString); } System.out.println(); }
From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java
License:Apache License
/** * Creates an Instance object for the specified List of Features. * <br>/*from w w w . j a va 2s . c o m*/ * Extracts the Instance objects from a source file and suppresses all features but the ones * specified. * * @param fileName File to the training results in ARFF format. * @param features List of {@link AbstractFeatureExtractor}s which are currently being tested. * @return Instances object consisting of the desired attribute structure. * @throws Exception If the ARFF file couldn't be read, an exception is thrown. */ public Instances createInstances(String fileName, List<AbstractFeatureExtractor> features) throws Exception { final Instances train = new Instances(new BufferedReader(new FileReader(fileName))); ArrayList<Attribute> newAttributes = new ArrayList<Attribute>(); for (int i = 0; i < train.numAttributes(); i++) { for (AbstractFeatureExtractor feature : features) { if (train.attribute(i).name().equals(feature.getName())) { newAttributes.add(train.attribute(i)); continue; } } } /* * add the last two features (ACR-System + correct/false predictions) as those * are no features gathered by a FeatureExtractor. */ newAttributes.add(train.attribute(train.numAttributes() - 2)); newAttributes.add(train.attribute(train.numAttributes() - 1)); Instances trainCopy = copyInstances(train, newAttributes); trainCopy.setClassIndex(trainCopy.numAttributes() - 1); return trainCopy; }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 *
 * <p>
 * The method works on a copy of the data without class index, optionally replaces missing
 * values, collects statistics over the full data set, picks distinct random instances as
 * initial centroids and then alternates between assigning instances to clusters and moving
 * the centroids until no assignment changes or the maximum number of iterations is reached.
 * Clusters that end up empty reduce {@code m_NumClusters}.
 * </p>
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    // work on a copy so that the caller's data is not modified
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    // statistics over the full data set
    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    // centroid of the full data; "false" means the per-cluster arrays are not updated
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                // all values are missing: mark the mean as missing
                m_FullMeansOrMediansOrModes[i] = Double.NaN;
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                // missing is more frequent than the most common value: mark with -1
                m_FullMeansOrMediansOrModes[i] = -1;
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
        // shares the array, i.e., m_Assignments sees all later updates of the assignments
        m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    // hash keys of the instances that were already chosen as initial centroid
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    // the selection below swaps instances, therefore a copy is used if the order of the
    // data must be preserved
    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }

    // draw random instances without replacement from the range [0, j]; an instance becomes an
    // initial centroid only if no instance with the same hash key was chosen before
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
        if (!initC.containsKey(hk)) {
            m_ClusterCentroids.add(initInstances.instance(instIndex));
            initC.put(hk, null);
        }
        // move the drawn instance out of the range that is still considered
        initInstances.swap(j, instIndex);

        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
            break;
        }
    }

    // fewer centroids than requested are possible if there are not enough distinct instances
    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    // members of each cluster in the current iteration
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        // assignment step: any changed assignment means another iteration is required
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            // NOTE(review): presumably "true" triggers the update of m_squaredErrors --
            // confirm in clusterProcessedInstance
            int newC = clusterProcessedInstance(toCluster, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                // adds the new centroid of cluster i to m_ClusterCentroids
                moveCentroid(i, tempI[i], true);
            }
        }

        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                // final iteration: compact the members and nominal counts of the
                // non-empty clusters to the front
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];

                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                // the members are rebuilt in the next iteration anyway
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            // reset the per-cluster statistics for the next iteration
            m_squaredErrors = new double[m_NumClusters];
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            // standard deviation per numeric attribute; missing for all other attributes
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Instance.missingValue();
                }
            }
            m_ClusterStdDevs.add(new Instance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    // Save memory!!
    m_DistanceFunction.clean();
}
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * //from ww w. j av a2 s . co m * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; // used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { // in case of Euclidian distance the centroid is the mean point // in case of Manhattan distance the centroid is the median point // in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = 
members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedoids.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * //from w w w . j a v a2s. c o m * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; if (!updateClusterInfo) { vals[0] = 100D; return vals; } double smallestError = Double.MAX_VALUE; Instance currentCentroid = null; for (int j = 0; j < members.numInstances(); j++) { Instance currentInstance = members.instance(j); double distanceError = 0D; for (int i = 0; i < members.numInstances(); i++) { distanceError += m_DistanceFunction.distance(currentInstance, members.instance(i)); } if (distanceError < smallestError) { smallestError = distanceError; currentCentroid = currentInstance; } } vals[0] = currentCentroid.valueSparse(0); for (int j = 0; j < members.numAttributes(); j++) { if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:decisiontree.MyC45.java
/** * Method for building an C45 tree.//from w w w . ja v a2s. c om * * @param instances the training data * @exception Exception if decision tree can't be built successfully */ private void makeTree(Instances instances) throws Exception { // Check if no instances have reached this node. if (instances.numInstances() == 0) { m_Attribute = null; m_ClassValue = Instance.missingValue(); m_Distribution = new double[instances.numClasses()]; return; } // Compute attribute with maximum gain ratio. double[] gainRatios = new double[instances.numAttributes()]; Enumeration attrEnum = instances.enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (attr.isNominal()) { gainRatios[attr.index()] = computeGainRatio(instances, attr); } else if (attr.isNumeric()) { gainRatios[attr.index()] = computeGainRatio(instances, attr, computeThreshold(instances, attr)); } } m_Attribute = instances.attribute(Utils.maxIndex(gainRatios)); // Make leaf if gain ratio is zero. // Otherwise create successors. if (Utils.eq(gainRatios[m_Attribute.index()], 0)) { m_Attribute = null; m_Distribution = new double[instances.numClasses()]; Enumeration instEnum = instances.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); m_Distribution[(int) inst.classValue()]++; } Utils.normalize(m_Distribution); m_ClassValue = Utils.maxIndex(m_Distribution); m_ClassAttribute = instances.classAttribute(); } else { Instances[] splitData = null; int child = 0; if (m_Attribute.isNominal()) { child = m_Attribute.numValues(); splitData = splitData(instances, m_Attribute); } else if (m_Attribute.isNumeric()) { child = 2; splitData = splitData(instances, m_Attribute, computeThreshold(instances, m_Attribute)); } m_Successors = new MyC45[child]; for (int j = 0; j < child; j++) { m_Successors[j] = new MyC45(); m_Successors[j].makeTree(splitData[j]); } } }
From source file:decisiontree.MyID3.java
private void makeTree(Instances data) { // Check if no instances have reached this node. if (data.numInstances() == 0) { splitAttr = null;//from ww w . j a v a 2 s . c o m leafValue = Double.NaN; leafDist = new double[data.numClasses()]; return; } if (data.numDistinctValues(data.classIndex()) == 1) { leafValue = data.firstInstance().classValue(); return; } // Compute attribute with maximum information gain. double[] infoGains = new double[data.numAttributes()]; Enumeration attEnum = data.enumerateAttributes(); while (attEnum.hasMoreElements()) { Attribute att = (Attribute) attEnum.nextElement(); infoGains[att.index()] = computeInfoGain(data, att); } splitAttr = data.attribute(maxIndex(infoGains)); // Make leaf if information gain is zero. // Otherwise create successors. if (Utils.eq(infoGains[splitAttr.index()], 0)) { splitAttr = null; leafDist = new double[data.numClasses()]; Enumeration instEnum = data.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); leafDist[(int) inst.classValue()]++; } normalize(leafDist); leafValue = Utils.maxIndex(leafDist); classAttr = data.classAttribute(); } else { Instances[] splitData = splitData(data, splitAttr); child = new MyID3[splitAttr.numValues()]; for (int j = 0; j < splitAttr.numValues(); j++) { child[j] = new MyID3(); child[j].makeTree(splitData[j]); } } }