List of usage examples for the weka.core.Instances copy constructor: Instances(Instances)
public Instances(Instances dataset)
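All of the examples below share one idiom: passing an existing dataset to this constructor yields a fresh Instances object with the same header and a copy of every instance, so destructive operations (deleting instances with a missing class, filtering, sorting) leave the caller's data untouched. A minimal sketch of that pattern, assuming a hypothetical ARFF file at data/iris.arff:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CopyConstructorSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical path; any dataset with a class attribute works.
    Instances original = DataSource.read("data/iris.arff");
    original.setClassIndex(original.numAttributes() - 1);

    // Instances(Instances) copies the header and all instances, so
    // edits to the copy do not affect 'original'.
    Instances copy = new Instances(original);
    copy.deleteWithMissingClass();

    System.out.println("original: " + original.numInstances()
        + ", copy after cleanup: " + copy.numInstances());
  }
}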
From source file:classifiers.mlp.MultilayerPerceptronCustom.java
License:Open Source License
/**
 * Call this function to build and train a neural network for the training
 * data provided.
 * @param i The training data.
 * @throws Exception if can't build classification properly.
 */
public void buildClassifier(Instances i) throws Exception {

  // can classifier handle the data?
  getCapabilities().testWithFail(i);

  // remove instances with missing class
  i = new Instances(i);
  i.deleteWithMissingClass();

  m_ZeroR = new weka.classifiers.rules.ZeroR();
  m_ZeroR.buildClassifier(i);
  // only class? -> use ZeroR model
  if (i.numAttributes() == 1) {
    System.err.println("Cannot build model (only class attribute present in data!), "
        + "using ZeroR model instead!");
    m_useDefaultModel = true;
    return;
  } else {
    m_useDefaultModel = false;
  }

  m_epoch = 0;
  m_error = 0;
  m_instances = null;
  m_currentInstance = null;
  m_controlPanel = null;
  m_nodePanel = null;
  m_outputs = new NeuralEnd[0];
  m_inputs = new NeuralEnd[0];
  m_numAttributes = 0;
  m_numClasses = 0;
  m_neuralNodes = new NeuralConnection[0];

  m_selected = new FastVector(4);
  m_graphers = new FastVector(2);
  m_nextId = 0;
  m_stopIt = true;
  m_stopped = true;
  m_accepted = false;
  m_instances = new Instances(i);
  m_random = new Random(m_randomSeed);
  m_instances.randomize(m_random);

  if (m_useNomToBin) {
    m_nominalToBinaryFilter = new NominalToBinary();
    m_nominalToBinaryFilter.setInputFormat(m_instances);
    m_instances = Filter.useFilter(m_instances, m_nominalToBinaryFilter);
  }

  m_numAttributes = m_instances.numAttributes() - 1;
  m_numClasses = m_instances.numClasses();

  setClassType(m_instances);

  // this sets up the validation set.
  Instances valSet = null;
  // numInVal is needed later
  int numInVal = (int) (m_valSize / 100.0 * m_instances.numInstances());
  if (m_valSize > 0) {
    if (numInVal == 0) {
      numInVal = 1;
    }
    valSet = new Instances(m_instances, 0, numInVal);
  }

  ///////////
  setupInputs();
  setupOutputs();
  if (m_autoBuild) {
    setupHiddenLayer();
  }
  /////////////////////////////

  // this sets up the gui for usage
  if (m_gui) {
    m_win = new JFrame();

    m_win.addWindowListener(new WindowAdapter() {
      public void windowClosing(WindowEvent e) {
        boolean k = m_stopIt;
        m_stopIt = true;
        int well = JOptionPane.showConfirmDialog(m_win,
            "Are You Sure...\n" + "Click Yes To Accept" + " The Neural Network"
                + "\n Click No To Return",
            "Accept Neural Network", JOptionPane.YES_NO_OPTION);

        if (well == 0) {
          m_win.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
          m_accepted = true;
          blocker(false);
        } else {
          m_win.setDefaultCloseOperation(JFrame.DO_NOTHING_ON_CLOSE);
        }
        m_stopIt = k;
      }
    });

    m_win.getContentPane().setLayout(new BorderLayout());
    m_win.setTitle("Neural Network");
    m_nodePanel = new NodePanel();
    // without the following two lines, the NodePanel.paintComponents(Graphics)
    // method will go berserk if the network doesn't fit completely: it will
    // get called on a constant basis, using 100% of the CPU
    // see the following forum thread:
    // http://forum.java.sun.com/thread.jspa?threadID=580929&messageID=2945011
    m_nodePanel.setPreferredSize(new Dimension(640, 480));
    m_nodePanel.revalidate();

    JScrollPane sp = new JScrollPane(m_nodePanel, JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,
        JScrollPane.HORIZONTAL_SCROLLBAR_NEVER);
    m_controlPanel = new ControlPanel();

    m_win.getContentPane().add(sp, BorderLayout.CENTER);
    m_win.getContentPane().add(m_controlPanel, BorderLayout.SOUTH);
    m_win.setSize(640, 480);
    m_win.setVisible(true);
  }

  // This sets up the initial state of the gui
  if (m_gui) {
    blocker(true);
    m_controlPanel.m_changeEpochs.setEnabled(false);
    m_controlPanel.m_changeLearning.setEnabled(false);
    m_controlPanel.m_changeMomentum.setEnabled(false);
  }

  // For silly situations in which the network gets accepted before training
  // commences
  if (m_numeric) {
    setEndsToLinear();
  }
  if (m_accepted) {
    m_win.dispose();
    m_controlPanel = null;
    m_nodePanel = null;
    m_instances = new Instances(m_instances, 0);
    m_currentInstance = null;
    return;
  }

  // connections done.
  double right = 0;
  double driftOff = 0;
  double lastRight = Double.POSITIVE_INFINITY;
  double bestError = Double.POSITIVE_INFINITY;
  double tempRate;
  double totalWeight = 0;
  double totalValWeight = 0;
  double origRate = m_learningRate; // only used for when reset

  // ensure that at least 1 instance is trained through.
  if (numInVal == m_instances.numInstances()) {
    numInVal--;
  }
  if (numInVal < 0) {
    numInVal = 0;
  }
  for (int noa = numInVal; noa < m_instances.numInstances(); noa++) {
    if (!m_instances.instance(noa).classIsMissing()) {
      totalWeight += m_instances.instance(noa).weight();
    }
  }
  if (m_valSize != 0) {
    for (int noa = 0; noa < valSet.numInstances(); noa++) {
      if (!valSet.instance(noa).classIsMissing()) {
        totalValWeight += valSet.instance(noa).weight();
      }
    }
  }
  m_stopped = false;

  for (int noa = 1; noa < m_numEpochs + 1; noa++) {
    right = 0;
    for (int nob = numInVal; nob < m_instances.numInstances(); nob++) {
      m_currentInstance = m_instances.instance(nob);

      if (!m_currentInstance.classIsMissing()) {

        // this is where the network updating (and training) occurs, for the
        // training set
        resetNetwork();
        calculateOutputs();
        tempRate = m_learningRate * m_currentInstance.weight();
        if (m_decay) {
          tempRate /= noa;
        }

        right += (calculateErrors() / m_instances.numClasses()) * m_currentInstance.weight();
        updateNetworkWeights(tempRate, m_momentum);
      }
    }
    right /= totalWeight;
    if (Double.isInfinite(right) || Double.isNaN(right)) {
      if (!m_reset) {
        m_instances = null;
        throw new Exception("Network cannot train. Try restarting with a"
            + " smaller learning rate.");
      } else {
        // reset the network if possible
        if (m_learningRate <= Utils.SMALL)
          throw new IllegalStateException("Learning rate got too small ("
              + m_learningRate + " <= " + Utils.SMALL + ")!");
        m_learningRate /= 2;
        buildClassifier(i);
        m_learningRate = origRate;
        m_instances = new Instances(m_instances, 0);
        m_currentInstance = null;
        return;
      }
    }

    //////////////////////// do validation testing if applicable
    if (m_valSize != 0) {
      right = 0;
      for (int nob = 0; nob < valSet.numInstances(); nob++) {
        m_currentInstance = valSet.instance(nob);

        if (!m_currentInstance.classIsMissing()) {
          // this is where the network updating occurs, for the validation set
          resetNetwork();
          calculateOutputs();
          right += (calculateErrors() / valSet.numClasses()) * m_currentInstance.weight();
          // note 'right' could be calculated here just using
          // the calculated output values. This would be faster, but
          // less modular
        }
      }

      if (right < lastRight) {
        if (right < bestError) {
          bestError = right;
          // save the network weights at this point
          for (int noc = 0; noc < m_numClasses; noc++) {
            m_outputs[noc].saveWeights();
          }
          driftOff = 0;
        }
      } else {
        driftOff++;
      }
      lastRight = right;
      if (driftOff > m_driftThreshold || noa + 1 >= m_numEpochs) {
        for (int noc = 0; noc < m_numClasses; noc++) {
          m_outputs[noc].restoreWeights();
        }
        m_accepted = true;
      }
      right /= totalValWeight;
    }
    m_epoch = noa;
    m_error = right;
    // shows what the neural net is up to if a gui exists.
    updateDisplay();
    // This junction controls what state the gui is in at the end of each
    // epoch, such as if it is paused, if it is resumable etc...
    if (m_gui) {
      while ((m_stopIt || (m_epoch >= m_numEpochs && m_valSize == 0)) && !m_accepted) {
        m_stopIt = true;
        m_stopped = true;
        if (m_epoch >= m_numEpochs && m_valSize == 0) {
          m_controlPanel.m_startStop.setEnabled(false);
        } else {
          m_controlPanel.m_startStop.setEnabled(true);
        }
        m_controlPanel.m_startStop.setText("Start");
        m_controlPanel.m_startStop.setActionCommand("Start");
        m_controlPanel.m_changeEpochs.setEnabled(true);
        m_controlPanel.m_changeLearning.setEnabled(true);
        m_controlPanel.m_changeMomentum.setEnabled(true);

        blocker(true);

        if (m_numeric) {
          setEndsToLinear();
        }
      }
      m_controlPanel.m_changeEpochs.setEnabled(false);
      m_controlPanel.m_changeLearning.setEnabled(false);
      m_controlPanel.m_changeMomentum.setEnabled(false);

      m_stopped = false;
      // if the network has been accepted stop the training loop
      if (m_accepted) {
        m_win.dispose();
        m_controlPanel = null;
        m_nodePanel = null;
        m_instances = new Instances(m_instances, 0);
        m_currentInstance = null;
        return;
      }
    }
    if (m_accepted) {
      m_instances = new Instances(m_instances, 0);
      m_currentInstance = null;
      return;
    }

    // TODO:
    // Customization: store the model created after this epoch
    ObjectOutputStream oos = new ObjectOutputStream(
        new FileOutputStream("mlp/temp/" + noa + ".model"));
    oos.writeObject(this);
    oos.flush();
    oos.close();
  }
  if (m_gui) {
    m_win.dispose();
    m_controlPanel = null;
    m_nodePanel = null;
  }
  m_instances = new Instances(m_instances, 0);
  m_currentInstance = null;
}
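The method above also leans on two related constructors: Instances(Instances, int), as in new Instances(m_instances, 0), which copies only the header so the training data can be released while keeping the attribute metadata a trained model still needs, and Instances(Instances, int, int), used to slice the validation set out of the randomized data. A short sketch of the header-only idiom, reusing the hypothetical data/iris.arff path from the sketch above:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class HeaderOnlyCopySketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical path, as before.
    Instances train = DataSource.read("data/iris.arff");
    train.setClassIndex(train.numAttributes() - 1);

    // Instances(Instances, int) copies the header and reserves capacity,
    // but copies no instances: numInstances() is 0 afterwards.
    Instances headerOnly = new Instances(train, 0);
    System.out.println(headerOnly.numInstances());      // 0
    System.out.println(headerOnly.equalHeaders(train)); // true
  }
}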
From source file:clusterer.SimpleKMeansWithSilhouette.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

  m_canopyClusters = null;

  // can clusterer handle the data?
  getCapabilities().testWithFail(data);

  m_Iterations = 0;

  m_ReplaceMissingFilter = new ReplaceMissingValues();
  Instances instances = new Instances(data);

  instances.setClassIndex(-1);
  if (!m_dontReplaceMissing) {
    m_ReplaceMissingFilter.setInputFormat(instances);
    instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
  }

  m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][];
  m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
  if (m_displayStdDevs) {
    m_FullStdDevs = instances.variances();
  }
  m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);

  m_FullMissingCounts = m_ClusterMissingCounts[0];
  m_FullNominalCounts = m_ClusterNominalCounts[0];
  double sumOfWeights = instances.sumOfWeights();
  for (int i = 0; i < instances.numAttributes(); i++) {
    if (instances.attribute(i).isNumeric()) {
      if (m_displayStdDevs) {
        m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]);
      }
      if (m_FullMissingCounts[i] == sumOfWeights) {
        m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
      }
    } else {
      if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
        m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
      }
    }
  }

  m_ClusterCentroids = new Instances(instances, m_NumClusters);
  int[] clusterAssignments = new int[instances.numInstances()];

  if (m_PreserveOrder) {
    m_Assignments = clusterAssignments;
  }

  m_DistanceFunction.setInstances(instances);

  Random RandomO = new Random(getSeed());
  int instIndex;
  HashMap<DecisionTableHashKey, Integer> initC = new HashMap<DecisionTableHashKey, Integer>();
  DecisionTableHashKey hk = null;

  Instances initInstances = null;
  if (m_PreserveOrder) {
    initInstances = new Instances(instances);
  } else {
    initInstances = instances;
  }

  if (m_speedUpDistanceCompWithCanopies) {
    m_canopyClusters = new Canopy();
    m_canopyClusters.setNumClusters(m_NumClusters);
    m_canopyClusters.setSeed(getSeed());
    m_canopyClusters.setT2(getCanopyT2());
    m_canopyClusters.setT1(getCanopyT1());
    m_canopyClusters.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
    m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
    m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
    m_canopyClusters.setDebug(getDebug());
    m_canopyClusters.buildClusterer(initInstances);
    // System.err.println(m_canopyClusters);
    m_centroidCanopyAssignments = new ArrayList<long[]>();
    m_dataPointCanopyAssignments = new ArrayList<long[]>();
  }

  if (m_initializationMethod == KMEANS_PLUS_PLUS) {
    kMeansPlusPlusInit(initInstances);
    m_initialStartPoints = new Instances(m_ClusterCentroids);
  } else if (m_initializationMethod == CANOPY) {
    canopyInit(initInstances);
    m_initialStartPoints = new Instances(m_canopyClusters.getCanopies());
  } else if (m_initializationMethod == FARTHEST_FIRST) {
    farthestFirstInit(initInstances);
    m_initialStartPoints = new Instances(m_ClusterCentroids);
  } else {
    // random
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
      instIndex = RandomO.nextInt(j + 1);
      hk = new DecisionTableHashKey(initInstances.instance(instIndex),
          initInstances.numAttributes(), true);
      if (!initC.containsKey(hk)) {
        m_ClusterCentroids.add(initInstances.instance(instIndex));
        initC.put(hk, null);
      }
      initInstances.swap(j, instIndex);

      if (m_ClusterCentroids.numInstances() == m_NumClusters) {
        break;
      }
    }
    m_initialStartPoints = new Instances(m_ClusterCentroids);
  }

  if (m_speedUpDistanceCompWithCanopies) {
    // assign canopies to training data
    for (int i = 0; i < instances.numInstances(); i++) {
      m_dataPointCanopyAssignments.add(m_canopyClusters.assignCanopies(instances.instance(i)));
    }
  }

  m_NumClusters = m_ClusterCentroids.numInstances();

  // removing reference
  initInstances = null;

  int i;
  boolean converged = false;
  int emptyClusterCount;
  Instances[] tempI = new Instances[m_NumClusters];
  m_squaredErrors = new double[m_NumClusters];
  m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
  m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
  startExecutorPool();

  while (!converged) {
    if (m_speedUpDistanceCompWithCanopies) {
      // re-assign canopies to the current cluster centers
      m_centroidCanopyAssignments.clear();
      for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) {
        m_centroidCanopyAssignments.add(m_canopyClusters.assignCanopies(m_ClusterCentroids.instance(kk)));
      }
    }

    emptyClusterCount = 0;
    m_Iterations++;
    converged = true;

    if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
      for (i = 0; i < instances.numInstances(); i++) {
        Instance toCluster = instances.instance(i);
        int newC = clusterProcessedInstance(toCluster, false, true,
            m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments.get(i) : null);
        if (newC != clusterAssignments[i]) {
          converged = false;
        }
        clusterAssignments[i] = newC;
      }
    } else {
      converged = launchAssignToClusters(instances, clusterAssignments);
    }

    // update centroids
    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    for (i = 0; i < m_NumClusters; i++) {
      tempI[i] = new Instances(instances, 0);
    }
    for (i = 0; i < instances.numInstances(); i++) {
      tempI[clusterAssignments[i]].add(instances.instance(i));
    }

    if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
      for (i = 0; i < m_NumClusters; i++) {
        if (tempI[i].numInstances() == 0) {
          // empty cluster
          emptyClusterCount++;
        } else {
          moveCentroid(i, tempI[i], true, true);
        }
      }
    } else {
      emptyClusterCount = launchMoveCentroids(tempI);
    }

    if (m_Iterations == m_MaxIterations) {
      converged = true;
    }

    if (emptyClusterCount > 0) {
      m_NumClusters -= emptyClusterCount;
      if (converged) {
        Instances[] t = new Instances[m_NumClusters];
        int index = 0;
        for (int k = 0; k < tempI.length; k++) {
          if (tempI[k].numInstances() > 0) {
            t[index] = tempI[k];
            for (i = 0; i < tempI[k].numAttributes(); i++) {
              m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
            }
            index++;
          }
        }
        tempI = t;
      } else {
        tempI = new Instances[m_NumClusters];
      }
    }

    if (!converged) {
      m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
    }
  }

  // calculate errors
  if (!m_FastDistanceCalc) {
    for (i = 0; i < instances.numInstances(); i++) {
      clusterProcessedInstance(instances.instance(i), true, false, null);
    }
  }

  if (m_displayStdDevs) {
    m_ClusterStdDevs = new Instances(instances, m_NumClusters);
  }
  m_ClusterSizes = new double[m_NumClusters];
  for (i = 0; i < m_NumClusters; i++) {
    if (m_displayStdDevs) {
      double[] vals2 = tempI[i].variances();
      for (int j = 0; j < instances.numAttributes(); j++) {
        if (instances.attribute(j).isNumeric()) {
          vals2[j] = Math.sqrt(vals2[j]);
        } else {
          vals2[j] = Utils.missingValue();
        }
      }
      m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
    }
    m_ClusterSizes[i] = tempI[i].sumOfWeights();
  }

  m_executorPool.shutdown();

  // save memory!
  m_DistanceFunction.clean();

  // Calculate Silhouette Coefficient
  SilCoeff = new double[instances.numInstances()];
  AvgSilCoeff = 0;
  for (int z = 0; z < instances.numInstances(); z++) {
    double[] distance = new double[m_NumClusters];
    Arrays.fill(distance, 0.0);
    // Sum
    for (int y = 0; y < instances.numInstances(); y++) {
      distance[clusterAssignments[y]] += m_DistanceFunction.distance(instances.get(z), instances.get(y));
    }
    // Average
    for (int x = 0; x < m_NumClusters; x++) {
      distance[x] = distance[x] / m_ClusterSizes[x];
    }
    double a = distance[clusterAssignments[z]];
    distance[clusterAssignments[z]] = Double.MAX_VALUE;
    Arrays.sort(distance);
    double b = distance[0];
    SilCoeff[z] = (b - a) / Math.max(a, b);
    AvgSilCoeff += SilCoeff[z];
  }
  AvgSilCoeff = AvgSilCoeff / instances.numInstances();
  // System.out.println("AvgSilCoeff: " + AvgSilCoeff);
}
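Reading off the silhouette loop at the end of buildClusterer: for each instance z it forms the mean distance from z to every cluster, takes a(z) as the mean distance to z's own cluster (note that, as coded, this average includes z itself and divides by m_ClusterSizes), takes b(z) as the smallest mean distance to any other cluster, and scores

s(z) = \frac{b(z) - a(z)}{\max(a(z),\, b(z))}, \qquad \mathrm{AvgSilCoeff} = \frac{1}{n} \sum_{z=1}^{n} s(z)

which is the standard silhouette coefficient, up to the inclusion of z in its own cluster average.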
From source file:clusterer.SimpleKMeansWithSilhouette.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. Generate the centroid
 * coordinates based on its members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 *
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @param addToCentroidInstances true if the method is to add the computed
 *          coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
    boolean addToCentroidInstances) {

  double[] vals = new double[members.numAttributes()];
  double[][] nominalDists = new double[members.numAttributes()][];
  double[] weightMissing = new double[members.numAttributes()];
  double[] weightNonMissing = new double[members.numAttributes()];

  // Quickly calculate some relevant statistics
  for (int j = 0; j < members.numAttributes(); j++) {
    if (members.attribute(j).isNominal()) {
      nominalDists[j] = new double[members.attribute(j).numValues()];
    }
  }
  for (Instance inst : members) {
    for (int j = 0; j < members.numAttributes(); j++) {
      if (inst.isMissing(j)) {
        weightMissing[j] += inst.weight();
      } else {
        weightNonMissing[j] += inst.weight();
        if (members.attribute(j).isNumeric()) {
          vals[j] += inst.weight() * inst.value(j); // Will be overwritten in Manhattan case
        } else {
          nominalDists[j][(int) inst.value(j)] += inst.weight();
        }
      }
    }
  }
  for (int j = 0; j < members.numAttributes(); j++) {
    if (members.attribute(j).isNumeric()) {
      if (weightNonMissing[j] > 0) {
        vals[j] /= weightNonMissing[j];
      } else {
        vals[j] = Utils.missingValue();
      }
    } else {
      double max = -Double.MAX_VALUE;
      double maxIndex = -1;
      for (int i = 0; i < nominalDists[j].length; i++) {
        if (nominalDists[j][i] > max) {
          max = nominalDists[j][i];
          maxIndex = i;
        }
      }
      // decide on the mode only after the full distribution has been scanned
      if (max < weightMissing[j]) {
        vals[j] = Utils.missingValue();
      } else {
        vals[j] = maxIndex;
      }
    }
  }

  if (m_DistanceFunction instanceof ManhattanDistance) {
    // Need to replace means by medians
    Instances sortedMembers = null;
    int middle = (members.numInstances() - 1) / 2;
    boolean dataIsEven = ((members.numInstances() % 2) == 0);
    if (m_PreserveOrder) {
      sortedMembers = members;
    } else {
      sortedMembers = new Instances(members);
    }
    for (int j = 0; j < members.numAttributes(); j++) {
      if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) {
        // singleton special case
        if (members.numInstances() == 1) {
          vals[j] = members.instance(0).value(j);
        } else {
          vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
          if (dataIsEven) {
            vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
          }
        }
      }
    }
  }

  if (updateClusterInfo) {
    for (int j = 0; j < members.numAttributes(); j++) {
      m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j];
      m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j];
    }
  }

  if (addToCentroidInstances) {
    m_ClusterCentroids.add(new DenseInstance(1.0, vals));
  }

  return vals;
}
From source file:cn.edu.xjtu.dbmine.source.NaiveBayes.java
License:Open Source License
/**
 * Generates the classifier.
 *
 * @param instances set of instances serving as training data
 * @exception Exception if the classifier has not been generated
 *              successfully
 */
public void buildClassifier(Instances instances) throws Exception {

  // can classifier handle the data?
  getCapabilities().testWithFail(instances);

  // remove instances with missing class
  instances = new Instances(instances);
  instances.deleteWithMissingClass();

  m_NumClasses = instances.numClasses();

  // Copy the instances
  m_Instances = new Instances(instances);

  // Discretize instances if required
  if (m_UseDiscretization) {
    m_Disc = new weka.filters.supervised.attribute.Discretize();
    m_Disc.setInputFormat(m_Instances);
    m_Instances = weka.filters.Filter.useFilter(m_Instances, m_Disc);
  } else {
    m_Disc = null;
  }

  // Reserve space for the distributions
  m_Distributions = new Estimator[m_Instances.numAttributes() - 1][m_Instances.numClasses()];
  m_ClassDistribution = new DiscreteEstimator(m_Instances.numClasses(), true);

  int attIndex = 0;
  Enumeration enu = m_Instances.enumerateAttributes();
  while (enu.hasMoreElements()) {
    Attribute attribute = (Attribute) enu.nextElement();

    // If the attribute is numeric, determine the estimator
    // numeric precision from differences between adjacent values
    double numPrecision = DEFAULT_NUM_PRECISION;
    if (attribute.type() == Attribute.NUMERIC) {
      m_Instances.sort(attribute);
      if ((m_Instances.numInstances() > 0) && !m_Instances.instance(0).isMissing(attribute)) {
        double lastVal = m_Instances.instance(0).value(attribute);
        double currentVal, deltaSum = 0;
        int distinct = 0;
        for (int i = 1; i < m_Instances.numInstances(); i++) {
          Instance currentInst = m_Instances.instance(i);
          if (currentInst.isMissing(attribute)) {
            break;
          }
          currentVal = currentInst.value(attribute);
          if (currentVal != lastVal) {
            deltaSum += currentVal - lastVal;
            lastVal = currentVal;
            distinct++;
          }
        }
        if (distinct > 0) {
          numPrecision = deltaSum / distinct;
        }
      }
    }

    for (int j = 0; j < m_Instances.numClasses(); j++) {
      switch (attribute.type()) {
      case Attribute.NUMERIC:
        if (m_UseKernelEstimator) {
          m_Distributions[attIndex][j] = new KernelEstimator(numPrecision);
        } else {
          m_Distributions[attIndex][j] = new NormalEstimator(numPrecision);
        }
        break;
      case Attribute.NOMINAL:
        m_Distributions[attIndex][j] = new DiscreteEstimator(attribute.numValues(), true);
        break;
      default:
        throw new Exception("Attribute type unknown to NaiveBayes");
      }
    }
    attIndex++;
  }

  // Compute counts
  Enumeration enumInsts = m_Instances.enumerateInstances();
  while (enumInsts.hasMoreElements()) {
    Instance instance = (Instance) enumInsts.nextElement();
    updateClassifier(instance);
  }

  // Save space
  m_Instances = new Instances(m_Instances, 0);
}
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer
 * that are not being set via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been
 *           generated successfully
 */
public void buildClusterer(Instances data) throws Exception {

  // can clusterer handle the data?
  getCapabilities().testWithFail(data);

  m_Iterations = 0;

  m_ReplaceMissingFilter = new ReplaceMissingValues();
  Instances instances = new Instances(data);

  instances.setClassIndex(-1);
  if (!m_dontReplaceMissing) {
    m_ReplaceMissingFilter.setInputFormat(instances);
    instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
  }

  m_FullMissingCounts = new int[instances.numAttributes()];
  if (m_displayStdDevs) {
    m_FullStdDevs = new double[instances.numAttributes()];
  }
  m_FullNominalCounts = new int[instances.numAttributes()][0];

  m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
  for (int i = 0; i < instances.numAttributes(); i++) {
    m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
    if (instances.attribute(i).isNumeric()) {
      if (m_displayStdDevs) {
        m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
      }
      if (m_FullMissingCounts[i] == instances.numInstances()) {
        m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
      }
    } else {
      m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
      if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
        m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
      }
    }
  }

  m_ClusterCentroids = new Instances(instances, m_NumClusters);
  int[] clusterAssignments = new int[instances.numInstances()];

  if (m_PreserveOrder)
    m_Assignments = clusterAssignments;

  m_DistanceFunction.setInstances(instances);

  Random RandomO = new Random(getSeed());
  int instIndex;
  HashMap initC = new HashMap();
  DecisionTableHashKey hk = null;

  Instances initInstances = null;
  if (m_PreserveOrder)
    initInstances = new Instances(instances);
  else
    initInstances = instances;

  if (m_initializeWithKMeansPlusPlus) {
    kMeansPlusPlusInit(initInstances);
  } else {
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
      instIndex = RandomO.nextInt(j + 1);
      hk = new DecisionTableHashKey(initInstances.instance(instIndex),
          initInstances.numAttributes(), true);
      if (!initC.containsKey(hk)) {
        m_ClusterCentroids.add(initInstances.instance(instIndex));
        initC.put(hk, null);
      }
      initInstances.swap(j, instIndex);

      if (m_ClusterCentroids.numInstances() == m_NumClusters) {
        break;
      }
    }
  }

  m_NumClusters = m_ClusterCentroids.numInstances();

  // removing reference
  initInstances = null;

  int i;
  boolean converged = false;
  int emptyClusterCount;
  Instances[] tempI = new Instances[m_NumClusters];
  m_squaredErrors = new double[m_NumClusters];
  m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
  m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];

  while (!converged) {
    emptyClusterCount = 0;
    m_Iterations++;
    converged = true;
    for (i = 0; i < instances.numInstances(); i++) {
      Instance toCluster = instances.instance(i);
      int newC = clusterProcessedInstance(toCluster, false, true);
      if (newC != clusterAssignments[i]) {
        converged = false;
      }
      clusterAssignments[i] = newC;
    }

    // update centroids
    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    for (i = 0; i < m_NumClusters; i++) {
      tempI[i] = new Instances(instances, 0);
    }
    for (i = 0; i < instances.numInstances(); i++) {
      tempI[clusterAssignments[i]].add(instances.instance(i));
    }
    for (i = 0; i < m_NumClusters; i++) {
      if (tempI[i].numInstances() == 0) {
        // empty cluster
        emptyClusterCount++;
      } else {
        moveCentroid(i, tempI[i], true);
      }
    }

    if (emptyClusterCount > 0) {
      m_NumClusters -= emptyClusterCount;
      if (converged) {
        Instances[] t = new Instances[m_NumClusters];
        int index = 0;
        for (int k = 0; k < tempI.length; k++) {
          if (tempI[k].numInstances() > 0) {
            t[index++] = tempI[k];
          }
        }
        tempI = t;
      } else {
        tempI = new Instances[m_NumClusters];
      }
    }

    if (m_Iterations == m_MaxIterations)
      converged = true;

    if (!converged) {
      m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    }
  }

  // calculate errors
  if (!m_FastDistanceCalc) {
    for (i = 0; i < instances.numInstances(); i++) {
      clusterProcessedInstance(instances.instance(i), true, false);
    }
  }

  if (m_displayStdDevs) {
    m_ClusterStdDevs = new Instances(instances, m_NumClusters);
  }
  m_ClusterSizes = new int[m_NumClusters];
  for (i = 0; i < m_NumClusters; i++) {
    if (m_displayStdDevs) {
      double[] vals2 = new double[instances.numAttributes()];
      for (int j = 0; j < instances.numAttributes(); j++) {
        if (instances.attribute(j).isNumeric()) {
          vals2[j] = Math.sqrt(tempI[i].variance(j));
        } else {
          vals2[j] = Utils.missingValue();
        }
      }
      m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
    }
    m_ClusterSizes[i] = tempI[i].numInstances();
  }
}
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. Generate the centroid coordinates based
 * on its members (objects assigned to the cluster of the centroid) and the distance
 * function being used.
 *
 * @param centroidIndex index of the centroid which the coordinates will be computed
 * @param members the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
  double[] vals = new double[members.numAttributes()];

  // used only for Manhattan Distance
  Instances sortedMembers = null;
  int middle = 0;
  boolean dataIsEven = false;

  if (m_DistanceFunction instanceof ManhattanDistance) {
    middle = (members.numInstances() - 1) / 2;
    dataIsEven = ((members.numInstances() % 2) == 0);
    if (m_PreserveOrder) {
      sortedMembers = members;
    } else {
      sortedMembers = new Instances(members);
    }
  }

  for (int j = 0; j < members.numAttributes(); j++) {

    // in case of Euclidean distance the centroid is the mean point
    // in case of Manhattan distance the centroid is the median point
    // in both cases, if the attribute is nominal, the centroid is the mode
    if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
      vals[j] = members.meanOrMode(j);
    } else if (m_DistanceFunction instanceof ManhattanDistance) {
      // singleton special case
      if (members.numInstances() == 1) {
        vals[j] = members.instance(0).value(j);
      } else {
        sortedMembers.kthSmallestValue(j, middle + 1);
        vals[j] = sortedMembers.instance(middle).value(j);
        if (dataIsEven) {
          sortedMembers.kthSmallestValue(j, middle + 2);
          vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
        }
      }
    }

    if (updateClusterInfo) {
      m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
      m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
      if (members.attribute(j).isNominal()) {
        if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
            .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
          vals[j] = Utils.missingValue(); // mark mode as missing
        }
      } else {
        if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
          vals[j] = Utils.missingValue(); // mark mean as missing
        }
      }
    }
  }
  if (updateClusterInfo)
    m_ClusterCentroids.add(new DenseInstance(1.0, vals));
  return vals;
}
From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java
License:Open Source License
protected Instances runExp(Instances samplePoints, int round, String postfix, boolean resuming) {
  Instances retval = null;
  try {
    // DataIOFile.saveDataToArffFile("data/zyqTestRange.arff", samplePoints);
    if (resuming) {
      samplePoints = manager.collectPerfs(samplePoints, perfAttName);
    }
    retval = manager.runExp(samplePoints, perfAttName);

    // we output the result set for future debugging and testing purposes
    DataIOFile.saveDataToArffFile("data/trainingBestConf" + round + "_" + postfix + ".arff", samplePoints);

    // evict all bad configurations
    Attribute perfAtt = retval.attribute(perfAttName);
    Iterator<Instance> itr = retval.iterator();
    ArrayList<Integer> toRemove = new ArrayList<Integer>();
    Instance next;
    while (itr.hasNext()) {
      next = itr.next();
      if (next.value(perfAtt) == -1)
        toRemove.add(retval.indexOf(next));
    }
    while (!toRemove.isEmpty())
      retval.remove(toRemove.remove(0));
  } catch (IOException e) {
    e.printStackTrace();
  }

  if (allInstances == null) {
    allInstances = new Instances(retval);
  } else {
    allInstances.addAll(retval);
  }
  return retval;
}
From source file:cn.ict.zyq.bestConf.bestConf.RBSoDDSOptimization.java
License:Open Source License
@Override
public void optimize(String preLoadDatasetPath) {
  ResumeParams rParams = resumePrepareTry();
  boolean justAfterResume = rParams.isResuming;

  // detect whether we need to resume
  if (rParams.isResuming)
    preLoadDatasetPath = null;

  double tempBest;
  while (opParams.currentround < RRSMaxRounds) {
    // is it a global search
    if (samplePoints == null || rParams.propsRound < opParams.currentround) {
      props = bestconf.getAttributes();
      saveProps(props, opParams.currentround, opParams.subround); // for resumability
      opParams.saveToFile();
    }

    if (opParams.currentround != 0 || opParams.subround != 0) {
      if (!justAfterResume || (justAfterResume && (rParams.samplePointRound < opParams.currentround
          || rParams.samplePointSubRound < opParams.subround))) {
        // let's do the sampling
        ((DDSSampler) sampler).setCurrentRound(opParams.currentround);
        samplePoints = sampler.getMultiDimContinuous(props, InitialSampleSetSize, false, bestconf);
        saveSamplePoints(samplePoints, opParams.currentround, opParams.subround);
      }

      if (!justAfterResume || (justAfterResume && rParams.trainingRound < opParams.currentround
          || rParams.trainingSubRound < opParams.subround)) {
        // traverse the set and initiate the experiments
        trainingSet = bestconf.runExp(samplePoints, opParams.currentround,
            "RRS" + String.valueOf(opParams.subround), justAfterResume);
        saveTrainingSet(trainingSet, opParams.currentround, opParams.subround);
      }
    } else { // (currentround==0 && subround==0)
      if (preLoadDatasetPath == null) {
        if (samplePoints == null) {
          // let's do the sampling
          ((DDSSampler) sampler).setCurrentRound(opParams.currentround);
          samplePoints = sampler.getMultiDimContinuous(props, InitialSampleSetSize, false, bestconf);
          samplePoints.add(0, bestconf.defltSettings.firstInstance());
          saveSamplePoints(samplePoints, opParams.currentround, opParams.subround);
        }
        if (trainingSet == null) {
          // traverse the set and initiate the experiments
          trainingSet = bestconf.runExp(samplePoints, opParams.currentround,
              "RRS" + String.valueOf(opParams.subround), justAfterResume);
          saveTrainingSet(trainingSet, opParams.currentround, opParams.subround);
        }
      } else {
        try {
          bestconf.allInstances = DataIOFile.loadDataFromArffFile(preLoadDatasetPath);
          bestconf.allInstances.setClassIndex(bestconf.allInstances.numAttributes() - 1);
          samplePoints = trainingSet = new Instances(bestconf.allInstances);
          saveSamplePoints(samplePoints, opParams.currentround, opParams.subround);
          saveTrainingSet(trainingSet, opParams.currentround, opParams.subround);
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }

    // get the point with the best performance
    Instance tempIns = BestConf.findBestPerf(trainingSet);
    tempBest = tempIns.value(trainingSet.numAttributes() - 1);
    if (tempBest > opParams.currentBest
        || (justAfterResume && tempBest == opParams.currentBest
            && (rParams.propsRound < opParams.currentround || rParams.propsSubRound < opParams.subround))) {
      System.err.println("Previous best is " + opParams.currentBest + "; Current best is " + tempBest);
      opParams.currentBest = tempBest;
      opParams.currentIns = tempIns;
      opParams.saveToFile();

      try {
        // output the best instance of this round
        Instances bestInstances = new Instances(samplePoints, 1);
        bestInstances.add(opParams.currentIns);
        DataIOFile.saveDataToArffFile("data/trainingBestConf_RRS_" + opParams.currentround + "_"
            + opParams.subround + "_" + opParams.currentBest + ".arff", bestInstances);
      } catch (IOException e) {
        e.printStackTrace();
      }

      // let's search locally
      if (!justAfterResume || (justAfterResume && rParams.propsRound < opParams.currentround
          || rParams.propsSubRound < opParams.subround)) {
        props = ConfigSampler.scaleDownDetour(trainingSet, tempIns);
        saveProps(props, opParams.currentround, opParams.subround); // for resumability
      }
      opParams.subround++;
      opParams.saveToFile();
    } else { // let's do the restart
      samplePoints = null;
      opParams.currentround++;
      opParams.subround = 0;
      opParams.saveToFile();
      System.err.println("Entering into round " + opParams.currentround);
      /* if(opParams.currentround>=RRSMaxRounds) break; */
    }

    justAfterResume = false;
  } // RRS search

  System.err.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
  System.err.println("We are ending the optimization experiments!");
  System.err.println("Please wait and don't shutdown!");
  System.err.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
  System.err.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");

  // output the best
  Map<Attribute, Double> attsmap = BestConf.instanceToMap(opParams.currentIns);
  System.out.println(attsmap.toString());

  // set the best configuration to the cluster
  System.err.println("The best performance is : " + opParams.currentBest);
  System.out.println("=========================================");
  TxtFileOperation.writeToFile("bestConfOutput_RRS", attsmap.toString() + "\n");
  System.out.println("=========================================");

  // output the whole training dataset
  try {
    DataIOFile.saveDataToArffFile("data/trainingAllRSS.arff", bestconf.allInstances);
  } catch (IOException e) {
    e.printStackTrace();
  }
}
From source file:cn.ict.zyq.bestConf.COMT2.COMT2.java
License:Open Source License
private void train() throws Exception {
  models = new M5P[ModelNum];
  for (int i = 0; i < ModelNum; i++) {
    models[i] = buildModel(labeledInstances, M[i]);
  }

  for (int i = 0; i < this.comtIterations; i++) {
    ArrayList<Instance>[] InstancePiSet = new ArrayList[ModelNum];
    for (int j = 0; j < ModelNum; j++)
      InstancePiSet[j] = new ArrayList<Instance>();

    for (int m = 0; m < ModelNum; m++) {
      double maxDelta = 0;
      Instance maxDeltaXY = null;
      Enumeration<Instance> enu = this.unlabeledInstances.enumerateInstances();

      while (enu.hasMoreElements()) {
        Instance ulIns = enu.nextElement();
        Instances omega = getSiblings(models[m], ulIns);

        double y = models[m].classifyInstance(ulIns);
        if (indexOfClass == -1)
          indexOfClass = labeledInstances.classIndex();
        ulIns.setValue(indexOfClass, y);

        Instances instancesPi = new Instances(models[m].getM5RootNode().zyqGetTrainingSet());
        instancesPi.add(ulIns);
        M5P modelPi = buildModel(instancesPi, M[m]);
        double delta = computeOmegaDelta(models[m], modelPi, omega);
        if (maxDelta < delta) {
          maxDelta = delta;
          maxDeltaXY = ulIns;
        }
      }

      // now check facts about delta
      if (maxDelta > 0) {
        InstancePiSet[m].add(maxDeltaXY);
        this.unlabeledInstances.delete(this.unlabeledInstances.indexOf(maxDeltaXY));
      }
    }

    // check for both models
    boolean toExit = true;
    for (int m = 0; m < ModelNum; m++) {
      if (InstancePiSet[m].size() > 0) {
        toExit = false;
        break;
      }
    }

    if (toExit)
      break;
    else {
      // update the models
      int toGen = 0;
      for (int m = 0; m < ModelNum; m++) {
        Instances set = models[m].getM5RootNode().zyqGetTrainingSet();
        toGen += InstancePiSet[m].size();
        for (Instance ins : InstancePiSet[m])
          set.add(ins);
        models[m] = buildModel(set, M[m]);
      }

      // Replenish pool U' to size p
      Instances toAdd = retrieveMore(toGen);
      unlabeledInstances.addAll(toAdd);
    }
  } // iterate for a number of rounds or break out on empty InstancePiSets

  // now we have the model as y = 0.5*sum(models[m].predict(x))
}
From source file:cn.pku.sei.GHRC.SpectralClusterer.java
License:Open Source License
/**
 * Generates a clusterer by means of the spectral clustering algorithm.
 *
 * @param data
 *            set of instances serving as training data
 */
@Override
public void buildClusterer(@SuppressWarnings("hiding") final Instances data) {
  setData(new Instances(data));
  final int n = getData().numInstances();
  final DoubleMatrix2D w = useSparseMatrix ? DoubleFactory2D.sparse.make(n, n)
      : DoubleFactory2D.dense.make(n, n);
  /*
   * final double[][] v1 = new double[n][]; for (int i = 0; i < n; i++)
   * v1[i] = data.instance(i).toDoubleArray(); final DoubleMatrix2D v =
   * DoubleFactory2D.dense.make(v1);
   */
  final double sigma_sq = sigma * sigma;
  // Sets up similarity matrix
  for (int i = 0; i < n; i++)
    for (int j = i; j < n; j++) {
      final double dist = getDistanceFunction().distance(getData().instance(i), getData().instance(j));
      if ((r <= 0) || (dist < r)) {
        final double sim = Math.exp(-(dist * dist) / (2 * sigma_sq));
        w.set(i, j, sim);
        w.set(j, i, sim);
      }
    }

  // Compute point partitions
  final int[][] p = partition(w /* , alpha_star */);

  // Deploys results
  numOfClusters = p.length;
  cluster = new int[n];
  for (int i = 0; i < p.length; i++)
    for (int j = 0; j < p[i].length; j++)
      cluster[p[i][j]] = i;
}
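For reference, the similarity that the double loop fills into w is a Gaussian kernel on the configured distance function, truncated to radius r when r > 0 (pairs at distance >= r keep the matrix default of 0):

w_{ij} = \exp\!\left(-\frac{d(x_i, x_j)^2}{2\sigma^2}\right) \quad \text{if } r \le 0 \text{ or } d(x_i, x_j) < r, \qquad w_{ij} = 0 \text{ otherwise}

Note that new Instances(data) is used here for the same reason as in the other examples: the clusterer keeps its own copy of the training data before building the similarity matrix.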