List of usage examples for weka.core Instances numInstances
public int numInstances()
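For orientation, a minimal standalone sketch of the call itself (the ARFF file name is a placeholder):

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset from disk; "data.arff" is an illustrative path.
        Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));
        // numInstances() returns the number of rows currently in the dataset.
        System.out.println("Dataset contains " + data.numInstances() + " instances");
    }
}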
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes the chi-square statistic for a given subset.
 *
 * @param subset          the data for which the statistic is to be computed
 * @param att             the attribute
 * @param setClassCounts  class counts for the initial set of instances
 * @param setNumInstances number of instances in the initial set
 * @return the chi-square value for the given attribute and data
 */
private double computeChiSquareForSubset(Instances subset, Attribute att, double[] setClassCounts,
        double setNumInstances) {

    double[] subsetClassCounts = GetClassCounts(subset);
    double result = 0;
    // Fraction of the parent set that fell into this subset.
    double d = subset.numInstances() / setNumInstances;
    for (int j = 0; j < subset.numClasses(); j++) {
        // Expected count for class j if the split were independent of class.
        double ciNew = setClassCounts[j] * d;
        if (ciNew > 0) {
            result += Math.pow(subsetClassCounts[j] - ciNew, 2) / ciNew;
        }
    }
    return result;
}
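The loop is the standard Pearson chi-square statistic: the expected count for class j is the parent set's class count scaled by the subset's share of the data, and the statistic sums (observed - expected)^2 / expected over classes. A standalone sketch of that formula (the counts below are made up, not from ID3Chi):

public class ChiSquareSketch {
    // Pearson chi-square over observed vs. expected class counts.
    static double chiSquare(double[] observed, double[] expected) {
        double result = 0;
        for (int j = 0; j < observed.length; j++) {
            if (expected[j] > 0) {
                result += Math.pow(observed[j] - expected[j], 2) / expected[j];
            }
        }
        return result;
    }

    public static void main(String[] args) {
        double[] observed = { 8, 2 };  // class counts seen in the subset
        double[] expected = { 5, 5 };  // counts expected from the parent distribution
        System.out.println(chiSquare(observed, expected)); // 3.6
    }
}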
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes information gain for an attribute.
 *
 * @param data             the data for which info gain is to be computed
 * @param att              the attribute
 * @param entropyOfAllData entropy of the data set
 * @return the information gain for the given attribute and data
 * @throws Exception if computation fails
 */
private double computeInfoGain(Instances data, Attribute att, double entropyOfAllData) throws Exception {

    double infoGain = entropyOfAllData;
    Instances[] subset = splitData(data, att);

    // Slot [att.numValues()] holds the instances whose value for att is missing.
    int numUnknown = subset[att.numValues()].numInstances();
    if (numUnknown == data.numInstances()) {
        return 0;
    }

    double[] classCountsUnknownData = GetClassCounts(subset[att.numValues()]);
    for (int j = 0; j < att.numValues(); j++) {
        if (subset[j].numInstances() > 0) {
            // Distribute the unknown-valued instances across subsets
            // proportionally to each subset's size.
            double ratio = (double) subset[j].numInstances() / (double) data.numInstances();
            infoGain -= (((double) subset[j].numInstances() + (double) numUnknown * ratio)
                    / (double) data.numInstances())
                    * computeEntropyWithUnknowns(subset[j], subset[att.numValues()],
                            classCountsUnknownData, ratio);
        }
    }
    return infoGain;
}
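With no missing values (numUnknown == 0), the loop reduces to the textbook formula gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v). A standalone illustration using the classic play-tennis counts (9 positive, 5 negative, split by "outlook" into subsets of sizes 5, 4, 5):

public class InfoGainSketch {
    static double log2(double x) { return Math.log(x) / Math.log(2); }

    // Entropy of a two-class distribution given the count of each class.
    static double entropy(double a, double b) {
        double n = a + b, h = 0;
        if (a > 0) h -= (a / n) * log2(a / n);
        if (b > 0) h -= (b / n) * log2(b / n);
        return h;
    }

    public static void main(String[] args) {
        double parent = entropy(9, 5);                 // H(S) ~ 0.940
        double gain = parent
                - (5.0 / 14) * entropy(2, 3)           // sunny
                - (4.0 / 14) * entropy(4, 0)           // overcast
                - (5.0 / 14) * entropy(3, 2);          // rain
        System.out.println(gain);                      // ~0.247
    }
}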
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes the entropy of a dataset.
 *
 * @param data the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception if computation fails
 */
private double computeEntropy(Instances data) throws Exception {
    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        if (classCounts[j] > 0) {
            entropy -= classCounts[j] * Utils.log2(classCounts[j]);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
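The two-step computation avoids dividing each count by N inside the loop: since p_j = c_j / N, the definition -sum_j p_j log2(p_j) rearranges to log2(N) - (1/N) sum_j c_j log2(c_j), which is exactly the count-based form returned above. A standalone check with made-up counts:

public class EntropyIdentityCheck {
    static double log2(double x) { return Math.log(x) / Math.log(2); }

    public static void main(String[] args) {
        double[] counts = { 6, 2 };  // illustrative class counts
        double n = 8;

        // Direct definition: -sum p_j * log2(p_j).
        double direct = 0;
        for (double c : counts) {
            double p = c / n;
            direct -= p * log2(p);
        }

        // Count-based form used by computeEntropy above.
        double viaCounts = 0;
        for (double c : counts) {
            viaCounts -= c * log2(c);
        }
        viaCounts = viaCounts / n + log2(n);

        System.out.println(direct + " == " + viaCounts); // both ~0.811
    }
}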
From source file:ID3Chi.java
License:Open Source License
private double computeEntropyWithUnknowns(Instances data, Instances unknownData,
        double[] classCountsUnknownData, double ratio) throws Exception {

    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        // Each class count is augmented by a proportional share of the
        // instances whose attribute value is unknown.
        double p = classCounts[j] + classCountsUnknownData[j] * ratio;
        if (p > 0) {
            entropy -= p * Utils.log2(p);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
From source file:ID3Chi.java
License:Open Source License
/**
 * Splits a dataset according to the values of a nominal attribute.
 *
 * @param data the data which is to be split
 * @param att  the attribute to be used for splitting
 * @return the sets of instances produced by the split
 */
private Instances[] splitData(Instances data, Attribute att) {

    // Slot [att.numValues()] collects instances with "unknown" (missing) values.
    Instances[] subset = new Instances[att.numValues() + 1];
    for (int j = 0; j <= att.numValues(); j++) {
        subset[j] = new Instances(data, data.numInstances());
    }

    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        if (inst.isMissing(att)) {
            subset[att.numValues()].add(inst);
        } else {
            subset[(int) inst.value(att)].add(inst);
        }
    }

    for (int i = 0; i < subset.length; i++) {
        subset[i].compactify();
    }
    return subset;
}
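The same bucket layout can be exercised against a toy dataset built in code. The sketch below (names and data are made up, and it assumes Weka 3.7+ where DenseInstance exists; this listing's sources use an older Weka API) partitions four instances by a two-valued nominal attribute, reserving the last slot for missing values:

import java.util.ArrayList;
import java.util.Arrays;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class SplitSketch {
    public static void main(String[] args) {
        // Tiny dataset with one nominal attribute ("color").
        ArrayList<Attribute> atts = new ArrayList<>();
        atts.add(new Attribute("color", new ArrayList<>(Arrays.asList("red", "blue"))));
        Instances data = new Instances("demo", atts, 4);
        Attribute color = data.attribute(0);
        for (int i = 0; i < 4; i++) {
            // Nominal values are stored as indices: 0 = "red", 1 = "blue".
            data.add(new DenseInstance(1.0, new double[] { i % 2 }));
        }

        // Partition by attribute value, with one extra bucket for missing
        // values, mirroring the layout used by splitData above.
        Instances[] subset = new Instances[color.numValues() + 1];
        for (int j = 0; j < subset.length; j++) {
            subset[j] = new Instances(data, data.numInstances());
        }
        for (int i = 0; i < data.numInstances(); i++) {
            if (data.instance(i).isMissing(color)) {
                subset[color.numValues()].add(data.instance(i));
            } else {
                subset[(int) data.instance(i).value(color)].add(data.instance(i));
            }
        }
        System.out.println(subset[0].numInstances() + " red, "
                + subset[1].numInstances() + " blue, "
                + subset[2].numInstances() + " missing"); // 2 red, 2 blue, 0 missing
    }
}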
From source file:MachinLearningInterface.java
private void jButton7ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton7ActionPerformed
    Instances data;
    try {
        data = new Instances(new BufferedReader(new FileReader(this.name3 + ".arff")));

        // Append a nominal class attribute ("target") as the last attribute.
        Instances newData = new Instances(data);
        Add filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels("rods,punctua,networks");
        filter.setAttributeName("target");
        filter.setInputFormat(newData);
        newData = Filter.useFilter(newData, filter);
        System.out.print(newData);

        Vector vec = new Vector();
        newData.setClassIndex(newData.numAttributes() - 1);

        // Note: this check compares newData with itself, so it can never fire.
        if (!newData.equalHeaders(newData)) {
            throw new IllegalArgumentException("Train and test are not compatible!");
        }

        // Load the serialized model bundled on the classpath.
        URL urlToModel = this.getClass().getResource("/" + "Final.model");
        InputStream stream = urlToModel.openStream();
        Classifier cls = (Classifier) weka.core.SerializationHelper.read(stream);

        System.out.println("PROVANT MODEL.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls.classifyInstance(newData.instance(i));
            double[] dist = cls.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            System.out.println(Utils.arrayToString(dist));
            vec.add(newData.classAttribute().value((int) pred));
        }

        // Tally the predicted labels.
        int p = 0, n = 0, r = 0;
        for (Object vec1 : vec) {
            if ("rods".equals(vec1.toString())) {
                r++;
            }
            if ("punctua".equals(vec1.toString())) {
                p++;
            }
            if ("networks".equals(vec1.toString())) {
                n++;
            }
        }

        // Write the predictions once, after the tally (the original rewrote
        // this file on every loop iteration).
        try {
            PrintWriter out = new PrintWriter(this.name3 + "_morphology.txt");
            out.println(vec);
            out.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        System.out.println("VECTOR-> punctua: " + p + ", rods: " + r + ", networks: " + n);
        IJ.showMessage("Your file: " + this.name3 + ".arff"
                + "\nhas been analysed, and it is composed of-> punctua: " + p
                + ", rods: " + r + ", networks: " + n);
    } catch (IOException ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    }
    IJ.showMessage("analysing complete ");
}
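Stripped of the GUI and filtering details, the core of this handler is a standard Weka batch-scoring loop: load an ARFF file, point the class index at the last attribute, deserialize a saved classifier, and classify every instance. A minimal standalone sketch (file and model names are placeholders, not from this project):

import java.io.BufferedReader;
import java.io.FileReader;
import weka.classifiers.Classifier;
import weka.core.Instances;

public class BatchClassifySketch {
    public static void main(String[] args) throws Exception {
        Instances data = new Instances(new BufferedReader(new FileReader("test.arff")));
        data.setClassIndex(data.numAttributes() - 1);

        // Any previously saved Weka model; "model.ser" is illustrative.
        Classifier cls = (Classifier) weka.core.SerializationHelper.read("model.ser");

        // numInstances() bounds the scoring loop over the loaded rows.
        for (int i = 0; i < data.numInstances(); i++) {
            double pred = cls.classifyInstance(data.instance(i));
            System.out.println((i + 1) + ": " + data.classAttribute().value((int) pred));
        }
    }
}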
From source file:MachinLearningInterface.java
private void jButton10ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton10ActionPerformed
    Instances data;
    try {
        data = new Instances(new BufferedReader(new FileReader(this.name3 + ".arff")));

        // Append a nominal class attribute whose labels come from this.liststring.
        Instances newData = new Instances(data);
        Add filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels(this.liststring);
        filter.setAttributeName("target");
        filter.setInputFormat(newData);
        newData = Filter.useFilter(newData, filter);
        System.out.print(newData);

        Vector vec = new Vector();
        newData.setClassIndex(newData.numAttributes() - 1);

        // Note: this check compares newData with itself, so it can never fire.
        if (!newData.equalHeaders(newData)) {
            throw new IllegalArgumentException("Train and test are not compatible!");
        }

        // First pass: classify with the user-selected model.
        Classifier cls = (Classifier) weka.core.SerializationHelper.read(this.model);
        System.out.println("PROVANT MODEL.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls.classifyInstance(newData.instance(i));
            double[] dist = cls.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            System.out.println(Utils.arrayToString(dist));
            vec.add(newData.classAttribute().value((int) pred));
        }

        // Second pass: classify with the bundled Final.model; both passes
        // append their predictions to the same vector.
        URL urlToModel = this.getClass().getResource("/" + "Final.model");
        InputStream stream = urlToModel.openStream();
        Classifier cls2 = (Classifier) weka.core.SerializationHelper.read(stream);
        System.out.println("PROVANT MODEL.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls2.classifyInstance(newData.instance(i));
            double[] dist = cls2.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            System.out.println(Utils.arrayToString(dist));
            vec.add(newData.classAttribute().value((int) pred));
        }

        // Tally the predicted labels.
        int p = 0, n = 0, r = 0;
        for (Object vec1 : vec) {
            if ("rods".equals(vec1.toString())) {
                r++;
            }
            if ("punctua".equals(vec1.toString())) {
                p++;
            }
            if ("networks".equals(vec1.toString())) {
                n++;
            }
        }

        // Write the predictions once, after the tally (the original rewrote
        // this file on every loop iteration).
        try {
            PrintWriter out = new PrintWriter(this.name3 + "_morphology.txt");
            out.println(vec);
            out.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        System.out.println("VECTOR-> punctua: " + p + ", rods: " + r + ", networks: " + n);
        IJ.showMessage("Your file: " + this.name3 + ".arff"
                + "\nhas been analysed, and it is composed of-> punctua: " + p
                + ", rods: " + r + ", networks: " + n);
    } catch (IOException ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    }
    IJ.showMessage("analysing complete ");
    // TODO add your handling code here:
}
From source file:MPCKMeans.java
License:Open Source License
/**
 * Clusters unlabeledData and labeledData (with labels removed),
 * using constraints in labeledPairs to initialize.
 *
 * @param labeledPairs        labeled pairs to be used to initialize
 * @param unlabeledData       unlabeled instances
 * @param labeledData         labeled instances
 * @param numClusters         number of clusters
 * @param startingIndexOfTest starting index of test set in unlabeled data
 * @exception Exception if something goes wrong.
 */
public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData, Instances labeledData,
        int numClusters, int startingIndexOfTest) throws Exception {

    m_TotalTrainWithLabels = labeledData;

    if (labeledPairs != null) {
        m_SeedHash = new HashSet((int) (unlabeledData.numInstances() / 0.75 + 10));
        m_ConstraintsHash = new HashMap();
        m_instanceConstraintHash = new HashMap();

        for (int i = 0; i < labeledPairs.size(); i++) {
            InstancePair pair = (InstancePair) labeledPairs.get(i);
            Integer firstInt = new Integer(pair.first);
            Integer secondInt = new Integer(pair.second);

            // Add instances with constraints to the seed hash.
            if (!m_SeedHash.contains(firstInt)) {
                if (m_verbose) {
                    System.out.println("Adding " + firstInt + " to seedHash");
                }
                m_SeedHash.add(firstInt);
            }
            if (!m_SeedHash.contains(secondInt)) {
                m_SeedHash.add(secondInt);
                if (m_verbose) {
                    System.out.println("Adding " + secondInt + " to seedHash");
                }
            }

            if (pair.first >= pair.second) {
                throw new Exception("Ordering reversed - something wrong!!");
            } else {
                // WLOG first < second.
                InstancePair newPair = new InstancePair(pair.first, pair.second,
                        InstancePair.DONT_CARE_LINK);
                m_ConstraintsHash.put(newPair, new Integer(pair.linkType));
                if (m_verbose) {
                    System.out.println("Adding constraint (" + pair.first + ","
                            + pair.second + "), " + pair.linkType);
                }

                // Hash the constraints for the instances involved.
                Object constraintList1 = m_instanceConstraintHash.get(firstInt);
                if (constraintList1 == null) {
                    ArrayList constraintList = new ArrayList();
                    constraintList.add(pair);
                    m_instanceConstraintHash.put(firstInt, constraintList);
                } else {
                    ((ArrayList) constraintList1).add(pair);
                }
                Object constraintList2 = m_instanceConstraintHash.get(secondInt);
                if (constraintList2 == null) {
                    ArrayList constraintList = new ArrayList();
                    constraintList.add(pair);
                    m_instanceConstraintHash.put(secondInt, constraintList);
                } else {
                    ((ArrayList) constraintList2).add(pair);
                }
            }
        }
    }

    m_StartingIndexOfTest = startingIndexOfTest;
    if (m_verbose) {
        System.out.println("Starting index of test: " + m_StartingIndexOfTest);
    }

    // Learn the metric using labeled data, then cluster both the labeled
    // and unlabeled data.
    System.out.println("Initializing metric: " + m_metric);
    m_metric.buildMetric(unlabeledData);
    m_metricBuilt = true;
    m_metricLearner.setMetric(m_metric);
    m_metricLearner.setClusterer(this);

    // Normalize all data for SPKMeans.
    if (m_metric.doesNormalizeData()) {
        for (int i = 0; i < unlabeledData.numInstances(); i++) {
            m_metric.normalizeInstanceWeighted(unlabeledData.instance(i));
        }
    }

    // Either create a new metric per cluster if multiple metrics are used,
    // or point them all to m_metric.
    m_metrics = new LearnableMetric[numClusters];
    m_metricLearners = new MPCKMeansMetricLearner[numClusters];
    for (int i = 0; i < m_metrics.length; i++) {
        if (m_useMultipleMetrics) {
            m_metrics[i] = (LearnableMetric) m_metric.clone();
            m_metricLearners[i] = (MPCKMeansMetricLearner) m_metricLearner.clone();
            m_metricLearners[i].setMetric(m_metrics[i]);
            m_metricLearners[i].setClusterer(this);
        } else {
            m_metrics[i] = m_metric;
            m_metricLearners[i] = m_metricLearner;
        }
    }

    buildClusterer(unlabeledData, numClusters);
}
From source file:MPCKMeans.java
License:Open Source License
/**
 * Generates a clusterer. Instances in data have to be
 * either all sparse or all non-sparse.
 *
 * @param data set of instances serving as training data
 * @exception Exception if the clusterer has not been generated successfully
 */
public void buildClusterer(Instances data) throws Exception {
    System.out.println("ML weight=" + m_MLweight);
    System.out.println("CL weight= " + m_CLweight);
    System.out.println("LOG term weight=" + m_logTermWeight);
    System.out.println("Regularizer weight= " + m_regularizerTermWeight);

    m_RandomNumberGenerator = new Random(m_RandomSeed);
    m_isOfflineMetric = (m_metric instanceof OfflineLearnableMetric);

    // Don't rebuild the metric if it was already trained.
    if (!m_metricBuilt) {
        m_metric.buildMetric(data);
        m_metricBuilt = true;
        m_metricLearner.setMetric(m_metric);
        m_metricLearner.setClusterer(this);

        m_metrics = new LearnableMetric[m_NumClusters];
        m_metricLearners = new MPCKMeansMetricLearner[m_NumClusters];
        for (int i = 0; i < m_metrics.length; i++) {
            if (m_useMultipleMetrics) {
                m_metrics[i] = (LearnableMetric) m_metric.clone();
                m_metricLearners[i] = (MPCKMeansMetricLearner) m_metricLearner.clone();
                m_metricLearners[i].setMetric(m_metrics[i]);
                m_metricLearners[i].setClusterer(this);
            } else {
                m_metrics[i] = m_metric;
                m_metricLearners[i] = m_metricLearner;
            }
        }
    }

    setInstances(data);
    m_ClusterCentroids = new Instances(m_Instances, m_NumClusters);
    m_ClusterAssignments = new int[m_Instances.numInstances()];

    if (m_Instances.checkForNominalAttributes() && m_Instances.checkForStringAttributes()) {
        throw new UnsupportedAttributeTypeException("Cannot handle nominal attributes\n");
    }

    m_ClusterCentroids = m_Initializer.initialize();

    // If all instances are smoothed by the metric, the centroids need to be
    // smoothed too (note that this is independent of centroid smoothing
    // performed by K-Means).
    if (m_metric instanceof InstanceConverter) {
        System.out.println("Converting centroids...");
        Instances convertedCentroids = new Instances(m_ClusterCentroids, m_NumClusters);
        for (int i = 0; i < m_ClusterCentroids.numInstances(); i++) {
            Instance centroid = m_ClusterCentroids.instance(i);
            convertedCentroids.add(((InstanceConverter) m_metric).convertInstance(centroid));
        }
        m_ClusterCentroids.delete();
        for (int i = 0; i < convertedCentroids.numInstances(); i++) {
            m_ClusterCentroids.add(convertedCentroids.instance(i));
        }
    }

    System.out.println("Done initializing clustering ...");
    getIndexClusters();

    if (m_verbose && m_Seedable) {
        printIndexClusters();
        for (int i = 0; i < m_NumClusters; i++) {
            System.out.println("Centroid " + i + ": " + m_ClusterCentroids.instance(i));
        }
    }

    // Some extra work for smoothing metrics.
    if (m_metric instanceof SmoothingMetric && ((SmoothingMetric) m_metric).getUseSmoothing()) {
        SmoothingMetric smoothingMetric = (SmoothingMetric) m_metric;
        Instances smoothedCentroids = new Instances(m_Instances, m_NumClusters);
        for (int i = 0; i < m_ClusterCentroids.numInstances(); i++) {
            Instance smoothedCentroid = smoothingMetric.smoothInstance(m_ClusterCentroids.instance(i));
            smoothedCentroids.add(smoothedCentroid);
        }
        m_ClusterCentroids = smoothedCentroids;
        updateSmoothingMetrics();
    }

    runKMeans();
}
From source file:MPCKMeans.java
License:Open Source License
/** Sets training instances. */
public void setInstances(Instances instances) {
    m_Instances = instances;

    // Create the checksum coefficients.
    m_checksumCoeffs = new double[instances.numAttributes()];
    for (int i = 0; i < m_checksumCoeffs.length; i++) {
        m_checksumCoeffs[i] = m_RandomNumberGenerator.nextDouble();
    }

    // Hash the instance checksums.
    m_checksumHash = new HashMap(instances.numInstances());
    int classIdx = instances.classIndex();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        double[] values = instance.toDoubleArray();
        double checksum = 0;
        for (int j = 0; j < values.length; j++) {
            if (j != classIdx) {
                checksum += m_checksumCoeffs[j] * values[j];
            }
        }

        // Take care of chaining: instances with equal checksums share a bucket.
        Object list = m_checksumHash.get(new Double((float) checksum));
        ArrayList idxList = null;
        if (list == null) {
            idxList = new ArrayList();
            m_checksumHash.put(new Double((float) checksum), idxList);
        } else {
            idxList = (ArrayList) list;
        }
        idxList.add(new Integer(i));
    }
}
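The checksum here is a random linear projection of each instance's non-class values: identical instances always produce identical checksums, so the hash gives cheap duplicate lookup, with the chained index lists absorbing accidental collisions between distinct instances. A standalone sketch of the same idea on made-up data:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Random;

public class ChecksumHashSketch {
    public static void main(String[] args) {
        double[][] rows = { { 1, 2 }, { 3, 4 }, { 1, 2 } };  // row 2 duplicates row 0
        Random rng = new Random(42);
        double[] coeffs = { rng.nextDouble(), rng.nextDouble() };

        HashMap<Double, ArrayList<Integer>> hash = new HashMap<>();
        for (int i = 0; i < rows.length; i++) {
            // Dot product with fixed random coefficients acts as the checksum.
            double checksum = 0;
            for (int j = 0; j < rows[i].length; j++) {
                checksum += coeffs[j] * rows[i][j];
            }
            // Identical rows always land in the same bucket.
            hash.computeIfAbsent((double) (float) checksum, k -> new ArrayList<>()).add(i);
        }
        System.out.println(hash.values()); // duplicates share a bucket: [0, 2] and [1]
    }
}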