List of usage examples for weka.core Instances attributeStats
public AttributeStats attributeStats(int index)
From source file:gyc.UnderOverBoostM1.java
License:Open Source License
/** * /*from www .ja v a2 s .c o m*/ * nMajnMin * @param data * @param i * @return */ protected Instances randomSampling(Instances copia, int majC, int minC, int a, Random simplingRandom) { int[] majExamples = new int[copia.numInstances()]; int[] minExamples = new int[copia.numInstances()]; int majCount = 0, minCount = 0; // First, we copy the examples from the minority class and save the indexes of the majority // the new data-set contains samples_min + samples_min * N / 100 int size = copia.attributeStats(copia.classIndex()).nominalCounts[majC] * a / 100 * 2; // class name String majClassName = copia.attribute(copia.classIndex()).value(majC); for (int i = 0; i < copia.numInstances(); i++) { if (copia.instance(i).stringValue(copia.classIndex()).equalsIgnoreCase(majClassName)) { // save index majExamples[majCount] = i; majCount++; } else { minExamples[minCount] = i; minCount++; } } /* random undersampling of the majority */ Instances myDataset = new Instances(copia, 0); int r; for (int i = 0; i < size / 2; i++) { r = simplingRandom.nextInt(majCount); myDataset.add(copia.instance(majExamples[r])); if (minCount > 0) { r = simplingRandom.nextInt(minCount); myDataset.add(copia.instance(minExamples[r])); } } myDataset.randomize(simplingRandom); return myDataset; }
From source file:j48.C45Split.java
License:Open Source License
public void buildClassifier(Instances trainInstances) throws Exception { // Initialize the remaining instance variables. m_numSubsets = 0;//from ww w.j a v a 2 s . co m m_splitPoint = Double.MAX_VALUE; m_infoGain = 0; m_gainRatio = 0; // Different treatment for enumerated and numeric // attributes. if (trainInstances.attribute(m_attIndex).isNominal()) { m_complexityIndex = trainInstances.attribute(m_attIndex).numValues(); m_index = m_complexityIndex; handleEnumeratedAttribute(trainInstances); } else { m_complexityIndex = 2; m_index = 0; trainInstances.sort(trainInstances.attribute(m_attIndex)); // /////////////////////////////////////////////////////////////////////////////////////// double stdDev = trainInstances.attributeStats(m_attIndex).numericStats.stdDev; if (stdDev > 200) { // rrrrr = stdDev/200; // System.out.println(stdDev+" "); rrrrr = Math.log10(stdDev) / 1.2; // rrrrr = 1.1; // lllll = stdDev/2000; // lllll = 0.3; lllll = Math.log10(stdDev) / 8; } else { lllll = Math.log10(stdDev) / 1.2; // lllll = stdDev/200; // lllll = 1.1; // rrrrr = stdDev/2000; // rrrrr = 0.3; rrrrr = Math.log10(stdDev) / 8; } handleNumericAttribute(trainInstances); } }
From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java
License:Open Source License
/**
 * Adds one time series per numeric attribute of {@code dataSet} to {@code tsDataset}.
 *
 * <p>Attributes whose values are all missing are reported on standard output and skipped.
 * For every instance the value of the attribute at {@code dateIdx} is read as a
 * millisecond timestamp; a missing attribute value is stored as {@code null}.
 *
 * @param dataSet   data to plot
 * @param dateIdx   index of the attribute holding the timestamp in milliseconds
 * @param tsDataset collection that receives every non-empty series
 */
private static void fillWithSingleAxis(final Instances dataSet, final int dateIdx,
        final TimeSeriesCollection tsDataset) {
    final int rowCount = dataSet.numInstances();
    final Calendar calendar = Calendar.getInstance();

    for (final Integer attrIdx : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) {
        final String attrName = dataSet.attribute(attrIdx).name();
        if (dataSet.attributeStats(attrIdx).missingCount == rowCount) {
            System.out.println("TimeSeriesChartUtil: Only missing values for '" + attrName + "', so skip it!");
            continue;
        }

        final TimeSeries series = new TimeSeries(attrName);
        for (int row = 0; row < rowCount; row++) {
            final Instance current = dataSet.instance(row);
            calendar.setTimeInMillis((long) current.value(dateIdx));
            final Millisecond period = new Millisecond(calendar.getTime());
            if (!current.isMissing(attrIdx)) {
                series.addOrUpdate(period, current.value(attrIdx));
            } else {
                series.addOrUpdate(period, null);
            }
        }

        if (!series.isEmpty()) {
            tsDataset.addSeries(series);
        }
    }
}
From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java
License:Open Source License
private static void fillWithSingleAxisInterval(final Instances dataSet, final int dateIdx, final YIntervalSeriesCollection tsDataset, final double deviation, final int deviatedAttrIdx) { final int numInstances = dataSet.numInstances(); for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) { if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) { System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name() + "', so skip it!"); continue; }//w w w . j av a2 s .c o m final YIntervalSeries ts = new YIntervalSeries(dataSet.attribute(i).name()); for (int k = 0; k < numInstances; k++) { final Instance instancek = dataSet.instance(k); final long timeInMilliSec = (long) instancek.value(dateIdx); if (instancek.isMissing(i)) { //ts.add(timeInMilliSec,null,0d,0d); } else { if (i == deviatedAttrIdx && k > 0 && k < (numInstances - 1)) { System.out.println(numInstances + " " + k + " " + instancek.value(i) + " " + (instancek.value(i) - deviation) + " " + (instancek.value(i) + deviation)); ts.add(timeInMilliSec, instancek.value(i), instancek.value(i) - deviation, instancek.value(i) + deviation); } else { ts.add(timeInMilliSec, instancek.value(i), instancek.value(i), instancek.value(i)); } //System.out.println(instancek.value(i)+" "+(instancek.value(i)-deviation)+" "+(instancek.value(i)+deviation)); } } if (!ts.isEmpty()) tsDataset.addSeries(ts); } }
From source file:lu.lippmann.cdb.datasetview.tabs.StatsTabView.java
License:Open Source License
/**
 * Builds a data set with one row of summary statistics per attribute of {@code dataset}.
 *
 * <p>The rows are assembled as ARFF text and parsed with
 * {@code WekaDataAccessUtil.loadInstancesFromARFFString}. Numeric, non-date attributes get
 * min, max, mean, standard deviation, missing count and missing percentage; nominal
 * attributes get only the "values repartition" column. Other attribute types produce no
 * row. When {@code dataset} has no nominal attribute the last column is removed.
 *
 * <p>Attribute names are written as quoted ARFF values, so names containing spaces,
 * commas or quotes no longer corrupt the generated row.
 *
 * @param dataset data set to summarise
 * @return data set of string attributes describing {@code dataset}
 * @throws Exception if the generated ARFF text cannot be loaded
 */
public static Instances buildStatsForNumericalAttributes(final Instances dataset) throws Exception {
    final StringBuilder sb = new StringBuilder("@relation blabla\n");
    sb.append("@attribute 'name' string\n");
    sb.append("@attribute 'min' string\n");
    sb.append("@attribute 'max' string\n");
    sb.append("@attribute 'mean' string\n");
    sb.append("@attribute 'stdDev' string\n");
    sb.append("@attribute 'missing values count' string\n");
    sb.append("@attribute 'missing values %' string\n");
    sb.append("@attribute 'values repartition' string\n");
    sb.append("@data\n");

    for (int i = 0; i < dataset.numAttributes(); i++) {
        if (dataset.attribute(i).isNumeric() && !dataset.attribute(i).isDate()) {
            sb.append(quoteArffValue(dataset.attribute(i).name())).append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(dataset.attributeStats(i).numericStats.min))
                    .append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(dataset.attributeStats(i).numericStats.max))
                    .append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(dataset.attributeStats(i).numericStats.mean))
                    .append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT.format(dataset.attributeStats(i).numericStats.stdDev))
                    .append(",").append(dataset.attributeStats(i).missingCount).append(",")
                    .append(FormatterUtil.DECIMAL_FORMAT
                            .format(100d * dataset.attributeStats(i).missingCount / dataset.numInstances()))
                    .append(",").append("''").append("\n");
        } else if (dataset.attribute(i).isNominal()) {
            sb.append(quoteArffValue(dataset.attribute(i).name())).append(",'','','','','','','");
            final Map<Object, String> nominalRep = WekaDataStatsUtil
                    .getNominalRepartitionForDescription(dataset, i);
            for (Map.Entry<Object, String> e : nominalRep.entrySet()) {
                sb.append(e.getKey()).append("=").append(e.getValue()).append(" ");
            }
            sb.append("'\n");
        }
    }

    final Instances newds = WekaDataAccessUtil.loadInstancesFromARFFString(sb.toString(), false, false);
    // Without nominal attributes the "values repartition" column is empty everywhere, so drop it.
    if (WekaDataStatsUtil.getNominalAttributesIndexes(dataset).length == 0) {
        newds.deleteAttributeAt(newds.numAttributes() - 1);
    }
    return newds;
}

/**
 * Wraps a raw value in single quotes for use in an ARFF data row, escaping backslashes
 * and single quotes so the value stays a single token.
 */
private static String quoteArffValue(final String raw) {
    return "'" + raw.replace("\\", "\\\\").replace("'", "\\'") + "'";
}
From source file:lu.lippmann.cdb.datasetview.tabs.TimeSeriesCalendarPanel.java
License:Open Source License
/**
 * Rebuilds the calendar panel: the calendar itself, an optional colour legend and the
 * settings combo boxes.
 *
 * <p>Every instance is mapped from its date (attribute {@code dateIdx}) to a colour. With
 * no highlighted attribute all dates are black. A nominal attribute is coloured by its
 * value index, a numeric attribute by its value normalised between the attribute's min
 * and max. Any other attribute type falls back to black, because no statistics exist to
 * derive a colour from. The legend uses the same colour computation as the calendar.
 *
 * @param dataSet            data to display
 * @param dateIdx            index of the date attribute
 * @param attrToHighlightIdx index of the attribute used for colouring, or a negative
 *                           value for no highlighting
 * @param calendarMode       calendar layout mode
 */
public void refresh(final Instances dataSet, final int dateIdx, final int attrToHighlightIdx,
        final Mode calendarMode) {
    this.jxp.removeAll();

    final SimpleDateFormat f = new SimpleDateFormat(dataSet.attribute(dateIdx).getDateFormat());
    final LinkedHashMap<Date, Color> map = new LinkedHashMap<Date, Color>();
    final AttributeStats attributeStats = (attrToHighlightIdx < 0) ? null
            : dataSet.attributeStats(attrToHighlightIdx);

    for (int i = 0; i < dataSet.numInstances(); i++) {
        final String val = dataSet.instance(i).stringValue(dateIdx);
        try {
            final Date d = f.parse(val);
            if (attrToHighlightIdx < 0) {
                map.put(d, Color.BLACK);
            } else if (dataSet.attribute(attrToHighlightIdx).isNominal()) {
                final int valueIdx = (int) dataSet.instance(i).value(attrToHighlightIdx);
                map.put(d, this.colors[nominalColorIndex(valueIdx, attributeStats.nominalCounts.length)]);
            } else if (dataSet.attribute(attrToHighlightIdx).isNumeric()) {
                final double normalizedValue = (dataSet.instance(i).value(attrToHighlightIdx)
                        - attributeStats.numericStats.min)
                        / (attributeStats.numericStats.max - attributeStats.numericStats.min);
                final int idxOfColor = (int) (normalizedValue
                        * (this.colors.length - 1 - this.firstColorIdx));
                map.put(d, this.colors[idxOfColor + this.firstColorIdx]);
            } else {
                // Neither nominal nor numeric: no statistics to colour by.
                map.put(d, Color.BLACK);
            }
        } catch (ParseException e) {
            // An unparsable date is reported and the instance is left out of the calendar.
            e.printStackTrace();
        }
    }

    final JScrollPane scrollp = new JScrollPane(
            MonthCalendarView.buildMultPanel(map, calendarMode, (int) jxp.getSize().getWidth()),
            JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, JScrollPane.HORIZONTAL_SCROLLBAR_NEVER);
    scrollp.setPreferredSize(new Dimension((int) jxp.getSize().getWidth() * 95 / 100,
            (int) jxp.getSize().getHeight() * 95 / 100));
    this.jxp.add(scrollp, BorderLayout.CENTER);

    if (attrToHighlightIdx >= 0) {
        final JXPanel legendPanel = new JXPanel();
        legendPanel.setBorder(new TitledBorder("Legend"));
        legendPanel.setBackground(Color.WHITE);
        legendPanel.setLayout(new GridLayout(0, 1));
        if (dataSet.attribute(attrToHighlightIdx).isNominal()) {
            // NOTE(review): assumes the repartition map iterates in value-index order — confirm.
            int c = 0;
            final Map<Object, Integer> pv = WekaDataStatsUtil.getNominalRepartition(dataSet,
                    attrToHighlightIdx);
            for (final Map.Entry<Object, Integer> entry : pv.entrySet()) {
                final JXLabel comp = new JXLabel(entry.getKey().toString());
                // Same colour computation as the calendar, so legend and dates agree.
                comp.setForeground(this.colors[nominalColorIndex(c, attributeStats.nominalCounts.length)]);
                legendPanel.add(comp);
                c++;
            }
        } else if (dataSet.attribute(attrToHighlightIdx).isNumeric()) {
            final JXLabel compMin = new JXLabel("Min: " + attributeStats.numericStats.min);
            compMin.setForeground(this.colors[this.firstColorIdx]);
            legendPanel.add(compMin);
            final JXLabel compMax = new JXLabel("Max: " + attributeStats.numericStats.max);
            compMax.setForeground(this.colors[this.colors.length - 1]);
            legendPanel.add(compMax);
        }
        this.jxp.add(legendPanel, BorderLayout.NORTH);
    }

    final JXPanel settingsPanel = new JXPanel();
    settingsPanel.setLayout(new GridLayout(1, 0));

    final JComboBox calendarModeCombo = new JComboBox(Mode.values());
    calendarModeCombo.setBorder(new TitledBorder("Mode"));
    final JComboBox attrToHighlightCombo = new JComboBox(
            WekaDataStatsUtil.getAttributeNames(dataSet).toArray());
    attrToHighlightCombo.setBorder(new TitledBorder("Attribute to highlight"));

    // Both combo boxes trigger the same rebuild with the current selections.
    final ActionListener refreshOnChange = new ActionListener() {
        @Override
        public void actionPerformed(ActionEvent e) {
            refresh(dataSet, dateIdx, attrToHighlightCombo.getSelectedIndex(),
                    Mode.valueOf(calendarModeCombo.getSelectedItem().toString()));
        }
    };

    // Selections are set before the listener is attached so that they do not fire a refresh.
    calendarModeCombo.setSelectedItem(calendarMode);
    calendarModeCombo.addActionListener(refreshOnChange);
    settingsPanel.add(calendarModeCombo);

    attrToHighlightCombo.setSelectedIndex(attrToHighlightIdx);
    attrToHighlightCombo.addActionListener(refreshOnChange);
    settingsPanel.add(attrToHighlightCombo);

    this.jxp.add(settingsPanel, BorderLayout.SOUTH);
}

/**
 * Maps the index of a nominal value to an index into {@code colors}, spreading the values
 * over the colours from {@code firstColorIdx} up to the last one.
 */
private int nominalColorIndex(final int valueIdx, final int numValues) {
    return this.firstColorIdx + (valueIdx * (this.colors.length - 1 - this.firstColorIdx)) / numValues;
}
From source file:lu.lippmann.cdb.lab.mds.MDSViewBuilder.java
License:Open Source License
/** * //from ww w .j a v a 2 s. com * @param instance * @param instances * @param mapAlias * @return */ private static Integer getStrongestClass(final Integer centroidIndex, final CollapsedInstances mds) { final KmeansResult mapCentroid = mds.getCentroidMap(); final Instances newInstances = mapCentroid.getClusters().get(centroidIndex); final int classIndex = newInstances.classIndex(); final AttributeStats classAttributeStats = newInstances.attributeStats(classIndex); int maxIndex = -1; int max = -1; for (int i = 0; i < classAttributeStats.nominalCounts.length; i++) { final int currentCount = classAttributeStats.nominalCounts[i]; if (currentCount > max) { max = currentCount; maxIndex = i; } } // Problem with that line :-( return maxIndex; }
From source file:mao.datamining.RemoveUselessColumnsByMissingValues.java
License:Open Source License
/** * Signify that this batch of input to the filter is finished. * * @return true if there are instances pending output * @throws Exception if no input format defined *//*from w w w .j a va 2 s. co m*/ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_removeFilter == null) { // establish attributes to remove from first batch Instances toFilter = getInputFormat(); int[] attsToDelete = new int[toFilter.numAttributes()]; int numToDelete = 0; for (int i = 0; i < toFilter.numAttributes(); i++) { if (i == toFilter.classIndex()) continue; // skip class AttributeStats stats = toFilter.attributeStats(i); //remove those attributes who has high ratio of missing values if ((stats.missingCount * 100) / stats.totalCount > m_maxMissingPercentage) { // System.out.println("stats.missingPercentage: " + (stats.missingCount*100)/stats.totalCount+"%"); attsToDelete[numToDelete++] = i; } //remove those columns defined in the list by manual check if (this.column2DeleteSet.contains(toFilter.attribute(i).name())) { attsToDelete[numToDelete++] = i; } } int[] finalAttsToDelete = new int[numToDelete]; System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete); m_removeFilter = new Remove(); m_removeFilter.setAttributeIndicesArray(finalAttsToDelete); m_removeFilter.setInvertSelection(false); m_removeFilter.setInputFormat(toFilter); for (int i = 0; i < toFilter.numInstances(); i++) { m_removeFilter.input(toFilter.instance(i)); } m_removeFilter.batchFinished(); Instance processed; Instances outputDataset = m_removeFilter.getOutputFormat(); // restore old relation name to hide attribute filter stamp outputDataset.setRelationName(toFilter.relationName()); setOutputFormat(outputDataset); while ((processed = m_removeFilter.output()) != null) { processed.setDataset(outputDataset); push(processed); } } flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); }
From source file:mlda.attributes.MeanEntropiesNominalAttributes.java
License:Open Source License
/**
 * Calculate metric value: the mean entropy over all feature attributes that have nominal
 * counts.
 *
 * <p>The result is also stored in {@code value}. When no feature attribute has nominal
 * counts the division is 0.0 / 0 and the result is NaN.
 *
 * @param mlData Multi-label dataset to which calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData) {
    final Instances data = mlData.getDataSet();
    final int[] features = mlData.getFeatureIndices();

    double entropySum = 0.0;
    int nominalSeen = 0;
    for (int k = 0; k < features.length; k++) {
        final int[] counts = data.attributeStats(features[k]).nominalCounts;
        if (counts == null) {
            // Attributes without nominal counts do not contribute.
            continue;
        }
        entropySum += Utils.entropy(counts);
        nominalSeen++;
    }

    this.value = entropySum / nominalSeen;
    return value;
}
From source file:mlda.labelsDistribution.MaxEntropy.java
License:Open Source License
/** * Calculate metric value/*from w ww. j a v a2s. c o m*/ * * @param mlData Multi-label dataset to which calculate the metric * @return Value of the metric */ public double calculate(MultiLabelInstances mlData) { Instances instances = mlData.getDataSet(); int nLabels = mlData.getNumLabels(); int[] labels = mlData.getLabelIndices(); double[] entropies = new double[nLabels]; for (int i = 0; i < nLabels; i++) { AttributeStats attStats = instances.attributeStats(labels[i]); if (attStats.nominalCounts != null) { entropies[i] = Utils.entropy(attStats.nominalCounts); } } double maxEntropy = Double.MIN_VALUE; for (double e : entropies) { if (e > maxEntropy) { maxEntropy = e; } } this.value = maxEntropy; return value; }