List of usage examples for weka.filters.supervised.attribute.Discretize setInputFormat
@Override public boolean setInputFormat(Instances instanceInfo) throws Exception
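All of the examples below share one pattern: configure a Discretize filter, declare its input format from a training set with setInputFormat, and then push the data through Filter.useFilter. Here is that pattern in a minimal, self-contained sketch (not taken from any source file below; the file name "iris.arff" is a placeholder):

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

public class DiscretizeExample {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("iris.arff"); // placeholder path
    data.setClassIndex(data.numAttributes() - 1);  // supervised discretization needs a class attribute

    Discretize disc = new Discretize();
    disc.setUseBetterEncoding(true); // optional; several examples below set it
    disc.setInputFormat(data);       // declare the input format; cut points are computed when the first batch is filtered
    Instances discretized = Filter.useFilter(data, disc);

    System.out.println(discretized.numAttributes() + " attributes after discretization");
  }
}

Note that setInputFormat must be called before Filter.useFilter; otherwise the filter fails because no input format is defined.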
From source file:GainRatioAttributeEval1.java
License:Open Source License
/**
 * Initializes a gain ratio attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {
  // can evaluator handle data?
  program10 program10 = new program10();
  getCapabilities().testWithFail(program10.DataTrain());

  m_trainInstances = program10.DataTrain();
  m_classIndex = m_trainInstances.classIndex();
  m_numAttribs = m_trainInstances.numAttributes();
  m_numInstances = m_trainInstances.numInstances();

  Discretize disTransform = new Discretize();
  disTransform.setUseBetterEncoding(true);
  disTransform.setInputFormat(m_trainInstances);
  m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
  m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
}
From source file:ChiSquare.ChiSquaredAttributeEval.java
License:Open Source License
/**
 * Initializes a chi-squared attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {
  // can evaluator handle data?
  getCapabilities().testWithFail(data);

  int classIndex = data.classIndex();
  int numInstances = data.numInstances();

  if (!m_Binarize) {
    Discretize disTransform = new Discretize();
    disTransform.setUseBetterEncoding(true);
    disTransform.setInputFormat(data);
    data = Filter.useFilter(data, disTransform);
  } else {
    NumericToBinary binTransform = new NumericToBinary();
    binTransform.setInputFormat(data);
    data = Filter.useFilter(data, binTransform);
  }
  int numClasses = data.attribute(classIndex).numValues();

  // Reserve space and initialize counters
  double[][][] counts = new double[data.numAttributes()][][];
  for (int k = 0; k < data.numAttributes(); k++) {
    if (k != classIndex) {
      int numValues = data.attribute(k).numValues();
      counts[k] = new double[numValues + 1][numClasses + 1];
    }
  }

  // Initialize counters
  double[] temp = new double[numClasses + 1];
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    if (inst.classIsMissing()) {
      temp[numClasses] += inst.weight();
    } else {
      temp[(int) inst.classValue()] += inst.weight();
    }
  }
  for (int k = 0; k < counts.length; k++) {
    if (k != classIndex) {
      for (int i = 0; i < temp.length; i++) {
        counts[k][0][i] = temp[i];
      }
    }
  }

  // Get counts
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    for (int i = 0; i < inst.numValues(); i++) {
      if (inst.index(i) != classIndex) {
        if (inst.isMissingSparse(i) || inst.classIsMissing()) {
          if (!inst.isMissingSparse(i)) {
            counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          } else if (!inst.classIsMissing()) {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst.classValue()] += inst.weight();
            counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
          } else {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          }
        } else {
          counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
          counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
        }
      }
    }
  }

  // distribute missing counts if required
  if (m_missing_merge) {
    for (int k = 0; k < data.numAttributes(); k++) {
      if (k != classIndex) {
        int numValues = data.attribute(k).numValues();

        // Compute marginals
        double[] rowSums = new double[numValues];
        double[] columnSums = new double[numClasses];
        double sum = 0;
        for (int i = 0; i < numValues; i++) {
          for (int j = 0; j < numClasses; j++) {
            rowSums[i] += counts[k][i][j];
            columnSums[j] += counts[k][i][j];
          }
          sum += rowSums[i];
        }

        if (Utils.gr(sum, 0)) {
          double[][] additions = new double[numValues][numClasses];

          // Compute what needs to be added to each row
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
            }
          }

          // Compute what needs to be added to each column
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
            }
          }

          // Compute what needs to be added to each cell
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
            }
          }

          // Make new contingency table
          double[][] newTable = new double[numValues][numClasses];
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              newTable[i][j] = counts[k][i][j] + additions[i][j];
            }
          }
          counts[k] = newTable;
        }
      }
    }
  }

  // Compute chi-squared values
  m_ChiSquareds = new double[data.numAttributes()];
  for (int i = 0; i < data.numAttributes(); i++) {
    if (i != classIndex) {
      m_ChiSquareds[i] = ContingencyTables.chiVal(ContingencyTables.reduceMatrix(counts[i]), false);
    }
  }
}
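Typical driver code for an evaluator like this one might look as follows. This is a sketch only; it assumes the class keeps Weka's standard ASEvaluation API, including an evaluateAttribute(int) accessor over the computed m_ChiSquareds, and that 'data' already has its class index set:

ChiSquaredAttributeEval eval = new ChiSquaredAttributeEval();
eval.buildEvaluator(data);
for (int i = 0; i < data.numAttributes(); i++) {
  if (i != data.classIndex()) {
    // print each attribute's chi-squared score against the class
    System.out.println(data.attribute(i).name() + "\t" + eval.evaluateAttribute(i));
  }
}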
From source file:edu.columbia.cs.ltrie.sampling.queries.generation.ChiSquaredWithYatesCorrectionAttributeEval.java
License:Open Source License
/**
 * Initializes a chi-squared attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {
  // can evaluator handle data?
  getCapabilities().testWithFail(data);

  int classIndex = data.classIndex();
  int numInstances = data.numInstances();

  if (!m_Binarize) {
    Discretize disTransform = new Discretize();
    disTransform.setUseBetterEncoding(true);
    disTransform.setInputFormat(data);
    data = Filter.useFilter(data, disTransform);
  } else {
    NumericToBinary binTransform = new NumericToBinary();
    binTransform.setInputFormat(data);
    data = Filter.useFilter(data, binTransform);
  }
  int numClasses = data.attribute(classIndex).numValues();

  // Reserve space and initialize counters
  double[][][] counts = new double[data.numAttributes()][][];
  for (int k = 0; k < data.numAttributes(); k++) {
    if (k != classIndex) {
      int numValues = data.attribute(k).numValues();
      counts[k] = new double[numValues + 1][numClasses + 1];
    }
  }

  // Initialize counters
  double[] temp = new double[numClasses + 1];
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    if (inst.classIsMissing()) {
      temp[numClasses] += inst.weight();
    } else {
      temp[(int) inst.classValue()] += inst.weight();
    }
  }
  for (int k = 0; k < counts.length; k++) {
    if (k != classIndex) {
      for (int i = 0; i < temp.length; i++) {
        counts[k][0][i] = temp[i];
      }
    }
  }

  // Get counts
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    for (int i = 0; i < inst.numValues(); i++) {
      if (inst.index(i) != classIndex) {
        if (inst.isMissingSparse(i) || inst.classIsMissing()) {
          if (!inst.isMissingSparse(i)) {
            counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          } else if (!inst.classIsMissing()) {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst.classValue()] += inst.weight();
            counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
          } else {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          }
        } else {
          counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
          counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
        }
      }
    }
  }

  // distribute missing counts if required
  if (m_missing_merge) {
    for (int k = 0; k < data.numAttributes(); k++) {
      if (k != classIndex) {
        int numValues = data.attribute(k).numValues();

        // Compute marginals
        double[] rowSums = new double[numValues];
        double[] columnSums = new double[numClasses];
        double sum = 0;
        for (int i = 0; i < numValues; i++) {
          for (int j = 0; j < numClasses; j++) {
            rowSums[i] += counts[k][i][j];
            columnSums[j] += counts[k][i][j];
          }
          sum += rowSums[i];
        }

        if (Utils.gr(sum, 0)) {
          double[][] additions = new double[numValues][numClasses];

          // Compute what needs to be added to each row
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
            }
          }

          // Compute what needs to be added to each column
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
            }
          }

          // Compute what needs to be added to each cell
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
            }
          }

          // Make new contingency table
          double[][] newTable = new double[numValues][numClasses];
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              newTable[i][j] = counts[k][i][j] + additions[i][j];
            }
          }
          counts[k] = newTable;
        }
      }
    }
  }

  // Compute chi-squared values
  m_ChiSquareds = new double[data.numAttributes()];
  for (int i = 0; i < data.numAttributes(); i++) {
    if (i != classIndex) {
      m_ChiSquareds[i] = chiVal(ContingencyTables.reduceMatrix(counts[i]));
    }
  }
}
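The only substantive difference from the previous example is the final statistic: this class calls a local chiVal(...) instead of ContingencyTables.chiVal(..., false), presumably to apply Yates' continuity correction. The local method is not shown in the source; a sketch of what such a corrected statistic could look like (an assumption, not the class's actual implementation):

// Sketch: Yates-corrected chi-squared over a contingency table,
// chi^2 = sum over cells of (|observed - expected| - 0.5)^2 / expected.
static double chiValYates(double[][] table) {
  double n = 0, chi = 0;
  double[] rowSums = new double[table.length];
  double[] colSums = new double[table[0].length];
  for (int i = 0; i < table.length; i++) {
    for (int j = 0; j < table[0].length; j++) {
      rowSums[i] += table[i][j];
      colSums[j] += table[i][j];
      n += table[i][j];
    }
  }
  for (int i = 0; i < table.length; i++) {
    for (int j = 0; j < table[0].length; j++) {
      double expected = rowSums[i] * colSums[j] / n;
      if (expected > 0) {
        double d = Math.abs(table[i][j] - expected) - 0.5;
        chi += d * d / expected;
      }
    }
  }
  return chi;
}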
From source file:es.jarias.FMC.FMC.java
License:Open Source License
public static void buildModel(MultiLabelInstances trainData, MultiLabelInstances testData, int fold,
    String baseClassifierClass, String discType, String fss, String outPath, String prune) throws Exception {
  double start = System.nanoTime();
  try {
    // DATA PREPROCESSING:
    weka.filters.unsupervised.attribute.Discretize m_unsuperDiscretize = null;
    if (discType.equals("supervised")) {
      // pass: supervised discretization is applied to each model later, during the training step.
    } else if (discType.equals("unsupervised")) {
      // Apply a baseline discretization filter:
      m_unsuperDiscretize = new weka.filters.unsupervised.attribute.Discretize();
      m_unsuperDiscretize.setUseEqualFrequency(false);
      m_unsuperDiscretize.setBins(3);
      m_unsuperDiscretize.setInputFormat(trainData.getDataSet());
      trainData = trainData.reintegrateModifiedDataSet(
          Filter.useFilter(trainData.getDataSet(), m_unsuperDiscretize));
    } else {
      throw new Exception("Invalid Discretization Type");
    }

    if (!fss.equals("no") && !fss.equals("CFS"))
      throw new Exception("Invalid FSS strategy");
    if (!prune.equals("full") && !prune.equals("tree") && !prune.equals("best") && !prune.equals("hiton")
        && !prune.equals("bdeu"))
      throw new Exception("Invalid Pruning strategy");

    // Label information
    int m_numLabels = trainData.getNumLabels();
    int[] m_labelIndices = trainData.getLabelIndices();

    // Map for reference:
    HashMap<Integer, Integer> mapLabels = new HashMap<Integer, Integer>(m_numLabels);
    String[] mapLabelsName = new String[m_numLabels];
    for (int l = 0; l < m_numLabels; l++) {
      mapLabels.put(trainData.getLabelIndices()[l], l);
      mapLabelsName[l] = trainData.getDataSet().attribute(trainData.getLabelIndices()[l]).name();
    }

    // Get label combinations:
    int m_numPairs = (m_labelIndices.length * (m_labelIndices.length - 1)) / 2;
    int[][] labelCombinations = new int[m_numPairs][2];
    int counter = 0;
    for (int i = 0; i < m_labelIndices.length; i++) {
      for (int j = i + 1; j < m_labelIndices.length; j++) {
        labelCombinations[counter] = new int[] { m_labelIndices[i], m_labelIndices[j] };
        counter++;
      }
    }

    // Select the pairs:
    int m_numSelected = m_numPairs;
    int m_numSingleton = 0;
    int[] ordered;
    boolean[] selectedPair = new boolean[m_numPairs];
    boolean[] singleton = new boolean[m_numLabels];
    for (int i = 0; i < m_numPairs; i++)
      selectedPair[i] = true;

    if (!prune.equals("full")) {
      m_numSelected = 0;
      selectedPair = new boolean[m_numPairs];

      // Info gain for pruned model:
      double[][] mutualInfoPairs = mutualInfo(trainData.getDataSet(), trainData.getLabelIndices());
      double[] mutualInfo = new double[m_numPairs];
      counter = 0;
      for (int i = 0; i < m_labelIndices.length; i++) {
        Instances tempInstances = new Instances(trainData.getDataSet());
        tempInstances.setClassIndex(m_labelIndices[i]);
        for (int j = i + 1; j < m_labelIndices.length; j++) {
          mutualInfo[counter] = mutualInfoPairs[i][j];
          counter++;
        }
      }
      ordered = orderBy(mutualInfo);

      if (prune.equals("tree")) {
        // Each label starts in its own connected component
        HashMap<Integer, ArrayList<Integer>> tree_compo = new HashMap<Integer, ArrayList<Integer>>(m_numLabels);
        HashMap<Integer, Integer> tree_index = new HashMap<Integer, Integer>(m_numLabels);
        for (int i = 0; i < m_numLabels; i++) {
          tree_compo.put(i, new ArrayList<Integer>());
          tree_compo.get(i).add(i);
          tree_index.put(i, i);
        }
        for (int i = 0; i < m_numPairs; i++) {
          if (m_numSelected >= m_numLabels - 1)
            break;
          int pairIndex = ordered[i];
          int pair_i = mapLabels.get(labelCombinations[pairIndex][0]);
          int pair_j = mapLabels.get(labelCombinations[pairIndex][1]);
          int conex_i = tree_index.get(pair_i);
          int conex_j = tree_index.get(pair_j);
          if (conex_i != conex_j) {
            ArrayList<Integer> family = tree_compo.get(conex_j);
            tree_compo.get(conex_i).addAll(family);
            for (int element : family) {
              tree_index.put(element, conex_i);
            }
            selectedPair[pairIndex] = true;
            m_numSelected++;
          }
        }
      } // End of the Chow-Liu algorithm

      if (prune.equals("best") || prune.equals("tree")) {
        int amount = 0;
        if (prune.equals("best"))
          amount = (int) (m_numLabels * 2);
        int index = 0;
        while (m_numSelected < amount && index < m_numPairs) {
          if (!selectedPair[ordered[index]]) {
            m_numSelected++;
            selectedPair[ordered[index]] = true;
          }
          index++;
        }
      } // End of the linear tree and best procedures

      if (prune.equals("hiton")) {
        weka.filters.unsupervised.attribute.Remove m_remove = new weka.filters.unsupervised.attribute.Remove();
        m_remove.setAttributeIndicesArray(trainData.getLabelIndices());
        m_remove.setInvertSelection(true);
        m_remove.setInputFormat(trainData.getDataSet());
        Instances hitonData = Filter.useFilter(trainData.getDataSet(), m_remove);
        HITON hiton = new HITON(hitonData);
        HashSet<Integer>[] markovBlanket = new HashSet[m_numLabels];
        for (int l = 0; l < m_numLabels; l++)
          markovBlanket[l] = hiton.HITONMB(l);
        for (int p = 0; p < m_numPairs; p++) {
          int p_i = mapLabels.get(labelCombinations[p][0]);
          int p_j = mapLabels.get(labelCombinations[p][1]);
          if (markovBlanket[p_i].contains(p_j) || markovBlanket[p_j].contains(p_i)) {
            selectedPair[p] = true;
            m_numSelected++;
          }
        }
      } // End of the HITON pruning algorithm

      if (prune.equals("bdeu")) {
        weka.filters.unsupervised.attribute.Remove m_remove = new weka.filters.unsupervised.attribute.Remove();
        m_remove.setAttributeIndicesArray(trainData.getLabelIndices());
        m_remove.setInvertSelection(true);
        m_remove.setInputFormat(trainData.getDataSet());
        Instances hitonData = Filter.useFilter(trainData.getDataSet(), m_remove);
        BDeu hiton = new BDeu(hitonData);
        double[] scores = hiton.singleScore;
        double[] pairScores = new double[m_numPairs];
        double[] sumScores = new double[m_numLabels];
        for (int p = 0; p < m_numPairs; p++) {
          int head = mapLabels.get(labelCombinations[p][0]);
          int tail = mapLabels.get(labelCombinations[p][1]);
          pairScores[p] = -1 * (scores[tail] - (hiton.localBdeuScore(tail, new Integer[] { head })));
          sumScores[tail] += pairScores[p];
          sumScores[head] += pairScores[p];
        }
        HashSet<Integer>[] parents = new HashSet[m_numLabels];
        for (int i = 0; i < m_numLabels; i++)
          parents[i] = new HashSet<Integer>();
        ordered = orderBy(pairScores);
        int[] topologicalOrdering = orderBy(sumScores);
        int[] relevance = new int[m_numLabels];
        for (int i = 0; i < m_numLabels; i++)
          relevance[topologicalOrdering[i]] = i;
        for (int p = 0; p < m_numPairs; p++) {
          int pair = ordered[p];
          int head = mapLabels.get(labelCombinations[pair][0]);
          int tail = mapLabels.get(labelCombinations[pair][1]);
          if (relevance[head] > relevance[tail]) {
            int aux = head;
            head = tail;
            tail = aux;
          }
          // Check if adding this edge improves the score
          parents[tail].add(head);
          double scoreAdd = hiton.localBdeuScore(tail, parents[tail].toArray(new Integer[parents[tail].size()]));
          double diff = scores[tail] - scoreAdd;
          if (diff < 0) {
            scores[tail] = scoreAdd;
            selectedPair[pair] = true;
            m_numSelected++;
          } else {
            parents[tail].remove(head);
          }
        }
      } // End of the BDeu procedure
      // End of the pruning algorithms

      // Determine singleton variables
      for (int i = 0; i < m_labelIndices.length; i++)
        singleton[i] = true;
      for (int p = 0; p < m_numPairs; p++) {
        if (selectedPair[p]) {
          singleton[mapLabels.get(labelCombinations[p][0])] = false;
          singleton[mapLabels.get(labelCombinations[p][1])] = false;
        }
      }
      for (int i = 0; i < m_labelIndices.length; i++)
        if (singleton[i])
          m_numSingleton++;
      mutualInfo = null;
    }

    // Generate single class datasets from the full ML data and learn models:
    HashMap<Integer, Classifier> models = new HashMap<Integer, Classifier>();
    HashMap<Integer, Classifier> singletonModels = new HashMap<Integer, Classifier>();
    HashMap<Integer, weka.filters.supervised.attribute.AttributeSelection> singletonFilterSel = new HashMap<Integer, weka.filters.supervised.attribute.AttributeSelection>();
    HashMap<Integer, weka.filters.supervised.attribute.Discretize> singletonFilter = new HashMap<Integer, weka.filters.supervised.attribute.Discretize>();
    weka.filters.supervised.attribute.AttributeSelection[] m_selecters = new weka.filters.supervised.attribute.AttributeSelection[m_numPairs];
    weka.filters.supervised.attribute.Discretize[] m_discretizers = new weka.filters.supervised.attribute.Discretize[m_numPairs];
    ClassCompoundTransformation[] converters = new ClassCompoundTransformation[m_numPairs];

    for (int i = 0; i < m_numPairs; i++) {
      if (!selectedPair[i]) {
        continue;
      }
      MultiLabelInstances filteredLabelData = trainData.reintegrateModifiedDataSet(
          RemoveAllLabels.transformInstances(trainData.getDataSet(),
              complement(m_labelIndices, labelCombinations[i])));
      converters[i] = new ClassCompoundTransformation();
      Instances singleLabelData = converters[i].transformInstances(filteredLabelData);
      if (discType.equals("supervised")) {
        m_discretizers[i] = new Discretize();
        m_discretizers[i].setInputFormat(singleLabelData);
        singleLabelData = Filter.useFilter(singleLabelData, m_discretizers[i]);
      }
      if (fss.equals("CFS")) {
        m_selecters[i] = new weka.filters.supervised.attribute.AttributeSelection();
        m_selecters[i].setSearch(new weka.attributeSelection.BestFirst());
        m_selecters[i].setEvaluator(new weka.attributeSelection.CfsSubsetEval());
        m_selecters[i].setInputFormat(singleLabelData);
        singleLabelData = Filter.useFilter(singleLabelData, m_selecters[i]);
      }
      models.put(i, (Classifier) Class.forName("weka.classifiers." + baseClassifierClass).newInstance());
      models.get(i).buildClassifier(singleLabelData);
    }

    // Learn singleton models:
    for (int i = 0; i < m_labelIndices.length; i++) {
      if (singleton[i]) {
        Instances singleLabelData = new Instances(trainData.getDataSet());
        singleLabelData.setClassIndex(m_labelIndices[i]);
        singleLabelData = RemoveAllLabels.transformInstances(singleLabelData,
            complement(m_labelIndices, new int[] { m_labelIndices[i] }));
        if (discType.equals("supervised")) {
          singletonFilter.put(i, new Discretize());
          singletonFilter.get(i).setInputFormat(singleLabelData);
          singleLabelData = Filter.useFilter(singleLabelData, singletonFilter.get(i));
        }
        if (fss.equals("CFS")) {
          weka.filters.supervised.attribute.AttributeSelection tempFilter = new weka.filters.supervised.attribute.AttributeSelection();
          tempFilter.setSearch(new weka.attributeSelection.BestFirst());
          tempFilter.setEvaluator(new weka.attributeSelection.CfsSubsetEval());
          tempFilter.setInputFormat(singleLabelData);
          singletonFilterSel.put(i, tempFilter);
          singleLabelData = Filter.useFilter(singleLabelData, singletonFilterSel.get(i));
        }
        Classifier single;
        single = (Classifier) Class.forName("weka.classifiers." + baseClassifierClass).newInstance();
        single.buildClassifier(singleLabelData);
        singletonModels.put(i, single);
      }
    }
    // END OF THE LEARNING STAGE

    double train = System.nanoTime() - start;
    start = System.nanoTime();

    Writer writerConf = null;
    Writer writerDist = null;
    Writer writerSing = null;
    Writer writerLayo = null;
    try {
      writerConf = new BufferedWriter(
          new OutputStreamWriter(new FileOutputStream(outPath + "/conf_" + fold + ".txt"), "utf-8"));
      writerDist = new BufferedWriter(
          new OutputStreamWriter(new FileOutputStream(outPath + "/dist_" + fold + ".txt"), "utf-8"));
      writerSing = new BufferedWriter(
          new OutputStreamWriter(new FileOutputStream(outPath + "/sing_" + fold + ".txt"), "utf-8"));
      writerLayo = new BufferedWriter(
          new OutputStreamWriter(new FileOutputStream(outPath + "/layo_" + fold + ".txt"), "utf-8"));

      for (int l = 0; l < m_numLabels; l++) {
        writerLayo.write(trainData.getDataSet().attribute(m_labelIndices[l]).numValues() + "\t");
      }
      writerLayo.write("\n");
      writerLayo.write(m_numSelected + "\t" + m_numSingleton);
      writerLayo.close();

      // Get distributions for each instance and variable pair:
      double[] distributions;
      for (int i = 0; i < testData.getDataSet().size(); i++) {
        for (int l : testData.getLabelIndices()) {
          writerConf.write((int) testData.getDataSet().instance(i).value(l) + "\t");
        }
        writerConf.write("\n");

        Instance inst = testData.getDataSet().get(i);
        if (discType.equals("unsupervised")) {
          m_unsuperDiscretize.input(inst);
          inst = m_unsuperDiscretize.output();
        }
        for (int p = 0; p < m_numPairs; p++) {
          if (!selectedPair[p]) {
            continue;
          }
          Instance processed = converters[p].transformInstance(inst, testData.getLabelIndices());
          if (discType.equals("supervised")) {
            m_discretizers[p].input(processed);
            processed = m_discretizers[p].output();
            // m_removers[p].input(processed);
            // processed = m_removers[p].output();
          }
          if (!fss.equals("no")) {
            m_selecters[p].input(processed);
            processed = m_selecters[p].output();
          }
          distributions = models.get(p).distributionForInstance(processed);
          writerDist.write(mapLabels.get(labelCombinations[p][0]) + "\t"
              + mapLabels.get(labelCombinations[p][1]) + "\t");
          for (int d = 0; d < distributions.length; d++) {
            writerDist.write(distributions[d] + "\t");
          }
          writerDist.write("\n");
        }

        // Get predictions for singleton labels:
        for (int m = 0; m < m_labelIndices.length; m++) {
          if (singleton[m]) {
            Instance processed = RemoveAllLabels.transformInstance(inst,
                complement(m_labelIndices, new int[] { m_labelIndices[m] }));
            if (discType.equals("supervised")) {
              singletonFilter.get(m).input(processed);
              processed = singletonFilter.get(m).output();
            }
            if (!fss.equals("no")) {
              singletonFilterSel.get(m).input(processed);
              processed = singletonFilterSel.get(m).output();
            }
            double[] distribution = singletonModels.get(m).distributionForInstance(processed);
            double maxValue = 0;
            int conf = -1;
            for (int v = 0; v < distribution.length; v++) {
              if (distribution[v] > maxValue) {
                maxValue = distribution[v];
                conf = v;
              }
            }
            writerSing.write(i + "\t" + m + "\t" + conf + "\n");
          }
        }
      }
      writerConf.close();
      writerDist.close();
      writerSing.close();

      double test = System.nanoTime() - start;
      // train /= 1000000000.0;
      // test /= 1000000000.0;
      // System.out.println(java.lang.String.format("FMC-%s\t%s\t%s\t%d\t%s\t%s\t%.4f\t%.4f", prune, baseClassifierClass, dbName, fold, discType, fss, train, test));
    } catch (IOException ex) {
      // report
    } finally {
      try { writerConf.close(); } catch (Exception ex) { }
      try { writerDist.close(); } catch (Exception ex) { }
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
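Besides batch filtering at training time, the FMC example also shows the per-instance side of the Filter API: once a supervised Discretize has had setInputFormat called and has seen its first batch, later test instances are pushed through input()/output() one at a time. A minimal sketch of that pattern, where 'train' and 'test' are assumed to be compatible Instances with the class index already set:

Discretize disc = new Discretize();
disc.setInputFormat(train);                          // declare the format
Instances discTrain = Filter.useFilter(train, disc); // first batch fixes the cut points

for (int i = 0; i < test.numInstances(); i++) {
  disc.input(test.instance(i));    // push one instance through the trained filter
  Instance filtered = disc.output();
  // ... use 'filtered', e.g. classifier.distributionForInstance(filtered)
}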
From source file:feature.InfoGainEval.java
License:Open Source License
/**
 * Initializes an information gain attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public double computeInfoGain(Instances data, int att) throws Exception {
  // can evaluator handle data?
  getCapabilities().testWithFail(data);

  int classIndex = data.classIndex();
  int numInstances = data.numInstances();

  if (!m_Binarize) {
    Discretize disTransform = new Discretize();
    disTransform.setUseBetterEncoding(true);
    disTransform.setInputFormat(data);
    data = Filter.useFilter(data, disTransform);
  } else {
    NumericToBinary binTransform = new NumericToBinary();
    binTransform.setInputFormat(data);
    data = Filter.useFilter(data, binTransform);
  }
  int numClasses = data.attribute(classIndex).numValues();

  // Reserve space and initialize counters
  double[][][] counts = new double[data.numAttributes()][][];
  for (int k = 0; k < data.numAttributes(); k++) {
    if (k != classIndex) {
      int numValues = data.attribute(k).numValues();
      counts[k] = new double[numValues + 1][numClasses + 1];
    }
  }

  // Initialize counters
  double[] temp = new double[numClasses + 1];
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    if (inst.classIsMissing()) {
      temp[numClasses] += inst.weight();
    } else {
      temp[(int) inst.classValue()] += inst.weight();
    }
  }
  for (int k = 0; k < counts.length; k++) {
    if (k != classIndex) {
      for (int i = 0; i < temp.length; i++) {
        counts[k][0][i] = temp[i];
      }
    }
  }

  // Get counts
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    for (int i = 0; i < inst.numValues(); i++) {
      if (inst.index(i) != classIndex) {
        if (inst.isMissingSparse(i) || inst.classIsMissing()) {
          if (!inst.isMissingSparse(i)) {
            counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          } else if (!inst.classIsMissing()) {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst.classValue()] += inst.weight();
            counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
          } else {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          }
        } else {
          counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
          counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
        }
      }
    }
  }

  // distribute missing counts if required
  if (m_missing_merge) {
    for (int k = 0; k < data.numAttributes(); k++) {
      if (k != classIndex) {
        int numValues = data.attribute(k).numValues();

        // Compute marginals
        double[] rowSums = new double[numValues];
        double[] columnSums = new double[numClasses];
        double sum = 0;
        for (int i = 0; i < numValues; i++) {
          for (int j = 0; j < numClasses; j++) {
            rowSums[i] += counts[k][i][j];
            columnSums[j] += counts[k][i][j];
          }
          sum += rowSums[i];
        }

        if (Utils.gr(sum, 0)) {
          double[][] additions = new double[numValues][numClasses];

          // Compute what needs to be added to each row
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
            }
          }

          // Compute what needs to be added to each column
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
            }
          }

          // Compute what needs to be added to each cell
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
            }
          }

          // Make new contingency table
          double[][] newTable = new double[numValues][numClasses];
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              newTable[i][j] = counts[k][i][j] + additions[i][j];
            }
          }
          counts[k] = newTable;
        }
      }
    }
  }

  // Compute info gains
  m_InfoGains = new double[data.numAttributes()];
  m_InfoGains[att] = (ContingencyTables.entropyOverColumns(counts[att])
      - ContingencyTables.entropyConditionedOnRows(counts[att]));
  return m_InfoGains[att];
}
From source file:feature.InfoGainEval.java
License:Open Source License
public void buildEvaluator(Instances data) throws Exception {
  // can evaluator handle data?
  getCapabilities().testWithFail(data);

  int classIndex = data.classIndex();
  int numInstances = data.numInstances();

  if (!m_Binarize) {
    Discretize disTransform = new Discretize();
    disTransform.setUseBetterEncoding(true);
    disTransform.setInputFormat(data);
    data = Filter.useFilter(data, disTransform);
  } else {
    NumericToBinary binTransform = new NumericToBinary();
    binTransform.setInputFormat(data);
    data = Filter.useFilter(data, binTransform);
  }
  int numClasses = data.attribute(classIndex).numValues();

  // Reserve space and initialize counters
  double[][][] counts = new double[data.numAttributes()][][];
  for (int k = 0; k < data.numAttributes(); k++) {
    if (k != classIndex) {
      int numValues = data.attribute(k).numValues();
      counts[k] = new double[numValues + 1][numClasses + 1];
    }
  }

  // Initialize counters
  double[] temp = new double[numClasses + 1];
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    if (inst.classIsMissing()) {
      temp[numClasses] += inst.weight();
    } else {
      temp[(int) inst.classValue()] += inst.weight();
    }
  }
  for (int k = 0; k < counts.length; k++) {
    if (k != classIndex) {
      for (int i = 0; i < temp.length; i++) {
        counts[k][0][i] = temp[i];
      }
    }
  }

  // Get counts
  for (int k = 0; k < numInstances; k++) {
    Instance inst = data.instance(k);
    for (int i = 0; i < inst.numValues(); i++) {
      if (inst.index(i) != classIndex) {
        if (inst.isMissingSparse(i) || inst.classIsMissing()) {
          if (!inst.isMissingSparse(i)) {
            counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          } else if (!inst.classIsMissing()) {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst.classValue()] += inst.weight();
            counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
          } else {
            counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst.weight();
            counts[inst.index(i)][0][numClasses] -= inst.weight();
          }
        } else {
          counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
          counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
        }
      }
    }
  }

  // distribute missing counts if required
  if (m_missing_merge) {
    for (int k = 0; k < data.numAttributes(); k++) {
      if (k != classIndex) {
        int numValues = data.attribute(k).numValues();

        // Compute marginals
        double[] rowSums = new double[numValues];
        double[] columnSums = new double[numClasses];
        double sum = 0;
        for (int i = 0; i < numValues; i++) {
          for (int j = 0; j < numClasses; j++) {
            rowSums[i] += counts[k][i][j];
            columnSums[j] += counts[k][i][j];
          }
          sum += rowSums[i];
        }

        if (Utils.gr(sum, 0)) {
          double[][] additions = new double[numValues][numClasses];

          // Compute what needs to be added to each row
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
            }
          }

          // Compute what needs to be added to each column
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
            }
          }

          // Compute what needs to be added to each cell
          for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numValues; j++) {
              additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
            }
          }

          // Make new contingency table
          double[][] newTable = new double[numValues][numClasses];
          for (int i = 0; i < numValues; i++) {
            for (int j = 0; j < numClasses; j++) {
              newTable[i][j] = counts[k][i][j] + additions[i][j];
            }
          }
          counts[k] = newTable;
        }
      }
    }
  }

  // Compute info gains
  m_InfoGains = new double[data.numAttributes()];
  for (int i = 0; i < data.numAttributes(); i++) {
    if (i != classIndex) {
      m_InfoGains[i] = (ContingencyTables.entropyOverColumns(counts[i])
          - ContingencyTables.entropyConditionedOnRows(counts[i]));
    }
  }
}
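A sketch of how these two entry points might be driven, assuming 'data' is an Instances object with its class index already set (computeInfoGain is the method defined two examples above):

InfoGainEval eval = new InfoGainEval();
eval.buildEvaluator(data);                       // fills m_InfoGains for every attribute
double igFirst = eval.computeInfoGain(data, 0);  // info gain of attribute 0 alone
System.out.println("IG(attr 0) = " + igFirst);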
From source file:ffnn.TucilWeka.java
public static Instances filterDiscretize(Instances a) {
  Discretize filter = new Discretize();
  Instances b = null;
  try {
    filter.setInputFormat(a);
    b = Filter.useFilter(a, filter);
  } catch (Exception ex) {
    Logger.getLogger(TucilWeka.class.getName()).log(Level.SEVERE, null, ex);
  }
  return b;
}
From source file:j48.NBTreeSplit.java
License:Open Source License
/**
 * Creates split on enumerated attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {
  m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
  m_c45S.buildClassifier(trainInstances);
  if (m_c45S.numSubsets() == 0) {
    return;
  }
  m_errors = 0;
  Instance instance;

  Instances[] trainingSets = new Instances[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    trainingSets[i] = new Instances(trainInstances, 0);
  }
  /* m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses()); */
  int subset;
  for (int i = 0; i < trainInstances.numInstances(); i++) {
    instance = trainInstances.instance(i);
    subset = m_c45S.whichSubset(instance);
    if (subset > -1) {
      trainingSets[subset].add((Instance) instance.copy());
    } else {
      double[] weights = m_c45S.weights(instance);
      for (int j = 0; j < m_complexityIndex; j++) {
        try {
          Instance temp = (Instance) instance.copy();
          if (weights.length == m_complexityIndex) {
            temp.setWeight(temp.weight() * weights[j]);
          } else {
            temp.setWeight(temp.weight() / m_complexityIndex);
          }
          trainingSets[j].add(temp);
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println("*** " + m_complexityIndex);
          System.err.println(weights.length);
          System.exit(1);
        }
      }
    }
  }

  /*
  // compute weights (weights of instances per subset)
  m_weights = new double[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    m_weights[i] = trainingSets[i].sumOfWeights();
  }
  Utils.normalize(m_weights);
  */

  /*
  // Only Instances with known values are relevant.
  Enumeration enu = trainInstances.enumerateInstances();
  while (enu.hasMoreElements()) {
    instance = (Instance) enu.nextElement();
    if (!instance.isMissing(m_attIndex)) {
      // m_distribution.add((int) instance.value(m_attIndex), instance);
      trainingSets[(int) instances.value(m_attIndex)].add(instance);
    } else {
      // add these to the error count
      m_errors += instance.weight();
    }
  }
  */

  Random r = new Random(1);
  int minNumCount = 0;
  for (int i = 0; i < m_complexityIndex; i++) {
    if (trainingSets[i].numInstances() >= 5) {
      minNumCount++;
      // Discretize the sets
      Discretize disc = new Discretize();
      disc.setInputFormat(trainingSets[i]);
      trainingSets[i] = Filter.useFilter(trainingSets[i], disc);
      trainingSets[i].randomize(r);
      trainingSets[i].stratify(5);
      NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
      fullModel.buildClassifier(trainingSets[i]);
      // add the errors for this branch of the split
      m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
    } else {
      // if fewer than min obj then just count them as errors
      for (int j = 0; j < trainingSets[i].numInstances(); j++) {
        m_errors += trainingSets[i].instance(j).weight();
      }
    }
  }

  // Check if there are at least five instances in at least two of the subsets.
  if (minNumCount > 1) {
    m_numSubsets = m_complexityIndex;
  }
}
From source file:j48.NBTreeSplit.java
License:Open Source License
/**
 * Creates split on numeric attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {
  m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
  m_c45S.buildClassifier(trainInstances);
  if (m_c45S.numSubsets() == 0) {
    return;
  }
  m_errors = 0;

  Instances[] trainingSets = new Instances[m_complexityIndex];
  trainingSets[0] = new Instances(trainInstances, 0);
  trainingSets[1] = new Instances(trainInstances, 0);
  int subset = -1;

  // populate the subsets
  for (int i = 0; i < trainInstances.numInstances(); i++) {
    Instance instance = trainInstances.instance(i);
    subset = m_c45S.whichSubset(instance);
    if (subset != -1) {
      trainingSets[subset].add((Instance) instance.copy());
    } else {
      double[] weights = m_c45S.weights(instance);
      for (int j = 0; j < m_complexityIndex; j++) {
        Instance temp = (Instance) instance.copy();
        if (weights.length == m_complexityIndex) {
          temp.setWeight(temp.weight() * weights[j]);
        } else {
          temp.setWeight(temp.weight() / m_complexityIndex);
        }
        trainingSets[j].add(temp);
      }
    }
  }

  /*
  // compute weights (weights of instances per subset)
  m_weights = new double[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    m_weights[i] = trainingSets[i].sumOfWeights();
  }
  Utils.normalize(m_weights);
  */

  Random r = new Random(1);
  int minNumCount = 0;
  for (int i = 0; i < m_complexityIndex; i++) {
    if (trainingSets[i].numInstances() > 5) {
      minNumCount++;
      // Discretize the sets
      Discretize disc = new Discretize();
      disc.setInputFormat(trainingSets[i]);
      trainingSets[i] = Filter.useFilter(trainingSets[i], disc);
      trainingSets[i].randomize(r);
      trainingSets[i].stratify(5);
      NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
      fullModel.buildClassifier(trainingSets[i]);
      // add the errors for this branch of the split
      m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
    } else {
      for (int j = 0; j < trainingSets[i].numInstances(); j++) {
        m_errors += trainingSets[i].instance(j).weight();
      }
    }
  }

  // Check if minimum number of Instances in at least two subsets.
  if (minNumCount > 1) {
    m_numSubsets = m_complexityIndex;
  }
}
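Both NBTreeSplit methods above repeat the same step on each subset: discretize, stratify, then cross-validate a naive Bayes model to estimate the split's error. A sketch of that step in isolation, where 'subset' and 'rng' are placeholder names for one training subset and a java.util.Random:

// Sketch: estimate the NB error contribution of one subset, as in the two methods above.
Discretize disc = new Discretize();
disc.setInputFormat(subset);                 // input format taken from this subset only
subset = Filter.useFilter(subset, disc);
subset.randomize(rng);
subset.stratify(5);                          // 5 folds, matching the code above

NaiveBayesUpdateable model = new NaiveBayesUpdateable();
model.buildClassifier(subset);
double errors = NBTreeNoSplit.crossValidate(model, subset, rng);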
From source file:myclassifier.MyC45.java
public void buildClassifier(Instances data) throws Exception {
  /*
  // Need discretize filter to make bins. ID3Classifier will not work unless data is nominal
  dFilter = new Discretize();
  // apply discretize filter on data to create nominal data
  dFilter.setInputFormat(data);
  train_data = cloneInstances(data);
  train_data.delete();
  */
  Instances train = data;
  Discretize filter = new Discretize();
  filter.setInputFormat(train);
  train = Filter.useFilter(data, filter);
  /*
  if (!train.classAttribute().isNominal()) {
    throw new Exception("ID3: nominal class, please.");
  }
  */
  /*
  Enumeration enumAtt = train.enumerateAttributes();
  while (enumAtt.hasMoreElements()) {
    Attribute attr = (Attribute) enumAtt.nextElement();
    if (!attr.isNominal())
      throw new Exception("only nominal attributes, please.");
    Enumeration en = train.enumerateInstances();
    while (en.hasMoreElements()) {
      if (((Instance) en.nextElement()).isMissing(attr))
        throw new Exception("no missing values, please.");
    }
  }
  */
  train = new Instances(train);
  train.deleteWithMissingClass();
  makeTree(train);
}