List of usage examples for weka.filters.supervised.attribute.Discretize (constructor)
public Discretize()
From source file:GainRatioAttributeEval1.java
License:Open Source License
/**
 * Initializes a gain ratio attribute evaluator. Discretizes all attributes
 * that are numeric using Weka's supervised {@code Discretize} filter
 * (Fayyad &amp; Irani MDL with the "better encoding" option).
 *
 * NOTE(review): the parameter is declared as a single {@code Instance} but the
 * Javadoc (and the standard Weka ASEvaluation contract) describes a set of
 * instances — presumably the type should be {@code Instances}; confirm against
 * the enclosing class. The parameter is never used: training data is loaded
 * from {@code program10.DataTrain()} instead. TODO confirm this is intentional.
 *
 * @param data set of instances serving as training data (currently ignored)
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instance data) throws Exception {
    // can evaluator handle data? (capabilities are tested against the data
    // returned by program10.DataTrain(), not against the 'data' argument)
    program10 program10 = new program10();
    getCapabilities().testWithFail(program10.DataTrain());
    // NOTE(review): DataTrain() is called twice — if it re-reads a file each
    // time, the tested set and the stored set may differ; verify.
    m_trainInstances = program10.DataTrain();
    // Cache basic dimensions of the training data.
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();
    // Supervised discretization of all numeric attributes.
    Discretize disTransform = new Discretize();
    disTransform.setUseBetterEncoding(true);
    disTransform.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
    // Number of class values, read after filtering.
    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
}
From source file:ChiSquare.ChiSquaredAttributeEval.java
License:Open Source License
/**
 * Initializes a chi-squared attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Preprocess numeric attributes: supervised discretization by default,
    // or numeric-to-binary conversion when m_Binarize is set.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters.
    // counts[k] is a (numValues+1) x (numClasses+1) table for attribute k;
    // the extra row/column accumulate missing-value / missing-class weight.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters: pre-fill row 0 of every table with the total class
    // distribution; the sparse pass below subtracts from row 0 whenever it
    // credits an explicit (non-zero) value, leaving row 0 with the weight of
    // instances whose value is 0 (implicit in sparse format).
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts (iterating only over explicitly stored sparse values).
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // Value present, class missing.
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // Value missing, class present (extra row numValues).
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // Both value and class missing.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    // Normal case: value and class both present.
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required: spread the weight held in the
    // extra row/column proportionally over the observed cells.
    if (m_missing_merge) {
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals over the observed part of the table.
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    // (missing-value weight, column numClasses aside).
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    // (missing-class weight).
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    // (weight missing in both dimensions).
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table without the extra row/column.
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute chi-squared values (no Yates correction: second arg false).
    m_ChiSquareds = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_ChiSquareds[i] = ContingencyTables.chiVal(ContingencyTables.reduceMatrix(counts[i]), false);
        }
    }
}
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/**
 * Builds the classifier.
 *
 * Converts every pending input document into one training instance per
 * candidate phrase (feature vector from {@code featVals}), then trains a
 * {@code RegressionByDiscretization} wrapping a discretized
 * {@code NaiveBayesSimple} on that data and stores it in {@code m_Classifier}.
 *
 * @throws Exception if filtering or classifier training fails
 */
// aly: The main function, where everything important happens
private void buildClassifier() throws Exception {
    // Generate input format for classifier: numeric phrase features followed
    // by the (numeric) "Keyphrase?" class attribute.
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == m_DocumentAtt) {
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            if (m_KFused) {
                atts.addElement(new Attribute("Keyphrase_frequency"));
            }
            if (m_STDEVfeature) {
                atts.addElement(new Attribute("Standard_deviation"));
            }
            if (m_NODEfeature) {
                atts.addElement(new Attribute("Relations_number"));
            }
            if (m_LENGTHfeature) {
                atts.addElement(new Attribute("Phrase_length"));
            }
        } else if (i == m_KeyphrasesAtt) {
            // NOTE(review): 'vals' is built but the nominal form is commented
            // out below — the class is created as a numeric attribute instead,
            // which is what RegressionByDiscretization expects.
            FastVector vals = new FastVector(2);
            vals.addElement("False");
            vals.addElement("True");
            //atts.addElement(new Attribute("Keyphrase?", vals));
            atts.addElement(new Attribute("Keyphrase?"));
        }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);
    if (m_Debug) {
        log.info("--- Converting instances for classifier");
    }
    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(m_KeyphrasesAtt);
        HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        HashMap<String, Counter> hashKeysEval = getGivenKeyphrases(keyphrases, true);

        // Get the phrases for the document
        HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
        int length = getPhrases(hash, current.stringValue(m_DocumentAtt));
        // hash = getComposits(hash);

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        Iterator<String> it = hash.keySet().iterator();
        while (it.hasNext()) {
            String phrase = it.next();
            FastVector phraseInfo = (FastVector) hash.get(phrase);
            double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length, hash);
            //log.info(vals);
            Instance inst = new Instance(current.weight(), vals);
            // .err.println(phrase + "\t" + inst.toString());
            m_ClassifierData.add(inst);
        }
    }
    if (m_Debug) {
        log.info("--- Building classifier");
    }
    // Build classifier.
    // NOTE(review): earlier experiments with other learners (plain Naive
    // Bayes, LinearRegression, Bagging over REPTrees / FilteredClassifier)
    // were left here as commented-out code; removed for readability. Other
    // places in the code would have to be adjusted to swap classifiers.
    RegressionByDiscretization rvd = new RegressionByDiscretization();
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    rvd.setClassifier(fclass);
    // One bin per indexer plus one.
    rvd.setNumBins(m_Indexers + 1);
    m_Classifier = rvd;
    // log.info(m_ClassifierData);
    //System.exit(1);
    m_Classifier.buildClassifier(m_ClassifierData);
    if (m_Debug) {
        log.info("" + m_Classifier);
    }
    // Save space: keep only the header of the training data.
    m_ClassifierData = new Instances(m_ClassifierData, 0);
}
From source file:dewaweebtreeclassifier.Sujeong.java
@Override public void buildClassifier(Instances instances) throws java.lang.Exception { filter = new Discretize(); filter.setInputFormat(instances);// w ww.ja v a 2 s.c o m this.buildTree(Filter.useFilter(instances, filter)); }
From source file:edu.columbia.cs.ltrie.sampling.queries.generation.ChiSquaredWithYatesCorrectionAttributeEval.java
License:Open Source License
/**
 * Initializes a chi-squared attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * Identical to Weka's standard chi-squared evaluator except that the final
 * statistic is computed by a local {@code chiVal} (with Yates correction)
 * rather than {@code ContingencyTables.chiVal}.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Preprocess numeric attributes: supervised discretization by default,
    // or numeric-to-binary conversion when m_Binarize is set.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters.
    // counts[k] is (numValues+1) x (numClasses+1); the extra row/column hold
    // missing-value / missing-class weight.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters: row 0 starts with the full class distribution and
    // is decremented as explicit sparse values are credited below, so it ends
    // up holding the weight of implicit zero values.
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts (only explicitly stored sparse values are visited).
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // Value present, class missing.
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // Value missing, class present.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // Both missing.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    // Normal case: value and class both present.
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required: spread the extra row/column
    // weight proportionally over observed cells.
    if (m_missing_merge) {
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals.
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute chi-squared values using the local Yates-corrected chiVal.
    m_ChiSquareds = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_ChiSquareds[i] = chiVal(ContingencyTables.reduceMatrix(counts[i]));
        }
    }
}
From source file:es.jarias.FMC.FMC.java
License:Open Source License
/**
 * Trains the FMC (pairwise multi-label) model on {@code trainData} and writes
 * per-instance prediction files for {@code testData}.
 *
 * Pipeline, as implemented below:
 *  1. optional unsupervised discretization of the training data;
 *  2. enumeration of all label pairs and optional pruning ("tree", "best",
 *     "hiton", "bdeu") of which pairs get a model;
 *  3. one base classifier per selected pair (via ClassCompoundTransformation,
 *     with optional per-model supervised discretization and CFS selection),
 *     plus one classifier per label left out of every selected pair;
 *  4. evaluation loop writing conf_/dist_/sing_/layo_ files to outPath.
 *
 * NOTE(review): several defects are flagged inline rather than fixed, because
 * the method's behavior is intertwined with downstream file consumers:
 * swallowed IOException, writerSing/writerLayo missing from finally, outer
 * catch that only prints the stack trace, and an unused temp-Instances loop.
 *
 * @param baseClassifierClass simple name under "weka.classifiers." to load
 * @param discType "supervised" | "unsupervised"
 * @param fss      "no" | "CFS"
 * @param prune    "full" | "tree" | "best" | "hiton" | "bdeu"
 * @throws Exception on invalid option strings (other failures are caught)
 */
public static void buildModel(MultiLabelInstances trainData, MultiLabelInstances testData, int fold,
        String baseClassifierClass, String discType, String fss, String outPath, String prune) throws Exception {
    double start = System.nanoTime();
    try {
        // DATA PREPROCESING:
        weka.filters.unsupervised.attribute.Discretize m_unsuperDiscretize = null;
        if (discType.equals("supervised")) {
            // pass
            // Supervised discretization is applied to each model later during the training step.
        } else if (discType.equals("unsupervised")) {
            // Apply a baseline discretization filter: 3 equal-width bins.
            m_unsuperDiscretize = new weka.filters.unsupervised.attribute.Discretize();
            m_unsuperDiscretize.setUseEqualFrequency(false);
            m_unsuperDiscretize.setBins(3);
            m_unsuperDiscretize.setInputFormat(trainData.getDataSet());
            trainData = trainData
                    .reintegrateModifiedDataSet(Filter.useFilter(trainData.getDataSet(), m_unsuperDiscretize));
        } else
            throw new Exception("Invalid Discretization Type");

        if (!fss.equals("no") && !fss.equals("CFS"))
            throw new Exception("Invalid FSS strategy");
        if (!prune.equals("full") && !prune.equals("tree") && !prune.equals("best") && !prune.equals("hiton")
                && !prune.equals("bdeu"))
            throw new Exception("Invalid Pruning strategy");

        // Label information
        int m_numLabels = trainData.getNumLabels();
        int[] m_labelIndices = trainData.getLabelIndices();

        // Map for reference: dataset attribute index -> label ordinal, plus names.
        HashMap<Integer, Integer> mapLabels = new HashMap<Integer, Integer>(m_numLabels);
        String[] mapLabelsName = new String[m_numLabels];
        for (int l = 0; l < m_numLabels; l++) {
            mapLabels.put(trainData.getLabelIndices()[l], l);
            mapLabelsName[l] = trainData.getDataSet().attribute(trainData.getLabelIndices()[l]).name();
        }

        // Get label combinations: all unordered pairs of label attribute indices.
        int m_numPairs = (m_labelIndices.length * (m_labelIndices.length - 1)) / 2;
        int[][] labelCombinations = new int[m_numPairs][2];
        int counter = 0;
        for (int i = 0; i < m_labelIndices.length; i++) {
            for (int j = i + 1; j < m_labelIndices.length; j++) {
                labelCombinations[counter] = new int[] { m_labelIndices[i], m_labelIndices[j] };
                counter++;
            }
        }

        // Select the pairs: "full" keeps all; otherwise a pruning strategy runs.
        int m_numSelected = m_numPairs;
        int m_numSingleton = 0;
        int[] ordered;
        boolean[] selectedPair = new boolean[m_numPairs];
        boolean[] singleton = new boolean[m_numLabels];
        for (int i = 0; i < m_numPairs; i++)
            selectedPair[i] = true;

        if (!prune.equals("full")) {
            m_numSelected = 0;
            selectedPair = new boolean[m_numPairs];

            // Info gain for pruned model: pairwise mutual information between labels.
            double[][] mutualInfoPairs = mutualInfo(trainData.getDataSet(), trainData.getLabelIndices());
            double[] mutualInfo = new double[m_numPairs];
            counter = 0;
            for (int i = 0; i < m_labelIndices.length; i++) {
                // NOTE(review): tempInstances is created and class-indexed but
                // never used — dead work, likely left over from an older
                // mutual-information implementation.
                Instances tempInstances = new Instances(trainData.getDataSet());
                tempInstances.setClassIndex(m_labelIndices[i]);
                for (int j = i + 1; j < m_labelIndices.length; j++) {
                    mutualInfo[counter] = mutualInfoPairs[i][j];
                    counter++;
                }
            }
            ordered = orderBy(mutualInfo);

            if (prune.equals("tree")) {
                // Chow-Liu-style maximum spanning tree over labels:
                // each label starts in its own connected component.
                HashMap<Integer, ArrayList<Integer>> tree_compo = new HashMap<Integer, ArrayList<Integer>>(
                        m_numLabels);
                HashMap<Integer, Integer> tree_index = new HashMap<Integer, Integer>(m_numLabels);
                for (int i = 0; i < m_numLabels; i++) {
                    tree_compo.put(i, new ArrayList<Integer>());
                    tree_compo.get(i).add(i);
                    tree_index.put(i, i);
                }
                for (int i = 0; i < m_numPairs; i++) {
                    if (m_numSelected >= m_numLabels - 1)
                        break;
                    int pairIndex = ordered[i];
                    int pair_i = mapLabels.get(labelCombinations[pairIndex][0]);
                    int pair_j = mapLabels.get(labelCombinations[pairIndex][1]);
                    int conex_i = tree_index.get(pair_i);
                    int conex_j = tree_index.get(pair_j);
                    // Accept the edge only if it joins two distinct components.
                    if (conex_i != conex_j) {
                        ArrayList<Integer> family = tree_compo.get(conex_j);
                        tree_compo.get(conex_i).addAll(family);
                        for (int element : family) {
                            tree_index.put(element, conex_i);
                        }
                        selectedPair[pairIndex] = true;
                        m_numSelected++;
                    }
                }
            } // End of the chow-liu algorithm

            if (prune.equals("best") || prune.equals("tree")) {
                // Top up with the best remaining pairs ("best": up to 2*numLabels;
                // "tree": amount stays 0, so this loop is a no-op for it).
                int amount = 0;
                if (prune.equals("best"))
                    amount = (int) (m_numLabels * 2);
                int index = 0;
                while (m_numSelected < amount && index < m_numPairs) {
                    if (!selectedPair[ordered[index]]) {
                        m_numSelected++;
                        selectedPair[ordered[index]] = true;
                    }
                    index++;
                }
            } // End of the linear tree and best procedures

            if (prune.equals("hiton")) {
                // Keep a pair iff one label is in the other's HITON Markov blanket.
                weka.filters.unsupervised.attribute.Remove m_remove = new weka.filters.unsupervised.attribute.Remove();
                m_remove.setAttributeIndicesArray(trainData.getLabelIndices());
                m_remove.setInvertSelection(true);
                m_remove.setInputFormat(trainData.getDataSet());
                Instances hitonData = Filter.useFilter(trainData.getDataSet(), m_remove);
                HITON hiton = new HITON(hitonData);
                HashSet<Integer>[] markovBlanket = new HashSet[m_numLabels];
                for (int l = 0; l < m_numLabels; l++)
                    markovBlanket[l] = hiton.HITONMB(l);
                for (int p = 0; p < m_numPairs; p++) {
                    int p_i = mapLabels.get(labelCombinations[p][0]);
                    int p_j = mapLabels.get(labelCombinations[p][1]);
                    if (markovBlanket[p_i].contains(p_j) || markovBlanket[p_j].contains(p_i)) {
                        selectedPair[p] = true;
                        m_numSelected++;
                    }
                }
            } // end of the hiton pruning algorithm

            if (prune.equals("bdeu")) {
                // Greedy structure search: accept an edge when adding the parent
                // improves the tail label's BDeu score.
                weka.filters.unsupervised.attribute.Remove m_remove = new weka.filters.unsupervised.attribute.Remove();
                m_remove.setAttributeIndicesArray(trainData.getLabelIndices());
                m_remove.setInvertSelection(true);
                m_remove.setInputFormat(trainData.getDataSet());
                Instances hitonData = Filter.useFilter(trainData.getDataSet(), m_remove);
                BDeu hiton = new BDeu(hitonData);
                double[] scores = hiton.singleScore;
                double[] pairScores = new double[m_numPairs];
                double[] sumScores = new double[m_numLabels];
                for (int p = 0; p < m_numPairs; p++) {
                    int head = mapLabels.get(labelCombinations[p][0]);
                    int tail = mapLabels.get(labelCombinations[p][1]);
                    pairScores[p] = -1 * (scores[tail] - (hiton.localBdeuScore(tail, new Integer[] { head })));
                    sumScores[tail] += pairScores[p];
                    sumScores[head] += pairScores[p];
                }
                HashSet<Integer>[] parents = new HashSet[m_numLabels];
                for (int i = 0; i < m_numLabels; i++)
                    parents[i] = new HashSet<Integer>();
                ordered = orderBy(pairScores);
                int[] topologicalOrdering = orderBy(sumScores);
                int[] relevance = new int[m_numLabels];
                for (int i = 0; i < m_numLabels; i++)
                    relevance[topologicalOrdering[i]] = i;
                for (int p = 0; p < m_numPairs; p++) {
                    int pair = ordered[p];
                    int head = mapLabels.get(labelCombinations[pair][0]);
                    int tail = mapLabels.get(labelCombinations[pair][1]);
                    // Orient the edge from the more to the less relevant label.
                    if (relevance[head] > relevance[tail]) {
                        int aux = head;
                        head = tail;
                        tail = aux;
                    }
                    // Check if adding this improves
                    parents[tail].add(head);
                    double scoreAdd = hiton.localBdeuScore(tail,
                            parents[tail].toArray(new Integer[parents[tail].size()]));
                    double diff = scores[tail] - scoreAdd;
                    if (diff < 0) {
                        scores[tail] = scoreAdd;
                        selectedPair[pair] = true;
                        m_numSelected++;
                    } else {
                        parents[tail].remove(head);
                    }
                }
            } // End of the BDeu procedure
            // End of the Pruning algorithms
            //
            // Determine singleton variables: labels that appear in no selected pair.
            for (int i = 0; i < m_labelIndices.length; i++)
                singleton[i] = true;
            for (int p = 0; p < m_numPairs; p++) {
                if (selectedPair[p]) {
                    singleton[mapLabels.get(labelCombinations[p][0])] = false;
                    singleton[mapLabels.get(labelCombinations[p][1])] = false;
                }
            }
            for (int i = 0; i < m_labelIndices.length; i++)
                if (singleton[i])
                    m_numSingleton++;
            mutualInfo = null;
        }

        // Generate single class datasets from the full ML data and learn models:
        HashMap<Integer, Classifier> models = new HashMap<Integer, Classifier>();
        HashMap<Integer, Classifier> singletonModels = new HashMap<Integer, Classifier>();
        HashMap<Integer, weka.filters.supervised.attribute.AttributeSelection> singletonFilterSel = new HashMap<Integer, weka.filters.supervised.attribute.AttributeSelection>();
        HashMap<Integer, weka.filters.supervised.attribute.Discretize> singletonFilter = new HashMap<Integer, weka.filters.supervised.attribute.Discretize>();
        weka.filters.supervised.attribute.AttributeSelection[] m_selecters = new weka.filters.supervised.attribute.AttributeSelection[m_numPairs];
        weka.filters.supervised.attribute.Discretize[] m_discretizers = new weka.filters.supervised.attribute.Discretize[m_numPairs];
        ClassCompoundTransformation[] converters = new ClassCompoundTransformation[m_numPairs];
        for (int i = 0; i < m_numPairs; i++) {
            if (!selectedPair[i]) {
                continue;
            }
            // Reduce to the two labels of this pair, compound them into one class.
            MultiLabelInstances filteredLabelData = trainData
                    .reintegrateModifiedDataSet(RemoveAllLabels.transformInstances(trainData.getDataSet(),
                            complement(m_labelIndices, labelCombinations[i])));
            converters[i] = new ClassCompoundTransformation();
            Instances singleLabelData = converters[i].transformInstances(filteredLabelData);
            if (discType.equals("supervised")) {
                // Per-model supervised discretization (filters kept for test time).
                m_discretizers[i] = new Discretize();
                m_discretizers[i].setInputFormat(singleLabelData);
                singleLabelData = Filter.useFilter(singleLabelData, m_discretizers[i]);
            }
            if (fss.equals("CFS")) {
                m_selecters[i] = new weka.filters.supervised.attribute.AttributeSelection();
                m_selecters[i].setSearch(new weka.attributeSelection.BestFirst());
                m_selecters[i].setEvaluator(new weka.attributeSelection.CfsSubsetEval());
                m_selecters[i].setInputFormat(singleLabelData);
                singleLabelData = Filter.useFilter(singleLabelData, m_selecters[i]);
            }
            // Base classifier is loaded reflectively by simple class name.
            models.put(i, (Classifier) Class.forName("weka.classifiers." + baseClassifierClass).newInstance());
            models.get(i).buildClassifier(singleLabelData);
        }

        // Learn singleton models:
        for (int i = 0; i < m_labelIndices.length; i++) {
            if (singleton[i]) {
                Instances singleLabelData = new Instances(trainData.getDataSet());
                singleLabelData.setClassIndex(m_labelIndices[i]);
                singleLabelData = RemoveAllLabels.transformInstances(singleLabelData,
                        complement(m_labelIndices, new int[] { m_labelIndices[i] }));
                if (discType.equals("supervised")) {
                    singletonFilter.put(i, new Discretize());
                    singletonFilter.get(i).setInputFormat(singleLabelData);
                    singleLabelData = Filter.useFilter(singleLabelData, singletonFilter.get(i));
                }
                if (fss.equals("CFS")) {
                    weka.filters.supervised.attribute.AttributeSelection tempFilter = new weka.filters.supervised.attribute.AttributeSelection();
                    tempFilter.setSearch(new weka.attributeSelection.BestFirst());
                    tempFilter.setEvaluator(new weka.attributeSelection.CfsSubsetEval());
                    tempFilter.setInputFormat(singleLabelData);
                    singletonFilterSel.put(i, tempFilter);
                    singleLabelData = Filter.useFilter(singleLabelData, singletonFilterSel.get(i));
                }
                Classifier single;
                single = (Classifier) Class.forName("weka.classifiers." + baseClassifierClass).newInstance();
                single.buildClassifier(singleLabelData);
                singletonModels.put(i, single);
            }
        }
        //
        // END OF THE LEARNING STAGE
        //
        double train = System.nanoTime() - start;
        start = System.nanoTime();

        Writer writerConf = null;
        Writer writerDist = null;
        Writer writerSing = null;
        Writer writerLayo = null;
        try {
            writerConf = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outPath + "/conf_" + fold + ".txt"), "utf-8"));
            writerDist = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outPath + "/dist_" + fold + ".txt"), "utf-8"));
            writerSing = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outPath + "/sing_" + fold + ".txt"), "utf-8"));
            writerLayo = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outPath + "/layo_" + fold + ".txt"), "utf-8"));
            // Layout file: per-label cardinalities, then selected/singleton counts.
            for (int l = 0; l < m_numLabels; l++) {
                writerLayo.write(trainData.getDataSet().attribute(m_labelIndices[l]).numValues() + "\t");
            }
            writerLayo.write("\n");
            writerLayo.write(m_numSelected + "\t" + m_numSingleton);
            writerLayo.close();

            // Get distributions for instance for each variable pairs:
            double[] distributions;
            for (int i = 0; i < testData.getDataSet().size(); i++) {
                // conf file: the true label values for this test instance.
                for (int l : testData.getLabelIndices())
                    writerConf.write((int) testData.getDataSet().instance(i).value(l) + "\t");
                writerConf.write("\n");
                Instance inst = testData.getDataSet().get(i);
                if (discType.equals("unsupervised")) {
                    // Mirror the training-time unsupervised discretization.
                    m_unsuperDiscretize.input(inst);
                    inst = m_unsuperDiscretize.output();
                }
                for (int p = 0; p < m_numPairs; p++) {
                    if (!selectedPair[p]) {
                        continue;
                    }
                    Instance processed = converters[p].transformInstance(inst, testData.getLabelIndices());
                    if (discType.equals("supervised")) {
                        m_discretizers[p].input(processed);
                        processed = m_discretizers[p].output();
                        // m_removers[p].input(processed);
                        // processed = m_removers[p].output();
                    }
                    if (!fss.equals("no")) {
                        m_selecters[p].input(processed);
                        processed = m_selecters[p].output();
                    }
                    distributions = models.get(p).distributionForInstance(processed);
                    writerDist.write(mapLabels.get(labelCombinations[p][0]) + "\t"
                            + mapLabels.get(labelCombinations[p][1]) + "\t");
                    for (int d = 0; d < distributions.length; d++)
                        writerDist.write(distributions[d] + "\t");
                    writerDist.write("\n");
                }
                // Get predictions for singleton labels:
                for (int m = 0; m < m_labelIndices.length; m++) {
                    if (singleton[m]) {
                        Instance processed = RemoveAllLabels.transformInstance(inst,
                                complement(m_labelIndices, new int[] { m_labelIndices[m] }));
                        if (discType.equals("supervised")) {
                            singletonFilter.get(m).input(processed);
                            processed = singletonFilter.get(m).output();
                        }
                        if (!fss.equals("no")) {
                            singletonFilterSel.get(m).input(processed);
                            processed = singletonFilterSel.get(m).output();
                        }
                        double[] distribution = singletonModels.get(m).distributionForInstance(processed);
                        // argmax over the distribution; conf stays -1 if all mass is 0.
                        double maxValue = 0;
                        int conf = -1;
                        for (int v = 0; v < distribution.length; v++) {
                            if (distribution[v] > maxValue) {
                                maxValue = distribution[v];
                                conf = v;
                            }
                        }
                        writerSing.write(i + "\t" + m + "\t" + conf + "\n");
                    }
                }
            }
            writerConf.close();
            writerDist.close();
            writerSing.close();
            double test = System.nanoTime() - start;
            // train /= 1000000000.0;
            // test /= 1000000000.0;
            // System.out.println(java.lang.String.format("FMC-%s\t%s\t%s\t%d\t%s\t%s\t%.4f\t%.4f",prune,baseClassifierClass,dbName,fold,discType,fss,train,test));
        } catch (IOException ex) {
            // report
            // NOTE(review): IOException is silently swallowed — failures leave
            // partial/empty output files with no diagnostic; should at least log.
        } finally {
            // NOTE(review): writerSing and writerLayo are not closed here — if an
            // exception fires before the normal close() calls, they leak (and
            // writerSing's buffer may never be flushed). Consider
            // try-with-resources for all four writers.
            try {
                writerConf.close();
            } catch (Exception ex) {
            }
            try {
                writerDist.close();
            } catch (Exception ex) {
            }
        }
    } catch (Exception e) {
        // NOTE(review): top-level catch only prints the stack trace, so callers
        // cannot distinguish a failed fold from a successful one.
        e.printStackTrace();
    }
}
From source file:feature.InfoGainEval.java
License:Open Source License
/**
 * Initializes an information gain attribute evaluator and returns the info
 * gain of a single attribute. Discretizes all attributes that are numeric.
 *
 * NOTE(review): {@code m_InfoGains} is re-allocated on every call and only
 * index {@code att} is populated — repeated calls do not accumulate gains for
 * different attributes; confirm callers rely only on the return value.
 *
 * @param data set of instances serving as training data
 * @param att  index of the attribute to evaluate
 * @return the information gain of attribute {@code att}
 * @throws Exception if the evaluator has not been generated successfully
 */
public double computeInfoGain(Instances data, int att) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Preprocess numeric attributes: supervised discretization by default,
    // or numeric-to-binary conversion when m_Binarize is set.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters.
    // counts[k] is (numValues+1) x (numClasses+1); the extra row/column hold
    // missing-value / missing-class weight.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters: row 0 starts as the total class distribution and is
    // decremented as explicit sparse values are credited, so it ends up as the
    // weight of implicit zero values.
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts (only explicitly stored sparse values are visited).
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // Value present, class missing.
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // Value missing, class present.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // Both missing.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    // Normal case: value and class both present.
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required: spread the extra row/column
    // weight proportionally over observed cells.
    if (m_missing_merge) {
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals.
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute info gains: H(class) - H(class | attribute), for 'att' only.
    m_InfoGains = new double[data.numAttributes()];
    m_InfoGains[att] = (ContingencyTables.entropyOverColumns(counts[att])
            - ContingencyTables.entropyConditionedOnRows(counts[att]));
    return m_InfoGains[att];
}
From source file:feature.InfoGainEval.java
License:Open Source License
/**
 * Initializes the info-gain attribute evaluator from the given training data.
 *
 * Numeric attributes are first made discrete: either via supervised
 * discretization (Fayyad-Irani style {@code Discretize} with better encoding)
 * or, when {@code m_Binarize} is set, via {@code NumericToBinary}.  A
 * per-attribute contingency table of class counts is then built (with one
 * extra row and column reserved for missing attribute / missing class
 * weights), missing-value mass is optionally redistributed proportionally,
 * and the information gain of each attribute is computed as
 * H(class) - H(class | attribute) using {@code ContingencyTables}.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator cannot handle the data or a filter fails
 */
public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Make every attribute discrete so contingency tables can be built.
    // m_Binarize presumably comes from user options — TODO confirm setter.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters.
    // counts[attr][value][class]; the extra last row (index numValues) holds
    // weight with the attribute value missing, the extra last column (index
    // numClasses) holds weight with the class missing.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters: start every attribute's value-0 row with the total
    // class distribution; the sparse pass below subtracts weight back out of
    // row 0 whenever an instance explicitly stores a value (sparse instances
    // omit zero-valued entries, so row 0 is the implicit default).
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts: walk only the explicitly stored (sparse) values of each
    // instance, moving weight from the default row 0 to the actual cell.
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // value present, class missing -> missing-class column
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // value missing, class present -> missing-value row
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // both missing -> missing-value row, missing-class column
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    // both present -> ordinary cell
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required: spread the missing-value row and
    // missing-class column over the known cells in proportion to the observed
    // marginals, then replace counts[k] with the trimmed table (no extra
    // row/column).  m_missing_merge presumably user-settable — TODO confirm.
    if (m_missing_merge) {
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals over the known (non-missing) cells
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    // (attribute value missing, class known)
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    // (class missing, attribute value known)
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    // (both attribute value and class missing)
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table without the missing row/column
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute info gains: H(class) - H(class | attribute) per attribute
    m_InfoGains = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_InfoGains[i] = (ContingencyTables.entropyOverColumns(counts[i])
                    - ContingencyTables.entropyConditionedOnRows(counts[i]));
        }
    }
}
From source file:ffnn.TucilWeka.java
public static Instances filterDiscretize(Instances a) { Discretize filter = new Discretize(); Instances b = null;//w ww . j av a 2 s .c o m try { filter.setInputFormat(a); b = Filter.useFilter(a, filter); } catch (Exception ex) { Logger.getLogger(TucilWeka.class.getName()).log(Level.SEVERE, null, ex); } return b; }
From source file:j48.NBTreeNoSplit.java
License:Open Source License
/** * Build the no-split node/*from w w w.j a v a 2 s . c om*/ * * @param instances an <code>Instances</code> value * @exception Exception if an error occurs */ public final void buildClassifier(Instances instances) throws Exception { m_nb = new NaiveBayesUpdateable(); m_disc = new Discretize(); m_disc.setInputFormat(instances); Instances temp = Filter.useFilter(instances, m_disc); m_nb.buildClassifier(temp); if (temp.numInstances() >= 5) { m_errors = crossValidate(m_nb, temp, new Random(1)); } m_numSubsets = 1; }