List of usage examples for weka.filters.supervised.attribute AttributeSelection setEvaluator
public void setEvaluator(ASEvaluation evaluator)
From source file:com.ivanrf.smsspam.SpamClassifier.java
License:Apache License
private static FilteredClassifier initFilterClassifier(int wordsToKeep, String tokenizerOp, boolean useAttributeSelection, String classifierOp, boolean boosting) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setDoNotOperateOnPerClassBasis(true); filter.setLowerCaseTokens(true);//from w w w . j a v a 2 s . c om filter.setWordsToKeep(wordsToKeep); if (!tokenizerOp.equals(TOKENIZER_DEFAULT)) { //Make a tokenizer WordTokenizer wt = new WordTokenizer(); if (tokenizerOp.equals(TOKENIZER_COMPLETE)) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}"); else //TOKENIZER_COMPLETE_NUMBERS) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}|~0123456789"); filter.setTokenizer(wt); } FilteredClassifier classifier = new FilteredClassifier(); classifier.setFilter(filter); if (useAttributeSelection) { AttributeSelection as = new AttributeSelection(); as.setEvaluator(new InfoGainAttributeEval()); Ranker r = new Ranker(); r.setThreshold(0); as.setSearch(r); MultiFilter mf = new MultiFilter(); mf.setFilters(new Filter[] { filter, as }); classifier.setFilter(mf); } if (classifierOp.equals(CLASSIFIER_SMO)) classifier.setClassifier(new SMO()); else if (classifierOp.equals(CLASSIFIER_NB)) classifier.setClassifier(new NaiveBayes()); else if (classifierOp.equals(CLASSIFIER_IB1)) classifier.setClassifier(new IBk(1)); else if (classifierOp.equals(CLASSIFIER_IB3)) classifier.setClassifier(new IBk(3)); else if (classifierOp.equals(CLASSIFIER_IB5)) classifier.setClassifier(new IBk(5)); else if (classifierOp.equals(CLASSIFIER_PART)) classifier.setClassifier(new PART()); //Tarda mucho if (boosting) { AdaBoostM1 boost = new AdaBoostM1(); boost.setClassifier(classifier.getClassifier()); classifier.setClassifier(boost); //Con NB tarda mucho } return classifier; }
From source file:com.relationalcloud.main.Explanation.java
License:Open Source License
/**
 * Entry point: loads JDBC/application properties from the file named by the
 * "prop" system property, extracts transaction templates from the workload
 * log table, and for each touched table trains a J48 decision tree that
 * "explains" the partition labels. Depending on configuration flags it then
 * writes explained / justified / hash / replicated partition columns back to
 * the database.
 *
 * Steps, as implemented below: (1) load properties and register the JDBC
 * driver; (2) open connections to the schema and to information_schema and
 * load the schema; (3) analyze the workload; (4) per table: fetch instances,
 * skip tables with only the partition label, optionally run CFS +
 * backward GreedyStepwise attribute selection, train J48 (or short-circuit
 * when the class attribute is unary or the tree has a single leaf), and
 * populate the result columns when enabled; (5) optionally populate hash and
 * replicated partition columns; (6) close the connection.
 *
 * NOTE(review): the println string below is split mid-literal by the source
 * extraction; the original presumably contained a line break — confirm
 * against the upstream file.
 *
 * @param args unused
 */
public static void main(String[] args) { // LOADING PROPERTY FILE AND DRIVER Properties ini = new Properties(); try { ini.load(new FileInputStream(System.getProperty("prop"))); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } // Register jdbcDriver try { Class.forName(ini.getProperty("driver")); } catch (ClassNotFoundException e) { e.printStackTrace(); } // LOAD PROPERTIES FROM CONFIGURATION FILE String connection = ini.getProperty("conn"); String schemaname = ini.getProperty("schema"); String user = ini.getProperty("user"); String password = ini.getProperty("password"); String txnLogTable = ini.getProperty("txnLogTable"); String numb_trans_to_process = ini.getProperty("Explanation.numTxnsToExtractTemplates"); int numPart = Integer.parseInt(ini.getProperty("numPartitions")); // Initialize the Justification Handler ExplanationHandler jh = new ExplanationHandler(ini); System.out.println("Loading and processing " + jh.schemaname + " traces... 
considering prop file :" + jh.dbPropertyFile); try { // CREATE A DB CONNEctioN Connection conn = DriverManager.getConnection(connection + schemaname, user, password); Connection infschema_conn = DriverManager.getConnection(connection + "information_schema", user, password); Schema schema = SchemaLoader.loadSchemaFromDB(infschema_conn, schemaname); // ANALYZE WORKLOADS EXTRACTING TABLES, ATTRIBUTES AND FREQUENCIES ExplanationWorkloadPrepocessor wa = ExplanationHandler.analyzeWorkload(txnLogTable, numb_trans_to_process, schemaname, conn, schema); // FOR EACH TABLE CLASSIFY AND POPULATE JUSTIFICATION COLUMN for (String tableProcessed : wa.getAllTableNames()) { System.out.println("-------------------------------------------"); System.out.println("ANALYZING TABLE " + tableProcessed); // FETCH THE INSTANCE FROM THE DB AND SAMPLE IT Instances data = jh.generateInstancesForTable(tableProcessed, wa.getFeatures(tableProcessed), conn); // IF THERE IS ONLY THE PARTITION LABEL, SKIP THE TABLE if (data.numAttributes() < 2) { System.out.println("No transactions touches this table, nothing to be done."); continue; } // INSTANTIATE THE CLASSIFIER String[] options; options = new String[3]; options[0] = "-P"; options[1] = "-C"; options[2] = ini.getProperty("Explanation.j48PruningConfidence"); J48 classifier = new J48(); // new instance of tree classifier.setOptions(options); // set the options Boolean attributeFilter = true; // ATTRIBUTE FILTERING Instances newData; if (data.numClasses() > 1 && attributeFilter) { AttributeSelection filter = new AttributeSelection(); //FIXME TRYING ALTERNATIVE ATTRIBUTE SELECTION STRATEGIES //InfoGainAttributeEval eval = new InfoGainAttributeEval(); //Ranker search = new Ranker(); //search.setNumToSelect(Integer.parseInt(ini.getProperty("Explanation.maxNumberOfAttribute","2"))); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); filter.setEvaluator(eval); filter.setSearch(search); 
filter.setInputFormat(data); newData = Filter.useFilter(data, filter); } else { newData = data; } String atts = ""; Enumeration e = newData.enumerateAttributes(); ArrayList<String> attributesForPopulation = new ArrayList<String>(); while (e.hasMoreElements()) { String s = ((Attribute) e.nextElement()).name(); attributesForPopulation.add(s); atts += s + ", "; } atts = atts.substring(0, atts.length() - 2); System.out.println("Attribute filtering reduced " + (data.numAttributes() - 1) + " to " + (newData.numAttributes() - 1) + " (" + atts + ")"); data = null; System.gc(); if (newData.numInstances() < 1) { System.err.println("The are no data in the table, skipping classification"); continue; } if (newData.numInstances() > 0) { if (newData.classAttribute().numValues() > 1) { // TRAIN THE CLASSIFIER AND PRINT OUT CLASSIFIER RULES ExplanationHandler.trainClassifier(newData, classifier); if (classifier.measureNumLeaves() == 1) { int partitionvalue = (int) classifier.classifyInstance(newData.firstInstance()); System.out.println( "The classifier decided to put all the tuplesi in the table in one partition: " + partitionvalue); if (Boolean.parseBoolean(ini.getProperty("Explanation.populateExplainedColumn"))) { jh.populateExplainedColumn(tableProcessed, partitionvalue, attributesForPopulation, conn); } } // POPULATING THE justifiedpartition column with the result of this // classifier if required else if (Boolean.parseBoolean(ini.getProperty("Explanation.populateExplainedColumn"))) { jh.populateJustifiedColumn(tableProcessed, classifier, attributesForPopulation, conn, numPart, newData.classAttribute().enumerateValues()); } } else { // easy case... the class attribute is unary!! 
int partitionvalue = ((int) newData.firstInstance() .value(newData.firstInstance().classIndex())); System.out.println("The table is all stored in one partition, no need to use classifier"); if (Boolean.parseBoolean(ini.getProperty("Explanation.populateExplainedColumn"))) { jh.populateExplainedColumn(tableProcessed, partitionvalue, attributesForPopulation, conn); } } } else throw new Exception("The Instances is empty"); } // SET HASH PARTITION / REPLICATED PARTITION if (Boolean.parseBoolean(ini.getProperty("Explanation.populateHashColumn"))) { jh.populateHashPartition(conn); } if (Boolean.parseBoolean(ini.getProperty("Explanation.populateReplicatedColumn"))) { jh.populateReplicatedPartition(conn, Boolean.parseBoolean(ini.getProperty("Explanation.defaultReplicate"))); } conn.close(); } catch (SQLException e) { e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:es.jarias.FMC.FMC.java
License:Open Source License
/**
 * Trains and evaluates a pairwise multi-label FMC model, writing per-fold
 * result files (conf_/dist_/sing_/layo_ prefixed) to {@code outPath}.
 *
 * Pipeline, as implemented below:
 * <ul>
 *   <li>Discretization: {@code discType} is "supervised" (applied per pair
 *       model later) or "unsupervised" (equal-width, 3 bins, applied once);
 *       anything else throws.</li>
 *   <li>Pair selection via {@code prune}: "full" keeps all label pairs;
 *       "tree" builds a maximum-mutual-information spanning structure
 *       (Chow-Liu style, per the comment in the code); "best" keeps the top
 *       2*numLabels pairs by mutual information; "hiton" keeps pairs whose
 *       labels appear in each other's HITON Markov blanket; "bdeu" greedily
 *       adds edges while the local BDeu score improves.</li>
 *   <li>Labels in no selected pair become "singleton" and get their own
 *       single-label model.</li>
 *   <li>Per selected pair: remove other labels, compound-transform the two
 *       labels into one class, optionally discretize (supervised) and run
 *       CFS + BestFirst attribute selection ({@code fss} = "CFS" | "no"),
 *       then instantiate "weka.classifiers." + baseClassifierClass by
 *       reflection and train.</li>
 *   <li>Prediction: for every test instance, write the true label values,
 *       each pair model's class distribution, and each singleton model's
 *       argmax prediction to the output files.</li>
 * </ul>
 *
 * NOTE(review): IOExceptions from the writers are silently ignored in the
 * inner catch, and the outer catch only prints the stack trace — confirm
 * this best-effort behavior is intended.
 *
 * @param trainData           multi-label training fold
 * @param testData            multi-label test fold
 * @param fold                fold index, used only in output file names
 * @param baseClassifierClass simple name of the weka.classifiers.* learner
 * @param discType            "supervised" or "unsupervised"
 * @param fss                 "CFS" or "no"
 * @param outPath             directory for the per-fold result files
 * @param prune               "full", "tree", "best", "hiton" or "bdeu"
 * @throws Exception on invalid option values or Weka/reflection failures
 */
public static void buildModel(MultiLabelInstances trainData, MultiLabelInstances testData, int fold, String baseClassifierClass, String discType, String fss, String outPath, String prune) throws Exception { double start = System.nanoTime(); try { // DATA PREPROCESING: weka.filters.unsupervised.attribute.Discretize m_unsuperDiscretize = null; if (discType.equals("supervised")) { // pass // Supervised discretization is applied to each model later during the training step. } else if (discType.equals("unsupervised")) { // Apply a baseline discretization filter: m_unsuperDiscretize = new weka.filters.unsupervised.attribute.Discretize(); m_unsuperDiscretize.setUseEqualFrequency(false); m_unsuperDiscretize.setBins(3); m_unsuperDiscretize.setInputFormat(trainData.getDataSet()); trainData = trainData .reintegrateModifiedDataSet(Filter.useFilter(trainData.getDataSet(), m_unsuperDiscretize)); } else throw new Exception("Invalid Discretization Type"); if (!fss.equals("no") && !fss.equals("CFS")) throw new Exception("Invalid FSS strategy"); if (!prune.equals("full") && !prune.equals("tree") && !prune.equals("best") && !prune.equals("hiton") && !prune.equals("bdeu")) throw new Exception("Invalid Pruning strategy"); // Label information int m_numLabels = trainData.getNumLabels(); int[] m_labelIndices = trainData.getLabelIndices(); // Map for reference: HashMap<Integer, Integer> mapLabels = new HashMap<Integer, Integer>(m_numLabels); String[] mapLabelsName = new String[m_numLabels]; for (int l = 0; l < m_numLabels; l++) { mapLabels.put(trainData.getLabelIndices()[l], l); mapLabelsName[l] = trainData.getDataSet().attribute(trainData.getLabelIndices()[l]).name(); } // Get label combinations: int m_numPairs = (m_labelIndices.length * (m_labelIndices.length - 1)) / 2; int[][] labelCombinations = new int[m_numPairs][2]; int counter = 0; for (int i = 0; i < m_labelIndices.length; i++) { for (int j = i + 1; j < m_labelIndices.length; j++) { 
labelCombinations[counter] = new int[] { m_labelIndices[i], m_labelIndices[j] }; counter++; } } // Select the pairs: int m_numSelected = m_numPairs; int m_numSingleton = 0; int[] ordered; boolean[] selectedPair = new boolean[m_numPairs]; boolean[] singleton = new boolean[m_numLabels]; for (int i = 0; i < m_numPairs; i++) selectedPair[i] = true; if (!prune.equals("full")) { m_numSelected = 0; selectedPair = new boolean[m_numPairs]; // Info gain for pruned model: double[][] mutualInfoPairs = mutualInfo(trainData.getDataSet(), trainData.getLabelIndices()); double[] mutualInfo = new double[m_numPairs]; counter = 0; for (int i = 0; i < m_labelIndices.length; i++) { Instances tempInstances = new Instances(trainData.getDataSet()); tempInstances.setClassIndex(m_labelIndices[i]); for (int j = i + 1; j < m_labelIndices.length; j++) { mutualInfo[counter] = mutualInfoPairs[i][j]; counter++; } } ordered = orderBy(mutualInfo); if (prune.equals("tree")) { // Each labels correspond to its own connex component HashMap<Integer, ArrayList<Integer>> tree_compo = new HashMap<Integer, ArrayList<Integer>>( m_numLabels); HashMap<Integer, Integer> tree_index = new HashMap<Integer, Integer>(m_numLabels); for (int i = 0; i < m_numLabels; i++) { tree_compo.put(i, new ArrayList<Integer>()); tree_compo.get(i).add(i); tree_index.put(i, i); } for (int i = 0; i < m_numPairs; i++) { if (m_numSelected >= m_numLabels - 1) break; int pairIndex = ordered[i]; int pair_i = mapLabels.get(labelCombinations[pairIndex][0]); int pair_j = mapLabels.get(labelCombinations[pairIndex][1]); int conex_i = tree_index.get(pair_i); int conex_j = tree_index.get(pair_j); if (conex_i != conex_j) { ArrayList<Integer> family = tree_compo.get(conex_j); tree_compo.get(conex_i).addAll(family); for (int element : family) { tree_index.put(element, conex_i); } selectedPair[pairIndex] = true; m_numSelected++; } } } // End of the chow-liu algorithm if (prune.equals("best") || prune.equals("tree")) { int amount = 0; if 
(prune.equals("best")) amount = (int) (m_numLabels * 2); int index = 0; while (m_numSelected < amount && index < m_numPairs) { if (!selectedPair[ordered[index]]) { m_numSelected++; selectedPair[ordered[index]] = true; } index++; } } // End of the linear tree and best procedures if (prune.equals("hiton")) { weka.filters.unsupervised.attribute.Remove m_remove = new weka.filters.unsupervised.attribute.Remove(); m_remove.setAttributeIndicesArray(trainData.getLabelIndices()); m_remove.setInvertSelection(true); m_remove.setInputFormat(trainData.getDataSet()); Instances hitonData = Filter.useFilter(trainData.getDataSet(), m_remove); HITON hiton = new HITON(hitonData); HashSet<Integer>[] markovBlanket = new HashSet[m_numLabels]; for (int l = 0; l < m_numLabels; l++) markovBlanket[l] = hiton.HITONMB(l); for (int p = 0; p < m_numPairs; p++) { int p_i = mapLabels.get(labelCombinations[p][0]); int p_j = mapLabels.get(labelCombinations[p][1]); if (markovBlanket[p_i].contains(p_j) || markovBlanket[p_j].contains(p_i)) { selectedPair[p] = true; m_numSelected++; } } } // end of the hiton pruning algorithm if (prune.equals("bdeu")) { weka.filters.unsupervised.attribute.Remove m_remove = new weka.filters.unsupervised.attribute.Remove(); m_remove.setAttributeIndicesArray(trainData.getLabelIndices()); m_remove.setInvertSelection(true); m_remove.setInputFormat(trainData.getDataSet()); Instances hitonData = Filter.useFilter(trainData.getDataSet(), m_remove); BDeu hiton = new BDeu(hitonData); double[] scores = hiton.singleScore; double[] pairScores = new double[m_numPairs]; double[] sumScores = new double[m_numLabels]; for (int p = 0; p < m_numPairs; p++) { int head = mapLabels.get(labelCombinations[p][0]); int tail = mapLabels.get(labelCombinations[p][1]); pairScores[p] = -1 * (scores[tail] - (hiton.localBdeuScore(tail, new Integer[] { head }))); sumScores[tail] += pairScores[p]; sumScores[head] += pairScores[p]; } HashSet<Integer>[] parents = new HashSet[m_numLabels]; for (int i = 0; i 
< m_numLabels; i++) parents[i] = new HashSet<Integer>(); ordered = orderBy(pairScores); int[] topologicalOrdering = orderBy(sumScores); int[] relevance = new int[m_numLabels]; for (int i = 0; i < m_numLabels; i++) relevance[topologicalOrdering[i]] = i; for (int p = 0; p < m_numPairs; p++) { int pair = ordered[p]; int head = mapLabels.get(labelCombinations[pair][0]); int tail = mapLabels.get(labelCombinations[pair][1]); if (relevance[head] > relevance[tail]) { int aux = head; head = tail; tail = aux; } // Check if adding this improves parents[tail].add(head); double scoreAdd = hiton.localBdeuScore(tail, parents[tail].toArray(new Integer[parents[tail].size()])); double diff = scores[tail] - scoreAdd; if (diff < 0) { scores[tail] = scoreAdd; selectedPair[pair] = true; m_numSelected++; } else { parents[tail].remove(head); } } // End of the BDeu procedure } // End of the Pruning algorithms // // Determine singleton variables for (int i = 0; i < m_labelIndices.length; i++) singleton[i] = true; for (int p = 0; p < m_numPairs; p++) { if (selectedPair[p]) { singleton[mapLabels.get(labelCombinations[p][0])] = false; singleton[mapLabels.get(labelCombinations[p][1])] = false; } } for (int i = 0; i < m_labelIndices.length; i++) if (singleton[i]) m_numSingleton++; mutualInfo = null; } // Generate single class datasets from the full ML data and learn models: HashMap<Integer, Classifier> models = new HashMap<Integer, Classifier>(); HashMap<Integer, Classifier> singletonModels = new HashMap<Integer, Classifier>(); HashMap<Integer, weka.filters.supervised.attribute.AttributeSelection> singletonFilterSel = new HashMap<Integer, weka.filters.supervised.attribute.AttributeSelection>(); HashMap<Integer, weka.filters.supervised.attribute.Discretize> singletonFilter = new HashMap<Integer, weka.filters.supervised.attribute.Discretize>(); weka.filters.supervised.attribute.AttributeSelection[] m_selecters = new weka.filters.supervised.attribute.AttributeSelection[m_numPairs]; 
weka.filters.supervised.attribute.Discretize[] m_discretizers = new weka.filters.supervised.attribute.Discretize[m_numPairs]; ClassCompoundTransformation[] converters = new ClassCompoundTransformation[m_numPairs]; for (int i = 0; i < m_numPairs; i++) { if (!selectedPair[i]) { continue; } MultiLabelInstances filteredLabelData = trainData .reintegrateModifiedDataSet(RemoveAllLabels.transformInstances(trainData.getDataSet(), complement(m_labelIndices, labelCombinations[i]))); converters[i] = new ClassCompoundTransformation(); Instances singleLabelData = converters[i].transformInstances(filteredLabelData); if (discType.equals("supervised")) { m_discretizers[i] = new Discretize(); m_discretizers[i].setInputFormat(singleLabelData); singleLabelData = Filter.useFilter(singleLabelData, m_discretizers[i]); } if (fss.equals("CFS")) { m_selecters[i] = new weka.filters.supervised.attribute.AttributeSelection(); m_selecters[i].setSearch(new weka.attributeSelection.BestFirst()); m_selecters[i].setEvaluator(new weka.attributeSelection.CfsSubsetEval()); m_selecters[i].setInputFormat(singleLabelData); singleLabelData = Filter.useFilter(singleLabelData, m_selecters[i]); } models.put(i, (Classifier) Class.forName("weka.classifiers." 
+ baseClassifierClass).newInstance()); models.get(i).buildClassifier(singleLabelData); } // Learn singleton models: for (int i = 0; i < m_labelIndices.length; i++) { if (singleton[i]) { Instances singleLabelData = new Instances(trainData.getDataSet()); singleLabelData.setClassIndex(m_labelIndices[i]); singleLabelData = RemoveAllLabels.transformInstances(singleLabelData, complement(m_labelIndices, new int[] { m_labelIndices[i] })); if (discType.equals("supervised")) { singletonFilter.put(i, new Discretize()); singletonFilter.get(i).setInputFormat(singleLabelData); singleLabelData = Filter.useFilter(singleLabelData, singletonFilter.get(i)); } if (fss.equals("CFS")) { weka.filters.supervised.attribute.AttributeSelection tempFilter = new weka.filters.supervised.attribute.AttributeSelection(); tempFilter.setSearch(new weka.attributeSelection.BestFirst()); tempFilter.setEvaluator(new weka.attributeSelection.CfsSubsetEval()); tempFilter.setInputFormat(singleLabelData); singletonFilterSel.put(i, tempFilter); singleLabelData = Filter.useFilter(singleLabelData, singletonFilterSel.get(i)); } Classifier single; single = (Classifier) Class.forName("weka.classifiers." 
+ baseClassifierClass).newInstance(); single.buildClassifier(singleLabelData); singletonModels.put(i, single); } } // // END OF THE LEARNING STAGE // double train = System.nanoTime() - start; start = System.nanoTime(); Writer writerConf = null; Writer writerDist = null; Writer writerSing = null; Writer writerLayo = null; try { writerConf = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outPath + "/conf_" + fold + ".txt"), "utf-8")); writerDist = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outPath + "/dist_" + fold + ".txt"), "utf-8")); writerSing = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outPath + "/sing_" + fold + ".txt"), "utf-8")); writerLayo = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outPath + "/layo_" + fold + ".txt"), "utf-8")); for (int l = 0; l < m_numLabels; l++) { writerLayo.write(trainData.getDataSet().attribute(m_labelIndices[l]).numValues() + "\t"); } writerLayo.write("\n"); writerLayo.write(m_numSelected + "\t" + m_numSingleton); writerLayo.close(); // Get distributions for instance for each variable pairs: double[] distributions; for (int i = 0; i < testData.getDataSet().size(); i++) { for (int l : testData.getLabelIndices()) writerConf.write((int) testData.getDataSet().instance(i).value(l) + "\t"); writerConf.write("\n"); Instance inst = testData.getDataSet().get(i); if (discType.equals("unsupervised")) { m_unsuperDiscretize.input(inst); inst = m_unsuperDiscretize.output(); } for (int p = 0; p < m_numPairs; p++) { if (!selectedPair[p]) { continue; } Instance processed = converters[p].transformInstance(inst, testData.getLabelIndices()); if (discType.equals("supervised")) { m_discretizers[p].input(processed); processed = m_discretizers[p].output(); // m_removers[p].input(processed); // processed = m_removers[p].output(); } if (!fss.equals("no")) { m_selecters[p].input(processed); processed = m_selecters[p].output(); } distributions = 
models.get(p).distributionForInstance(processed); writerDist.write(mapLabels.get(labelCombinations[p][0]) + "\t" + mapLabels.get(labelCombinations[p][1]) + "\t"); for (int d = 0; d < distributions.length; d++) writerDist.write(distributions[d] + "\t"); writerDist.write("\n"); } // Get predictions for singleton labels: for (int m = 0; m < m_labelIndices.length; m++) { if (singleton[m]) { Instance processed = RemoveAllLabels.transformInstance(inst, complement(m_labelIndices, new int[] { m_labelIndices[m] })); if (discType.equals("supervised")) { singletonFilter.get(m).input(processed); processed = singletonFilter.get(m).output(); } if (!fss.equals("no")) { singletonFilterSel.get(m).input(processed); processed = singletonFilterSel.get(m).output(); } double[] distribution = singletonModels.get(m).distributionForInstance(processed); double maxValue = 0; int conf = -1; for (int v = 0; v < distribution.length; v++) { if (distribution[v] > maxValue) { maxValue = distribution[v]; conf = v; } } writerSing.write(i + "\t" + m + "\t" + conf + "\n"); } } } writerConf.close(); writerDist.close(); writerSing.close(); double test = System.nanoTime() - start; // train /= 1000000000.0; // test /= 1000000000.0; // System.out.println(java.lang.String.format("FMC-%s\t%s\t%s\t%d\t%s\t%s\t%.4f\t%.4f",prune,baseClassifierClass,dbName,fold,discType,fss,train,test)); } catch (IOException ex) { // report } finally { try { writerConf.close(); } catch (Exception ex) { } try { writerDist.close(); } catch (Exception ex) { } } } catch (Exception e) { e.printStackTrace(); } }
From source file:etc.aloe.oilspill2010.FeatureGenerationImpl.java
License:Open Source License
protected Filter getFeatureSelectionFilter(ExampleSet examples) throws Exception { AttributeSelection filter = new AttributeSelection(); // package weka.filters.supervised.attribute! //CfsSubsetEval eval = new CfsSubsetEval(); //CorrelationAttributeEval eval = new CorrelationAttributeEval(); //InfoGainAttributeEval eval = new InfoGainAttributeEval(); ReliefFAttributeEval eval = new ReliefFAttributeEval(); //GreedyStepwise search = new GreedyStepwise(); //search.setNumToSelect(980); //search.setSearchBackwards(true); Ranker search = new Ranker(); search.setNumToSelect(980);//from w w w. j a v a 2 s . co m filter.setEvaluator(eval); filter.setSearch(search); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; }
From source file:mao.datamining.DataSetPair.java
/**
 * Pre-processes the raw training data set and stores the result in
 * {@code finalTrainDataSet} / {@code finalTrainAttrList} and on disk at
 * {@code trainFileName}.
 *
 * Steps, as implemented below: (1) run the shared one-time preparation
 * ({@code doItOnce4All}); (2) pick the resampled intermediate ARFF according
 * to {@code resampleMethod} (under / over / none / matrix — the latter two
 * read the same "afterNoneSampling" file); (3) read it and set the last
 * attribute as the class; (4) choose a feature-selection strategy:
 * InfoGain + Ranker (threshold 0.01, or 0.1 when over-sampling),
 * CfsSubsetEval + LinearForwardSelection, or none; (5) apply the
 * AttributeSelection filter when an evaluator was chosen; (6) write the
 * final data set to {@code trainFileName} and record the surviving
 * attribute names.
 *
 * NOTE(review): any exception is logged via {@code Main.logging(null, ex)}
 * and swallowed, leaving the final fields possibly unset — confirm callers
 * tolerate that.
 */
private void processTrainRawData() { System.out.println("====================" + this.trainFileName + "===================="); System.out.println("====================" + this.trainFileName + "===================="); System.out.println("====================" + this.trainFileName + "===================="); finalTrainAttrList.clear(); try { doItOnce4All(); String sampleFilePath = null; //step 2, either over sample, or under sample //weka.filters.supervised.instance.SpreadSubsample if (this.resampleMethod.equalsIgnoreCase(resampleUnder)) { System.out.println("Under Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterUnderSampling.arff"; } else if (resampleMethod.equalsIgnoreCase(resampleOver)) { System.out.println("Over Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterOverSampling.arff"; } else if (resampleMethod.equalsIgnoreCase(resampleNone)) { //do nothing, System.out.println("None Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterNoneSampling.arff"; } else if (resampleMethod.equalsIgnoreCase(resampleMatrix)) { //do nothing System.out.println("Matrix Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterNoneSampling.arff"; } else { doNotSupport(); } Instances newData = ConverterUtils.DataSource.read(sampleFilePath); newData.setClassIndex(newData.numAttributes() - 1); // Main.logging("== New Data After Resampling class instances: ===\n" + newData.toSummaryString()); //Step 3, select features AttributeSelection attrSelectionFilter = new AttributeSelection(); ASEvaluation eval = null; ASSearch search = null; 
//ranker if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionA)) { System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss"); System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss"); System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss"); eval = new weka.attributeSelection.InfoGainAttributeEval(); //weka.attributeSelection.Ranker -T 0.02 -N -1 search = new Ranker(); String rankerOptios[] = { "-T", "0.01", "-N", "-1" }; if (resampleMethod.equalsIgnoreCase(resampleOver)) { rankerOptios[1] = "0.1"; } ((Ranker) search).setOptions(rankerOptios); Main.logging("== Start to Select Features with InfoGainAttributeEval and Ranker"); } //weka.attributeSelection.LinearForwardSelection -D 0 -N 5 -I -K 50 -T 0 else if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionB)) { System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss"); System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss"); System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss"); eval = new CfsSubsetEval(); search = new LinearForwardSelection(); String linearOptios[] = { "-D", "0", "-N", "5", "-I", "-K", "50", "-T", "0" }; ((LinearForwardSelection) search).setOptions(linearOptios); Main.logging("== Start to Select Features with CfsSubsetEval and LinearForwardSelection"); } else if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionNo)) { System.out.println("None Selection ssssssssssssssssssssssssssssssssssssss"); Main.logging("No Feature Selection Method"); } else { doNotSupport(); } if (eval != null) { attrSelectionFilter.setEvaluator(eval); attrSelectionFilter.setSearch(search); attrSelectionFilter.setInputFormat(newData); newData = Filter.useFilter(newData, attrSelectionFilter); } Main.logging("== New Data After Selecting Features: ===\n" + newData.toSummaryString()); //finally, write the final dataset to file system try (BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(new 
FileOutputStream(this.trainFileName)))) { writer.write(newData.toString()); } int numAttributes = newData.numAttributes(); for (int i = 0; i < numAttributes; i++) { String attrName = newData.attribute(i).name(); finalTrainAttrList.add(attrName); } Main.logging(finalTrainAttrList.toString()); // //set the final train dataset finalTrainDataSet = newData; finalTrainDataSet.setClassIndex(finalTrainDataSet.numAttributes() - 1); Main.logging("train dataset class attr: " + finalTrainDataSet.classAttribute().toString()); } catch (Exception ex) { Main.logging(null, ex); } }
From source file:mlpoc.MLPOC.java
/** * uses the filter/*from w w w.j a v a 2 s . c om*/ */ protected static void useFilter(Instances data) throws Exception { System.out.println("\n2. Filter"); weka.filters.supervised.attribute.AttributeSelection filter = new weka.filters.supervised.attribute.AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); filter.setEvaluator(eval); filter.setSearch(search); filter.setInputFormat(data); Instances newData = Filter.useFilter(data, filter); System.out.println(newData); }
From source file:org.uclab.mm.kcl.ddkat.datapreprocessor.FeaturesSelector.java
License:Apache License
/** * Method to filter the input data using GreedyStepwise approach. * * @throws Exception the exception//from ww w . j a va 2 s . c o m */ public void filterData() throws Exception { this.confirmationMessage = new ArrayList<String>(); Instances inputData, outputData; String inputFile = BASE_DIR + "OriginalDataSet.csv"; // load CSV file CSVLoader fileLoader = new CSVLoader(); fileLoader.setSource(new File(inputFile)); inputData = fileLoader.getDataSet(); inputData.setClassIndex(inputData.numAttributes() - 1); AttributeSelection filter = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); filter.setEvaluator(eval); filter.setSearch(search); filter.setInputFormat(inputData); outputData = Filter.useFilter(inputData, filter); int indices = outputData.numAttributes(); String selectedAttributesString = ""; for (int i = 0; i < indices; i++) { selectedAttributesString += "\n" + outputData.attribute(i).toString() + ", "; } selectedAttributesString = selectedAttributesString.substring(0, selectedAttributesString.length() - 2); saveFilteredData(inputFile, outputData); }
From source file:trainableSegmentation.WekaSegmentation.java
License:GNU General Public License
/** * Select attributes using BestFirst search to reduce * the number of parameters per instance of a dataset * * @param data input set of instances/*from www . j a v a2s. c o m*/ * @return resampled set of instances */ public static Instances selectAttributes(Instances data) { final AttributeSelection filter = new AttributeSelection(); Instances filteredIns = null; // Evaluator final CfsSubsetEval evaluator = new CfsSubsetEval(); evaluator.setMissingSeparate(true); // Assign evaluator to filter filter.setEvaluator(evaluator); // Search strategy: best first (default values) final BestFirst search = new BestFirst(); filter.setSearch(search); // Apply filter try { filter.setInputFormat(data); filteredIns = Filter.useFilter(data, filter); } catch (Exception e) { IJ.log("Error when resampling input data with selected attributes!"); e.printStackTrace(); } return filteredIns; }