List of usage examples for the weka.filters.supervised.attribute.AttributeSelection constructor AttributeSelection()
public AttributeSelection()
From source file:com.ivanrf.smsspam.SpamClassifier.java
License:Apache License
private static FilteredClassifier initFilterClassifier(int wordsToKeep, String tokenizerOp, boolean useAttributeSelection, String classifierOp, boolean boosting) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setDoNotOperateOnPerClassBasis(true); filter.setLowerCaseTokens(true);/*w ww . j a va2s . c o m*/ filter.setWordsToKeep(wordsToKeep); if (!tokenizerOp.equals(TOKENIZER_DEFAULT)) { //Make a tokenizer WordTokenizer wt = new WordTokenizer(); if (tokenizerOp.equals(TOKENIZER_COMPLETE)) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}"); else //TOKENIZER_COMPLETE_NUMBERS) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}|~0123456789"); filter.setTokenizer(wt); } FilteredClassifier classifier = new FilteredClassifier(); classifier.setFilter(filter); if (useAttributeSelection) { AttributeSelection as = new AttributeSelection(); as.setEvaluator(new InfoGainAttributeEval()); Ranker r = new Ranker(); r.setThreshold(0); as.setSearch(r); MultiFilter mf = new MultiFilter(); mf.setFilters(new Filter[] { filter, as }); classifier.setFilter(mf); } if (classifierOp.equals(CLASSIFIER_SMO)) classifier.setClassifier(new SMO()); else if (classifierOp.equals(CLASSIFIER_NB)) classifier.setClassifier(new NaiveBayes()); else if (classifierOp.equals(CLASSIFIER_IB1)) classifier.setClassifier(new IBk(1)); else if (classifierOp.equals(CLASSIFIER_IB3)) classifier.setClassifier(new IBk(3)); else if (classifierOp.equals(CLASSIFIER_IB5)) classifier.setClassifier(new IBk(5)); else if (classifierOp.equals(CLASSIFIER_PART)) classifier.setClassifier(new PART()); //Tarda mucho if (boosting) { AdaBoostM1 boost = new AdaBoostM1(); boost.setClassifier(classifier.getClassifier()); classifier.setClassifier(boost); //Con NB tarda mucho } return classifier; }
From source file:com.relationalcloud.main.Explanation.java
License:Open Source License
/** * @param args/*from ww w . jav a 2 s . c om*/ */ public static void main(String[] args) { // LOADING PROPERTY FILE AND DRIVER Properties ini = new Properties(); try { ini.load(new FileInputStream(System.getProperty("prop"))); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } // Register jdbcDriver try { Class.forName(ini.getProperty("driver")); } catch (ClassNotFoundException e) { e.printStackTrace(); } // LOAD PROPERTIES FROM CONFIGURATION FILE String connection = ini.getProperty("conn"); String schemaname = ini.getProperty("schema"); String user = ini.getProperty("user"); String password = ini.getProperty("password"); String txnLogTable = ini.getProperty("txnLogTable"); String numb_trans_to_process = ini.getProperty("Explanation.numTxnsToExtractTemplates"); int numPart = Integer.parseInt(ini.getProperty("numPartitions")); // Initialize the Justification Handler ExplanationHandler jh = new ExplanationHandler(ini); System.out.println("Loading and processing " + jh.schemaname + " traces... 
considering prop file :" + jh.dbPropertyFile); try { // CREATE A DB CONNEctioN Connection conn = DriverManager.getConnection(connection + schemaname, user, password); Connection infschema_conn = DriverManager.getConnection(connection + "information_schema", user, password); Schema schema = SchemaLoader.loadSchemaFromDB(infschema_conn, schemaname); // ANALYZE WORKLOADS EXTRACTING TABLES, ATTRIBUTES AND FREQUENCIES ExplanationWorkloadPrepocessor wa = ExplanationHandler.analyzeWorkload(txnLogTable, numb_trans_to_process, schemaname, conn, schema); // FOR EACH TABLE CLASSIFY AND POPULATE JUSTIFICATION COLUMN for (String tableProcessed : wa.getAllTableNames()) { System.out.println("-------------------------------------------"); System.out.println("ANALYZING TABLE " + tableProcessed); // FETCH THE INSTANCE FROM THE DB AND SAMPLE IT Instances data = jh.generateInstancesForTable(tableProcessed, wa.getFeatures(tableProcessed), conn); // IF THERE IS ONLY THE PARTITION LABEL, SKIP THE TABLE if (data.numAttributes() < 2) { System.out.println("No transactions touches this table, nothing to be done."); continue; } // INSTANTIATE THE CLASSIFIER String[] options; options = new String[3]; options[0] = "-P"; options[1] = "-C"; options[2] = ini.getProperty("Explanation.j48PruningConfidence"); J48 classifier = new J48(); // new instance of tree classifier.setOptions(options); // set the options Boolean attributeFilter = true; // ATTRIBUTE FILTERING Instances newData; if (data.numClasses() > 1 && attributeFilter) { AttributeSelection filter = new AttributeSelection(); //FIXME TRYING ALTERNATIVE ATTRIBUTE SELECTION STRATEGIES //InfoGainAttributeEval eval = new InfoGainAttributeEval(); //Ranker search = new Ranker(); //search.setNumToSelect(Integer.parseInt(ini.getProperty("Explanation.maxNumberOfAttribute","2"))); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); filter.setEvaluator(eval); filter.setSearch(search); 
filter.setInputFormat(data); newData = Filter.useFilter(data, filter); } else { newData = data; } String atts = ""; Enumeration e = newData.enumerateAttributes(); ArrayList<String> attributesForPopulation = new ArrayList<String>(); while (e.hasMoreElements()) { String s = ((Attribute) e.nextElement()).name(); attributesForPopulation.add(s); atts += s + ", "; } atts = atts.substring(0, atts.length() - 2); System.out.println("Attribute filtering reduced " + (data.numAttributes() - 1) + " to " + (newData.numAttributes() - 1) + " (" + atts + ")"); data = null; System.gc(); if (newData.numInstances() < 1) { System.err.println("The are no data in the table, skipping classification"); continue; } if (newData.numInstances() > 0) { if (newData.classAttribute().numValues() > 1) { // TRAIN THE CLASSIFIER AND PRINT OUT CLASSIFIER RULES ExplanationHandler.trainClassifier(newData, classifier); if (classifier.measureNumLeaves() == 1) { int partitionvalue = (int) classifier.classifyInstance(newData.firstInstance()); System.out.println( "The classifier decided to put all the tuplesi in the table in one partition: " + partitionvalue); if (Boolean.parseBoolean(ini.getProperty("Explanation.populateExplainedColumn"))) { jh.populateExplainedColumn(tableProcessed, partitionvalue, attributesForPopulation, conn); } } // POPULATING THE justifiedpartition column with the result of this // classifier if required else if (Boolean.parseBoolean(ini.getProperty("Explanation.populateExplainedColumn"))) { jh.populateJustifiedColumn(tableProcessed, classifier, attributesForPopulation, conn, numPart, newData.classAttribute().enumerateValues()); } } else { // easy case... the class attribute is unary!! 
int partitionvalue = ((int) newData.firstInstance() .value(newData.firstInstance().classIndex())); System.out.println("The table is all stored in one partition, no need to use classifier"); if (Boolean.parseBoolean(ini.getProperty("Explanation.populateExplainedColumn"))) { jh.populateExplainedColumn(tableProcessed, partitionvalue, attributesForPopulation, conn); } } } else throw new Exception("The Instances is empty"); } // SET HASH PARTITION / REPLICATED PARTITION if (Boolean.parseBoolean(ini.getProperty("Explanation.populateHashColumn"))) { jh.populateHashPartition(conn); } if (Boolean.parseBoolean(ini.getProperty("Explanation.populateReplicatedColumn"))) { jh.populateReplicatedPartition(conn, Boolean.parseBoolean(ini.getProperty("Explanation.defaultReplicate"))); } conn.close(); } catch (SQLException e) { e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:etc.aloe.oilspill2010.FeatureGenerationImpl.java
License:Open Source License
protected Filter getFeatureSelectionFilter(ExampleSet examples) throws Exception { AttributeSelection filter = new AttributeSelection(); // package weka.filters.supervised.attribute! //CfsSubsetEval eval = new CfsSubsetEval(); //CorrelationAttributeEval eval = new CorrelationAttributeEval(); //InfoGainAttributeEval eval = new InfoGainAttributeEval(); ReliefFAttributeEval eval = new ReliefFAttributeEval(); //GreedyStepwise search = new GreedyStepwise(); //search.setNumToSelect(980); //search.setSearchBackwards(true); Ranker search = new Ranker(); search.setNumToSelect(980);//from ww w . j av a 2 s . c o m filter.setEvaluator(eval); filter.setSearch(search); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; }
From source file:mao.datamining.DataSetPair.java
/** * Pre-Process the training data set with: * RemoveUselessColumnsByMissingValues filter * SpreadSubsample filter to shrink the majority class instances * AttributeSelection filter with CfsSubsetEval and LinearForwardSelection *//*from w w w. j av a 2s .com*/ private void processTrainRawData() { System.out.println("====================" + this.trainFileName + "===================="); System.out.println("====================" + this.trainFileName + "===================="); System.out.println("====================" + this.trainFileName + "===================="); finalTrainAttrList.clear(); try { doItOnce4All(); String sampleFilePath = null; //step 2, either over sample, or under sample //weka.filters.supervised.instance.SpreadSubsample if (this.resampleMethod.equalsIgnoreCase(resampleUnder)) { System.out.println("Under Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterUnderSampling.arff"; } else if (resampleMethod.equalsIgnoreCase(resampleOver)) { System.out.println("Over Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterOverSampling.arff"; } else if (resampleMethod.equalsIgnoreCase(resampleNone)) { //do nothing, System.out.println("None Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterNoneSampling.arff"; } else if (resampleMethod.equalsIgnoreCase(resampleMatrix)) { //do nothing System.out.println("Matrix Samplessssssssssssssssssssssssssssssssssssss"); sampleFilePath = Main.OrangeProcessedDSHome + "/afterNoneSampling.arff"; } else { doNotSupport(); } Instances newData = ConverterUtils.DataSource.read(sampleFilePath); newData.setClassIndex(newData.numAttributes() - 1); // Main.logging("== New Data After Resampling class instances: ===\n" + newData.toSummaryString()); //Step 3, select features AttributeSelection attrSelectionFilter = new AttributeSelection(); ASEvaluation eval = null; ASSearch search = null; 
//ranker if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionA)) { System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss"); System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss"); System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss"); eval = new weka.attributeSelection.InfoGainAttributeEval(); //weka.attributeSelection.Ranker -T 0.02 -N -1 search = new Ranker(); String rankerOptios[] = { "-T", "0.01", "-N", "-1" }; if (resampleMethod.equalsIgnoreCase(resampleOver)) { rankerOptios[1] = "0.1"; } ((Ranker) search).setOptions(rankerOptios); Main.logging("== Start to Select Features with InfoGainAttributeEval and Ranker"); } //weka.attributeSelection.LinearForwardSelection -D 0 -N 5 -I -K 50 -T 0 else if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionB)) { System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss"); System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss"); System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss"); eval = new CfsSubsetEval(); search = new LinearForwardSelection(); String linearOptios[] = { "-D", "0", "-N", "5", "-I", "-K", "50", "-T", "0" }; ((LinearForwardSelection) search).setOptions(linearOptios); Main.logging("== Start to Select Features with CfsSubsetEval and LinearForwardSelection"); } else if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionNo)) { System.out.println("None Selection ssssssssssssssssssssssssssssssssssssss"); Main.logging("No Feature Selection Method"); } else { doNotSupport(); } if (eval != null) { attrSelectionFilter.setEvaluator(eval); attrSelectionFilter.setSearch(search); attrSelectionFilter.setInputFormat(newData); newData = Filter.useFilter(newData, attrSelectionFilter); } Main.logging("== New Data After Selecting Features: ===\n" + newData.toSummaryString()); //finally, write the final dataset to file system try (BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(new 
FileOutputStream(this.trainFileName)))) { writer.write(newData.toString()); } int numAttributes = newData.numAttributes(); for (int i = 0; i < numAttributes; i++) { String attrName = newData.attribute(i).name(); finalTrainAttrList.add(attrName); } Main.logging(finalTrainAttrList.toString()); // //set the final train dataset finalTrainDataSet = newData; finalTrainDataSet.setClassIndex(finalTrainDataSet.numAttributes() - 1); Main.logging("train dataset class attr: " + finalTrainDataSet.classAttribute().toString()); } catch (Exception ex) { Main.logging(null, ex); } }
From source file:org.uclab.mm.kcl.ddkat.datapreprocessor.FeaturesSelector.java
License:Apache License
/** * Method to filter the input data using GreedyStepwise approach. * * @throws Exception the exception/*from w ww .j av a 2s . c o m*/ */ public void filterData() throws Exception { this.confirmationMessage = new ArrayList<String>(); Instances inputData, outputData; String inputFile = BASE_DIR + "OriginalDataSet.csv"; // load CSV file CSVLoader fileLoader = new CSVLoader(); fileLoader.setSource(new File(inputFile)); inputData = fileLoader.getDataSet(); inputData.setClassIndex(inputData.numAttributes() - 1); AttributeSelection filter = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); GreedyStepwise search = new GreedyStepwise(); search.setSearchBackwards(true); filter.setEvaluator(eval); filter.setSearch(search); filter.setInputFormat(inputData); outputData = Filter.useFilter(inputData, filter); int indices = outputData.numAttributes(); String selectedAttributesString = ""; for (int i = 0; i < indices; i++) { selectedAttributesString += "\n" + outputData.attribute(i).toString() + ", "; } selectedAttributesString = selectedAttributesString.substring(0, selectedAttributesString.length() - 2); saveFilteredData(inputFile, outputData); }
From source file:trainableSegmentation.WekaSegmentation.java
License:GNU General Public License
/** * Select attributes using BestFirst search to reduce * the number of parameters per instance of a dataset * * @param data input set of instances/* www .j av a2 s. com*/ * @return resampled set of instances */ public static Instances selectAttributes(Instances data) { final AttributeSelection filter = new AttributeSelection(); Instances filteredIns = null; // Evaluator final CfsSubsetEval evaluator = new CfsSubsetEval(); evaluator.setMissingSeparate(true); // Assign evaluator to filter filter.setEvaluator(evaluator); // Search strategy: best first (default values) final BestFirst search = new BestFirst(); filter.setSearch(search); // Apply filter try { filter.setInputFormat(data); filteredIns = Filter.useFilter(data, filter); } catch (Exception e) { IJ.log("Error when resampling input data with selected attributes!"); e.printStackTrace(); } return filteredIns; }