Usage examples for weka.filters.unsupervised.attribute.StringToWordVector.setIDFTransform
public void setIDFTransform(boolean IDFTransform)
From source file:nl.uva.expose.classification.WekaClassification.java
private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setAttributeIndices("first-last"); filter.setIDFTransform(true); filter.setLowerCaseTokens(true);/*from w ww . java 2 s.c o m*/ filter.setMinTermFreq(2); filter.setLowerCaseTokens(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); // filter.setTokenizer(); // filter.setWordsToKeep(); filter.setInputFormat(dRaw); dFiltered = Filter.useFilter(dRaw, filter); }
From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java
License:Open Source License
public void indexingToTokenizer(String inPath, String outPath) throws Exception { WordTokenizer wordTokenizer = new WordTokenizer(); wordTokenizer.setDelimiters("\r \t.,;:'\"()?!"); Instances inputInstances = WekaUtils.loadARFF(inPath); StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(inputInstances); filter.setIDFTransform(true); filter.setTFTransform(true);//from w w w.j ava 2 s . co m filter.setDoNotOperateOnPerClassBasis(false); filter.setInvertSelection(false); filter.setLowerCaseTokens(true); filter.setMinTermFreq(3); filter.setOutputWordCounts(true); filter.setTokenizer(wordTokenizer); filter.setUseStoplist(true); filter.setWordsToKeep(200); Instances outputInstances = Filter.useFilter(inputInstances, filter); OutputStreamUtils.writeSimple(outputInstances.toString(), outPath); }
From source file:util.FeatureExtract.java
public static void createArff(String directory) { TextDirectoryLoader loader = new TextDirectoryLoader(); try {/*from ww w. j a v a 2 s .c o m*/ // convert the directory into a dataset loader.setDirectory(new File(directory)); Instances dataRaw = loader.getDataSet(); // apply the StringToWordVector and tf-idf weighting StringToWordVector filter = new StringToWordVector(); filter.setIDFTransform(true); filter.setInputFormat(dataRaw); Instances dataFiltered = Filter.useFilter(dataRaw, filter); // output the arff file ArffSaver saver = new ArffSaver(); saver.setInstances(dataFiltered); saver.setFile(new File(SpamFilterConfig.getArffFilePath())); saver.writeBatch(); // train with simple cart SimpleCart classifier = new SimpleCart(); classifier.buildClassifier(dataFiltered); System.out.println("\n\nClassifier model:\n\n" + classifier.toString()); // using 10 cross validation Evaluation eval = new Evaluation(dataFiltered); eval.crossValidateModel(classifier, dataFiltered, 10, new Random(1)); System.out.println("\n\nCross fold:\n\n" + eval.toSummaryString()); } catch (Exception ex) { Logger.getLogger(FeatureExtract.class.getName()).log(Level.SEVERE, null, ex); } }