List of usage examples for weka.core.stemmers SnowballStemmer SnowballStemmer
public SnowballStemmer()
From source file:graph.clustering.NodeClusterer.java
License:Apache License
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) { String[] filterOptions = new String[10]; filterOptions[0] = "-R"; // attribute indices filterOptions[1] = "first-last"; filterOptions[2] = "-W"; // The number of words (per class if there is a // class attribute assigned) to attempt to // keep.//from ww w . ja v a 2 s . com filterOptions[3] = "1000"; filterOptions[4] = "-prune-rate"; // periodical pruning filterOptions[5] = "-1.0"; filterOptions[6] = "-N"; // 0=not normalize filterOptions[7] = "0"; filterOptions[8] = "-M"; // The minimum term frequency filterOptions[9] = "1"; SnowballStemmer stemmer = new SnowballStemmer(); stemmer.setStemmer("english"); WordTokenizer tokenizer = new WordTokenizer(); StringToWordVector s2wFilterer = new StringToWordVector(); try { s2wFilterer.setOptions(filterOptions); s2wFilterer.setStemmer(stemmer); s2wFilterer.setTokenizer(tokenizer); s2wFilterer.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer); } catch (Exception e1) { System.out.println("Error in converting string into word vectors:"); e1.printStackTrace(); } RemoveUseless ruFilter = new RemoveUseless(); try { ruFilter.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter); } catch (Exception e1) { System.out.println("Error in removing useless terms:"); e1.printStackTrace(); } return clusterTrainingSet; }
From source file:sumblr.Preprocessor.java
public Preprocessor() { this.stopwords = new Stopwords(); this.stemmer = new SnowballStemmer(); idfValues = new HashMap(); TrainingData = new ArrayList(); sg = new SummaryGenerator(); }