List of usage examples for weka.filters.unsupervised.attribute.StringToWordVector#setOptions
@Override public void setOptions(String[] options) throws Exception
From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java
License:Open Source License
/** * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize * the string. NOTE: The text string should either be the first or last attribute * @param dataRaw/* w ww .j a v a2 s . c om*/ * @param opts * @param isLast - whether last attribute is the text to be filtered, else first * @return * @throws Exception * @see {@linkplain StringToWordVector} */ public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception { StringToWordVector filter = new StringToWordVector(); if (StringUtils.hasText(opts)) { filter.setOptions(Utils.splitOptions(opts)); } filter.setTokenizer(new InstanceTokenizer()); filter.setUseStoplist(false);//ignore any other stop list filter.setStemmer(new NullStemmer());//ignore any other stemmer filter.setInputFormat(dataRaw); filter.setAttributeIndices(isLast ? "last" : "first"); return Filter.useFilter(dataRaw, filter); }
From source file:graph.clustering.NodeClusterer.java
License:Apache License
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) { String[] filterOptions = new String[10]; filterOptions[0] = "-R"; // attribute indices filterOptions[1] = "first-last"; filterOptions[2] = "-W"; // The number of words (per class if there is a // class attribute assigned) to attempt to // keep./* w w w.j a v a2s .c o m*/ filterOptions[3] = "1000"; filterOptions[4] = "-prune-rate"; // periodical pruning filterOptions[5] = "-1.0"; filterOptions[6] = "-N"; // 0=not normalize filterOptions[7] = "0"; filterOptions[8] = "-M"; // The minimum term frequency filterOptions[9] = "1"; SnowballStemmer stemmer = new SnowballStemmer(); stemmer.setStemmer("english"); WordTokenizer tokenizer = new WordTokenizer(); StringToWordVector s2wFilterer = new StringToWordVector(); try { s2wFilterer.setOptions(filterOptions); s2wFilterer.setStemmer(stemmer); s2wFilterer.setTokenizer(tokenizer); s2wFilterer.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer); } catch (Exception e1) { System.out.println("Error in converting string into word vectors:"); e1.printStackTrace(); } RemoveUseless ruFilter = new RemoveUseless(); try { ruFilter.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter); } catch (Exception e1) { System.out.println("Error in removing useless terms:"); e1.printStackTrace(); } return clusterTrainingSet; }
From source file:newsclassifier.NewsClassifier.java
public void StrToWV(String sFile) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(data);// www . j a v a2 s.c om /*filter.setIDFTransform(false); filter.setTFTransform(true); filter.setAttributeIndices("1-2"); //attributenameprefix filter.setDoNotOperateOnPerClassBasis(true); filter.setInvertSelection(false); filter.setLowerCaseTokens(true); filter.setMinTermFreq(1); //filter.setNormalizeDocLength(true); filter.setOutputWordCounts(false); //filter.setPeriodicPruning(-1); //filter.setStemmer(null); filter.setStopwords(new File(sFile));*/ //String[] opts = weka.core.Utils.splitOptions("-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?>< \\\\r\\\\t\\\"\""); String[] opts = weka.core.Utils.splitOptions( "-R 1-2 -W 3000 -prune-rate -1.0 -T -N 0 -L -S -stemmer weka.core.stemmers.NullStemmer -M 1 -O -stopwords \"C:\\\\Users\\\\USER\\\\Dropbox\\\\Works\\\\IF\\\\AI\\\\Tubes 2\\\\s.txt\" -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?>< \\\\t\\\"\""); filter.setOptions(opts); //belum pake delimiter!! filter.setWordsToKeep(3000); data = Filter.useFilter(data, filter); //return newData; //data = newData; }