List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setAttributeIndices
public void setAttributeIndices(String rangeList)
From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java
License:Open Source License
/** * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize * the string. NOTE: The text string should either be the first or last attribute * @param dataRaw/*from www .j a v a 2 s .c o m*/ * @param opts * @param isLast - whether last attribute is the text to be filtered, else first * @return * @throws Exception * @see {@linkplain StringToWordVector} */ public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception { StringToWordVector filter = new StringToWordVector(); if (StringUtils.hasText(opts)) { filter.setOptions(Utils.splitOptions(opts)); } filter.setTokenizer(new InstanceTokenizer()); filter.setUseStoplist(false);//ignore any other stop list filter.setStemmer(new NullStemmer());//ignore any other stemmer filter.setInputFormat(dataRaw); filter.setAttributeIndices(isLast ? "last" : "first"); return Filter.useFilter(dataRaw, filter); }
From source file:nl.uva.expose.classification.WekaClassification.java
private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setAttributeIndices("first-last"); filter.setIDFTransform(true);//from w w w. j a v a 2 s .co m filter.setLowerCaseTokens(true); filter.setMinTermFreq(2); filter.setLowerCaseTokens(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); // filter.setTokenizer(); // filter.setWordsToKeep(); filter.setInputFormat(dRaw); dFiltered = Filter.useFilter(dRaw, filter); }