List of usage examples for the weka.core.stemmers.NullStemmer constructor
NullStemmer
From source file:classifier.CustomStringToWordVector.java
License:Open Source License
/**
 * Sets the stemming algorithm to use.
 *
 * <p>Passing {@code null} means no stemming at all: a fresh {@link NullStemmer}
 * is stored in its place, so the field never holds {@code null} after this call.
 *
 * @param value the configured stemming algorithm, or {@code null}
 * @see NullStemmer
 */
public void setStemmer(Stemmer value) {
    m_Stemmer = (value == null) ? new NullStemmer() : value;
}
From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java
License:Open Source License
/** * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize * the string. NOTE: The text string should either be the first or last attribute * @param dataRaw//from www. ja va2s. c o m * @param opts * @param isLast - whether last attribute is the text to be filtered, else first * @return * @throws Exception * @see {@linkplain StringToWordVector} */ public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception { StringToWordVector filter = new StringToWordVector(); if (StringUtils.hasText(opts)) { filter.setOptions(Utils.splitOptions(opts)); } filter.setTokenizer(new InstanceTokenizer()); filter.setUseStoplist(false);//ignore any other stop list filter.setStemmer(new NullStemmer());//ignore any other stemmer filter.setInputFormat(dataRaw); filter.setAttributeIndices(isLast ? "last" : "first"); return Filter.useFilter(dataRaw, filter); }
From source file:com.reactivetechnologies.analytics.lucene.TextInstanceFilter.java
License:Open Source License
/** * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize * the string. NOTE: The text string should either be the first or last attribute * @param dataRaw//from w w w .j ava 2 s .com * @param opts * @param isLast - whether last attribute is the text to be filtered, else first * @return * @throws Exception * @see {@linkplain StringToWordVector} */ public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception { TextInstanceFilter filter = new TextInstanceFilter(); if (StringUtils.hasText(opts)) { filter.setOptions(Utils.splitOptions(opts)); } filter.setTokenizer(new InstanceTokenizer()); filter.setUseStoplist(false);//ignore any other stop list filter.setStemmer(new NullStemmer());//ignore any other stemmer filter.setInputFormat(dataRaw); filter.setAttributeIndices(isLast ? "last" : "first"); filter.setDoNotOperateOnPerClassBasis(true); filter.setWordsToKeep(10000); return useFilter(dataRaw, filter); }