List of usage examples for weka.core.tokenizers.WordTokenizer
From source file:classifier.CustomStringToWordVector.java
License:Open Source License
/** * Parses a given list of options./*from ww w . j a v a2s .c o m*/ * <p/> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -C * Output word counts rather than boolean word presence. * </pre> * * <pre> * -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes) * </pre> * * <pre> * -V * Invert matching sense of column indexes. * </pre> * * <pre> * -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "") * </pre> * * <pre> * -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded.. * (default: 1000) * </pre> * * <pre> * -prune-rate <rate as a percentage of dataset> * Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary. * -W prunes after creating a full dictionary. You may not have enough memory for this approach. * (default: no periodic pruning) * </pre> * * <pre> * -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). * </pre> * * <pre> * -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij if frequency of word i in jth document(instance) * </pre> * * <pre> * -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize). * </pre> * * <pre> * -L * Convert all tokens to lowercase before adding to the dictionary. * </pre> * * <pre> * -S * Ignore words that are in the stoplist. * </pre> * * <pre> * -stemmer <spec> * The stemmering algorihtm (classname plus parameters) to use. * </pre> * * <pre> * -M <int> * The minimum term frequency (default = 1). 
* </pre> * * <pre> * -O * If this is set, the maximum number of words and the * minimum term frequency is not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set). * </pre> * * <pre> * -stopwords <file> * A file containing stopwords to override the default ones. * Using this option automatically sets the flag ('-S') to use the * stoplist if the file exists. * Format: one stopword per line, lines starting with '#' * are interpreted as comments and ignored. * </pre> * * <pre> * -tokenizer <spec> * The tokenizing algorihtm (classname plus parameters) to use. * (default: weka.core.tokenizers.WordTokenizer) * </pre> * * <!-- options-end --> * * @param options * the list of options as an array of strings * @throws Exception * if an option is not supported */ public void setOptions(String[] options) throws Exception { String value; value = Utils.getOption('R', options); if (value.length() != 0) setSelectedRange(value); else setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); value = Utils.getOption('P', options); if (value.length() != 0) setAttributeNamePrefix(value); else setAttributeNamePrefix(""); value = Utils.getOption('W', options); if (value.length() != 0) setWordsToKeep(Integer.valueOf(value).intValue()); else setWordsToKeep(1000); value = Utils.getOption("prune-rate", options); if (value.length() > 0) setPeriodicPruning(Double.parseDouble(value)); else setPeriodicPruning(-1); value = Utils.getOption('M', options); if (value.length() != 0) setMinTermFreq(Integer.valueOf(value).intValue()); else setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options)); setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options)); String nString = Utils.getOption('N', options); if (nString.length() != 0) setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER)); else 
setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER)); setLowerCaseTokens(Utils.getFlag('L', options)); setUseStoplist(Utils.getFlag('S', options)); String stemmerString = Utils.getOption("stemmer", options); if (stemmerString.length() == 0) { setStemmer(null); } else { String[] stemmerSpec = Utils.splitOptions(stemmerString); if (stemmerSpec.length == 0) throw new Exception("Invalid stemmer specification string"); String stemmerName = stemmerSpec[0]; stemmerSpec[0] = ""; Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance(); if (stemmer instanceof OptionHandler) ((OptionHandler) stemmer).setOptions(stemmerSpec); setStemmer(stemmer); } value = Utils.getOption("stopwords", options); if (value.length() != 0) setStopwords(new File(value)); else setStopwords(null); String tokenizerString = Utils.getOption("tokenizer", options); if (tokenizerString.length() == 0) { setTokenizer(new WordTokenizer()); } else { String[] tokenizerSpec = Utils.splitOptions(tokenizerString); if (tokenizerSpec.length == 0) throw new Exception("Invalid tokenizer specification string"); String tokenizerName = tokenizerSpec[0]; tokenizerSpec[0] = ""; Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerName).newInstance(); if (tokenizer instanceof OptionHandler) ((OptionHandler) tokenizer).setOptions(tokenizerSpec); setTokenizer(tokenizer); } }
From source file:com.ivanrf.smsspam.SpamClassifier.java
License:Apache License
private static FilteredClassifier initFilterClassifier(int wordsToKeep, String tokenizerOp, boolean useAttributeSelection, String classifierOp, boolean boosting) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setDoNotOperateOnPerClassBasis(true); filter.setLowerCaseTokens(true);/*w w w. j a va 2s . c o m*/ filter.setWordsToKeep(wordsToKeep); if (!tokenizerOp.equals(TOKENIZER_DEFAULT)) { //Make a tokenizer WordTokenizer wt = new WordTokenizer(); if (tokenizerOp.equals(TOKENIZER_COMPLETE)) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}"); else //TOKENIZER_COMPLETE_NUMBERS) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}|~0123456789"); filter.setTokenizer(wt); } FilteredClassifier classifier = new FilteredClassifier(); classifier.setFilter(filter); if (useAttributeSelection) { AttributeSelection as = new AttributeSelection(); as.setEvaluator(new InfoGainAttributeEval()); Ranker r = new Ranker(); r.setThreshold(0); as.setSearch(r); MultiFilter mf = new MultiFilter(); mf.setFilters(new Filter[] { filter, as }); classifier.setFilter(mf); } if (classifierOp.equals(CLASSIFIER_SMO)) classifier.setClassifier(new SMO()); else if (classifierOp.equals(CLASSIFIER_NB)) classifier.setClassifier(new NaiveBayes()); else if (classifierOp.equals(CLASSIFIER_IB1)) classifier.setClassifier(new IBk(1)); else if (classifierOp.equals(CLASSIFIER_IB3)) classifier.setClassifier(new IBk(3)); else if (classifierOp.equals(CLASSIFIER_IB5)) classifier.setClassifier(new IBk(5)); else if (classifierOp.equals(CLASSIFIER_PART)) classifier.setClassifier(new PART()); //Tarda mucho if (boosting) { AdaBoostM1 boost = new AdaBoostM1(); boost.setClassifier(classifier.getClassifier()); classifier.setClassifier(boost); //Con NB tarda mucho } return classifier; }
From source file:form.ml.ClassifierTemplate.java
/**
 * Creates a classifier template: loads an ARFF training set and builds a
 * TF*IDF word-vector representation of it.
 *
 * @param data_set_path   path to the ARFF training data
 * @param stop_words_path path to the stopword list (one word per line)
 * @param class_index     index of the class attribute in the training set
 * @throws FileNotFoundException if either file is missing
 * @throws IOException           on read errors
 * @throws Exception             if the filter cannot process the data
 */
public ClassifierTemplate(String data_set_path, String stop_words_path, int class_index) throws Exception {
    /*
     * Load the ARFF file. The original code leaked the reader; close it in
     * a finally block so it is released even when parsing throws.
     */
    ArffReader arff;
    BufferedReader reader = new BufferedReader(new FileReader(data_set_path));
    try {
        arff = new ArffReader(reader);
    } finally {
        reader.close();
    }
    train = arff.getData();
    train.setClassIndex(class_index);

    /*
     * Initialize the StringToWordVector filter. Setters applied after
     * setInputFormat still take effect for this filter because the
     * dictionary is only built when the batch is filtered below.
     */
    wordVector = new StringToWordVector();
    wordVector.setInputFormat(train);
    tokenizer = new WordTokenizer();
    wordVector.setStopwords(new File(stop_words_path));
    wordVector.setTokenizer(tokenizer);
    wordVector.setIDFTransform(true);
    wordVector.setLowerCaseTokens(true);

    // Generate the TF*IDF vector representation of the training data.
    trainFiltered = Filter.useFilter(train, wordVector);
}
From source file:graph.clustering.NodeClusterer.java
License:Apache License
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) { String[] filterOptions = new String[10]; filterOptions[0] = "-R"; // attribute indices filterOptions[1] = "first-last"; filterOptions[2] = "-W"; // The number of words (per class if there is a // class attribute assigned) to attempt to // keep./*from w w w . ja va 2 s . co m*/ filterOptions[3] = "1000"; filterOptions[4] = "-prune-rate"; // periodical pruning filterOptions[5] = "-1.0"; filterOptions[6] = "-N"; // 0=not normalize filterOptions[7] = "0"; filterOptions[8] = "-M"; // The minimum term frequency filterOptions[9] = "1"; SnowballStemmer stemmer = new SnowballStemmer(); stemmer.setStemmer("english"); WordTokenizer tokenizer = new WordTokenizer(); StringToWordVector s2wFilterer = new StringToWordVector(); try { s2wFilterer.setOptions(filterOptions); s2wFilterer.setStemmer(stemmer); s2wFilterer.setTokenizer(tokenizer); s2wFilterer.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer); } catch (Exception e1) { System.out.println("Error in converting string into word vectors:"); e1.printStackTrace(); } RemoveUseless ruFilter = new RemoveUseless(); try { ruFilter.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter); } catch (Exception e1) { System.out.println("Error in removing useless terms:"); e1.printStackTrace(); } return clusterTrainingSet; }
From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerBoolean.java
License:Open Source License
/**
 * Tokenizes the string attributes of an ARFF file into a boolean
 * word-presence representation (no counts, no TF/IDF) and writes the
 * resulting instances to the given output path.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path the filtered instances are written to
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer tokenizer = new WordTokenizer();
    tokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances input = WekaUtils.loadARFF(inPath);

    StringToWordVector vectorizer = new StringToWordVector();
    vectorizer.setInputFormat(input);
    vectorizer.setDoNotOperateOnPerClassBasis(false);
    vectorizer.setInvertSelection(false);
    vectorizer.setLowerCaseTokens(true);
    vectorizer.setOutputWordCounts(false); // boolean presence only
    vectorizer.setTokenizer(tokenizer);
    vectorizer.setUseStoplist(true);
    vectorizer.setWordsToKeep(wordsTokeep);

    Instances filtered = Filter.useFilter(input, vectorizer);
    OutputStreamUtils.writeSimple(filtered.toString(), outPath);
}
From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java
License:Open Source License
public void indexingToTokenizer(String inPath, String outPath) throws Exception { WordTokenizer wordTokenizer = new WordTokenizer(); wordTokenizer.setDelimiters("\r \t.,;:'\"()?!"); Instances inputInstances = WekaUtils.loadARFF(inPath); StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(inputInstances); filter.setIDFTransform(true);//w w w . ja va 2s . c om filter.setTFTransform(true); filter.setDoNotOperateOnPerClassBasis(false); filter.setInvertSelection(false); filter.setLowerCaseTokens(true); filter.setMinTermFreq(3); filter.setOutputWordCounts(true); filter.setTokenizer(wordTokenizer); filter.setUseStoplist(true); filter.setWordsToKeep(200); Instances outputInstances = Filter.useFilter(inputInstances, filter); OutputStreamUtils.writeSimple(outputInstances.toString(), outPath); }
From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerVector.java
License:Open Source License
/**
 * Tokenizes the string attributes of an ARFF file into a raw word-count
 * vector (top 200 words, minimum term frequency 3, no TF/IDF weighting)
 * and writes the resulting instances to the given output path.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path the filtered instances are written to
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer tokenizer = new WordTokenizer();
    tokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances input = WekaUtils.loadARFF(inPath);

    StringToWordVector vectorizer = new StringToWordVector();
    vectorizer.setInputFormat(input);
    vectorizer.setDoNotOperateOnPerClassBasis(false);
    vectorizer.setInvertSelection(false);
    vectorizer.setLowerCaseTokens(true);
    vectorizer.setMinTermFreq(3);
    vectorizer.setOutputWordCounts(true); // plain counts, no TF/IDF transform
    vectorizer.setTokenizer(tokenizer);
    vectorizer.setUseStoplist(true);
    vectorizer.setWordsToKeep(200);

    Instances filtered = Filter.useFilter(input, vectorizer);
    OutputStreamUtils.writeSimple(filtered.toString(), outPath);
}