Example usage for weka.core.tokenizers.WordTokenizer, constructor WordTokenizer()

Introduction

This page collects usage examples for the weka.core.tokenizers.WordTokenizer constructor, WordTokenizer().

Prototype

WordTokenizer()
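
Below is a minimal, hedged sketch of constructing the tokenizer with this constructor and iterating over the tokens it produces. It assumes a recent Weka release on the classpath; the input string and the delimiter set passed to setDelimiters are purely illustrative.

import weka.core.tokenizers.WordTokenizer;

public class WordTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Create the tokenizer via the no-argument constructor shown above.
        WordTokenizer tokenizer = new WordTokenizer();
        // Optionally override the default delimiters (illustrative value).
        tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!");
        // Tokenize a string and print each resulting token.
        tokenizer.tokenize("Hello, WEKA: tokenize this sentence!");
        while (tokenizer.hasMoreElements()) {
            System.out.println(tokenizer.nextElement());
        }
    }
}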

Usage

From source file: classifier.CustomStringToWordVector.java

License: Open Source License

/**
 * Parses a given list of options.
 * <p/>
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -C
 *  Output word counts rather than boolean word presence.
 * </pre>
 * 
 * <pre>
 * -R &lt;index1,index2-index4,...&gt;
 *  Specify list of string attributes to convert to words (as weka Range).
 *  (default: select all string attributes)
 * </pre>
 * 
 * <pre>
 * -V
 *  Invert matching sense of column indexes.
 * </pre>
 * 
 * <pre>
 * -P &lt;attribute name prefix&gt;
 *  Specify a prefix for the created attribute names.
 *  (default: "")
 * </pre>
 * 
 * <pre>
 * -W &lt;number of words to keep&gt;
 *  Specify approximate number of word fields to create.
 *  Surplus words will be discarded.
 *  (default: 1000)
 * </pre>
 * 
 * <pre>
 * -prune-rate &lt;rate as a percentage of dataset&gt;
 *  Specify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.
 *  -W prunes after creating a full dictionary. You may not have enough memory for this approach.
 *  (default: no periodic pruning)
 * </pre>
 * 
 * <pre>
 * -T
 *  Transform the word frequencies into log(1+fij)
 *  where fij is the frequency of word i in the jth document (instance).
 * </pre>
 * 
 * <pre>
 * -I
 *  Transform each word frequency into:
 *  fij*log(num of Documents/num of documents containing word i)
 *    where fij is the frequency of word i in the jth document (instance)
 * </pre>
 * 
 * <pre>
 * -N
 *  Whether to normalize document length: 0 = do not normalize, 1 = normalize all data,
 *  2 = normalize test data only, to the average length of the training documents
 *  (default 0 = don't normalize).
 * </pre>
 * 
 * <pre>
 * -L
 *  Convert all tokens to lowercase before adding to the dictionary.
 * </pre>
 * 
 * <pre>
 * -S
 *  Ignore words that are in the stoplist.
 * </pre>
 * 
 * <pre>
 * -stemmer &lt;spec&gt;
 *  The stemming algorithm (classname plus parameters) to use.
 * </pre>
 * 
 * <pre>
 * -M &lt;int&gt;
 *  The minimum term frequency (default = 1).
 * </pre>
 * 
 * <pre>
 * -O
 *  If this is set, the maximum number of words and the 
 *  minimum term frequency is not enforced on a per-class 
 *  basis but based on the documents in all the classes 
 *  (even if a class attribute is set).
 * </pre>
 * 
 * <pre>
 * -stopwords &lt;file&gt;
 *  A file containing stopwords to override the default ones.
 *  Using this option automatically sets the flag ('-S') to use the
 *  stoplist if the file exists.
 *  Format: one stopword per line, lines starting with '#'
 *  are interpreted as comments and ignored.
 * </pre>
 * 
 * <pre>
 * -tokenizer &lt;spec&gt;
 *  The tokenizing algorithm (classname plus parameters) to use.
 *  (default: weka.core.tokenizers.WordTokenizer)
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @param options
 *            the list of options as an array of strings
 * @throws Exception
 *             if an option is not supported
 */
public void setOptions(String[] options) throws Exception {
    String value;

    value = Utils.getOption('R', options);
    if (value.length() != 0)
        setSelectedRange(value);
    else
        setSelectedRange("first-last");

    setInvertSelection(Utils.getFlag('V', options));

    value = Utils.getOption('P', options);
    if (value.length() != 0)
        setAttributeNamePrefix(value);
    else
        setAttributeNamePrefix("");

    value = Utils.getOption('W', options);
    if (value.length() != 0)
        setWordsToKeep(Integer.valueOf(value).intValue());
    else
        setWordsToKeep(1000);

    value = Utils.getOption("prune-rate", options);
    if (value.length() > 0)
        setPeriodicPruning(Double.parseDouble(value));
    else
        setPeriodicPruning(-1);

    value = Utils.getOption('M', options);
    if (value.length() != 0)
        setMinTermFreq(Integer.valueOf(value).intValue());
    else
        setMinTermFreq(1);

    setOutputWordCounts(Utils.getFlag('C', options));

    setTFTransform(Utils.getFlag('T', options));

    setIDFTransform(Utils.getFlag('I', options));

    setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));

    String nString = Utils.getOption('N', options);
    if (nString.length() != 0)
        setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER));
    else
        setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER));

    setLowerCaseTokens(Utils.getFlag('L', options));

    setUseStoplist(Utils.getFlag('S', options));

    String stemmerString = Utils.getOption("stemmer", options);
    if (stemmerString.length() == 0) {
        setStemmer(null);
    } else {
        String[] stemmerSpec = Utils.splitOptions(stemmerString);
        if (stemmerSpec.length == 0)
            throw new Exception("Invalid stemmer specification string");
        String stemmerName = stemmerSpec[0];
        stemmerSpec[0] = "";
        Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance();
        if (stemmer instanceof OptionHandler)
            ((OptionHandler) stemmer).setOptions(stemmerSpec);
        setStemmer(stemmer);
    }

    value = Utils.getOption("stopwords", options);
    if (value.length() != 0)
        setStopwords(new File(value));
    else
        setStopwords(null);

    String tokenizerString = Utils.getOption("tokenizer", options);
    if (tokenizerString.length() == 0) {
        setTokenizer(new WordTokenizer());
    } else {
        String[] tokenizerSpec = Utils.splitOptions(tokenizerString);
        if (tokenizerSpec.length == 0)
            throw new Exception("Invalid tokenizer specification string");
        String tokenizerName = tokenizerSpec[0];
        tokenizerSpec[0] = "";
        Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerName).newInstance();
        if (tokenizer instanceof OptionHandler)
            ((OptionHandler) tokenizer).setOptions(tokenizerSpec);
        setTokenizer(tokenizer);
    }
}
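
For orientation, a hedged sketch of how this setOptions method might be invoked. The option strings follow the Javadoc above; constructing CustomStringToWordVector directly like this is an assumption for illustration, not code from the original project.

// Illustrative only: the option strings are taken from the Javadoc above.
CustomStringToWordVector filter = new CustomStringToWordVector();
filter.setOptions(new String[] {
    "-C",                 // output word counts rather than boolean word presence
    "-W", "500",          // keep roughly 500 word fields
    "-L",                 // lowercase tokens before adding them to the dictionary
    "-tokenizer", "weka.core.tokenizers.WordTokenizer"
});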

From source file: com.ivanrf.smsspam.SpamClassifier.java

License: Apache License

private static FilteredClassifier initFilterClassifier(int wordsToKeep, String tokenizerOp,
        boolean useAttributeSelection, String classifierOp, boolean boosting) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setDoNotOperateOnPerClassBasis(true);
    filter.setLowerCaseTokens(true);
    filter.setWordsToKeep(wordsToKeep);

    if (!tokenizerOp.equals(TOKENIZER_DEFAULT)) {
        //Make a tokenizer
        WordTokenizer wt = new WordTokenizer();
        if (tokenizerOp.equals(TOKENIZER_COMPLETE))
            wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}");
        else //TOKENIZER_COMPLETE_NUMBERS)
            wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}|~0123456789");
        filter.setTokenizer(wt);
    }

    FilteredClassifier classifier = new FilteredClassifier();
    classifier.setFilter(filter);

    if (useAttributeSelection) {
        AttributeSelection as = new AttributeSelection();
        as.setEvaluator(new InfoGainAttributeEval());
        Ranker r = new Ranker();
        r.setThreshold(0);
        as.setSearch(r);

        MultiFilter mf = new MultiFilter();
        mf.setFilters(new Filter[] { filter, as });

        classifier.setFilter(mf);
    }

    if (classifierOp.equals(CLASSIFIER_SMO))
        classifier.setClassifier(new SMO());
    else if (classifierOp.equals(CLASSIFIER_NB))
        classifier.setClassifier(new NaiveBayes());
    else if (classifierOp.equals(CLASSIFIER_IB1))
        classifier.setClassifier(new IBk(1));
    else if (classifierOp.equals(CLASSIFIER_IB3))
        classifier.setClassifier(new IBk(3));
    else if (classifierOp.equals(CLASSIFIER_IB5))
        classifier.setClassifier(new IBk(5));
    else if (classifierOp.equals(CLASSIFIER_PART))
        classifier.setClassifier(new PART()); // Takes a long time

    if (boosting) {
        AdaBoostM1 boost = new AdaBoostM1();
        boost.setClassifier(classifier.getClassifier());
        classifier.setClassifier(boost); // With NB this takes a long time
    }

    return classifier;
}
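
A hedged usage sketch of the method above: TOKENIZER_COMPLETE and CLASSIFIER_SMO are constants referenced in the same source file, while the training Instances variable (train) is an assumption for illustration.

// Hypothetical call; 'train' is assumed to be a weka.core.Instances object
// whose class attribute is already set.
FilteredClassifier fc = initFilterClassifier(1000, TOKENIZER_COMPLETE, true, CLASSIFIER_SMO, false);
fc.buildClassifier(train);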

From source file: form.ml.ClassifierTemplate.java

/**
 * Creates a Bayes classifier instance.
 *
 * @param data_set_path path to the ARFF file containing the training data
 * @param stop_words_path path to the stop-word list passed to the StringToWordVector filter
 * @param class_index index of the class attribute in the training data
 * @throws FileNotFoundException
 * @throws IOException
 * @throws Exception
 */
public ClassifierTemplate(String data_set_path, String stop_words_path, int class_index) throws Exception {

    /**
     * loading the arff file content
     */
    BufferedReader reader = new BufferedReader(new FileReader(data_set_path));
    ArffReader arff = new ArffReader(reader);
    train = arff.getData();
    train.setClassIndex(class_index);
    /**
     * initializing the filter
     */
    wordVector = new StringToWordVector();
    wordVector.setInputFormat(train);
    tokenizer = new WordTokenizer();
    wordVector.setStopwords(new File(stop_words_path));
    wordVector.setTokenizer(tokenizer);
    wordVector.setIDFTransform(true);
    wordVector.setLowerCaseTokens(true);
    /**
     * generating the TF*IDF Vector
     */
    trainFiltered = Filter.useFilter(train, wordVector);

}
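
A minimal, hypothetical instantiation of the constructor above; the file paths and class index are placeholders, not values from the original project.

// Placeholder arguments: ARFF training data, stop-word list, class attribute index.
ClassifierTemplate template = new ClassifierTemplate("data/messages.arff", "data/stopwords.txt", 0);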

From source file: graph.clustering.NodeClusterer.java

License: Apache License

private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) {
    String[] filterOptions = new String[10];
    filterOptions[0] = "-R"; // attribute indices
    filterOptions[1] = "first-last";
    filterOptions[2] = "-W"; // number of words (per class, if a class attribute is assigned) to attempt to keep
    filterOptions[3] = "1000";
    filterOptions[4] = "-prune-rate"; // periodical pruning
    filterOptions[5] = "-1.0";
    filterOptions[6] = "-N"; // 0=not normalize
    filterOptions[7] = "0";
    filterOptions[8] = "-M"; // The minimum term frequency
    filterOptions[9] = "1";

    SnowballStemmer stemmer = new SnowballStemmer();
    stemmer.setStemmer("english");
    WordTokenizer tokenizer = new WordTokenizer();

    StringToWordVector s2wFilterer = new StringToWordVector();
    try {
        s2wFilterer.setOptions(filterOptions);
        s2wFilterer.setStemmer(stemmer);
        s2wFilterer.setTokenizer(tokenizer);
        s2wFilterer.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer);
    } catch (Exception e1) {
        System.out.println("Error in converting string into word vectors:");
        e1.printStackTrace();
    }

    RemoveUseless ruFilter = new RemoveUseless();
    try {
        ruFilter.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter);
    } catch (Exception e1) {
        System.out.println("Error in removing useless terms:");
        e1.printStackTrace();
    }

    return clusterTrainingSet;
}

From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerBoolean.java

License: Open Source License

public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(inputInstances);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(false);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(wordsTokeep);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}

From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java

License: Open Source License

public void indexingToTokenizer(String inPath, String outPath) throws Exception {

    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(inputInstances);
    filter.setIDFTransform(true);
    filter.setTFTransform(true);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}

From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerVector.java

License: Open Source License

public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(inputInstances);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}