Example usage for weka.filters.unsupervised.attribute StringToWordVector setOptions

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setOptions

Introduction

On this page you can find example usages for weka.filters.unsupervised.attribute StringToWordVector setOptions.

Prototype

@Override
public void setOptions(String[] options) throws Exception 

Source Link

Document

Parses a given list of options.

Usage

From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java

License:Open Source License

/**
 * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. 
 * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize
 * the string. NOTE: The text string should either be the first or last attribute
 * @param dataRaw/* w  ww  .j  a  v a2 s . c om*/
 * @param opts
 * @param isLast - whether last attribute is the text to be filtered, else first
 * @return
 * @throws Exception
 * @see {@linkplain StringToWordVector}
 */
public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    if (StringUtils.hasText(opts)) {
        filter.setOptions(Utils.splitOptions(opts));
    }
    filter.setTokenizer(new InstanceTokenizer());
    filter.setUseStoplist(false);//ignore any other stop list
    filter.setStemmer(new NullStemmer());//ignore any other stemmer
    filter.setInputFormat(dataRaw);
    filter.setAttributeIndices(isLast ? "last" : "first");
    return Filter.useFilter(dataRaw, filter);
}

From source file:graph.clustering.NodeClusterer.java

License:Apache License

/**
 * Converts the string attributes of the given training set into word-vector
 * attributes and then drops attributes that carry no useful information.
 * On filtering errors the partially processed (or original) set is returned
 * after logging the failure.
 */
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) {
    // Options for StringToWordVector:
    //   -R first-last    : convert all string attributes
    //   -W 1000          : number of words to keep (per class if a class is set)
    //   -prune-rate -1.0 : no periodic pruning
    //   -N 0             : do not normalize word frequencies
    //   -M 1             : minimum term frequency
    final String[] wordVectorOptions = {
            "-R", "first-last",
            "-W", "1000",
            "-prune-rate", "-1.0",
            "-N", "0",
            "-M", "1"
    };

    SnowballStemmer englishStemmer = new SnowballStemmer();
    englishStemmer.setStemmer("english");
    WordTokenizer wordTokenizer = new WordTokenizer();

    StringToWordVector wordVectorFilter = new StringToWordVector();
    try {
        wordVectorFilter.setOptions(wordVectorOptions);
        wordVectorFilter.setStemmer(englishStemmer);
        wordVectorFilter.setTokenizer(wordTokenizer);
        wordVectorFilter.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, wordVectorFilter);
    } catch (Exception ex) {
        System.out.println("Error in converting string into word vectors:");
        ex.printStackTrace();
    }

    // Remove attributes that do not vary at all or vary too much.
    RemoveUseless uselessAttributeFilter = new RemoveUseless();
    try {
        uselessAttributeFilter.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, uselessAttributeFilter);
    } catch (Exception ex) {
        System.out.println("Error in removing useless terms:");
        ex.printStackTrace();
    }

    return clusterTrainingSet;
}

From source file:newsclassifier.NewsClassifier.java

/**
 * Converts the string attributes of {@code data} into a bag-of-words vector
 * representation using {@link StringToWordVector}, replacing {@code data}
 * with the filtered result.
 *
 * @param sFile path to a stop-word file — currently unused; the stop-word file
 *              is hard-coded inside the option string below (TODO: use sFile)
 * @throws Exception if option parsing or filtering fails
 */
public void StrToWV(String sFile) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    String[] opts = weka.core.Utils.splitOptions(
            "-R 1-2 -W 3000 -prune-rate -1.0 -T -N 0 -L -S -stemmer weka.core.stemmers.NullStemmer -M 1 -O -stopwords \"C:\\\\Users\\\\USER\\\\Dropbox\\\\Works\\\\IF\\\\AI\\\\Tubes 2\\\\s.txt\" -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\"   \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?><   \\\\t\\\"\"");
    filter.setOptions(opts);
    filter.setWordsToKeep(3000);
    // setInputFormat() initializes the filter and must be called AFTER all
    // option setters; previously it was invoked first, so the options above
    // did not govern construction of the dictionary for the first batch.
    filter.setInputFormat(data);

    data = Filter.useFilter(data, filter);
}