Example usage for weka.filters.unsupervised.attribute StringToWordVector StringToWordVector

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector StringToWordVector

Introduction

On this page you can find example usage of weka.filters.unsupervised.attribute StringToWordVector StringToWordVector.

Prototype

public StringToWordVector() 

Source Link

Document

Default constructor.

Usage

From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java

public Instances loadTrainingData() {

    try {
        // Location of the labelled ARFF training file.
        DataSource arffSource = new DataSource(
                "C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\Data sets\\labelled.arff");

        Instances trainingData = arffSource.getDataSet();

        // Remember the header (attribute structure) of the training set and
        // persist it so later classification can rebuild compatible instances.
        dataStructure = arffSource.getStructure();
        try {
            weka.core.SerializationHelper.write(modelDir + algorithm + ".dataStruct", dataStructure);
        } catch (Exception ex) {
            Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex);
        }

        // Attribute 2 holds the sentiment label.
        trainingData.setClassIndex(2);

        // Give attributes unique names before the string-to-word conversion.
        trainingData.renameAttribute(2, "class_attr");
        trainingData.renameAttribute(0, "twitter_id");

        // Expand the string attribute into word-occurrence attributes.
        StringToWordVector wordVectorFilter = new StringToWordVector();
        wordVectorFilter.setInputFormat(trainingData);
        Instances filteredData = Filter.useFilter(trainingData, wordVectorFilter);

        System.out.println("filteredData struct: " + filteredData.attribute(0));
        System.out.println("filteredData struct: " + filteredData.attribute(1));
        System.out.println("filteredData struct: " + filteredData.attribute(2));

        return filteredData;

    } catch (Exception ex) {
        System.out.println("Error loading training set: " + ex.toString());
        return null;
    }

}

From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java

/**
 * Classifies tweets with the trained model and prints each prediction.
 * <p>
 * NOTE(review): the {@code tweets} parameter is immediately overwritten by
 * four hard-coded TEST tweets below, so the caller's input is ignored —
 * remove the TEST block before real use.
 * NOTE(review): a fresh StringToWordVector is fitted on the prediction set in
 * every loop iteration, so its vocabulary can differ from whatever filter was
 * used at training time — verify the attribute space matches the model.
 *
 * @param tweets the tweets to classify (currently ignored — see note)
 * @return always 0; per-tweet predictions are only printed, not returned
 */
public Integer classify(Tweet[] tweets) {
    // TEST: hard-coded sample tweets that replace the caller's input.

    // Generate two tweet examples
    Tweet exOne = new Tweet("This is good and fantastic");
    exOne.setPreprocessedText("This is good and fantastic");
    Tweet exTwo = new Tweet("Horribly, terribly bad and more");
    exTwo.setPreprocessedText("Horribly, terribly bad and more");
    Tweet exThree = new Tweet(
            "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh.");
    exThree.setPreprocessedText(
            "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh.");
    Tweet exFour = new Tweet("bad hate worst sick");
    exFour.setPreprocessedText("bad hate worst sick");
    tweets = new Tweet[] { exOne, exTwo, exThree, exFour };
    // TEST

    // Load model
    //        loadModel();
    // Convert Tweet to Instance type
    // Get String Data
    // Create attributes for the Instances set
    // NOTE(review): twitter_id and class_attr below are built but never used;
    // attrVector is populated from the saved dataStructure instead.
    Attribute twitter_id = new Attribute("twitter_id");
    //        Attribute body = new Attribute("body");

    // Nominal class attribute with the two sentiment labels.
    FastVector classVal = new FastVector(2);
    classVal.addElement("pos");
    classVal.addElement("neg");

    Attribute class_attr = new Attribute("class_attr", classVal);

    // Attribute list for the prediction set; filled below from the persisted
    // training-data structure so the headers line up with the training set.
    FastVector attrVector = new FastVector(3);
    //        attrVector.addElement(twitter_id);
    //        attrVector.addElement(new Attribute("body", (FastVector) null));
    //        attrVector.addElement(class_attr);

    // Get the number of tweets and then create predictSet
    int numTweets = tweets.length;
    Enumeration structAttrs = dataStructure.enumerateAttributes();

    // Copy every attribute of the saved training structure into the
    // prediction set's attribute vector.
    while (structAttrs.hasMoreElements()) {
        attrVector.addElement((Attribute) structAttrs.nextElement());
    }
    Instances predictSet = new Instances("predictInstances", attrVector, numTweets);
    //        Instances predictSet = new Instances(dataStructure);
    predictSet.setClassIndex(2);

    // init prediction
    double prediction = -1;

    // Sanity check: does the prediction header match the training header?
    System.out.println("PredictSet matches source structure: " + predictSet.equalHeaders(dataStructure));

    System.out.println("PredSet struct: " + predictSet.attribute(0));
    System.out.println("PredSet struct: " + predictSet.attribute(1));
    System.out.println("PredSet struct: " + predictSet.attribute(2));
    // Classify each tweet individually, printing the result.
    for (int i = 0; i < numTweets; i++) {
        String content = (String) tweets[i].getPreprocessedText();

        System.out.println("Tweet content: " + content);

        // One instance per tweet: attribute 0 = id, attribute 1 = text,
        // class left missing so the model can fill it in.
        Instance tweetInstance = new Instance(predictSet.numAttributes());

        tweetInstance.setDataset(predictSet);

        tweetInstance.setValue(predictSet.attribute(0), i);
        tweetInstance.setValue(predictSet.attribute(1), content);
        tweetInstance.setClassMissing();

        predictSet.add(tweetInstance);

        try {
            // Re-fit the string filter on the (growing) prediction set.
            // NOTE(review): fitting per iteration is expensive and the
            // resulting vocabulary changes as tweets are added.
            StringToWordVector filter = new StringToWordVector();

            filter.setInputFormat(predictSet);
            Instances filteredPredictSet = Filter.useFilter(predictSet, filter);

            // Apply model
            prediction = trainedModel.classifyInstance(filteredPredictSet.instance(i));
            filteredPredictSet.instance(i).setClassValue(prediction);
            System.out.println("Classification: " + filteredPredictSet.instance(i).toString());
            System.out.println("Prediction: " + prediction);

        } catch (Exception ex) {
            Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    return 0;
}

From source file:classifier.SellerClassifier.java

private Instances startFeatureExtraction(Instances raw) throws Exception {
    // Build and remember the string-to-word-vector filter, then apply it
    // to the raw instances to obtain the feature-extracted data set.
    myFilter = new StringToWordVector();
    myFilter.setInputFormat(raw);
    Instances vectorized = Filter.useFilter(raw, myFilter);
    return vectorized;
}

From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetFeatureExtractor.java

License:Apache License

/**
 * Method which contructs the arff file for weka with the training data
 *//*from   w w  w.java  2s .co  m*/
/**
 * Constructs the ARFF training file for Weka from the classified CSV data.
 * <p>
 * Reads {@code ../classified data/traindata.csv} (column 0 = numeric
 * sentiment id, column 5 = tweet text), maps the numeric ids to nominal
 * labels, vectorises the text with {@link StringToWordVector} and writes the
 * result to {@code ./data/train2data.arff}. Any failure is logged and the
 * method returns without writing the file.
 */
public static void constructModel() {
    Instances instdata = null;
    CsvReader data = null;
    try {
        FastVector atts = new FastVector();
        // String attribute for the free-text tweet content.
        atts.addElement(new Attribute("content", (FastVector) null));
        // Nominal class attribute; "" covers records with no recognised label.
        FastVector fvClassVal = new FastVector(4);
        fvClassVal.addElement("");
        fvClassVal.addElement("neutral");
        fvClassVal.addElement("negative");
        fvClassVal.addElement("positive");
        Attribute ClassAttribute = new Attribute("Class", fvClassVal);
        atts.addElement(ClassAttribute);
        instdata = new Instances("tweetData", atts, 0);

        data = new CsvReader("../classified data/traindata.csv");
        int i = 0;
        while (data.readRecord()) {
            // Column 0: numeric sentiment id (0=negative, 2=neutral, 4=positive).
            String class_id = data.get(0);
            switch (Integer.parseInt(class_id)) {
            case 0:
                class_id = "negative";
                break;
            case 2:
                class_id = "neutral";
                break;
            case 4:
                class_id = "positive";
                break;
            }
            // Column 5: the tweet text itself.
            String tweet_content = data.get(5);
            Instance iInst = new Instance(2);
            iInst.setValue((Attribute) atts.elementAt(0), tweet_content);
            iInst.setValue((Attribute) atts.elementAt(1), class_id);
            instdata.add(iInst);
            System.out.println("[" + i + "] " + class_id + ":" + tweet_content);
            i++;
        }

        // Convert the string attribute into word-count attributes and save.
        StringToWordVector filter = new StringToWordVector();
        instdata.setClassIndex(instdata.numAttributes() - 1);
        filter.setInputFormat(instdata);
        Instances newdata = Filter.useFilter(instdata, filter);
        ArffSaver saver = new ArffSaver();
        saver.setInstances(newdata);
        saver.setFile(new File("./data/train2data.arff"));
        saver.writeBatch();
    } catch (Exception ex) {
        Logger.getLogger(TweetFeatureExtractor.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        // BUG FIX: close the CSV reader on every path; previously it was
        // closed only when the whole file parsed successfully, leaking the
        // file handle on a mid-file failure.
        if (data != null) {
            data.close();
        }
    }
}

From source file:com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java

License:Apache License

@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
        final int maxResult) {

    final String html = documentContentData.getContent();

    final Attribute input = new Attribute("html", (ArrayList<String>) null);

    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);//from  w w  w.ja v  a  2s  .c om

    final Instances htmlInst = new Instances("html", inputVec, 1);

    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    final StopwordsHandler StopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {

            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(StopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();

    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        final Instance last = dataFiltered.lastInstance();

        final int numAttributes = last.numAttributes();

        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }

    return result;
}

From source file:com.ivanrf.smsspam.SpamClassifier.java

License:Apache License

/**
 * Builds a FilteredClassifier configured with a string-to-word-vector filter,
 * an optional tokenizer, optional information-gain attribute selection, the
 * requested base learner and optional AdaBoostM1 boosting.
 */
private static FilteredClassifier initFilterClassifier(int wordsToKeep, String tokenizerOp,
        boolean useAttributeSelection, String classifierOp, boolean boosting) throws Exception {
    // Text-to-feature filter shared by every configuration.
    StringToWordVector wordVector = new StringToWordVector();
    wordVector.setDoNotOperateOnPerClassBasis(true);
    wordVector.setLowerCaseTokens(true);
    wordVector.setWordsToKeep(wordsToKeep);

    if (!tokenizerOp.equals(TOKENIZER_DEFAULT)) {
        // Custom delimiter set; the non-COMPLETE variant additionally
        // splits on '|', '~' and digits.
        WordTokenizer tokenizer = new WordTokenizer();
        String delimiters = tokenizerOp.equals(TOKENIZER_COMPLETE)
                ? " \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}"
                : " \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}|~0123456789";
        tokenizer.setDelimiters(delimiters);
        wordVector.setTokenizer(tokenizer);
    }

    FilteredClassifier classifier = new FilteredClassifier();
    classifier.setFilter(wordVector);

    if (useAttributeSelection) {
        // Keep only attributes with positive information gain, applied
        // after the word-vector filter via a MultiFilter.
        Ranker ranker = new Ranker();
        ranker.setThreshold(0);
        AttributeSelection selection = new AttributeSelection();
        selection.setEvaluator(new InfoGainAttributeEval());
        selection.setSearch(ranker);

        MultiFilter combined = new MultiFilter();
        combined.setFilters(new Filter[] { wordVector, selection });
        classifier.setFilter(combined);
    }

    // Choose the base learner.
    if (classifierOp.equals(CLASSIFIER_SMO)) {
        classifier.setClassifier(new SMO());
    } else if (classifierOp.equals(CLASSIFIER_NB)) {
        classifier.setClassifier(new NaiveBayes());
    } else if (classifierOp.equals(CLASSIFIER_IB1)) {
        classifier.setClassifier(new IBk(1));
    } else if (classifierOp.equals(CLASSIFIER_IB3)) {
        classifier.setClassifier(new IBk(3));
    } else if (classifierOp.equals(CLASSIFIER_IB5)) {
        classifier.setClassifier(new IBk(5));
    } else if (classifierOp.equals(CLASSIFIER_PART)) {
        classifier.setClassifier(new PART()); // takes a long time
    }

    if (boosting) {
        // Wrap the chosen base learner in AdaBoostM1.
        AdaBoostM1 booster = new AdaBoostM1();
        booster.setClassifier(classifier.getClassifier());
        classifier.setClassifier(booster); // takes a long time with NB
    }

    return classifier;
}

From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java

License:Open Source License

/**
 * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. 
 * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize
 * the string. NOTE: The text string should either be the first or last attribute
 * @param dataRaw/*from w ww  .j  av  a 2  s. com*/
 * @param opts
 * @param isLast - whether last attribute is the text to be filtered, else first
 * @return
 * @throws Exception
 * @see {@linkplain StringToWordVector}
 */
/**
 * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings.
 * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize
 * the string. NOTE: The text string should either be the first or last attribute
 * @param dataRaw the raw instances whose text attribute is to be vectorised
 * @param opts optional Weka option string for StringToWordVector; ignored if blank
 * @param isLast whether the last attribute is the text to be filtered, else the first
 * @return the filtered instances
 * @throws Exception if option parsing or filtering fails
 * @see StringToWordVector
 */
public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    if (StringUtils.hasText(opts)) {
        filter.setOptions(Utils.splitOptions(opts));
    }
    filter.setTokenizer(new InstanceTokenizer());
    filter.setUseStoplist(false); // ignore any other stop list
    filter.setStemmer(new NullStemmer()); // ignore any other stemmer
    // BUG FIX: the attribute range (like every other option) must be set
    // BEFORE setInputFormat(), which finalises the filter's input structure;
    // previously setAttributeIndices() was called afterwards and so had no
    // effect on the batch.
    filter.setAttributeIndices(isLast ? "last" : "first");
    filter.setInputFormat(dataRaw);
    return Filter.useFilter(dataRaw, filter);
}

From source file:epsi.i5.datamining.Weka.java

/**
 * Reads the ARFF file named by {@code fileOne}, applies a string-to-word
 * vector filter, stores the result in {@code instances}, then appends "Two"
 * to {@code fileOne} and triggers the next generation step.
 *
 * @throws Exception if reading or filtering fails
 */
public void generationArffFilter() throws IOException, Exception {

    // BUG FIX: try-with-resources guarantees the reader is closed even if
    // parsing throws; previously the reader leaked on failure.
    Instances data;
    try (BufferedReader reader = new BufferedReader(
            new FileReader("src/epsi/i5/data/" + fileOne + ".arff"))) {
        data = new Instances(reader);
    }

    // Convert string attributes into word-occurrence attributes.
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(data);
    instances = Filter.useFilter(data, filter);

    // Chain to the next generation step under a derived file name.
    fileOne = fileOne + "Two";
    generationArff();
}

From source file:form.ml.ClassifierTemplate.java

/**
 * Create bayes classifier instance/*from  w w w .ja va  2  s . c  o  m*/
 *
 * @param data_set_path
 * @param stop_words_path
 * @param class_index
 * @throws FileNotFoundException
 * @throws IOException
 * @throws Exception
 */
/**
 * Create bayes classifier instance.
 *
 * @param data_set_path path to the ARFF training data set
 * @param stop_words_path path to the stop-words file
 * @param class_index index of the class attribute in the training data
 * @throws FileNotFoundException if the data set file is missing
 * @throws IOException if reading the data set fails
 * @throws Exception if filter configuration or filtering fails
 */
public ClassifierTemplate(String data_set_path, String stop_words_path, int class_index) throws Exception {

    /**
     * loading the arff file content
     */
    // BUG FIX: try-with-resources closes the reader; previously it was
    // never closed at all.
    try (BufferedReader reader = new BufferedReader(new FileReader(data_set_path))) {
        ArffReader arff = new ArffReader(reader);
        train = arff.getData();
    }
    train.setClassIndex(class_index);

    /**
     * initializing the filter
     */
    // BUG FIX: every option (stop words, tokenizer, IDF, lower-casing) must
    // be set BEFORE setInputFormat(), which finalises the filter's input
    // structure; previously setInputFormat() was called first, so those
    // options were not applied to the batch.
    wordVector = new StringToWordVector();
    tokenizer = new WordTokenizer();
    wordVector.setStopwords(new File(stop_words_path));
    wordVector.setTokenizer(tokenizer);
    wordVector.setIDFTransform(true);
    wordVector.setLowerCaseTokens(true);
    wordVector.setInputFormat(train);

    /**
     * generating the TF*IDF Vector
     */
    trainFiltered = Filter.useFilter(train, wordVector);

}

From source file:graph.clustering.NodeClusterer.java

License:Apache License

/**
 * Vectorises the textual attributes of the clustering training set and drops
 * attributes that carry no information afterwards. Each step is best-effort:
 * on failure it logs and the set from the previous step is returned.
 */
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) {
    // StringToWordVector options: all attributes (-R first-last), keep up to
    // 1000 words (-W), no periodic pruning (-prune-rate -1.0), no
    // normalisation (-N 0), minimum term frequency 1 (-M 1).
    final String[] filterOptions = { "-R", "first-last", "-W", "1000", "-prune-rate", "-1.0", "-N", "0",
            "-M", "1" };

    SnowballStemmer englishStemmer = new SnowballStemmer();
    englishStemmer.setStemmer("english");

    StringToWordVector s2wFilterer = new StringToWordVector();
    try {
        s2wFilterer.setOptions(filterOptions);
        s2wFilterer.setStemmer(englishStemmer);
        s2wFilterer.setTokenizer(new WordTokenizer());
        s2wFilterer.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer);
    } catch (Exception e1) {
        System.out.println("Error in converting string into word vectors:");
        e1.printStackTrace();
    }

    // Remove attributes that became useless after vectorisation.
    RemoveUseless ruFilter = new RemoveUseless();
    try {
        ruFilter.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter);
    } catch (Exception e1) {
        System.out.println("Error in removing useless terms:");
        e1.printStackTrace();
    }

    return clusterTrainingSet;
}