Example usage for weka.filters.unsupervised.attribute StringToWordVector setInputFormat

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setInputFormat

Introduction

In this page you can find the example usage for weka.filters.unsupervised.attribute StringToWordVector setInputFormat.

Prototype

@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception 

Source Link

Document

Sets the format of the input instances.

Usage

From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java

public Instances loadTrainingData() {

    try {/*from w  w w.  j av  a2s .  c o m*/
        //DataSource source = new DataSource("C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\aic_group2_topic1\\Other Stuff\\training_dataset.arff");
        DataSource source = new DataSource(
                "C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\Data sets\\labelled.arff");

        //            System.out.println("Data Structure pre processing: " + source.getStructure());
        Instances data = source.getDataSet();

        // Get and save the dataStructure of the dataset
        dataStructure = source.getStructure();
        try {
            // Save the datastructure to file
            // serialize dataStructure
            weka.core.SerializationHelper.write(modelDir + algorithm + ".dataStruct", dataStructure);
        } catch (Exception ex) {
            Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex);
        }
        // Set class index
        data.setClassIndex(2);

        // Giving attributes unique names before converting strings
        data.renameAttribute(2, "class_attr");
        data.renameAttribute(0, "twitter_id");

        // Convert String attribute to Words using filter
        StringToWordVector filter = new StringToWordVector();

        filter.setInputFormat(data);

        Instances filteredData = Filter.useFilter(data, filter);

        System.out.println("filteredData struct: " + filteredData.attribute(0));
        System.out.println("filteredData struct: " + filteredData.attribute(1));
        System.out.println("filteredData struct: " + filteredData.attribute(2));

        return filteredData;

    } catch (Exception ex) {
        System.out.println("Error loading training set: " + ex.toString());
        return null;
        //Logger.getLogger(Trainer.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java

public Integer classify(Tweet[] tweets) {
    // TEST//from w w  w  .ja v  a2 s  .  c  o m

    // Generate two tweet examples
    Tweet exOne = new Tweet("This is good and fantastic");
    exOne.setPreprocessedText("This is good and fantastic");
    Tweet exTwo = new Tweet("Horribly, terribly bad and more");
    exTwo.setPreprocessedText("Horribly, terribly bad and more");
    Tweet exThree = new Tweet(
            "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh.");
    exThree.setPreprocessedText(
            "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh.");
    Tweet exFour = new Tweet("bad hate worst sick");
    exFour.setPreprocessedText("bad hate worst sick");
    tweets = new Tweet[] { exOne, exTwo, exThree, exFour };
    // TEST

    // Load model
    //        loadModel();
    // Convert Tweet to Instance type 
    // Get String Data
    // Create attributes for the Instances set
    Attribute twitter_id = new Attribute("twitter_id");
    //        Attribute body = new Attribute("body");

    FastVector classVal = new FastVector(2);
    classVal.addElement("pos");
    classVal.addElement("neg");

    Attribute class_attr = new Attribute("class_attr", classVal);

    // Add them to a list
    FastVector attrVector = new FastVector(3);
    //        attrVector.addElement(twitter_id);
    //        attrVector.addElement(new Attribute("body", (FastVector) null));
    //        attrVector.addElement(class_attr);

    // Get the number of tweets and then create predictSet
    int numTweets = tweets.length;
    Enumeration structAttrs = dataStructure.enumerateAttributes();

    //        ArrayList<Attribute> attrList = new ArrayList<Attribute>(dataStructure.numAttributes());
    while (structAttrs.hasMoreElements()) {
        attrVector.addElement((Attribute) structAttrs.nextElement());
    }
    Instances predictSet = new Instances("predictInstances", attrVector, numTweets);
    //        Instances predictSet = new Instances(dataStructure);
    predictSet.setClassIndex(2);

    // init prediction
    double prediction = -1;

    System.out.println("PredictSet matches source structure: " + predictSet.equalHeaders(dataStructure));

    System.out.println("PredSet struct: " + predictSet.attribute(0));
    System.out.println("PredSet struct: " + predictSet.attribute(1));
    System.out.println("PredSet struct: " + predictSet.attribute(2));
    // Array to return predictions 
    //double[] tweetsClassified = new double[2][numTweets];
    //List<Integer, Double> tweetsClass = new ArrayList<Integer, Double>(numTweets);
    for (int i = 0; i < numTweets; i++) {
        String content = (String) tweets[i].getPreprocessedText();

        System.out.println("Tweet content: " + content);

        //            attrList
        Instance tweetInstance = new Instance(predictSet.numAttributes());

        tweetInstance.setDataset(predictSet);

        tweetInstance.setValue(predictSet.attribute(0), i);
        tweetInstance.setValue(predictSet.attribute(1), content);
        tweetInstance.setClassMissing();

        predictSet.add(tweetInstance);

        try {
            // Apply string filter
            StringToWordVector filter = new StringToWordVector();

            filter.setInputFormat(predictSet);
            Instances filteredPredictSet = Filter.useFilter(predictSet, filter);

            // Apply model
            prediction = trainedModel.classifyInstance(filteredPredictSet.instance(i));
            filteredPredictSet.instance(i).setClassValue(prediction);
            System.out.println("Classification: " + filteredPredictSet.instance(i).toString());
            System.out.println("Prediction: " + prediction);

        } catch (Exception ex) {
            Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    return 0;
}

From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetFeatureExtractor.java

License:Apache License

/**
 * Method which contructs the arff file for weka with the training data
 *//*from   w w  w.j  a  v  a  2  s  .  co  m*/
public static void constructModel() {
    Instances instdata = null;
    try {
        FastVector atts;
        atts = new FastVector();
        atts.addElement(new Attribute("content", (FastVector) null));
        FastVector fvClassVal = new FastVector(4);
        fvClassVal.addElement("");
        fvClassVal.addElement("neutral");
        fvClassVal.addElement("negative");
        fvClassVal.addElement("positive");
        Attribute ClassAttribute = new Attribute("Class", fvClassVal);
        atts.addElement(ClassAttribute);
        instdata = new Instances("tweetData", atts, 0);
        CsvReader data = new CsvReader("../classified data/traindata.csv");
        int i = 0;
        while (data.readRecord()) {
            double[] vals = new double[instdata.numAttributes()];
            String class_id = data.get(0);
            switch (Integer.parseInt(class_id)) {
            case 0:
                class_id = "negative";
                break;
            case 2:
                class_id = "neutral";
                break;
            case 4:
                class_id = "positive";
                break;
            }
            String tweet_content = data.get(5);
            Instance iInst = new Instance(2);
            iInst.setValue((Attribute) atts.elementAt(0), tweet_content);
            iInst.setValue((Attribute) atts.elementAt(1), class_id);
            instdata.add(iInst);
            System.out.println("[" + i + "] " + class_id + ":" + tweet_content);
            i++;
        }
        data.close();
        StringToWordVector filter = new StringToWordVector();
        instdata.setClassIndex(instdata.numAttributes() - 1);
        filter.setInputFormat(instdata);
        Instances newdata = Filter.useFilter(instdata, filter);
        ArffSaver saver = new ArffSaver();
        saver.setInstances(newdata);
        saver.setFile(new File("./data/train2data.arff"));
        saver.writeBatch();
    } catch (Exception ex) {
        Logger.getLogger(TweetFeatureExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java

License:Apache License

@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
        final int maxResult) {

    final String html = documentContentData.getContent();

    final Attribute input = new Attribute("html", (ArrayList<String>) null);

    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);/*from  w w  w. j a v a2  s.  c om*/

    final Instances htmlInst = new Instances("html", inputVec, 1);

    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    final StopwordsHandler StopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {

            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(StopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();

    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        final Instance last = dataFiltered.lastInstance();

        final int numAttributes = last.numAttributes();

        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }

    return result;
}

From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java

License:Open Source License

/**
 * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. 
 * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize
 * the string. NOTE: The text string should either be the first or last attribute
 * @param dataRaw// w  w w. ja  v  a  2s  . c om
 * @param opts
 * @param isLast - whether last attribute is the text to be filtered, else first
 * @return
 * @throws Exception
 * @see {@linkplain StringToWordVector}
 */
public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    if (StringUtils.hasText(opts)) {
        filter.setOptions(Utils.splitOptions(opts));
    }
    filter.setTokenizer(new InstanceTokenizer());
    filter.setUseStoplist(false);//ignore any other stop list
    filter.setStemmer(new NullStemmer());//ignore any other stemmer
    filter.setInputFormat(dataRaw);
    filter.setAttributeIndices(isLast ? "last" : "first");
    return Filter.useFilter(dataRaw, filter);
}

From source file:epsi.i5.datamining.Weka.java

public void generationArffFilter() throws IOException, Exception {

    BufferedReader reader = new BufferedReader(new FileReader("src/epsi/i5/data/" + fileOne + ".arff"));
    Instances data = new Instances(reader);
    reader.close();//from w  ww .j av a 2  s.c  o  m

    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(data);
    instances = Filter.useFilter(data, filter);
    fileOne = fileOne + "Two";
    generationArff();
}

From source file:graph.clustering.NodeClusterer.java

License:Apache License

private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) {
    String[] filterOptions = new String[10];
    filterOptions[0] = "-R"; // attribute indices
    filterOptions[1] = "first-last";
    filterOptions[2] = "-W"; // The number of words (per class if there is a
    // class attribute assigned) to attempt to
    // keep./*from   w  w  w . j ava2  s .  c o m*/
    filterOptions[3] = "1000";
    filterOptions[4] = "-prune-rate"; // periodical pruning
    filterOptions[5] = "-1.0";
    filterOptions[6] = "-N"; // 0=not normalize
    filterOptions[7] = "0";
    filterOptions[8] = "-M"; // The minimum term frequency
    filterOptions[9] = "1";

    SnowballStemmer stemmer = new SnowballStemmer();
    stemmer.setStemmer("english");
    WordTokenizer tokenizer = new WordTokenizer();

    StringToWordVector s2wFilterer = new StringToWordVector();
    try {
        s2wFilterer.setOptions(filterOptions);
        s2wFilterer.setStemmer(stemmer);
        s2wFilterer.setTokenizer(tokenizer);
        s2wFilterer.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer);
    } catch (Exception e1) {
        System.out.println("Error in converting string into word vectors:");
        e1.printStackTrace();
    }

    RemoveUseless ruFilter = new RemoveUseless();
    try {
        ruFilter.setInputFormat(clusterTrainingSet);
        clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter);
    } catch (Exception e1) {
        System.out.println("Error in removing useless terms:");
        e1.printStackTrace();
    }

    return clusterTrainingSet;
}

From source file:newsclassifier.NewsClassifier.java

public void StrToWV(String sFile) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(data);
    /*filter.setIDFTransform(false);
    filter.setTFTransform(true);//from w ww  .j  a v  a 2 s  .c o m
    filter.setAttributeIndices("1-2");
    //attributenameprefix
    filter.setDoNotOperateOnPerClassBasis(true);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(1);
    //filter.setNormalizeDocLength(true);
    filter.setOutputWordCounts(false);
    //filter.setPeriodicPruning(-1);
    //filter.setStemmer(null);
    filter.setStopwords(new File(sFile));*/
    //String[] opts = weka.core.Utils.splitOptions("-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?><  \\\\r\\\\t\\\"\"");
    String[] opts = weka.core.Utils.splitOptions(
            "-R 1-2 -W 3000 -prune-rate -1.0 -T -N 0 -L -S -stemmer weka.core.stemmers.NullStemmer -M 1 -O -stopwords \"C:\\\\Users\\\\USER\\\\Dropbox\\\\Works\\\\IF\\\\AI\\\\Tubes 2\\\\s.txt\" -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\"   \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?><   \\\\t\\\"\"");
    filter.setOptions(opts);
    //belum pake delimiter!!
    filter.setWordsToKeep(3000);

    data = Filter.useFilter(data, filter);
    //return newData;
    //data = newData;
}

From source file:nl.uva.expose.classification.WekaClassification.java

private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setIDFTransform(true);/*  w w w. jav  a2s  .  co m*/
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(2);
    filter.setLowerCaseTokens(true);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));
    filter.setOutputWordCounts(true);
    //        filter.setTokenizer();
    //        filter.setWordsToKeep();
    filter.setInputFormat(dRaw);
    dFiltered = Filter.useFilter(dRaw, filter);
}

From source file:nlpmusic.StringClusterer.java

public ArrayList<ArrayList<String>> cluster(ArrayList<String> tem) throws Exception {
    Instances source = listLoad(tem);//from   w  w  w . jav  a2  s .c  o  m

    StringToWordVector vect = new StringToWordVector();
    vect.setWordsToKeep(to_keep);
    vect.setInputFormat(source);
    Instances datas = Filter.useFilter(source, vect);
    //vect.setDoNotOperateOnPerClassBasis(true);        
    //System.out.println("ASDASD" + vect.wordsToKeepTipText());
    //System.out.println(datas.numAttributes());
    //System.out.println("ASDASD" + vect.getWordsToKeep());
    DBSCAN clusterer = new DBSCAN();
    clusterer.setEpsilon(threshold);
    clusterer.setMinPoints(min_points);

    clusterer.buildClusterer(datas);

    ArrayList<ArrayList<String>> ret = new ArrayList<>();

    for (int i = 0; i < clusterer.numberOfClusters(); i++) {
        ArrayList<String> to_add = new ArrayList<>();
        //System.out.println(i);
        for (int j = 0; j < datas.size(); j++) {
            try {
                if (clusterer.clusterInstance(datas.get(j)) == i)
                    //System.out.println("* " + source.get(j).toString() + " *");
                    to_add.add(source.get(j).toString());
            } catch (Exception e) {
                //e.printStackTrace();
            }
        }
        ret.add(to_add);
    }
    return ret;
}