Usage examples for weka.filters.unsupervised.attribute.StringToWordVector (default constructor):
public StringToWordVector()
From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java
public Instances loadTrainingData() { try {// ww w . j av a 2 s .co m //DataSource source = new DataSource("C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\aic_group2_topic1\\Other Stuff\\training_dataset.arff"); DataSource source = new DataSource( "C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\Data sets\\labelled.arff"); // System.out.println("Data Structure pre processing: " + source.getStructure()); Instances data = source.getDataSet(); // Get and save the dataStructure of the dataset dataStructure = source.getStructure(); try { // Save the datastructure to file // serialize dataStructure weka.core.SerializationHelper.write(modelDir + algorithm + ".dataStruct", dataStructure); } catch (Exception ex) { Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex); } // Set class index data.setClassIndex(2); // Giving attributes unique names before converting strings data.renameAttribute(2, "class_attr"); data.renameAttribute(0, "twitter_id"); // Convert String attribute to Words using filter StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(data); Instances filteredData = Filter.useFilter(data, filter); System.out.println("filteredData struct: " + filteredData.attribute(0)); System.out.println("filteredData struct: " + filteredData.attribute(1)); System.out.println("filteredData struct: " + filteredData.attribute(2)); return filteredData; } catch (Exception ex) { System.out.println("Error loading training set: " + ex.toString()); return null; //Logger.getLogger(Trainer.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java
/**
 * Classifies tweets with the previously trained model.
 *
 * NOTE(review): this body is test scaffolding — the {@code tweets} argument
 * is overwritten with four hard-coded examples, the StringToWordVector
 * filter is re-built inside the loop on a growing prediction set, and the
 * method always returns 0. Per-tweet predictions are only printed, never
 * returned to the caller.
 */
public Integer classify(Tweet[] tweets) {
    // TEST
    // Generate two tweet examples
    Tweet exOne = new Tweet("This is good and fantastic");
    exOne.setPreprocessedText("This is good and fantastic");
    Tweet exTwo = new Tweet("Horribly, terribly bad and more");
    exTwo.setPreprocessedText("Horribly, terribly bad and more");
    Tweet exThree = new Tweet(
            "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh.");
    exThree.setPreprocessedText(
            "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh.");
    Tweet exFour = new Tweet("bad hate worst sick");
    exFour.setPreprocessedText("bad hate worst sick");
    // Overwrites the caller-supplied argument with the examples above.
    tweets = new Tweet[] { exOne, exTwo, exThree, exFour };
    // TEST

    // Load model
    // loadModel();

    // Convert Tweet to Instance type
    // Get String Data
    // Create attributes for the Instances set
    Attribute twitter_id = new Attribute("twitter_id"); // NOTE(review): unused local
    // Attribute body = new Attribute("body");
    FastVector classVal = new FastVector(2);
    classVal.addElement("pos");
    classVal.addElement("neg");
    Attribute class_attr = new Attribute("class_attr", classVal); // NOTE(review): unused local

    // Add them to a list
    FastVector attrVector = new FastVector(3);
    // attrVector.addElement(twitter_id);
    // attrVector.addElement(new Attribute("body", (FastVector) null));
    // attrVector.addElement(class_attr);

    // Get the number of tweets and then create predictSet
    int numTweets = tweets.length;
    // Copy the attribute layout saved at training time so the prediction set
    // matches the structure the model was trained on.
    Enumeration structAttrs = dataStructure.enumerateAttributes();
    // ArrayList<Attribute> attrList = new ArrayList<Attribute>(dataStructure.numAttributes());
    while (structAttrs.hasMoreElements()) {
        attrVector.addElement((Attribute) structAttrs.nextElement());
    }
    Instances predictSet = new Instances("predictInstances", attrVector, numTweets);
    // Instances predictSet = new Instances(dataStructure);
    predictSet.setClassIndex(2);

    // init prediction
    double prediction = -1;

    System.out.println("PredictSet matches source structure: " + predictSet.equalHeaders(dataStructure));
    System.out.println("PredSet struct: " + predictSet.attribute(0));
    System.out.println("PredSet struct: " + predictSet.attribute(1));
    System.out.println("PredSet struct: " + predictSet.attribute(2));

    // Array to return predictions
    //double[] tweetsClassified = new double[2][numTweets];
    //List<Integer, Double> tweetsClass = new ArrayList<Integer, Double>(numTweets);
    for (int i = 0; i < numTweets; i++) {
        String content = (String) tweets[i].getPreprocessedText();
        System.out.println("Tweet content: " + content);

        // attrList
        // Build one row: id, raw text, class left missing for prediction.
        Instance tweetInstance = new Instance(predictSet.numAttributes());
        tweetInstance.setDataset(predictSet);
        tweetInstance.setValue(predictSet.attribute(0), i);
        tweetInstance.setValue(predictSet.attribute(1), content);
        tweetInstance.setClassMissing();
        predictSet.add(tweetInstance);

        try {
            // Apply string filter
            // NOTE(review): the filter is re-created and re-fit on every
            // iteration over the whole (growing) predictSet, so the word
            // vocabulary changes per iteration — confirm this is intended.
            StringToWordVector filter = new StringToWordVector();
            filter.setInputFormat(predictSet);
            Instances filteredPredictSet = Filter.useFilter(predictSet, filter);

            // Apply model
            prediction = trainedModel.classifyInstance(filteredPredictSet.instance(i));
            filteredPredictSet.instance(i).setClassValue(prediction);
            System.out.println("Classification: " + filteredPredictSet.instance(i).toString());
            System.out.println("Prediction: " + prediction);
        } catch (Exception ex) {
            Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    return 0;
}
From source file:classifier.SellerClassifier.java
/**
 * Turns the raw string instances into word-vector features, remembering the
 * fitted filter in {@code myFilter} for later reuse.
 *
 * @param raw the unfiltered instances
 * @return the word-vector representation of {@code raw}
 * @throws Exception if the filter cannot be initialised or applied
 */
private Instances startFeatureExtraction(Instances raw) throws Exception {
    StringToWordVector wordFilter = new StringToWordVector();
    wordFilter.setInputFormat(raw);
    myFilter = wordFilter;
    return Filter.useFilter(raw, myFilter);
}
From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetFeatureExtractor.java
License:Apache License
/**
 * Constructs the ARFF training file for Weka from the labelled CSV data.
 *
 * Reads "../classified data/traindata.csv" (column 0 = numeric sentiment
 * code, column 5 = tweet text), maps codes 0/2/4 to the nominal labels
 * negative/neutral/positive, converts the text into a word vector with
 * StringToWordVector, and writes the result to "./data/train2data.arff".
 * Any failure is logged and swallowed.
 */
public static void constructModel() {
    Instances instdata = null;
    try {
        FastVector atts;
        atts = new FastVector();
        // Null FastVector => free-text (string) attribute.
        atts.addElement(new Attribute("content", (FastVector) null));
        // Nominal class values; index 0 is an empty placeholder label.
        FastVector fvClassVal = new FastVector(4);
        fvClassVal.addElement("");
        fvClassVal.addElement("neutral");
        fvClassVal.addElement("negative");
        fvClassVal.addElement("positive");
        Attribute ClassAttribute = new Attribute("Class", fvClassVal);
        atts.addElement(ClassAttribute);
        instdata = new Instances("tweetData", atts, 0);
        CsvReader data = new CsvReader("../classified data/traindata.csv");
        int i = 0;
        while (data.readRecord()) {
            // NOTE(review): 'vals' is never used.
            double[] vals = new double[instdata.numAttributes()];
            String class_id = data.get(0);
            // Map the numeric sentiment codes to nominal labels; any other
            // code falls through and keeps its numeric string form.
            switch (Integer.parseInt(class_id)) {
            case 0:
                class_id = "negative";
                break;
            case 2:
                class_id = "neutral";
                break;
            case 4:
                class_id = "positive";
                break;
            }
            String tweet_content = data.get(5);
            Instance iInst = new Instance(2);
            iInst.setValue((Attribute) atts.elementAt(0), tweet_content);
            iInst.setValue((Attribute) atts.elementAt(1), class_id);
            instdata.add(iInst);
            System.out.println("[" + i + "] " + class_id + ":" + tweet_content);
            i++;
        }
        data.close();
        // Vectorise the text and save the filtered set as ARFF.
        StringToWordVector filter = new StringToWordVector();
        instdata.setClassIndex(instdata.numAttributes() - 1);
        filter.setInputFormat(instdata);
        Instances newdata = Filter.useFilter(instdata, filter);
        ArffSaver saver = new ArffSaver();
        saver.setInstances(newdata);
        saver.setFile(new File("./data/train2data.arff"));
        saver.writeBatch();
    } catch (Exception ex) {
        Logger.getLogger(TweetFeatureExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java
License:Apache License
@Override public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) { final String html = documentContentData.getContent(); final Attribute input = new Attribute("html", (ArrayList<String>) null); final ArrayList<Attribute> inputVec = new ArrayList<>(); inputVec.add(input);//from w w w.ja v a 2s .c om final Instances htmlInst = new Instances("html", inputVec, 1); htmlInst.add(new DenseInstance(1)); htmlInst.instance(0).setValue(0, html); final StopwordsHandler StopwordsHandler = new StopwordsHandler() { @Override public boolean isStopword(final String word) { return word.length() < 5; } }; final NGramTokenizer tokenizer = new NGramTokenizer(); tokenizer.setNGramMinSize(1); tokenizer.setNGramMaxSize(1); tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'"); final StringToWordVector filter = new StringToWordVector(); filter.setTokenizer(tokenizer); filter.setStopwordsHandler(StopwordsHandler); filter.setLowerCaseTokens(true); filter.setOutputWordCounts(true); filter.setWordsToKeep(maxResult); final Map<String, Integer> result = new HashMap<>(); try { filter.setInputFormat(htmlInst); final Instances dataFiltered = Filter.useFilter(htmlInst, filter); final Instance last = dataFiltered.lastInstance(); final int numAttributes = last.numAttributes(); for (int i = 0; i < numAttributes; i++) { result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i))); } } catch (final Exception e) { LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e); } return result; }
From source file:com.ivanrf.smsspam.SpamClassifier.java
License:Apache License
private static FilteredClassifier initFilterClassifier(int wordsToKeep, String tokenizerOp, boolean useAttributeSelection, String classifierOp, boolean boosting) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setDoNotOperateOnPerClassBasis(true); filter.setLowerCaseTokens(true);//from w w w . j a v a2 s . c o m filter.setWordsToKeep(wordsToKeep); if (!tokenizerOp.equals(TOKENIZER_DEFAULT)) { //Make a tokenizer WordTokenizer wt = new WordTokenizer(); if (tokenizerOp.equals(TOKENIZER_COMPLETE)) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}"); else //TOKENIZER_COMPLETE_NUMBERS) wt.setDelimiters(" \r\n\t.,;:\'\"()?!-+*&#$%/=<>[]_`@\\^{}|~0123456789"); filter.setTokenizer(wt); } FilteredClassifier classifier = new FilteredClassifier(); classifier.setFilter(filter); if (useAttributeSelection) { AttributeSelection as = new AttributeSelection(); as.setEvaluator(new InfoGainAttributeEval()); Ranker r = new Ranker(); r.setThreshold(0); as.setSearch(r); MultiFilter mf = new MultiFilter(); mf.setFilters(new Filter[] { filter, as }); classifier.setFilter(mf); } if (classifierOp.equals(CLASSIFIER_SMO)) classifier.setClassifier(new SMO()); else if (classifierOp.equals(CLASSIFIER_NB)) classifier.setClassifier(new NaiveBayes()); else if (classifierOp.equals(CLASSIFIER_IB1)) classifier.setClassifier(new IBk(1)); else if (classifierOp.equals(CLASSIFIER_IB3)) classifier.setClassifier(new IBk(3)); else if (classifierOp.equals(CLASSIFIER_IB5)) classifier.setClassifier(new IBk(5)); else if (classifierOp.equals(CLASSIFIER_PART)) classifier.setClassifier(new PART()); //Tarda mucho if (boosting) { AdaBoostM1 boost = new AdaBoostM1(); boost.setClassifier(classifier.getClassifier()); classifier.setClassifier(boost); //Con NB tarda mucho } return classifier; }
From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java
License:Open Source License
/** * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize * the string. NOTE: The text string should either be the first or last attribute * @param dataRaw/*from w ww .j av a 2 s. com*/ * @param opts * @param isLast - whether last attribute is the text to be filtered, else first * @return * @throws Exception * @see {@linkplain StringToWordVector} */ public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception { StringToWordVector filter = new StringToWordVector(); if (StringUtils.hasText(opts)) { filter.setOptions(Utils.splitOptions(opts)); } filter.setTokenizer(new InstanceTokenizer()); filter.setUseStoplist(false);//ignore any other stop list filter.setStemmer(new NullStemmer());//ignore any other stemmer filter.setInputFormat(dataRaw); filter.setAttributeIndices(isLast ? "last" : "first"); return Filter.useFilter(dataRaw, filter); }
From source file:epsi.i5.datamining.Weka.java
public void generationArffFilter() throws IOException, Exception { BufferedReader reader = new BufferedReader(new FileReader("src/epsi/i5/data/" + fileOne + ".arff")); Instances data = new Instances(reader); reader.close();//from www . j av a 2 s .co m StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(data); instances = Filter.useFilter(data, filter); fileOne = fileOne + "Two"; generationArff(); }
From source file:form.ml.ClassifierTemplate.java
/**
 * Creates a classifier template: loads an ARFF data set, configures a
 * StringToWordVector filter (TF*IDF, lower-case tokens, external stopword
 * file) and builds the filtered training instances.
 *
 * @param data_set_path path to the ARFF training file
 * @param stop_words_path path to the stopword list file
 * @param class_index index of the class attribute
 * @throws FileNotFoundException if the data set file does not exist
 * @throws IOException if reading the data set fails
 * @throws Exception if filter configuration or filtering fails
 */
public ClassifierTemplate(String data_set_path, String stop_words_path, int class_index) throws Exception {
    /**
     * Load the ARFF file content. try-with-resources fixes the original's
     * reader leak (the reader was never closed).
     */
    try (BufferedReader reader = new BufferedReader(new FileReader(data_set_path))) {
        ArffReader arff = new ArffReader(reader);
        train = arff.getData();
    }
    train.setClassIndex(class_index);

    /**
     * Initialize the string-to-word-vector filter.
     */
    wordVector = new StringToWordVector();
    wordVector.setInputFormat(train);
    tokenizer = new WordTokenizer();
    wordVector.setStopwords(new File(stop_words_path));
    wordVector.setTokenizer(tokenizer);
    wordVector.setIDFTransform(true);
    wordVector.setLowerCaseTokens(true);

    /**
     * Generate the TF*IDF vector.
     */
    trainFiltered = Filter.useFilter(train, wordVector);
}
From source file:graph.clustering.NodeClusterer.java
License:Apache License
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) { String[] filterOptions = new String[10]; filterOptions[0] = "-R"; // attribute indices filterOptions[1] = "first-last"; filterOptions[2] = "-W"; // The number of words (per class if there is a // class attribute assigned) to attempt to // keep./* www .j a v a 2 s. co m*/ filterOptions[3] = "1000"; filterOptions[4] = "-prune-rate"; // periodical pruning filterOptions[5] = "-1.0"; filterOptions[6] = "-N"; // 0=not normalize filterOptions[7] = "0"; filterOptions[8] = "-M"; // The minimum term frequency filterOptions[9] = "1"; SnowballStemmer stemmer = new SnowballStemmer(); stemmer.setStemmer("english"); WordTokenizer tokenizer = new WordTokenizer(); StringToWordVector s2wFilterer = new StringToWordVector(); try { s2wFilterer.setOptions(filterOptions); s2wFilterer.setStemmer(stemmer); s2wFilterer.setTokenizer(tokenizer); s2wFilterer.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer); } catch (Exception e1) { System.out.println("Error in converting string into word vectors:"); e1.printStackTrace(); } RemoveUseless ruFilter = new RemoveUseless(); try { ruFilter.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter); } catch (Exception e1) { System.out.println("Error in removing useless terms:"); e1.printStackTrace(); } return clusterTrainingSet; }