List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setInputFormat
@Override public boolean setInputFormat(Instances instanceInfo) throws Exception
From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java
public Instances loadTrainingData() { try {/*from w w w. j av a2s . c o m*/ //DataSource source = new DataSource("C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\aic_group2_topic1\\Other Stuff\\training_dataset.arff"); DataSource source = new DataSource( "C:\\Users\\David\\Documents\\Datalogi\\TU Wien\\2014W_Advanced Internet Computing\\Labs\\Data sets\\labelled.arff"); // System.out.println("Data Structure pre processing: " + source.getStructure()); Instances data = source.getDataSet(); // Get and save the dataStructure of the dataset dataStructure = source.getStructure(); try { // Save the datastructure to file // serialize dataStructure weka.core.SerializationHelper.write(modelDir + algorithm + ".dataStruct", dataStructure); } catch (Exception ex) { Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex); } // Set class index data.setClassIndex(2); // Giving attributes unique names before converting strings data.renameAttribute(2, "class_attr"); data.renameAttribute(0, "twitter_id"); // Convert String attribute to Words using filter StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(data); Instances filteredData = Filter.useFilter(data, filter); System.out.println("filteredData struct: " + filteredData.attribute(0)); System.out.println("filteredData struct: " + filteredData.attribute(1)); System.out.println("filteredData struct: " + filteredData.attribute(2)); return filteredData; } catch (Exception ex) { System.out.println("Error loading training set: " + ex.toString()); return null; //Logger.getLogger(Trainer.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:at.aictopic1.sentimentanalysis.machinelearning.impl.TwitterClassifer.java
public Integer classify(Tweet[] tweets) { // TEST//from w w w .ja v a2 s . c o m // Generate two tweet examples Tweet exOne = new Tweet("This is good and fantastic"); exOne.setPreprocessedText("This is good and fantastic"); Tweet exTwo = new Tweet("Horribly, terribly bad and more"); exTwo.setPreprocessedText("Horribly, terribly bad and more"); Tweet exThree = new Tweet( "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh."); exThree.setPreprocessedText( "I want to update lj and read my friends list, but I\\'m groggy and sick and blargh."); Tweet exFour = new Tweet("bad hate worst sick"); exFour.setPreprocessedText("bad hate worst sick"); tweets = new Tweet[] { exOne, exTwo, exThree, exFour }; // TEST // Load model // loadModel(); // Convert Tweet to Instance type // Get String Data // Create attributes for the Instances set Attribute twitter_id = new Attribute("twitter_id"); // Attribute body = new Attribute("body"); FastVector classVal = new FastVector(2); classVal.addElement("pos"); classVal.addElement("neg"); Attribute class_attr = new Attribute("class_attr", classVal); // Add them to a list FastVector attrVector = new FastVector(3); // attrVector.addElement(twitter_id); // attrVector.addElement(new Attribute("body", (FastVector) null)); // attrVector.addElement(class_attr); // Get the number of tweets and then create predictSet int numTweets = tweets.length; Enumeration structAttrs = dataStructure.enumerateAttributes(); // ArrayList<Attribute> attrList = new ArrayList<Attribute>(dataStructure.numAttributes()); while (structAttrs.hasMoreElements()) { attrVector.addElement((Attribute) structAttrs.nextElement()); } Instances predictSet = new Instances("predictInstances", attrVector, numTweets); // Instances predictSet = new Instances(dataStructure); predictSet.setClassIndex(2); // init prediction double prediction = -1; System.out.println("PredictSet matches source structure: " + predictSet.equalHeaders(dataStructure)); 
System.out.println("PredSet struct: " + predictSet.attribute(0)); System.out.println("PredSet struct: " + predictSet.attribute(1)); System.out.println("PredSet struct: " + predictSet.attribute(2)); // Array to return predictions //double[] tweetsClassified = new double[2][numTweets]; //List<Integer, Double> tweetsClass = new ArrayList<Integer, Double>(numTweets); for (int i = 0; i < numTweets; i++) { String content = (String) tweets[i].getPreprocessedText(); System.out.println("Tweet content: " + content); // attrList Instance tweetInstance = new Instance(predictSet.numAttributes()); tweetInstance.setDataset(predictSet); tweetInstance.setValue(predictSet.attribute(0), i); tweetInstance.setValue(predictSet.attribute(1), content); tweetInstance.setClassMissing(); predictSet.add(tweetInstance); try { // Apply string filter StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(predictSet); Instances filteredPredictSet = Filter.useFilter(predictSet, filter); // Apply model prediction = trainedModel.classifyInstance(filteredPredictSet.instance(i)); filteredPredictSet.instance(i).setClassValue(prediction); System.out.println("Classification: " + filteredPredictSet.instance(i).toString()); System.out.println("Prediction: " + prediction); } catch (Exception ex) { Logger.getLogger(TwitterClassifer.class.getName()).log(Level.SEVERE, null, ex); } } return 0; }
From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetFeatureExtractor.java
License:Apache License
/**
 * Constructs the ARFF training file for Weka from the labelled CSV data:
 * reads each record, maps the numeric sentiment code to its nominal label,
 * converts the tweet text to a word-vector representation and writes the
 * result to {@code ./data/train2data.arff}.
 *
 * <p>Fix: removed the dead local {@code double[] vals}, which was allocated
 * every iteration but never read or written.
 */
public static void constructModel() {
    Instances instdata = null;
    try {
        // Two attributes: the raw tweet text and the nominal sentiment class.
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("content", (FastVector) null));
        FastVector fvClassVal = new FastVector(4);
        fvClassVal.addElement("");
        fvClassVal.addElement("neutral");
        fvClassVal.addElement("negative");
        fvClassVal.addElement("positive");
        Attribute ClassAttribute = new Attribute("Class", fvClassVal);
        atts.addElement(ClassAttribute);
        instdata = new Instances("tweetData", atts, 0);

        CsvReader data = new CsvReader("../classified data/traindata.csv");
        int i = 0;
        while (data.readRecord()) {
            // Column 0 carries the sentiment code: 0=negative, 2=neutral, 4=positive.
            // Any other code falls through and keeps the raw numeric string.
            String class_id = data.get(0);
            switch (Integer.parseInt(class_id)) {
            case 0:
                class_id = "negative";
                break;
            case 2:
                class_id = "neutral";
                break;
            case 4:
                class_id = "positive";
                break;
            }
            String tweet_content = data.get(5);
            Instance iInst = new Instance(2);
            iInst.setValue((Attribute) atts.elementAt(0), tweet_content);
            iInst.setValue((Attribute) atts.elementAt(1), class_id);
            instdata.add(iInst);
            System.out.println("[" + i + "] " + class_id + ":" + tweet_content);
            i++;
        }
        data.close();

        // Vectorise the text and persist the transformed data set as ARFF.
        StringToWordVector filter = new StringToWordVector();
        instdata.setClassIndex(instdata.numAttributes() - 1);
        filter.setInputFormat(instdata);
        Instances newdata = Filter.useFilter(instdata, filter);

        ArffSaver saver = new ArffSaver();
        saver.setInstances(newdata);
        saver.setFile(new File("./data/train2data.arff"));
        saver.writeBatch();
    } catch (Exception ex) {
        Logger.getLogger(TweetFeatureExtractor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java
License:Apache License
/**
 * Counts word occurrences in the given document's HTML content.
 * Words shorter than five characters are treated as stopwords and skipped;
 * tokens are lower-cased unigrams.
 *
 * @param documentContentData document whose content is analysed
 * @param maxResult the number of words the filter attempts to keep
 * @return map from word to its occurrence count (empty on failure)
 */
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
        final int maxResult) {
    final String html = documentContentData.getContent();

    // Single-attribute, single-instance data set holding the raw HTML string.
    final Attribute input = new Attribute("html", (ArrayList<String>) null);
    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);
    final Instances htmlInst = new Instances("html", inputVec, 1);
    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    // Inline stopword rule: drop every word shorter than five characters.
    final StopwordsHandler shortWordStopwords = new StopwordsHandler() {
        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(shortWordStopwords);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();
    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
        final Instance last = dataFiltered.lastInstance();

        // Each remaining attribute is a word; its value is the count.
        final int numAttributes = last.numAttributes();
        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }
    return result;
}
From source file:com.reactivetechnologies.analytics.lucene.InstanceTokenizer.java
License:Open Source License
/** * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. * The set of words (attributes) is determined by the first batch filtered (typically training data). Uses a Lucene analyzer to tokenize * the string. NOTE: The text string should either be the first or last attribute * @param dataRaw// w w w. ja v a 2s . c om * @param opts * @param isLast - whether last attribute is the text to be filtered, else first * @return * @throws Exception * @see {@linkplain StringToWordVector} */ public static Instances filter(Instances dataRaw, String opts, boolean isLast) throws Exception { StringToWordVector filter = new StringToWordVector(); if (StringUtils.hasText(opts)) { filter.setOptions(Utils.splitOptions(opts)); } filter.setTokenizer(new InstanceTokenizer()); filter.setUseStoplist(false);//ignore any other stop list filter.setStemmer(new NullStemmer());//ignore any other stemmer filter.setInputFormat(dataRaw); filter.setAttributeIndices(isLast ? "last" : "first"); return Filter.useFilter(dataRaw, filter); }
From source file:epsi.i5.datamining.Weka.java
public void generationArffFilter() throws IOException, Exception { BufferedReader reader = new BufferedReader(new FileReader("src/epsi/i5/data/" + fileOne + ".arff")); Instances data = new Instances(reader); reader.close();//from w ww .j av a 2 s.c o m StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(data); instances = Filter.useFilter(data, filter); fileOne = fileOne + "Two"; generationArff(); }
From source file:graph.clustering.NodeClusterer.java
License:Apache License
private Instances preprocessNodesInfoInstances(Instances clusterTrainingSet) { String[] filterOptions = new String[10]; filterOptions[0] = "-R"; // attribute indices filterOptions[1] = "first-last"; filterOptions[2] = "-W"; // The number of words (per class if there is a // class attribute assigned) to attempt to // keep./*from w w w . j ava2 s . c o m*/ filterOptions[3] = "1000"; filterOptions[4] = "-prune-rate"; // periodical pruning filterOptions[5] = "-1.0"; filterOptions[6] = "-N"; // 0=not normalize filterOptions[7] = "0"; filterOptions[8] = "-M"; // The minimum term frequency filterOptions[9] = "1"; SnowballStemmer stemmer = new SnowballStemmer(); stemmer.setStemmer("english"); WordTokenizer tokenizer = new WordTokenizer(); StringToWordVector s2wFilterer = new StringToWordVector(); try { s2wFilterer.setOptions(filterOptions); s2wFilterer.setStemmer(stemmer); s2wFilterer.setTokenizer(tokenizer); s2wFilterer.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, s2wFilterer); } catch (Exception e1) { System.out.println("Error in converting string into word vectors:"); e1.printStackTrace(); } RemoveUseless ruFilter = new RemoveUseless(); try { ruFilter.setInputFormat(clusterTrainingSet); clusterTrainingSet = Filter.useFilter(clusterTrainingSet, ruFilter); } catch (Exception e1) { System.out.println("Error in removing useless terms:"); e1.printStackTrace(); } return clusterTrainingSet; }
From source file:newsclassifier.NewsClassifier.java
public void StrToWV(String sFile) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setInputFormat(data); /*filter.setIDFTransform(false); filter.setTFTransform(true);//from w ww .j a v a 2 s .c o m filter.setAttributeIndices("1-2"); //attributenameprefix filter.setDoNotOperateOnPerClassBasis(true); filter.setInvertSelection(false); filter.setLowerCaseTokens(true); filter.setMinTermFreq(1); //filter.setNormalizeDocLength(true); filter.setOutputWordCounts(false); //filter.setPeriodicPruning(-1); //filter.setStemmer(null); filter.setStopwords(new File(sFile));*/ //String[] opts = weka.core.Utils.splitOptions("-tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?>< \\\\r\\\\t\\\"\""); String[] opts = weka.core.Utils.splitOptions( "-R 1-2 -W 3000 -prune-rate -1.0 -T -N 0 -L -S -stemmer weka.core.stemmers.NullStemmer -M 1 -O -stopwords \"C:\\\\Users\\\\USER\\\\Dropbox\\\\Works\\\\IF\\\\AI\\\\Tubes 2\\\\s.txt\" -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\t.,;:\\\\\\'\\\\\\\"()?!1234567890 `~!@#\\\\\\%^&*[]-_+={}\\\\\\\\/|?>< \\\\t\\\"\""); filter.setOptions(opts); //belum pake delimiter!! filter.setWordsToKeep(3000); data = Filter.useFilter(data, filter); //return newData; //data = newData; }
From source file:nl.uva.expose.classification.WekaClassification.java
private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setAttributeIndices("first-last"); filter.setIDFTransform(true);/* w w w. jav a2s . co m*/ filter.setLowerCaseTokens(true); filter.setMinTermFreq(2); filter.setLowerCaseTokens(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); // filter.setTokenizer(); // filter.setWordsToKeep(); filter.setInputFormat(dRaw); dFiltered = Filter.useFilter(dRaw, filter); }
From source file:nlpmusic.StringClusterer.java
public ArrayList<ArrayList<String>> cluster(ArrayList<String> tem) throws Exception { Instances source = listLoad(tem);//from w w w . jav a2 s .c o m StringToWordVector vect = new StringToWordVector(); vect.setWordsToKeep(to_keep); vect.setInputFormat(source); Instances datas = Filter.useFilter(source, vect); //vect.setDoNotOperateOnPerClassBasis(true); //System.out.println("ASDASD" + vect.wordsToKeepTipText()); //System.out.println(datas.numAttributes()); //System.out.println("ASDASD" + vect.getWordsToKeep()); DBSCAN clusterer = new DBSCAN(); clusterer.setEpsilon(threshold); clusterer.setMinPoints(min_points); clusterer.buildClusterer(datas); ArrayList<ArrayList<String>> ret = new ArrayList<>(); for (int i = 0; i < clusterer.numberOfClusters(); i++) { ArrayList<String> to_add = new ArrayList<>(); //System.out.println(i); for (int j = 0; j < datas.size(); j++) { try { if (clusterer.clusterInstance(datas.get(j)) == i) //System.out.println("* " + source.get(j).toString() + " *"); to_add.add(source.get(j).toString()); } catch (Exception e) { //e.printStackTrace(); } } ret.add(to_add); } return ret; }