List of usage examples for the edu.stanford.nlp.process.Morphology constructor
public Morphology(Reader in)
From source file:com.music.service.text.TimelineToMusicService.java
License:Open Source License
private Variation getVariation(List<Tweet> tweets, TimelineMusic meta) { Morphology morphology = new Morphology(new StringReader("")); Multiset<String> words = HashMultiset.create(); for (Tweet tweet : tweets) { String tweetText = tweet.getText().toLowerCase(); List<String> urls = TimelineToMusicService.extractUrls(tweetText); for (String url : urls) { tweetText = tweetText.replace(url, ""); }/*from w w w . jav a 2s. c om*/ List<String> usernames = TimelineToMusicService.extractMentionedUsernames(tweetText); for (String username : usernames) { tweetText = tweetText.replace(username, "").replace("rt", ""); } String[] wordsInTweet = tweetText.split("[^\\p{L}&&[^']]+"); for (String word : wordsInTweet) { try { words.add(morphology.stem(word)); } catch (Exception ex) { words.add(word); } } } words.removeAll(stopwords); // if a word is mentioned more times than is 4% of the tweets, it's considered a topic double topicThreshold = tweets.size() * 4 / 100; for (Iterator<String> it = words.iterator(); it.hasNext();) { String word = it.next(); // remove stopwords not in the list (e.g. in a different language). // We consider all words less than 4 characters to be stop words if (word == null || word.length() < 4) { it.remove(); } else if (words.count(word) < topicThreshold) { it.remove(); } } meta.setTopKeywords(new HashSet<>(words.elementSet())); // the more topics you have, the more variative music if (meta.getTopKeywords().size() > 40) { return Variation.EXTREMELY_VARIATIVE; } else if (meta.getTopKeywords().size() > 30) { return Variation.VERY_VARIATIVE; } else if (meta.getTopKeywords().size() > 20) { return Variation.MOVING; } else if (meta.getTopKeywords().size() > 10) { return Variation.AVERAGE; } else { return Variation.MONOTONOUS; } }