Java tutorial
/* * Copyright (c) 2011-2015 EPFL DATA Laboratory * Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE) * * All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ch.epfl.data.squall.operators; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import ch.epfl.data.squall.expressions.ValueExpression; import ch.epfl.data.squall.storage.AggregationStorage; import ch.epfl.data.squall.storage.BasicStore; import ch.epfl.data.squall.storage.WindowAggregationStorage; import ch.epfl.data.squall.types.LongType; import ch.epfl.data.squall.types.NumericType; import ch.epfl.data.squall.types.Type; import ch.epfl.data.squall.utilities.MyUtilities; import ch.epfl.data.squall.visitors.OperatorVisitor; import ch.epfl.data.squall.window_semantics.WindowSemanticsManager; public class TwitterParserOperator extends OneToOneOperator implements AggregateOperator<Long> { private static final long serialVersionUID = 1L; private static Logger LOG = Logger.getLogger(TwitterParserOperator.class); // the GroupBy type private static final int GB_UNSET = -1; private static final int GB_COLUMNS = 0; private static final int GB_PROJECTION = 1; private DistinctOperator _distinct; private int _groupByType = GB_UNSET; private List<Integer> _groupByColumns = new ArrayList<Integer>(); private ProjectOperator _groupByProjection; private int _numTuplesProcessed = 0; private final NumericType<Long> _wrapper = new LongType(); private BasicStore<Long> _storage; private final Map _map; private List<String> _heavyHitters = new ArrayList<String>(); private Map<Object, Integer> _heavyHittersMap = new HashMap<Object, Integer>(); private Random _random = new Random(); private boolean isWindowSemantics; private int _windowRangeSecs = -1; private int _slideRangeSecs = -1; private int _field; public TwitterParserOperator(int field, Map map) { _field = field; _map = map; _storage = new AggregationStorage<Long>(this, _wrapper, _map, true); } @Override public void accept(OperatorVisitor ov) { ov.visit(this); } private boolean alreadySetOther(int GB_COLUMNS) { return (_groupByType != GB_COLUMNS && _groupByType != GB_UNSET); } @Override public void clearStorage() { _storage.reset(); } // for this method it is essential that HASH_DELIMITER, which is used in // tupleToString method, // is the same as DIP_GLOBAL_ADD_DELIMITER @Override public List<String> getContent() { final String str = _storage.getContent(); return str == null ? null : Arrays.asList(str.split("\\r?\\n")); } @Override public DistinctOperator getDistinct() { return _distinct; } @Override public List<ValueExpression> getExpressions() { return new ArrayList<ValueExpression>(); } @Override public List<Integer> getGroupByColumns() { return _groupByColumns; } @Override public ProjectOperator getGroupByProjection() { return _groupByProjection; } private String getGroupByStr() { final StringBuilder sb = new StringBuilder(); sb.append("("); for (int i = 0; i < _groupByColumns.size(); i++) { sb.append(_groupByColumns.get(i)); if (i == _groupByColumns.size() - 1) sb.append(")"); else sb.append(", "); } return sb.toString(); } @Override public int getNumTuplesProcessed() { return _numTuplesProcessed; } @Override public BasicStore getStorage() { return _storage; } @Override public Type getType() { return _wrapper; } @Override public boolean hasGroupBy() { return _groupByType != GB_UNSET; } @Override public boolean isBlocking() { return true; } @Override public String printContent() { return _storage.getContent(); } // from Operator @Override public List<String> processOne(List<String> tuple, long lineageTimestamp) { _numTuplesProcessed++; //System.out.println("[TwitterParseOperator.processOne] tuple=" + tuple); // Setup variables List<String> stopWords = Arrays.asList("a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "i'm", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "i've", "j", "just", "k", "keep keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure t", "take", "taken", "taking", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'll", "theyre", "they've", "think", "this", "those", "thou", "though", "thoughh", "thousand", "throug", "through", "throughout", "thru", "thus", "til", "tip", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "was", "wasnt", "way", "we", "wed", "welcome", "we'll", "went", "were", "werent", "we've", "what", "whatever", "what'll", "whats", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "whose", "why", "widely", "will", "willing", "wish", "with", "within", "without", "wont", "words", "world", "would", "wouldnt", "www", "x", "y", "yes", "yet", "you", "youd", "you'll", "your", "youre", "yours", "yourself", "yourselves", "you've", "z", "zero", "t", "http", "https", "rt"); List<String> returnWords = new ArrayList<String>(); String[] tweetWords; String thisTweetText; String tupleData = tuple.get(0).toString(); // Extract useful words from the tweet thisTweetText = tupleData.toLowerCase().replaceAll("(\\r|\\n|)", ""); tweetWords = thisTweetText.split(" "); // Return all useful words for (String thisWord : tweetWords) { // Clean up the word using a regular expression Pattern wordCleanPattern = Pattern.compile("^[\"',.?!;:()]*([a-z]([a-z'\\-]*[a-z])?)[\"',.?!;:()]*$"); Matcher wordCleanMatcher = wordCleanPattern.matcher(thisWord); String cleanedWord = thisWord; // If the word checks out, maybe add it to the heavy hitters list if (wordCleanMatcher.matches() && !stopWords.contains(cleanedWord)) { cleanedWord = wordCleanMatcher.group(1).toString(); //System.out.println("\t=> adding \"" + cleanedWord + "\""); returnWords.add(cleanedWord); } } return returnWords; } // actual operator implementation @Override public Long runAggregateFunction(Long value, List<String> tuple) { return value + 1; } @Override public Long runAggregateFunction(Long value1, Long value2) { return value1 + value2; } @Override public TwitterParserOperator setDistinct(DistinctOperator distinct) { _distinct = distinct; return this; } @Override public TwitterParserOperator setGroupByColumns(int... hashIndexes) { return setGroupByColumns(Arrays.asList(ArrayUtils.toObject(hashIndexes))); } // from AgregateOperator @Override public TwitterParserOperator setGroupByColumns(List<Integer> groupByColumns) { if (!alreadySetOther(GB_COLUMNS)) { _groupByType = GB_COLUMNS; _groupByColumns = groupByColumns; _storage.setSingleEntry(false); return this; } else throw new RuntimeException("Aggragation already has groupBy set!"); } @Override public TwitterParserOperator setGroupByProjection(ProjectOperator groupByProjection) { if (!alreadySetOther(GB_PROJECTION)) { _groupByType = GB_PROJECTION; _groupByProjection = groupByProjection; _storage.setSingleEntry(false); return this; } else throw new RuntimeException("Aggragation already has groupBy set!"); } @Override public String toString() { final StringBuilder sb = new StringBuilder(); sb.append("HeavyHittersOperator "); if (_groupByColumns.isEmpty() && _groupByProjection == null) sb.append("\n No groupBy!"); else if (!_groupByColumns.isEmpty()) sb.append("\n GroupByColumns are ").append(getGroupByStr()).append("."); else if (_groupByProjection != null) sb.append("\n GroupByProjection is ").append(_groupByProjection.toString()).append("."); if (_distinct != null) sb.append("\n It also has distinct ").append(_distinct.toString()); return sb.toString(); } @Override public AggregateOperator<Long> SetWindowSemantics(int windowRangeInSeconds, int windowSlideInSeconds) { WindowSemanticsManager._IS_WINDOW_SEMANTICS = true; isWindowSemantics = true; _windowRangeSecs = windowRangeInSeconds; _slideRangeSecs = windowSlideInSeconds; _storage = new WindowAggregationStorage<>(this, _wrapper, _map, true, _windowRangeSecs, _slideRangeSecs); if (_groupByColumns != null || _groupByProjection != null) _storage.setSingleEntry(false); return this; } @Override public AggregateOperator<Long> SetWindowSemantics(int windowRangeInSeconds) { return SetWindowSemantics(windowRangeInSeconds, windowRangeInSeconds); } @Override public int[] getWindowSemanticsInfo() { int[] res = new int[2]; res[0] = _windowRangeSecs; res[1] = _slideRangeSecs; return res; } public static int safeLongToInt(long l) { if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) { throw new IllegalArgumentException(l + " cannot be cast to int without changing its value."); } int zz = (int) l; if (zz > 0) { return zz; } else { return -zz; } } /* * Sorts a Map structure. * Stolen from: http://stackoverflow.com/questions/8119366/sorting-hashmap-by-values */ private static Map<Object, Integer> sortByComparator(Map<Object, Integer> _heavyHittersMap2, final boolean order) { List<Entry<Object, Integer>> list = new LinkedList<Entry<Object, Integer>>(_heavyHittersMap2.entrySet()); // Sorting the list based on values Collections.sort(list, new Comparator<Entry<Object, Integer>>() { public int compare(Entry<Object, Integer> o1, Entry<Object, Integer> o2) { if (order) { return o1.getValue().compareTo(o2.getValue()); } else { return o2.getValue().compareTo(o1.getValue()); } } }); // Maintaining insertion order with the help of LinkedList Map<Object, Integer> sortedMap = new LinkedHashMap<Object, Integer>(); for (Entry<Object, Integer> entry : list) { sortedMap.put(entry.getKey(), entry.getValue()); } return sortedMap; } }