Java tutorial
/** * Harvester * Copyright 29.11.2015 by Michael Peter Christen, @0rb1t3r * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package org.loklak.android.wok; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Random; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.logging.Level; import java.util.logging.Logger; import org.json.JSONException; import org.json.JSONObject; import org.loklak.android.tools.LogLines; import org.loklak.client.PushClient; import org.loklak.client.SearchClient; import org.loklak.client.SuggestClient; import org.loklak.harvester.TwitterScraper; import org.loklak.objects.MessageEntry; import org.loklak.objects.QueryEntry; import org.loklak.objects.ResultList; import org.loklak.objects.Timeline; public class Harvester { private final static Logger LOG = Logger.getGlobal(); private final static Random random = new Random(System.currentTimeMillis()); private final static int MAX_PENDING_CONEXT_QUERIES = 200; // this could be // much larger but // we don't want to // cache too many // of these private final static int MAX_PENDING_DISPLAY_LINES = 300; private final static int MAX_HARVESTED = 10000; // just to prevent a memory // leak with possible OOM // after a long time we flush // that cache after a while private final static int HITS_LIMIT_4_QUERIES = 30; private final static int FETCH_RANDOM = 3; public final static String backend = "http://loklak.org"; // public final static String backend = "http://10.0.2.2:9001"; public final static LinkedHashSet<String> pendingQueries = new LinkedHashSet<>(); public final static ArrayList<String> pendingContext = new ArrayList<>(); public final static Set<String> harvestedContext = new HashSet<>(); public static int suggestionsOnBackend = 1000; public static int contribution_message_count = -1; private static boolean isPushing = false, isLoading = false; private static void checkContext(Timeline tl, boolean front) { for (MessageEntry tweet : tl) { for (String user : tweet.getMentions()) checkContext("from:" + user, front); for (String hashtag : tweet.getHashtags()) checkContext(hashtag, true); } } private static void checkContext(String s, boolean front) { if (!front && pendingContext.size() > MAX_PENDING_CONEXT_QUERIES) return; // queue is full if (!harvestedContext.contains(s) && !pendingContext.contains(s)) { if (front) pendingContext.add(0, s); else pendingContext.add(s); } while (pendingContext.size() > MAX_PENDING_CONEXT_QUERIES) pendingContext.remove(pendingContext.size() - 1); if (harvestedContext.size() > MAX_HARVESTED) harvestedContext.clear(); } public static BlockingQueue<Timeline> pushToBackendIndividualTimeline = new LinkedBlockingQueue<Timeline>(); public static BlockingQueue<Timeline> pushToBackendAccumulationTimeline = new LinkedBlockingQueue<Timeline>(); public static LogLines<MessageEntry> displayMessages = new LogLines<MessageEntry>(MAX_PENDING_DISPLAY_LINES); public static void reduceDisplayMessages() { if (displayMessages.size() > 0) { displayMessages.poll(); } } public static void harvest() { if (isPushing || isLoading) return; // if we must push to the backend, do that first if (pushToBackendIndividualTimeline.size() > 0) { try { Timeline tl = pushToBackendIndividualTimeline.take(); isPushing = true; Sketch.statusLine.show("Storing " + tl.size() + " Messages about '" + tl.getQuery() + "'", 2000); new PushThread(tl).start(); return; } catch (InterruptedException e) { } } // if there are enough messages in the accumulation stack, push that as well // first Timeline tl = takeTimelineMin(pushToBackendAccumulationTimeline, Timeline.Order.CREATED_AT, 200); if (tl != null && tl.size() > 0) { // transmit the timeline isPushing = true; Sketch.statusLine.show("Storing " + tl.size() + " Messages", 2000); new PushThread(tl).start(); return; } // only if the push-work is done, harvest more isLoading = true; new LoadThread().start(); return; } private static class LoadThread extends Thread { @Override public void run() { isLoading = true; if (random.nextInt(20) != 0 && suggestionsOnBackend < HITS_LIMIT_4_QUERIES && pendingQueries.size() == 0 && pendingContext.size() > 0) { // harvest using the collected keys instead using the queries int r = random.nextInt((pendingContext.size() / 2) + 1); String q = pendingContext.remove(r); harvestedContext.add(q); Timeline tl = TwitterScraper.search(q, Timeline.Order.CREATED_AT); if (tl == null || tl.size() == 0) { isLoading = false; return; } // display the tweets for (MessageEntry me : tl) { // we don't want to throttle down just because the display is too full if (displayMessages.size() >= MAX_PENDING_DISPLAY_LINES) reduceDisplayMessages(); // add a line at the end of the list displayMessages.add(me); } // enqueue the tweets pushToBackendAccumulationTimeline.add(tl); // find content query strings and store them in the context cache checkContext(tl, false); LOG.log(Level.INFO, "harvest", "retrieval of " + tl.size() + " new messages for q = " + q + ", scheduled push; pendingQueries = " + pendingQueries.size() + ", pendingContext = " + pendingContext.size() + ", harvestedContext = " + harvestedContext.size()); isLoading = false; return; } // load more queries if pendingQueries is empty if (pendingQueries.size() == 0) try { ResultList<QueryEntry> rl = SuggestClient.suggest(backend, "", "query", Math.max(FETCH_RANDOM * 30, suggestionsOnBackend / 10), "asc", "retrieval_next", 0, null, "now", "retrieval_next", FETCH_RANDOM); Sketch.statusLine.show("Loading Suggestions", 2000); for (QueryEntry qe : rl) { Sketch.statusLine.show("Got Query '" + qe.getQuery() + "'", 2000); pendingQueries.add(qe.getQuery()); } suggestionsOnBackend = (int) rl.getHits(); if (rl.size() == 0) { // the backend does not have any new query words for this time. if (pendingContext.size() == 0) { // try to fill the pendingContext using a matchall-query from the // cache // http://loklak.org/api/search.json?source=cache&q= try { Timeline tl = SearchClient.search(backend, "", Timeline.Order.CREATED_AT, "cache", 100, 0, 60000); checkContext(tl, false); } catch (IOException e) { } } // if we still don't have any context, we are a bit helpless and // hope that this situation // will be better in the future. To prevent that this is called // excessively fast, do a pause. if (pendingContext.size() == 0) try { Thread.sleep(10000); } catch (InterruptedException e) { } } } catch (IOException | JSONException e) { } if (pendingQueries.size() == 0) { isLoading = false; return; } // take one of the pending queries or pending context and load the tweets String q = pendingQueries.iterator().next(); pendingQueries.remove(q); pendingContext.remove(q); harvestedContext.add(q); Timeline tl = TwitterScraper.search(q, Timeline.Order.CREATED_AT); if (tl == null || tl.size() == 0) { isLoading = false; return; } // display the tweets for (MessageEntry me : tl) { // we don't want to throttle down just because the display is too full if (displayMessages.size() >= MAX_PENDING_DISPLAY_LINES) reduceDisplayMessages(); // add a line at the end of the list displayMessages.add(me); } // find content query strings and store them in the context cache checkContext(tl, true); // if we loaded a pending query, push results to backpeer right now tl.setQuery(q); pushToBackendIndividualTimeline.add(tl); isLoading = false; return; } } private static class PushThread extends Thread { private final Timeline timeline; public PushThread(final Timeline timeline) { this.timeline = timeline; } @Override public void run() { /* * try { Timeline ttl = SearchClient.search(backend, "ccc", * Timeline.Order.CREATED_AT, "cache", 100, 0, 4000); for (MessageEntry * me: ttl) { Log.d("ttl", me.getText(100000, null)); } } catch * (IOException e) {} */ isPushing = true; Timeline tl = timeline; String apphash = Preferences.getConfig(Preferences.Key.APPHASH, ""); tl.setPeerId(apphash); boolean success = false; try { for (int i = 0; i < 5; i++) { try { long start = System.currentTimeMillis(); JSONObject json = PushClient.push(backend, tl); if (json != null) { LOG.log(Level.FINE, "PushThread", "pushed " + tl.size() + " messages to backend in " + (System.currentTimeMillis() - start) + " ms; pendingQueries = " + pendingQueries.size() + ", pendingContext = " + pendingContext.size() + ", harvestedContext = " + harvestedContext.size() + ", attempt = " + i); // The client tells us how many messages we have pushed already! Object contribution_message_count_obj = json.get("contribution_message_count"); if (contribution_message_count_obj != null) { contribution_message_count = (Integer) contribution_message_count_obj; } return; } } catch (Throwable e) { // e.printStackTrace(); LOG.log(Level.FINE, "PushThread", "failed synchronous push to backend, attempt " + i); try { Thread.sleep((i + 1) * 3000); } catch (InterruptedException e1) { } } } } catch (Throwable e) { } finally { isPushing = false; } String q = tl.getQuery(); tl.setQuery(null); pushToBackendAccumulationTimeline.add(tl); LOG.log(Level.FINE, "PushThread", "retrieval of " + tl.size() + " new messages for q = " + q + ", scheduled push"); return; } } /** * if the given list of timelines contain at least the wanted minimum size of * messages, they are flushed from the queue and combined into a new timeline * * @param dumptl * @param order * @param minsize * @return */ public static Timeline takeTimelineMin(final BlockingQueue<Timeline> dumptl, final Timeline.Order order, final int minsize) { int c = 0; for (Timeline tl : dumptl) c += tl.size(); if (c < minsize) return new Timeline(order); // now flush the timeline queue completely Timeline tl = new Timeline(order); try { while (dumptl.size() > 0) { Timeline tl0 = dumptl.take(); if (tl0 == null) return tl; tl.putAll(tl0); } return tl; } catch (InterruptedException e) { return tl; } } }