samza.examples.rss.system.RssFeed.java Source code

Java tutorial

Introduction

Here is the source code for samza.examples.rss.system.RssFeed.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package samza.examples.rss.system;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.Sets;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import samza.examples.rss.utils.Datum;
import samza.examples.rss.utils.FeedDetails;
import samza.examples.rss.utils.RFC3339Utils;
import samza.examples.rss.utils.SyndEntrySerializer;

/**
 * Polls a list of RSS feeds and hands batches of entries to a registered {@link RssConsumer}.
 *
 * <p>Typical use: construct, call {@link #start(RssConsumer)} to load the feed list and register
 * the consumer, then execute {@link #run()} on a dedicated thread. Call {@link #stop()} (or
 * interrupt the polling thread) to end the polling loop.
 *
 * <p>Thread-safety: {@link #stop()} may be called from any thread. All other state is expected to
 * be set up by {@link #start(RssConsumer)} before the polling thread is started.
 */
public class RssFeed implements Runnable {
    private static final Logger log = LoggerFactory.getLogger(RssFeed.class);

    /** Classpath location of the file that lists the feeds to poll. */
    private final String urlsFilePath;
    /** Connect/read timeout, in milliseconds, applied to every feed connection. */
    private final int timeOut;
    /** Time, in milliseconds, to wait when a feed is not yet due for polling. */
    private final int waitTime;
    private List<FeedDetails> feedDetails;
    private final SyndEntrySerializer serializer;
    /** Entries whose parsed published date is not after this instant are not queued. */
    private final DateTime publishedSince;
    private RssConsumer rssConsumer;
    /** Written by {@link #stop()} and read by the polling loop, hence volatile. */
    private volatile boolean workFlag;

    private static final String RSS_KEY = "rssFeed";
    private static final String URI_KEY = "uri";
    private static final String LINK_KEY = "link";
    private static final String DATE_KEY = "publishedDate";

    /**
     * Maps a feed URL to the ids of the entries returned by the most recent successful poll of
     * that feed. Static, so it is shared by every instance in the JVM.
     */
    protected static final Map<String, Set<String>> PREVIOUSLY_SEEN = new ConcurrentHashMap<>();

    /**
     * Creates a feed reader. No I/O happens until {@link #start(RssConsumer)} is called.
     *
     * @param urlsFilePath classpath resource listing the feeds, one {@code url,interval} pair per line
     * @param timeOut      timeout in milliseconds for connecting to and reading from a feed
     * @param waitTime     time in milliseconds to wait when a feed is not yet due for polling
     */
    public RssFeed(String urlsFilePath, int timeOut, int waitTime) {
        this.urlsFilePath = urlsFilePath;
        this.timeOut = timeOut;
        this.waitTime = waitTime;
        this.serializer = new SyndEntrySerializer();
        // TODO pass this as a parameter?
        this.publishedSince = new DateTime().withYear(2014).withDayOfMonth(5).withMonthOfYear(9)
                .withZone(DateTimeZone.UTC);
    }

    /**
     * Loads the feed list, registers the consumer and arms the polling loop. Does not start a
     * thread; the caller is expected to execute {@link #run()} afterwards.
     *
     * @param rssConsumer consumer that receives every batch produced by the polling loop
     * @throws IllegalArgumentException if the feed list resource cannot be found on the classpath
     */
    public void start(RssConsumer rssConsumer) {
        // load urls to be read
        feedDetails = readUrlFile();
        this.setConsumer(rssConsumer);
        this.workFlag = true;
    }

    /**
     * Runs the polling loop until {@link #stop()} is called or the thread is interrupted.
     */
    @Override
    public void run() {
        // start polling from urls
        checkForNewEntries();
    }

    /**
     * Repeatedly fetches the next batch and delivers it to the consumer. The loop ends when
     * {@link #stop()} clears the work flag or when the current thread has been interrupted.
     */
    private void checkForNewEntries() {
        while (workFlag && !Thread.currentThread().isInterrupted()) {
            this.rssConsumer.onIncommingBatch(getNextBatch());
        }
    }

    /**
     * Reads the feed list from the classpath. Lines starting with {@code #} and blank lines are
     * ignored; every other line must contain at least two comma-separated fields, of which the
     * first two are passed to {@link FeedDetails}. Malformed lines are logged and skipped.
     *
     * @return the feeds that could be read; possibly incomplete if an I/O error occurred mid-file
     * @throws IllegalArgumentException if the resource does not exist on the classpath
     */
    private List<FeedDetails> readUrlFile() {
        List<FeedDetails> rssQueue = new ArrayList<>();
        InputStream in = this.getClass().getClassLoader().getResourceAsStream(this.urlsFilePath);
        if (in == null) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of the NullPointerException the reader constructor would raise.
            throw new IllegalArgumentException("RSS URL list not found on classpath: " + this.urlsFilePath);
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("#") || line.trim().isEmpty()) {
                    continue;
                }
                String[] split = line.split(",");
                if (split.length < 2) {
                    log.warn("Skipping malformed line in RSS list file: {}", line);
                    continue;
                }
                rssQueue.add(new FeedDetails(split[0], split[1]));
            }
        } catch (IOException e) {
            log.error("Error while reading RSS list file.", e);
        }
        return rssQueue;
    }

    /**
     * Asks the polling loop to stop. The loop checks the flag between batches, so a batch that
     * is already in progress is completed first.
     */
    public void stop() {
        this.workFlag = false;
        log.info("Stoping RssFeed consumer.");
    }

    /**
     * Walks over all configured feeds once. A feed that is due is fetched and its new entries are
     * appended to the result; for a feed that is not due, the calling thread waits for the
     * configured wait time before moving on. I/O and parse failures are logged per feed and do
     * not abort the pass. If the thread is interrupted, the interrupt flag is restored and the
     * entries collected so far are returned.
     *
     * <p>{@link #start(RssConsumer)} must have been called first so the feed list is loaded.
     *
     * @return the entries collected during this pass; never {@code null}
     */
    public List<Datum> getNextBatch() {
        List<Datum> dataQueue = new ArrayList<>();
        for (FeedDetails feedDetail : feedDetails) {
            try {
                // if enough time has passed, then read again
                // NOTE(review): elapsed time is in seconds but is compared against
                // getPollIntervalMillis() * 60 -- confirm the intended unit in FeedDetails.
                long elapsedTime = (System.currentTimeMillis() - feedDetail.getLastPolled()) / 1000;
                if (elapsedTime > feedDetail.getPollIntervalMillis() * 60) {
                    log.info("{} polling.", feedDetail.getUrl());
                    Set<String> batch = queueFeedEntries(feedDetail, dataQueue);
                    // remember what this poll returned so the next poll can filter duplicates
                    PREVIOUSLY_SEEN.put(feedDetail.getUrl(), batch);
                    feedDetail.setLastPolled(System.currentTimeMillis());
                } else {
                    log.info("{} has been already polled.", feedDetail.getUrl());
                    this.waiting(this.waitTime);
                }
            } catch (IOException e) {
                log.error("Error while reading data from RSS. IOException. " + feedDetail.getUrl(), e);
            } catch (FeedException e) {
                log.error("Error while reading data from RSS. FeedException. " + feedDetail.getUrl(), e);
            } catch (InterruptedException e) {
                log.error("Error while reading data from RSS. InterruptedException." + feedDetail, e);
                // restore the flag so the polling loop (and any caller) can observe the interrupt
                Thread.currentThread().interrupt();
                break;
            }
        }
        return dataQueue;
    }

    /**
     * Fetches one feed and appends its eligible entries to {@code dataQueue}. An entry is
     * eligible when it passes the published-date filter and its id was not part of the previous
     * poll of the same feed. Entries for which no id can be determined are skipped.
     *
     * @param feedDetail feed to read
     * @param dataQueue  list that receives the eligible entries
     * @return ids of all entries with an id that were read from the feed, eligible or not
     * @throws IOException   when it cannot connect to the url, the read times out, or the url is malformed
     * @throws FeedException when the feed cannot be parsed
     */
    protected Set<String> queueFeedEntries(FeedDetails feedDetail, List<Datum> dataQueue)
            throws IOException, FeedException {
        URL feedUrl = new URL(feedDetail.getUrl());
        URLConnection connection = feedUrl.openConnection();
        connection.setConnectTimeout(this.timeOut);
        // without a read timeout a stalled server would block the polling thread forever
        connection.setReadTimeout(this.timeOut);

        SyndFeed feed;
        try (InputStreamReader reader = new InputStreamReader(connection.getInputStream())) {
            feed = new SyndFeedInput().build(reader);
        }

        Set<String> batch = Sets.newConcurrentHashSet();
        for (Object entryObj : feed.getEntries()) {
            SyndEntry entry = (SyndEntry) entryObj;
            ObjectNode nodeEntry = this.serializer.deserialize(entry);
            nodeEntry.put(RSS_KEY, feedDetail.getUrl());
            String entryId = determineId(nodeEntry);
            if (entryId == null) {
                // the concurrent set rejects null elements, and an entry without an id
                // could never be de-duplicated anyway
                log.warn("Skipping entry without uri or link from {}.", feedDetail.getUrl());
                continue;
            }
            batch.add(entryId);
            if (passesDateFilter(nodeEntry) && !seenBefore(entryId, feedDetail.getUrl())) {
                dataQueue.add(new Datum(nodeEntry, entryId, DateTime.now()));
                log.debug("Added entry, {}, to provider queue.", entryId);
            }
        }
        return batch;
    }

    /**
     * Decides whether an entry is recent enough to be queued. Entries without a published date,
     * or with one that cannot be parsed, are accepted by default.
     *
     * @param nodeEntry serialized feed entry
     * @return {@code false} only when the published date parses and is not after the cut-off
     */
    private boolean passesDateFilter(ObjectNode nodeEntry) {
        JsonNode published = nodeEntry.get(DATE_KEY);
        if (published == null) {
            log.debug("No published date present, attempting to add node to queue by default.");
            return true;
        }
        try {
            return RFC3339Utils.parseToUTC(published.asText()).isAfter(this.publishedSince);
        } catch (Exception e) {
            log.trace("Failed to parse date from object node, attempting to add node to queue by default.");
            return true;
        }
    }

    /**
     * Blocks the calling thread on this object's monitor for at most {@code waitTime}
     * milliseconds.
     *
     * @param waitTime maximum time to wait, in milliseconds
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    private void waiting(long waitTime) throws InterruptedException {
        log.warn("Waiting for {} mlsecs.", waitTime);
        synchronized (this) {
            this.wait(waitTime);
        }
    }

    /**
     * Picks the id of an entry: its {@code uri} if present and non-empty, otherwise its
     * {@code link}.
     *
     * @param node serialized feed entry
     * @return the id, or {@code null} when neither field holds a non-empty text value
     */
    private String determineId(ObjectNode node) {
        String id = nonEmptyText(node.get(URI_KEY));
        if (id == null) {
            id = nonEmptyText(node.get(LINK_KEY));
        }
        return id;
    }

    /**
     * Returns the text of a JSON node, or {@code null} when the node is missing, is not a text
     * node, or holds an empty string.
     */
    private static String nonEmptyText(JsonNode value) {
        if (value == null) {
            return null;
        }
        // textValue() yields null for non-text nodes
        String text = value.textValue();
        return (text == null || text.isEmpty()) ? null : text;
    }

    /**
     * Tells whether an entry id was part of the previous poll of the given feed.
     *
     * @param id      entry id
     * @param rssFeed feed url
     * @return {@code true} if the id was recorded for this feed's previous poll, {@code false}
     *         otherwise (including when the feed has not been polled yet)
     */
    private boolean seenBefore(String id, String rssFeed) {
        Set<String> previousBatch = PREVIOUSLY_SEEN.get(rssFeed);
        if (previousBatch == null) {
            return false;
        }
        return previousBatch.contains(id);
    }

    /**
     * Registers a new rssConsumer to whom entries can be delivered.
     *
     * @param rssConsumer consumer that receives the batches
     */
    public void setConsumer(RssConsumer rssConsumer) {
        this.rssConsumer = rssConsumer;
    }

    /**
     * Manual smoke test: polls for about twenty seconds, then shuts down.
     *
     * @param args optional; {@code args[0]} overrides the default feed list location
     * @throws InterruptedException if the main thread is interrupted while sleeping or joining
     */
    public static void main(String[] args) throws InterruptedException {
        String urlsFile = args.length > 0
                ? args[0]
                : "/Users/renatomarroquin/Documents/workspace/workspaceApache/hello-samza/src/main/resources/rss.file";
        RssFeed feed = new RssFeed(urlsFile, 3000, 10000);
        feed.start(new RssConsumer("rss", feed, null));

        // Poll on a worker thread; polling on this thread would never return, so the
        // stop() call below could not be reached.
        Thread poller = new Thread(feed, "rss-feed-poller");
        poller.start();

        Thread.sleep(20000);
        feed.stop();
        // wake the poller if it is currently waiting between polls
        poller.interrupt();
        poller.join();
    }
}