Example usage for org.apache.commons.feedparser FeedParser parse

List of usage examples for org.apache.commons.feedparser FeedParser parse

Introduction

On this page you can find example usage of the org.apache.commons.feedparser FeedParser.parse method.

Prototype

public void parse(FeedParserListener listener, InputStream is, String resource) throws FeedParserException;

Source Link

Document

Parse this feed.

Usage

From source file: org.apache.commons.feedparser.example.HelloFeedParser.java

/**
 * Demo entry point: fetches an RSS feed and echoes each channel, item, and
 * creation date to stdout via FeedParser callbacks.
 *
 * @param args optional single argument: the feed URL to fetch
 *             (defaults to http://peerfear.org/rss/index.rss)
 * @throws Exception on any fetch or parse failure
 */
public static void main(String[] args) throws Exception {

    //create a new FeedParser...
    FeedParser parser = FeedParserFactory.newFeedParser();

    //create a listener for handling our callbacks
    FeedParserListener listener = new DefaultFeedParserListener() {

        public void onChannel(FeedParserState state, String title, String link, String description)
                throws FeedParserException {

            System.out.println("Found a new channel: " + title);

        }

        public void onItem(FeedParserState state, String title, String link, String description,
                String permalink) throws FeedParserException {

            System.out.println("Found a new published article: " + permalink);

        }

        public void onCreated(FeedParserState state, Date date) throws FeedParserException {
            System.out.println("Which was created on: " + date);
        }

    };

    //specify the feed we want to fetch; a single CLI argument overrides the default

    String resource = "http://peerfear.org/rss/index.rss";

    if (args.length == 1)
        resource = args[0];

    System.out.println("Fetching resource:" + resource);

    //use the FeedParser network IO package to fetch our resource URL
    ResourceRequest request = ResourceRequestFactory.getResourceRequest(resource);

    //grab our input stream; try-with-resources guarantees it is closed even
    //if parsing throws (the original version leaked it)
    try (InputStream is = request.getInputStream()) {
        //start parsing our feed and have the above onItem methods called
        parser.parse(listener, is, resource);
    }

}

From source file: org.apache.nutch.parse.rss.RSSParser.java

/**
 * <p>/*from  www  .  j  a  va2  s.  c om*/
 * Implementation method, parses the RSS content, and then returns a
 * {@link ParseImpl}.
 * </p>
 * 
 * @param content
 *            The content to parse (hopefully an RSS content stream)
 * @return A {@link ParseImpl}which implements the {@link Parse}interface.
 */
public ParseResult getParse(Content content) {

    List theRSSChannels = null;

    try {
        byte[] raw = content.getContent();

        // create a new FeedParser...
        FeedParser parser = FeedParserFactory.newFeedParser();

        // create a listener for handling our callbacks
        FeedParserListener listener = new FeedParserListenerImpl();

        // start parsing our feed and have the onItem methods called
        parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
                null);

        theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();

    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
            e.printStackTrace(LogUtil.getWarnStream(LOG));
            LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
        }
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as rss document. " + e)
                .getEmptyParseResult(content.getUrl(), getConf());
    }

    StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
    List theOutlinks = new Vector();

    // for us, the contentTitle will be a concatenation of the titles of the
    // RSS Channels that we've parsed
    // and the index text will be a concatenation of the RSS Channel
    // descriptions, and descriptions of the RSS Items in the channel

    // also get the outlinks

    if (theRSSChannels != null) {
        for (int i = 0; i < theRSSChannels.size(); i++) {
            RSSChannel r = (RSSChannel) theRSSChannels.get(i);
            contentTitle.append(r.getTitle());
            contentTitle.append(" ");

            // concat the description to the index text
            indexText.append(r.getDescription());
            indexText.append(" ");

            if (r.getLink() != null) {
                try {
                    // get the outlink
                    if (r.getDescription() != null) {
                        theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
                    } else {
                        theOutlinks.add(new Outlink(r.getLink(), ""));
                    }
                } catch (MalformedURLException e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("MalformedURL: " + r.getLink());
                        LOG.warn("Attempting to continue processing outlinks");
                        e.printStackTrace(LogUtil.getWarnStream(LOG));
                    }
                    continue;
                }
            }

            // now get the descriptions of all the underlying RSS Items and
            // then index them too
            for (int j = 0; j < r.getItems().size(); j++) {
                RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
                indexText.append(theRSSItem.getDescription());
                indexText.append(" ");

                String whichLink = null;

                if (theRSSItem.getPermalink() != null)
                    whichLink = theRSSItem.getPermalink();
                else
                    whichLink = theRSSItem.getLink();

                if (whichLink != null) {
                    try {
                        if (theRSSItem.getDescription() != null) {
                            theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
                        } else {
                            theOutlinks.add(new Outlink(whichLink, ""));
                        }
                    } catch (MalformedURLException e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("MalformedURL: " + whichLink);
                            LOG.warn("Attempting to continue processing outlinks");
                            e.printStackTrace(LogUtil.getWarnStream(LOG));
                        }
                        continue;
                    }
                }

            }

        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("nutch:parse-rss:getParse:indexText=" + indexText);
            LOG.trace("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
        }

    } else if (LOG.isTraceEnabled()) {
        LOG.trace("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
    }

    // format the outlinks
    Outlink[] outlinks = (Outlink[]) theOutlinks.toArray(new Outlink[theOutlinks.size()]);

    if (LOG.isTraceEnabled()) {
        LOG.trace("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks");
    }
    // if (LOG.isInfoEnabled()) {
    //   LOG.info("Outlinks: "+outlinks);
    // }

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, contentTitle.toString(), outlinks,
            content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
}

From source file: org.scify.NewSumServer.Server.Sources.RssParser.java

/**
 * Processes the feeds from the given URL string and adds them to a List
 * containing an {@link Article} for each item found.
 *
 * @param urlString the URL string to parse
 * @param sCategory The category that the specified URL is about
 * @throws NetworkException/*from  w  ww .  jav  a2  s  .  co  m*/
 * @throws IOException
 */
public void ProcessFeed(final String urlString, final String sCategory) throws NetworkException, IOException {

    //create a listener for handling our callbacks
    FeedParserListener listener;
    listener = new DefaultFeedParserListener() {
        @Override
        public void onItem(FeedParserState state, String title, String link, String description,
                String permalink) throws FeedParserException {
            // Use first 30 characters for title...
            if ((title == null) || (title.trim().length() == 0)) {
                title = description.substring(0, 30) + "...";
            }
            // TODO for later version
            // check if category is "" || "Top News" and if such, create
            // new UnlabeledArticle so that it gets category from the
            // classification Module.

            if (sCategory.equals(UNCLASSIFIED)) {
                // Initiate an Unlabeled Article (null Category) with boolean
                // toWrap = false, so that
                // it is not accessed by the classification trainer
                UnlabeledArticle tmpUnArt = new UnlabeledArticle(permalink, title.trim(), description, null,
                        urlString, false);
                //filter Article text
                tmpUnArt = (UnlabeledArticle) preProcessArticle(tmpUnArt, 9);
                // Add the Article found to the list, avoid possible duplicates
                if (tmpUnArt != null) {
                    Utilities.addItemToList(lsItems, tmpUnArt);
                }
                // Otherwise procceed normally with provided category
            } else {
                // Initiate a new article with toWrap = true,
                // so that it feeds the classification trainer
                Article tmpArt = new Article(permalink, title.trim(), description, sCategory, urlString, true);
                //filter article text
                tmpArt = preProcessArticle(tmpArt, 10);
                // Add the Article found to the list, avoid possible duplicates
                if (tmpArt != null) {
                    Utilities.addItemToList(lsItems, tmpArt);
                }
            }
        }

        @Override
        public void onCreated(FeedParserState state, Date date) throws FeedParserException {
            if (!lsItems.isEmpty()) {
                //Adding date to current Article -- Some feeds don't provide date
                Article tmpArt = lsItems.get(lsItems.size() - 1);
                tmpArt.setDate(date);
            }
        }
    };
    // debug
    //        System.out.println("Fetching resource: " + urlString);
    // debug
    //use the FeedParser network IO package to fetch our resource URL
    ResourceRequest request = ResourceRequestFactory.getResourceRequest(urlString);
    request.setRequestHeaderField("User-Agent",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0");
    FeedParser parser = null;
    try {
        // Grab input stream
        InputStream is = request.getInputStream();
        parser = FeedParserFactory.newFeedParser();
        parser.parse(listener, is, urlString);
    } catch (FeedParserException ex) {
        LOGGER.log(Level.WARNING, ex.getMessage(), ex);
    } catch (Exception ex) {
        LOGGER.log(Level.WARNING, ex.getMessage());
    }
}