List of usage examples for the parse method of org.apache.commons.feedparser.FeedParser
/**
 * Parses the feed content read from {@code is}, reporting what it finds through
 * callbacks on {@code listener} (the examples on this page receive
 * {@code onChannel}, {@code onItem} and {@code onCreated} calls).
 *
 * @param listener the callback handler notified while the feed is parsed
 * @param is       the stream holding the raw feed content
 * @param resource identifier of the feed being parsed; callers on this page pass the
 *                 feed URL, and one caller passes {@code null}
 *                 (NOTE(review): confirm how the parser itself uses this value)
 * @throws FeedParserException presumably when the content cannot be parsed as a
 *                             feed; confirm against the implementation
 */
public void parse(FeedParserListener listener, InputStream is, String resource) throws FeedParserException;
From source file:org.apache.commons.feedparser.example.HelloFeedParser.java
public static void main(String[] args) throws Exception { //create a new FeedParser... FeedParser parser = FeedParserFactory.newFeedParser(); //create a listener for handling our callbacks FeedParserListener listener = new DefaultFeedParserListener() { public void onChannel(FeedParserState state, String title, String link, String description) throws FeedParserException { System.out.println("Found a new channel: " + title); }/*from ww w . jav a 2 s . c o m*/ public void onItem(FeedParserState state, String title, String link, String description, String permalink) throws FeedParserException { System.out.println("Found a new published article: " + permalink); } public void onCreated(FeedParserState state, Date date) throws FeedParserException { System.out.println("Which was created on: " + date); } }; //specify the feed we want to fetch String resource = "http://peerfear.org/rss/index.rss"; if (args.length == 1) resource = args[0]; System.out.println("Fetching resource:" + resource); //use the FeedParser network IO package to fetch our resource URL ResourceRequest request = ResourceRequestFactory.getResourceRequest(resource); //grab our input stream InputStream is = request.getInputStream(); //start parsing our feed and have the above onItem methods called parser.parse(listener, is, resource); }
From source file:org.apache.nutch.parse.rss.RSSParser.java
/** * <p>/*from www . j a va2 s. c om*/ * Implementation method, parses the RSS content, and then returns a * {@link ParseImpl}. * </p> * * @param content * The content to parse (hopefully an RSS content stream) * @return A {@link ParseImpl}which implements the {@link Parse}interface. */ public ParseResult getParse(Content content) { List theRSSChannels = null; try { byte[] raw = content.getContent(); // create a new FeedParser... FeedParser parser = FeedParserFactory.newFeedParser(); // create a listener for handling our callbacks FeedParserListener listener = new FeedParserListenerImpl(); // start parsing our feed and have the onItem methods called parser.parse(listener, new ByteArrayInputStream(raw), /* resource */ null); theRSSChannels = ((FeedParserListenerImpl) listener).getChannels(); } catch (Exception e) { // run time exception if (LOG.isWarnEnabled()) { e.printStackTrace(LogUtil.getWarnStream(LOG)); LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage()); } return new ParseStatus(ParseStatus.FAILED, "Can't be handled as rss document. 
" + e) .getEmptyParseResult(content.getUrl(), getConf()); } StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer(); List theOutlinks = new Vector(); // for us, the contentTitle will be a concatenation of the titles of the // RSS Channels that we've parsed // and the index text will be a concatenation of the RSS Channel // descriptions, and descriptions of the RSS Items in the channel // also get the outlinks if (theRSSChannels != null) { for (int i = 0; i < theRSSChannels.size(); i++) { RSSChannel r = (RSSChannel) theRSSChannels.get(i); contentTitle.append(r.getTitle()); contentTitle.append(" "); // concat the description to the index text indexText.append(r.getDescription()); indexText.append(" "); if (r.getLink() != null) { try { // get the outlink if (r.getDescription() != null) { theOutlinks.add(new Outlink(r.getLink(), r.getDescription())); } else { theOutlinks.add(new Outlink(r.getLink(), "")); } } catch (MalformedURLException e) { if (LOG.isWarnEnabled()) { LOG.warn("MalformedURL: " + r.getLink()); LOG.warn("Attempting to continue processing outlinks"); e.printStackTrace(LogUtil.getWarnStream(LOG)); } continue; } } // now get the descriptions of all the underlying RSS Items and // then index them too for (int j = 0; j < r.getItems().size(); j++) { RSSItem theRSSItem = (RSSItem) r.getItems().get(j); indexText.append(theRSSItem.getDescription()); indexText.append(" "); String whichLink = null; if (theRSSItem.getPermalink() != null) whichLink = theRSSItem.getPermalink(); else whichLink = theRSSItem.getLink(); if (whichLink != null) { try { if (theRSSItem.getDescription() != null) { theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription())); } else { theOutlinks.add(new Outlink(whichLink, "")); } } catch (MalformedURLException e) { if (LOG.isWarnEnabled()) { LOG.warn("MalformedURL: " + whichLink); LOG.warn("Attempting to continue processing outlinks"); e.printStackTrace(LogUtil.getWarnStream(LOG)); } continue; } } } } if 
(LOG.isTraceEnabled()) { LOG.trace("nutch:parse-rss:getParse:indexText=" + indexText); LOG.trace("nutch:parse-rss:getParse:contentTitle=" + contentTitle); } } else if (LOG.isTraceEnabled()) { LOG.trace("nutch:parse-rss:Error:getParse: No RSS Channels recorded!"); } // format the outlinks Outlink[] outlinks = (Outlink[]) theOutlinks.toArray(new Outlink[theOutlinks.size()]); if (LOG.isTraceEnabled()) { LOG.trace("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks"); } // if (LOG.isInfoEnabled()) { // LOG.info("Outlinks: "+outlinks); // } ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, contentTitle.toString(), outlinks, content.getMetadata()); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData)); }
From source file:org.scify.NewSumServer.Server.Sources.RssParser.java
/** * Processes the feeds from the given URL string and adds them to a List * containing an {@link Article} for each item found. * * @param urlString the URL string to parse * @param sCategory The category that the specified URL is about * @throws NetworkException/*from w ww . jav a2 s . co m*/ * @throws IOException */ public void ProcessFeed(final String urlString, final String sCategory) throws NetworkException, IOException { //create a listener for handling our callbacks FeedParserListener listener; listener = new DefaultFeedParserListener() { @Override public void onItem(FeedParserState state, String title, String link, String description, String permalink) throws FeedParserException { // Use first 30 characters for title... if ((title == null) || (title.trim().length() == 0)) { title = description.substring(0, 30) + "..."; } // TODO for later version // check if category is "" || "Top News" and if such, create // new UnlabeledArticle so that it gets category from the // classification Module. 
if (sCategory.equals(UNCLASSIFIED)) { // Initiate an Unlabeled Article (null Category) with boolean // toWrap = false, so that // it is not accessed by the classification trainer UnlabeledArticle tmpUnArt = new UnlabeledArticle(permalink, title.trim(), description, null, urlString, false); //filter Article text tmpUnArt = (UnlabeledArticle) preProcessArticle(tmpUnArt, 9); // Add the Article found to the list, avoid possible duplicates if (tmpUnArt != null) { Utilities.addItemToList(lsItems, tmpUnArt); } // Otherwise procceed normally with provided category } else { // Initiate a new article with toWrap = true, // so that it feeds the classification trainer Article tmpArt = new Article(permalink, title.trim(), description, sCategory, urlString, true); //filter article text tmpArt = preProcessArticle(tmpArt, 10); // Add the Article found to the list, avoid possible duplicates if (tmpArt != null) { Utilities.addItemToList(lsItems, tmpArt); } } } @Override public void onCreated(FeedParserState state, Date date) throws FeedParserException { if (!lsItems.isEmpty()) { //Adding date to current Article -- Some feeds don't provide date Article tmpArt = lsItems.get(lsItems.size() - 1); tmpArt.setDate(date); } } }; // debug // System.out.println("Fetching resource: " + urlString); // debug //use the FeedParser network IO package to fetch our resource URL ResourceRequest request = ResourceRequestFactory.getResourceRequest(urlString); request.setRequestHeaderField("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"); FeedParser parser = null; try { // Grab input stream InputStream is = request.getInputStream(); parser = FeedParserFactory.newFeedParser(); parser.parse(listener, is, urlString); } catch (FeedParserException ex) { LOGGER.log(Level.WARNING, ex.getMessage(), ex); } catch (Exception ex) { LOGGER.log(Level.WARNING, ex.getMessage()); } }