org.apache.nutch.parse.feed.FeedParser.java Source code

Introduction

Here is the source code for org.apache.nutch.parse.feed.FeedParser.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse.feed;

// JDK imports
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

// APACHE imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.metadata.Feed;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.xml.sax.InputSource;

// ROME imports
import com.sun.syndication.feed.synd.SyndCategory;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndPerson;
import com.sun.syndication.io.SyndFeedInput;

/**
 * An RSS/Atom feed {@link Parser} that parses the feed itself and every entry
 * it references, using the ROME feed parsing library.
 * <p>
 * Each entry whose link survives URL normalization and filtering is added to
 * the returned {@link ParseResult} under that link; the feed is added under the
 * URL of the fetched {@link Content}.
 * </p>
 *
 * @author dogacan
 * @author mattmann
 * @since NUTCH-444
 */
public class FeedParser implements Parser {

    public static final String CHARSET_UTF8 = "charset=UTF-8";

    public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; " + CHARSET_UTF8;

    public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.feed");

    /**
     * Charset used to encode entry text before it is handed to a sub-parser. It
     * must agree with {@link #CHARSET_UTF8}, which is what the entry's
     * content-type always advertises.
     */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    private Configuration conf;

    private ParserFactory parserFactory;

    private URLNormalizers normalizers;

    private URLFilters filters;

    private String defaultEncoding;

    /**
     * Parses the given feed, then extracts and parses all linked items within
     * the feed, using the underlying ROME feed parsing library.
     * 
     * @param content
     *          A {@link Content} object representing the feed that is being
     *          parsed by this {@link Parser}.
     * 
     * @return A {@link ParseResult} containing one {@link Parse} per accepted
     *         feed entry plus one for the feed itself, or an empty parse result
     *         carrying the failure status if ROME could not read the feed.
     * 
     */
    public ParseResult getParse(Content content) {
        SyndFeed feed = null;
        ParseResult parseResult = new ParseResult(content.getUrl());

        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(content, true);
        String encoding = detector.guessEncoding(content, defaultEncoding);
        try {
            InputSource input = new InputSource(new ByteArrayInputStream(content.getContent()));
            input.setEncoding(encoding);
            SyndFeedInput feedInput = new SyndFeedInput();
            feed = feedInput.build(input);
        } catch (Exception e) {
            // return empty parse
            LOG.warn(
                    "Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
            return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
        }

        // The feed's own link is optional metadata: if it cannot be normalized
        // or is filtered out, entries are still parsed, just without it.
        String feedLink = feed.getLink();
        try {
            feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
            if (feedLink != null) {
                feedLink = filters.filter(feedLink);
            }
        } catch (Exception e) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Ignoring feed link of " + content.getUrl() + ": " + e);
            }
            feedLink = null;
        }

        for (Object o : feed.getEntries()) {
            addToMap(parseResult, feed, feedLink, (SyndEntry) o, content);
        }

        String feedDesc = stripTags(feed.getDescriptionEx());
        String feedTitle = stripTags(feed.getTitleEx());

        parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
                new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], content.getMetadata()));

        return parseResult;
    }

    /**
     * 
     * Sets the {@link Configuration} object for this {@link Parser}. This
     * {@link Parser} expects the following configuration properties to be set:
     * 
     * <ul>
     * <li>URLNormalizers - properties in the configuration object to set up the
     * default url normalizers.</li>
     * <li>URLFilters - properties in the configuration object to set up the
     * default url filters.</li>
     * </ul>
     * 
     * The optional <code>parser.character.encoding.default</code> property
     * supplies the fallback encoding and defaults to <code>windows-1252</code>.
     * 
     * @param conf
     *          The Hadoop {@link Configuration} object to use to configure this
     *          {@link Parser}.
     * 
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.parserFactory = new ParserFactory(conf);
        this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
        this.filters = new URLFilters(conf);
        this.defaultEncoding = conf.get("parser.character.encoding.default", "windows-1252");
    }

    /**
     * 
     * @return The {@link Configuration} object used to configure this
     *         {@link Parser}.
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Runs a command line version of this {@link Parser}: reads the feed file,
     * parses it as <code>application/rss+xml</code> and prints the key, data
     * and text of every resulting parse to standard output.
     * 
     * @param args
     *          A single argument (expected at arg[0]) representing a path on the
     *          local filesystem that points to a feed file.
     * 
     * @throws Exception
     *           If any error occurs.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.err.println("Usage: FeedParser <feed>");
            System.exit(1);
        }
        String name = args[0];
        String url = "file:" + name;
        Configuration conf = NutchConfiguration.create();
        FeedParser parser = new FeedParser();
        parser.setConf(conf);
        File file = new File(name);
        byte[] bytes = new byte[(int) file.length()];
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
            in.readFully(bytes);
        } finally {
            // release the file handle even if the read fails
            in.close();
        }
        ParseResult parseResult = parser
                .getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
        for (Entry<Text, Parse> entry : parseResult) {
            System.out.println("key: " + entry.getKey());
            Parse parse = entry.getValue();
            System.out.println("data: " + parse.getData());
            System.out.println("text: " + parse.getText() + "\n");
        }
    }

    /**
     * Parses a single feed entry and stores the outcome in
     * <code>parseResult</code> under the entry's normalized link.
     * <p>
     * Entries whose link cannot be normalized, or is rejected by the URL
     * filters, are skipped. The entry text (its description, or failing that
     * the concatenation of its content elements) is handed to the first parser
     * registered for the entry's content-type; if no parser is found, or the
     * parser produces nothing for the link, the raw text is stored with a
     * failure status instead.
     * </p>
     * 
     * @param parseResult
     *          The result the entry's parse is added to.
     * @param feed
     *          The feed the entry belongs to.
     * @param feedLink
     *          The normalized link of the feed, or <code>null</code> if none
     *          is available; recorded as the "feed" parse metadata when set.
     * @param entry
     *          The feed entry to parse.
     * @param content
     *          The fetched feed; its metadata is used (and modified) as the
     *          content metadata of the entry.
     */
    private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry,
            Content content) {
        String link = entry.getLink();
        try {
            link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);

            if (link != null) {
                link = filters.filter(link);
            }
        } catch (Exception e) {
            LOG.warn("Skipping feed entry, link could not be normalized or filtered: " + entry.getLink()
                    + ", exception: " + StringUtils.stringifyException(e));
            return;
        }

        if (link == null) {
            return;
        }

        String title = stripTags(entry.getTitleEx());
        Metadata parseMeta = new Metadata();
        // NOTE(review): this appears to be the feed Content's own Metadata
        // object, so the Content-Type set by addFields and removed below is
        // shared by all entries and by the feed-level ParseData - confirm
        // that this sharing is intended.
        Metadata contentMeta = content.getMetadata();

        if (feedLink != null) {
            parseMeta.set("feed", feedLink);
        }

        addFields(parseMeta, contentMeta, feed, entry);

        // some item descriptions contain markup text in them,
        // so we temporarily set their content-type to parse them
        // with another plugin
        String contentType = contentMeta.get(Response.CONTENT_TYPE);

        String text = null;
        SyndContent description = entry.getDescription();
        if (description != null) {
            text = description.getValue();
        }

        if (text == null) {
            StringBuilder buf = new StringBuilder();
            for (Object o : entry.getContents()) {
                String value = ((SyndContent) o).getValue();
                // a content element without a value must not contribute the
                // literal string "null"
                if (value != null) {
                    buf.append(value);
                }
            }
            text = buf.toString();
        }

        Parse parse = null;
        try {
            Parser parser = parserFactory.getParsers(contentType, link)[0];
            // The content-type always advertises charset=UTF-8, so the bytes
            // must be UTF-8 too rather than the platform default encoding.
            parse = parser.getParse(new Content(link, link, text.getBytes(UTF8), contentType, contentMeta, conf))
                    .get(link);
        } catch (ParserNotFound e) {
            // best effort: without a parser the raw text is stored below
            if (LOG.isDebugEnabled()) {
                LOG.debug("No parser found for content-type " + contentType + " of " + link);
            }
        }

        if (parse != null) {
            ParseData data = parse.getData();
            data.getContentMeta().remove(Response.CONTENT_TYPE);
            mergeMetadata(data.getParseMeta(), parseMeta);
            parseResult.put(link, new ParseText(parse.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, title,
                    data.getOutlinks(), data.getContentMeta(), data.getParseMeta()));
        } else {
            contentMeta.remove(Response.CONTENT_TYPE);
            parseResult.put(link, new ParseText(text),
                    new ParseData(ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, parseMeta));
        }

    }

    /**
     * Removes everything that looks like a markup tag (<code>&lt;...&gt;</code>)
     * from the value of the given content and trims the result.
     * 
     * @param c
     *          The content to strip; may be <code>null</code>.
     * 
     * @return The tag-free, trimmed text, or the empty string if the content
     *         or its value is <code>null</code>.
     */
    private static String stripTags(SyndContent c) {
        if (c == null) {
            return "";
        }

        String value = c.getValue();
        if (value == null) {
            return "";
        }

        return value.replaceAll("<[^>]*>", "").trim();
    }

    /**
     * Copies author, tag and date information of a feed entry into
     * <code>parseMeta</code> and sets the content-type of the entry's text in
     * <code>contentMeta</code>.
     * 
     * @param parseMeta
     *          Receives the {@link Feed} author, tags, published and updated
     *          fields.
     * @param contentMeta
     *          Receives the {@link Response#CONTENT_TYPE} of the entry text,
     *          always with {@link #CHARSET_UTF8} appended.
     * @param feed
     *          The feed the entry belongs to (currently unused).
     * @param entry
     *          The entry to read the fields from.
     */
    private void addFields(Metadata parseMeta, Metadata contentMeta, SyndFeed feed, SyndEntry entry) {
        List authors = entry.getAuthors();
        if (authors != null && !authors.isEmpty()) {
            for (Object o : authors) {
                String authorName = ((SyndPerson) o).getName();
                if (checkString(authorName)) {
                    parseMeta.add(Feed.FEED_AUTHOR, authorName);
                }
            }
        } else {
            // getAuthors may return null (or nothing) if feed is non-atom
            // if so, call getAuthor to get Dublin Core module creator.
            String authorName = entry.getAuthor();
            if (checkString(authorName)) {
                parseMeta.set(Feed.FEED_AUTHOR, authorName);
            }
        }

        List categories = entry.getCategories();
        if (categories != null) {
            for (Object o : categories) {
                String tag = ((SyndCategory) o).getName();
                // unnamed categories carry no information worth indexing
                if (checkString(tag)) {
                    parseMeta.add(Feed.FEED_TAGS, tag);
                }
            }
        }

        Date published = entry.getPublishedDate();
        if (published != null) {
            parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
        }
        Date updated = entry.getUpdatedDate();
        if (updated != null) {
            parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
        }

        String contentType = null;
        SyndContent description = entry.getDescription();
        if (description != null) {
            contentType = description.getType();
        } else {
            // TODO: What to do if contents.size() > 1?
            List contents = entry.getContents();
            if (!contents.isEmpty()) {
                contentType = ((SyndContent) contents.get(0)).getType();
            }
        }

        if (checkString(contentType)) {
            // ROME may return content-type as html
            if (contentType.equals("html")) {
                contentType = "text/html";
            } else if (contentType.equals("xhtml")) {
                // NOTE(review): "text/xhtml" is not a registered MIME type
                // ("application/xhtml+xml" is); kept as-is because changing it
                // would change which parser is selected - confirm intent.
                contentType = "text/xhtml";
            }
            contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
        } else {
            contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
        }

    }

    /**
     * Adds every name/value pair of <code>second</code> to <code>first</code>,
     * keeping the values <code>first</code> already holds.
     * 
     * @param first
     *          The metadata that receives the values.
     * @param second
     *          The metadata whose values are copied.
     */
    private void mergeMetadata(Metadata first, Metadata second) {
        for (String name : second.names()) {
            for (String value : second.getValues(name)) {
                first.add(name, value);
            }
        }
    }

    /**
     * @param s
     *          The string to check; may be <code>null</code>.
     * 
     * @return <code>true</code> if the string is neither <code>null</code> nor
     *         empty.
     */
    private boolean checkString(String s) {
        return s != null && s.length() > 0;
    }

}