org.gbif.pubindex.service.impl.FeedParserRome.java Source code

Introduction

Here is the source code for org.gbif.pubindex.service.impl.FeedParserRome.java
Source

/*
 * Copyright 2011 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.pubindex.service.impl;

import org.gbif.pubindex.service.FeedParser;
import org.gbif.pubindex.model.Article;
import org.gbif.pubindex.rome.modules.prism.PrismModule;
import org.gbif.pubindex.util.DoiUtils;

import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import com.google.inject.Inject;
import com.sun.syndication.feed.synd.SyndCategoryImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link FeedParser} implementation that converts the entries of a ROME {@link SyndFeed}
 * into {@link Article} objects.
 */
public class FeedParserRome implements FeedParser {
    private static final Logger LOG = LoggerFactory.getLogger(FeedParserRome.class);

    /** Separator placed between concatenated author / keyword values. */
    private static final String DELIMITER = "; ";

    /** Guids starting with this prefix are ignored in favour of the entry link. */
    private static final String UNSTABLE_GUID_PREFIX = "urn:uuid";

    @Inject
    public FeedParserRome() {
    }

    /**
     * Builds one {@link Article} per entry of the given feed.
     * The articles are only created in memory here; nothing is persisted or downloaded by this method.
     * Each article gets the given journal id, the current time as its creation date and the
     * feed title as the name of the publication it was published in.
     *
     * @param feed      the parsed feed whose entries should be converted
     * @param journalId id of the journal the feed belongs to, set on every returned article
     * @return the articles in feed order; empty if the feed has no entries
     */
    @Override
    public List<Article> buildArticles(SyndFeed feed, Integer journalId) {
        List<Article> articles = new ArrayList<Article>();

        List<?> entries = feed.getEntries();
        if (entries == null) {
            return articles;
        }

        String feedTitle = feed.getTitle();
        for (Object entry : entries) {
            Article a = handleFeedItem((SyndEntry) entry, feedTitle);
            a.setJournalId(journalId);
            a.setCreated(new Date());
            articles.add(a);
        }

        return articles;
    }

    /**
     * Converts a single feed entry into an article.
     *
     * @param item      the feed entry to convert
     * @param feedTitle title of the feed the entry came from, stored as the article's "published in" value
     * @return a new article, never null
     */
    private Article handleFeedItem(SyndEntry item, String feedTitle) {
        Article a = new Article();

        a.setTitle(StringUtils.trimToNull(item.getTitle()));
        a.setUrl(StringUtils.trimToNull(item.getLink()));
        // NOTE(review): authors are joined via toString(); presumably the list holds ROME person
        // objects for some feed types - confirm that their toString() yields a usable name.
        a.setAuthors(concat(item.getAuthors()));
        a.setKeywords(concatCategories(item.getCategories()));

        a.setPublishedDate(item.getPublishedDate());
        a.setPublishedIn(StringUtils.trimToNull(feedTitle));
        if (item.getDescription() != null) {
            a.setDescription(StringUtils.trimToNull(item.getDescription().getValue()));
        }

        String guid = StringUtils.trimToNull(item.getUri());
        // don't trust BioStor UUIDs as they change all the time
        if (guid == null || guid.startsWith(UNSTABLE_GUID_PREFIX)) {
            // if no (usable) guid was given use the link
            guid = StringUtils.trimToNull(item.getLink());
            if (guid == null) {
                // last resort: use the hash code of the entire item
                guid = String.valueOf(item.hashCode());
                LOG.debug("Feed entry has neither guid nor link, using hash code {} as guid", guid);
            }
        }
        a.setGuid(guid);
        // check if url or guid contains a doi
        a.setDoi(findDOI(guid, a.getUrl()));

        // a DOI given explicitly by the prism module wins, but a blank value must not
        // wipe out a DOI already found in the guid or url
        PrismModule prism = (PrismModule) item.getModule(PrismModule.URI);
        if (prism != null) {
            String prismDoi = StringUtils.trimToNull(prism.getDoi());
            if (prismDoi != null) {
                a.setDoi(prismDoi);
            }
        }
        return a;
    }

    /**
     * Joins the string representations of all given objects with "; ".
     * Null elements and elements with a null or blank string representation are skipped.
     *
     * @param objs the objects to join, may be null
     * @return the joined string or null if there was nothing to join
     */
    protected static String concat(List<?> objs) {
        if (objs == null) {
            return null;
        }
        StringBuilder joined = new StringBuilder();
        for (Object obj : objs) {
            if (obj != null) {
                appendDelimited(joined, obj.toString());
            }
        }
        return StringUtils.trimToNull(joined.toString());
    }

    /**
     * Joins the names of all given categories with "; ".
     * Null categories and categories without a (non blank) name are skipped, so a missing
     * name never shows up as the literal text "null" in the result.
     *
     * @param objs the categories to join, may be null
     * @return the joined category names or null if there was nothing to join
     */
    protected static String concatCategories(List<SyndCategoryImpl> objs) {
        if (objs == null) {
            return null;
        }
        StringBuilder joined = new StringBuilder();
        // iterate as Object: the list handed in by ROME is a raw list
        for (Object cat : objs) {
            if (cat != null) {
                appendDelimited(joined, ((SyndCategoryImpl) cat).getName());
            }
        }
        return StringUtils.trimToNull(joined.toString());
    }

    /**
     * Appends the trimmed value to the buffer, preceded by the delimiter if the buffer already has content.
     * Null or blank values are ignored.
     */
    private static void appendDelimited(StringBuilder buffer, String value) {
        String trimmed = StringUtils.trimToNull(value);
        if (trimmed == null) {
            return;
        }
        if (buffer.length() > 0) {
            buffer.append(DELIMITER);
        }
        buffer.append(trimmed);
    }

    /**
     * Returns the first DOI that {@link DoiUtils#extractDoi} finds in the given strings.
     * Examples of strings this is meant to be used with:
     * <pre>
     * http://dx.doi.org/10.1111%2Fj.1654-1103.2011.01343.x
     * doi:10.1111/j.1654-1103.2011.01343.x
     * 10.1111/j.1654-1103.2011.01343.x
     * </pre>
     *
     * @param text candidate strings, checked in the given order; null entries are skipped
     * @return the first DOI found or null if none of the candidates yields one
     */
    private String findDOI(String... text) {
        for (String candidate : text) {
            if (candidate == null) {
                continue;
            }
            String doi = DoiUtils.extractDoi(candidate);
            if (doi != null) {
                return doi;
            }
        }
        return null;
    }
}