com.seajas.search.contender.service.enricher.EnricherService.java Source code

Introduction

Here is the source code for com.seajas.search.contender.service.enricher.EnricherService.java
Source

/**
 * Copyright (C) 2013 Seajas, the Netherlands.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.seajas.search.contender.service.enricher;

import com.seajas.search.bridge.contender.metadata.SeajasModule;
import com.seajas.search.bridge.jms.model.CompositeEntry;
import com.seajas.search.bridge.jms.model.EnricherDocument;
import com.seajas.search.bridge.jms.model.Feed;
import com.seajas.search.bridge.jms.model.ModifiedContent;
import com.seajas.search.bridge.jms.model.OriginalContent;
import com.seajas.search.bridge.jms.model.Source;
import com.seajas.search.bridge.jms.model.SourceElement;
import com.seajas.search.contender.replication.TaxonomyCache;
import com.seajas.search.contender.service.ContenderService;
import com.sun.syndication.feed.module.DCModule;
import com.sun.syndication.feed.module.Module;
import com.sun.syndication.feed.module.georss.GeoRSSModule;
import com.sun.syndication.feed.module.mediarss.MediaEntryModule;
import com.sun.syndication.feed.module.mediarss.MediaModule;
import com.sun.syndication.feed.module.mediarss.types.MediaContent;
import com.sun.syndication.feed.module.mediarss.types.MediaGroup;
import com.sun.syndication.feed.synd.SyndCategory;
import com.sun.syndication.feed.synd.SyndEnclosure;
import com.sun.syndication.feed.synd.SyndEntry;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

/**
 * The enricher service used to communicate with the enricher.
 * 
 * @author Jasper van Veghel <jasper@seajas.com>
 */
@Service
public class EnricherService {
    /**
     * The static logger shared by all instances of this service.
     */
    private static final Logger logger = LoggerFactory.getLogger(EnricherService.class);

    /**
     * Encoding marker for which no ReferenceEncoding parameter is added when sending an envelope.
     */
    private static final String ENCODING_UNKNOWN = "UNKNOWN";

    /**
     * The contender service, used here to obtain media notification URLs for images and thumbnails.
     */
    @Autowired
    private ContenderService contenderService;

    /**
     * The taxonomy cache, used to look up (or create unassigned) taxonomy identifiers for a given match.
     */
    @Autowired
    private TaxonomyCache taxonomyCache;

    /**
     * The enricher HTTP client, used to execute the import, commit and delete requests.
     */
    @Autowired
    @Qualifier("enricherHttpClient")
    private HttpClient httpClient;

    /**
     * The search enricher URL against which all enricher requests are made.
     */
    @Value("${contender.project.search.enricher.url}")
    private String searchEnricherUrl;

    /**
     * List of admissible hostnames for GeoRSS fields (or empty / * for all).
     */
    private List<String> geoRssInclusions;

    /**
     * The thumbnail enclosures, compared against the type of an entry's media contents and enclosures.
     */
    private List<String> thumbnailEnclosures;

    /**
     * Default (no-argument) constructor.
     *
     * Performs no initialization; the thumbnail enclosure and GeoRSS inclusion lists are not assigned here.
     */
    public EnricherService() {
        // Intentionally left empty
    }

    /**
     * Default constructor.
     * 
     * @param thumbnailEnclosures
     * @param geoRssInclusions
     */
    @Autowired
    public EnricherService(
            @Value("${contender.project.rss.reader.thumbnail.enclosures}") final String thumbnailEnclosures,
            @Value("${contender.project.search.enricher.georss.included.hosts}") final String geoRssInclusions) {
        this.thumbnailEnclosures = Arrays
                .asList(StringUtils.tokenizeToStringArray(thumbnailEnclosures, ",", true, true));
        this.geoRssInclusions = Arrays.asList(StringUtils.tokenizeToStringArray(geoRssInclusions, ",", true, true));
    }

    /**
     * Create a new envelope request and return its contents.
     *
     * The envelope contains a single document whose fetch URL is <code>data://</code> followed by the entry ID.
     *
     * @param jobName the job name (not used while building the envelope)
     * @param entry the entry whose ID is referenced from the envelope
     * @return String the envelope XML
     */
    public String createEnvelope(final String jobName, final CompositeEntry entry) {
        // The buffer never leaves this method, so the unsynchronized StringBuilder suffices

        StringBuilder envelopeRequest = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n\n");

        envelopeRequest.append("<autn:import xmlns:autn=\"http://schemas.autonomy.com/aci/\">\n");
        envelopeRequest.append("\t<autn:envelope>\n");

        // NOTE: Content is stored elsewhere, and so we don't create a <stubIdx> for it

        // NOTE(review): the ID is written into the attribute without XML escaping - confirm IDs never contain markup

        envelopeRequest.append("\t\t<autn:document xmlns:autn=\"http://schemas.autonomy.com/aci/\">\n");
        envelopeRequest.append("\t\t\t<autn:fetch url=\"data://" + entry.getId() + "\" />\n");
        envelopeRequest.append("\t\t</autn:document>\n");

        envelopeRequest.append("\t</autn:envelope>\n");
        envelopeRequest.append("</autn:import>\n");

        return envelopeRequest.toString();
    }

    /**
     * Send the actual envelope content to the enricher.
     *
     * The envelope is POSTed as a URL-encoded form (action <code>ImportEnvelope</code>) to the search enricher URL.
     *
     * @param jobName the job name to import the envelope under
     * @param encoding the reference encoding; omitted from the request when null or unknown
     * @param request the envelope XML
     * @return boolean true if the enricher responded with HTTP 200, false on any other status or failure
     */
    public boolean sendEnvelope(final String jobName, final String encoding, final String request) {
        List<NameValuePair> importParameters = new ArrayList<NameValuePair>();

        importParameters.add(new BasicNameValuePair("action", "ImportEnvelope"));
        importParameters.add(new BasicNameValuePair("JobName", jobName));

        if (encoding != null && !encoding.equals(ENCODING_UNKNOWN))
            importParameters.add(new BasicNameValuePair("ReferenceEncoding", encoding));

        importParameters.add(new BasicNameValuePair("EnvelopeXML", request));

        HttpPost importMethod = new HttpPost(searchEnricherUrl);

        try {
            importMethod.setEntity(new UrlEncodedFormEntity(importParameters, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            logger.error("Unsupported encoding while encoding the URL form entities", e);

            // Without an entity the POST would carry no envelope at all, so don't send it

            return false;
        }

        // Set the form encoding to match that of the Search Enricher

        importMethod.addHeader("Content-Type", PostMethod.FORM_URL_ENCODED_CONTENT_TYPE + ";charset=UTF-8");

        boolean result = true;

        try {
            if (logger.isTraceEnabled())
                logger.trace("Executing the requested envelope post");

            // Send the import request

            HttpResponse importStatus = httpClient.execute(importMethod);

            if (importStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                logger.error("EnvelopeXML failed to successfully POST, response: " + importStatus.getStatusLine());

                result = false;
            } else if (logger.isDebugEnabled())
                logger.debug("EnvelopeXML succesfully POSTed, response: " + importStatus.getStatusLine());
        } catch (HttpException e) {
            logger.error("HTTP execution exception for " + searchEnricherUrl, e);

            result = false;
        } catch (IOException e) {
            logger.error("Resource execution exception for " + searchEnricherUrl, e);

            result = false;
        } finally {
            // The request is aborted regardless of the outcome

            importMethod.abort();
        }

        return result;
    }

    /**
     * Build an enricher document out of the given composite entry.
     *
     * The document's ID and reference are taken from the original content URI when present, otherwise from the
     * entry link, otherwise from the entry URI. Taxonomy, date, feed metadata, GeoRSS and thumbnail fields are
     * filled in afterwards, followed by the source's result parameters as additional fields.
     *
     * @param jobName the job name, passed on to the taxonomy handling
     * @param entry the composite entry to convert
     * @return EnricherDocument
     */
    public EnricherDocument createEnricherDocument(final String jobName, final CompositeEntry entry) {
        final EnricherDocument document = new EnricherDocument();

        // Unpack the composite entry

        final Source source = entry.getSource();
        final SourceElement element = entry.getElement();
        final OriginalContent originalContent = entry.getOriginalContent();
        final ModifiedContent modifiedContent = entry.getModifiedContent();
        final SyndEntry syndEntry = element.getEntry();

        // Determine the reference: final URL first, then the entry link, then the entry URI

        final String reference;

        if (originalContent.getUri() != null)
            reference = originalContent.getUri().toString();
        else if (StringUtils.hasText(syndEntry.getLink()))
            reference = syndEntry.getLink();
        else
            reference = syndEntry.getUri();

        // The reference doubles as the document ID

        document.setId(reference);

        // Use the entry title, or else whatever follows the last path separator of the reference

        if (StringUtils.hasText(syndEntry.getTitle()))
            document.setTitle(syndEntry.getTitle());
        else {
            final int lastSeparator = reference.lastIndexOf('/');

            if (lastSeparator != -1) {
                logger.warn("No entry title given - deriving it from the reference");

                document.setTitle(reference.substring(lastSeparator + 1));
            }
        }

        // Taxonomy fields

        handleTaxonomy(jobName, document, originalContent.getHostname(), syndEntry);

        // Remaining Dublin Core style metadata

        document.setReference(reference);

        if (StringUtils.hasText(syndEntry.getAuthor()))
            document.setAuthor(syndEntry.getAuthor());

        // The submission date comes from the modified content when there is any

        final Date dateSubmitted;

        if (modifiedContent != null)
            dateSubmitted = modifiedContent.getDateSubmitted();
        else
            dateSubmitted = originalContent.getDateSubmitted();

        handleDatesAndTimes(dateSubmitted, document, syndEntry);

        // Feed ID

        document.setFeedId(source.getId());

        // Feed-provided metadata; only feeds can carry a language override

        String languageOverride = null;

        if (source instanceof Feed)
            languageOverride = ((Feed) source).getLanguageOverride();

        handleFeedMetadata(document, syndEntry, reference, languageOverride);

        // GeoRSS metadata

        handleGeoRss(document, syndEntry, originalContent.getHostname());

        // Thumbnail image

        handleImageThumbnail(document, syndEntry, reference);

        // Result parameters of the source become additional fields

        final Map<String, String> additionalFields = new HashMap<String, String>();

        if (source.getResultParameters() != null)
            additionalFields.putAll(source.getResultParameters());

        document.setAdditionalFields(additionalFields);

        return document;
    }

    /**
     * Handle the taxonomy-related envelope fields.
     *
     * The taxonomy match is derived from the entry's categories or the hostname, stored (trimmed and lower-cased)
     * as the taxonomy host, and resolved to identifiers through the taxonomy cache. When the cache knows no
     * identifiers for the match, an unassigned entry is requested for it.
     *
     * @param jobName the job name under which a new unassigned taxonomy entry is added, if needed
     * @param document the document to populate
     * @param hostname the hostname to fall back to when the entry provides no taxonomy URI
     * @param entry the syndication entry
     */
    private void handleTaxonomy(final String jobName, final EnricherDocument document, final String hostname,
            final SyndEntry entry) {
        String taxonomyMatch = getTaxonomyMatch(hostname, entry);

        if (logger.isTraceEnabled())
            logger.trace(
                    String.format("Retrieved taxonomy match '%s' - will retrieve identifiers next", taxonomyMatch));

        // Lower-case independently of the default locale, so hostnames don't change under e.g. a Turkish locale

        document.setTaxonomyHost(taxonomyMatch.trim().toLowerCase(Locale.ROOT));

        List<Integer> taxonomyIdentifiers = taxonomyCache.getIdsByMatch(taxonomyMatch);

        // Check if we need to create a new unassigned taxonomy entry

        if (taxonomyIdentifiers == null)
            taxonomyIdentifiers = taxonomyCache.addToUnassigned(jobName, taxonomyMatch);

        if (taxonomyIdentifiers == null) {
            logger.warn(String.format(
                    "Could not associate the given match '%s' with any known taxonomy identifiers, nor could a new one be created",
                    taxonomyMatch));

            return;
        }

        if (logger.isTraceEnabled())
            logger.trace(String.format("Retrieved (either fresh or cached) %d matching identifiers",
                    taxonomyIdentifiers.size()));

        // Build the prefixed identifiers first, then hand the complete list to the document

        List<String> taxonomyIds = new ArrayList<String>(taxonomyIdentifiers.size());

        for (Integer taxonomyIdentifier : taxonomyIdentifiers)
            taxonomyIds.add(taxonomyCache.getFieldPrefix() + taxonomyIdentifier);

        document.setTaxonomyIds(taxonomyIds);
    }

    /**
     * Fill in the created, modified and submitted dates of the document.
     *
     * The entry's published and updated dates are used for created and modified respectively; whichever is
     * missing is replaced by the submission date.
     *
     * @param dateSubmitted the submission date, also used as the fallback
     * @param document the document to populate
     * @param entry the syndication entry
     */
    private void handleDatesAndTimes(final Date dateSubmitted, final EnricherDocument document,
            final SyndEntry entry) {
        final Date published = entry.getPublishedDate();
        final Date updated = entry.getUpdatedDate();

        document.setCreated(published != null ? published : dateSubmitted);
        document.setModified(updated != null ? updated : dateSubmitted);
        document.setSubmitted(dateSubmitted);
    }

    /**
     * Handle the feed metadata related envelope fields, covering both the Seajas and the Dublin Core module.
     *
     * The Dublin Core values are read from the Seajas module when that module is itself a Dublin Core module,
     * and from the entry's regular Dublin Core module otherwise.
     *
     * @param document the document to populate
     * @param entry the syndication entry
     * @param reference the document reference
     * @param languageOverride the language to use instead of the Dublin Core language, or null for none
     */
    private void handleFeedMetadata(final EnricherDocument document, final SyndEntry entry, final String reference,
            final String languageOverride) {
        final Module seajasModule = entry.getModule(SeajasModule.URI);

        // instanceof is false for null, so this covers the absent-module case as well

        final Module dcModule = seajasModule instanceof DCModule ? seajasModule
                : entry.getModule("http://purl.org/dc/elements/1.1/");

        if (seajasModule != null)
            handleFeedMetadataInternal(document, (SeajasModule) seajasModule, reference);

        if (dcModule != null)
            handleFeedMetadataDublinCore(document, (DCModule) dcModule, reference, languageOverride);
    }

    /**
     * Copy the Seajas module values onto the document.
     *
     * Image, thumbnail and profile image URLs are first passed through the contender service's media
     * notification and only the URLs it returns are stored; the remaining values are copied as-is.
     *
     * @param document the document to populate
     * @param module the Seajas module to read from
     * @param reference the document reference, used as part of the media notification context
     */
    private void handleFeedMetadataInternal(final EnricherDocument document, final SeajasModule module,
            final String reference) {
        // Supporting content

        if (module.getSupportContent() != null)
            document.setSupportContent(module.getSupportContent());

        // Linked images

        final boolean hasLinkedImages = module.getLinkedImageUrls() != null
                && !module.getLinkedImageUrls().isEmpty();

        if (hasLinkedImages) {
            final List<String> imageUrls = contenderService.getMediaNotificationUrls(module.getLinkedImageUrls(),
                    buildContext(document, reference), "image");

            if (imageUrls != null) {
                if (document.getLinkedImageUrls() == null)
                    document.setLinkedImageUrls(new ArrayList<String>());

                document.getLinkedImageUrls().addAll(imageUrls);
            }
        }

        // Linked video and subtitles

        if (module.getLinkedVideoUrl() != null)
            document.setLinkedVideoUrl(module.getLinkedVideoUrl());

        if (module.getLinkedSubtitlesUrl() != null)
            document.setLinkedSubtitlesUrl(module.getLinkedSubtitlesUrl());

        // Republication count and author name

        if (module.getRepublicationCount() != null)
            document.setRepublicationCount(module.getRepublicationCount());

        if (StringUtils.hasText(module.getAuthorName()))
            document.setAuthorName(module.getAuthorName());

        // Thumbnails

        final boolean hasThumbnails = module.getThumbnailUrls() != null && !module.getThumbnailUrls().isEmpty();

        if (hasThumbnails) {
            final List<String> thumbnailUrls = contenderService.getMediaNotificationUrls(module.getThumbnailUrls(),
                    buildContext(document, reference), "image");

            if (thumbnailUrls != null) {
                if (document.getThumbnailUrls() == null)
                    document.setThumbnailUrls(new ArrayList<String>());

                document.getThumbnailUrls().addAll(thumbnailUrls);
            }
        }

        // Profile image

        if (StringUtils.hasText(module.getProfileImageUrl())) {
            final String profileImageUrl = contenderService.getMediaNotificationUrl(module.getProfileImageUrl(),
                    buildContext(document, reference), "image");

            if (profileImageUrl != null)
                document.setProfileImageUrl(profileImageUrl);
        }
    }

    /**
     * Assemble the context entries passed along with a media notification.
     *
     * The context always holds a <code>referrer=</code> entry, and a <code>title=</code> entry when the document
     * has a non-empty title.
     *
     * @param document the document to take the title from
     * @param reference the document reference
     * @return List<String>
     */
    private List<String> buildContext(final EnricherDocument document, final String reference) {
        final List<String> context = new ArrayList<String>();

        context.add("referrer=" + reference);

        if (StringUtils.isEmpty(document.getTitle()))
            return context;

        context.add("title=" + document.getTitle());

        return context;
    }

    /**
     * Handle the Dublin Core related envelope fields.
     *
     * The source ID defaults to the reference, and is only replaced by the Dublin Core source when exactly one
     * source is given and it parses as a URL.
     *
     * @param document the document to populate
     * @param module the Dublin Core module to read from
     * @param reference the document reference, used as the default source ID
     * @param languageOverride the language to use instead of the module's language, or null for none
     */
    private void handleFeedMetadataDublinCore(final EnricherDocument document, final DCModule module,
            final String reference, final String languageOverride) {
        document.setLanguage(languageOverride != null ? languageOverride : module.getLanguage());

        document.setPublishers(module.getPublishers());
        document.setContributors(module.getContributors());
        document.setRights(module.getRightsList());
        document.setTypes(module.getTypes());
        document.setCreators(module.getCreators());

        // Only use this if it contains something substantive (no 'html' or 'text' types)

        final String format = module.getFormat();

        if (StringUtils.hasText(format) && !format.contains("html") && !format.contains("text"))
            document.setFormat(format);

        // Always fall back to the reference for the sourceId

        document.setSourceId(reference);

        // Treat dcterms_source as a special case; it's only valid if only one entry is provided which is a valid URL

        final List<?> sources = module.getSources();

        if (sources != null && sources.size() == 1) {
            String potentialSource = sources.get(0).toString();

            try {
                // Constructed purely to validate the value; the URL object itself is not needed

                new URL(potentialSource);

                document.setSourceId(potentialSource);
            } catch (MalformedURLException e) {
                if (logger.isDebugEnabled())
                    logger.debug(
                            String.format("Not setting enricher document source to '%s' as it's not a valid URL",
                                    potentialSource));
            }
        }
    }

    /**
     * Add the entry's image thumbnail, if it has one, to the document's thumbnail URLs.
     *
     * The thumbnail is passed through the contender service's media notification, and the URL it returns is the
     * one that gets stored.
     *
     * @param document the document to populate
     * @param entry the syndication entry
     * @param reference the document reference, used as part of the media notification context
     */
    private void handleImageThumbnail(final EnricherDocument document, final SyndEntry entry,
            final String reference) {
        final String thumbnail = extractImageThumbnail(entry);

        if (thumbnail == null)
            return;

        final String notifiedUrl = contenderService.getMediaNotificationUrl(thumbnail,
                buildContext(document, reference), "image");

        if (notifiedUrl == null)
            return;

        if (document.getThumbnailUrls() == null)
            document.setThumbnailUrls(new ArrayList<String>());

        document.getThumbnailUrls().add(notifiedUrl);
    }

    /**
     * Set the document's geo point from the entry's GeoRSS module, when the hostname is admissible.
     *
     * The W3C Geo module is looked up first, then the GeoRSS one. A hostname is admissible when the inclusion
     * list is empty, contains <code>*</code>, or contains the hostname itself.
     *
     * @param document the document to populate
     * @param entry the syndication entry
     * @param hostname the hostname to check against the GeoRSS inclusions
     */
    private void handleGeoRss(final EnricherDocument document, final SyndEntry entry, final String hostname) {
        Module geoModule = entry.getModule(GeoRSSModule.GEORSS_W3CGEO_URI);

        if (geoModule == null)
            geoModule = entry.getModule(GeoRSSModule.GEORSS_GEORSS_URI);

        if (geoModule == null)
            return;

        final boolean hostIncluded = geoRssInclusions.isEmpty() || geoRssInclusions.contains("*")
                || geoRssInclusions.contains(hostname);

        if (!hostIncluded) {
            if (logger.isDebugEnabled())
                logger.debug(String.format(
                        "Geo-RSS value provided, but hostname '%s' not contained within GeoRSS inclusion-hosts list",
                        hostname));

            return;
        }

        if (logger.isDebugEnabled())
            logger.debug(String.format(
                    "Geo-RSS module detected for hostname '%s' - which falls within the set inclusions",
                    hostname));

        final GeoRSSModule geoRss = (GeoRSSModule) geoModule;

        if (geoRss.getPosition() == null)
            return;

        if (logger.isDebugEnabled())
            logger.debug(String.format("Adding geo_point field for hostname '%s'", hostname));

        // The geo point is stored as "latitude,longitude"

        document.setGeoPoint(geoRss.getPosition().getLatitude() + "," + geoRss.getPosition().getLongitude());
    }

    /**
     * Extract an image thumbnail from the given syndication entry.
     *
     * Nothing is extracted when the entry has no enclosures or no thumbnail enclosure types are configured.
     * Otherwise the first media content with a thumbnail type is preferred, followed by the first enclosure
     * with a thumbnail type.
     *
     * @param entry the syndication entry
     * @return String the thumbnail URL, or null if none was found
     */
    private String extractImageThumbnail(final SyndEntry entry) {
        if (entry.getEnclosures().isEmpty() || thumbnailEnclosures.isEmpty())
            return null;

        // Prefer the explicit media types

        String imageThumbnail = findMediaContentThumbnail(entry.getModule(MediaModule.URI));

        // Then search through the thumbnail enclosures

        return imageThumbnail != null ? imageThumbnail : findEnclosureThumbnail(entry);
    }

    /**
     * Find the reference of the first media content whose type is one of the thumbnail enclosure types.
     *
     * @param mediaModule the entry's media module, possibly null
     * @return String the reference as a string, or null if there is no matching media content
     */
    private String findMediaContentThumbnail(final Module mediaModule) {
        if (!(mediaModule instanceof MediaEntryModule))
            return null;

        for (MediaGroup mediaGroup : ((MediaEntryModule) mediaModule).getMediaGroups())
            for (MediaContent mediaContent : mediaGroup.getContents())
                // Contents without a reference can't yield a URL, so they are skipped
                if (thumbnailEnclosures.contains(mediaContent.getType()) && mediaContent.getReference() != null)
                    return mediaContent.getReference().toString();

        return null;
    }

    /**
     * Find the URL of the first enclosure whose type is one of the thumbnail enclosure types.
     *
     * Stopping at the first match keeps this consistent with the media content search.
     *
     * @param entry the syndication entry
     * @return String the enclosure URL, or null if there is no matching enclosure
     */
    @SuppressWarnings("unchecked")
    private String findEnclosureThumbnail(final SyndEntry entry) {
        for (SyndEnclosure enclosure : (Collection<SyndEnclosure>) entry.getEnclosures())
            if (thumbnailEnclosures.contains(enclosure.getType()) && enclosure.getUrl() != null)
                return enclosure.getUrl();

        return null;
    }

    /**
     * Commit the given jobs.
     *
     * A commit request is sent for every job name, also after an earlier one has failed.
     *
     * @param jobNames the names of the jobs to commit
     * @return boolean true if every commit was answered with HTTP 200, false if any of them failed
     */
    public boolean commitJobs(final List<String> jobNames) {
        boolean result = true;

        for (String jobName : jobNames) {
            HttpGet commitMethod = null;

            try {
                // The job name is part of the query string, and so must be URL-encoded

                commitMethod = new HttpGet(
                        searchEnricherUrl + "?action=Commit&JobName=" + URLEncoder.encode(jobName, "UTF-8"));

                HttpResponse commitStatus = httpClient.execute(commitMethod);

                if (commitStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                    logger.error(
                            "Commit failed to successfully process, response: " + commitStatus.getStatusLine());

                    result = false;
                } else if (logger.isDebugEnabled())
                    logger.debug("Commit succesfully sent, response: " + commitStatus.getStatusLine());
            } catch (HttpException e) {
                logger.error("HTTP execution exception for " + searchEnricherUrl, e);

                result = false;
            } catch (IOException e) {
                logger.error("Resource execution exception for " + searchEnricherUrl, e);

                result = false;
            } finally {
                if (commitMethod != null)
                    commitMethod.abort();
            }
        }

        return result;
    }

    /**
     * Attempt to delete the given search result from the given collection.
     *
     * The delete is POSTed as a URL-encoded form (action <code>Delete</code>, by reference on the
     * <code>url</code> field); when it succeeds, a commit request for the job is sent as well.
     *
     * @param jobName the job name to delete from and commit
     * @param key the reference of the document to delete
     * @return boolean true if both the delete and the commit were answered with HTTP 200, false otherwise
     */
    public boolean deleteSearchResult(final String jobName, final String key) {
        List<BasicNameValuePair> deleteParameters = new ArrayList<BasicNameValuePair>();

        deleteParameters.add(new BasicNameValuePair("action", "Delete"));
        deleteParameters.add(new BasicNameValuePair("Docs", key));
        deleteParameters.add(new BasicNameValuePair("JobName", jobName));
        deleteParameters.add(new BasicNameValuePair("DeleteType", "ref"));
        deleteParameters.add(new BasicNameValuePair("Field", "url"));

        HttpPost deleteMethod = new HttpPost(searchEnricherUrl);
        final String commitUrl;

        try {
            deleteMethod.setEntity(new UrlEncodedFormEntity(deleteParameters, "UTF-8"));

            // The job name is part of the commit query string, and so must be URL-encoded

            commitUrl = searchEnricherUrl + "?action=Commit&JobName=" + URLEncoder.encode(jobName, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.error("Unsupported encoding while encoding the URL form entities", e);

            // Without an entity the POST would not identify any document, so don't send it

            return false;
        }

        HttpGet commitMethod = new HttpGet(commitUrl);

        boolean result = true;

        try {
            // Send the delete request

            HttpResponse deleteStatus = httpClient.execute(deleteMethod);

            if (deleteStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                logger.error("Delete failed to successfully POST, response: " + deleteStatus.getStatusLine());

                result = false;
            } else {
                if (logger.isDebugEnabled())
                    logger.debug("Delete succesfully POSTed, response: " + deleteStatus.getStatusLine());

                // Abort the delete request before issuing the commit; null marks it as already dealt with

                deleteMethod.abort();
                deleteMethod = null;

                // Send the commit request

                HttpResponse commitStatus = httpClient.execute(commitMethod);

                if (commitStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                    logger.error(
                            "Commit failed to successfully process, response: " + commitStatus.getStatusLine());

                    result = false;
                } else if (logger.isDebugEnabled())
                    logger.debug("Commit succesfully sent, response: " + commitStatus.getStatusLine());
            }
        } catch (HttpException e) {
            logger.error("HTTP execution exception for " + searchEnricherUrl, e);

            result = false;
        } catch (IOException e) {
            logger.error("Resource execution exception for " + searchEnricherUrl, e);

            result = false;
        } finally {
            // Abort whichever request was executed last

            if (deleteMethod != null)
                deleteMethod.abort();
            else
                commitMethod.abort();
        }

        return result;
    }

    /**
     * Determine the taxonomy match for the given hostname and entry.
     *
     * The first category with a usable taxonomy URI decides the match: a URI starting with <code>http://</code>
     * or <code>https://</code> yields its host without a leading <code>www.</code>, any other URI is returned
     * as-is. Fully qualified URIs which cannot be parsed are skipped. Without a usable category, the hostname
     * is returned.
     *
     * @param hostname the hostname to fall back to
     * @param entry the syndication entry whose categories are inspected
     * @return String
     */
    public static String getTaxonomyMatch(final String hostname, final SyndEntry entry) {
        if (entry.getCategories() == null || entry.getCategories().size() == 0)
            return hostname;

        for (SyndCategory category : (Collection<SyndCategory>) entry.getCategories()) {
            final String taxonomyUri = category.getTaxonomyUri();

            if (!StringUtils.hasText(taxonomyUri))
                continue;

            final boolean fullyQualified = taxonomyUri.startsWith("http://") || taxonomyUri.startsWith("https://");

            if (!fullyQualified)
                return taxonomyUri;

            try {
                final String host = new URL(taxonomyUri).getHost();

                if (host.startsWith("www."))
                    return host.substring("www.".length());

                return host;
            } catch (MalformedURLException e) {
                // Unparseable - move on to the next category

                logger.warn("The given taxonomy URI contained a fully qualified URL (" + taxonomyUri
                        + "), but it could not be parsed", e);
            }
        }

        return hostname;
    }
}