// Java tutorial
/** * Copyright (C) 2013 Seajas, the Netherlands. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License version 3, as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.seajas.search.contender.service.enricher; import com.seajas.search.bridge.contender.metadata.SeajasModule; import com.seajas.search.bridge.jms.model.CompositeEntry; import com.seajas.search.bridge.jms.model.EnricherDocument; import com.seajas.search.bridge.jms.model.Feed; import com.seajas.search.bridge.jms.model.ModifiedContent; import com.seajas.search.bridge.jms.model.OriginalContent; import com.seajas.search.bridge.jms.model.Source; import com.seajas.search.bridge.jms.model.SourceElement; import com.seajas.search.contender.replication.TaxonomyCache; import com.seajas.search.contender.service.ContenderService; import com.sun.syndication.feed.module.DCModule; import com.sun.syndication.feed.module.Module; import com.sun.syndication.feed.module.georss.GeoRSSModule; import com.sun.syndication.feed.module.mediarss.MediaEntryModule; import com.sun.syndication.feed.module.mediarss.MediaModule; import com.sun.syndication.feed.module.mediarss.types.MediaContent; import com.sun.syndication.feed.module.mediarss.types.MediaGroup; import com.sun.syndication.feed.synd.SyndCategory; import com.sun.syndication.feed.synd.SyndEnclosure; import com.sun.syndication.feed.synd.SyndEntry; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import 
java.util.Arrays; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.methods.PostMethod; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.NameValuePair; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.message.BasicNameValuePair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.util.StringUtils; /** * The enricher service used to communicate with the enricher. * * @author Jasper van Veghel <jasper@seajas.com> */ @Service public class EnricherService { /** * The static logger. */ private static final Logger logger = LoggerFactory.getLogger(EnricherService.class); /** * Constants. */ private static final String ENCODING_UNKNOWN = "UNKNOWN"; /** * The contender service. */ @Autowired private ContenderService contenderService; /** * The taxonomy cache. */ @Autowired private TaxonomyCache taxonomyCache; /** * The enricher HTTP client. */ @Autowired @Qualifier("enricherHttpClient") private HttpClient httpClient; /** * The search enricher URL. */ @Value("${contender.project.search.enricher.url}") private String searchEnricherUrl; /** * List of admissible hostnames for GeoRSS fields (or empty / * for all). */ private List<String> geoRssInclusions; /** * The thumbnail enclosures. */ private List<String> thumbnailEnclosures; /** * Default constructor. */ public EnricherService() { // Do nothing } /** * Default constructor. 
* * @param thumbnailEnclosures * @param geoRssInclusions */ @Autowired public EnricherService( @Value("${contender.project.rss.reader.thumbnail.enclosures}") final String thumbnailEnclosures, @Value("${contender.project.search.enricher.georss.included.hosts}") final String geoRssInclusions) { this.thumbnailEnclosures = Arrays .asList(StringUtils.tokenizeToStringArray(thumbnailEnclosures, ",", true, true)); this.geoRssInclusions = Arrays.asList(StringUtils.tokenizeToStringArray(geoRssInclusions, ",", true, true)); } /** * Create a new envelope request and return its contents. * * @param jobName * @param entry * @return String */ public String createEnvelope(final String jobName, final CompositeEntry entry) { StringBuffer envelopeRequest = new StringBuffer("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n\n"); envelopeRequest.append("<autn:import xmlns:autn=\"http://schemas.autonomy.com/aci/\">\n"); envelopeRequest.append("\t<autn:envelope>\n"); // NOTE: Content is stored elsewhere, and so we don't create a <stubIdx> for it envelopeRequest.append("\t\t<autn:document xmlns:autn=\"http://schemas.autonomy.com/aci/\">\n"); envelopeRequest.append("\t\t\t<autn:fetch url=\"data://" + entry.getId() + "\" />\n"); envelopeRequest.append("\t\t</autn:document>\n"); envelopeRequest.append("\t</autn:envelope>\n"); envelopeRequest.append("</autn:import>\n"); return envelopeRequest.toString(); } /** * Send the actual envelope content to the enricher. 
* * @param jobName * @param encoding * @param request * @return boolean */ public boolean sendEnvelope(final String jobName, final String encoding, final String request) { HttpPost importMethod = new HttpPost(searchEnricherUrl); List<NameValuePair> importParameters = new ArrayList<NameValuePair>(); importParameters.add(new BasicNameValuePair("action", "ImportEnvelope")); importParameters.add(new BasicNameValuePair("JobName", jobName)); if (encoding != null && !encoding.equals(ENCODING_UNKNOWN)) importParameters.add(new BasicNameValuePair("ReferenceEncoding", encoding)); importParameters.add(new BasicNameValuePair("EnvelopeXML", request)); try { importMethod.setEntity(new UrlEncodedFormEntity(importParameters, "UTF-8")); } catch (UnsupportedEncodingException e) { logger.error("Unsupported encoding while encoding the URL form entities", e); } // Set the form encoding to match that of the Search Enricher importMethod.addHeader("Content-Type", PostMethod.FORM_URL_ENCODED_CONTENT_TYPE + ";charset=UTF-8"); boolean result = true; try { if (logger.isTraceEnabled()) logger.trace("Executing the requested envelope post"); // Send the import request HttpResponse importStatus = httpClient.execute(importMethod); if (importStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { logger.error("EnvelopeXML failed to successfully POST, response: " + importStatus.getStatusLine()); result = false; } else { if (logger.isDebugEnabled()) logger.debug("EnvelopeXML succesfully POSTed, response: " + importStatus.getStatusLine()); importMethod.abort(); importMethod = null; } } catch (HttpException e) { logger.error("HTTP execution exception for " + searchEnricherUrl, e); result = false; } catch (IOException e) { logger.error("Resource execution exception for " + searchEnricherUrl, e); result = false; } finally { if (importMethod != null) importMethod.abort(); } return result; } /** * Create an enricher document from the given entry. 
* * @param jobName * @param entry * @return EnricherDocument */ public EnricherDocument createEnricherDocument(final String jobName, final CompositeEntry entry) { EnricherDocument document = new EnricherDocument(); // Take out everything we need Source source = entry.getSource(); SourceElement element = entry.getElement(); OriginalContent originalContent = entry.getOriginalContent(); ModifiedContent modifiedContent = entry.getModifiedContent(); // The reference is either the final URL, or the entry link, or the entry URI String reference = originalContent.getUri() != null ? originalContent.getUri().toString() : StringUtils.hasText(element.getEntry().getLink()) ? element.getEntry().getLink() : element.getEntry().getUri(); // Set the document ID to be the reference document.setId(reference); // We should always have at least one separator in the path if (StringUtils.hasText(element.getEntry().getTitle())) document.setTitle(element.getEntry().getTitle()); else if (reference.lastIndexOf('/') != -1) { logger.warn("No entry title given - deriving it from the reference"); document.setTitle(reference.substring(reference.lastIndexOf("/") + 1)); } // Retrieve the taxonomy fields handleTaxonomy(jobName, document, originalContent.getHostname(), element.getEntry()); // Add the remaining metadata as Dublin Core fields document.setReference(reference); if (StringUtils.hasText(element.getEntry().getAuthor())) document.setAuthor(element.getEntry().getAuthor()); handleDatesAndTimes( modifiedContent != null ? modifiedContent.getDateSubmitted() : originalContent.getDateSubmitted(), document, element.getEntry()); // Add the feed ID document.setFeedId(source.getId()); // Add any additionally provided Dublin Core metadata handleFeedMetadata(document, element.getEntry(), reference, source instanceof Feed ? 
((Feed) source).getLanguageOverride() : null); // Add any additionally provided Geo RSS metadata handleGeoRss(document, element.getEntry(), originalContent.getHostname()); // Add any additionally provided thumbnail image handleImageThumbnail(document, element.getEntry(), reference); // Add any additional fields for this result Map<String, String> additionalFields = new HashMap<String, String>(); if (source.getResultParameters() != null) additionalFields.putAll(source.getResultParameters()); document.setAdditionalFields(additionalFields); return document; } /** * Handle the taxonomy-related envelope fields. * * @param jobName * @param document * @param hostname * @param entry */ private void handleTaxonomy(final String jobName, final EnricherDocument document, final String hostname, final SyndEntry entry) { String taxonomyMatch = getTaxonomyMatch(hostname, entry); if (logger.isTraceEnabled()) logger.trace( String.format("Retrieved taxonomy match '%s' - will retrieve identifiers next", taxonomyMatch)); document.setTaxonomyHost(taxonomyMatch.trim().toLowerCase()); List<Integer> taxonomyIdentifiers = taxonomyCache.getIdsByMatch(taxonomyMatch); // Check if we need to create a new unassigned taxonomy entry if (taxonomyIdentifiers == null) taxonomyIdentifiers = taxonomyCache.addToUnassigned(jobName, taxonomyMatch); if (taxonomyIdentifiers != null) { if (logger.isTraceEnabled()) logger.trace(String.format("Retrieved (either fresh or cached) %d matching identifiers", taxonomyIdentifiers.size())); document.setTaxonomyIds(new ArrayList<String>()); for (Integer taxonomyIdentifier : taxonomyIdentifiers) document.getTaxonomyIds().add(taxonomyCache.getFieldPrefix() + taxonomyIdentifier); } else logger.warn(String.format( "Could not associate the given match '%s' with any known taxonomy identifiers, nor could a new one be created", taxonomyMatch)); } /** * Handle the date and time related envelope fields. 
* * @param dateSubmitted * @param document * @param entry */ private void handleDatesAndTimes(final Date dateSubmitted, final EnricherDocument document, final SyndEntry entry) { Date createdDate = entry.getPublishedDate(), modifiedDate = entry.getUpdatedDate(); if (createdDate == null) createdDate = dateSubmitted; document.setCreated(createdDate); if (modifiedDate == null) modifiedDate = dateSubmitted; document.setModified(modifiedDate); document.setSubmitted(dateSubmitted); } /** * Handle the Dublin Core related envelope fields, and return the provided source from the Dublin Core metadata values. * * @param document * @param entry * @param reference * @return String */ private void handleFeedMetadata(final EnricherDocument document, final SyndEntry entry, final String reference, final String languageOverride) { final Module dcModule; // Derive the DC module from the Seajas module, if possible Module seajasModule = entry.getModule(SeajasModule.URI); if (seajasModule != null && seajasModule instanceof DCModule) dcModule = seajasModule; else dcModule = entry.getModule("http://purl.org/dc/elements/1.1/"); if (seajasModule != null) handleFeedMetadataInternal(document, (SeajasModule) seajasModule, reference); if (dcModule != null) handleFeedMetadataDublinCore(document, (DCModule) dcModule, reference, languageOverride); } /** * Handle the internal Seajas-related envelope fields. 
* * @param document * @param module * @param reference */ private void handleFeedMetadataInternal(final EnricherDocument document, final SeajasModule module, final String reference) { // Add any supporting content if (module.getSupportContent() != null) document.setSupportContent(module.getSupportContent()); // Add the linked images, if any if (module.getLinkedImageUrls() != null && !module.getLinkedImageUrls().isEmpty()) { List<String> notifiedUrls = contenderService.getMediaNotificationUrls(module.getLinkedImageUrls(), buildContext(document, reference), "image"); if (notifiedUrls != null) { if (document.getLinkedImageUrls() == null) document.setLinkedImageUrls(new ArrayList<String>()); for (String notifiedUrl : notifiedUrls) document.getLinkedImageUrls().add(notifiedUrl); } } if (module.getLinkedVideoUrl() != null) document.setLinkedVideoUrl(module.getLinkedVideoUrl()); if (module.getLinkedSubtitlesUrl() != null) document.setLinkedSubtitlesUrl(module.getLinkedSubtitlesUrl()); if (module.getRepublicationCount() != null) document.setRepublicationCount(module.getRepublicationCount()); if (StringUtils.hasText(module.getAuthorName())) document.setAuthorName(module.getAuthorName()); if (module.getThumbnailUrls() != null && !module.getThumbnailUrls().isEmpty()) { List<String> notifiedUrls = contenderService.getMediaNotificationUrls(module.getThumbnailUrls(), buildContext(document, reference), "image"); if (notifiedUrls != null) { if (document.getThumbnailUrls() == null) document.setThumbnailUrls(new ArrayList<String>()); for (String notifiedUrl : notifiedUrls) document.getThumbnailUrls().add(notifiedUrl); } } if (StringUtils.hasText(module.getProfileImageUrl())) { String notifiedUrl = contenderService.getMediaNotificationUrl(module.getProfileImageUrl(), buildContext(document, reference), "image"); if (notifiedUrl != null) document.setProfileImageUrl(notifiedUrl); } } /** * Build the media notification context. 
* * @param document * @param reference * @return List<String> */ private List<String> buildContext(final EnricherDocument document, final String reference) { List<String> result = new ArrayList<String>(); result.add("referrer=" + reference); if (!StringUtils.isEmpty(document.getTitle())) result.add("title=" + document.getTitle()); return result; } /** * Handle the Dublin Core related envelope fields. * * @param document * @param module * @param reference * @param languageOverride */ private void handleFeedMetadataDublinCore(final EnricherDocument document, final DCModule module, final String reference, final String languageOverride) { String providedSource = null; document.setLanguage(languageOverride != null ? languageOverride : module.getLanguage()); document.setPublishers(module.getPublishers()); document.setContributors(module.getContributors()); document.setRights(module.getRightsList()); document.setTypes(module.getTypes()); document.setCreators(module.getCreators()); // Only use this if it contains something substantive (no 'html' or 'text' types) if (StringUtils.hasText(module.getFormat()) && !module.getFormat().contains("html") && !module.getFormat().contains("text")) document.setFormat(module.getFormat()); // Always fall back to the reference for the sourceId document.setSourceId(reference); // Treat dcterms_source as a special case; it's only valid if only one entry is provided which is a valid URL if (module.getSources().size() == 1) { String potentialSource = module.getSources().get(0).toString(); try { new URL(potentialSource); document.setSourceId(potentialSource); } catch (MalformedURLException e) { if (logger.isDebugEnabled()) logger.debug( String.format("Not setting enricher document source to '%s' as it's not a valid URL", potentialSource)); } } } /** * Handle the image thumbnail related envelope field. 
* * @param document * @param entry * @param reference */ private void handleImageThumbnail(final EnricherDocument document, final SyndEntry entry, final String reference) { String imageThumbnail = extractImageThumbnail(entry); if (imageThumbnail != null) { String mediaNotificationUrl = contenderService.getMediaNotificationUrl(imageThumbnail, buildContext(document, reference), "image"); if (mediaNotificationUrl != null) { if (document.getThumbnailUrls() == null) document.setThumbnailUrls(new ArrayList<String>()); document.getThumbnailUrls().add(mediaNotificationUrl); } } } /** * Handle the GeoRSS related envelope fields. * * @param document * @param entry * @param hostname */ private void handleGeoRss(final EnricherDocument document, final SyndEntry entry, final String hostname) { Module geoModule = entry.getModule(GeoRSSModule.GEORSS_W3CGEO_URI); if (geoModule == null) geoModule = entry.getModule(GeoRSSModule.GEORSS_GEORSS_URI); if (geoModule != null) if (geoRssInclusions.size() == 0 || geoRssInclusions.contains("*") || geoRssInclusions.contains(hostname)) { if (logger.isDebugEnabled()) logger.debug(String.format( "Geo-RSS module detected for hostname '%s' - which falls within the set inclusions", hostname)); if (((GeoRSSModule) geoModule).getPosition() != null) { if (logger.isDebugEnabled()) logger.debug(String.format("Adding geo_point field for hostname '%s'", hostname)); document.setGeoPoint(((GeoRSSModule) geoModule).getPosition().getLatitude() + "," + ((GeoRSSModule) geoModule).getPosition().getLongitude()); } } else { if (logger.isDebugEnabled()) logger.debug(String.format( "Geo-RSS value provided, but hostname '%s' not contained within GeoRSS inclusion-hosts list", hostname)); } } /** * Extract an image thumbnail from the given syndication entry. 
* * @param entry * @return String */ private String extractImageThumbnail(final SyndEntry entry) { String imageThumbnail = null; if (entry.getEnclosures().size() > 0 && thumbnailEnclosures.size() > 0) { // Prefer the explicit media types Module mediaModule = entry.getModule(MediaModule.URI); if (mediaModule != null && mediaModule instanceof MediaEntryModule) for (MediaGroup mediaGroup : ((MediaEntryModule) mediaModule).getMediaGroups()) { for (MediaContent mediaContent : mediaGroup.getContents()) { for (String thumbnailEnclosure : thumbnailEnclosures) if (thumbnailEnclosure.equals(mediaContent.getType())) { imageThumbnail = mediaContent.getReference().toString(); break; } if (imageThumbnail != null) break; } if (imageThumbnail != null) break; } // Then search through the thumbnail enclosures if (imageThumbnail == null) for (SyndEnclosure enclosure : (Collection<SyndEnclosure>) entry.getEnclosures()) { for (String thumbnailEnclosure : thumbnailEnclosures) if (thumbnailEnclosure.equals(enclosure.getType())) { imageThumbnail = enclosure.getUrl(); break; } } } return imageThumbnail; } /** * Commit the given jobs. 
* * @param jobNames * @return boolean */ public boolean commitJobs(final List<String> jobNames) { Boolean result = true; for (String jobName : jobNames) { HttpGet commitMethod = new HttpGet(searchEnricherUrl + "?action=Commit&JobName=" + jobName); try { HttpResponse commitStatus = httpClient.execute(commitMethod); if (commitStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { logger.error( "Commit failed to successfully process, response: " + commitStatus.getStatusLine()); result = false; } else if (logger.isDebugEnabled()) logger.debug("Commit succesfully sent, response: " + commitStatus.getStatusLine()); } catch (HttpException e) { logger.error("HTTP execution exception for " + searchEnricherUrl, e); result = false; } catch (IOException e) { logger.error("Resource execution exception for " + searchEnricherUrl, e); result = false; } finally { commitMethod.abort(); } } return result; } /** * Attempt to delete the given search result from the given collection. * * @param jobName * @param key * @return boolean */ public boolean deleteSearchResult(final String jobName, final String key) { // Set the reference encoding only if we know what it is - otherwise we leave it up to the fetcher to guess what it is HttpPost deleteMethod = new HttpPost(searchEnricherUrl); List<BasicNameValuePair> deleteParameters = new ArrayList<BasicNameValuePair>(); deleteParameters.add(new BasicNameValuePair("action", "Delete")); deleteParameters.add(new BasicNameValuePair("Docs", key)); deleteParameters.add(new BasicNameValuePair("JobName", jobName)); deleteParameters.add(new BasicNameValuePair("DeleteType", "ref")); deleteParameters.add(new BasicNameValuePair("Field", "url")); try { deleteMethod.setEntity(new UrlEncodedFormEntity(deleteParameters, "UTF-8")); } catch (UnsupportedEncodingException e) { logger.error("Unsupported encoding while encoding the URL form entities", e); } HttpGet commitMethod = new HttpGet(searchEnricherUrl + "?action=Commit&JobName=" + jobName); boolean 
result = true; try { // Send the import request HttpResponse deleteStatus = httpClient.execute(deleteMethod); if (deleteStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { logger.error("Delete failed to successfully POST, response: " + deleteStatus.getStatusLine()); result = false; } else { if (logger.isDebugEnabled()) logger.debug("Delete succesfully POSTed, response: " + deleteStatus.getStatusLine()); deleteMethod.abort(); deleteMethod = null; // Send the commit request HttpResponse commitStatus = httpClient.execute(commitMethod); if (commitStatus.getStatusLine().getStatusCode() != HttpStatus.SC_OK) { logger.error( "Commit failed to successfully process, response: " + commitStatus.getStatusLine()); result = false; } else if (logger.isDebugEnabled()) logger.debug("Commit succesfully sent, response: " + commitStatus.getStatusLine()); } } catch (HttpException e) { logger.error("HTTP execution exception for " + searchEnricherUrl, e); result = false; } catch (IOException e) { logger.error("Resource execution exception for " + searchEnricherUrl, e); result = false; } finally { if (deleteMethod != null) deleteMethod.abort(); else commitMethod.abort(); } return result; } /** * Retrieve the taxonomy match from the given hostname and entry. * * @param hostname * @param entry * @return String */ public static String getTaxonomyMatch(final String hostname, final SyndEntry entry) { if (entry.getCategories() != null && entry.getCategories().size() > 0) for (SyndCategory category : (Collection<SyndCategory>) entry.getCategories()) if (StringUtils.hasText(category.getTaxonomyUri())) { if (category.getTaxonomyUri().startsWith("http://") || category.getTaxonomyUri().startsWith("https://")) { try { String domainHostname = new URL(category.getTaxonomyUri()).getHost(); return domainHostname.startsWith("www.") ? 
domainHostname.substring("www.".length()) : domainHostname; } catch (MalformedURLException e) { logger.warn("The given taxonomy URI contained a fully qualified URL (" + category.getTaxonomyUri() + "), but it could not be parsed", e); continue; } } else return category.getTaxonomyUri(); } return hostname; } }