Java tutorial: WARCIndexer.java (warc-indexer) — a core indexer class that turns web archive (WARC/ARC) records into Solr records.
package uk.bl.wa.indexer;

/*
 * #%L
 * warc-indexer
 * $Id:$
 * $HeadURL:$
 * %%
 * Copyright (C) 2013 - 2014 The UK Web Archive
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.ProtocolException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHeaders;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.tika.mime.MediaType;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import org.archive.util.ArchiveUtils;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

import uk.bl.wa.analyser.payload.WARCPayloadAnalysers;
import uk.bl.wa.analyser.text.TextAnalysers;
import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.extract.LinkExtractor;
import uk.bl.wa.parsers.HtmlFeatureParser;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.HashedCachedInputStream;
import uk.bl.wa.util.Instrument;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;

/**
 * Core indexer class that takes a web archive record and generates a Solr record.
 *
 * TODO Currently a rather crude, monolithic code structure. Should pull the different
 * metadata generation logic out into separate classes or at least methods.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 */
public class WARCIndexer {
    private static Log log = LogFactory.getLog(WARCIndexer.class);

    private List<String> url_excludes;
    private List<String> protocol_includes;
    private List<String> response_includes;
    private List<String> record_type_includes;

    private MessageDigest md5 = null;
    private AggressiveUrlCanonicalizer canon = new AggressiveUrlCanonicalizer();

    /** */
    private boolean extractText;
    private boolean storeText;
    private boolean hashUrlId;

    /** Wayback-style URI filtering: */
    private StaticMapExclusionFilterFactory smef = null;

    /** Hook to the Solr server: */
    private boolean checkSolrForDuplicates = false;
    private SolrWebServer solrServer = null;

    /** Payload Analysers */
    private long inMemoryThreshold;
    private long onDiskThreshold;
    private WARCPayloadAnalysers wpa;

    /** Text Analysers */
    private TextAnalysers txa;

    /** Annotations */
    private Annotator ant = null;

    // Paired with HtmlFeatureParser's links-extractor:
    private final boolean addNormalisedURL;
    private final AggressiveUrlCanonicalizer urlNormaliser = new AggressiveUrlCanonicalizer();

    /* ------------------------------------------------------------ */

    /**
     * Default constructor, with empty configuration.
     */
    public WARCIndexer() throws NoSuchAlgorithmException {
        this(ConfigFactory.parseString(ConfigFactory.load().root().render(ConfigRenderOptions.concise())));
    }

    /**
     * Preferred constructor, allows passing in configuration from execution environment.
     */
    public WARCIndexer(Config conf) throws NoSuchAlgorithmException {
        log.info("Initialising WARCIndexer...");
        try {
            Properties props = new Properties();
            props.load(getClass().getResourceAsStream("/log4j-override.properties"));
            PropertyConfigurator.configure(props);
        } catch (IOException e1) {
            log.error("Failed to load log4j config from properties file.");
        }

        // Optional configurations:
        this.extractText = conf.getBoolean("warc.index.extract.content.text");
        log.info("Extract text = " + extractText);
        this.storeText = conf.getBoolean("warc.index.extract.content.text_stored");
        log.info("Store text = " + storeText);
        this.hashUrlId = conf.getBoolean("warc.solr.use_hash_url_id");
        log.info("hashUrlId = " + hashUrlId);
        addNormalisedURL = conf.hasPath(HtmlFeatureParser.CONF_LINKS_NORMALISE) ?
                conf.getBoolean(HtmlFeatureParser.CONF_LINKS_NORMALISE) : HtmlFeatureParser.DEFAULT_LINKS_NORMALISE;
        this.checkSolrForDuplicates = conf.getBoolean("warc.solr.check_solr_for_duplicates");
        if (this.hashUrlId == false && this.checkSolrForDuplicates == true) {
            log.warn("Checking Solr for duplicates may not work as expected when using the timestamp+md5(URL) key.");
            log.warn("You need to use the payload-hash+md5(URL) key option to resolve revisit records.");
        }

        // URLs to exclude:
        this.url_excludes = conf.getStringList("warc.index.extract.url_exclude");
        // Protocols to include:
        this.protocol_includes = conf.getStringList("warc.index.extract.protocol_include");
        // Response codes to include:
        this.response_includes = conf.getStringList("warc.index.extract.response_include");
        // Record types to include:
        this.record_type_includes = conf.getStringList("warc.index.extract.record_type_include");

        // URL Filtering options:
        if (conf.getBoolean("warc.index.exclusions.enabled")) {
            smef = new StaticMapExclusionFilterFactory();
            smef.setFile(conf.getString("warc.index.exclusions.file"));
            smef.setCheckInterval(conf.getInt("warc.index.exclusions.check_interval"));
            try {
                smef.init();
            } catch (IOException e) {
                log.error("Failed to load exclusions file.");
                throw new RuntimeException(
                        "StaticMapExclusionFilterFactory failed with IOException when loading " + smef.getFile());
            }
        }

        // Instantiate required helpers:
        md5 = MessageDigest.getInstance("MD5");

        // Also hook up to Solr server for queries:
        if (this.checkSolrForDuplicates) {
            log.info("Initialising connection to Solr...");
            solrServer = new SolrWebServer(conf);
        }

        // Set up hash-cache properties:
        this.inMemoryThreshold = conf.getBytes("warc.index.extract.inMemoryThreshold");
        this.onDiskThreshold = conf.getBytes("warc.index.extract.onDiskThreshold");
        log.info("Hashing & Caching thresholds are: < " + this.inMemoryThreshold + " in memory, < "
                + this.onDiskThreshold + " on disk.");

        // Set up analysers:
        log.info("Setting up analysers...");
        this.wpa = new WARCPayloadAnalysers(conf);
        this.txa = new TextAnalysers(conf);

        // Log so it's clear this completed ok:
        log.info("Initialisation of WARCIndexer complete.");
    }

    /**
     * @param ann
     */
    public void setAnnotations(Annotations ann) {
        this.ant = new Annotator(ann);
    }

    /**
     * @return the checkSolrForDuplicates
     */
    public boolean isCheckSolrForDuplicates() {
        return checkSolrForDuplicates;
    }

    /**
     * @param checkSolrForDuplicates the checkSolrForDuplicates to set
     */
    public void setCheckSolrForDuplicates(boolean checkSolrForDuplicates) {
        this.checkSolrForDuplicates = checkSolrForDuplicates;
    }

    /**
     * This extracts metadata and text from the ArchiveRecord and creates a suitable SolrRecord.
     *
     * @param archiveName
     * @param record
     * @return
     * @throws IOException
     */
    public SolrRecord extract(String archiveName, ArchiveRecord record) throws IOException {
        return this.extract(archiveName, record, this.extractText);
    }

    /**
     * This extracts metadata from the ArchiveRecord and creates a suitable SolrRecord.
     * Removes the text field if the flag is set.
     *
     * @param archiveName
     * @param record
     * @param isTextIncluded
     * @return
     * @throws IOException
     */
    public SolrRecord extract(String archiveName, ArchiveRecord record, boolean isTextIncluded) throws IOException {
        final long start = System.nanoTime();
        ArchiveRecordHeader header = record.getHeader();
        SolrRecord solr = new SolrRecord(archiveName, header);

        if (!header.getHeaderFields().isEmpty()) {
            if (header.getHeaderFieldKeys().contains(HEADER_KEY_TYPE)) {
                if (!checkRecordType((String) header.getHeaderValue(HEADER_KEY_TYPE))) {
                    return null;
                }
            } // else we're processing ARCs

            if (header.getUrl() == null)
                return null;
            String fullUrl = header.getUrl();

            log.debug("Current heap usage: "
                    + FileUtils.byteCountToDisplaySize(Runtime.getRuntime().totalMemory()));
            log.debug("Processing " + fullUrl + " from " + archiveName);

            // Check the filters:
            if (this.checkProtocol(fullUrl) == false)
                return null;
            if (this.checkUrl(fullUrl) == false)
                return null;
            if (this.checkExclusionFilter(fullUrl) == false)
                return null;

            // --- Basic headers ---

            // Basic metadata:
            solr.setField(SolrFields.SOURCE_FILE, archiveName + "@" + header.getOffset());
            // solr.setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset());
            byte[] md5digest = md5.digest(fullUrl.getBytes("UTF-8"));
            String md5hex = new String(Base64.encodeBase64(md5digest));
            solr.setField(SolrFields.SOLR_URL, fullUrl);
            if (addNormalisedURL) {
                solr.setField(SolrFields.SOLR_URL_NORMALISED, urlNormaliser.canonicalize(fullUrl));
            }
            // Get the length, but beware: this value also includes the HTTP headers (i.e. it is the payload_length):
            long content_length = header.getLength();

            // Also pull out the file extension, if any:
            solr.addField(SolrFields.CONTENT_TYPE_EXT, parseExtension(fullUrl));

            // Strip down very long URLs to avoid
            // "org.apache.commons.httpclient.URIException: Created (escaped) uuri > 2083"
            // Trac #2271: replace string-splitting with URI-based methods.
            URL url = null;
            if (fullUrl.length() > 2000)
                fullUrl = fullUrl.substring(0, 2000);
            try {
                url = new URL(fullUrl);
            } catch (MalformedURLException e) {
                // Some URIs cause problems, so try the canonicalizer; in which
                // case, try with the full URL.
                log.error(e.getMessage());
                try {
                    url = new URL("http://" + canon.urlStringToKey(fullUrl));
                } catch (Exception e2) {
                    // If this fails, abandon all hope.
                    log.error(e2.getMessage());
                    return null;
                }
            }

            // Spot 'slash pages':
            if (url.getPath().equals("/") || url.getPath().equals("")
                    || url.getPath().matches("/index\\.[a-z]+$")) {
                solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_SLASHPAGE);
                // Spot 'robots.txt':
            } else if (url.getPath().equals("/robots.txt")) {
                solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_ROBOTS_TXT);
            } else {
                solr.setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_NORMAL);
            }

            // Record the domain (strictly, the host):
            String host = url.getHost();
            solr.setField(SolrFields.SOLR_HOST, host);
            solr.setField(SolrFields.DOMAIN, LinkExtractor.extractPrivateSuffixFromHost(host));
            solr.setField(SolrFields.PUBLIC_SUFFIX, LinkExtractor.extractPublicSuffixFromHost(host));

            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#archeaders", start);

            InputStream tikainput = null;

            // Only parse HTTP headers for HTTP URIs:
            if (fullUrl.startsWith("http")) {
                // Parse HTTP headers:
                String statusCode = null;
                if (record instanceof WARCRecord) {
                    // There are not always headers! The code should check first.
                    String statusLine = HttpParser.readLine(record, "UTF-8");
                    if (statusLine != null && statusLine.startsWith("HTTP")) {
                        String firstLine[] = statusLine.split(" ");
                        if (firstLine.length > 1) {
                            statusCode = firstLine[1].trim();
                            try {
                                this.processHeaders(solr, statusCode, HttpParser.parseHeaders(record, "UTF-8"));
                            } catch (ProtocolException p) {
                                log.error("ProtocolException [" + statusCode + "]: "
                                        + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                                        + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY), p);
                            }
                        } else {
                            log.warn("Could not parse status line: " + statusLine);
                        }
                    } else {
                        log.warn("Invalid status line: "
                                + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@"
                                + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
                    }
                    // No need for this, as the headers have already been read from the InputStream (above):
                    // WARCRecordUtils.getPayload(record);
                    tikainput = record;
                } else if (record instanceof ARCRecord) {
                    ARCRecord arcr = (ARCRecord) record;
                    statusCode = "" + arcr.getStatusCode();
                    this.processHeaders(solr, statusCode, arcr.getHttpHeaders());
                    arcr.skipHttpHeader();
                    tikainput = arcr;
                } else {
                    log.error("FAIL! Unsupported archive record type.");
                    return solr;
                }

                // Skip recording non-content URLs (i.e. 2xx responses only please):
                if (this.checkResponseCode(statusCode) == false) {
                    log.debug("Skipping this record based on status code " + statusCode + ": " + header.getUrl());
                    return null;
                }
            }

            // Update the content_length based on what's available:
            content_length = tikainput.available();

            // Record the length:
            solr.setField(SolrFields.CONTENT_LENGTH, "" + content_length);

            // -----------------------------------------------------
            // Headers have been processed, payload ready to cache:
            // -----------------------------------------------------

            // Create an appropriately cached version of the payload, to allow analysis.
            final long hashStreamStart = System.nanoTime();
            HashedCachedInputStream hcis = new HashedCachedInputStream(header, tikainput, content_length);
            tikainput = hcis.getInputStream();
            String hash = hcis.getHash();
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#hashstreamwrap", hashStreamStart);

            // Prepare crawl date information:
            String waybackDate = (header.getDate().replaceAll("[^0-9]", ""));
            Date crawlDate = getWaybackDate(waybackDate);
            String crawlDateString = parseCrawlDate(waybackDate);

            // Optionally use a hash-based ID to store only one version of a URL:
            String id = null;
            if (hashUrlId) {
                id = hash + "/" + md5hex;
            } else {
                id = waybackDate + "/" + md5hex;
            }

            // Set these last:
            solr.setField(SolrFields.ID, id);
            solr.setField(SolrFields.HASH, hash);

            // -----------------------------------------------------
            // Payload has been cached, ready to check crawl dates:
            // -----------------------------------------------------

            // Query for currently known crawl dates:
            HashSet<Date> currentCrawlDates = new HashSet<Date>();
            if (this.checkSolrForDuplicates && solrServer != null) {
                SolrQuery q = new SolrQuery("id:\"" + id + "\"");
                q.addField(SolrFields.CRAWL_DATES);
                try {
                    QueryResponse results = solrServer.query(q);
                    if (results.getResults().size() > 0) {
                        SolrDocument fr = results.getResults().get(0);
                        if (fr.containsKey(SolrFields.CRAWL_DATES)) {
                            for (Object cds : fr.getFieldValues(SolrFields.CRAWL_DATES)) {
                                currentCrawlDates.add((Date) cds);
                            }
                        }
                    } else {
                        log.debug("No matching entries found.");
                    }
                } catch (SolrServerException e) {
                    e.printStackTrace();
                    // FIXME retry?
                }
            }

            // Is the current date unknown? (inc. no-solr-check case):
            if (!currentCrawlDates.contains(crawlDate)) {
                // Dates to be merged under the CRAWL_DATES field:
                solr.mergeField(SolrFields.CRAWL_DATES, crawlDateString);
                solr.mergeField(SolrFields.CRAWL_YEARS, extractYear(header.getDate()));
            } else {
                // Otherwise, ensure that all the known dates (i.e. including this one) are copied over:
                for (Date ccd : currentCrawlDates) {
                    solr.addField(SolrFields.CRAWL_DATES, formatter.format(ccd));
                    solr.addField(SolrFields.CRAWL_YEARS, getYearFromDate(ccd));
                }
                // TODO This could optionally skip re-submission instead?
            }

            // Sort the dates and find the earliest:
            List<Date> dateList = new ArrayList<Date>(currentCrawlDates);
            dateList.add(crawlDate);
            Collections.sort(dateList);
            Date firstDate = dateList.get(0);
            solr.getSolrDocument().setField(SolrFields.CRAWL_DATE, firstDate);
            solr.setField(SolrFields.CRAWL_YEAR, getYearFromDate(firstDate));

            // Use the current value as the waybackDate:
            solr.setField(SolrFields.WAYBACK_DATE, waybackDate);

            // If this is a revisit record, we should just return an update to the crawl_dates:
            if (WARCConstants.WARCRecordType.revisit.name().equals(header.getHeaderValue(HEADER_KEY_TYPE))) {
                if (currentCrawlDates.contains(crawlDate)) {
                    return null;
                }
                SolrRecord revisited = new SolrRecord();
                revisited.setField(SolrFields.ID, id);
                revisited.mergeField(SolrFields.CRAWL_DATES, crawlDateString);
                revisited.mergeField(SolrFields.CRAWL_YEARS, extractYear(header.getDate()));
                return revisited;
            }

            // -----------------------------------------------------
            // Apply any annotations:
            // -----------------------------------------------------
            if (ant != null) {
                try {
                    ant.applyAnnotations(url.toURI(), solr.getSolrDocument());
                } catch (URISyntaxException e) {
                    e.printStackTrace();
                    log.error("Failed to annotate " + url + " : " + e);
                }
            }

            // -----------------------------------------------------
            // Payload duplication has been checked, ready to parse:
            // -----------------------------------------------------

            final long analyzeStart = System.nanoTime();
            // Mark the start of the payload.
            tikainput.mark((int) content_length);
            // Pass on to other extractors as required, resetting the stream before each:
            this.wpa.analyse(header, tikainput, solr);
            Instrument.timeRel("WARCIndexer.extract#total", "WARCIndexer.extract#analyzetikainput", analyzeStart);

            // Clear up the caching of the payload:
            hcis.cleanup();

            // Derive normalised/simplified content type:
            processContentType(solr, header, content_length);

            // -----------------------------------------------------
            // Payload analysis complete, now performing text analysis:
            // -----------------------------------------------------
            this.txa.analyse(solr);

            // Remove the Text Field if required:
            if (!isTextIncluded) {
                solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
            } else {
                // Otherwise, decide whether to store, or both store and index, the text:
                if (storeText == false) {
                    // Copy the text into the indexed (but not stored) field:
                    solr.setField(SolrFields.SOLR_EXTRACTED_TEXT_NOT_STORED,
                            (String) solr.getField(SolrFields.SOLR_EXTRACTED_TEXT).getFirstValue());
                    // Take the text out of the original (stored) field.
                    solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
                }
            }
        }
        Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#solrdocCreation", "WARCIndexer.extract#total", start);
        return solr;
    }

    /**
     * @param date
     * @return
     */
    private synchronized String getYearFromDate(Date date) {
        calendar.setTime(date);
        return Integer.toString(calendar.get(Calendar.YEAR));
    }

    private final Calendar calendar = Calendar.getInstance();

    /* ----------------------------------- */

    private void processHeaders(SolrRecord solr, String statusCode, Header[] httpHeaders) {
        try {
            // This is a simple test that the status code setting worked:
            int statusCodeInt = Integer.parseInt(statusCode);
            if (statusCodeInt < 0 || statusCodeInt > 1000)
                throw new Exception("Status code out of range: " + statusCodeInt);
            // Get the other headers:
            for (Header h : httpHeaders) {
                // Get the type from the server:
                if (h.getName().equals(HttpHeaders.CONTENT_TYPE)
                        && solr.getField(SolrFields.CONTENT_TYPE_SERVED) == null) {
                    String servedType = h.getValue();
                    if (servedType.length() > 200)
                        servedType = servedType.substring(0, 200);
                    solr.addField(SolrFields.CONTENT_TYPE_SERVED, servedType);
                }
                // Also, grab the X-Powered-By or Server headers if present:
                if (h.getName().equals("X-Powered-By"))
                    solr.addField(SolrFields.SERVER, h.getValue());
                if (h.getName().equals(HttpHeaders.SERVER))
                    solr.addField(SolrFields.SERVER, h.getValue());
            }
        } catch (NumberFormatException e) {
            log.error("Exception when parsing status code: " + statusCode + ": " + e);
            solr.addParseException("when parsing statusCode", e);
        } catch (Exception e) {
            log.error("Exception when parsing headers: " + e);
            solr.addParseException("when parsing headers", e);
        }
    }

    /**
     * @param fullUrl
     * @return
     */
    protected static String parseExtension(String fullUrl) {
        if (fullUrl.lastIndexOf("/") != -1) {
            String path = fullUrl.substring(fullUrl.lastIndexOf("/"));
            if (path.indexOf("?") != -1) {
                path = path.substring(0, path.indexOf("?"));
            }
            if (path.indexOf("&") != -1) {
                path = path.substring(0, path.indexOf("&"));
            }
            if (path.indexOf(".") != -1) {
                String ext = path.substring(path.lastIndexOf("."));
                ext = ext.toLowerCase();
                // Avoid odd/malformed extensions:
                // if( ext.contains("%") )
                //     ext = ext.substring(0, path.indexOf("%"));
                ext = ext.replaceAll("[^0-9a-z]", "");
                return ext;
            }
        }
        return null;
    }

    /**
     * Timestamp parsing, for the Crawl Date.
     */
    public static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    static {
        formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
    }

    /**
     * Returns a Java Date object representing the crawled date.
     *
     * @param timestamp
     * @return
     */
    public static Date getWaybackDate(String timestamp) {
        Date date = new Date();
        try {
            if (timestamp.length() == 12) {
                date = ArchiveUtils.parse12DigitDate(timestamp);
            } else if (timestamp.length() == 14) {
                date = ArchiveUtils.parse14DigitDate(timestamp);
            } else if (timestamp.length() == 16) {
                date = ArchiveUtils.parse17DigitDate(timestamp + "0");
            } else if (timestamp.length() >= 17) {
                date = ArchiveUtils.parse17DigitDate(timestamp.substring(0, 17));
            }
        } catch (ParseException p) {
            p.printStackTrace();
        }
        return date;
    }

    /**
     * Returns a formatted String representing the crawled date.
     *
     * @param waybackDate
     * @return
     */
    protected static String parseCrawlDate(String waybackDate) {
        return formatter.format(getWaybackDate(waybackDate));
    }

    /**
     * @param timestamp
     * @return
     */
    public static String extractYear(String timestamp) {
        // Default to 'unknown':
        String waybackYear = "unknown";
        String waybackDate = timestamp.replaceAll("[^0-9]", "");
        if (waybackDate != null)
            waybackYear = waybackDate.substring(0, 4);
        // Reject bad values by resetting to 'unknown':
        if ("0000".equals(waybackYear))
            waybackYear = "unknown";
        // Return
        return waybackYear;
    }

    /**
     * @param solr
     * @param header
     * @param content_length
     */
    private void processContentType(SolrRecord solr, ArchiveRecordHeader header, long content_length) {
        // Get the current content-type:
        String contentType = (String) solr.getFieldValue(SolrFields.SOLR_CONTENT_TYPE);

        // Store the raw content type from Tika:
        solr.setField(SolrFields.CONTENT_TYPE_TIKA, contentType);

        // Also get the other content types:
        MediaType mt_tika = MediaType.parse(contentType);
        if (solr.getField(SolrFields.CONTENT_TYPE_DROID) != null) {
            MediaType mt_droid = MediaType
                    .parse((String) solr.getField(SolrFields.CONTENT_TYPE_DROID).getFirstValue());
            if (mt_tika == null || mt_tika.equals(MediaType.OCTET_STREAM)) {
                contentType = mt_droid.toString();
            } else if (mt_droid.getBaseType().equals(mt_tika.getBaseType())
                    && mt_droid.getParameters().get("version") != null) {
                // Union of results:
                mt_tika = new MediaType(mt_tika, mt_droid.getParameters());
                contentType = mt_tika.toString();
            }
            if (mt_droid.getParameters().get("version") != null) {
                solr.addField(SolrFields.CONTENT_VERSION, mt_droid.getParameters().get("version"));
            }
        }

        // Allow header MIME:
        if (contentType != null && contentType.isEmpty()) {
            if (header.getHeaderFieldKeys().contains("WARC-Identified-Payload-Type")) {
                contentType = ((String) header.getHeaderFields().get("WARC-Identified-Payload-Type"));
            } else {
                contentType = header.getMimetype();
            }
        }

        // Determine content type:
        if (contentType != null)
            solr.setField(SolrFields.FULL_CONTENT_TYPE, contentType);

        // If zero-length, then change to application/x-empty for the 'content_type' field.
        if (content_length == 0)
            contentType = "application/x-empty";

        // Content-Type can still be null:
        if (contentType != null) {
            // Strip parameters out of main type field:
            solr.setField(SolrFields.SOLR_CONTENT_TYPE, contentType.replaceAll(";.*$", ""));

            // Also add a more general, simplified type, as appropriate:
            if (contentType.matches("^image/.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "image");
            } else if (contentType.matches("^audio/.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "audio");
            } else if (contentType.matches("^video/.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "video");
            } else if (contentType.matches("^text/htm.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "html");
            } else if (contentType.matches("^application/pdf.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "pdf");
            } else if (contentType.matches("^.*word$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "word");
            } else if (contentType.matches("^.*excel$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "excel");
            } else if (contentType.matches("^.*powerpoint$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "powerpoint");
            } else if (contentType.matches("^text/plain.*$")) {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "text");
            } else {
                solr.setField(SolrFields.SOLR_NORMALISED_CONTENT_TYPE, "other");
            }

            // Remove text from JavaScript, CSS, ...
            if (contentType.startsWith("application/javascript")
                    || contentType.startsWith("text/javascript")
                    || contentType.startsWith("text/css")) {
                solr.removeField(SolrFields.SOLR_EXTRACTED_TEXT);
            }
        }
    }

    private boolean checkUrl(String url) {
        for (String exclude : url_excludes) {
            if (!"".equals(exclude) && url.matches(".*" + exclude + ".*")) {
                return false;
            }
        }
        return true;
    }

    private boolean checkProtocol(String url) {
        for (String include : protocol_includes) {
            if ("".equals(include) || url.startsWith(include)) {
                return true;
            }
        }
        return false;
    }

    private boolean checkResponseCode(String statusCode) {
        if (statusCode == null)
            return false;
        // Check for a match:
        for (String include : response_includes) {
            if ("".equals(include) || statusCode.startsWith(include)) {
                return true;
            }
        }
        // Exclude:
        return false;
    }

    private boolean checkRecordType(String type) {
        for (String include : record_type_includes) {
            if (type.equals(include)) {
                return true;
            }
        }
        return false;
    }

    private boolean checkExclusionFilter(String uri) {
        // Default to no exclusions:
        if (smef == null)
            return true;
        // Otherwise:
        ExclusionFilter ef = smef.get();
        CaptureSearchResult r = new CaptureSearchResult();
        // r.setOriginalUrl(uri);
        r.setUrlKey(uri);
        try {
            if (ef.filterObject(r) == ExclusionFilter.FILTER_INCLUDE) {
                return true;
            }
        } catch (Exception e) {
            log.error("Exclusion filtering failed with exception: " + e);
            e.printStackTrace();
        }
        log.debug("EXCLUDING this URL due to filter: " + uri);
        // Exclude:
        return false;
    }
}
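The class above only covers the extraction step. The separate driver class below is a minimal usage sketch, not part of the original source: it assumes the webarchive-commons ArchiveReaderFactory is on the classpath, that the project's default configuration supplies the required warc.* settings, and it simply prints each resulting Solr document rather than submitting it via SolrWebServer. The class name WARCIndexerExample and the command-line argument handling are illustrative choices.

import java.io.File;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

import uk.bl.wa.solr.SolrRecord;

// Hypothetical driver class for illustration only; not part of warc-indexer.
public class WARCIndexerExample {
    public static void main(String[] args) throws Exception {
        File archive = new File(args[0]); // e.g. a .warc.gz or .arc.gz file
        WARCIndexer indexer = new WARCIndexer(); // default configuration

        // ArchiveReader iterates over every record in the WARC/ARC file:
        ArchiveReader reader = ArchiveReaderFactory.get(archive);
        try {
            for (ArchiveRecord record : reader) {
                SolrRecord solr = indexer.extract(archive.getName(), record);
                // extract() returns null for records that are filtered out or skipped:
                if (solr != null) {
                    System.out.println(solr.getSolrDocument());
                }
            }
        } finally {
            reader.close();
        }
    }
}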