// Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler.frontier; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; import org.archive.crawler.event.AMQPUrlReceivedEvent; import org.archive.crawler.event.CrawlStateEvent; import org.archive.crawler.postprocessor.CandidatesProcessor; import org.archive.modules.CrawlURI; import org.archive.modules.SchedulingConstants; import org.archive.modules.extractor.Hop; import org.archive.modules.extractor.LinkContext; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.spring.KeyedProperties; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.springframework.beans.BeansException; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContextAware; import org.springframework.context.ApplicationListener; import 
org.springframework.context.Lifecycle; import com.rabbitmq.client.AMQP.BasicProperties; import com.rabbitmq.client.Channel; import com.rabbitmq.client.Connection; import com.rabbitmq.client.ConnectionFactory; import com.rabbitmq.client.Consumer; import com.rabbitmq.client.DefaultConsumer; import com.rabbitmq.client.Envelope; import com.rabbitmq.client.ShutdownSignalException; /** * @contributor nlevitt */ public class AMQPUrlReceiver implements Lifecycle, ApplicationContextAware, ApplicationListener<CrawlStateEvent> { @SuppressWarnings("unused") private static final long serialVersionUID = 2L; private static final Logger logger = Logger.getLogger(AMQPUrlReceiver.class.getName()); public static final String A_RECEIVED_FROM_AMQP = "receivedFromAMQP"; protected ApplicationContext appCtx; public void setApplicationContext(ApplicationContext appCtx) throws BeansException { this.appCtx = appCtx; } protected CandidatesProcessor candidates; public CandidatesProcessor getCandidates() { return candidates; } /** * Received urls are run through the supplied CandidatesProcessor, which * checks scope and schedules the urls. By default the crawl job's normal * candidates processor is autowired in, but a different one can be * configured if special scoping rules are desired. 
*/ @Autowired public void setCandidates(CandidatesProcessor candidates) { this.candidates = candidates; } protected String amqpUri = "amqp://guest:guest@localhost:5672/%2f"; public String getAmqpUri() { return this.amqpUri; } public void setAmqpUri(String uri) { this.amqpUri = uri; } protected String exchange = "umbra"; public String getExchange() { return exchange; } public void setExchange(String exchange) { this.exchange = exchange; } protected String queueName = "requests"; public String getQueueName() { return queueName; } public void setQueueName(String queueName) { this.queueName = queueName; } protected boolean isRunning = false; @Override public boolean isRunning() { return isRunning; } private boolean durable = false; public boolean isDurable() { return durable; } /** Should be queues be marked as durable? */ public void setDurable(boolean durable) { this.durable = durable; } private boolean autoDelete = true; public boolean isAutoDelete() { return autoDelete; } /** Should be queues be marked as auto-delete? */ public void setAutoDelete(boolean autoDelete) { this.autoDelete = autoDelete; } private boolean forceFetch = false; public boolean isForceFetch() { return forceFetch; } public void setForceFetch(boolean forceFetch) { this.forceFetch = forceFetch; } /** * The maximum prefetch count to use, meaning the maximum number of messages * to be consumed without being acknowledged. Using 'null' would specify * there should be no upper limit (the default). 
*/ private Integer prefetchCount = 1000; private transient Lock lock = new ReentrantLock(true); private transient boolean pauseConsumer = false; private transient String consumerTag = null; private class StarterRestarter extends Thread { public StarterRestarter(String name) { super(name); } @Override public void run() { while (!Thread.interrupted()) { try { lock.lockInterruptibly(); logger.finest("Checking consumerTag=" + consumerTag + " and pauseConsumer=" + pauseConsumer); try { if (consumerTag == null && !pauseConsumer) { // start up again try { startConsumer(); } catch (IOException e) { logger.log(Level.SEVERE, "problem starting AMQP consumer (will try again after 10 seconds)", e); } } if (consumerTag != null && pauseConsumer) { try { if (consumerTag != null) { logger.info("Attempting to cancel URLConsumer with consumerTag=" + consumerTag); channel().basicCancel(consumerTag); consumerTag = null; logger.info("Cancelled URLConsumer."); } } catch (IOException e) { logger.log(Level.SEVERE, "problem cancelling AMQP consumer (will try again after 10 seconds)", e); } } } finally { lock.unlock(); } Thread.sleep(10 * 1000); } catch (InterruptedException e) { return; } } } public void startConsumer() throws IOException { Consumer consumer = new UrlConsumer(channel()); channel().exchangeDeclare(getExchange(), "direct", true); channel().queueDeclare(getQueueName(), durable, false, autoDelete, null); channel().queueBind(getQueueName(), getExchange(), getQueueName()); if (prefetchCount != null) channel().basicQos(prefetchCount); consumerTag = channel().basicConsume(getQueueName(), false, consumer); logger.info("started AMQP consumer uri=" + getAmqpUri() + " exchange=" + getExchange() + " queueName=" + getQueueName() + " consumerTag=" + consumerTag); } } transient private StarterRestarter starterRestarter; @Override public void start() { lock.lock(); try { // spawn off a thread to start up the amqp consumer, and try to restart it if it dies if (!isRunning) { starterRestarter 
= new StarterRestarter( AMQPUrlReceiver.class.getSimpleName() + "-starter-restarter"); try { // try to synchronously start the consumer right now, so // that the queue is bound before crawling starts starterRestarter.startConsumer(); } catch (IOException e) { logger.log(Level.SEVERE, "problem starting AMQP consumer (will try again soon)", e); } starterRestarter.start(); } isRunning = true; } finally { lock.unlock(); } } @Override public void stop() { lock.lock(); try { logger.info("shutting down"); if (starterRestarter != null && starterRestarter.isAlive()) { starterRestarter.interrupt(); try { starterRestarter.join(); } catch (InterruptedException e) { } } starterRestarter = null; if (connection != null && connection.isOpen()) { try { connection.close(); } catch (IOException e) { logger.log(Level.SEVERE, "problem closing AMQP connection", e); } } connection = null; channel = null; isRunning = false; } finally { lock.unlock(); } } transient protected Connection connection = null; transient protected Channel channel = null; protected Connection connection() throws IOException { lock.lock(); try { if (connection != null && !connection.isOpen()) { logger.warning("connection is closed, creating a new one"); connection = null; } if (connection == null) { ConnectionFactory factory = new ConnectionFactory(); try { factory.setUri(getAmqpUri()); } catch (Exception e) { throw new IOException("problem with AMQP uri " + getAmqpUri(), e); } connection = factory.newConnection(); } return connection; } finally { lock.unlock(); } } protected Channel channel() throws IOException { lock.lock(); try { if (channel != null && !channel.isOpen()) { logger.warning("channel is not open, creating a new one"); channel = null; } if (channel == null) { channel = connection().createChannel(); } return channel; } finally { lock.unlock(); } } protected static final Set<String> REQUEST_HEADER_BLACKLIST = new HashSet<String>( Arrays.asList("accept-encoding", "upgrade-insecure-requests", "host", 
"connection")); // XXX should we be using QueueingConsumer because of possible blocking in // frontier.schedule()? // "Note: all methods of this interface are invoked inside the Connection's // thread. This means they a) should be non-blocking and generally do little // work, b) must not call Channel or Connection methods, or a deadlock will // ensue. One way of ensuring this is to use/subclass QueueingConsumer." protected class UrlConsumer extends DefaultConsumer { public UrlConsumer(Channel channel) { super(channel); } @Override public void handleDelivery(String consumerTag, Envelope envelope, BasicProperties properties, byte[] body) throws IOException { String decodedBody; try { decodedBody = new String(body, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); // can't happen } JSONObject jo = new JSONObject(decodedBody); if ("GET".equals(jo.getString("method"))) { try { CrawlURI curi = makeCrawlUri(jo); KeyedProperties.clearAllOverrideContexts(); candidates.runCandidateChain(curi, null); appCtx.publishEvent(new AMQPUrlReceivedEvent(AMQPUrlReceiver.this, curi)); } catch (URIException e) { logger.log(Level.WARNING, "problem creating CrawlURI from json received via AMQP " + decodedBody, e); } catch (JSONException e) { logger.log(Level.SEVERE, "problem creating CrawlURI from json received via AMQP " + decodedBody, e); } catch (Exception e) { logger.log(Level.SEVERE, "Unanticipated problem creating CrawlURI from json received via AMQP " + decodedBody, e); } } else { logger.info("ignoring url with method other than GET - " + decodedBody); } logger.finest("Now ACKing: " + decodedBody); this.getChannel().basicAck(envelope.getDeliveryTag(), false); } @Override public void handleShutdownSignal(String consumerTag, ShutdownSignalException sig) { if (!sig.isInitiatedByApplication()) { logger.log(Level.SEVERE, "amqp channel/connection unexpectedly shut down consumerTag=" + consumerTag, sig); } else { logger.info("amqp channel/connection shut 
down consumerTag=" + consumerTag); } AMQPUrlReceiver.this.consumerTag = null; } // { // "headers": { // "Referer": "https://archive.org/", // "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36", // "Accept": "image/webp,*/*;q=0.8" // }, // "url": "https://analytics.archive.org/0.gif?server_ms=256&server_name=www19.us.archive.org&service=ao&loadtime=358&timediff=-8&locale=en-US&referrer=-&version=2&count=9", // "method": "GET" // } protected CrawlURI makeCrawlUri(JSONObject jo) throws URIException, JSONException { JSONObject joHeaders = jo.getJSONObject("headers"); UURI uuri = UURIFactory.getInstance(jo.getString("url")); UURI via = UURIFactory.getInstance(jo.getString("parentUrl")); JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata"); String parentHopPath = parentUrlMetadata.getString("pathFromSeed"); String hop = jo.optString("hop", Hop.INFERRED.getHopString()); String hopPath = parentHopPath + hop; CrawlURI curi = new CrawlURI(uuri, hopPath, via, LinkContext.INFERRED_MISC); populateHeritableMetadata(curi, parentUrlMetadata); // set the http headers from the amqp message Map<String, String> customHttpRequestHeaders = new HashMap<String, String>(); for (Object key : joHeaders.keySet()) { String k = key.toString(); if (!k.startsWith(":") && !REQUEST_HEADER_BLACKLIST.contains(k)) { customHttpRequestHeaders.put(k, joHeaders.getString(key.toString())); } } curi.getData().put("customHttpRequestHeaders", customHttpRequestHeaders); /* * Crawl job must be configured to use * HighestUriQueuePrecedencePolicy to ensure these high priority * urls really get crawled ahead of others. 
See * https://webarchive.jira.com/wiki/display/Heritrix/Precedence+ * Feature+Notes */ if (Hop.INFERRED.getHopString().equals(curi.getLastHop())) { curi.setSchedulingDirective(SchedulingConstants.HIGH); curi.setPrecedence(1); } curi.setForceFetch(forceFetch || jo.optBoolean("forceFetch")); curi.setSeed(jo.optBoolean("isSeed")); curi.getAnnotations().add(A_RECEIVED_FROM_AMQP); return curi; } // set the heritable data from the parent url, passed back to us via amqp // XXX brittle, only goes one level deep, and only handles strings and arrays, the latter of which it converts to a Set. // 'heritableData': {'source': 'https://facebook.com/whitehouse/', 'heritable': ['source', 'heritable']} @SuppressWarnings("unchecked") protected void populateHeritableMetadata(CrawlURI curi, JSONObject parentUrlMetadata) { JSONObject heritableData = parentUrlMetadata.getJSONObject("heritableData"); for (String key : (Set<String>) heritableData.keySet()) { Object value = heritableData.get(key); if (value instanceof JSONArray) { Set<String> valueSet = new HashSet<String>(); JSONArray arr = ((JSONArray) value); for (int i = 0; i < arr.length(); i++) { valueSet.add(arr.getString(i)); } curi.getData().put(key, valueSet); } else { curi.getData().put(key, heritableData.get(key)); } } } } @Override public void onApplicationEvent(CrawlStateEvent event) { switch (event.getState()) { case PAUSING: case PAUSED: if (!this.pauseConsumer) { logger.info("Requesting a pause of the URLConsumer..."); this.pauseConsumer = true; } break; case RUNNING: if (this.pauseConsumer) { logger.info("Requesting unpause of the URLConsumer..."); this.pauseConsumer = false; } break; default: } } }