Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.postprocessor; import java.io.UnsupportedEncodingException; import java.util.Map; import org.apache.commons.collections.Closure; import org.archive.crawler.framework.Frontier; import org.archive.crawler.frontier.AbstractFrontier; import org.archive.crawler.frontier.BdbFrontier; import org.archive.crawler.io.UriProcessingFormatter; import org.archive.modules.AMQPProducerProcessor; import org.archive.modules.CrawlURI; import org.archive.modules.net.ServerCache; import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; import com.rabbitmq.client.AMQP; import com.rabbitmq.client.AMQP.BasicProperties; /** * @see UriProcessingFormatter * @contributor nlevitt */ public class AMQPCrawlLogFeed extends AMQPProducerProcessor implements Lifecycle { protected Frontier frontier; public Frontier getFrontier() { return this.frontier; } /** Autowired frontier, needed to determine when a url is finished. */ @Autowired public void setFrontier(Frontier frontier) { this.frontier = frontier; } protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } protected Map<String, String> extraFields; public Map<String, String> getExtraFields() { return extraFields; } public void setExtraFields(Map<String, String> extraFields) { this.extraFields = extraFields; } protected boolean dumpPendingAtClose = false; public boolean getDumpPendingAtClose() { return dumpPendingAtClose; } /** * If true, publish all pending urls (i.e. queued urls still in the * frontier) when crawl job is stopping. They are recognizable by the status * field which has the value 0. * * @see BdbFrontier#setDumpPendingAtClose(boolean) */ public void setDumpPendingAtClose(boolean dumpPendingAtClose) { this.dumpPendingAtClose = dumpPendingAtClose; } public AMQPCrawlLogFeed() { // set default values setExchange("heritrix.realTimeFeed"); setRoutingKey("crawlLog"); } @Override protected byte[] buildMessage(CrawlURI curi) { JSONObject jo = CrawlLogJsonBuilder.buildJson(curi, getExtraFields(), getServerCache()); try { return jo.toString().getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } @Override protected boolean shouldProcess(CrawlURI curi) { if (frontier instanceof AbstractFrontier) { return !((AbstractFrontier) frontier).needsReenqueuing(curi); } else { return false; } } private transient long pendingDumpedCount = 0l; @Override public synchronized void stop() { if (!isRunning) { return; } if (dumpPendingAtClose) { if (frontier instanceof BdbFrontier) { Closure closure = new Closure() { public void execute(Object curi) { try { innerProcessResult((CrawlURI) curi); pendingDumpedCount++; } catch (InterruptedException e) { } } }; logger.info("dumping " + frontier.queuedUriCount() + " queued urls to amqp feed"); ((BdbFrontier) frontier).forAllPendingDo(closure); logger.info("dumped " + pendingDumpedCount + " queued urls to amqp feed"); } else { logger.warning("frontier is not a BdbFrontier, cannot dumpPendingAtClose"); } } // closes amqp connection super.stop(); } protected BasicProperties props = new AMQP.BasicProperties.Builder().contentType("application/json").build(); @Override protected BasicProperties amqpMessageProperties() { return props; } }