org.archive.modules.postprocessor.KafkaCrawlLogFeed.java Source code

Introduction

Here is the source code for org.archive.modules.postprocessor.KafkaCrawlLogFeed.java, a post-processor from the Heritrix web crawler (crawler.archive.org) that feeds crawl-log entries to Apache Kafka.

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.postprocessor;

import java.io.UnsupportedEncodingException;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.serialization.ByteArraySerializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.frontier.AbstractFrontier;
import org.archive.crawler.frontier.BdbFrontier;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.ServerCache;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/**
 * Feeds each finished url's crawl-log entry to Kafka using the
 * {@link KafkaProducer} client (the "new" producer introduced in Kafka
 * 0.8.2). Messages are sent asynchronously, waiting only for the partition
 * leader's acknowledgment (acks=1), and with no key. These things could be
 * made configurable if needed.
 * 
 * @see UriProcessingFormatter
 * @contributor nlevitt
 */
public class KafkaCrawlLogFeed extends Processor implements Lifecycle {

    protected static final Logger logger = Logger.getLogger(KafkaCrawlLogFeed.class.getName());

    protected Frontier frontier;

    public Frontier getFrontier() {
        return this.frontier;
    }

    /** Autowired frontier, needed to determine when a url is finished. */
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    protected ServerCache serverCache;

    public ServerCache getServerCache() {
        return this.serverCache;
    }

    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    protected Map<String, String> extraFields;

    public Map<String, String> getExtraFields() {
        return extraFields;
    }

    public void setExtraFields(Map<String, String> extraFields) {
        this.extraFields = extraFields;
    }

    protected boolean dumpPendingAtClose = false;

    public boolean getDumpPendingAtClose() {
        return dumpPendingAtClose;
    }

    /**
     * If true, publish all pending urls (i.e. queued urls still in the
     * frontier) when the crawl job is stopping. They are recognizable by the
     * status field, which has the value 0.
     *
     * @see BdbFrontier#setDumpPendingAtClose(boolean)
     */
    public void setDumpPendingAtClose(boolean dumpPendingAtClose) {
        this.dumpPendingAtClose = dumpPendingAtClose;
    }

    protected String brokerList = "localhost:9092";

    /** Kafka broker list (used as the producer's "bootstrap.servers" property). */
    public void setBrokerList(String brokerList) {
        this.brokerList = brokerList;
    }

    public String getBrokerList() {
        return brokerList;
    }

    protected String topic = "heritrix-crawl-log";

    public void setTopic(String topic) {
        this.topic = topic;
    }

    public String getTopic() {
        return topic;
    }

    protected byte[] buildMessage(CrawlURI curi) {
        JSONObject jo = CrawlLogJsonBuilder.buildJson(curi, getExtraFields(), getServerCache());
        try {
            return jo.toString().getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

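    /**
     * Only feed urls that the frontier considers truly finished; urls the
     * frontier will re-enqueue (e.g. for retry) are skipped so they are not
     * reported prematurely.
     */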
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        if (frontier instanceof AbstractFrontier) {
            return !((AbstractFrontier) frontier).needsReenqueuing(curi);
        } else {
            return false;
        }
    }

    private transient long pendingDumpedCount = 0L;

    @Override
    public synchronized void stop() {
        if (!isRunning) {
            return;
        }

        if (dumpPendingAtClose) {
            if (frontier instanceof BdbFrontier) {

                Closure closure = new Closure() {
                    public void execute(Object curi) {
                        try {
                            innerProcess((CrawlURI) curi);
                            pendingDumpedCount++;
                        } catch (InterruptedException e) {
                            // ignore; keep dumping the remaining queued urls
                        }
                    }
                };

                logger.info("dumping " + frontier.queuedUriCount() + " queued urls to kafka feed");
                ((BdbFrontier) frontier).forAllPendingDo(closure);
                logger.info("dumped " + pendingDumpedCount + " queued urls to kafka feed");
            } else {
                logger.warning("frontier is not a BdbFrontier, cannot dumpPendingAtClose");
            }
        }

        if (kafkaProducer != null) {
            kafkaProducer.close();
            kafkaProducer = null;
        }
        if (kafkaProducerThreads != null) {
            kafkaProducerThreads.destroy();
            kafkaProducerThreads = null;
        }

        super.stop();
    }

    private transient ThreadGroup kafkaProducerThreads;

    transient protected KafkaProducer<String, byte[]> kafkaProducer;

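    /**
     * Lazily creates the producer, with double-checked locking, on a worker
     * thread owned by a separate thread group so that the producer's threads
     * are not interrupted by ToePool.cleanup() at the end of the crawl (see
     * the comment inside).
     */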
    protected KafkaProducer<String, byte[]> kafkaProducer() {
        if (kafkaProducer == null) {
            synchronized (this) {
                if (kafkaProducer == null) {
                    final Properties props = new Properties();
                    props.put("bootstrap.servers", getBrokerList());
                    props.put("acks", "1");
                    props.put("producer.type", "async");
                    props.put("key.serializer", StringSerializer.class.getName());
                    props.put("value.serializer", ByteArraySerializer.class.getName());

                    /*
                     * XXX This mess here exists so that the kafka producer
                     * thread is in a thread group that is not the ToePool,
                     * so that it doesn't get interrupted at the end of the
                     * crawl in ToePool.cleanup(). 
                     */
                    kafkaProducerThreads = new ThreadGroup(Thread.currentThread().getThreadGroup().getParent(),
                            "KafkaProducerThreads");
                    ThreadFactory threadFactory = new ThreadFactory() {
                        public Thread newThread(Runnable r) {
                            return new Thread(kafkaProducerThreads, r);
                        }
                    };
                    Callable<KafkaProducer<String, byte[]>> task = new Callable<KafkaProducer<String, byte[]>>() {
                        public KafkaProducer<String, byte[]> call() throws InterruptedException {
                            return new KafkaProducer<String, byte[]>(props);
                        }
                    };
                    ExecutorService executorService = Executors.newFixedThreadPool(1, threadFactory);
                    Future<KafkaProducer<String, byte[]>> future = executorService.submit(task);
                    try {
                        kafkaProducer = future.get();
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e);
                    } catch (ExecutionException e) {
                        throw new RuntimeException(e);
                    } finally {
                        executorService.shutdown();
                    }
                }
            }
        }
        return kafkaProducer;
    }

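    /** Producer callback that logs a warning if delivery of a crawl log entry fails. */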
    protected static class KafkaResultCallback implements Callback {
        private CrawlURI curi;

        public KafkaResultCallback(CrawlURI curi) {
            this.curi = curi;
        }

        @Override
        public void onCompletion(RecordMetadata metadata, Exception exception) {
            if (exception != null) {
                logger.warning("kafka delivery failed for " + curi + " - " + exception);
            }
        }
    }

    @Override
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        byte[] message = buildMessage(curi);
        ProducerRecord<String, byte[]> producerRecord = new ProducerRecord<String, byte[]>(getTopic(), message);
        kafkaProducer().send(producerRecord, new KafkaResultCallback(curi));
    }
}
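
Example

KafkaCrawlLogFeed is a Heritrix Processor; in a crawl job it would typically be declared as a Spring bean (for example in crawler-beans.cxml) with the frontier and server cache autowired, and with properties such as brokerList, topic, extraFields and dumpPendingAtClose set as needed. The sketch below is an illustrative, standalone consumer for the messages this feed publishes. It is not part of Heritrix: it assumes a Kafka client new enough to provide the KafkaConsumer API (0.9 or later), the default broker and topic values shown above, and a made-up consumer group name. The check on the "status" field follows the setDumpPendingAtClose javadoc, which says pending urls are published with status 0.

/*
 * Illustrative consumer sketch, not part of Heritrix. Reads the crawl-log
 * JSON messages published by KafkaCrawlLogFeed, assuming the defaults above
 * (broker localhost:9092, topic "heritrix-crawl-log").
 */
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.json.JSONObject;

public class CrawlLogConsumerSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092"); // same default as brokerList above
        props.put("group.id", "crawl-log-example");       // hypothetical consumer group
        props.put("key.deserializer", StringDeserializer.class.getName());
        props.put("value.deserializer", ByteArrayDeserializer.class.getName());

        try (KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<String, byte[]>(props)) {
            consumer.subscribe(Collections.singletonList("heritrix-crawl-log"));
            while (true) {
                ConsumerRecords<String, byte[]> records = consumer.poll(1000L);
                for (ConsumerRecord<String, byte[]> record : records) {
                    // each value is a UTF-8 JSON document built by CrawlLogJsonBuilder
                    JSONObject jo = new JSONObject(new String(record.value(), StandardCharsets.UTF_8));
                    if (jo.optInt("status") == 0) {
                        // queued-but-unfetched url dumped at close (see setDumpPendingAtClose)
                        continue;
                    }
                    System.out.println(jo);
                }
            }
        }
    }
}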