bamboo.trove.common.BaseWarcDomainManager.java Source code

Java tutorial

Introduction

Here is the source code for bamboo.trove.common.BaseWarcDomainManager.java

Source

/**
 * Copyright 2016 National Library of Australia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package bamboo.trove.common;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

import au.gov.nla.trove.indexer.api.BaseDomainManager;
import au.gov.nla.trove.indexer.api.EndPointDomainManager;
import au.gov.nla.trove.indexer.api.WorkProcessor;
import bamboo.task.Document;
import bamboo.trove.services.FilteringCoordinationService;
import bamboo.trove.workers.FilterWorker;
import bamboo.trove.workers.IndexerWorker;
import bamboo.trove.workers.TransformWorker;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.SharedMetricRegistries;
import com.codahale.metrics.Timer;
import com.codahale.metrics.UniformReservoir;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;

/**
 * Common base for the WARC indexing domains.
 *
 * <p>All domains share (via static state) a single set of filter, transform and index queues,
 * the worker pools that drain them, and the metrics describing reads from Bamboo. One domain
 * configures and starts that shared state through {@link #setBambooApiBaseUrl(String)},
 * {@link #setWorkerCounts(int, int, int)} and
 * {@link #startMe(EndPointDomainManager, FilteringCoordinationService)}; the others can block in
 * {@link #waitUntilStarted()} until that has happened.
 */
public abstract class BaseWarcDomainManager extends BaseDomainManager implements Runnable {
    private static final Logger log = LoggerFactory.getLogger(BaseWarcDomainManager.class);

    /** When true, {@link #autoStart()} calls {@link #start()}. */
    protected boolean runAtStart;

    /**
     * Sets whether this domain starts itself when {@link #autoStart()} is invoked.
     *
     * @param runAtStart true to have {@link #autoStart()} call {@link #start()}
     */
    @Required
    public void setRunAtStart(boolean runAtStart) {
        this.runAtStart = runAtStart;
    }

    // Reading data from Bamboo
    private static Timer bambooReadTimer;
    private static Timer bambooParseTimer;
    private static Histogram warcDocCountHistogram;
    private static Histogram warcSizeHistogram;
    private static String bambooBaseUrl;
    private final ObjectMapper objectMapper = new ObjectMapper();
    private final JsonFactory jsonFactory = new JsonFactory();

    // Managing pool tasks
    private static final Queue<IndexerDocument> filterQueue = new ConcurrentLinkedQueue<>();
    private static final Queue<IndexerDocument> transformQueue = new ConcurrentLinkedQueue<>();
    private static final Queue<IndexerDocument> indexQueue = new ConcurrentLinkedQueue<>();

    // Performing pool tasks
    private static int filterPoolLimit;
    private static WorkProcessor filterPool;
    private static int transformPoolLimit;
    private static WorkProcessor transformPool;
    private static int indexPoolLimit;
    private static WorkProcessor indexPool;

    /**
     * Guards the static configuration setters.
     *
     * @throws IllegalStateException if the shared state has already been started
     */
    private static void notAlreadyStarted() {
        if (imStarted) {
            throw new IllegalStateException("The domain has already started!");
        }
    }

    /**
     * Sets the base URL used to build Bamboo requests. The WARC path is appended directly to this
     * value (see {@link #getAndEnqueueWarc(long, long, long)}), so it must end with a '/'.
     *
     * @param newBambooBaseUrl base URL of the Bamboo API
     * @throws IllegalStateException if the shared state has already been started
     */
    protected static void setBambooApiBaseUrl(String newBambooBaseUrl) {
        notAlreadyStarted();
        bambooBaseUrl = newBambooBaseUrl;
    }

    /**
     * Sets how many workers each shared pool is given when it is started.
     *
     * @param filters      number of filter workers
     * @param transformers number of transform workers
     * @param indexers     number of index workers
     * @throws IllegalStateException if the shared state has already been started
     */
    protected static void setWorkerCounts(int filters, int transformers, int indexers) {
        notAlreadyStarted();
        filterPoolLimit = filters;
        transformPoolLimit = transformers;
        indexPoolLimit = indexers;
    }

    // This could be more elegant... but rather than have static init() code here that runs itself we
    // want to get config from a particular domain. This is solely to be consistent with how the standard
    // Trove indexer works. So here we want all domains to park and wait until after the 'full' domain has
    // started the shared worker pools.
    //
    // Volatile because waitUntilStarted() polls this flag from other threads without holding any lock;
    // the volatile write at the end of startMe() also publishes the pools and metrics assigned before it.
    private static volatile boolean imStarted = false;

    /**
     * Creates the shared metrics and worker pools, then marks the shared state as started.
     *
     * @param solr      end point handed to every {@link IndexerWorker}
     * @param filtering filtering service handed to every {@link FilterWorker}; it also supplies the
     *                  name of the metrics registry to use
     * @throws IllegalStateException if called more than once
     */
    protected static synchronized void startMe(EndPointDomainManager solr, FilteringCoordinationService filtering) {
        if (imStarted) {
            throw new IllegalStateException("You started me twice!");
        }
        log.info("Bamboo Base URL  : {}", bambooBaseUrl);
        log.info("Metrics registry : {}", filtering.getMetricsRegistryName());
        log.info("# Filters        : {}", filterPoolLimit);
        log.info("# Transformers   : {}", transformPoolLimit);
        log.info("# Indexers       : {}", indexPoolLimit);

        // Metrics are fun...
        MetricRegistry metrics = SharedMetricRegistries.getOrCreate(filtering.getMetricsRegistryName());
        registerBambooMetrics(metrics);
        Timer filterTimer = new Timer();
        metrics.register("filterTimer", filterTimer);
        Timer transformTimer = new Timer();
        metrics.register("transformTimer", transformTimer);
        Timer indexTimer = new Timer();
        metrics.register("indexTimer", indexTimer);

        // Filter workers
        filterPool = new WorkProcessor(filterPoolLimit);
        for (int i = 0; i < filterPoolLimit; i++) {
            filterPool.process(new FilterWorker(filtering, filterTimer));
        }

        // Transform workers
        transformPool = new WorkProcessor(transformPoolLimit);
        for (int i = 0; i < transformPoolLimit; i++) {
            transformPool.process(new TransformWorker(transformTimer));
        }

        // Indexing workers
        indexPool = new WorkProcessor(indexPoolLimit);
        for (int i = 0; i < indexPoolLimit; i++) {
            indexPool.process(new IndexerWorker(solr, indexTimer));
        }

        // Must stay last: this is the signal waitUntilStarted() is polling for.
        imStarted = true;
    }

    /**
     * Creates the timers and histograms that describe reads from Bamboo and registers them under
     * their fixed names. Shared by the real start-up path and the unit-test hook so the two cannot
     * drift apart.
     *
     * @param metrics registry to register the four Bamboo metrics in
     */
    private static void registerBambooMetrics(MetricRegistry metrics) {
        bambooReadTimer = new Timer();
        metrics.register("bambooReadTimer", bambooReadTimer);
        bambooParseTimer = new Timer();
        metrics.register("bambooParseTimer", bambooParseTimer);
        warcDocCountHistogram = new Histogram(new UniformReservoir());
        metrics.register("warcDocCountHistogram", warcDocCountHistogram);
        warcSizeHistogram = new Histogram(new UniformReservoir());
        metrics.register("warcSizeHistogram", warcSizeHistogram);
    }

    /**
     * Test hook: sets up only the Bamboo read/parse metrics, without starting any worker pool.
     *
     * @param metricsRegistryName name of the shared metrics registry to register in
     * @throws IllegalStateException if the shared state has already been started
     */
    @VisibleForTesting
    public static void forTestSetMetricsRegistryName(String metricsRegistryName) {
        if (imStarted) {
            throw new IllegalStateException("Unit tests only!!!");
        }
        registerBambooMetrics(SharedMetricRegistries.getOrCreate(metricsRegistryName));
    }

    /**
     * Test hook: sets the Bamboo base URL.
     *
     * @param bambooBaseUrl base URL of the Bamboo API
     * @throws IllegalStateException if the shared state has already been started
     */
    @VisibleForTesting
    public static void forTestSetBambooBaseUrl(String bambooBaseUrl) {
        if (imStarted) {
            throw new IllegalStateException("Unit tests only!!!");
        }
        BaseWarcDomainManager.bambooBaseUrl = bambooBaseUrl;
    }

    /**
     * Blocks the calling thread, polling once per second, until
     * {@link #startMe(EndPointDomainManager, FilteringCoordinationService)} has completed.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    protected static void waitUntilStarted() throws InterruptedException {
        while (!imStarted) {
            Thread.sleep(1000);
        }
    }

    @Override
    public abstract boolean isRunning();

    @Override
    public abstract boolean isStopping();

    @Override
    public abstract void start();

    @Override
    public abstract void stop();

    @Override
    public abstract String getName();

    @Override
    public abstract long getUpdateCount();

    @Override
    public abstract String getLastIdProcessed();

    /**
     * Hands a finished filter job on to the transform queue and fetches the next document to filter.
     *
     * @param lastJob the document just filtered, or null if there is nothing to hand on
     * @return the next document awaiting filtering, or null if the filter queue is empty
     */
    public static IndexerDocument getNextFilterJob(IndexerDocument lastJob) {
        if (lastJob != null) {
            transformQueue.offer(lastJob);
        }
        return filterQueue.poll();
    }

    /**
     * Hands a finished transform job on to the index queue and fetches the next document to transform.
     *
     * @param lastJob the document just transformed, or null if there is nothing to hand on
     * @return the next document awaiting transformation, or null if the transform queue is empty
     */
    public static IndexerDocument getNextTransformJob(IndexerDocument lastJob) {
        if (lastJob != null) {
            indexQueue.offer(lastJob);
        }
        return transformQueue.poll();
    }

    /**
     * Fetches the next document to index.
     *
     * @param lastJob ignored: indexing is the last stage, so there is no further queue to hand the
     *                finished document to. Kept so the three getNext*Job methods share one shape.
     * @return the next document awaiting indexing, or null if the index queue is empty
     */
    public static IndexerDocument getNextIndexJob(IndexerDocument lastJob) {
        return indexQueue.poll();
    }

    /**
     * Same as {@link #getAndEnqueueWarc(long, long, long)} with a tracked offset of -1.
     *
     * @param warcId           id of the WARC to fetch from Bamboo
     * @param urlCountEstimate passed through to {@link #newWarc(long, long, long)}
     * @return the populated progress manager, or null if the load failed
     */
    public WarcProgressManager getAndEnqueueWarc(long warcId, long urlCountEstimate) {
        return getAndEnqueueWarc(warcId, -1, urlCountEstimate);
    }

    // Full domain overrides this
    /**
     * Factory for the per-WARC progress tracker used by {@link #getAndEnqueueWarc(long, long, long)}.
     *
     * @param warcId           id of the WARC
     * @param trackedOffset    passed straight to the {@link WarcProgressManager} constructor
     * @param urlCountEstimate passed straight to the {@link WarcProgressManager} constructor
     * @return a new progress manager for the WARC
     */
    protected WarcProgressManager newWarc(long warcId, long trackedOffset, long urlCountEstimate) {
        return new WarcProgressManager(warcId, trackedOffset, urlCountEstimate);
    }

    /**
     * Fetches the document list for one WARC from Bamboo ({@code <base>warcs/<id>/text}), tracks every
     * document in a new {@link WarcProgressManager} and places each one on the filter queue.
     *
     * <p>Failures are logged rather than thrown. Documents are enqueued as they are parsed, so a
     * failure part-way through the response may leave some documents already on the filter queue;
     * the progress manager is marked as failed in that case.
     *
     * @param warcId           id of the WARC to fetch from Bamboo
     * @param trackedOffset    passed through to {@link #newWarc(long, long, long)}
     * @param urlCountEstimate passed through to {@link #newWarc(long, long, long)}
     * @return the populated progress manager (marked load-complete), or null if anything went wrong
     */
    public WarcProgressManager getAndEnqueueWarc(long warcId, long trackedOffset, long urlCountEstimate) {
        Timer.Context ctx = bambooReadTimer.time();
        HttpURLConnection connection = null;
        WarcProgressManager warc = newWarc(warcId, trackedOffset, urlCountEstimate);

        try {
            URL url = new URL(bambooBaseUrl + "warcs/" + warcId + "/text");
            connection = (HttpURLConnection) url.openConnection();
            // Close the response stream deterministically instead of relying on disconnect().
            try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
                parseJson(warc, in);
            }
            warc.setLoadComplete();
            return warc;

        } catch (IOException e) {
            // Trailing throwable argument keeps the stack trace in the log.
            log.error("Error talking to Bamboo (warc#{}): {}", warcId, e.getMessage(), e);
            warc.setLoadFailed();
            return null;

        } catch (Exception e) {
            log.error("Unknown error getting data from Bamboo (warc#{}): {}", warcId, e.getMessage(), e);
            warc.setLoadFailed();
            return null;

        } finally {
            if (connection != null) {
                connection.disconnect();
            }
            ctx.stop();
        }
    }

    /**
     * @return the Jackson mapper this instance uses to bind each JSON object to a {@link Document}
     */
    protected ObjectMapper getObjectMapper() {
        return objectMapper;
    }

    /**
     * Creates the streaming parser used to read a Bamboo response.
     *
     * @param in stream containing the JSON response
     * @return a parser positioned before the first token
     * @throws IOException if the parser cannot be created
     */
    protected JsonParser createParser(InputStream in) throws IOException {
        return jsonFactory.createParser(in);
    }

    // Important to note that this stream based parsing is this simple only because the POJOs being
    // parsed are very simple. If the Document class became more complicated this method would have to
    // be as well.
    /**
     * Streams a JSON array of {@link Document} objects, adding each to {@code warc} and to the filter
     * queue as it is read, then records the document count and total content length.
     *
     * @param warc progress manager that tracks the parsed documents
     * @param in   stream containing the JSON response
     * @throws IOException              if the stream cannot be read or parsed
     * @throws IllegalArgumentException if the response is empty or does not start with an array
     */
    private void parseJson(WarcProgressManager warc, InputStream in) throws IOException {
        Timer.Context ctx = bambooParseTimer.time();
        // One try/finally covers creation, validation and the read loop, so the timer is always
        // stopped and the parser is always closed, whichever way we leave.
        try (JsonParser json = createParser(in)) {
            JsonToken token = json.nextToken();
            if (token == null) {
                throw new IllegalArgumentException("No JSON data found in response");
            }
            if (token != JsonToken.START_ARRAY) {
                throw new IllegalArgumentException("JSON response is not an array");
            }

            long warcSize = 0;
            while (json.nextToken() == JsonToken.START_OBJECT) {
                Document d = objectMapper.readValue(json, Document.class);
                warcSize += d.getContentLength();
                // Track it by batch
                IndexerDocument doc = warc.add(d);
                // Enqueue it for work
                filterQueue.offer(doc);
            }
            warcDocCountHistogram.update(warc.size());
            warcSizeHistogram.update(warcSize);
            warc.setBatchBytes(warcSize);

        } finally {
            ctx.stop();
        }
    }

    /**
     * Calls {@link #start()} if this domain was configured with {@code runAtStart} set to true.
     */
    @Override
    public void autoStart() {
        if (runAtStart) {
            start();
        }
    }
}