org.apache.storm.kafka.spout.KafkaSpoutConfig.java Source code

Introduction

Here is the source code for org.apache.storm.kafka.spout.KafkaSpoutConfig.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 *   or more contributor license agreements.  See the NOTICE file
 *   distributed with this work for additional information
 *   regarding copyright ownership.  The ASF licenses this file
 *   to you under the Apache License, Version 2.0 (the
 *   "License"); you may not use this file except in compliance
 *   with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */

package org.apache.storm.kafka.spout;

import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.apache.commons.lang.builder.ToStringStyle;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.storm.Config;
import org.apache.storm.annotation.InterfaceStability;
import org.apache.storm.kafka.spout.KafkaSpoutRetryExponentialBackoff.TimeInterval;
import org.apache.storm.kafka.spout.internal.CommonKafkaSpoutConfig;
import org.apache.storm.kafka.spout.subscription.ManualPartitioner;
import org.apache.storm.kafka.spout.subscription.TopicFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * KafkaSpoutConfig defines the required configuration to connect a consumer to a consumer group, as well as the topics to subscribe to.
 */
public class KafkaSpoutConfig<K, V> extends CommonKafkaSpoutConfig<K, V> {

    private static final long serialVersionUID = 141902646130682494L;
    public static final long DEFAULT_OFFSET_COMMIT_PERIOD_MS = 30_000;
    // Retry forever
    public static final int DEFAULT_MAX_RETRIES = Integer.MAX_VALUE;
    // 10,000,000 records => 80 MB of memory footprint in the worst case
    public static final int DEFAULT_MAX_UNCOMMITTED_OFFSETS = 10_000_000;

    public static final KafkaSpoutRetryService DEFAULT_RETRY_SERVICE = new KafkaSpoutRetryExponentialBackoff(
            TimeInterval.seconds(0), TimeInterval.milliSeconds(2), DEFAULT_MAX_RETRIES, TimeInterval.seconds(10));

    public static final ProcessingGuarantee DEFAULT_PROCESSING_GUARANTEE = ProcessingGuarantee.AT_LEAST_ONCE;

    public static final KafkaTupleListener DEFAULT_TUPLE_LISTENER = new EmptyKafkaTupleListener();
    public static final Logger LOG = LoggerFactory.getLogger(KafkaSpoutConfig.class);

    public static final int DEFAULT_METRICS_TIME_BUCKET_SIZE_SECONDS = 60;

    // Kafka spout configuration
    private final long offsetCommitPeriodMs;
    private final int maxUncommittedOffsets;
    private final KafkaSpoutRetryService retryService;
    private final KafkaTupleListener tupleListener;
    private final boolean emitNullTuples;
    private final ProcessingGuarantee processingGuarantee;
    private final boolean tupleTrackingEnforced;
    private final int metricsTimeBucketSizeInSecs;

    /**
     * Creates a new KafkaSpoutConfig using a Builder.
     *
     * @param builder The Builder to construct the KafkaSpoutConfig from
     */
    public KafkaSpoutConfig(Builder<K, V> builder) {
        super(builder.setKafkaPropsForProcessingGuarantee());
        this.offsetCommitPeriodMs = builder.offsetCommitPeriodMs;
        this.maxUncommittedOffsets = builder.maxUncommittedOffsets;
        this.retryService = builder.retryService;
        this.tupleListener = builder.tupleListener;
        this.emitNullTuples = builder.emitNullTuples;
        this.processingGuarantee = builder.processingGuarantee;
        this.tupleTrackingEnforced = builder.tupleTrackingEnforced;
        this.metricsTimeBucketSizeInSecs = builder.metricsTimeBucketSizeInSecs;
    }

    /**
     * This enum controls when the tuple with the {@link ConsumerRecord} for an offset is marked as processed,
     * i.e. when the offset can be committed to Kafka. The default value is AT_LEAST_ONCE.
     * The commit interval is controlled by {@link KafkaSpoutConfig#getOffsetsCommitPeriodMs() }, if the mode commits on an interval.
 * NO_GUARANTEE may be removed in a later release without warning; we're still evaluating whether it makes sense to keep it.
     */
    @InterfaceStability.Unstable
    public enum ProcessingGuarantee {
        /**
         * An offset is ready to commit only after the corresponding tuple has been processed and acked (at least once). If a tuple fails or
         * times out it will be re-emitted, as controlled by the {@link KafkaSpoutRetryService}. Commits synchronously on the defined
         * interval.
         */
        AT_LEAST_ONCE,
        /**
         * Every offset will be synchronously committed to Kafka right after being polled but before being emitted to the downstream
         * components of the topology. The commit interval is ignored. This mode guarantees that the offset is processed at most once by
         * ensuring the spout won't retry tuples that fail or time out after the commit to Kafka has been done.
         */
        AT_MOST_ONCE,
        /**
         * Offsets are ready to commit immediately after being polled. The offsets are committed periodically, i.e. a message may
         * be processed 0, 1 or more times. This behavior is similar to setting enable.auto.commit=true in the consumer, but allows the
         * spout to control when commits occur. Commits asynchronously on the defined interval.
         */
        NO_GUARANTEE,
    }
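
    // Illustrative usage sketch (the broker address and topic name are placeholders): the guarantee
    // is chosen on the builder before the config is built, e.g. for at-most-once processing:
    //
    //   KafkaSpoutConfig<String, String> config =
    //       KafkaSpoutConfig.builder("localhost:9092", "my-topic")
    //           .setProcessingGuarantee(ProcessingGuarantee.AT_MOST_ONCE)
    //           .build();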

    public static class Builder<K, V> extends CommonKafkaSpoutConfig.Builder<K, V, Builder<K, V>> {

        private long offsetCommitPeriodMs = DEFAULT_OFFSET_COMMIT_PERIOD_MS;
        private int maxUncommittedOffsets = DEFAULT_MAX_UNCOMMITTED_OFFSETS;
        private KafkaSpoutRetryService retryService = DEFAULT_RETRY_SERVICE;
        private KafkaTupleListener tupleListener = DEFAULT_TUPLE_LISTENER;
        private boolean emitNullTuples = false;
        private ProcessingGuarantee processingGuarantee = DEFAULT_PROCESSING_GUARANTEE;
        private boolean tupleTrackingEnforced = false;
        private int metricsTimeBucketSizeInSecs = DEFAULT_METRICS_TIME_BUCKET_SIZE_SECONDS;

        public Builder(String bootstrapServers, String... topics) {
            super(bootstrapServers, topics);
        }

        public Builder(String bootstrapServers, Set<String> topics) {
            super(bootstrapServers, topics);
        }

        public Builder(String bootstrapServers, Pattern topics) {
            super(bootstrapServers, topics);
        }

        /**
         * Create a KafkaSpoutConfig builder with default property values and no key/value deserializers.
         *
         * @param bootstrapServers The bootstrap servers the consumer will use
         * @param topicFilter The topic filter defining which topics and partitions the spout will read
         * @param topicPartitioner The topic partitioner defining which topics and partitions are assigned to each spout task
         */
        public Builder(String bootstrapServers, TopicFilter topicFilter, ManualPartitioner topicPartitioner) {
            super(bootstrapServers, topicFilter, topicPartitioner);
        }

        //Spout Settings
        /**
         * Specifies the period, in milliseconds, at which the offset commit task runs. Default is 30 seconds.
         *
         * <p>This setting only has an effect if the configured {@link ProcessingGuarantee} is {@link ProcessingGuarantee#AT_LEAST_ONCE} or
         * {@link ProcessingGuarantee#NO_GUARANTEE}.
         *
         * @param offsetCommitPeriodMs time in ms
         */
        public Builder<K, V> setOffsetCommitPeriodMs(long offsetCommitPeriodMs) {
            this.offsetCommitPeriodMs = offsetCommitPeriodMs;
            return this;
        }

        /**
         * Defines the max number of polled offsets (records) that can be pending commit, before another poll can take place.
         * Once this limit is reached, no more offsets (records) can be polled until the next successful commit(s) sets the number
         * of pending offsets below the threshold. The default is {@link #DEFAULT_MAX_UNCOMMITTED_OFFSETS}.
         * This limit is per partition and may in some cases be exceeded,
         * but each partition cannot exceed this limit by more than maxPollRecords - 1.
         * 
         * <p>This setting only has an effect if the configured {@link ProcessingGuarantee} is {@link ProcessingGuarantee#AT_LEAST_ONCE}.
         *
         * @param maxUncommittedOffsets max number of records that can be pending commit
         */
        public Builder<K, V> setMaxUncommittedOffsets(int maxUncommittedOffsets) {
            this.maxUncommittedOffsets = maxUncommittedOffsets;
            return this;
        }

        /**
         * Sets the retry service for the spout to use.
         *
         * <p>This setting only has an effect if the configured {@link ProcessingGuarantee} is {@link ProcessingGuarantee#AT_LEAST_ONCE}.
         *
         * @param retryService the new retry service
         * @return the builder (this).
         */
        public Builder<K, V> setRetry(KafkaSpoutRetryService retryService) {
            if (retryService == null) {
                throw new NullPointerException("retryService cannot be null");
            }
            this.retryService = retryService;
            return this;
        }

        /**
         * Sets the tuple listener for the spout to use.
         *
         * @param tupleListener the tuple listener
         * @return the builder (this).
         */
        public Builder<K, V> setTupleListener(KafkaTupleListener tupleListener) {
            if (tupleListener == null) {
                throw new NullPointerException("KafkaTupleListener cannot be null");
            }
            this.tupleListener = tupleListener;
            return this;
        }

        /**
         * Specifies whether the spout should emit null tuples to the downstream component, or instead ack them directly without
         * emitting them. By default this parameter is set to false, which means that null tuples are not emitted.
         *
         * @param emitNullTuples sets if null tuples should or not be emitted downstream
         */
        public Builder<K, V> setEmitNullTuples(boolean emitNullTuples) {
            this.emitNullTuples = emitNullTuples;
            return this;
        }

        /**
         * Specifies which processing guarantee the spout should offer. Refer to the documentation for {@link ProcessingGuarantee}.
         *
         * @param processingGuarantee The processing guarantee the spout should offer.
         */
        public Builder<K, V> setProcessingGuarantee(ProcessingGuarantee processingGuarantee) {
            this.processingGuarantee = processingGuarantee;
            return this;
        }

        /**
         * Specifies whether the spout should require Storm to track emitted tuples when using a {@link ProcessingGuarantee} other than
         * {@link ProcessingGuarantee#AT_LEAST_ONCE}. The spout will always track emitted tuples when offering at-least-once guarantees
         * regardless of this setting. This setting is false by default.
         *
         * <p>Enabling tracking can be useful even in cases where reliability is not a concern, because it allows
         * {@link Config#TOPOLOGY_MAX_SPOUT_PENDING} to have an effect, and enables some spout metrics (e.g. complete-latency) that would
         * otherwise be disabled.
         *
         * @param tupleTrackingEnforced true if Storm should track emitted tuples, false otherwise
         */
        public Builder<K, V> setTupleTrackingEnforced(boolean tupleTrackingEnforced) {
            this.tupleTrackingEnforced = tupleTrackingEnforced;
            return this;
        }

        /**
         * The time period that metrics data is bucketed into.
         * @param metricsTimeBucketSizeInSecs time in seconds
         */
        public Builder<K, V> setMetricsTimeBucketSizeInSecs(int metricsTimeBucketSizeInSecs) {
            this.metricsTimeBucketSizeInSecs = metricsTimeBucketSizeInSecs;
            return this;
        }

        private Builder<K, V> withStringDeserializers() {
            setProp(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
            setProp(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
            return this;
        }

        private Builder<K, V> setKafkaPropsForProcessingGuarantee() {
            if (getKafkaProps().containsKey(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG)) {
                throw new IllegalStateException("The KafkaConsumer " + ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG
                        + " setting is not supported."
                        + " You can configure similar behavior through KafkaSpoutConfig.Builder.setProcessingGuarantee");
            }
            String autoOffsetResetPolicy = (String) getKafkaProps().get(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG);
            if (processingGuarantee == ProcessingGuarantee.AT_LEAST_ONCE) {
                if (autoOffsetResetPolicy == null) {
                    /*
                     * If the user wants to explicitly set an auto offset reset policy, we should respect it, but when the spout is
                     * configured for at-least-once processing we should default to seeking to the earliest offset in case there's an offset
                     * out of range error, rather than seeking to the latest (Kafka's default). This type of error will typically happen
                     * when the consumer requests an offset that was deleted.
                     */
                    LOG.info(
                            "Setting Kafka consumer property '{}' to 'earliest' to ensure at-least-once processing",
                            ConsumerConfig.AUTO_OFFSET_RESET_CONFIG);
                    setProp(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
                } else if (!autoOffsetResetPolicy.equals("earliest") && !autoOffsetResetPolicy.equals("none")) {
                    LOG.warn(
                            "Cannot guarantee at-least-once processing with auto.offset.reset.policy other than 'earliest' or 'none'."
                                    + " Some messages may be skipped.");
                }
            } else if (processingGuarantee == ProcessingGuarantee.AT_MOST_ONCE) {
                if (autoOffsetResetPolicy != null
                        && (!autoOffsetResetPolicy.equals("latest") && !autoOffsetResetPolicy.equals("none"))) {
                    LOG.warn(
                            "Cannot guarantee at-most-once processing with auto.offset.reset.policy other than 'latest' or 'none'."
                                    + " Some messages may be processed more than once.");
                }
            }
            LOG.info(
                    "Setting Kafka consumer property '{}' to 'false', because the spout does not support auto-commit",
                    ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG);
            setProp(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
            return this;
        }

        public KafkaSpoutConfig<K, V> build() {
            return new KafkaSpoutConfig<>(this);
        }
    }

    /**
     * Factory method that creates a Builder with String key/value deserializers.
     *
     * @param bootstrapServers The bootstrap servers for the consumer
     * @param topics The topics to subscribe to
     * @return The new builder
     */
    public static Builder<String, String> builder(String bootstrapServers, String... topics) {
        return new Builder<String, String>(bootstrapServers, topics).withStringDeserializers();
    }

    /**
     * Factory method that creates a Builder with String key/value deserializers.
     *
     * @param bootstrapServers The bootstrap servers for the consumer
     * @param topics The topics to subscribe to
     * @return The new builder
     */
    public static Builder<String, String> builder(String bootstrapServers, Set<String> topics) {
        return new Builder<String, String>(bootstrapServers, topics).withStringDeserializers();
    }

    /**
     * Factory method that creates a Builder with String key/value deserializers.
     *
     * @param bootstrapServers The bootstrap servers for the consumer
     * @param topics The topic pattern to subscribe to
     * @return The new builder
     */
    public static Builder<String, String> builder(String bootstrapServers, Pattern topics) {
        return new Builder<String, String>(bootstrapServers, topics).withStringDeserializers();
    }

    public long getOffsetsCommitPeriodMs() {
        return offsetCommitPeriodMs;
    }

    public ProcessingGuarantee getProcessingGuarantee() {
        return processingGuarantee;
    }

    public boolean isTupleTrackingEnforced() {
        return tupleTrackingEnforced;
    }

    public String getConsumerGroupId() {
        return (String) getKafkaProps().get(ConsumerConfig.GROUP_ID_CONFIG);
    }

    public int getMaxUncommittedOffsets() {
        return maxUncommittedOffsets;
    }

    public KafkaSpoutRetryService getRetryService() {
        return retryService;
    }

    public KafkaTupleListener getTupleListener() {
        return tupleListener;
    }

    public boolean isEmitNullTuples() {
        return emitNullTuples;
    }

    public int getMetricsTimeBucketSizeInSecs() {
        return metricsTimeBucketSizeInSecs;
    }

    @Override
    public String toString() {
        return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE)
                .append("offsetCommitPeriodMs", offsetCommitPeriodMs)
                .append("maxUncommittedOffsets", maxUncommittedOffsets).append("retryService", retryService)
                .append("tupleListener", tupleListener).append("processingGuarantee", processingGuarantee)
                .append("emitNullTuples", emitNullTuples).append("tupleTrackingEnforced", tupleTrackingEnforced)
                .append("metricsTimeBucketSizeInSecs", metricsTimeBucketSizeInSecs).toString();
    }
}
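
Example

Below is a minimal usage sketch (not taken from the Storm codebase) of how this class is typically wired into a topology. The broker address, topic, group id, component names, and tuning values are placeholders; KafkaSpout and TopologyBuilder come from the storm-kafka-client and storm-client modules.

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.storm.kafka.spout.KafkaSpout;
import org.apache.storm.kafka.spout.KafkaSpoutConfig;
import org.apache.storm.kafka.spout.KafkaSpoutConfig.ProcessingGuarantee;
import org.apache.storm.kafka.spout.KafkaSpoutRetryExponentialBackoff;
import org.apache.storm.kafka.spout.KafkaSpoutRetryExponentialBackoff.TimeInterval;
import org.apache.storm.topology.TopologyBuilder;

public class KafkaSpoutConfigExample {

    public static void main(String[] args) {
        // builder(...) returns a Builder<String, String> with String key/value deserializers.
        KafkaSpoutConfig<String, String> spoutConfig =
                KafkaSpoutConfig.builder("localhost:9092", "my-topic")
                        // Consumer group for the spout's underlying KafkaConsumer.
                        .setProp(ConsumerConfig.GROUP_ID_CONFIG, "my-spout-group")
                        // At-least-once: offsets are committed only after the tuples are acked.
                        .setProcessingGuarantee(ProcessingGuarantee.AT_LEAST_ONCE)
                        // Commit acked offsets every 10 seconds instead of the 30 second default.
                        .setOffsetCommitPeriodMs(10_000)
                        // Cap the number of offsets pending commit per partition.
                        .setMaxUncommittedOffsets(1_000_000)
                        // Retry failed tuples forever with exponential backoff, capped at 10 seconds.
                        .setRetry(new KafkaSpoutRetryExponentialBackoff(
                                TimeInterval.seconds(0), TimeInterval.milliSeconds(2),
                                Integer.MAX_VALUE, TimeInterval.seconds(10)))
                        .build();

        // Wire the spout into a topology; bolts and topology submission are omitted here.
        TopologyBuilder topology = new TopologyBuilder();
        topology.setSpout("kafka-spout", new KafkaSpout<>(spoutConfig), 1);
    }
}

Note that build() constructs the KafkaSpoutConfig through the constructor above, which calls setKafkaPropsForProcessingGuarantee(): setting enable.auto.commit on the builder is rejected with an IllegalStateException, and with AT_LEAST_ONCE the auto.offset.reset consumer property defaults to "earliest" unless set explicitly.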