org.apache.druid.indexing.common.task.CompactionTask.java Source code

Introduction

Here is the source code for org.apache.druid.indexing.common.task.CompactionTask.java. CompactionTask is Druid's native batch task that compacts the used segments of a dataSource, for either a given interval or an explicit list of segments, by generating and running IndexTask specs internally.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.common.task;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Lists;
import org.apache.druid.client.coordinator.CoordinatorClient;
import org.apache.druid.data.input.impl.DimensionSchema;
import org.apache.druid.data.input.impl.DimensionSchema.MultiValueHandling;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.DoubleDimensionSchema;
import org.apache.druid.data.input.impl.FloatDimensionSchema;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.data.input.impl.LongDimensionSchema;
import org.apache.druid.data.input.impl.NoopInputRowParser;
import org.apache.druid.data.input.impl.StringDimensionSchema;
import org.apache.druid.data.input.impl.TimeAndDimsParseSpec;
import org.apache.druid.indexer.TaskStatus;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexing.common.RetryPolicyFactory;
import org.apache.druid.indexing.common.SegmentLoaderFactory;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.actions.SegmentListUsedAction;
import org.apache.druid.indexing.common.actions.TaskActionClient;
import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory;
import org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig;
import org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec;
import org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig;
import org.apache.druid.indexing.firehose.IngestSegmentFirehoseFactory;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.Numbers;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.RE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.granularity.GranularityType;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.java.util.common.jackson.JacksonUtils;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.segment.DimensionHandler;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.column.ColumnHolder;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.granularity.GranularitySpec;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.apache.druid.segment.loading.SegmentLoadingException;
import org.apache.druid.segment.realtime.appenderator.AppenderatorsManager;
import org.apache.druid.segment.realtime.firehose.ChatHandlerProvider;
import org.apache.druid.server.coordinator.DataSourceCompactionConfig;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.TimelineLookup;
import org.apache.druid.timeline.TimelineObjectHolder;
import org.apache.druid.timeline.VersionedIntervalTimeline;
import org.apache.druid.timeline.partition.PartitionChunk;
import org.apache.druid.timeline.partition.PartitionHolder;
import org.joda.time.Interval;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.StreamSupport;

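/**
 * A task that compacts the used segments of a dataSource for a given interval, or an explicit list of segments,
 * into a new set of segments. The actual indexing work is delegated to internally generated {@link IndexTask}s.
 */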
public class CompactionTask extends AbstractBatchIndexTask {
    private static final Logger log = new Logger(CompactionTask.class);
    private static final String TYPE = "compact";

    private final Interval interval;
    private final List<DataSegment> segments;
    @Nullable
    private final DimensionsSpec dimensionsSpec;
    @Nullable
    private final AggregatorFactory[] metricsSpec;
    @Nullable
    private final Granularity segmentGranularity;
    @Nullable
    private final Long targetCompactionSizeBytes;
    @Nullable
    private final IndexTuningConfig tuningConfig;
    private final ObjectMapper jsonMapper;
    @JsonIgnore
    private final SegmentProvider segmentProvider;
    @JsonIgnore
    private final PartitionConfigurationManager partitionConfigurationManager;

    @JsonIgnore
    private final AuthorizerMapper authorizerMapper;

    @JsonIgnore
    private final ChatHandlerProvider chatHandlerProvider;

    @JsonIgnore
    private final RowIngestionMetersFactory rowIngestionMetersFactory;

    @JsonIgnore
    private final CoordinatorClient coordinatorClient;

    @JsonIgnore
    private final SegmentLoaderFactory segmentLoaderFactory;

    @JsonIgnore
    private final RetryPolicyFactory retryPolicyFactory;

    @JsonIgnore
    private final AppenderatorsManager appenderatorsManager;

    @JsonIgnore
    private final CurrentSubTaskHolder currentSubTaskHolder = new CurrentSubTaskHolder((taskObject, config) -> {
        final IndexTask indexTask = (IndexTask) taskObject;
        indexTask.stopGracefully(config);
    });

    @JsonIgnore
    private List<IndexTask> indexTaskSpecs;

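    // Either an interval or an explicit list of segments must be given, but not both; the constructor enforces this.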
    @JsonCreator
    public CompactionTask(@JsonProperty("id") final String id,
            @JsonProperty("resource") final TaskResource taskResource,
            @JsonProperty("dataSource") final String dataSource,
            @JsonProperty("interval") @Nullable final Interval interval,
            @JsonProperty("segments") @Nullable final List<DataSegment> segments,
            @JsonProperty("dimensions") @Nullable final DimensionsSpec dimensions,
            @JsonProperty("dimensionsSpec") @Nullable final DimensionsSpec dimensionsSpec,
            @JsonProperty("metricsSpec") @Nullable final AggregatorFactory[] metricsSpec,
            @JsonProperty("segmentGranularity") @Nullable final Granularity segmentGranularity,
            @JsonProperty("targetCompactionSizeBytes") @Nullable final Long targetCompactionSizeBytes,
            @JsonProperty("tuningConfig") @Nullable final IndexTuningConfig tuningConfig,
            @JsonProperty("context") @Nullable final Map<String, Object> context,
            @JacksonInject ObjectMapper jsonMapper, @JacksonInject AuthorizerMapper authorizerMapper,
            @JacksonInject ChatHandlerProvider chatHandlerProvider,
            @JacksonInject RowIngestionMetersFactory rowIngestionMetersFactory,
            @JacksonInject CoordinatorClient coordinatorClient,
            @JacksonInject SegmentLoaderFactory segmentLoaderFactory,
            @JacksonInject RetryPolicyFactory retryPolicyFactory,
            @JacksonInject AppenderatorsManager appenderatorsManager) {
        super(getOrMakeId(id, TYPE, dataSource), null, taskResource, dataSource, context);
        Preconditions.checkArgument(interval != null || segments != null,
                "interval or segments should be specified");
        Preconditions.checkArgument(interval == null || segments == null,
                "one of interval and segments should be null");

        if (interval != null && interval.toDurationMillis() == 0) {
            throw new IAE("Interval[%s] is empty, must specify a nonempty interval", interval);
        }

        this.interval = interval;
        this.segments = segments;
        this.dimensionsSpec = dimensionsSpec == null ? dimensions : dimensionsSpec;
        this.metricsSpec = metricsSpec;
        this.segmentGranularity = segmentGranularity;
        this.targetCompactionSizeBytes = targetCompactionSizeBytes;
        this.tuningConfig = tuningConfig;
        this.jsonMapper = jsonMapper;
        this.segmentProvider = segments == null ? new SegmentProvider(dataSource, interval)
                : new SegmentProvider(segments);
        this.partitionConfigurationManager = new PartitionConfigurationManager(targetCompactionSizeBytes,
                tuningConfig);
        this.authorizerMapper = authorizerMapper;
        this.chatHandlerProvider = chatHandlerProvider;
        this.rowIngestionMetersFactory = rowIngestionMetersFactory;
        this.coordinatorClient = coordinatorClient;
        this.segmentLoaderFactory = segmentLoaderFactory;
        this.retryPolicyFactory = retryPolicyFactory;
        this.appenderatorsManager = appenderatorsManager;
    }

    @JsonProperty
    public Interval getInterval() {
        return interval;
    }

    @JsonProperty
    public List<DataSegment> getSegments() {
        return segments;
    }

    @JsonProperty
    @Nullable
    public DimensionsSpec getDimensionsSpec() {
        return dimensionsSpec;
    }

    @JsonProperty
    @Nullable
    public AggregatorFactory[] getMetricsSpec() {
        return metricsSpec;
    }

    @JsonProperty
    @Nullable
    @Override
    public Granularity getSegmentGranularity() {
        return segmentGranularity;
    }

    @Nullable
    @JsonProperty
    public Long getTargetCompactionSizeBytes() {
        return targetCompactionSizeBytes;
    }

    @Nullable
    @JsonProperty
    public IndexTuningConfig getTuningConfig() {
        return tuningConfig;
    }

    @Override
    public String getType() {
        return TYPE;
    }

    @Override
    public int getPriority() {
        return getContextValue(Tasks.PRIORITY_KEY, Tasks.DEFAULT_MERGE_TASK_PRIORITY);
    }

    @Override
    public boolean isReady(TaskActionClient taskActionClient) throws Exception {
        final List<DataSegment> segments = segmentProvider.checkAndGetSegments(taskActionClient);
        return determineLockGranularityandTryLockWithSegments(taskActionClient, segments);
    }

    @Override
    public boolean requireLockExistingSegments() {
        return true;
    }

    @Override
    public List<DataSegment> findSegmentsToLock(TaskActionClient taskActionClient, List<Interval> intervals)
            throws IOException {
        return taskActionClient.submit(new SegmentListUsedAction(getDataSource(), null, intervals));
    }

    @Override
    public boolean isPerfectRollup() {
        return tuningConfig != null && tuningConfig.isForceGuaranteedRollup();
    }

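    // Generates one IndexTask per ingestion spec and runs them sequentially. The compaction task succeeds only if
    // every generated spec runs successfully; any unready or failed spec is counted and reported at the end.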
    @Override
    public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
        if (indexTaskSpecs == null) {
            final List<IndexIngestionSpec> ingestionSpecs = createIngestionSchema(toolbox, segmentProvider,
                    partitionConfigurationManager, dimensionsSpec, metricsSpec, segmentGranularity, jsonMapper,
                    coordinatorClient, segmentLoaderFactory, retryPolicyFactory);
            indexTaskSpecs = IntStream.range(0, ingestionSpecs.size())
                    .mapToObj(i -> new IndexTask(createIndexTaskSpecId(i), getGroupId(), getTaskResource(),
                            getDataSource(), ingestionSpecs.get(i), getContext(), authorizerMapper,
                            chatHandlerProvider, rowIngestionMetersFactory, appenderatorsManager))
                    .collect(Collectors.toList());
        }

        if (indexTaskSpecs.isEmpty()) {
            log.warn("Interval[%s] has no segments, nothing to do.", interval);
            return TaskStatus.failure(getId());
        } else {
            registerResourceCloserOnAbnormalExit(currentSubTaskHolder);
            final int totalNumSpecs = indexTaskSpecs.size();
            log.info("Generated [%d] compaction task specs", totalNumSpecs);

            int failCnt = 0;
            for (IndexTask eachSpec : indexTaskSpecs) {
                final String json = jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(eachSpec);
                if (!currentSubTaskHolder.setTask(eachSpec)) {
                    log.info("Task is asked to stop. Finish as failed.");
                    return TaskStatus.failure(getId());
                }
                try {
                    if (eachSpec.isReady(toolbox.getTaskActionClient())) {
                        log.info("Running indexSpec: " + json);
                        final TaskStatus eachResult = eachSpec.run(toolbox);
                        if (!eachResult.isSuccess()) {
                            failCnt++;
                            log.warn("Failed to run indexSpec: [%s].\nTrying the next indexSpec.", json);
                        }
                    } else {
                        failCnt++;
                        log.warn("indexSpec is not ready: [%s].\nTrying the next indexSpec.", json);
                    }
                } catch (Exception e) {
                    failCnt++;
                    log.warn(e, "Failed to run indexSpec: [%s].\nTrying the next indexSpec.", json);
                }
            }

            log.info("Run [%d] specs, [%d] succeeded, [%d] failed", totalNumSpecs, totalNumSpecs - failCnt,
                    failCnt);
            return failCnt == 0 ? TaskStatus.success(getId()) : TaskStatus.failure(getId());
        }
    }

    private String createIndexTaskSpecId(int i) {
        return StringUtils.format("%s_%d", getId(), i);
    }

    /**
     * Generate {@link IndexIngestionSpec}s from the input segments. When segmentGranularity is null, one spec is
     * created per original segment interval; otherwise a single spec covering the umbrella interval of all input
     * segments is created.
     *
     * @return an empty list if the input segments don't exist. Otherwise, the generated ingestionSpecs.
     */
    @VisibleForTesting
    static List<IndexIngestionSpec> createIngestionSchema(final TaskToolbox toolbox,
            final SegmentProvider segmentProvider,
            final PartitionConfigurationManager partitionConfigurationManager,
            @Nullable final DimensionsSpec dimensionsSpec, @Nullable final AggregatorFactory[] metricsSpec,
            @Nullable final Granularity segmentGranularity, final ObjectMapper jsonMapper,
            final CoordinatorClient coordinatorClient, final SegmentLoaderFactory segmentLoaderFactory,
            final RetryPolicyFactory retryPolicyFactory) throws IOException, SegmentLoadingException {
        Pair<Map<DataSegment, File>, List<TimelineObjectHolder<String, DataSegment>>> pair = prepareSegments(
                toolbox, segmentProvider);
        final Map<DataSegment, File> segmentFileMap = pair.lhs;
        final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = pair.rhs;

        if (timelineSegments.isEmpty()) {
            return Collections.emptyList();
        }

        // find metadata for interval
        // queryableIndexAndSegments is sorted by the interval of the dataSegment
        final List<Pair<QueryableIndex, DataSegment>> queryableIndexAndSegments = loadSegments(timelineSegments,
                segmentFileMap, toolbox.getIndexIO());

        final IndexTuningConfig compactionTuningConfig = partitionConfigurationManager
                .computeTuningConfig(queryableIndexAndSegments);

        if (segmentGranularity == null) {
            // original granularity
            final Map<Interval, List<Pair<QueryableIndex, DataSegment>>> intervalToSegments = new TreeMap<>(
                    Comparators.intervalsByStartThenEnd());
            //noinspection ConstantConditions
            queryableIndexAndSegments.forEach(
                    p -> intervalToSegments.computeIfAbsent(p.rhs.getInterval(), k -> new ArrayList<>()).add(p));

            final List<IndexIngestionSpec> specs = new ArrayList<>(intervalToSegments.size());
            for (Entry<Interval, List<Pair<QueryableIndex, DataSegment>>> entry : intervalToSegments.entrySet()) {
                final Interval interval = entry.getKey();
                final List<Pair<QueryableIndex, DataSegment>> segmentsToCompact = entry.getValue();
                final DataSchema dataSchema = createDataSchema(segmentProvider.dataSource, segmentsToCompact,
                        dimensionsSpec, metricsSpec,
                        GranularityType.fromPeriod(interval.toPeriod()).getDefaultGranularity(), jsonMapper);

                specs.add(new IndexIngestionSpec(dataSchema, createIoConfig(toolbox, dataSchema, interval,
                        coordinatorClient, segmentLoaderFactory, retryPolicyFactory), compactionTuningConfig));
            }

            return specs;
        } else {
            // given segment granularity
            final DataSchema dataSchema = createDataSchema(segmentProvider.dataSource, queryableIndexAndSegments,
                    dimensionsSpec, metricsSpec, segmentGranularity, jsonMapper);

            return Collections
                    .singletonList(new IndexIngestionSpec(
                            dataSchema, createIoConfig(toolbox, dataSchema, segmentProvider.interval,
                                    coordinatorClient, segmentLoaderFactory, retryPolicyFactory),
                            compactionTuningConfig));
        }
    }

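    // Builds an IndexIOConfig whose IngestSegmentFirehoseFactory re-reads the existing segments of the interval,
    // restricted to the dimension and metric names of the generated dataSchema.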
    private static IndexIOConfig createIoConfig(TaskToolbox toolbox, DataSchema dataSchema, Interval interval,
            CoordinatorClient coordinatorClient, SegmentLoaderFactory segmentLoaderFactory,
            RetryPolicyFactory retryPolicyFactory) {
        return new IndexIOConfig(new IngestSegmentFirehoseFactory(dataSchema.getDataSource(), interval, null, null, // no filter
                // set dimension and metric names to make sure that the generated dataSchema is used for the firehose
                dataSchema.getParser().getParseSpec().getDimensionsSpec().getDimensionNames(),
                Arrays.stream(dataSchema.getAggregators()).map(AggregatorFactory::getName)
                        .collect(Collectors.toList()),
                null, toolbox.getIndexIO(), coordinatorClient, segmentLoaderFactory, retryPolicyFactory), false);
    }

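    // Fetches the used segments via the task action client, downloads the segment files through the toolbox, and
    // returns them along with the versioned timeline entries for the compaction interval.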
    private static Pair<Map<DataSegment, File>, List<TimelineObjectHolder<String, DataSegment>>> prepareSegments(
            TaskToolbox toolbox, SegmentProvider segmentProvider) throws IOException, SegmentLoadingException {
        final List<DataSegment> usedSegments = segmentProvider.checkAndGetSegments(toolbox.getTaskActionClient());
        final Map<DataSegment, File> segmentFileMap = toolbox.fetchSegments(usedSegments);
        final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = VersionedIntervalTimeline
                .forSegments(usedSegments).lookup(segmentProvider.interval);
        return Pair.of(segmentFileMap, timelineSegments);
    }

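    // Builds the DataSchema for the compacted segments: rollup is enabled only if every input segment was rolled up,
    // the query granularity is NONE, and dimensions/metrics are taken from the given specs or derived from the input
    // segments when the specs are null.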
    private static DataSchema createDataSchema(String dataSource,
            List<Pair<QueryableIndex, DataSegment>> queryableIndexAndSegments,
            @Nullable DimensionsSpec dimensionsSpec, @Nullable AggregatorFactory[] metricsSpec,
            Granularity segmentGranularity, ObjectMapper jsonMapper) {
        // check index metadata
        for (Pair<QueryableIndex, DataSegment> pair : queryableIndexAndSegments) {
            final QueryableIndex index = pair.lhs;
            if (index.getMetadata() == null) {
                throw new RE("Index metadata doesn't exist for segment[%s]", pair.rhs.getId());
            }
        }

        // find granularity spec
        // set rollup only if rollup is set for all segments
        final boolean rollup = queryableIndexAndSegments.stream().allMatch(pair -> {
            // We have already checked getMetadata() doesn't return null
            final Boolean isRollup = pair.lhs.getMetadata().isRollup();
            return isRollup != null && isRollup;
        });

        final Interval totalInterval = JodaUtils.umbrellaInterval(
                queryableIndexAndSegments.stream().map(p -> p.rhs.getInterval()).collect(Collectors.toList()));

        final GranularitySpec granularitySpec = new UniformGranularitySpec(
                Preconditions.checkNotNull(segmentGranularity), Granularities.NONE, rollup,
                Collections.singletonList(totalInterval));

        // find unique dimensions
        final DimensionsSpec finalDimensionsSpec = dimensionsSpec == null
                ? createDimensionsSpec(queryableIndexAndSegments)
                : dimensionsSpec;
        final AggregatorFactory[] finalMetricsSpec = metricsSpec == null
                ? createMetricsSpec(queryableIndexAndSegments)
                : convertToCombiningFactories(metricsSpec);
        final InputRowParser parser = new NoopInputRowParser(new TimeAndDimsParseSpec(null, finalDimensionsSpec));

        return new DataSchema(dataSource,
                jsonMapper.convertValue(parser, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT), finalMetricsSpec,
                granularitySpec, null, jsonMapper);
    }

    private static AggregatorFactory[] createMetricsSpec(
            List<Pair<QueryableIndex, DataSegment>> queryableIndexAndSegments) {
        final List<AggregatorFactory[]> aggregatorFactories = queryableIndexAndSegments.stream()
                .map(pair -> pair.lhs.getMetadata().getAggregators()) // We have already done null check on index.getMetadata()
                .collect(Collectors.toList());
        final AggregatorFactory[] mergedAggregators = AggregatorFactory.mergeAggregators(aggregatorFactories);

        if (mergedAggregators == null) {
            throw new ISE("Failed to merge aggregators[%s]", aggregatorFactories);
        }
        return mergedAggregators;
    }

    private static AggregatorFactory[] convertToCombiningFactories(AggregatorFactory[] metricsSpec) {
        return Arrays.stream(metricsSpec).map(AggregatorFactory::getCombiningFactory)
                .toArray(AggregatorFactory[]::new);
    }

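    // Derives a DimensionsSpec by collecting the unique dimensions of all input segments, giving precedence to the
    // dimension order of the most recent segments.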
    private static DimensionsSpec createDimensionsSpec(List<Pair<QueryableIndex, DataSegment>> queryableIndices) {
        final BiMap<String, Integer> uniqueDims = HashBiMap.create();
        final Map<String, DimensionSchema> dimensionSchemaMap = new HashMap<>();

        // Here, we try to retain the order of dimensions as they were specified, since the order of dimensions may be
        // optimized for performance.
        // Dimensions are extracted from the most recent segments to the oldest because recent segments are likely to
        // be queried more frequently, so performance should be optimized for recent segments rather than old ones.

        // timelineSegments are sorted in order of interval, but we do a sanity check here.
        final Comparator<Interval> intervalComparator = Comparators.intervalsByStartThenEnd();
        for (int i = 0; i < queryableIndices.size() - 1; i++) {
            final Interval shouldBeSmaller = queryableIndices.get(i).lhs.getDataInterval();
            final Interval shouldBeLarger = queryableIndices.get(i + 1).lhs.getDataInterval();
            Preconditions.checkState(intervalComparator.compare(shouldBeSmaller, shouldBeLarger) <= 0,
                    "QueryableIndexes are not sorted! Interval[%s] of segment[%s] is laster than interval[%s] of segment[%s]",
                    shouldBeSmaller, queryableIndices.get(i).rhs.getId(), shouldBeLarger,
                    queryableIndices.get(i + 1).rhs.getId());
        }

        int index = 0;
        for (Pair<QueryableIndex, DataSegment> pair : Lists.reverse(queryableIndices)) {
            final QueryableIndex queryableIndex = pair.lhs;
            final Map<String, DimensionHandler> dimensionHandlerMap = queryableIndex.getDimensionHandlers();

            for (String dimension : queryableIndex.getAvailableDimensions()) {
                final ColumnHolder columnHolder = Preconditions.checkNotNull(
                        queryableIndex.getColumnHolder(dimension), "Cannot find column for dimension[%s]",
                        dimension);

                if (!uniqueDims.containsKey(dimension)) {
                    final DimensionHandler dimensionHandler = Preconditions.checkNotNull(
                            dimensionHandlerMap.get(dimension), "Cannot find dimensionHandler for dimension[%s]",
                            dimension);

                    uniqueDims.put(dimension, index++);
                    dimensionSchemaMap.put(dimension,
                            createDimensionSchema(columnHolder.getCapabilities().getType(), dimension,
                                    dimensionHandler.getMultivalueHandling(),
                                    columnHolder.getCapabilities().hasBitmapIndexes()));
                }
            }
        }

        final BiMap<Integer, String> orderedDims = uniqueDims.inverse();
        final List<DimensionSchema> dimensionSchemas = IntStream.range(0, orderedDims.size()).mapToObj(i -> {
            final String dimName = orderedDims.get(i);
            return Preconditions.checkNotNull(dimensionSchemaMap.get(dimName),
                    "Cannot find dimension[%s] from dimensionSchemaMap", dimName);
        }).collect(Collectors.toList());

        return new DimensionsSpec(dimensionSchemas, null, null);
    }

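    // Loads a QueryableIndex for every segment in the timeline, pairing it with its DataSegment metadata.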
    private static List<Pair<QueryableIndex, DataSegment>> loadSegments(
            List<TimelineObjectHolder<String, DataSegment>> timelineObjectHolders,
            Map<DataSegment, File> segmentFileMap, IndexIO indexIO) throws IOException {
        final List<Pair<QueryableIndex, DataSegment>> segments = new ArrayList<>();

        for (TimelineObjectHolder<String, DataSegment> timelineObjectHolder : timelineObjectHolders) {
            final PartitionHolder<DataSegment> partitionHolder = timelineObjectHolder.getObject();
            for (PartitionChunk<DataSegment> chunk : partitionHolder) {
                final DataSegment segment = chunk.getObject();
                final QueryableIndex queryableIndex = indexIO.loadIndex(Preconditions
                        .checkNotNull(segmentFileMap.get(segment), "File for segment %s", segment.getId()));
                segments.add(Pair.of(queryableIndex, segment));
            }
        }

        return segments;
    }

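    // Maps a column's ValueType to the corresponding DimensionSchema; multi-value handling is only supported for
    // string dimensions.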
    private static DimensionSchema createDimensionSchema(ValueType type, String name,
            MultiValueHandling multiValueHandling, boolean hasBitmapIndexes) {
        switch (type) {
        case FLOAT:
            Preconditions.checkArgument(multiValueHandling == null,
                    "multi-value dimension [%s] is not supported for float type yet", name);
            return new FloatDimensionSchema(name);
        case LONG:
            Preconditions.checkArgument(multiValueHandling == null,
                    "multi-value dimension [%s] is not supported for long type yet", name);
            return new LongDimensionSchema(name);
        case DOUBLE:
            Preconditions.checkArgument(multiValueHandling == null,
                    "multi-value dimension [%s] is not supported for double type yet", name);
            return new DoubleDimensionSchema(name);
        case STRING:
            return new StringDimensionSchema(name, multiValueHandling, hasBitmapIndexes);
        default:
            throw new ISE("Unsupported value type[%s] for dimension[%s]", type, name);
        }
    }

    @VisibleForTesting
    static class SegmentProvider {
        private final String dataSource;
        private final Interval interval;
        @Nullable
        private final List<DataSegment> segments;

        SegmentProvider(String dataSource, Interval interval) {
            this.dataSource = Preconditions.checkNotNull(dataSource);
            this.interval = Preconditions.checkNotNull(interval);
            this.segments = null;
        }

        SegmentProvider(List<DataSegment> segments) {
            Preconditions.checkArgument(segments != null && !segments.isEmpty());
            final String dataSource = segments.get(0).getDataSource();
            Preconditions.checkArgument(
                    segments.stream().allMatch(segment -> segment.getDataSource().equals(dataSource)),
                    "segments should have the same dataSource");
            this.dataSource = dataSource;
            this.segments = segments;
            this.interval = JodaUtils
                    .umbrellaInterval(segments.stream().map(DataSegment::getInterval).collect(Collectors.toList()));
        }

        @Nullable
        List<DataSegment> getSegments() {
            return segments;
        }

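        // Returns the segments currently used for the interval (latest versions only). If explicit segments were
        // specified in the spec, verifies that they exactly match the currently used segments and throws otherwise.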
        List<DataSegment> checkAndGetSegments(TaskActionClient actionClient) throws IOException {
            final List<DataSegment> usedSegments = actionClient
                    .submit(new SegmentListUsedAction(dataSource, interval, null));
            final TimelineLookup<String, DataSegment> timeline = VersionedIntervalTimeline
                    .forSegments(usedSegments);
            final List<DataSegment> latestSegments = timeline.lookup(interval).stream()
                    .map(TimelineObjectHolder::getObject)
                    .flatMap(partitionHolder -> StreamSupport.stream(partitionHolder.spliterator(), false))
                    .map(PartitionChunk::getObject).collect(Collectors.toList());

            if (segments != null) {
                Collections.sort(latestSegments);
                Collections.sort(segments);

                if (!latestSegments.equals(segments)) {
                    final List<DataSegment> unknownSegments = segments.stream()
                            .filter(segment -> !latestSegments.contains(segment)).collect(Collectors.toList());
                    final List<DataSegment> missingSegments = latestSegments.stream()
                            .filter(segment -> !segments.contains(segment)).collect(Collectors.toList());
                    throw new ISE(
                            "Specified segments in the spec are different from the current used segments. "
                                    + "There are unknown segments[%s] and missing segments[%s] in the spec.",
                            unknownSegments, missingSegments);
                }
            }
            return latestSegments;
        }
    }

    @VisibleForTesting
    static class PartitionConfigurationManager {
        @Nullable
        private final Long targetCompactionSizeBytes;
        @Nullable
        private final IndexTuningConfig tuningConfig;

        PartitionConfigurationManager(@Nullable Long targetCompactionSizeBytes,
                @Nullable IndexTuningConfig tuningConfig) {
            this.targetCompactionSizeBytes = getValidTargetCompactionSizeBytes(targetCompactionSizeBytes,
                    tuningConfig);
            this.tuningConfig = tuningConfig;
        }

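        // If no partitionsSpec is configured, estimates maxRowsPerSegment from the average row size of the input
        // segments and targetCompactionSizeBytes, and returns a tuning config carrying that estimate.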
        @Nullable
        IndexTuningConfig computeTuningConfig(List<Pair<QueryableIndex, DataSegment>> queryableIndexAndSegments) {
            if (!hasPartitionConfig(tuningConfig)) {
                final long nonNullTargetCompactionSizeBytes = Preconditions.checkNotNull(targetCompactionSizeBytes,
                        "targetCompactionSizeBytes");
                // Find IndexTuningConfig.maxRowsPerSegment which is the number of rows per segment.
                // Assume that the segment size is proportional to the number of rows. We can improve this later.
                final long totalNumRows = queryableIndexAndSegments.stream()
                        .mapToLong(queryableIndexAndDataSegment -> queryableIndexAndDataSegment.lhs.getNumRows())
                        .sum();
                final long totalSizeBytes = queryableIndexAndSegments.stream()
                        .mapToLong(queryableIndexAndDataSegment -> queryableIndexAndDataSegment.rhs.getSize())
                        .sum();

                if (totalSizeBytes == 0L) {
                    throw new ISE("Total input segment size is 0 byte");
                }

                final double avgRowsPerByte = totalNumRows / (double) totalSizeBytes;
                final long maxRowsPerSegmentLong = Math.round(avgRowsPerByte * nonNullTargetCompactionSizeBytes);
                final int maxRowsPerSegment = Numbers.toIntExact(maxRowsPerSegmentLong,
                        StringUtils.format(
                                "Estimated maxRowsPerSegment[%s] is out of integer value range. "
                                        + "Please consider reducing targetCompactionSizeBytes[%s].",
                                maxRowsPerSegmentLong, targetCompactionSizeBytes));
                Preconditions.checkState(maxRowsPerSegment > 0, "Non-positive maxRowsPerSegment[%s]",
                        maxRowsPerSegment);

                log.info("Estimated maxRowsPerSegment[%d] = avgRowsPerByte[%f] * targetCompactionSizeBytes[%d]",
                        maxRowsPerSegment, avgRowsPerByte, nonNullTargetCompactionSizeBytes);
                // Setting maxTotalRows to Long.MAX_VALUE to respect the computed maxRowsPerSegment.
                // If this is set to something too small, compactionTask can generate small segments
                // which need to be compacted again, which in turn can make auto compaction get stuck in the same interval.
                final IndexTuningConfig newTuningConfig = tuningConfig == null ? IndexTuningConfig.createDefault()
                        : tuningConfig;
                if (newTuningConfig.isForceGuaranteedRollup()) {
                    return newTuningConfig
                            .withPartitionsSpec(new HashedPartitionsSpec(maxRowsPerSegment, null, null));
                } else {
                    return newTuningConfig
                            .withPartitionsSpec(new DynamicPartitionsSpec(maxRowsPerSegment, Long.MAX_VALUE));
                }
            } else {
                return tuningConfig;
            }
        }

        /**
         * Check the validity of {@link #targetCompactionSizeBytes} and return a valid value. Note that
         * targetCompactionSizeBytes cannot be used together with {@link IndexTuningConfig#getPartitionsSpec}.
         * {@link #hasPartitionConfig} checks whether one of those configs is set.
         * <p>
         * This throws an {@link IllegalArgumentException} if targetCompactionSizeBytes is set and hasPartitionConfig
         * returns true. If targetCompactionSizeBytes is not set, this returns null or
         * {@link DataSourceCompactionConfig#DEFAULT_TARGET_COMPACTION_SIZE_BYTES} according to the result of
         * hasPartitionConfig.
         */
        @Nullable
        private static Long getValidTargetCompactionSizeBytes(@Nullable Long targetCompactionSizeBytes,
                @Nullable IndexTuningConfig tuningConfig) {
            if (targetCompactionSizeBytes != null && tuningConfig != null) {
                Preconditions.checkArgument(!hasPartitionConfig(tuningConfig),
                        "targetCompactionSizeBytes[%s] cannot be used with partitionsSpec[%s]",
                        targetCompactionSizeBytes, tuningConfig.getPartitionsSpec());
                return targetCompactionSizeBytes;
            } else {
                return hasPartitionConfig(tuningConfig) ? null
                        : DataSourceCompactionConfig.DEFAULT_TARGET_COMPACTION_SIZE_BYTES;
            }
        }

        private static boolean hasPartitionConfig(@Nullable IndexTuningConfig tuningConfig) {
            if (tuningConfig != null) {
                return tuningConfig.getPartitionsSpec() != null;
            } else {
                return false;
            }
        }
    }

    public static class Builder {
        private final String dataSource;
        private final ObjectMapper jsonMapper;
        private final AuthorizerMapper authorizerMapper;
        private final ChatHandlerProvider chatHandlerProvider;
        private final RowIngestionMetersFactory rowIngestionMetersFactory;
        private final CoordinatorClient coordinatorClient;
        private final SegmentLoaderFactory segmentLoaderFactory;
        private final RetryPolicyFactory retryPolicyFactory;
        private final AppenderatorsManager appenderatorsManager;

        @Nullable
        private Interval interval;
        @Nullable
        private List<DataSegment> segments;
        @Nullable
        private DimensionsSpec dimensionsSpec;
        @Nullable
        private AggregatorFactory[] metricsSpec;
        @Nullable
        private Granularity segmentGranularity;
        @Nullable
        private Long targetCompactionSizeBytes;
        @Nullable
        private IndexTuningConfig tuningConfig;
        @Nullable
        private Map<String, Object> context;

        public Builder(String dataSource, ObjectMapper jsonMapper, AuthorizerMapper authorizerMapper,
                ChatHandlerProvider chatHandlerProvider, RowIngestionMetersFactory rowIngestionMetersFactory,
                CoordinatorClient coordinatorClient, SegmentLoaderFactory segmentLoaderFactory,
                RetryPolicyFactory retryPolicyFactory, AppenderatorsManager appenderatorsManager) {
            this.dataSource = dataSource;
            this.jsonMapper = jsonMapper;
            this.authorizerMapper = authorizerMapper;
            this.chatHandlerProvider = chatHandlerProvider;
            this.rowIngestionMetersFactory = rowIngestionMetersFactory;
            this.coordinatorClient = coordinatorClient;
            this.segmentLoaderFactory = segmentLoaderFactory;
            this.retryPolicyFactory = retryPolicyFactory;
            this.appenderatorsManager = appenderatorsManager;
        }

        public Builder interval(Interval interval) {
            this.interval = interval;
            return this;
        }

        public Builder segments(List<DataSegment> segments) {
            this.segments = segments;
            return this;
        }

        public Builder dimensionsSpec(DimensionsSpec dimensionsSpec) {
            this.dimensionsSpec = dimensionsSpec;
            return this;
        }

        public Builder metricsSpec(AggregatorFactory[] metricsSpec) {
            this.metricsSpec = metricsSpec;
            return this;
        }

        public Builder segmentGranularity(Granularity segmentGranularity) {
            this.segmentGranularity = segmentGranularity;
            return this;
        }

        public Builder targetCompactionSizeBytes(long targetCompactionSizeBytes) {
            this.targetCompactionSizeBytes = targetCompactionSizeBytes;
            return this;
        }

        public Builder tuningConfig(IndexTuningConfig tuningConfig) {
            this.tuningConfig = tuningConfig;
            return this;
        }

        public Builder context(Map<String, Object> context) {
            this.context = context;
            return this;
        }

        public CompactionTask build() {
            return new CompactionTask(null, null, dataSource, interval, segments, null, dimensionsSpec, metricsSpec,
                    segmentGranularity, targetCompactionSizeBytes, tuningConfig, context, jsonMapper,
                    authorizerMapper, chatHandlerProvider, rowIngestionMetersFactory, coordinatorClient,
                    segmentLoaderFactory, retryPolicyFactory, appenderatorsManager);
        }
    }
}
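
Example usage

A minimal sketch of how a CompactionTask might be constructed with the Builder defined above. The injected
dependencies (jsonMapper, authorizerMapper, and so on) are normally supplied by Druid's injection framework or a
test harness; the variable names, dataSource, interval, and target size below are illustrative assumptions, not
values taken from this file.

// Hypothetical sketch: the injected dependencies are assumed to be obtained from Guice or a test fixture.
// Intervals is org.apache.druid.java.util.common.Intervals.
CompactionTask task = new CompactionTask.Builder(
        "wikipedia",                  // dataSource to compact (illustrative)
        jsonMapper,                   // ObjectMapper
        authorizerMapper,             // AuthorizerMapper
        chatHandlerProvider,          // ChatHandlerProvider
        rowIngestionMetersFactory,    // RowIngestionMetersFactory
        coordinatorClient,            // CoordinatorClient
        segmentLoaderFactory,         // SegmentLoaderFactory
        retryPolicyFactory,           // RetryPolicyFactory
        appenderatorsManager          // AppenderatorsManager
)
        .interval(Intervals.of("2017-01-01/2017-02-01"))  // compact all used segments in this interval
        .targetCompactionSizeBytes(400_000_000L)          // aim for roughly 400 MB per compacted segment
        .build();

The same task can also be submitted as JSON with "type": "compact", which Jackson maps onto the @JsonCreator
constructor above; interval and segments are mutually exclusive there as well.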