com.uber.hoodie.common.table.view.IncrementalTimelineSyncFileSystemView.java Source code


Introduction

Here is the source code for com.uber.hoodie.common.table.view.IncrementalTimelineSyncFileSystemView.java. The class extends AbstractTableFileSystemView and adds the ability to incrementally sync a table's file-system view as new timeline instants complete, reverting to a full sync whenever an incremental apply is not possible.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.table.view;

import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.avro.model.HoodieRestoreMetadata;
import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieFileGroup;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.CompactionUtils;
import com.uber.hoodie.common.util.TimelineDiffHelper;
import com.uber.hoodie.common.util.TimelineDiffHelper.TimelineDiffResult;
import com.uber.hoodie.common.util.collection.Pair;
import com.uber.hoodie.exception.HoodieException;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * Adds the capability to incrementally sync the changes to the file-system view as and when new instants get completed.
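 * <p>
 * On each sync, the diff between the previously seen timeline and the new one is computed first; when the
 * diff can be applied safely, only the newly seen instants (commits, delta-commits, compactions, cleans,
 * rollbacks and restores) are applied to the cached view, falling back to a full sync otherwise.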
 */
public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTableFileSystemView {

    private static final Logger log = LogManager.getLogger(IncrementalTimelineSyncFileSystemView.class);

    // Allows incremental Timeline syncing
    private final boolean incrementalTimelineSyncEnabled;

    // This is the visible active timeline used only for incremental view syncing
    private HoodieTimeline visibleActiveTimeline;

    protected IncrementalTimelineSyncFileSystemView(boolean enableIncrementalTimelineSync) {
        this.incrementalTimelineSyncEnabled = enableIncrementalTimelineSync;
    }

    @Override
    protected void refreshTimeline(HoodieTimeline visibleActiveTimeline) {
        this.visibleActiveTimeline = visibleActiveTimeline;
        super.refreshTimeline(visibleActiveTimeline);
    }

    @Override
    protected void runSync(HoodieTimeline oldTimeline, HoodieTimeline newTimeline) {
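        // Attempt the incremental path first; on any failure, or when disabled, fall through to a full sync.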
        try {
            if (incrementalTimelineSyncEnabled) {
                TimelineDiffResult diffResult = TimelineDiffHelper.getNewInstantsForIncrementalSync(oldTimeline,
                        newTimeline);
                if (diffResult.canSyncIncrementally()) {
                    log.info("Doing incremental sync");
                    runIncrementalSync(newTimeline, diffResult);
                    log.info("Finished incremental sync");
                    // Reset timeline to latest
                    refreshTimeline(newTimeline);
                    return;
                }
            }
        } catch (Exception e) {
            log.error("Got exception trying to perform incremental sync. Reverting to complete sync", e);
        }

        log.warn("Incremental sync of timeline is turned off or deemed unsafe. Will revert to full syncing");
        super.runSync(oldTimeline, newTimeline);
    }

    /**
     * Run incremental sync based on the diff result produced.
     *
     * @param timeline New Timeline
     * @param diffResult Timeline Diff Result
     */
    private void runIncrementalSync(HoodieTimeline timeline, TimelineDiffResult diffResult) {

        log.info("Timeline diff result: " + diffResult);

        // First remove pending compaction instants which were completed
        diffResult.getFinishedCompactionInstants().forEach(instant -> {
            try {
                removePendingCompactionInstant(timeline, instant);
            } catch (IOException e) {
                throw new HoodieException(e);
            }
        });

        // Add new completed instants found in the latest timeline
        diffResult.getNewlySeenInstants().stream().filter(
                instant -> instant.isCompleted() || instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION))
                .forEach(instant -> {
                    try {
                        if (instant.getAction().equals(HoodieTimeline.COMMIT_ACTION)
                                || instant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) {
                            addCommitInstant(timeline, instant);
                        } else if (instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)) {
                            addRestoreInstant(timeline, instant);
                        } else if (instant.getAction().equals(HoodieTimeline.CLEAN_ACTION)) {
                            addCleanInstant(timeline, instant);
                        } else if (instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
                            addPendingCompactionInstant(timeline, instant);
                        } else if (instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) {
                            addRollbackInstant(timeline, instant);
                        }
                    } catch (IOException ioe) {
                        throw new HoodieException(ioe);
                    }
                });
    }

    /**
     * Remove pending compaction instant.
     *
     * @param timeline New Hoodie Timeline
     * @param instant Compaction Instant to be removed
     */
    private void removePendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
        log.info("Removing completed compaction instant (" + instant + ")");
        HoodieCompactionPlan plan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
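        // The compaction has completed, so its operations are no longer pending;
        // remove them from the view's pending-compaction index.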
        removePendingCompactionOperations(CompactionUtils.getPendingCompactionOperations(instant, plan)
                .map(instantPair -> Pair.of(instantPair.getValue().getKey(),
                        CompactionOperation.convertFromAvroRecordInstance(instantPair.getValue().getValue()))));
    }

    /**
     * Add newly found pending compaction instant.
     *
     * @param timeline Hoodie Timeline
     * @param instant Compaction Instant
     */
    private void addPendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
        log.info("Syncing pending compaction instant (" + instant + ")");
        HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
        List<Pair<String, CompactionOperation>> pendingOps = CompactionUtils
                .getPendingCompactionOperations(instant, compactionPlan)
                .map(p -> Pair.of(p.getValue().getKey(),
                        CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue())))
                .collect(Collectors.toList());
        // First, update Pending compaction instants
        addPendingCompactionOperations(pendingOps.stream());

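        // Materialize each pending operation as a file-group carrying an empty file-slice at the
        // compaction instant, grouped by partition so loaded partition views can be patched in place.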
        Map<String, List<Pair<String, HoodieFileGroup>>> partitionToFileGroups = pendingOps.stream().map(opPair -> {
            String compactionInstantTime = opPair.getKey();
            HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline);
            fileGroup.addNewFileSliceAtInstant(compactionInstantTime);
            return Pair.of(compactionInstantTime, fileGroup);
        }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath()));
        partitionToFileGroups.entrySet().forEach(entry -> {
            if (isPartitionAvailableInStore(entry.getKey())) {
                applyDeltaFileSlicesToPartitionView(entry.getKey(),
                        entry.getValue().stream().map(Pair::getValue).collect(Collectors.toList()),
                        DeltaApplyMode.ADD);
            }
        });
    }

    /**
     * Add newly found commit/delta-commit instant.
     *
     * @param timeline Hoodie Timeline
     * @param instant Instant
     */
    private void addCommitInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
        log.info("Syncing committed instant (" + instant + ")");
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
                .fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
        commitMetadata.getPartitionToWriteStats().entrySet().forEach(entry -> {
            String partition = entry.getKey();
            if (isPartitionAvailableInStore(partition)) {
                log.info("Syncing partition (" + partition + ") of instant (" + instant + ")");
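                // Reconstruct skeletal FileStatus entries from the commit's write-stats,
                // avoiding a fresh file-system listing.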
                FileStatus[] statuses = entry.getValue().stream().map(p -> {
                    FileStatus status = new FileStatus(p.getFileSizeInBytes(), false, 0, 0, 0, 0, null, null, null,
                            new Path(String.format("%s/%s", metaClient.getBasePath(), p.getPath())));
                    return status;
                }).toArray(FileStatus[]::new);
                List<HoodieFileGroup> fileGroups = buildFileGroups(statuses,
                        timeline.filterCompletedAndCompactionInstants(), false);
                applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.ADD);
            } else {
                log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant
                        + ") as it is not loaded");
            }
        });
        log.info("Done Syncing committed instant (" + instant + ")");
    }

    /**
     * Add newly found restore instant.
     *
     * @param timeline Hoodie Timeline
     * @param instant Restore Instant
     */
    private void addRestoreInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
        log.info("Syncing restore instant (" + instant + ")");
        HoodieRestoreMetadata metadata = AvroUtils
                .deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRestoreMetadata.class);

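        // Collect, per partition, the files that the restore successfully deleted.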
        Map<String, List<Pair<String, String>>> partitionFiles = metadata.getHoodieRestoreMetadata().entrySet()
                .stream().flatMap(entry -> {
                    return entry.getValue().stream()
                            .flatMap(e -> e.getPartitionMetadata().entrySet().stream().flatMap(e2 -> {
                                return e2.getValue().getSuccessDeleteFiles().stream()
                                        .map(x -> Pair.of(e2.getKey(), x));
                            }));
                }).collect(Collectors.groupingBy(Pair::getKey));
        partitionFiles.entrySet().forEach(e -> {
            removeFileSlicesForPartition(timeline, instant, e.getKey(),
                    e.getValue().stream().map(Pair::getValue).collect(Collectors.toList()));
        });
        log.info("Done Syncing restore instant (" + instant + ")");
    }

    /**
     * Add newly found rollback instant.
     *
     * @param timeline Hoodie Timeline
     * @param instant Rollback Instant
     */
    private void addRollbackInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
        log.info("Syncing rollback instant (" + instant + ")");
        HoodieRollbackMetadata metadata = AvroUtils
                .deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);

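        // Drop the file slices whose files the rollback successfully deleted, partition by partition.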
        metadata.getPartitionMetadata().entrySet().forEach(e -> {
            removeFileSlicesForPartition(timeline, instant, e.getKey(), e.getValue().getSuccessDeleteFiles());
        });
        log.info("Done Syncing rollback instant (" + instant + ")");
    }

    /**
     * Add newly found clean instant.
     *
     * @param timeline Timeline
     * @param instant Clean instant
     */
    private void addCleanInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
        log.info("Syncing cleaner instant (" + instant + ")");
        HoodieCleanMetadata cleanMetadata = AvroUtils
                .deserializeHoodieCleanMetadata(timeline.getInstantDetails(instant).get());
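        // Drop the file slices whose files the cleaner successfully deleted, partition by partition.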
        cleanMetadata.getPartitionMetadata().entrySet().forEach(entry -> {
            removeFileSlicesForPartition(timeline, instant, entry.getKey(),
                    entry.getValue().getSuccessDeleteFiles());
        });
        log.info("Done Syncing cleaner instant (" + instant + ")");
    }

    private void removeFileSlicesForPartition(HoodieTimeline timeline, HoodieInstant instant, String partition,
            List<String> paths) {
        if (isPartitionAvailableInStore(partition)) {
            log.info("Removing file slices for partition (" + partition + ") for instant (" + instant + ")");
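            // Only the path matters here; build skeletal FileStatus objects that identify
            // the file-slices to drop from the partition view.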
            FileStatus[] statuses = paths.stream().map(p -> {
                FileStatus status = new FileStatus();
                status.setPath(new Path(p));
                return status;
            }).toArray(FileStatus[]::new);
            List<HoodieFileGroup> fileGroups = buildFileGroups(statuses,
                    timeline.filterCompletedAndCompactionInstants(), false);
            applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.REMOVE);
        } else {
            log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant
                    + ") as it is not loaded");
        }
    }

    /**
     * Apply mode: whether to add or remove the delta from the partition view.
     */
    enum DeltaApplyMode {
        ADD, REMOVE
    }

    /**
     * Apply changes to partition file-system view. The base implementation overwrites the entire
     * partition's view, assuming some sort of map (in-memory or disk-based) is used. Override this
     * method for view implementations that support fine-grained updates (e.g. RocksDB).
     *
     * @param partition PartitionPath
     * @param deltaFileGroups Changed file-slices aggregated as file-groups
     * @param mode Delta Apply mode
     */
    protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
            DeltaApplyMode mode) {
        if (deltaFileGroups.isEmpty()) {
            log.info("No delta file-groups for partition: " + partition);
            return;
        }

        List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
        /*
         * Note that while finding the new data/log files added/removed, the path stored in metadata will
         * be missing the base-path, scheme and authority. Ensure the matching process takes care of this
         * discrepancy.
         */
        Map<String, HoodieDataFile> viewDataFiles = fileGroups.stream()
                .flatMap(HoodieFileGroup::getAllRawFileSlices).map(FileSlice::getDataFile)
                .filter(Optional::isPresent).map(Optional::get)
                .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
        // Note: delta log files and data files can be empty when adding/removing pending compactions.
        Map<String, HoodieDataFile> deltaDataFiles = deltaFileGroups.stream()
                .flatMap(HoodieFileGroup::getAllRawFileSlices).map(FileSlice::getDataFile)
                .filter(Optional::isPresent).map(Optional::get)
                .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

        Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
                .flatMap(FileSlice::getLogFiles)
                .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
        Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream()
                .flatMap(HoodieFileGroup::getAllRawFileSlices).flatMap(FileSlice::getLogFiles)
                .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
                .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

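        // Overlay (ADD) or subtract (REMOVE) the delta files, then rebuild the partition's file-groups.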
        switch (mode) {
        case ADD:
            viewDataFiles.putAll(deltaDataFiles);
            viewLogFiles.putAll(deltaLogFiles);
            break;
        case REMOVE:
            deltaDataFiles.keySet().forEach(viewDataFiles::remove);
            deltaLogFiles.keySet().forEach(viewLogFiles::remove);
            break;
        default:
            throw new IllegalStateException("Unknown diff apply mode=" + mode);
        }

        HoodieTimeline timeline = deltaFileGroups.stream().map(HoodieFileGroup::getTimeline).findAny().get();
        List<HoodieFileGroup> fgs = buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(),
                timeline, true);
        storePartitionView(partition, fgs);
    }

    @Override
    public HoodieTimeline getTimeline() {
        return visibleActiveTimeline;
    }
}
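
Usage

The sketch below shows how a concrete view built on this class might be constructed and queried. It is illustrative only: HoodieTableFileSystemView (taken here to be the in-memory subclass from the same package), the HoodieTableMetaClient constructor, and getLatestDataFiles are assumptions about this codebase's API and may differ between versions.

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import org.apache.hadoop.conf.Configuration;

public class FileSystemViewExample {

    public static void main(String[] args) {
        // Assumed constructor: a meta-client pointing at the table's base path.
        HoodieTableMetaClient metaClient =
                new HoodieTableMetaClient(new Configuration(), "/tmp/hoodie/sample-table");

        // Restrict the view to completed commit/delta-commit instants.
        HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
                .filterCompletedInstants();

        // As the timeline moves forward, the view's sync path applies only the newly
        // completed instants to the cached state instead of rebuilding it from scratch.
        HoodieTableFileSystemView view = new HoodieTableFileSystemView(metaClient, timeline);

        // Assumed accessor: stream the latest data file of each file-group in a partition.
        view.getLatestDataFiles("2018/06/01").forEach(df -> System.out.println(df.getPath()));
    }
}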