/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.common.table.view;

import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.avro.model.HoodieRestoreMetadata;
import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieFileGroup;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.AvroUtils;
import com.uber.hoodie.common.util.CompactionUtils;
import com.uber.hoodie.common.util.TimelineDiffHelper;
import com.uber.hoodie.common.util.TimelineDiffHelper.TimelineDiffResult;
import com.uber.hoodie.common.util.collection.Pair;
import com.uber.hoodie.exception.HoodieException;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * Adds the capability to incrementally sync the changes to the file-system view as and when new instants get
 * completed.
 */
public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTableFileSystemView {

  private static Logger log = LogManager.getLogger(IncrementalTimelineSyncFileSystemView.class);

  // Allows incremental Timeline syncing
  private final boolean incrementalTimelineSyncEnabled;

  // This is the visible active timeline used only for incremental view syncing
  private HoodieTimeline visibleActiveTimeline;

  protected IncrementalTimelineSyncFileSystemView(boolean enableIncrementalTimelineSync) {
    this.incrementalTimelineSyncEnabled = enableIncrementalTimelineSync;
  }

  @Override
  protected void refreshTimeline(HoodieTimeline visibleActiveTimeline) {
    this.visibleActiveTimeline = visibleActiveTimeline;
    super.refreshTimeline(visibleActiveTimeline);
  }
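  /**
   * Attempt an incremental sync first: if incremental syncing is enabled and the diff between the old and new
   * timelines is deemed safe to apply, only the newly seen instants are synced and the visible timeline is reset to
   * the new one. On any exception, or when the diff cannot be applied incrementally, fall back to the full sync
   * implemented by the parent class.
   */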
  @Override
  protected void runSync(HoodieTimeline oldTimeline, HoodieTimeline newTimeline) {
    try {
      if (incrementalTimelineSyncEnabled) {
        TimelineDiffResult diffResult = TimelineDiffHelper.getNewInstantsForIncrementalSync(oldTimeline, newTimeline);
        if (diffResult.canSyncIncrementally()) {
          log.info("Doing incremental sync");
          runIncrementalSync(newTimeline, diffResult);
          log.info("Finished incremental sync");
          // Reset timeline to latest
          refreshTimeline(newTimeline);
          return;
        }
      }
    } catch (Exception ioe) {
      log.error("Got exception trying to perform incremental sync. Reverting to complete sync", ioe);
    }

    log.warn("Incremental Sync of timeline is turned off or deemed unsafe. Will revert to full syncing");
    super.runSync(oldTimeline, newTimeline);
  }

  /**
   * Run incremental sync based on the diff result produced.
   *
   * @param timeline New Timeline
   * @param diffResult Timeline Diff Result
   */
  private void runIncrementalSync(HoodieTimeline timeline, TimelineDiffResult diffResult) {

    log.info("Timeline Diff Result is :" + diffResult);

    // First remove pending compaction instants which were completed
    diffResult.getFinishedCompactionInstants().stream().forEach(instant -> {
      try {
        removePendingCompactionInstant(timeline, instant);
      } catch (IOException e) {
        throw new HoodieException(e);
      }
    });

    // Add new completed instants found in the latest timeline
    diffResult.getNewlySeenInstants().stream()
        .filter(instant -> instant.isCompleted() || instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION))
        .forEach(instant -> {
          try {
            if (instant.getAction().equals(HoodieTimeline.COMMIT_ACTION)
                || instant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) {
              addCommitInstant(timeline, instant);
            } else if (instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)) {
              addRestoreInstant(timeline, instant);
            } else if (instant.getAction().equals(HoodieTimeline.CLEAN_ACTION)) {
              addCleanInstant(timeline, instant);
            } else if (instant.getAction().equals(HoodieTimeline.COMPACTION_ACTION)) {
              addPendingCompactionInstant(timeline, instant);
            } else if (instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) {
              addRollbackInstant(timeline, instant);
            }
          } catch (IOException ioe) {
            throw new HoodieException(ioe);
          }
        });
  }

  /**
   * Remove pending compaction instant.
   *
   * @param timeline New Hoodie Timeline
   * @param instant Compaction Instant to be removed
   */
  private void removePendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    log.info("Removing completed compaction instant (" + instant + ")");
    HoodieCompactionPlan plan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
    removePendingCompactionOperations(CompactionUtils.getPendingCompactionOperations(instant, plan)
        .map(instantPair -> Pair.of(instantPair.getValue().getKey(),
            CompactionOperation.convertFromAvroRecordInstance(instantPair.getValue().getValue()))));
  }
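  // A pending compaction shows up in the view as a placeholder: for each operation in the compaction plan, a new
  // (empty) file slice is opened at the compaction instant time on the affected file group, so that changes arriving
  // via later delta-commits can be tracked against the new slice while the compaction is still in flight.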
  /**
   * Add newly found compaction instant.
   *
   * @param timeline Hoodie Timeline
   * @param instant Compaction Instant
   */
  private void addPendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    log.info("Syncing pending compaction instant (" + instant + ")");
    HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
    List<Pair<String, CompactionOperation>> pendingOps = CompactionUtils
        .getPendingCompactionOperations(instant, compactionPlan)
        .map(p -> Pair.of(p.getValue().getKey(),
            CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue())))
        .collect(Collectors.toList());
    // First, update Pending compaction instants
    addPendingCompactionOperations(pendingOps.stream());

    Map<String, List<Pair<String, HoodieFileGroup>>> partitionToFileGroups = pendingOps.stream().map(opPair -> {
      String compactionInstantTime = opPair.getKey();
      HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline);
      fileGroup.addNewFileSliceAtInstant(compactionInstantTime);
      return Pair.of(compactionInstantTime, fileGroup);
    }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath()));

    partitionToFileGroups.entrySet().forEach(entry -> {
      if (isPartitionAvailableInStore(entry.getKey())) {
        applyDeltaFileSlicesToPartitionView(entry.getKey(),
            entry.getValue().stream().map(Pair::getValue).collect(Collectors.toList()), DeltaApplyMode.ADD);
      }
    });
  }

  /**
   * Add newly found commit/delta-commit instant.
   *
   * @param timeline Hoodie Timeline
   * @param instant Instant
   */
  private void addCommitInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    log.info("Syncing committed instant (" + instant + ")");
    HoodieCommitMetadata commitMetadata =
        HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
    commitMetadata.getPartitionToWriteStats().entrySet().stream().forEach(entry -> {
      String partition = entry.getKey();
      if (isPartitionAvailableInStore(partition)) {
        log.info("Syncing partition (" + partition + ") of instant (" + instant + ")");
        FileStatus[] statuses = entry.getValue().stream().map(p -> {
          FileStatus status = new FileStatus(p.getFileSizeInBytes(), false, 0, 0, 0, 0, null, null, null,
              new Path(String.format("%s/%s", metaClient.getBasePath(), p.getPath())));
          return status;
        }).toArray(FileStatus[]::new);
        List<HoodieFileGroup> fileGroups =
            buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), false);
        applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.ADD);
      } else {
        log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded");
      }
    });
    log.info("Done Syncing committed instant (" + instant + ")");
  }

  /**
   * Add newly found restore instant.
   *
   * @param timeline Hoodie Timeline
   * @param instant Restore Instant
   */
  private void addRestoreInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    log.info("Syncing restore instant (" + instant + ")");
    HoodieRestoreMetadata metadata =
        AvroUtils.deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRestoreMetadata.class);

    Map<String, List<Pair<String, String>>> partitionFiles = metadata.getHoodieRestoreMetadata().entrySet().stream()
        .flatMap(entry -> {
          return entry.getValue().stream().flatMap(e -> e.getPartitionMetadata().entrySet().stream().flatMap(e2 -> {
            return e2.getValue().getSuccessDeleteFiles().stream().map(x -> Pair.of(e2.getKey(), x));
          }));
        }).collect(Collectors.groupingBy(Pair::getKey));
    partitionFiles.entrySet().stream().forEach(e -> {
      removeFileSlicesForPartition(timeline, instant, e.getKey(),
          e.getValue().stream().map(x -> x.getValue()).collect(Collectors.toList()));
    });
    log.info("Done Syncing restore instant (" + instant + ")");
  }

  /**
   * Add newly found rollback instant.
   *
   * @param timeline Hoodie Timeline
   * @param instant Rollback Instant
   */
  private void addRollbackInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    log.info("Syncing rollback instant (" + instant + ")");
    HoodieRollbackMetadata metadata =
        AvroUtils.deserializeAvroMetadata(timeline.getInstantDetails(instant).get(), HoodieRollbackMetadata.class);

    metadata.getPartitionMetadata().entrySet().stream().forEach(e -> {
      removeFileSlicesForPartition(timeline, instant, e.getKey(), e.getValue().getSuccessDeleteFiles());
    });
    log.info("Done Syncing rollback instant (" + instant + ")");
  }
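  // Restore, rollback and clean instants all reduce to the same view operation: the files their metadata reports as
  // successfully deleted are removed from the affected partitions via removeFileSlicesForPartition().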
  /**
   * Add newly found clean instant.
   *
   * @param timeline Timeline
   * @param instant Clean instant
   */
  private void addCleanInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    log.info("Syncing cleaner instant (" + instant + ")");
    HoodieCleanMetadata cleanMetadata =
        AvroUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(instant).get());
    cleanMetadata.getPartitionMetadata().entrySet().stream().forEach(entry -> {
      removeFileSlicesForPartition(timeline, instant, entry.getKey(), entry.getValue().getSuccessDeleteFiles());
    });
    log.info("Done Syncing cleaner instant (" + instant + ")");
  }

  private void removeFileSlicesForPartition(HoodieTimeline timeline, HoodieInstant instant, String partition,
      List<String> paths) {
    if (isPartitionAvailableInStore(partition)) {
      log.info("Removing file slices for partition (" + partition + ") for instant (" + instant + ")");
      FileStatus[] statuses = paths.stream().map(p -> {
        FileStatus status = new FileStatus();
        status.setPath(new Path(p));
        return status;
      }).toArray(FileStatus[]::new);
      List<HoodieFileGroup> fileGroups =
          buildFileGroups(statuses, timeline.filterCompletedAndCompactionInstants(), false);
      applyDeltaFileSlicesToPartitionView(partition, fileGroups, DeltaApplyMode.REMOVE);
    } else {
      log.warn("Skipping partition (" + partition + ") when syncing instant (" + instant + ") as it is not loaded");
    }
  }

  /**
   * Apply mode, i.e. whether to add or remove the delta view.
   */
  enum DeltaApplyMode {
    ADD, REMOVE
  }
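  /*
   * Note: the base implementation of applyDeltaFileSlicesToPartitionView below rewrites the whole partition view.
   * As a purely illustrative sketch (not part of this class), a store supporting fine-granular updates could
   * override it along these lines, where upsertFileGroup/deleteFileGroup are hypothetical helpers of the subclass:
   *
   *   @Override
   *   protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
   *       DeltaApplyMode mode) {
   *     for (HoodieFileGroup fg : deltaFileGroups) {
   *       if (mode == DeltaApplyMode.ADD) {
   *         upsertFileGroup(partition, fg);   // hypothetical: point update in e.g. RocksDB
   *       } else {
   *         deleteFileGroup(partition, fg);   // hypothetical: point delete
   *       }
   *     }
   *   }
   */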
  /**
   * Apply changes to the partition file-system view. The base implementation overwrites the entire partition's view,
   * assuming some sort of map (in-memory/disk-based) is used. For view implementations that support fine-granular
   * updates (e.g. RocksDB), override this method.
   *
   * @param partition PartitionPath
   * @param deltaFileGroups Changed file-slices aggregated as file-groups
   * @param mode Delta Apply mode
   */
  protected void applyDeltaFileSlicesToPartitionView(String partition, List<HoodieFileGroup> deltaFileGroups,
      DeltaApplyMode mode) {
    if (deltaFileGroups.isEmpty()) {
      log.info("No delta file groups for partition :" + partition);
      return;
    }

    List<HoodieFileGroup> fileGroups = fetchAllStoredFileGroups(partition).collect(Collectors.toList());
    /*
     * Note that while finding the new data/log files added/removed, the path stored in metadata will be missing
     * the base-path, scheme and authority. Ensure the matching process takes care of this discrepancy.
     */
    Map<String, HoodieDataFile> viewDataFiles = fileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices).map(FileSlice::getDataFile)
        .filter(Optional::isPresent).map(Optional::get)
        .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Note: Delta log files and data files can be empty when adding/removing pending compactions
    Map<String, HoodieDataFile> deltaDataFiles = deltaFileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices).map(FileSlice::getDataFile)
        .filter(Optional::isPresent).map(Optional::get)
        .map(df -> Pair.of(Path.getPathWithoutSchemeAndAuthority(new Path(df.getPath())).toString(), df))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

    Map<String, HoodieLogFile> viewLogFiles = fileGroups.stream().flatMap(HoodieFileGroup::getAllRawFileSlices)
        .flatMap(FileSlice::getLogFiles)
        .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    Map<String, HoodieLogFile> deltaLogFiles = deltaFileGroups.stream()
        .flatMap(HoodieFileGroup::getAllRawFileSlices).flatMap(FileSlice::getLogFiles)
        .map(lf -> Pair.of(Path.getPathWithoutSchemeAndAuthority(lf.getPath()).toString(), lf))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

    switch (mode) {
      case ADD:
        viewDataFiles.putAll(deltaDataFiles);
        viewLogFiles.putAll(deltaLogFiles);
        break;
      case REMOVE:
        deltaDataFiles.keySet().stream().forEach(p -> viewDataFiles.remove(p));
        deltaLogFiles.keySet().stream().forEach(p -> viewLogFiles.remove(p));
        break;
      default:
        throw new IllegalStateException("Unknown diff apply mode=" + mode);
    }

    HoodieTimeline timeline = deltaFileGroups.stream().map(df -> df.getTimeline()).findAny().get();
    List<HoodieFileGroup> fgs =
        buildFileGroups(viewDataFiles.values().stream(), viewLogFiles.values().stream(), timeline, true);
    storePartitionView(partition, fgs);
  }

  @Override
  public HoodieTimeline getTimeline() {
    return visibleActiveTimeline;
  }
}