TestHoodieClientBase.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie;

import static com.uber.hoodie.common.HoodieTestDataGenerator.DEFAULT_PARTITION_DEPTH;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieClientTestUtils;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.SyncableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.view.FileSystemViewStorageConfig;
import com.uber.hoodie.common.table.view.FileSystemViewStorageType;
import com.uber.hoodie.common.util.ConsistencyGuardConfig;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.index.HoodieIndex.IndexType;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.junit.After;
import org.junit.Before;
import org.junit.rules.TemporaryFolder;

/**
 * Base class providing setup/cleanup and utility methods for Hoodie client-facing tests.
 */
public class TestHoodieClientBase implements Serializable {

  protected static Logger logger = LogManager.getLogger(TestHoodieClientBase.class);

  protected transient JavaSparkContext jsc = null;
  protected transient SQLContext sqlContext;
  protected transient FileSystem fs;
  protected String basePath = null;
  protected TemporaryFolder folder = null;
  protected transient HoodieTestDataGenerator dataGen = null;
  private HoodieWriteClient writeClient;
  private HoodieReadClient readClient;
  protected HoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) {
    return getHoodieWriteClient(cfg, false);
  }

  protected HoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg, boolean rollbackInflightCommit) {
    return getHoodieWriteClient(cfg, rollbackInflightCommit, HoodieIndex.createIndex(cfg, jsc));
  }

  protected HoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg, boolean rollbackInflightCommit,
      HoodieIndex index) {
    closeWriteClient();
    writeClient = new HoodieWriteClient(jsc, cfg, rollbackInflightCommit, index);
    return writeClient;
  }

  protected HoodieReadClient getHoodieReadClient(String basePath) {
    closeReadClient();
    readClient = new HoodieReadClient(jsc, basePath);
    return readClient;
  }

  private void closeWriteClient() {
    if (null != writeClient) {
      writeClient.close();
      writeClient = null;
    }
  }

  private void closeReadClient() {
    if (null != readClient) {
      readClient.close();
      readClient = null;
    }
  }

  @Before
  public void init() throws IOException {
    // Initialize a local spark env
    jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieClient"));
    jsc.setLogLevel("ERROR");

    // SQLContext stuff
    sqlContext = new SQLContext(jsc);

    // Create a temp folder as the base path
    folder = new TemporaryFolder();
    folder.create();
    basePath = folder.getRoot().getAbsolutePath();
    fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
    if (fs instanceof LocalFileSystem) {
      LocalFileSystem lfs = (LocalFileSystem) fs;
      // With LocalFileSystem, when checksums are disabled, fs.open() returns an inputStream which is an
      // FSInputStream. This causes ClassCastExceptions in LogRecordScanner (and potentially other places
      // calling fs.open). So, for the tests, we enforce checksum verification to circumvent the problem.
      lfs.setVerifyChecksum(true);
    }
    HoodieTestUtils.initTableType(jsc.hadoopConfiguration(), basePath, getTableType());
    dataGen = new HoodieTestDataGenerator();
  }

  /**
   * Properly release resources at the end of each test.
   */
  @After
  public void tearDown() throws IOException {
    closeWriteClient();
    closeReadClient();
    if (null != sqlContext) {
      logger.info("Clearing sql context cache of spark-session used in previous test-case");
      sqlContext.clearCache();
    }
    if (null != jsc) {
      logger.info("Closing spark context used in previous test-case");
      jsc.close();
    }
    // Remove the temp workspace used as the base path
    if (null != folder) {
      logger.info("Explicitly removing workspace used in previously run test-case");
      folder.delete();
    }
    if (null != fs) {
      logger.warn("Closing file-system instance used in previous test-run");
      fs.close();
    }
  }
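  // Illustrative only: a minimal sketch of how a subclass test might use the lifecycle and client helpers
  // above. The class name, test name and commit time "001" are hypothetical, and HoodieWriteClient.upsert is
  // assumed as the write API; init()/tearDown() run automatically around each @Test.
  //
  //   public class TestHoodieClientExample extends TestHoodieClientBase {
  //     @Test
  //     public void testSimpleInsert() throws Exception {
  //       HoodieWriteClient client = getHoodieWriteClient(getConfig());  // jsc/fs/dataGen set up by init()
  //       String newCommitTime = "001";
  //       client.startCommitWithTime(newCommitTime);
  //       List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
  //       List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
  //       assertNoWriteErrors(statuses);                                 // tearDown() closes the client
  //     }
  //   }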
  /**
   * Get Default HoodieWriteConfig for tests
   *
   * @return Default Hoodie Write Config for tests
   */
  protected HoodieWriteConfig getConfig() {
    return getConfigBuilder().build();
  }

  /**
   * Get Config builder with default configs set
   *
   * @return Config Builder
   */
  HoodieWriteConfig.Builder getConfigBuilder() {
    return HoodieWriteConfig.newBuilder().withPath(basePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
        .withBulkInsertParallelism(2).withFinalizeWriteParallelism(2)
        .withWriteStatusClass(MetadataMergeWriteStatus.class)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
        .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
        .forTable("test-trip-table")
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
        .withEmbeddedTimelineServerEnabled(true)
        .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
            .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build());
  }
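  // Illustrative only: tests needing non-default behavior can start from getConfigBuilder() and override
  // individual settings before building. The 100KB sizes below are hypothetical; only builder methods already
  // used above are exercised here.
  //
  //   HoodieWriteConfig cfg = getConfigBuilder()
  //       .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(100 * 1024).build())
  //       .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(100 * 1024).build())
  //       .build();
  //   HoodieWriteClient client = getHoodieWriteClient(cfg);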
  protected HoodieTable getHoodieTable(HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
    HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);
    ((SyncableFileSystemView) (table.getRTFileSystemView())).reset();
    return table;
  }

  /**
   * Assert no failures in writing hoodie files
   *
   * @param statuses List of Write Status
   */
  static void assertNoWriteErrors(List<WriteStatus> statuses) {
    // Verify there are no errors
    for (WriteStatus status : statuses) {
      assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors());
    }
  }

  /**
   * Ensure presence of partition meta-data at known depth
   *
   * @param partitionPaths Partition paths to check
   * @param fs File System
   * @throws IOException in case of error
   */
  void assertPartitionMetadata(String[] partitionPaths, FileSystem fs) throws IOException {
    for (String partitionPath : partitionPaths) {
      assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, new Path(basePath, partitionPath)));
      HoodiePartitionMetadata pmeta = new HoodiePartitionMetadata(fs, new Path(basePath, partitionPath));
      pmeta.readFromFS();
      assertEquals(DEFAULT_PARTITION_DEPTH, pmeta.getPartitionDepth());
    }
  }

  /**
   * Ensure records have the location field set
   *
   * @param taggedRecords Tagged Records
   * @param commitTime Commit Timestamp
   */
  void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
    for (HoodieRecord rec : taggedRecords) {
      assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
      assertEquals("All records should have commit time " + commitTime + ", since updates were made",
          commitTime, rec.getCurrentLocation().getInstantTime());
    }
  }

  /**
   * Assert that there is no duplicate key at the partition level
   *
   * @param records List of Hoodie records
   */
  void assertNodupesWithinPartition(List<HoodieRecord> records) {
    Map<String, Set<String>> partitionToKeys = new HashMap<>();
    for (HoodieRecord r : records) {
      String key = r.getRecordKey();
      String partitionPath = r.getPartitionPath();
      if (!partitionToKeys.containsKey(partitionPath)) {
        partitionToKeys.put(partitionPath, new HashSet<>());
      }
      assertFalse("key " + key + " is duplicate within partition " + partitionPath,
          partitionToKeys.get(partitionPath).contains(key));
      partitionToKeys.get(partitionPath).add(key);
    }
  }

  /**
   * Helper to wrap a record-generation function for testing the prepped version of the APIs. Prepped APIs expect
   * records that are already de-duped and have their location set. This wrapper takes care of setting the record
   * location; uniqueness is guaranteed by the record-generation function itself.
   *
   * @param writeConfig Hoodie Write Config
   * @param recordGenFunction Records Generation function
   * @return Wrapped function
   */
  private Function2<List<HoodieRecord>, String, Integer> wrapRecordsGenFunctionForPreppedCalls(
      final HoodieWriteConfig writeConfig, final Function2<List<HoodieRecord>, String, Integer> recordGenFunction) {
    return (commit, numRecords) -> {
      final HoodieIndex index = HoodieIndex.createIndex(writeConfig, jsc);
      List<HoodieRecord> records = recordGenFunction.apply(commit, numRecords);
      final HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true);
      HoodieTable table = HoodieTable.getHoodieTable(metaClient, writeConfig, jsc);
      JavaRDD<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), jsc, table);
      return taggedRecords.collect();
    };
  }

  /**
   * Generate a wrapper for the record-generation function, for testing prepped APIs
   *
   * @param isPreppedAPI Flag to indicate if this is for testing the prepped version of the APIs
   * @param writeConfig Hoodie Write Config
   * @param wrapped Actual records generation function
   * @return Wrapped function
   */
  Function2<List<HoodieRecord>, String, Integer> generateWrapRecordsFn(boolean isPreppedAPI,
      HoodieWriteConfig writeConfig, Function2<List<HoodieRecord>, String, Integer> wrapped) {
    if (isPreppedAPI) {
      return wrapRecordsGenFunctionForPreppedCalls(writeConfig, wrapped);
    } else {
      return wrapped;
    }
  }
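  // Illustrative only: a sketch of how the wrapper above is combined with the data generator when a test
  // exercises a prepped write API (one that takes already-tagged records). The commit time "001" and record
  // count are hypothetical; locations are tagged against the current table state.
  //
  //   Function2<List<HoodieRecord>, String, Integer> genFn =
  //       generateWrapRecordsFn(true, getConfig(), dataGen::generateInserts);
  //   List<HoodieRecord> preppedRecords = genFn.apply("001", 100);  // de-duped, locations set via the index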
  /**
   * Helper to insert the first batch of records and do regular assertions on the state after successful completion
   *
   * @param writeConfig Hoodie Write Config
   * @param client Hoodie Write Client
   * @param newCommitTime New Commit Timestamp to be used
   * @param initCommitTime Begin Timestamp (usually "000")
   * @param numRecordsInThisCommit Number of records to be added in the new commit
   * @param writeFn Write Function to be used for insertion
   * @param isPreppedAPI Boolean flag to indicate writeFn expects prepped records
   * @param assertForCommit Enable Assertion of Writes
   * @param expRecordsInThisCommit Expected number of records in this commit
   * @return RDD of write-status
   * @throws Exception in case of error
   */
  JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, HoodieWriteClient client, String newCommitTime,
      String initCommitTime, int numRecordsInThisCommit,
      Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
      boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
    final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
        generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts);

    return writeBatch(client, newCommitTime, initCommitTime, Optional.empty(), initCommitTime,
        numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit,
        expRecordsInThisCommit, 1);
  }

  /**
   * Helper to upsert a batch of records and do regular assertions on the state after successful completion
   *
   * @param writeConfig Hoodie Write Config
   * @param client Hoodie Write Client
   * @param newCommitTime New Commit Timestamp to be used
   * @param prevCommitTime Commit Timestamp used in previous commit
   * @param commitTimesBetweenPrevAndNew Sample of Timestamps between prevCommitTime and newCommitTime
   * @param initCommitTime Begin Timestamp (usually "000")
   * @param numRecordsInThisCommit Number of records to be added in the new commit
   * @param writeFn Write Function to be used for upsert
   * @param isPreppedAPI Boolean flag to indicate writeFn expects prepped records
   * @param assertForCommit Enable Assertion of Writes
   * @param expRecordsInThisCommit Expected number of records in this commit
   * @param expTotalRecords Expected number of records when scanned
   * @param expTotalCommits Expected number of commits (including this commit)
   * @return RDD of write-status
   * @throws Exception in case of error
   */
  JavaRDD<WriteStatus> updateBatch(HoodieWriteConfig writeConfig, HoodieWriteClient client, String newCommitTime,
      String prevCommitTime, Optional<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
      int numRecordsInThisCommit,
      Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
      boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
    final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
        generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateUniqueUpdates);

    return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime,
        numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords,
        expTotalCommits);
  }

  /**
   * Helper to insert/upsert a batch of records and do regular assertions on the state after successful completion
   *
   * @param client Hoodie Write Client
   * @param newCommitTime New Commit Timestamp to be used
   * @param prevCommitTime Commit Timestamp used in previous commit
   * @param commitTimesBetweenPrevAndNew Sample of Timestamps between prevCommitTime and newCommitTime
   * @param initCommitTime Begin Timestamp (usually "000")
   * @param numRecordsInThisCommit Number of records to be added in the new commit
   * @param recordGenFunction Records Generation Function
   * @param writeFn Write Function to be used for upsert
   * @param assertForCommit Enable Assertion of Writes
   * @param expRecordsInThisCommit Expected number of records in this commit
   * @param expTotalRecords Expected number of records when scanned
   * @param expTotalCommits Expected number of commits (including this commit)
   * @return RDD of write-status
   * @throws Exception in case of error
   */
  JavaRDD<WriteStatus> writeBatch(HoodieWriteClient client, String newCommitTime, String prevCommitTime,
      Optional<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
      Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
      Function3<JavaRDD<WriteStatus>, HoodieWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
      boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {

    // Perform the write (insert or upsert, depending on writeFn)
    client.startCommitWithTime(newCommitTime);

    List<HoodieRecord> records = recordGenFunction.apply(newCommitTime, numRecordsInThisCommit);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

    JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
    List<WriteStatus> statuses = result.collect();
    assertNoWriteErrors(statuses);

    // Check that the partition metadata is written out
    assertPartitionMetadata(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, fs);

    // Verify that there is a commit
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();

    if (assertForCommit) {
      assertEquals("Expecting " + expTotalCommits + " commits.", expTotalCommits,
          timeline.findInstantsAfter(initCommitTime, Integer.MAX_VALUE).countInstants());
      assertEquals("Latest commit should be " + newCommitTime, newCommitTime,
          timeline.lastInstant().get().getTimestamp());
      assertEquals("Must contain " + expRecordsInThisCommit + " records", expRecordsInThisCommit,
          HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());

      // Check that the entire dataset still has all the records
      String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
      for (int i = 0; i < fullPartitionPaths.length; i++) {
        fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
      }
      assertEquals("Must contain " + expTotalRecords + " records", expTotalRecords,
          HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count());

      // Check incremental consumption from prevCommitTime
      assertEquals("Incremental consumption from " + prevCommitTime + " should give all records in latest commit",
          HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
          HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, prevCommitTime).count());
      if (commitTimesBetweenPrevAndNew.isPresent()) {
        commitTimesBetweenPrevAndNew.get().forEach(ct -> {
          assertEquals("Incremental consumption from " + ct + " should give all records in latest commit",
              HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
              HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, ct).count());
        });
      }
    }
    return result;
  }
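  // Illustrative only: a hedged sketch of chaining the batch helpers above in a test. Commit times and record
  // counts are hypothetical, and HoodieWriteClient::upsert is assumed as the write function. 200 records are
  // inserted at "001", then 100 of them are updated at "002", leaving 200 total records across 2 commits.
  //
  //   HoodieWriteConfig cfg = getConfig();
  //   HoodieWriteClient client = getHoodieWriteClient(cfg);
  //   insertFirstBatch(cfg, client, "001", "000", 200, HoodieWriteClient::upsert, false, true, 200);
  //   updateBatch(cfg, client, "002", "001", Optional.empty(), "000", 100, HoodieWriteClient::upsert,
  //       false, true, 100, 200, 2);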
  /**
   * Get Cleaner state corresponding to a partition path
   *
   * @param hoodieCleanStats List of Clean Stats
   * @param partitionPath Partition path for filtering
   * @return Cleaner state corresponding to partition path
   */
  HoodieCleanStat getCleanStat(List<HoodieCleanStat> hoodieCleanStats, String partitionPath) {
    return hoodieCleanStats.stream().filter(e -> e.getPartitionPath().equals(partitionPath)).findFirst().get();
  }

  /**
   * Utility to simulate a commit touching the given files in a partition
   *
   * @param files List of file-Ids to be touched
   * @param partitionPath Partition
   * @param commitTime Commit Timestamp
   * @throws IOException in case of error
   */
  void updateAllFilesInPartition(List<String> files, String partitionPath, String commitTime) throws IOException {
    for (String fileId : files) {
      HoodieTestUtils.createDataFile(basePath, partitionPath, commitTime, fileId);
    }
  }

  /**
   * Helper to create new data files in a partition
   *
   * @param partitionPath Partition
   * @param commitTime Commit Timestamp
   * @param numFiles Number of files to be added
   * @return Created files
   * @throws IOException in case of error
   */
  List<String> createFilesInPartition(String partitionPath, String commitTime, int numFiles) throws IOException {
    List<String> files = new ArrayList<>();
    for (int i = 0; i < numFiles; i++) {
      files.add(HoodieTestUtils.createNewDataFile(basePath, partitionPath, commitTime));
    }
    return files;
  }

  // Functional Interfaces for passing lambda and Hoodie Write API contexts

  @FunctionalInterface
  interface Function2<R, T1, T2> {

    R apply(T1 v1, T2 v2) throws IOException;
  }

  @FunctionalInterface
  interface Function3<R, T1, T2, T3> {

    R apply(T1 v1, T2 v2, T3 v3) throws IOException;
  }

  protected HoodieTableType getTableType() {
    return HoodieTableType.COPY_ON_WRITE;
  }
}
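// Illustrative only: subclasses targeting a different table type can override getTableType(), since init()
// passes it to HoodieTestUtils.initTableType. A sketch for a MERGE_ON_READ variant (class name hypothetical):
//
//   public class TestMergeOnReadClient extends TestHoodieClientBase {
//     @Override
//     protected HoodieTableType getTableType() {
//       return HoodieTableType.MERGE_ON_READ;
//     }
//   }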