com.uber.hoodie.index.TestHoodieBloomIndex.java Source code

Introduction

Here is the source code for com.uber.hoodie.index.TestHoodieBloomIndex.java, a JUnit test suite for HoodieBloomIndex. It covers grouping record keys by partition, discovering the parquet files involved in a lookup, checking candidate keys against a single file, tagging record locations, fetching record locations by key, and handling bloom filter false positives.

Source

/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.index;

import com.google.common.base.Optional;
import com.google.common.collect.Lists;

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.TestRawTripPayload;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;

import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import scala.Tuple2;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

import static org.junit.Assert.*;

public class TestHoodieBloomIndex {
    private JavaSparkContext jsc = null;
    private String basePath = null;
    private transient final FileSystem fs;

    public TestHoodieBloomIndex() throws Exception {
        fs = FSUtils.getFs();
    }

    @Before
    public void init() throws IOException {
        // Initialize a local spark env
        SparkConf sparkConf = new SparkConf().setAppName("TestHoodieBloomIndex").setMaster("local[4]");
        jsc = new JavaSparkContext(sparkConf);
        // Create a temp folder as the base path
        TemporaryFolder folder = new TemporaryFolder();
        folder.create();
        basePath = folder.getRoot().getAbsolutePath();
        HoodieTestUtils.init(basePath);
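        // HoodieTestUtils.init is expected to bootstrap the hoodie table metadata (the .hoodie folder) under basePath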
    }

    @Test
    public void testLoadUUIDsInMemory() throws IOException {
        // Create an RDD of hoodie records
        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";

        String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

        JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));

        // Load to memory
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
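        // getPartitionToRowKeys groups the incoming record keys by partition path; based on the "time" field,
        // record4 falls under 2015/01/31 and the other three under 2016/01/31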
        Map<String, Iterable<String>> map = index.getPartitionToRowKeys(recordRDD);
        assertEquals(map.size(), 2);
        List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
        List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
        assertEquals(list1.size(), 3);
        assertEquals(list2.size(), 1);
    }

    @Test
    public void testLoadInvolvedFiles() throws IOException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);

        // Create some partitions, and put some files
        // "2016/01/21": 0 file
        // "2016/04/01": 1 file (2_0_20160401010101.parquet)
        // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
        new File(basePath + "/2016/01/21").mkdirs();
        new File(basePath + "/2016/04/01").mkdirs();
        new File(basePath + "/2015/03/12").mkdirs();
        new File(basePath + "/2016/04/01/2_0_20160401010101.parquet").createNewFile();
        new File(basePath + "/2015/03/12/1_0_20150312101010.parquet").createNewFile();
        new File(basePath + "/2015/03/12/3_0_20150312101010.parquet").createNewFile();
        new File(basePath + "/2015/03/12/4_0_20150312101010.parquet").createNewFile();
        List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
        JavaPairRDD<String, String> rdd = index.loadInvolvedFiles(partitions, table);
        // Still 0, as no valid commit
        assertEquals(rdd.count(), 0);

        // Add some commits
        new File(basePath + "/.hoodie").mkdirs();
        new File(basePath + "/.hoodie/20160401010101.commit").createNewFile();
        new File(basePath + "/.hoodie/20150312101010.commit").createNewFile();
        metadata = new HoodieTableMetaClient(fs, basePath);
        rdd = index.loadInvolvedFiles(partitions, table);
        final List<Tuple2<String, String>> filesList = rdd.collect();
        assertEquals(filesList.size(), 4);

        // The result is not sorted, but it should contain the same files.
        Set<String> actualFiles = new HashSet<String>() {
            {
                add(filesList.get(0)._1 + "/" + filesList.get(0)._2);
                add(filesList.get(1)._1 + "/" + filesList.get(1)._2);
                add(filesList.get(2)._1 + "/" + filesList.get(2)._2);
                add(filesList.get(3)._1 + "/" + filesList.get(3)._2);
            }
        };

        Set<String> expected = new HashSet<String>() {
            {
                add("2016/04/01/2_0_20160401010101.parquet");
                add("2015/03/12/1_0_20150312101010.parquet");
                add("2015/03/12/3_0_20150312101010.parquet");
                add("2015/03/12/4_0_20150312101010.parquet");
            }
        };
        assertEquals(expected, actualFiles);
    }

    @Test
    public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);

        String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
        // Create some records to use
        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

        // We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
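        // writeParquetFile (below) adds every written record's key to the filter, so record1 and record2 end up
        // in it implicitly; only record3 needs an explicit add. The BloomFilter arguments are taken to be the
        // expected number of entries and the target false-positive rate.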
        BloomFilter filter = new BloomFilter(10000, 0.0000001);
        filter.add(record3.getRecordKey());
        String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);

        // The bloom filter contains 3 records
        assertTrue(filter.mightContain(record1.getRecordKey()));
        assertTrue(filter.mightContain(record2.getRecordKey()));
        assertTrue(filter.mightContain(record3.getRecordKey()));
        assertFalse(filter.mightContain(record4.getRecordKey()));

        // Compare with file
        List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
                record4.getRecordKey());

        List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids,
                new Path(basePath + "/2016/01/31/" + filename));
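        // Only record1 and record2 survive: checkCandidatesAgainstFile verifies candidates against the keys
        // actually stored in the parquet file, so record3 (in the filter but never written) and record4 are dropped.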
        assertEquals(results.size(), 2);
        assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
                || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
        assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")
                || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
        // TODO(vc): Need more coverage on actual filenames
        //assertTrue(results.get(0)._2().equals(filename));
        //assertTrue(results.get(1)._2().equals(filename));
    }

    @Test
    public void testTagLocationWithEmptyRDD() throws Exception {
        // Tag an empty RDD of records
        JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD();
        // Also create the metadata and config
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config);

        // Let's tag
        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);

        try {
            bloomIndex.tagLocation(recordRDD, table);
        } catch (IllegalArgumentException e) {
            fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
        }
    }

    @Test
    public void testTagLocation() throws Exception {
        // We have some records to be tagged (two different partitions)
        String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));

        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieRecord record3 = new HoodieRecord(
                new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieRecord record4 = new HoodieRecord(
                new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
        JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));

        // Also create the metadata and config
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config);

        // Let's tag
        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
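        // tagLocation should fill in the current location (fileId) of every record that already exists in a data file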
        JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);

        // Should not find any files
        for (HoodieRecord record : taggedRecordRDD.collect()) {
            assertTrue(!record.isCurrentLocationKnown());
        }

        // We create three parquet files, each containing a single record (across two different partitions)
        String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
        String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
        String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);

        // We do the tag again
        metadata = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metadata, config);

        taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);

        // Check results
        for (HoodieRecord record : taggedRecordRDD.collect()) {
            if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1)));
            } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
            } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertTrue(!record.isCurrentLocationKnown());
            } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
            }
        }
    }

    @Test
    public void testCheckExists() throws Exception {
        // We have some records to be tagged (two different partitions)
        String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));

        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
        String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
        String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
        HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
        HoodieRecord record2 = new HoodieRecord(key2, rowChange2);
        TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
        HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
        HoodieRecord record3 = new HoodieRecord(key3, rowChange3);
        TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
        HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
        HoodieRecord record4 = new HoodieRecord(key4, rowChange4);
        JavaRDD<HoodieKey> keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4));

        // Also create the metadata and config
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config);

        // Let's tag
        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
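        // fetchRecordLocation is expected to return, for each key, the path of the data file containing it (if any)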
        JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);

        // Should not find any files
        for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
            assertTrue(!record._2.isPresent());
        }

        // We create three parquet files, each containing a single record (across two different partitions)
        String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
        String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
        String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);

        // We do the tag again
        metadata = new HoodieTableMetaClient(fs, basePath);
        table = HoodieTable.getHoodieTable(metadata, config);
        taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);

        // Check results
        for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
            if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record._2.isPresent());
                Path path1 = new Path(record._2.get());
                assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName()));
            } else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record._2.isPresent());
                Path path2 = new Path(record._2.get());
                assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName()));
            } else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertTrue(!record._2.isPresent());
            } else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record._2.isPresent());
                Path path3 = new Path(record._2.get());
                assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName()));
            }
        }
    }

    @Test
    public void testBloomFilterFalseError() throws IOException, InterruptedException {
        // We have two hoodie records
        String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
        String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";

        // We write record1 to a parquet file, using a bloom filter having both records
        String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));

        TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
        HoodieRecord record1 = new HoodieRecord(
                new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
        TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
        HoodieRecord record2 = new HoodieRecord(
                new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);

        BloomFilter filter = new BloomFilter(10000, 0.0000001);
        filter.add(record2.getRecordKey());
        String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true);
        assertTrue(filter.mightContain(record1.getRecordKey()));
        assertTrue(filter.mightContain(record2.getRecordKey()));
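        // This simulates a bloom filter false positive for record2: the filter reports it, but the record was
        // never written to the file, so tagLocation must verify against the file and leave record2 untagged.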

        // We do the tag
        JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2));
        HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
        HoodieTable table = HoodieTable.getHoodieTable(metadata, config);

        HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
        JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);

        // Check results
        for (HoodieRecord record : taggedRecordRDD.collect()) {
            if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
                assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename)));
            } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
                assertFalse(record.isCurrentLocationKnown());
            }
        }
    }

    private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema,
            BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
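        // Sleep for a second so that successive calls produce distinct (second-granularity) commit times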
        Thread.sleep(1000);
        String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
        String fileId = UUID.randomUUID().toString();
        String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);

        return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
    }

    private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
            Schema schema, BloomFilter filter, boolean createCommitTime) throws IOException {
        if (filter == null) {
            filter = new BloomFilter(10000, 0.0000001);
        }
        HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema),
                schema, filter);
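        // The write support is assumed to embed the bloom filter into the parquet footer metadata,
        // which is what the bloom index later reads back when checking candidate keys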
        ParquetWriter writer = new ParquetWriter(new Path(basePath + "/" + partitionPath + "/" + filename),
                writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
        int seqId = 1;
        String commitTime = FSUtils.getCommitTime(filename);
        for (HoodieRecord record : records) {
            GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
            HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
            HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
                    filename);
            writer.write(avroRecord);
            filter.add(record.getRecordKey());
        }
        writer.close();

        if (createCommitTime) {
            // Also make sure the commit is valid
            new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
            new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
                    .createNewFile();
        }
        return filename;
    }

    @After
    public void clean() {
        if (jsc != null) {
            jsc.stop();
        }
        if (basePath != null) {
            // File.delete() cannot remove a non-empty directory; delete the temp folder recursively instead
            FileUtils.deleteQuietly(new File(basePath));
        }
    }
}