// Source code
//
// Java tutorial
//
//
// Here is the source code for com.uber.hoodie.hadoop.HoodieInputFormatTest.java


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.hadoop;

import static org.junit.Assert.assertEquals;

import com.uber.hoodie.common.util.FSUtils;
import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

public class HoodieInputFormatTest {

    private HoodieInputFormat inputFormat;
    private JobConf jobConf;

    public void setUp() {
        inputFormat = new HoodieInputFormat();
        jobConf = new JobConf();

    public TemporaryFolder basePath = new TemporaryFolder();

    public void testInputFormatLoad() throws IOException {
        // initial commit
        File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
        InputFormatTestUtil.commit(basePath, "100");

        // Add the paths
        FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

        InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
        assertEquals(10, inputSplits.length);

        FileStatus[] files = inputFormat.listStatus(jobConf);
        assertEquals(10, files.length);

    public void testInputFormatUpdates() throws IOException {
        // initial commit
        File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
        InputFormatTestUtil.commit(basePath, "100");

        // Add the paths
        FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

        FileStatus[] files = inputFormat.listStatus(jobConf);
        assertEquals(10, files.length);

        // update files
        InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
        // Before the commit
        files = inputFormat.listStatus(jobConf);
        assertEquals(10, files.length);
        ensureFilesInCommit("Commit 200 has not been committed. We should not see files from this commit", files,
                "200", 0);
        InputFormatTestUtil.commit(basePath, "200");
        files = inputFormat.listStatus(jobConf);
        assertEquals(10, files.length);
        ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 "
                + "files from 100 commit", files, "200", 5);
        ensureFilesInCommit("5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 "
                + "files from 200 commit", files, "100", 5);

    public void testIncrementalSimple() throws IOException {
        // initial commit
        File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
        InputFormatTestUtil.commit(basePath, "100");

        // Add the paths
        FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

        InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

        FileStatus[] files = inputFormat.listStatus(jobConf);
                "We should exclude commit 100 when returning incremental pull with start commit time as " + "100",
                0, files.length);

    public void testIncrementalWithMultipleCommits() throws IOException {
        // initial commit
        File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
        InputFormatTestUtil.commit(basePath, "100");
        // Add the paths
        FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
        // update files
        InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
        InputFormatTestUtil.commit(basePath, "200");

        InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
        InputFormatTestUtil.commit(basePath, "300");

        InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
        InputFormatTestUtil.commit(basePath, "400");

        InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
        InputFormatTestUtil.commit(basePath, "500");

        InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
        InputFormatTestUtil.commit(basePath, "600");

        InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
        FileStatus[] files = inputFormat.listStatus(jobConf);
        assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5, files.length);
        ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200", files, "200",

        InputFormatTestUtil.setupIncremental(jobConf, "100", 3);
        files = inputFormat.listStatus(jobConf);

        assertEquals("Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 "
                + "commit and 1 file from 200 commit", 5, files.length);
        ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit", files, "400",
        ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit", files, "300",
        ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit", files, "200",

        InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL);
        files = inputFormat.listStatus(jobConf);

        assertEquals("Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 "
                + "commits", 5, files.length);
        ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 600 commit", files, "600",
        ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 500 commit", files, "500",
        ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 400 commit", files, "400",
        ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 300 commit", files, "300",
        ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200",

    //TODO enable this after enabling predicate pushdown
    public void testPredicatePushDown() throws IOException {
        // initial commit
        Schema schema = InputFormatTestUtil.readSchema("/sample1.avsc");
        String commit1 = "20160628071126";
        File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1);
        InputFormatTestUtil.commit(basePath, commit1);
        // Add the paths
        FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
        // check whether we have 10 records at this point
        ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10);

        // update 2 records in the original parquet file and save it as commit 200
        String commit2 = "20160629193623";
        InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
        InputFormatTestUtil.commit(basePath, commit2);

        InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
        // check whether we have 2 records at this point
        ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
                commit2, 2, 2);
        // Make sure we have the 10 records if we roll back the stattime
        InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
        ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more",
                commit1, 8, 10);
        ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more",
                commit2, 2, 10);

    private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
            int totalExpected) throws IOException {
        int actualCount = 0;
        int totalCount = 0;
        InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
        for (InputSplit split : splits) {
            RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf,
            NullWritable key = recordReader.createKey();
            ArrayWritable writable = recordReader.createValue();

            while (, writable)) {
                // writable returns an array with [field1, field2, _hoodie_commit_time,
                // _hoodie_commit_seqno]
                // Take the commit time and compare with the one we are interested in
                if (commit.equals((writable.get()[2]).toString())) {
        assertEquals(msg, expectedNumberOfRecordsInCommit, actualCount);
        assertEquals(msg, totalExpected, totalCount);

    public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, int expected) {
        int count = 0;
        for (FileStatus file : files) {
            String commitTs = FSUtils.getCommitTime(file.getPath().getName());
            if (commit.equals(commitTs)) {
        assertEquals(msg, expected, count);