Source code

Java tutorial


Here is the source code for com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer.


/*
 *  Copyright (c) 2017 Uber Technologies, Inc.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.uber.hoodie.utilities.deltastreamer;

import com.beust.jcommander.IStringConverter;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.utilities.HiveIncrementalPuller;
import com.uber.hoodie.utilities.UtilHelpers;
import com.uber.hoodie.utilities.keygen.SimpleKeyGenerator;
import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider;
import com.uber.hoodie.utilities.sources.DFSSource;
import com.uber.hoodie.utilities.keygen.KeyGenerator;
import com.uber.hoodie.utilities.schema.SchemaProvider;
import com.uber.hoodie.utilities.sources.Source;
import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
import com.uber.hoodie.utilities.sources.SourceDataFormat;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;
import java.util.Properties;

import scala.collection.JavaConversions;

/**
 * A utility which can incrementally take the output from {@link HiveIncrementalPuller} and apply it to the
 * target dataset. Does not maintain any state; it queries at runtime to see how far behind the target
 * dataset is from the source dataset. This can be overridden to force a sync from a given timestamp.
 */
public class HoodieDeltaStreamer implements Serializable {

    private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class);

    private static String CHECKPOINT_KEY = "deltastreamer.checkpoint.key";

    private final Config cfg;

     * Source to pull deltas from
    private transient Source source;

     * Schema provider that supplies the command for reading the input and writing out the
     * target table.
    private transient SchemaProvider schemaProvider;

     * Extract the key for the target dataset
    private KeyGenerator keyGenerator;

     * Filesystem used
    private transient FileSystem fs;

     * Timeline with completed commits
    private transient Optional<HoodieTimeline> commitTimelineOpt;

     * Spark context
    private transient JavaSparkContext jssc;

    /**
     * Creates a delta streamer for the given configuration.
     *
     * Obtains the filesystem through {@code FSUtils.getFs()}, checks whether {@code cfg.targetBasePath}
     * already exists to decide how {@code commitTimelineOpt} is set (empty when the path is absent),
     * and finally creates the Spark context.
     *
     * NOTE(review): getSparkContext() dereferences {@code schemaProvider}, but no call that initialises
     * the schema provider, key generator or source is visible in this constructor - confirm against
     * the upstream file that those init calls were not lost.
     *
     * @param cfg parsed command line configuration
     * @throws IOException declared by the constructor; {@code fs.exists} is the visible filesystem call
     */
    public HoodieDeltaStreamer(Config cfg) throws IOException {
        this.cfg = cfg;
        this.fs = FSUtils.getFs();

        if (fs.exists(new Path(cfg.targetBasePath))) {
            HoodieTableMetaClient meta = new HoodieTableMetaClient(fs, cfg.targetBasePath);
            // NOTE(review): this assignment is cut off after "Optional" - the expression that derives
            // the completed-commit timeline from `meta` is missing and must be restored.
            this.commitTimelineOpt = Optional
        } else {
            // Target path does not exist yet, so there is no timeline to resume from.
            this.commitTimelineOpt = Optional.empty();
        // NOTE(review): the closing brace of the else branch is missing here.

        //TODO(vc) Should these be passed from outside?
        this.jssc = getSparkContext();


    /**
     * Reads the source properties from {@code cfg.sourceConfigProps} and creates the {@link Source}
     * implementation named by {@code cfg.sourceClassName}.
     *
     * NOTE(review): this method is damaged in this copy of the file - the string expression after the
     * first statement has lost its enclosing call (presumably a log statement), the createSource(...)
     * call is cut off before its remaining arguments, and the closing brace is missing.
     *
     * @throws IOException declared by the method; presumably raised while reading the config - confirm
     */
    private void initSource() throws IOException {
        // Create the source & schema providers
        PropertiesConfiguration sourceCfg = UtilHelpers.readConfig(fs, new Path(cfg.sourceConfigProps));"Creating source " + cfg.sourceClassName + " with configs : " + sourceCfg.toString());
        this.source = UtilHelpers.createSource(cfg.sourceClassName, sourceCfg, jssc, cfg.sourceFormat,

    private void initSchemaProvider() throws IOException {
        PropertiesConfiguration schemaCfg = UtilHelpers.readConfig(fs, new Path(cfg.schemaProviderConfigProps));"Creating schema provider " + cfg.schemaProviderClassName + " with configs : "
                + schemaCfg.toString());
        this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, schemaCfg);

    private void initKeyGenerator() throws IOException {
        PropertiesConfiguration keygenCfg = UtilHelpers.readConfig(fs, new Path(cfg.keyGeneratorProps));"Creating key generator " + cfg.keyGeneratorClass + " with configs : " + keygenCfg.toString());
        this.keyGenerator = UtilHelpers.createKeyGenerator(cfg.keyGeneratorClass, keygenCfg);

    private JavaSparkContext getSparkContext() {
        SparkConf sparkConf = new SparkConf().setAppName("hoodie-delta-streamer-" + cfg.targetTableName);
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        sparkConf.set("spark.driver.maxResultSize", "2g");

        // Configure hadoop conf
        sparkConf.set("spark.hadoop.mapred.output.compress", "true");
        sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
        sparkConf.set("spark.hadoop.mapred.output.compression.codec", "");
        sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

        sparkConf = HoodieWriteClient.registerClasses(sparkConf);
        // register the schemas, so that shuffle does not serialize the full schemas
        List<Schema> schemas = Arrays.asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema());
        return new JavaSparkContext(sparkConf);

    /**
     * Runs one round of ingestion from the source into the target dataset.
     *
     * Visible steps:
     * 1. If a commit timeline exists and has a last instant, the checkpoint stored under
     *    CHECKPOINT_KEY in that commit's metadata is used to resume; a missing checkpoint raises
     *    HoodieDeltaStreamerException. If no timeline exists, the target path is initialised as a
     *    Hoodie dataset named {@code cfg.targetTableName}.
     * 2. New data is fetched from the source, starting at the resume checkpoint.
     * 3. Each GenericRecord is turned into a HoodieRecord whose payload is created from
     *    {@code cfg.payloadClassName} with {@code cfg.sourceOrderingField} as ordering value and whose
     *    key comes from the key generator.
     * 4. The records are written with insert or upsert depending on {@code cfg.operation} and the
     *    commit is finished with the new checkpoint attached as extra metadata.
     *
     * NOTE(review): this method is damaged in this copy of the file - several statements are cut off,
     * the string expressions that look like log messages have lost their enclosing calls, and a number
     * of closing braces are missing. The individual spots are marked below.
     *
     * @throws Exception declared by the method; HoodieDeltaStreamerException is thrown explicitly for a
     *                   missing checkpoint and for an unknown operation
     */
    private void sync() throws Exception {

        // Retrieve the previous round checkpoints, if any
        Optional<String> resumeCheckpointStr = Optional.empty();
        if (commitTimelineOpt.isPresent()) {
            Optional<HoodieInstant> lastCommit = commitTimelineOpt.get().lastInstant();
            if (lastCommit.isPresent()) {
                // NOTE(review): statement cut off - the expression that loads the metadata of
                // lastCommit is missing.
                HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
                if (commitMetadata.getMetadata(CHECKPOINT_KEY) != null) {
                    resumeCheckpointStr = Optional.of(commitMetadata.getMetadata(CHECKPOINT_KEY));
                } else {
                    throw new HoodieDeltaStreamerException(
                            "Unable to find previous checkpoint. Please double check if this table "
                                    + "was indeed built via delta streamer ");
        // NOTE(review): closing braces of the inner else / if blocks are missing above this line.
        } else {
            // No timeline yet: create the Hoodie dataset layout under the target base path.
            Properties properties = new Properties();
            properties.put(HoodieWriteConfig.TABLE_NAME, cfg.targetTableName);
            HoodieTableMetaClient.initializePathAsHoodieDataset(FSUtils.getFs(), cfg.targetBasePath, properties);
        }"Checkpoint to resume from : " + resumeCheckpointStr);

        // Pull the data from the source & prepare the write
        // NOTE(review): call cut off - the remaining arguments of fetchNewData are missing.
        Pair<Optional<JavaRDD<GenericRecord>>, String> dataAndCheckpoint = source.fetchNewData(resumeCheckpointStr,

        // NOTE(review): the body of this branch is incomplete; presumably it returns early when the
        // source produced no data - confirm.
        if (!dataAndCheckpoint.getKey().isPresent()) {
  "No new data, nothing to commit.. ");

        JavaRDD<GenericRecord> avroRDD = dataAndCheckpoint.getKey().get();
        // NOTE(review): the receiver and parameter of this lambda are missing; it references `gr`,
        // presumably a mapping over avroRDD - confirm.
        JavaRDD<HoodieRecord> records = -> {
            HoodieRecordPayload payload = UtilHelpers.createPayload(cfg.payloadClassName, gr,
                    (Comparable) gr.get(cfg.sourceOrderingField));
            return new HoodieRecord<>(keyGenerator.getKey(gr), payload);

        // Perform the write
        HoodieWriteConfig hoodieCfg = getHoodieClientConfig(cfg.hoodieClientProps);
        HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg);
        String commitTime = client.startCommit();"Starting commit  : " + commitTime);

        JavaRDD<WriteStatus> writeStatusRDD;
        if (cfg.operation == Operation.INSERT) {
            writeStatusRDD = client.insert(records, commitTime);
        } else if (cfg.operation == Operation.UPSERT) {
            writeStatusRDD = client.upsert(records, commitTime);
        } else {
            throw new HoodieDeltaStreamerException("Unknown operation :" + cfg.operation);

        // Simply commit for now. TODO(vc): Support better error handlers later on
        // The checkpoint returned by the source travels with the commit so the next run can resume.
        HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
        checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue());

        boolean success = client.commit(commitTime, writeStatusRDD, Optional.of(checkpointCommitMetadata));
        if (success) {
  "Commit " + commitTime + " successful!");
            // TODO(vc): Kick off hive sync from here.

        } else {
  "Commit " + commitTime + " failed!");

    private HoodieWriteConfig getHoodieClientConfig(String hoodieClientCfgPath) throws Exception {
        // TODO(vc): Double check all the options can be passed in like this. CompactionConfig, IndexConfig everything.
        return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
                .fromInputStream( Path(hoodieClientCfgPath))).build();

    private enum Operation {

    private class OperationConvertor implements IStringConverter<Operation> {
        public Operation convert(String value) throws ParameterException {
            return Operation.valueOf(value);

    private class SourceFormatConvertor implements IStringConverter<SourceDataFormat> {
        public SourceDataFormat convert(String value) throws ParameterException {
            return SourceDataFormat.valueOf(value);

    public static class Config implements Serializable {

        /** TARGET CONFIGS **/
        @Parameter(names = {
                "--target-base-path" }, description = "base path for the target hoodie dataset", required = true)
        public String targetBasePath;

        // TODO: How to obtain hive configs to register?
        @Parameter(names = { "--target-table" }, description = "name of the target table in Hive", required = true)
        public String targetTableName;

        @Parameter(names = {
                "--hoodie-client-config" }, description = "path to properties file on localfs or dfs, with hoodie client config. Sane defaults"
                        + "are used, but recommend use to provide basic things like metrics endpoints, hive configs etc")
        public String hoodieClientProps = null;

        /** SOURCE CONFIGS **/
        @Parameter(names = {
                "--source-class" }, description = "subclass of com.uber.hoodie.utilities.sources.Source to use to read data. "
                        + "built-in options: com.uber.hoodie.utilities.common.{DFSSource (default), KafkaSource, HiveIncrPullSource}")
        public String sourceClassName = DFSSource.class.getName();

        @Parameter(names = {
                "--source-config" }, description = "path to properties file on localfs or dfs, with source configs. "
                        + "For list of acceptable properties, refer the source class", required = true)
        public String sourceConfigProps = null;

        @Parameter(names = {
                "--source-format" }, description = "Format of data in source, JSON (default), Avro. All source data is "
                        + "converted to Avro using the provided schema in any case", converter = SourceFormatConvertor.class)
        public SourceDataFormat sourceFormat = SourceDataFormat.JSON;

        @Parameter(names = {
                "--source-ordering-field" }, description = "Field within source record to decide how to break ties between "
                        + " records with same key in input data. Default: 'ts' holding unix timestamp of record")
        public String sourceOrderingField = "ts";

        @Parameter(names = {
                "--key-generator-class" }, description = "Subclass of com.uber.hoodie.utilities.common.KeyExtractor to generate"
                        + "a HoodieKey from the given avro record. Built in: SimpleKeyGenerator (Uses provided field names as recordkey & partitionpath. "
                        + "Nested fields specified via dot notation, e.g: a.b.c)")
        public String keyGeneratorClass = SimpleKeyGenerator.class.getName();

        @Parameter(names = {
                "--key-generator-config" }, description = "Path to properties file on localfs or dfs, with KeyGenerator configs. "
                        + "For list of acceptable properites, refer the KeyGenerator class", required = true)
        public String keyGeneratorProps = null;

        @Parameter(names = {
                "--payload-class" }, description = "subclass of HoodieRecordPayload, that works off a GenericRecord. "
                        + "Default: SourceWrapperPayload. Implement your own, if you want to do something other than overwriting existing value")
        public String payloadClassName = DeltaStreamerAvroPayload.class.getName();

        @Parameter(names = {
                "--schemaprovider-class" }, description = "subclass of com.uber.hoodie.utilities.schema.SchemaProvider "
                        + "to attach schemas to input & target table data, built in options: FilebasedSchemaProvider")
        public String schemaProviderClassName = FilebasedSchemaProvider.class.getName();

        @Parameter(names = {
                "--schemaprovider-config" }, description = "path to properties file on localfs or dfs, with schema configs. "
                        + "For list of acceptable properties, refer the schema provider class", required = true)
        public String schemaProviderConfigProps = null;

        /** Other configs **/
        @Parameter(names = {
                "--max-input-bytes" }, description = "Maximum number of bytes to read from source. Default: 1TB")
        public long maxInputBytes = 1L * 1024 * 1024 * 1024 * 1024;

        @Parameter(names = {
                "--op" }, description = "Takes one of these values : UPSERT (default), INSERT (use when input "
                        + "is purely new data/inserts to gain speed)", converter = OperationConvertor.class)
        public Operation operation = Operation.UPSERT;

        @Parameter(names = { "--help", "-h" }, help = true)
        public Boolean help = false;

    public static void main(String[] args) throws Exception {
        final Config cfg = new Config();
        JCommander cmd = new JCommander(cfg, args);
        // TODO(vc): Do proper validation
        if ( || args.length == 0) {
        new HoodieDeltaStreamer(cfg).sync();