com.liveramp.hank.cascading.DomainBuilderTap.java Source code

Java tutorial

Introduction

Here is the source code for com.liveramp.hank.cascading.DomainBuilderTap.java, a sink-only Cascading tap that writes tuples to Hank domains.

Source

/**
 *  Copyright 2011 LiveRamp
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.liveramp.hank.cascading;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import com.liveramp.hank.hadoop.*;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import java.io.IOException;

/**
 * A sink-only tap to write tuples to Hank Domains.
 */
public class DomainBuilderTap extends Hfs {

    private static final long serialVersionUID = 1L;
    private final String domainName;
    private final Class<? extends DomainBuilderAbstractOutputFormat> outputFormatClass;

    public DomainBuilderTap(String keyFieldName, String valueFieldName, int versionNumber,
            DomainBuilderProperties properties) {
        // Set the output to the temporary output path
        super(new DomainBuilderScheme(DomainBuilderAssembly.PARTITION_FIELD_NAME, keyFieldName, valueFieldName),
                properties.getTmpOutputPath(versionNumber));
        this.domainName = properties.getDomainName();
        this.outputFormatClass = properties.getOutputFormatClass();
    }

    public void sinkConfInit(FlowProcess<JobConf> process, JobConf conf) {
        super.sinkConfInit(process, conf);
        // Output Format
        conf.setOutputFormat(this.outputFormatClass);
        // Output Committer
        conf.setOutputCommitter(DomainBuilderOutputCommitter.class);
        // Set this tap's Domain name locally in the conf
        if (conf.get(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME) != null) {
            throw new RuntimeException("Trying to set domain name configuration parameter to " + domainName
                    + " but it was previously set to "
                    + conf.get(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME));
        } else {
            conf.set(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME, domainName);
        }
    }

    @Override
    public void sourceConfInit(FlowProcess process, JobConf conf) {
        throw new RuntimeException("DomainBuilderTap cannot be used as a source");
    }

    private static class DomainBuilderScheme
            extends Scheme<JobConf, RecordReader, OutputCollector, Object[], Void> {

        private static final long serialVersionUID = 1L;
        private final String partitionFieldName;
        private final String keyFieldName;
        private final String valueFieldName;

        public DomainBuilderScheme(String partitionFieldName, String keyFieldName, String valueFieldName) {
            super(new Fields(keyFieldName, valueFieldName),
                    new Fields(partitionFieldName, keyFieldName, valueFieldName));
            this.partitionFieldName = partitionFieldName;
            this.keyFieldName = keyFieldName;
            this.valueFieldName = valueFieldName;
        }

        @Override
        public void sourceConfInit(FlowProcess<JobConf> jobConfFlowProcess,
                Tap<JobConf, RecordReader, OutputCollector> jobConfRecordReaderOutputCollectorTap,
                JobConf entries) {
            throw new RuntimeException("DomainBuilderScheme cannot be used as a source.");
        }

        @Override
        public void sinkConfInit(FlowProcess<JobConf> jobConfFlowProcess,
                Tap<JobConf, RecordReader, OutputCollector> jobConfRecordReaderOutputCollectorTap,
                JobConf entries) {
        }

        @Override
        public boolean source(FlowProcess<JobConf> jobConfFlowProcess,
                SourceCall<Object[], RecordReader> recordReaderSourceCall) throws IOException {
            throw new RuntimeException("DomainBuilderScheme cannot be used as a source.");
        }

        @Override
        public void sink(FlowProcess<JobConf> jobConfFlowProcess, SinkCall<Void, OutputCollector> sinkCall)
                throws IOException {
            TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
            IntWritable partition = new IntWritable(tupleEntry.getInteger(partitionFieldName));
            BytesWritable key = (BytesWritable) tupleEntry.getObject(keyFieldName);
            BytesWritable value = (BytesWritable) tupleEntry.getObject(valueFieldName);
            KeyAndPartitionWritable keyAndPartitionWritable = new KeyAndPartitionWritable(key, partition);
            ValueWritable valueWritable = new ValueWritable(value);
            sinkCall.getOutput().collect(keyAndPartitionWritable, valueWritable);
        }
    }
}