com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy.java Source code

Java tutorial

Introduction

Here is the source code for com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy.java

Source

/**
 *  Copyright 2012 LiveRamp
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.liveramp.cascading_ext.bloom;

import cascading.flow.Flow;
import cascading.flow.FlowStep;
import cascading.flow.FlowStepStrategy;
import cascading.flow.planner.BaseFlowStep;
import cascading.stats.FlowStepStats;
import com.liveramp.cascading_ext.assembly.CreateBloomFilter;
import com.liveramp.cascading_ext.counters.Counters;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

import java.util.List;
import java.util.Map;

/**
 * Does any configuration necessary for a job that involves stuff from BloomAssembly.
 *
 * <p>A step is treated as a bloom filter <em>producer</em> when its config carries
 * {@link BloomProps#TARGET_BLOOM_FILTER_ID}, and as a bloom filter <em>consumer</em> when its
 * config carries {@link BloomProps#SOURCE_BLOOM_FILTER_ID}. Both checks are performed
 * independently for every step.
 */
public class BloomAssemblyStrategy implements FlowStepStrategy<JobConf> {

    private static final Logger LOG = Logger.getLogger(BloomAssemblyStrategy.class);

    /**
     * Adjusts the configuration of {@code flowStep} before it runs.
     *
     * @param flow             the flow the step belongs to (not used here)
     * @param predecessorSteps the steps that ran before {@code flowStep}; searched for the step
     *                         that generated the bloom filter this step needs
     * @param flowStep         the step about to be started
     */
    @Override
    public void apply(Flow<JobConf> flow, List<FlowStep<JobConf>> predecessorSteps, FlowStep<JobConf> flowStep) {
        JobConf conf = flowStep.getConfig();

        // The job is one which generates the keys for a bloom filter.
        if (conf.get(BloomProps.TARGET_BLOOM_FILTER_ID) != null) {
            prepareBloomFilterBuilder(flowStep);
        }

        // The job is the filter which needs to use the bloom filter.
        String sourceBloomID = conf.get(BloomProps.SOURCE_BLOOM_FILTER_ID);
        if (sourceBloomID != null) {
            buildBloomfilter(sourceBloomID, flowStep, predecessorSteps);
        }
    }

    /**
     * Configures a step that produces bloom filter parts: the number of reduce tasks is set to
     * {@link BloomProps#getNumSplits} and {@code io.sort.record.percent} to
     * {@link BloomProps#getIOSortPercent}, both read from the step's own config.
     */
    private void prepareBloomFilterBuilder(FlowStep<JobConf> currentStep) {
        JobConf currentStepConf = currentStep.getConfig();
        currentStepConf.set("mapred.reduce.tasks", Integer.toString(BloomProps.getNumSplits(currentStepConf)));
        currentStepConf.set("io.sort.record.percent",
                Double.toString(BloomProps.getIOSortPercent(currentStepConf)));
    }

    /**
     * Merges bloom filter parts created across multiple splits of the keys and put the result in the distributed cache.
     *
     * <p>Concretely, for every predecessor step whose {@link BloomProps#TARGET_BLOOM_FILTER_ID}
     * equals {@code bloomID} this method:
     * <ol>
     *   <li>reads the sampling counters of that step and derives average key / tuple sizes,</li>
     *   <li>copies the properties returned by {@code BloomUtil.getPropertiesForBloomFilter}
     *       into the current step's config,</li>
     *   <li>adds the properties returned by {@code BloomUtil.getPropertiesForDistCache} to the
     *       current step's config, appending to any value already present,</li>
     *   <li>calls {@code BloomUtil.writeFilterToHdfs} with the predecessor's config and the
     *       path from {@link BloomProps#REQUIRED_BLOOM_FILTER_PATH}.</li>
     * </ol>
     *
     * @param bloomID          id of the bloom filter the current step consumes
     * @param currentStep      the consuming step whose config is modified
     * @param predecessorSteps candidate steps that may have generated the filter
     * @throws RuntimeException wrapping any exception raised while doing the above
     */
    private void buildBloomfilter(String bloomID, FlowStep<JobConf> currentStep,
            List<FlowStep<JobConf>> predecessorSteps) {
        try {
            JobConf currentStepConf = currentStep.getConfig();
            currentStepConf.set("io.sort.mb", Integer.toString(BloomProps.getBufferSize(currentStepConf)));
            // -1 is Hadoop's "no limit" value for JVM reuse.
            currentStepConf.set("mapred.job.reuse.jvm.num.tasks", "-1");

            String requiredBloomPath = currentStepConf.get(BloomProps.REQUIRED_BLOOM_FILTER_PATH);

            for (FlowStep<JobConf> step : predecessorSteps) {
                JobConf prevStepConf = step.getConfig();
                String targetBloomID = prevStepConf.get(BloomProps.TARGET_BLOOM_FILTER_ID);

                // bloomID is never null here, so a predecessor without a target id simply does not match.
                if (!bloomID.equals(targetBloomID)) {
                    continue;
                }
                LOG.info("Found step generating required bloom filter: " + targetBloomID);

                // Extract the counters from the previous job to approximate the average key/tuple size
                FlowStepStats stats = ((BaseFlowStep<JobConf>) step).getFlowStepStats();

                // Collect some of the stats gathered. This will help configure the bloom filter
                long numSampled = Counters.get(stats, CreateBloomFilter.StatsCounters.TOTAL_SAMPLED_TUPLES);
                long keySizeSum = Counters.get(stats, CreateBloomFilter.StatsCounters.KEY_SIZE_SUM);
                long matchSizeSum = Counters.get(stats, CreateBloomFilter.StatsCounters.TUPLE_SIZE_SUM);

                int avgKeySize = average(keySizeSum, numSampled);
                int avgMatchSize = average(matchSizeSum, numSampled);

                LOG.info("Avg key size ~= " + avgKeySize);
                LOG.info("Avg match size ~= " + avgMatchSize);
                for (Map.Entry<String, String> entry : BloomUtil
                        .getPropertiesForBloomFilter(avgMatchSize, avgKeySize).entrySet()) {
                    currentStepConf.set(entry.getKey(), entry.getValue());
                }

                // Put merged result in distributed cache
                LOG.info("Adding dist cache properties to config:");
                for (Map.Entry<String, String> prop : BloomUtil.getPropertiesForDistCache(requiredBloomPath)
                        .entrySet()) {
                    LOG.info(prop.getKey() + " = " + prop.getValue());
                    setOrAppend(currentStepConf, prop.getKey(), prop.getValue());
                }

                BloomUtil.writeFilterToHdfs(prevStepConf, requiredBloomPath);
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to create bloom filter!", e);
        }
    }

    /**
     * Returns {@code sum / count} truncated to an int, or 0 when nothing was sampled.
     */
    private static int average(long sum, long count) {
        if (count == 0) {
            return 0;
        }
        return (int) (sum / count);
    }

    /**
     * Sets {@code key} to {@code value}, or appends {@code "," + value} when the key already has
     * a value, so that entries configured by someone else are not overwritten.
     */
    private static void setOrAppend(JobConf conf, String key, String value) {
        String previousProperty = conf.get(key);
        if (previousProperty != null) {
            LOG.info("found already existing value for key: " + key + ", found "
                    + previousProperty + ".  Appending.");
            conf.set(key, previousProperty + "," + value);
        } else {
            conf.set(key, value);
        }
    }
}