org.apache.beam.sdk.io.hadoop.inputformat.hashing.HashingFn.java Source code

Introduction

Here is the source code for org.apache.beam.sdk.io.hadoop.inputformat.hashing.HashingFn.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.beam.sdk.io.hadoop.inputformat.hashing;

import com.google.common.collect.Lists;
import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.SerializableCoder;
import org.apache.beam.sdk.transforms.Combine.CombineFn;

/**
 * Custom {@link CombineFn} that produces a single, order-independent hash of all input Strings.
 * Each element is hashed with SHA-1 and the hashes are merged with Hashing.combineUnordered;
 * the accumulator stores the intermediate {@link HashCode}.
 */
public class HashingFn extends CombineFn<String, HashingFn.Accum, String> {

    /**
     * Serializable accumulator that stores the combined {@link HashCode} of the inputs seen so far.
     */
    public static class Accum implements Serializable {
        HashCode hashCode = null;

        public Accum(HashCode value) {
            this.hashCode = value;
        }

        private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
            in.defaultReadObject();
        }

        private void writeObject(ObjectOutputStream out) throws IOException {
            out.defaultWriteObject();
        }
    }

    @Override
    public Accum addInput(Accum accum, String input) {
        List<HashCode> elementHashes = Lists.newArrayList();
        if (accum.hashCode != null) {
            elementHashes.add(accum.hashCode);
        }
        HashCode inputHashCode = Hashing.sha1().hashString(input, StandardCharsets.UTF_8);
        elementHashes.add(inputHashCode);
        accum.hashCode = Hashing.combineUnordered(elementHashes);
        return accum;
    }

    @Override
    public Accum mergeAccumulators(Iterable<Accum> accums) {
        Accum merged = createAccumulator();
        List<HashCode> elementHashes = Lists.newArrayList();
        for (Accum accum : accums) {
            if (accum.hashCode != null) {
                elementHashes.add(accum.hashCode);
            }
        }
        // Hashing.combineUnordered rejects an empty iterable, so only combine when at
        // least one accumulator has seen input; otherwise leave the accumulator empty.
        if (!elementHashes.isEmpty()) {
            merged.hashCode = Hashing.combineUnordered(elementHashes);
        }
        return merged;
    }

    @Override
    public String extractOutput(Accum accum) {
        // Return the combined hash code of the elements in the PCollection.
        String consolidatedHash = "";
        if (accum.hashCode != null) {
            consolidatedHash = accum.hashCode.toString();
        }
        return consolidatedHash;
    }

    @Override
    public Coder<Accum> getAccumulatorCoder(CoderRegistry registry, Coder<String> inputCoder)
            throws CannotProvideCoderException {
        return SerializableCoder.of(Accum.class);
    }

    @Override
    public Coder<String> getDefaultOutputCoder(CoderRegistry registry, Coder<String> inputCoder) {
        return inputCoder;
    }

    @Override
    public Accum createAccumulator() {
        return new Accum(null);
    }
}
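
Example usage

A minimal sketch of how this CombineFn might be wired into a pipeline, assuming a Beam SDK on the classpath. The HashingFnExample class name and the input values are hypothetical, not part of the original source; Combine.globally folds every String in the PCollection into one consolidated hash string.

package org.apache.beam.sdk.io.hadoop.inputformat.hashing;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class HashingFnExample {
    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

        // Hypothetical input; in practice these would be the records read from the source under test.
        PCollection<String> values = pipeline.apply(Create.of("alpha", "beta", "gamma"));

        // Produce a single, order-independent hash string for the whole PCollection.
        PCollection<String> consolidatedHash = values.apply(Combine.globally(new HashingFn()));

        pipeline.run().waitUntilFinish();
    }
}

Because addInput and mergeAccumulators both rely on Hashing.combineUnordered, the resulting hash does not depend on the order in which elements arrive, which is what makes it useful for comparing the contents of two PCollections.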