edu.uci.ics.fuzzyjoin.hadoop.recordpairs.Reduce.java Source code

Java tutorial

Introduction

Here is the source code for edu.uci.ics.fuzzyjoin.hadoop.recordpairs.Reduce.java

Source

/**
 * Copyright 2010-2011 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: Rares Vernica <rares (at) ics.uci.edu>
 */

package edu.uci.ics.fuzzyjoin.hadoop.recordpairs;

import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import edu.uci.ics.fuzzyjoin.FuzzyJoinConfig;
import edu.uci.ics.fuzzyjoin.hadoop.FuzzyJoinDriver;

/**
 * Reducer that joins the two records of a similar pair into one output line.
 * <p>
 * Every reduce group is expected to carry at least two values of the form
 * "Similarity;Record". The first value supplies the similarity and the first
 * record; the following values are scanned until one holding a different
 * record is found (duplicates of the first record may be present).
 *
 * <ul>
 * <li>KEY1: unused</li>
 * <li>VALUE1: "Similarity;Record"</li>
 * <li>KEY2: "Record1;Similarity;Record2"</li>
 * <li>VALUE2: null</li>
 * </ul>
 *
 * @author rares
 */
public class Reduce extends MapReduceBase implements Reducer<Object, Text, Text, NullWritable> {

    /** True when the job joins a data set with itself (see {@link #configure(JobConf)}). */
    private boolean isSelfJoin;

    /** Reused output key, overwritten on every call to {@code reduce}. */
    private final Text outputKey = new Text();

    private final NullWritable outputValue = NullWritable.get();

    /**
     * Reads the join mode from the job configuration: the job is treated as a
     * self-join when {@link FuzzyJoinDriver#DATA_SUFFIX_INPUT_PROPERTY} is
     * unset or set to the empty string.
     *
     * @param job
     *            the job configuration
     */
    @Override
    public void configure(JobConf job) {
        isSelfJoin = "".equals(job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, ""));
    }

    /**
     * Emits one "Record1;Similarity;Record2" key for the group.
     * <p>
     * In a self-join the record with the smaller RID is written first;
     * otherwise the records are written in arrival order. If every remaining
     * value turns out to be a duplicate of the first record, the last value
     * read is used as the second record.
     *
     * @param unused
     *            the group key; not read
     * @param values
     *            the "Similarity;Record" values of the group
     * @param output
     *            collector receiving the joined pair as key and a null value
     * @param reporter
     *            not used
     * @throws IOException
     *             if the collector fails
     * @throws NoSuchElementException
     *             if the group holds fewer than two values
     * @throws NumberFormatException
     *             if a record's RID field is not an integer
     */
    @Override
    public void reduce(Object unused, Iterator<Text> values, OutputCollector<Text, NullWritable> output,
            Reporter reporter) throws IOException {
        String[] firstSplits = values.next().toString().split(FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR_REGEX);
        String sim = firstSplits[0];
        String record0 = firstSplits[1];
        int rid0 = parseRid(record0);

        // A pair needs a second record; fail with a descriptive message
        // instead of the bare exception Iterator.next() would raise.
        if (!values.hasNext()) {
            throw new NoSuchElementException(
                    "Expected at least two values for a record pair, but only the record with RID " + rid0
                            + " was received");
        }

        // need to loop and check RIDs because MapBroadcast might send
        // duplicates
        String record1;
        int rid1;
        do {
            record1 = values.next().toString().split(FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR_REGEX)[1];
            rid1 = parseRid(record1);

            boolean isDistinct = isSelfJoin ? rid0 != rid1 : !record0.equals(record1);
            if (isDistinct) {
                // Remaining values are deliberately left unread. Draining the
                // iterator here would make the job counters consistent (map
                // output records = reduce input records) at the cost of
                // extra iteration.
                break;
            }
        } while (values.hasNext());

        // Self-join output is normalized so the smaller RID comes first.
        boolean swap = isSelfJoin && rid0 > rid1;
        String left = swap ? record1 : record0;
        String right = swap ? record0 : record1;

        outputKey.set(left + FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR + sim
                + FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR + right);
        output.collect(outputKey, outputValue);
    }

    /** Extracts the integer RID from the key field of a record. */
    private static int parseRid(String record) {
        return Integer.parseInt(record.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX)[FuzzyJoinConfig.RECORD_KEY]);
    }
}