edu.uci.ics.fuzzyjoin.hadoop.tokens.array.ReduceSelect.java Source code

Java tutorial

Introduction

Here is the source code for edu.uci.ics.fuzzyjoin.hadoop.tokens.array.ReduceSelect.java

Source

/**
 * Copyright 2010-2011 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: Rares Vernica <rares (at) ics.uci.edu>
 */

package edu.uci.ics.fuzzyjoin.hadoop.tokens.array;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import edu.uci.ics.fuzzyjoin.hadoop.FuzzyJoinDriver;
import edu.uci.ics.fuzzyjoin.hadoop.IntArrayWritable;

public class ReduceSelect extends MapReduceBase implements Reducer<IntArrayWritable, Text, Text, NullWritable> {

    private JobConf conf;
    private final HashMap<Integer, Integer> lengthFreq = new HashMap<Integer, Integer>();
    private int max = Integer.MIN_VALUE;
    private int min = Integer.MAX_VALUE;
    private final NullWritable nullWritable = NullWritable.get();
    private boolean lengthStats;

@Override
public void close() throws IOException {
    String bucket = "YOUR_S3_BUCKET"
    String uriStr = "s3://"+bucket+"/input/";
    URI uri = URI.create(uriStr);
    FileSystem fs = FileSystem.get(uri,conf);
    String path = FileOutputFormat.getWorkOutputPath(conf).toString();

    FSDataOutputStream statsWriter = fs.create(new Path(path.toString()
            + "/" + FuzzyJoinDriver.DATA_LENGTH_STATS_FILE));
    statsWriter.writeInt(min);
    statsWriter.writeInt(max);
    for (Integer length : lengthFreq.keySet()) {
        statsWriter.writeInt(length);
        statsWriter.writeInt(lengthFreq.get(length));
    }
    statsWriter.close();

    // System.out.println(min);
    // System.out.println(max);
    // System.out.println(lengthFreq);
    // int sum = 0;
    // for (Integer freq : lengthFreq.values()) {
    // sum += freq;
    // }
    // System.out.println(sum);
}

    @Override
    public void configure(JobConf job) {
        conf = job;
        lengthStats = job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE);
    }

    public void reduce(IntArrayWritable key, Iterator<Text> values, OutputCollector<Text, NullWritable> output,
            Reporter reporter) throws IOException {
        IntWritable[] stats = (IntWritable[]) key.toArray();

        if (stats.length == 2) {
            int length = stats[1].get();
            min = Math.min(min, length);
            max = Math.max(max, length);
        } else if (stats.length > 2) {
            min = Math.min(min, stats[1].get());
            max = Math.max(max, stats[2].get());
        }

        while (values.hasNext()) {
            output.collect(values.next(), nullWritable);
            // output.collect(new Text(key.get()[0] + " " + values.next()),
            // nullWritable);
            if (stats.length == 2 && lengthStats) {
                ReduceSort.addMap(lengthFreq, stats[1].get()); // for min
                // and max
                // only
            } else if (stats.length > 2) {
                for (int i = 3; i < stats.length; i += 2) {
                    ReduceSort.addMap(lengthFreq, stats[i].get(), stats[i + 1].get());
                }
            }
        }
    }

}