org.macau.util.FuzzyJoinDriver.java Source code

Java tutorial

Introduction

Here is the source code for org.macau.util.FuzzyJoinDriver.java

Source

/**
 * Copyright 2010-2011 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: Rares Vernica <rares (at) ics.uci.edu>
 */

package org.macau.util;

import java.io.IOException;
import java.util.Date;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ProgramDriver;

/**
 * Configuration property names, their fallback values, and a small launcher for
 * the fuzzy-join Hadoop jobs.
 *
 * <p>The {@code *_PROPERTY} constants are configuration keys, all prefixed with
 * {@link #NAMESPACE}. The {@code *_VALUE} constants that {@link #run(JobConf)}
 * reports are passed as the fallback to {@code JobConf.get(key, fallback)}.
 */
public class FuzzyJoinDriver {
    /** Prefix shared by every configuration key declared in this class. */
    public static final String NAMESPACE = "fuzzyjoin";
    public static final String VERSION_PROPERTY = NAMESPACE + ".version";

    //
    // tokens package
    //
    public static final String TOKENS_PACKAGE_PROPERTY = NAMESPACE + ".tokens.package";
    public static final String TOKENS_PACKAGE_VALUE = "Scalar";

    //
    // tokens length stats
    //
    public static final String TOKENS_LENGTHSTATS_PROPERTY = NAMESPACE + ".tokens.lengthstats";
    public static final boolean TOKENS_LENGTHSTATS_VALUE = false;

    //
    // record group
    //
    public static final String RIDPAIRS_GROUP_CLASS_PROPERTY = NAMESPACE + ".ridpairs.group.class";
    public static final String RIDPAIRS_GROUP_CLASS_VALUE = "TokenIdentity";
    public static final String RIDPAIRS_GROUP_FACTOR_PROPERTY = NAMESPACE + ".ridpairs.group.factor";
    public static final int RIDPAIRS_GROUP_FACTOR_VALUE = 1;

    //
    // data properties
    //
    public static final String DATA_DIR_PROPERTY = NAMESPACE + ".data.dir";
    public static final String DATA_RAW_PROPERTY = NAMESPACE + ".data.raw";
    public static final String DATA_RAW_VALUE = "*";
    public static final String DATA_LENGTHSTATS_PROPERTY = NAMESPACE + ".data.lengthstats";
    public static final String DATA_JOININDEX_PROPERTY = NAMESPACE + ".data.joinindex";
    public static final String DATA_CRTCOPY_PROPERTY = NAMESPACE + ".data.crtcopy";
    public static final String DATA_COPY_PROPERTY = NAMESPACE + ".data.copy";
    public static final String DATA_COPY_START_PROPERTY = NAMESPACE + ".data.copystart";
    public static final String DATA_SUFFIX_INPUT_PROPERTY = NAMESPACE + ".data.suffix.input";
    public static final String DATA_NORECORDS_PROPERTY = NAMESPACE + ".data.norecords";
    public static final String DATA_DICTIONARY_FACTOR_PROPERTY = NAMESPACE + ".data.dictionary.factor";

    //
    // other constants
    //
    /** Name used for the length-statistics data (presumably a file name; confirm against callers). */
    public static final String DATA_LENGTH_STATS_FILE = "lengthstats";
    // The two names below are misspelled ("SEPSARATOR"); they are public, so the
    // spelling is kept to avoid breaking existing references.
    public static final char SEPSARATOR = ',';
    public static final String SEPSARATOR_REGEX = ",";

    /**
     * Command-line entry point.
     *
     * <p>Every sub-command registration and the {@code pgd.driver(argv)} dispatch
     * call are currently commented out, so this method performs no work and always
     * terminates the JVM with exit code {@code -1}.
     *
     * @param argv command-line arguments (unused while the dispatch call is disabled)
     */
    public static void main(String[] argv) {
        int exitCode = -1;
        ProgramDriver pgd = new ProgramDriver();
        try {
            //            pgd.addClass("dummyjob", DummyJob.class, "");
            //            pgd.addClass("recordbuild", RecordBuild.class, "");
            //            pgd.addClass("fuzzyjoin", FuzzyJoin.class, "");
            //            pgd.addClass("tokensbasic", TokensBasic.class, "");
            //            pgd.addClass("tokensimproved", TokensImproved.class, "");
            //            // pgd.addClass("ridpairsbasic", RIDPairsBasic.class, "");
            //            pgd.addClass("ridpairsimproved", RIDPairsImproved.class, "");
            //            pgd.addClass("ridpairsppjoin", RIDPairsPPJoin.class, "");
            //            pgd.addClass("recordpairsbasic", RecordPairsBasic.class, "");
            //            pgd.addClass("recordpairsimproved", RecordPairsImproved.class, "");
            //            pgd.addClass("ridrecordpairsimproved",
            //                    RIDRecordPairsImproved.class, "");
            //            pgd.addClass("ridrecordpairsppjoin", RIDRecordPairsPPJoin.class, "");
            //            pgd.addClass("recordgenerate", RecordGenerate.class, "");
            //            pgd.addClass("recordbalance", RecordBalance.class, "");
            //            pgd.driver(argv);
            //            // Success
            //            exitCode = 0;
        } catch (Throwable e) {
            e.printStackTrace();
        }
        System.exit(exitCode);
    }

    /**
     * Runs the given job and prints basic information about it to standard output:
     * a summary of its configuration, the start time, the end time, and the
     * running time in seconds (end time minus start time).
     *
     * <p>The job's jar is set to the one containing this class before submission.
     *
     * @param job the job configuration to report on and submit
     * @throws IOException if reading the job's paths or running the job fails
     */
    public static void run(JobConf job) throws IOException {
        job.setJarByClass(FuzzyJoinDriver.class);
        System.out.println(describe(job));

        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        JobClient.runJob(job);
        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);

        // Divide in double precision. Dividing by a float first narrows the long
        // millisecond count to a 24-bit mantissa, which silently rounds the
        // reported duration once a job runs longer than roughly 4.6 hours.
        double elapsedSeconds = (endTime.getTime() - startTime.getTime()) / 1000.0;
        System.out.println("The job took " + elapsedSeconds + " seconds.");
    }

    /**
     * Builds the multi-line configuration summary printed by {@link #run(JobConf)}:
     * job name, input paths, output path, map/reduce task counts, and the effective
     * value of a fixed list of properties.
     */
    private static String describe(JobConf job) {
        // Line break plus padding so continuation entries line up under the first
        // entry after the opening brace.
        final String continuation = "\n                ";

        StringBuilder report = new StringBuilder();
        report.append("FuzzyJoinDriver(").append(job.getJobName()).append(")\n");

        report.append("  Input Path:  {");
        Path[] inputs = FileInputFormat.getInputPaths(job);
        for (int i = 0; i < inputs.length; i++) {
            if (i > 0) {
                report.append(continuation);
            }
            report.append(inputs[i].toString());
        }
        report.append("}\n");

        report.append("  Output Path: ").append(FileOutputFormat.getOutputPath(job)).append("\n");
        report.append("  Map Jobs:    ").append(job.getNumMapTasks()).append("\n");
        report.append("  Reduce Jobs: ").append(job.getNumReduceTasks()).append("\n");

        // Each row is { property key, fallback shown when the job does not set it }.
        String[][] properties = {
                { FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY, FuzzyJoinConfig.SIMILARITY_NAME_VALUE },
                { FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
                        "" + FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE },
                { FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE },
                { TOKENS_PACKAGE_PROPERTY, TOKENS_PACKAGE_VALUE },
                { TOKENS_LENGTHSTATS_PROPERTY, "" + TOKENS_LENGTHSTATS_VALUE },
                { RIDPAIRS_GROUP_CLASS_PROPERTY, RIDPAIRS_GROUP_CLASS_VALUE },
                { RIDPAIRS_GROUP_FACTOR_PROPERTY, "" + RIDPAIRS_GROUP_FACTOR_VALUE },
                { FuzzyJoinConfig.DATA_TOKENS_PROPERTY, "" },
                { DATA_JOININDEX_PROPERTY, "" },
        };
        report.append("  Properties:  {");
        for (int i = 0; i < properties.length; i++) {
            if (i > 0) {
                report.append(continuation);
            }
            String key = properties[i][0];
            String fallback = properties[i][1];
            report.append(key).append("=").append(job.get(key, fallback));
        }
        report.append("}");

        return report.toString();
    }
}