org.ankus.mapreduce.algorithms.statistics.nominalstats.NominalStatsDriver.java Source code

Java tutorial

Introduction

Here is the source code for org.ankus.mapreduce.algorithms.statistics.nominalstats.NominalStatsDriver.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.ankus.mapreduce.algorithms.statistics.nominalstats;

import java.io.IOException;

import org.ankus.util.ArgumentsConstants;
import org.ankus.util.Constants;
import org.ankus.util.Usage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.ankus.util.ConfigurationVariable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Driver for the nominal-feature statistics computation (frequency and ratio).
 *
 * <p>The work is done by two chained MapReduce jobs:
 * <ol>
 *   <li>a map/reduce job using {@code NominalStatsFrequencyMapper} and
 *       {@code NominalStatsFrequencyReducer} (frequency computation), whose output is written
 *       to an intermediate directory named {@code <output path>_freqs};</li>
 *   <li>a map-only job using {@code NominalStatsRatioMapper} (ratio computation) that reads
 *       the intermediate directory and writes to the final output path. It receives the
 *       map-output record count of the first job as the total record count.</li>
 * </ol>
 *
 * @version 0.0.1
 * @date : 2013.08.20
 * @author Moonie
 */
public class NominalStatsDriver extends Configured implements Tool {

    private static final Logger logger = LoggerFactory.getLogger(NominalStatsDriver.class);

    /**
     * Runs the two-step nominal statistics job.
     *
     * @param args algorithm arguments, parsed into the configuration by
     *             {@code ConfigurationVariable.setFromArguments}
     * @return {@code 0} when both jobs complete, {@code 1} when argument parsing or
     *         either job fails
     * @throws Exception if job setup, job execution or the temporary-file cleanup fails
     */
    @Override
    public int run(String[] args) throws Exception {
        /*
         * 1st Job - Frequency Computation (MR)
         * 2nd Job - Ratio Computation (By Total Record Count, Map Only)
         */
        logger.info("Nominal Statistics MR-Job is Started..");

        // Start from the configuration injected by ToolRunner so that generic options
        // (-D, -conf, -fs, -libjars, ...) are honoured. Fall back to a fresh one when
        // run() is invoked directly on an instance that never had setConf() called.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }

        if (!ConfigurationVariable.setFromArguments(args, conf)) {
            logger.error("MR Job Setting Failed..");
            Usage.printUsage(Constants.ALGORITHM_NOMINAL_STATS);
            logger.info("Error: MR Job Setting Failed..: Configuration Failed");
            return 1;
        }

        // Suffix appended to the output path to name the intermediate directory.
        String tempStr = "_freqs";

        logger.info("1st-Step of MR-Job is Started..");

        // Seed the job with conf so cluster / generic settings reach the submitted job.
        Job job1 = new Job(conf);
        set2StepJob1(job1, conf, tempStr);
        job1.setJarByClass(NominalStatsDriver.class);

        job1.setMapperClass(NominalStatsFrequencyMapper.class);
        job1.setReducerClass(NominalStatsFrequencyReducer.class);

        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);

        job1.setOutputKeyClass(NullWritable.class);
        job1.setOutputValueClass(Text.class);

        if (!job1.waitForCompletion(true)) {
            logger.error("Error: MR(Step-1) for Nominal Stats is not Completion");
            logger.info("MR-Job is Failed..");
            return 1;
        }

        logger.info("1st-Step of MR-Job is successfully finished..");
        logger.info("2nd-Step of MR-Job is Started..");

        // Number of records emitted by the 1st job's mappers; handed to the 2nd job
        // as the total count. Looked up through the string group/counter names.
        long mapOutCnt = job1.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_OUTPUT_RECORDS").getValue();

        Job job2 = new Job(conf);
        set2StepJob2(job2, conf, tempStr, mapOutCnt);
        job2.setJarByClass(NominalStatsDriver.class);

        job2.setMapperClass(NominalStatsRatioMapper.class);

        job2.setMapOutputKeyClass(NullWritable.class);
        job2.setMapOutputValueClass(Text.class);

        // Map-only job: no reduce phase.
        job2.setNumReduceTasks(0);

        if (!job2.waitForCompletion(true)) {
            logger.error("Error: MR(Step-2) for Nominal Stats is not Completeion");
            logger.info("MR-Job is Failed..");
            return 1;
        }

        // Temporary output deletion (enabled unless TEMP_DELETE is set to something
        // other than "true").
        if ("true".equals(conf.get(ArgumentsConstants.TEMP_DELETE, "true"))) {
            String tempPathStr = conf.get(ArgumentsConstants.OUTPUT_PATH) + tempStr;
            logger.info("Temporary Files are Deleted..: " + tempPathStr);

            // Resolve the file system from the path itself rather than the default one,
            // so an output path with an explicit scheme/authority is deleted correctly.
            Path tempPath = new Path(tempPathStr);
            FileSystem fs = tempPath.getFileSystem(conf);
            fs.delete(tempPath, true);
        }
        logger.info("MR-Job is successfully finished..");
        return 0;
    }

    /**
     * Command-line entry point; runs the driver through {@link ToolRunner} and exits
     * with the code returned by {@link #run(String[])}.
     *
     * @param args command-line arguments
     * @throws Exception if the tool run fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new NominalStatsDriver(), args);
        System.exit(res);
    }

    /**
     * Configuration setting for the 1st job of the 2-step MR job.
     *
     * @param job job to configure
     * @param conf configuration holding the parsed arguments
     * @param outputPathStr suffix appended to the configured output path to form this
     *                      job's output directory
     * @throws IOException if the input or output path cannot be set on the job
     */
    private void set2StepJob1(Job job, Configuration conf, String outputPathStr) throws IOException {
        FileInputFormat.addInputPaths(job, conf.get(ArgumentsConstants.INPUT_PATH));
        FileOutputFormat.setOutputPath(job, new Path(conf.get(ArgumentsConstants.OUTPUT_PATH) + outputPathStr));

        Configuration jobConf = job.getConfiguration();
        // Defaults: tab delimiter, target index "-1" when not supplied.
        jobConf.set(ArgumentsConstants.DELIMITER, conf.get(ArgumentsConstants.DELIMITER, "\t"));
        jobConf.set(ArgumentsConstants.TARGET_INDEX, conf.get(ArgumentsConstants.TARGET_INDEX, "-1"));
    }

    /**
     * Configuration setting for the 2nd job of the 2-step MR job.
     *
     * @param job job to configure
     * @param conf configuration holding the parsed arguments
     * @param inputPathStr suffix appended to the configured output path to form this
     *                     job's input directory (the 1st job's output)
     * @param mapOutCnt total count of values (map output record count of the 1st job)
     * @throws IOException if the input or output path cannot be set on the job
     */
    private void set2StepJob2(Job job, Configuration conf, String inputPathStr, long mapOutCnt) throws IOException {
        FileInputFormat.addInputPaths(job, conf.get(ArgumentsConstants.OUTPUT_PATH) + inputPathStr);
        FileOutputFormat.setOutputPath(job, new Path(conf.get(ArgumentsConstants.OUTPUT_PATH)));

        Configuration jobConf = job.getConfiguration();
        jobConf.set(ArgumentsConstants.DELIMITER, conf.get(ArgumentsConstants.DELIMITER, "\t"));
        jobConf.set(ConfigurationVariable.MAP_OUTPUT_RECORDS_CNT, Long.toString(mapOutCnt));
    }
}