Example usage for org.apache.hadoop.mapreduce Job setReducerClass

Introduction

On this page you can find example usages for org.apache.hadoop.mapreduce Job.setReducerClass.

Prototype

public void setReducerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the Reducer for the job. As the prototype indicates, it throws IllegalStateException if the job has already been submitted.
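
For orientation, here is a minimal driver sketch that wires a Reducer into a job through setReducerClass. WordCountDriver, WordCountMapper, and WordCountReducer are hypothetical classes, standard org.apache.hadoop imports are assumed, and the call must happen before the job is submitted or it fails with IllegalStateException.

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);    // hypothetical driver class
    job.setMapperClass(WordCountMapper.class);   // hypothetical Mapper
    job.setReducerClass(WordCountReducer.class); // set the Reducer while the job is still unsubmitted
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}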

Usage

From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java

License:Apache License

/**
 * Given indexing parameters, it starts an indexing job.
 * The different indexing types are:
 * SF2HB = Simple file (csv, tsv) to HBase directly.
 * SF2HF = Simple file (csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * SF2MF = Simple file (csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * MF2HB = Map file (key and value as csv, tsv) to HBase.
 * MF2HF = Map file (key and value as csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * MF2MF = Map file (key and value as csv, tsv) to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * HB2HB = HBase to HBase.
 * HB2HF = HBase to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * HB2MF = HBase to MapFile (key as {@link Text} and value as {@link BytesWritable}).
 * @param args
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void execute(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    if (args.length < 7) {
        String err = "Usage : " + KVIndexer.class
                + " <<Job Type(SF2HB|SF2HF|SF2MF...)>> <<Input Source>> <<Output Sink>> <<XML File Configuration>> <<Skip Header(true|false)>> <<Run KeyGeneration Job>> <<Number Of reducer>> <<Speculative Execution>> <<scanner-cache-size>> <<filter>>";
        IdSearchLog.l.fatal(err);
        System.exit(1);
    }

    String msg = this.getClass().getName() + " > Initializing indexer job.";
    IdSearchLog.l.info(msg);

    int seq = 0;
    int len = args.length;

    String jobType = (len > seq) ? args[seq++] : "";
    String inputSource = (len > seq) ? args[seq++] : "";
    String outputSink = (len > seq) ? args[seq++] : "/tmp/hsearch-index";
    String xmlFilePath = (len > seq) ? args[seq++] : "";
    String skipHeader = (len > seq) ? args[seq++] : "false";
    boolean runKeyGenJob = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : false;
    int numberOfReducer = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 1;
    boolean speculativeExecution = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : true;
    int scannerCacheSize = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 300;
    String filter = (len > seq) ? args[seq++] : "";

    if (isEmpty(jobType)) {
        String err = this.getClass().getName()
                + " > Please enter Job type as one of these :\n SF2HB|SF2HF|SF2MF|MF2HB|MF2HF|MF2MF|HB2HB|HB2HF|HB2MF|IMF2HF";
        System.err.println(err);
        throw new IOException(err);
    }

    if (isEmpty(inputSource)) {
        String err = this.getClass().getName() + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();

    FieldMapping fm = createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputSink = outputSink.charAt(outputSink.length() - 1) == '/' ? outputSink : outputSink + "/";
    outputSink = outputSink + fm.tableName;

    createHBaseTable(fm);

    KVIndexer.FAM_NAME = fm.familyName.getBytes();
    KVIndexer.FIELD_SEPARATOR = fm.fieldSeparator;

    conf.set(XML_FILE_PATH, xmlFilePath);
    conf.set(OUTPUT_FOLDER, outputSink);
    conf.set(SKIP_HEADER, skipHeader);
    conf.setBoolean("mapreduce.map.speculative", speculativeExecution);

    Job job = Job.getInstance(conf, "com.bizosys.hsearch.kv.indexing.KVIndexer type : " + jobType + "\n"
            + inputSource + "\n" + outputSink);
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(numberOfReducer);

    Integer jobTypeI = JobTypeMapping.get(jobType);
    if (jobTypeI == null)
        throw new IOException("Invalid Jobtype " + jobType);

    /**
     * If an internal key index is given, generate the keys first and then run the indexing job;
     * otherwise just run the indexer, creating keys from HBase.
     */
    boolean keyGenjobStatus = false;
    if (-1 != fm.internalKey && runKeyGenJob) {

        Configuration keyGenConf = HBaseConfiguration.create();
        keyGenConf.set(INPUT_SOURCE, inputSource);
        keyGenConf.set(XML_FILE_PATH, xmlFilePath);
        keyGenConf.set(OUTPUT_FOLDER, outputSink);
        keyGenConf.set(SKIP_HEADER, skipHeader);

        Job keyGenJob = Job.getInstance(keyGenConf, "Creating Keys KVKeyGenerator for " + inputSource);

        switch (jobTypeI) {
        case SF2HB:
        case SF2HF:
        case SF2MF: {

            FileInputFormat.addInputPath(keyGenJob, new Path(inputSource));

            keyGenJob.setMapperClass(KVKeyGeneratorMapperFile.class);
            keyGenJob.setInputFormatClass(TextInputFormat.class);
            keyGenJob.setMapOutputKeyClass(Text.class);
            keyGenJob.setMapOutputValueClass(Text.class);

            keyGenJob.setReducerClass(KVKeyGeneratorReducerFile.class);
            keyGenJob.setNumReduceTasks(numberOfReducer);
            keyGenJob.setOutputKeyClass(NullWritable.class);
            keyGenJob.setOutputValueClass(Text.class);

            inputSource = outputSink + "_" + INPUTWITH_KEY;
            Path intermediatePath = new Path(inputSource);
            System.out.println("Final input path " + inputSource);
            FileOutputFormat.setOutputPath(keyGenJob, intermediatePath);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }

            break;
        }
        case HB2HB:
        case HB2HF:
        case HB2MF: {

            Scan scan = new Scan();
            scan.setCaching(scannerCacheSize);
            scan.setCacheBlocks(false);

            // Added Filter
            if (null != filter) {
                if (filter.trim().length() > 0) {
                    int index = filter.indexOf('=');
                    scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                            filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                            filter.substring(index + 1).getBytes()));
                }
            }

            byte[] family = fm.familyName.getBytes();
            for (String name : fm.nameWithField.keySet()) {

                Field fld = fm.nameWithField.get(name);
                if (!fld.isMergedKey)
                    continue;
                scan.addColumn(family, fld.sourceName.trim().getBytes());
            }

            TableMapReduceUtil.initTableMapperJob(inputSource, // input table
                    scan, // Scan instance to control CF and attribute selection
                    KVKeyGeneratorMapperHBase.class, // mapper class
                    Text.class, // mapper output key
                    ImmutableBytesWritable.class, // mapper output value
                    keyGenJob);

            TableMapReduceUtil.initTableReducerJob(inputSource, // output table
                    KVKeyGeneratorReducerHBase.class, // reducer class
                    keyGenJob);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        case MF2HB:
        case MF2HF:
        case MF2MF: {
            break;
        }
        default:
            break;
        }
    }
    /*
     * Run the job based on the job type, e.g. SF2HB, SF2MF, SF2HF.
     */
    System.out.println("Sending path " + inputSource);
    runJob(jobTypeI, job, fm, inputSource, outputSink, scannerCacheSize, filter);
}
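
A hypothetical invocation of this driver follows; all argument values are illustrative, and the field-mapping XML file and target HBase table must already exist.

public static void main(String[] args) throws Exception {
    new KVIndexer().execute(new String[] {
            "SF2HB",               // job type: simple file to HBase
            "/data/input.tsv",     // input source
            "/tmp/hsearch-index",  // output sink
            "/conf/mapping.xml",   // XML field-mapping configuration
            "true",                // skip the header row
            "false",               // run the key-generation job
            "4"                    // number of reducers
    });
}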

From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java

License:Apache License

private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {

        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case SF2HF: {

        //First create a map file, then convert it to an HFile.
        //Create an intermediate dir for the map file output.

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        //delete the empty _SUCCESS file
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case SF2MF: {

        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setSortComparatorClass(TextPair.FirstComparator.class);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case MF2HB: {

        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case MF2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("MF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        //delete the empty _SUCCESS file
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case MF2MF: {

        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case HB2HB: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());
        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table
                KVReducerHBase.class, // reducer class
                job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case HB2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        //delete the empty _SUCCESS file
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2MF: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case IMF2HF: {

        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}

From source file:com.bizosys.hsearch.kv.indexing.KVReplicatorMapFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    int seq = 0;
    String inputFile = (args.length > seq) ? args[seq] : "";
    seq++;

    String outputFile = (args.length > seq) ? args[seq++] : "/tmp/hsearch-index";

    String outputFileName = (args.length > seq) ? args[seq++] : "file1";

    String xmlFilePath = (args.length > seq) ? args[seq++] : "";

    String replaceFrom = (args.length > seq) ? args[seq++] : "";

    String replaceTo = (args.length > seq) ? args[seq++] : "";

    String startIndex = (args.length > seq) ? args[seq++] : "";

    String endIndex = (args.length > seq) ? args[seq++] : "";

    String numberOfReducerStr = (args.length > seq) ? args[seq] : "1";
    int numberOfReducer = Integer.parseInt(numberOfReducerStr);

    if (null == inputFile || inputFile.trim().isEmpty()) {
        String err = KVReplicatorHFile.class + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();

    FieldMapping fm = KVIndexer.createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputFile = outputFile.charAt(outputFile.length() - 1) == '/' ? outputFile : outputFile + "/";
    outputFile = outputFile + fm.tableName;

    conf.set(OUTPUT_FILE_PATH, outputFile);
    conf.set(OUTPUT_FILE_NAME, outputFileName);

    conf.set(REPLACE_FROM, replaceFrom);
    conf.set(REPLACE_TO, replaceTo);
    conf.set(START_INDEX, startIndex);
    conf.set(END_INDEX, endIndex);

    Job job = Job.getInstance(conf, "KVReplicatorMapFile - Replicating Map File");

    job.setJarByClass(KVReplicatorMapFile.class);
    job.setMapperClass(KVReplicatorMapper.class);
    job.setReducerClass(KVReplicatorReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setNumReduceTasks(numberOfReducer);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputFile.trim()));

    FileSystem fs = FileSystem.get(conf);
    Path dummyPath = new Path("/tmp", "dummy");
    if (fs.exists(dummyPath)) {
        fs.delete(dummyPath, true);
    }

    FileOutputFormat.setOutputPath(job, dummyPath);

    boolean result = job.waitForCompletion(true);
    return (result ? 0 : 1);
}

From source file:com.blogclustermr.EdgeLister.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "edge_lister");
    job.setJarByClass(EdgeLister.class);
    job.setMapperClass(EdgeMapper.class);
    //job.setCombinerClass(EdgeWeightReducer.class);
    job.setReducerClass(EdgeWeightReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License:Apache License

/**
 * Run the aggregation Job to aggregate the different TopK patterns and group each Pattern by the features
 * present in it and thus calculate the final Top K frequent Patterns for each feature
 */
public static void startAggregating(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {

    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), FPGROWTH);
    Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(AggregatorMapper.class);
    job.setCombinerClass(AggregatorReducer.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License:Apache License

/**
 * Count the frequencies of various features in parallel using Map/Reduce
 */
public static void startParallelCounting(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get(INPUT);
    Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License:Apache License

/**
 * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    Path input = new Path(params.get(INPUT));
    Job job = new Job(conf, "PFP Growth Driver running over input" + input);
    job.setJarByClass(PFPGrowth.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(TransactionTree.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelFPGrowthMapper.class);
    job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
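
Taken together, these three methods form the parallel FP-Growth pipeline. The following is a hedged sketch of the driver sequence, following the phase ordering of Mahout's PFPGrowth.runPFPGrowth; the parameter values and paths are illustrative, and the f-list handling between the phases is elided.

Parameters params = new Parameters();
params.set("input", "/data/transactions");   // illustrative input path
params.set("output", "/data/pfp-output");    // illustrative output path
params.set("minSupport", "3");               // minimum pattern support

Configuration conf = new Configuration();
PFPGrowth.startParallelCounting(params, conf);  // phase 1: count feature frequencies
// ... build the f-list from the counting output and save it for the later phases ...
PFPGrowth.startParallelFPGrowth(params, conf);  // phase 2: group-dependent FP-Growth
PFPGrowth.startAggregating(params, conf);       // phase 3: aggregate per-feature top-k patterns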

From source file:com.cg.mapreduce.myfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Count the frequencies of various features in parallel using Map/Reduce
 */
public static void startParallelCounting(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get(INPUT);
    Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    //    Job job = initJob(conf);  
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

}

From source file:com.cg.mapreduce.myfpgrowth.PFPGrowth.java

License:Apache License

/**
 * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(INPUT));
    Job job = new Job(conf, "PFP Growth Driver running over input" + input);
    job.setJarByClass(PFPGrowth.class);
    //    Job job = initJob(conf);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ArrayList.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelFPGrowthMapper.class);
    //job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:com.ci.backports.hadoop.hbase.ZHFileOutputFormat.java

License:Apache License

/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match ZHFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *     ZPutSortReducer)</li>
 * </ul> 
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(ZHFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(ZPutSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath = partitionsPath.makeQualified(fs); // makeQualified returns a new Path; keep the result
    URI cacheUri;
    try {
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    LOG.info("Incremental table output configured.");
}
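
A minimal sketch of a bulk-load driver built on this method; BulkLoadDriver, KeyValueEmittingMapper, the table name, and the paths are hypothetical, and standard Hadoop/HBase imports are assumed. Because the map output value class is KeyValue, configureIncrementalLoad selects KeyValueSortReducer via setReducerClass.

public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = Job.getInstance(conf, "hfile-bulk-load");
    job.setJarByClass(BulkLoadDriver.class);          // hypothetical driver class
    job.setMapperClass(KeyValueEmittingMapper.class); // hypothetical mapper emitting KeyValue
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    FileInputFormat.addInputPath(job, new Path("/input"));    // illustrative paths
    FileOutputFormat.setOutputPath(job, new Path("/hfiles"));
    HTable table = new HTable(conf, "my_table");      // hypothetical table; must already exist
    ZHFileOutputFormat.configureIncrementalLoad(job, table);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}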