Example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass.

Prototype

public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce(Object, Iterable, org.apache.hadoop.mapreduce.Reducer.Context).
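
The most common reason to call setGroupingComparatorClass is secondary sort: map output keys are composite (a natural key plus a field used only for ordering), the sort comparator orders on the full key, and the grouping comparator compares only the natural key so that all values sharing it reach a single reduce() call. The sketch below is a minimal, hypothetical illustration of that pattern; CompositeKey and NaturalKeyGroupingComparator are invented names for this page and do not come from the sources listed under Usage. (The classes are shown together here; each would normally live in its own file, and the final fragment belongs inside a driver method.)

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical composite key: a natural key plus a secondary field used only for ordering.
public class CompositeKey implements WritableComparable<CompositeKey> {
    public Text naturalKey = new Text();
    public Text secondary = new Text();

    public void write(DataOutput out) throws IOException {
        naturalKey.write(out);
        secondary.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        naturalKey.readFields(in);
        secondary.readFields(in);
    }

    // full ordering: natural key first, then the secondary field
    public int compareTo(CompositeKey o) {
        int c = naturalKey.compareTo(o.naturalKey);
        return c != 0 ? c : secondary.compareTo(o.secondary);
    }
}

// Grouping comparator: compares only the natural key, so all values that share it
// are delivered to the same reduce() call even though their full keys differ.
public class NaturalKeyGroupingComparator extends WritableComparator {
    public NaturalKeyGroupingComparator() {
        super(CompositeKey.class, true); // true => instantiate keys for object comparison
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((CompositeKey) a).naturalKey.compareTo(((CompositeKey) b).naturalKey);
    }
}

// Registering the comparator on a job (fragment; mapper/reducer setup omitted):
Job job = Job.getInstance(new Configuration(), "grouping comparator sketch");
job.setMapOutputKeyClass(CompositeKey.class);
job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);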

Usage

From source file: simsql.runtime.MRLoader.java

License: Apache License

public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // figure out the HDFS path to load from
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }

    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }

    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);

    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);
    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }
    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);

    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }

    // return the number of bytes written, as reported by the output serializer's counter.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}

From source file: simsql.runtime.RelOp.java

License: Apache License

public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
        "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts", "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts", "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // always collect statistics for final relations
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }

    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }

    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }
    return (exitCode == 0);
}

From source file: SpecializedByState.Driver.java

public static void main(String args[]) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Specialities by State");
    job.setJarByClass(Driver.class);
    job.setMapperClass(DriverMapper.class);
    //        job.setCombinerClass(DriverReducer.class);
    job.setMapOutputKeyClass(SpecialityDetailsWritable.class);
    job.setGroupingComparatorClass(GroupingStateComparator.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(DriverReducer.class);
    job.setOutputKeyClass(SpecialityDetailsWritable.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}