List of usage examples for org.apache.hadoop.mapreduce.Job#setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
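Defines the comparator that controls how the keys are sorted before they are passed to the Reducer. The examples below register project-specific comparators; as a minimal sketch of what such a class looks like (the LongWritable key type and the descending order here are illustrative assumptions, not taken from any example on this page), a sort comparator is usually built by extending WritableComparator:

// A minimal sketch: a sort comparator that orders LongWritable keys
// in descending order. Passing true to the super constructor tells
// WritableComparator to deserialize keys so compare() sees objects.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingLongComparator extends WritableComparator {

    public DescendingLongComparator() {
        super(LongWritable.class, true); // true => create key instances for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        // negate the natural order to sort largest-first
        return -((LongWritable) a).compareTo((LongWritable) b);
    }
}

// registered the same way as in the examples below:
//   job.setSortComparatorClass(DescendingLongComparator.class);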
From source file:ph.fingra.hadoop.mapred.parts.performance.UserSessionStatistic.java
License:Apache License
public Job createJob(Configuration conf, Path[] inputpaths, Path outputpath, int numreduce,
        FingraphConfig finconfig) throws IOException {

    conf.setBoolean("verbose", finconfig.getDebug().isDebug_show_verbose());
    conf.setBoolean("counter", finconfig.getDebug().isDebug_show_counter());

    Job job = new Job(conf);
    String jobName = "perform/usersession job";
    job.setJobName(jobName);

    job.setJarByClass(UserSessionStatistic.class);

    for (int i = 0; i < inputpaths.length; i++) {
        FileInputFormat.addInputPath(job, inputpaths[i]);
    }
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(UserSessionMapper.class);
    job.setReducerClass(UserSessionReducer.class);

    job.setMapOutputKeyClass(UserSessionKey.class);
    job.setMapOutputValueClass(UserSessionEntity.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setPartitionerClass(UserSessionPartitioner.class);
    job.setSortComparatorClass(UserSessionSortComparator.class);
    job.setGroupingComparatorClass(UserSessionGroupComparator.class);

    job.setNumReduceTasks(numreduce);

    return job;
}
From source file:ph.fingra.hadoop.mapred.parts.performance.UserSessionStatistic.java
License:Apache License
public Job createHourJob(Configuration conf, Path[] inputpaths, Path outputpath, int numreduce,
        FingraphConfig finconfig, TargetDate targetdate) throws IOException {

    conf.setBoolean("verbose", finconfig.getDebug().isDebug_show_verbose());
    conf.setBoolean("counter", finconfig.getDebug().isDebug_show_counter());
    conf.set("hour", targetdate.getHour());

    Job job = new Job(conf);
    String jobName = "perform/usersession hour job";
    job.setJobName(jobName);

    job.setJarByClass(UserSessionStatistic.class);

    for (int i = 0; i < inputpaths.length; i++) {
        FileInputFormat.addInputPath(job, inputpaths[i]);
    }
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(UserSessionHourMapper.class);
    job.setReducerClass(UserSessionHourReducer.class);

    job.setMapOutputKeyClass(UserSessionHourKey.class);
    job.setMapOutputValueClass(UserSessionHourEntity.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setPartitionerClass(UserSessionHourPartitioner.class);
    job.setSortComparatorClass(UserSessionHourSortComparator.class);
    job.setGroupingComparatorClass(UserSessionHourGroupComparator.class);

    job.setNumReduceTasks(numreduce);

    return job;
}
From source file:ph.fingra.hadoop.mapred.parts.prerole.PreTransform.java
License:Apache License
public Job createJob(Configuration conf, Path[] inputpaths, Path outputpath, int numreduce,
        FingraphConfig finconfig) throws IOException {

    conf.setBoolean("verbose", finconfig.getDebug().isDebug_show_verbose());
    conf.setBoolean("counter", finconfig.getDebug().isDebug_show_counter());

    Job job = new Job(conf);
    String jobName = "prerole/pretransform job";
    job.setJobName(jobName);

    job.setJarByClass(PreTransform.class);

    for (int i = 0; i < inputpaths.length; i++) {
        FileInputFormat.addInputPath(job, inputpaths[i]);
    }
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(PreTransformMapper.class);
    job.setReducerClass(PreTransformReducer.class);

    job.setMapOutputKeyClass(TransformKey.class);
    job.setMapOutputValueClass(TransformContainer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setPartitionerClass(PreTransformPartitioner.class);
    job.setSortComparatorClass(PreTransformSortComparator.class);
    job.setGroupingComparatorClass(PreTransformGroupComparator.class);

    job.setNumReduceTasks(numreduce);

    return job;
}
From source file:SecondarySort.HashToAlternateWithSS.java
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(HashToAlternateWithSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
From source file:SecondarySort.HashToMinWithSS.java
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(HashToMinWithSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
From source file:SecondarySort.inputMaker.java
License:Open Source License
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(inputMaker.class);
    job.setMapperClass(MapMSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
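The three SecondarySort examples above all lean on a LongPair composite key whose nested HPartitioner, Comparator, and GroupComparator classes are not shown on this page. A hypothetical reconstruction of that pattern (field names and ordering are assumptions) would partition and group on the first long only while sorting on both, so each reduce() call sees its values ordered by the second long:

// Hypothetical reconstruction of the LongPair secondary-sort pattern
// assumed by the examples above: partition/group on 'first' only,
// sort on (first, second).
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

public class LongPair implements WritableComparable<LongPair> {
    private long first;
    private long second;

    public void write(DataOutput out) throws IOException {
        out.writeLong(first);
        out.writeLong(second);
    }

    public void readFields(DataInput in) throws IOException {
        first = in.readLong();
        second = in.readLong();
    }

    public int compareTo(LongPair o) {
        int c = Long.compare(first, o.first);
        return c != 0 ? c : Long.compare(second, o.second);
    }

    // route all keys with the same 'first' to the same reducer
    public static class HPartitioner extends Partitioner<LongPair, Text> {
        public int getPartition(LongPair key, Text value, int numPartitions) {
            return (int) ((key.first & Long.MAX_VALUE) % numPartitions);
        }
    }

    // full ordering for the shuffle sort: first, then second
    public static class Comparator extends WritableComparator {
        public Comparator() { super(LongPair.class, true); }
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return ((LongPair) a).compareTo((LongPair) b);
        }
    }

    // grouping for the reducer: keys are equal when 'first' matches
    public static class GroupComparator extends WritableComparator {
        public GroupComparator() { super(LongPair.class, true); }
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return Long.compare(((LongPair) a).first, ((LongPair) b).first);
        }
    }
}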
From source file:simsql.runtime.MRLoader.java
License:Apache License
public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());
    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }
    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }
    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);
    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);

    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }

    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);
    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}
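The parameter type of setSortComparatorClass is RawComparator for a reason: during the shuffle's sort and merge phases, Hadoop can compare keys directly in their serialized form, without deserializing them, which is what a comparator like the RecordKeySortComparator above presumably does. As a generic sketch of that byte-level technique (using LongWritable's well-known 8-byte layout; for a custom key the offsets would differ), the raw compare overload is overridden instead:

// Sketch of a byte-level comparator: compares serialized LongWritable
// keys without deserializing them, avoiding object allocation during
// the sort/merge.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparator;

public class RawLongComparator extends WritableComparator {

    public RawLongComparator() {
        super(LongWritable.class); // no key instances needed; we compare bytes
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // readLong is a WritableComparator helper for big-endian longs
        long x = readLong(b1, s1);
        long y = readLong(b2, s2);
        return Long.compare(x, y);
    }
}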
From source file:simsql.runtime.RelOp.java
License:Apache License
public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples,"
        + "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts",
            "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts",
            "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }
    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    return (exitCode == 0);
}
From source file:top10flight.Top25MoviesChaining.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {

    // job 1
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Job: 1, top 25 movie based on ratings");
    job.setJarByClass(Top25MoviesChaining.class);
    job.setMapperClass(FirstMapper.class);
    job.setReducerClass(FirstReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[1]));
    Path firstJobOutput = new Path(args[2]);
    FileOutputFormat.setOutputPath(job, firstJobOutput);
    job.waitForCompletion(true);

    // job 2
    Job job2 = Job.getInstance(conf, "Job: 2, top 25 movie based on ratings");
    job2.setJarByClass(Top25MoviesChaining.class);
    job2.setMapperClass(SecondMapper.class);
    job2.setReducerClass(SecondReducer.class);
    job2.setSortComparatorClass(SortKeyComparator.class);
    job2.setMapOutputKeyClass(DoubleWritable.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(job2, firstJobOutput);

    String timeStamp = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss", Locale.US)
            .format(new Timestamp(System.currentTimeMillis()));
    FileOutputFormat.setOutputPath(job2, new Path(args[2] + timeStamp));

    System.exit(job2.waitForCompletion(true) ? 0 : 1);
}
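The SortKeyComparator registered on job2 here (and in the next example) is not shown on this page. A plausible version of it, assuming descending order so the highest-rated entries reach the reducer first (the "top 25" job name suggests this, but it is an assumption), would look like the sketch below; the next example's variant would compare FloatWritable keys the same way:

// Plausible sketch of the SortKeyComparator used above (not shown on
// this page): descending order for DoubleWritable map-output keys.
// The descending direction is an assumption based on the job name.
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SortKeyComparator extends WritableComparator {

    public SortKeyComparator() {
        super(DoubleWritable.class, true); // deserialize keys for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        // reverse the natural order: largest rating first
        return -((DoubleWritable) a).compareTo((DoubleWritable) b);
    }
}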
From source file:top10_categories.Top10_Categories.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf1 = new Configuration();
    Configuration conf = new Configuration();

    Path inputDir = new Path(args[0]);
    Path hdfsFile = new Path(args[1]);
    FileSystem hdfs = FileSystem.get(conf);
    FileSystem local = FileSystem.getLocal(conf);

    try {
        FileStatus[] inputFiles = local.listStatus(inputDir);
        FSDataOutputStream out = hdfs.create(hdfsFile);
        for (int i = 0; i < inputFiles.length; i++) {
            System.out.println(inputFiles[i].getPath().getName());
            FSDataInputStream in = local.open(inputFiles[i].getPath());
            byte[] buffer = new byte[256];
            int bytesRead = 0;
            while ((bytesRead = in.read(buffer)) > 0) {
                out.write(buffer, 0, bytesRead);
            }
            in.close();
        }
        out.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

    Job job1 = Job.getInstance(conf1, "Chaining");
    job1.setJarByClass(Top10_Categories.class);
    job1.setMapperClass(Map1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(FloatWritable.class);
    job1.setReducerClass(Reduce1.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(DoubleWritable.class);
    job1.setCombinerClass(Reduce1.class);
    FileInputFormat.addInputPath(job1, hdfsFile);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    boolean complete = job1.waitForCompletion(true);

    Configuration conf2 = new Configuration();
    Job job2 = Job.getInstance(conf2, "Chaining");
    if (complete) {
        job2.setJarByClass(Top10_Categories.class);
        job2.setMapperClass(Map2.class);
        job2.setMapOutputKeyClass(FloatWritable.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setReducerClass(Reduce2.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(FloatWritable.class);
        job2.setSortComparatorClass(SortKeyComparator.class);
        job2.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job2, new Path(args[2]));
        FileOutputFormat.setOutputPath(job2, new Path(args[3]));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}