Example usage for the org.apache.hadoop.mapred.JobConf constructor JobConf(Configuration, Class)

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

This page collects usage examples for the JobConf(Configuration, Class) constructor of the class org.apache.hadoop.mapred.JobConf.

Prototype

public JobConf(Configuration conf, Class exampleClass) 

Source Link

Document

Construct a map/reduce job configuration.

Usage

From source file:com.github.gaoyangthu.demo.mapred.Grep.java

License:Apache License

/**
 * Runs a two-stage grep: a "grep-search" job whose output is written to a
 * temporary directory, followed by a "grep-sort" job that reads that directory
 * and writes the final result sorted by decreasing frequency.
 *
 * <p>The temporary directory is deleted in a {@code finally} block, so it is
 * removed whether or not the jobs complete normally.
 *
 * @param args {@code <inDir> <outDir> <regex> [<group>]}
 * @return 0 after both jobs have run, -1 if fewer than three arguments are given
 * @throws Exception propagated from job setup, job execution or cleanup
 */
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {

        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4) {
            grepJob.set("mapred.mapper.regex.group", args[3]);
        }

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        // Intermediate results go to the temporary directory as a sequence file.
        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        // Build the sort job from getConf() as well, so the configuration handed
        // to this tool applies to both stages and not only to the search job.
        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        // sort by decreasing freq
        sortJob.setOutputKeyComparatorClass(LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}

From source file:com.hadoopilluminated.examples.Grep.java

License:Apache License

/**
 * Searches an input directory for a regular expression and writes the result,
 * sorted by decreasing frequency, to an output directory.
 *
 * <p>Two jobs run back to back. "grep-search" writes a sequence file into a
 * randomly suffixed temporary directory; "grep-sort" reads that directory and
 * writes the final output with a single reduce task. The temporary directory
 * is deleted afterwards, including when a job throws.
 *
 * @param args {@code <inDir> <outDir> <regex> [<group>]}
 * @return 0 after both jobs have run, -1 when fewer than three arguments are given
 * @throws Exception propagated from job setup, job execution or cleanup
 */
@Override
public int run(String[] args) throws Exception {
    final boolean tooFewArguments = args.length < 3;
    if (tooFewArguments) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    // Randomly suffixed scratch directory that links the two stages.
    final int randomSuffix = new Random().nextInt(Integer.MAX_VALUE);
    final Path scratchDir = new Path("grep-temp-" + randomSuffix);

    final JobConf searchConf = new JobConf(getConf(), Grep.class);

    try {
        // ---- stage 1: search ----
        searchConf.setJobName("grep-search");
        FileInputFormat.setInputPaths(searchConf, args[0]);

        searchConf.setMapperClass(RegexMapper.class);
        searchConf.set("mapred.mapper.regex", args[2]);
        final boolean groupGiven = args.length == 4;
        if (groupGiven) {
            searchConf.set("mapred.mapper.regex.group", args[3]);
        }

        searchConf.setCombinerClass(LongSumReducer.class);
        searchConf.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(searchConf, scratchDir);
        searchConf.setOutputFormat(SequenceFileOutputFormat.class);
        searchConf.setOutputKeyClass(Text.class);
        searchConf.setOutputValueClass(LongWritable.class);

        JobClient.runJob(searchConf);

        // ---- stage 2: sort ----
        final JobConf sortConf = new JobConf(getConf(), Grep.class);
        sortConf.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortConf, scratchDir);
        sortConf.setInputFormat(SequenceFileInputFormat.class);
        sortConf.setMapperClass(InverseMapper.class);

        // One reducer so that the result ends up in a single file.
        sortConf.setNumReduceTasks(1);
        final Path resultDir = new Path(args[1]);
        FileOutputFormat.setOutputPath(sortConf, resultDir);
        // Largest frequencies first.
        sortConf.setOutputKeyComparatorClass(LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortConf);
    } finally {
        FileSystem.get(searchConf).delete(scratchDir, true);
    }
    return 0;
}

From source file:com.hadoopilluminated.examples.Join.java

License:Apache License

/**
 * Driver for the "join" job: reads defaults from the cluster, parses the
 * command line, configures a {@code CompositeInputFormat}-based job and runs it.
 *
 * <p>Recognised options are {@code -m}, {@code -r}, {@code -inFormat},
 * {@code -outFormat}, {@code -outKey}, {@code -outValue} and {@code -joinOp}
 * (default {@code "inner"}). Every other argument is positional: the last one
 * is the output path and all preceding ones are input paths.
 *
 * @param args command-line arguments as described above
 * @return 0 after the job has run; otherwise the value returned by
 *         {@code printUsage()} when the arguments are invalid
 * @throws Exception propagated from the cluster query, class loading or job execution
 */
@Override
public int run(String[] args) throws Exception {
    // NOTE(review): Sort.class is passed as the example class even though this
    // driver lives in Join.java - confirm that this is intended.
    final JobConf job = new JobConf(getConf(), Sort.class);
    job.setJobName("join");

    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);

    // Defaults for the task counts are derived from the current cluster status.
    final JobClient jobClient = new JobClient(job);
    final ClusterStatus status = jobClient.getClusterStatus();
    int mapCount = status.getTaskTrackers() * job.getInt("test.sort.maps_per_host", 10);
    int reduceCount = (int) (status.getMaxReduceTasks() * 0.9);
    final String reducesPerHost = job.get("test.sort.reduces_per_host");
    if (reducesPerHost != null) {
        reduceCount = status.getTaskTrackers() * Integer.parseInt(reducesPerHost);
    }

    Class<? extends InputFormat> inFormat = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outFormat = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> keyClass = BytesWritable.class;
    Class<? extends Writable> valueClass = TupleWritable.class;
    String joinOp = "inner";
    final List<String> positional = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        final String flag = args[i];
        try {
            // Each option consumes the following argument as its value.
            if ("-m".equals(flag)) {
                mapCount = Integer.parseInt(args[++i]);
            } else if ("-r".equals(flag)) {
                reduceCount = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(flag)) {
                inFormat = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(flag)) {
                outFormat = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(flag)) {
                keyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(flag)) {
                valueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-joinOp".equals(flag)) {
                joinOp = args[++i];
            } else {
                positional.add(flag);
            }
        } catch (NumberFormatException badNumber) {
            // i already points at the option's value here.
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException missingValue) {
            // i was advanced past the end, so i - 1 is the option lacking a value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Apply the task counts (command-line values or the defaults computed above).
    job.setNumMapTasks(mapCount);
    job.setNumReduceTasks(reduceCount);

    if (positional.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }

    // The last positional argument is the output; it is removed so that only inputs remain.
    final String outputArg = positional.remove(positional.size() - 1);
    FileOutputFormat.setOutputPath(job, new Path(outputArg));

    final List<Path> inputs = new ArrayList<Path>(positional.size());
    for (String inputArg : positional) {
        inputs.add(new Path(inputArg));
    }
    final Path[] inputPaths = inputs.toArray(new Path[inputs.size()]);

    job.setInputFormat(CompositeInputFormat.class);
    job.set("mapred.join.expr", CompositeInputFormat.compose(joinOp, inFormat, inputPaths));
    job.setOutputFormat(outFormat);

    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);

    final Date began = new Date();
    System.out.println("Job started: " + began);
    JobClient.runJob(job);
    final Date finished = new Date();
    System.out.println("Job ended: " + finished);
    final long elapsedSeconds = (finished.getTime() - began.getTime()) / 1000;
    System.out.println("The job took " + elapsedSeconds + " seconds.");
    return 0;
}

From source file:com.hdfs.concat.crush.Crush.java

License:Apache License

boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*/*from  w ww .  jav  a2 s  .  c o m*/
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

        try {
            String line;

            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }

        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);

        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());

    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
         * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
         * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
         * smaller.
         */

        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length
                && args[2].length() != 14) {

            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapred.reduce.tasks", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());

                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold)
                        || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
        job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));

        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
        codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codec = null;
    } else if ("gzip".equals(codec)) {
        codec = GzipCodec.class.getName();
    } else {
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codec) {
        job.setBoolean("mapred.output.compress", false);
    } else {
        job.setBoolean("mapred.output.compress", true);
        job.set("mapred.output.compression.type", "BLOCK");
        job.set("mapred.output.compression.codec", codec);

        try {
            CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            throw new AssertionError();
        }
    }

    return true;
}

From source file:com.hp.hplc.mr.driver.WordCount.java

License:Apache License

/**
 * Driver for the "wordcount" job: configures the job, parses the command line
 * and runs it.
 *
 * <p>{@code -m <n>} sets the number of map tasks and {@code -r <n>} the number
 * of reduce tasks. Exactly two further arguments are required: the input path
 * followed by the output path.
 *
 * @param args command-line arguments as described above
 * @return 0 after the job has run; otherwise the value returned by
 *         {@code printUsage()} when the arguments are invalid
 * @throws Exception propagated from job setup or job execution
 */
public int run(String[] args) throws Exception {
    final JobConf job = new JobConf(getConf(), WordCount.class);
    job.setJobName("wordcount");

    // Output records are (word, count) pairs.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MapClass.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    final List<String> positional = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        final String flag = args[i];
        try {
            if ("-m".equals(flag)) {
                final int maps = Integer.parseInt(args[++i]);
                job.setNumMapTasks(maps);
            } else if ("-r".equals(flag)) {
                final int reduces = Integer.parseInt(args[++i]);
                job.setNumReduceTasks(reduces);
                System.out.println("# of reduces: " + job.getNumReduceTasks());
            } else {
                positional.add(flag);
            }
        } catch (NumberFormatException badNumber) {
            // i already points at the option's value here.
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException missingValue) {
            // i was advanced past the end, so i - 1 is the option lacking a value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Input and output path must be the only arguments left over.
    final int leftover = positional.size();
    if (leftover != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + leftover + " instead of 2.");
        return printUsage();
    }

    final String inputArg = positional.get(0);
    final Path outputPath = new Path(positional.get(1));
    FileInputFormat.setInputPaths(job, inputArg);
    FileOutputFormat.setOutputPath(job, outputPath);

    JobClient.runJob(job);

    return 0;
}