Example usage for org.apache.hadoop.mapreduce Job setJobName

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setJobName.

Prototype

public void setJobName(String name) throws IllegalStateException

Source Link

Document

Set the user-specified job name.

Usage

From source file:de.l3s.content.timex.extracting.WikiTimex.java

License:Apache License

@SuppressWarnings("static-access")
@Override/*from   ww w . j a  va 2 s  .c o  m*/
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = "en"; // Assume 'en' by default.
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikiTimex.class);
    job.setJobName(String.format("CountWikipediaPages[%s: %s, %s: %s]", INPUT_OPTION, inputPath,
            LANGUAGE_OPTION, language));

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapperClass(TMapper.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:de.tuberlin.dima.aim3.HadoopJob.java

License:Open Source License

protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat) throws IOException {

    Job job = new Job(new Configuration(getConf()));
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    } else {/*from  ww w  . j  av a 2s  .  com*/
        job.setJarByClass(mapper);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setNumReduceTasks(0);

    job.setJobName(getCustomJobName(job, mapper));

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}

From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License:Apache License

/**
 * pass1: generate collocations, ngrams//from   w ww.j a va2 s  .  co m
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);

        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");

        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);

        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");

        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);
    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}

From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License:Apache License

/**
 * pass2: perform the LLR calculation/* ww w .j  a va  2 s  . c  om*/
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);
    // Defines additional single text based output 'text' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);

    // Defines additional multi sequencefile based output 'sequence' for the
    // job
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.ConfigurationHelper.java

License:Apache License

/**
 * Job configurator/*from w w  w .  j a  va  2  s  .  co m*/
 *
 * @param job                      job instance
 * @param jarByClass               class of the jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath               output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, String commaSeparatedInputFiles, String outputPath)
        throws IOException {
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.SimpleTextSearch.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(SimpleTextSearch.class);

    job.setJobName(SimpleTextSearch.class.getName());

    // mapper/* www  .j  av a 2 s. co m*/
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.WordCounterExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(WordCounterExample.class);

    job.setJobName(WordCounterExample.class.getName());

    // mapper/*  w  w  w  .  j av  a 2s  . co  m*/
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase1FullJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    // set from the command line

    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());

    // mapper/*from  www  .j a v a 2s.c  o m*/
    job.setMapperClass(MapperClass.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase2ExactMatchDeDuplication.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    //set from the command line

    job.setJarByClass(Phase2ExactMatchDeDuplication.class);
    job.setJobName(Phase2ExactMatchDeDuplication.class.getName());

    // mapper//from   www  .j  a  v a 2  s .c  om
    job.setMapperClass(ExactMatchDetectionMapper.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(UniqueWarcWriterReducer.class);
    // no combiner, as the output classes in mapper and reducer are different!

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step1ExtractNearDupInfo.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step1ExtractNearDupInfo.class);
    job.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());

    // mapper//  ww w .  jav  a2  s  .c o m
    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DocumentInfo.class);

    // reducer
    job.setReducerClass(DeDuplicationTextOutputReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(List.class);

    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, DocumentInfoOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;

}