Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setMapOutputKeyClass, collected from open-source projects.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
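
The call must be made while the job is still being defined; once the job has been submitted, it fails with the IllegalStateException declared in the prototype. If the map output key class is never set, Hadoop falls back to the job's final output key class. The self-contained sketch below (hypothetical class names, standard word-count shape) shows the common case where the call is actually required: the map output value type differs from the final output value type, so the shuffle types must be declared explicitly.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOutputKeyDemo {

    // Emits (token, 1); the map output types are Text/IntWritable.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    // Sums the counts; the final output types are Text/LongWritable.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "map output key demo");
        job.setJarByClass(MapOutputKeyDemo.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // The map output value type (IntWritable) differs from the final
        // output value type (LongWritable), so the map output classes must
        // be set; otherwise the shuffle would assume the final output
        // classes and fail at runtime with a type mismatch.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}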

Usage

From source file: de.hpi.fgis.hdrs.mapreduce.examples.PredicateAnalysis.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(PredicateAnalysis.class);
    job.setJobName("Predicate Analysis");

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    //job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(Text.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    //job.setReducerClass(Reduce.class);

    job.setNumReduceTasks(0);

    job.setInputFormatClass(TripleInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    TripleInputFormat.setStoreAddress(job, args[0]);
    TripleInputFormat.setIndex(job, "POS");
    TripleInputFormat.setPattern(job, Triple.newPattern(null, args[1], null));
    TripleInputFormat.setAggregationLevel2(job);

    SequenceFileOutputFormat.setOutputPath(job, new Path(args[2]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
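
Because this job sets setNumReduceTasks(0), there is no shuffle: mapper output goes straight to the OutputFormat, and it is the final output classes that govern serialization. Declaring the map output classes as well, as above, simply keeps the two pairs consistent. A minimal illustrative fragment of the map-only pattern (same imports as the example above):

// Map-only sketch (illustrative): with zero reducers the mapper output is
// written directly by the OutputFormat, so no shuffle types are involved
// and the final output classes are the ones that matter.
Job job = Job.getInstance(new Configuration(), "map-only sketch");
job.setNumReduceTasks(0);                          // disables shuffle and reduce
job.setMapperClass(Mapper.class);                  // identity mapper
job.setOutputKeyClass(BytesWritable.class);
job.setOutputValueClass(LongWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);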

From source file: de.hpi.fgis.hdrs.mapreduce.examples.TripleCount.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(TripleCount.class);
    job.setJobName("TripleCount");

    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setNumReduceTasks(1);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    int argc = 0;

    TripleInputFormat.setStoreAddress(job, args[argc++]);
    TripleInputFormat.setIndex(job, args[argc++]);
    if ("-p".equals(args[argc])) {
        argc++;
        String s = args[argc++];
        String p = args[argc++];
        String o = args[argc++];
        if ("*".equals(s))
            s = null;
        if ("*".equals(p))
            p = null;
        if ("*".equals(o))
            o = null;
        TripleInputFormat.setPattern(job, Triple.newPattern(s, p, o));
    } else {
        TextOutputFormat.setOutputPath(job, new Path(args[argc]));
    }

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
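
The Reduce class is not shown in this listing, but its generic signature is constrained by the calls above: it must consume the declared map output classes and produce the declared job output classes. A hypothetical shape, assuming the usual counting logic:

// Hypothetical shape of the Reduce class wired in above: input types must
// match the map output classes (ByteWritable/LongWritable) and output
// types the job output classes (Text/Text).
public static class Reduce extends Reducer<ByteWritable, LongWritable, Text, Text> {
    @Override
    protected void reduce(ByteWritable key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0;
        for (LongWritable count : values) {
            total += count.get();
        }
        context.write(new Text(Byte.toString(key.get())), new Text(Long.toString(total)));
    }
}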

From source file: de.hpi.fgis.hdrs.mapreduce.examples.TripleSize.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(TripleSize.class);
    job.setJobName("TripleSize");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    int argc = 0;

    TripleInputFormat.setStoreAddress(job, args[argc++]);
    TripleInputFormat.setIndex(job, args[argc++]);
    if ("-p".equals(args[argc])) {
        argc++;
        String s = args[argc++];
        String p = args[argc++];
        String o = args[argc++];
        if ("*".equals(s))
            s = null;
        if ("*".equals(p))
            p = null;
        if ("*".equals(o))
            o = null;
        TripleInputFormat.setPattern(job, Triple.newPattern(s, p, o));
    } else {
        TextOutputFormat.setOutputPath(job, new Path(args[argc]));
    }

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
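
Unlike TripleCount above, this job also registers a combiner. A combiner runs on map output and its results are fed back into the shuffle, so both its input pair and its output pair must use the map output classes (IntWritable/IntWritable here). A hypothetical Combine class consistent with that constraint:

// Hypothetical Combine class matching the settings above: a combiner must
// consume and emit the map output classes, since its output re-enters the
// shuffle in place of the raw map output.
public static class Combine extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}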

From source file: de.hpi.fgis.hdrs.mapreduce.IndexLoader.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (3 != args.length) {
        System.out.println(
                "Usage: IndexLoader <StoreAddress> <SourceIndex> <TargetIndex1>[,<TargetIndex2>...]");
        return 0;
    }

    Job job = new Job(getConf());
    job.setJarByClass(IndexLoader.class);
    job.setJobName("HDRS Index Loader");

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TripleOutputFormat.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(TripleOutputFormat.class);

    job.setMapperClass(Map.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TripleOutputFormat.class);

    TripleInputFormat.setStoreAddress(job, args[0]);
    TripleInputFormat.setIndex(job, args[1]);

    TripleOutputFormat.setStoreAddress(job, args[0]);
    TripleOutputFormat.setOutputIndexes(job, args[2]);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}

From source file: de.l3s.common.features.hadoop.TimeSeriesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Options opts = new Options();

    Option jnameOpt = OptionBuilder.withArgName("job-name").hasArg(true).withDescription("Timeseries analysis")
            .create(JOB_NAME);

    Option inputOpt = OptionBuilder.withArgName("input-path").hasArg(true)
            .withDescription("Timeseries file path (required)").create(INPUT_OPT);

    Option outputOpt = OptionBuilder.withArgName("output-path").hasArg(true)
            .withDescription("output file path (required)").create(OUTPUT_OPT);

    Option reduceOpt = OptionBuilder.withArgName("reduce-no").hasArg(true)
            .withDescription("number of reducer nodes").create(REDUCE_NO);

    Option rmOpt = OptionBuilder.withArgName("remove-out").hasArg(false)
            .withDescription("remove the output then create again before writing files onto it")
            .create(REMOVE_OUTPUT);

    Option cOpt = OptionBuilder.withArgName("compress-option").hasArg(true)
            .withDescription("compression option").create(COMPRESS_OPT);

    opts.addOption(jnameOpt);
    opts.addOption(inputOpt);
    opts.addOption(reduceOpt);
    opts.addOption(outputOpt);
    opts.addOption(rmOpt);
    opts.addOption(cOpt);
    CommandLine cl;
    CommandLineParser parser = new GnuParser();
    try {
        cl = parser.parse(opts, args);
    } catch (ParseException e) {
        System.err.println("Error parsing command line: " + e.getMessage());
        return -1;
    }

    if (!cl.hasOption(INPUT_OPT) || !cl.hasOption(OUTPUT_OPT)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(getClass().getName(), opts);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    int reduceNo = DEFAULT_REDUCER_NO;
    if (cl.hasOption(REDUCE_NO)) {
        try {
            reduceNo = Integer.parseInt(cl.getOptionValue(REDUCE_NO));
        } catch (NumberFormatException e) {
            System.err.println("Error parsing reducer number: " + e.getMessage());
        }
    }

    String jobName = "Distributed timeseries [R] correlation";
    if (cl.hasOption(JOB_NAME)) {
        jobName = cl.getOptionValue(JOB_NAME);
        jobName = jobName.replace('-', ' ');
    }

    if (cl.hasOption(REMOVE_OUTPUT)) {
        // Note: the remove-output flag is parsed but not acted upon in this example.
    }

    String input = cl.getOptionValue(INPUT_OPT);
    String output = cl.getOptionValue(OUTPUT_OPT);

    Configuration conf = getConf();
    //DistributedCache.createSymlink(conf); 
    //DistributedCache.addCacheFile(new URI("hdfs://master.hadoop:8020/user/nguyen/lib/"), conf);
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(TimeSeriesJob.class);
    job.setMapperClass(TimeSeriesMapper.class);
    job.setReducerClass(TimeSeriesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Timeseries.class);

    job.setNumReduceTasks(reduceNo);
    job.setInputFormatClass(WholeFileInputFormat.class);
    WholeFileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}
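
Here the map output value class is the project-specific Timeseries type. Any class passed to setMapOutputValueClass must implement org.apache.hadoop.io.Writable so it can be serialized across the shuffle (key classes must additionally implement WritableComparable). A minimal sketch of such a class, illustrative only and not the actual de.l3s implementation:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Minimal custom Writable sketch: serializes its fields in write() and
// restores them, in the same order, in readFields().
public class Timeseries implements Writable {
    private long timestamp;
    private double value;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(timestamp);
        out.writeDouble(value);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        timestamp = in.readLong();
        value = in.readDouble();
    }
}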

From source file: de.tuberlin.dima.aim3.HadoopJob.java

License: Open Source License

protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat) throws IOException {

    Job job = new Job(new Configuration(getConf()));
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    } else {// w w  w . j a  v a2s  .c  om
        job.setJarByClass(mapper);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setNumReduceTasks(0);

    job.setJobName(getCustomJobName(job, mapper));

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
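
This helper (similar in shape to Mahout's AbstractJob.prepareJob) configures map-only jobs in which the map output classes double as the final output classes. A hypothetical call site, where MyMapper and the paths are placeholders:

// Hypothetical usage of the helper above; MyMapper stands in for a real
// Mapper subclass emitting Text/IntWritable pairs.
Job job = prepareJob(new Path("/input"), new Path("/output"),
        SequenceFileInputFormat.class,
        MyMapper.class, Text.class, IntWritable.class,
        SequenceFileOutputFormat.class);
job.waitForCompletion(true);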

From source file: de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

public static boolean runJob1(String inDir, String outDir) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx1200M");
    conf.set("mapred.job.map.memory.mb", "1280");
    conf.set("mapreduce.job.queuename", "smalljob");
    Job job = Job.getInstance(conf);
    job.setJarByClass(ResourceInlinkCount.class);
    FileInputFormat.addInputPath(job, new Path(inDir));
    FileOutputFormat.setOutputPath(job, new Path(outDir));
    job.setMapperClass(Map.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    return job.waitForCompletion(true);
}

From source file: de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

public static boolean runJob2(String inDir, String outDir) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapreduce.job.queuename", "smalljob");
    Job job = Job.getInstance(conf);
    job.setJarByClass(ResourceInlinkCount.class);
    FileInputFormat.addInputPath(job, new Path(inDir));
    FileOutputFormat.setOutputPath(job, new Path(outDir));
    job.setMapperClass(Map2.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    return job.waitForCompletion(true);
}
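
In both jobs of this file the map output classes are identical to the final output classes. In that case the explicit setMapOutputKeyClass/setMapOutputValueClass calls are redundant: when the map output classes are left unset, Hadoop falls back to the job's output classes for the shuffle. The configuration for runJob2 could therefore be trimmed to:

// Equivalent, slimmer configuration: the map output classes default to
// the final output classes when not set explicitly.
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);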

From source file: de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License: Apache License

/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);

        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");

        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);

        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");

        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);
    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
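
This job pairs setMapOutputKeyClass with a custom partitioner and grouping comparator, the standard secondary-sort arrangement: the partitioner routes every key that shares a prefix to the same reducer, and the grouping comparator makes those keys arrive in a single reduce() call. A generic sketch of that pair (illustrative only; Mahout's actual GramKeyPartitioner and GramKeyGroupComparator operate on GramKey bytes, not on Text):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes every key sharing the same prefix (before '|') to one partition.
public class PrefixPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String prefix = key.toString().split("\\|", 2)[0];
        return (prefix.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Groups keys by the same prefix so they share one reduce() invocation.
class PrefixGroupingComparator extends WritableComparator {
    protected PrefixGroupingComparator() {
        super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String pa = a.toString().split("\\|", 2)[0];
        String pb = b.toString().split("\\|", 2)[0];
        return pa.compareTo(pb);
    }
}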

From source file: de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License: Apache License

/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);
    // Defines an additional named text output 'contingency' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);

    // Defines additional named text outputs, one per association measure
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
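
The named outputs registered above are written from inside the reducer through a MultipleOutputs instance created in setup() and closed in cleanup(). A generic sketch of that wiring (hedged: the real AssocReducer computes the llr/pmi/chi/dice scores; the key/value types below match the 'llr' declaration, and the score is a placeholder):

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

// Generic sketch of a reducer feeding a named output declared with
// MultipleOutputs.addNamedOutput; not the actual AssocReducer.
public class NamedOutputReducer extends Reducer<Text, Text, Text, DoubleWritable> {
    private MultipleOutputs<Text, DoubleWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        double score = 0.0; // placeholder for a real association score
        // Route the record to the named output registered as "llr".
        mos.write("llr", key, new DoubleWritable(score));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // flushes all named outputs
    }
}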