Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setMapOutputKeyClass, collected from open-source projects.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
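
The call must be made while the job is still being defined; once the job has been submitted, it fails with the IllegalStateException declared in the prototype. If the map output key class is never set, Hadoop falls back to the job's final output key class. The self-contained sketch below (hypothetical class names, standard word-count shape) shows the common case where the call is actually required: the map output value type differs from the final output value type, so the shuffle types must be declared explicitly.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOutputKeyDemo {

    // Emits (token, 1); the map output types are Text/IntWritable.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    // Sums the counts; the final output types are Text/LongWritable.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "map output key demo");
        job.setJarByClass(MapOutputKeyDemo.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // The map output value type (IntWritable) differs from the final
        // output value type (LongWritable), so the map output classes must
        // be set; otherwise the shuffle would assume the final output
        // classes and fail at runtime with a type mismatch.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}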

Usage

From source file: de.hpi.fgis.hdrs.mapreduce.examples.PredicateAnalysis.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(PredicateAnalysis.class);
    job.setJobName("Predicate Analysis");

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    //job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(Text.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    //job.setReducerClass(Reduce.class);

    job.setNumReduceTasks(0);

    job.setInputFormatClass(TripleInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    TripleInputFormat.setStoreAddress(job, args[0]);
    TripleInputFormat.setIndex(job, "POS");
    TripleInputFormat.setPattern(job, Triple.newPattern(null, args[1], null));
    TripleInputFormat.setAggregationLevel2(job);

    SequenceFileOutputFormat.setOutputPath(job, new Path(args[2]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
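
Because this job sets setNumReduceTasks(0), there is no shuffle: mapper output goes straight to the OutputFormat, and it is the final output classes that govern serialization. Declaring the map output classes as well, as above, simply keeps the two pairs consistent. A minimal illustrative fragment of the map-only pattern (same imports as the example above):

// Map-only sketch (illustrative): with zero reducers the mapper output is
// written directly by the OutputFormat, so no shuffle types are involved
// and the final output classes are the ones that matter.
Job job = Job.getInstance(new Configuration(), "map-only sketch");
job.setNumReduceTasks(0);                          // disables shuffle and reduce
job.setMapperClass(Mapper.class);                  // identity mapper
job.setOutputKeyClass(BytesWritable.class);
job.setOutputValueClass(LongWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);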

From source file: de.hpi.fgis.hdrs.mapreduce.examples.TripleCount.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(TripleCount.class);
    job.setJobName("TripleCount");

    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setNumReduceTasks(1);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    int argc = 0;

    TripleInputFormat.setStoreAddress(job, args[argc++]);
    TripleInputFormat.setIndex(job, args[argc++]);
    if ("-p".equals(args[argc])) {
        argc++;
        String s = args[argc++];
        String p = args[argc++];
        String o = args[argc++];
        if ("*".equals(s))
            s = null;
        if ("*".equals(p))
            p = null;
        if ("*".equals(o))
            o = null;
        TripleInputFormat.setPattern(job, Triple.newPattern(s, p, o));
    } else {
        TextOutputFormat.setOutputPath(job, new Path(args[argc]));
    }

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
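
The Reduce class is not shown in this listing, but its generic signature is constrained by the calls above: it must consume the declared map output classes and produce the declared job output classes. A hypothetical shape, assuming the usual counting logic:

// Hypothetical shape of the Reduce class wired in above: input types must
// match the map output classes (ByteWritable/LongWritable) and output
// types the job output classes (Text/Text).
public static class Reduce extends Reducer<ByteWritable, LongWritable, Text, Text> {
    @Override
    protected void reduce(ByteWritable key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0;
        for (LongWritable count : values) {
            total += count.get();
        }
        context.write(new Text(Byte.toString(key.get())), new Text(Long.toString(total)));
    }
}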

From source file: de.hpi.fgis.hdrs.mapreduce.examples.TripleSize.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(TripleSize.class);
    job.setJobName("TripleSize");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    int argc = 0;

    TripleInputFormat.setStoreAddress(job, args[argc++]);
    TripleInputFormat.setIndex(job, args[argc++]);
    if ("-p".equals(args[argc])) {
        argc++;
        String s = args[argc++];
        String p = args[argc++];
        String o = args[argc++];
        if ("*".equals(s))
            s = null;
        if ("*".equals(p))
            p = null;
        if ("*".equals(o))
            o = null;
        TripleInputFormat.setPattern(job, Triple.newPattern(s, p, o));
    } else {
        TextOutputFormat.setOutputPath(job, new Path(args[argc]));
    }

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
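
Unlike TripleCount above, this job also registers a combiner. A combiner runs on map output and its results are fed back into the shuffle, so both its input pair and its output pair must use the map output classes (IntWritable/IntWritable here). A hypothetical Combine class consistent with that constraint:

// Hypothetical Combine class matching the settings above: a combiner must
// consume and emit the map output classes, since its output re-enters the
// shuffle in place of the raw map output.
public static class Combine extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}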

From source file: de.hpi.fgis.hdrs.mapreduce.IndexLoader.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (3 != args.length) {
        System.out.println(
                "Usage: IndexLoader <StoreAddress> <SourceIndex> <TargetIndex1>[,<TargetIndex2>...]");
        return 0;
    }

    Job job = new Job(getConf());
    job.setJarByClass(IndexLoader.class);
    job.setJobName("HDRS Index Loader");

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TripleOutputFormat.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(TripleOutputFormat.class);

    job.setMapperClass(Map.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TripleInputFormat.class);
    job.setOutputFormatClass(TripleOutputFormat.class);

    TripleInputFormat.setStoreAddress(job, args[0]);
    TripleInputFormat.setIndex(job, args[1]);

    TripleOutputFormat.setStoreAddress(job, args[0]);
    TripleOutputFormat.setOutputIndexes(job, args[2]);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}

From source file: de.l3s.common.features.hadoop.TimeSeriesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Options opts = new Options();

    Option jnameOpt = OptionBuilder.withArgName("job-name").hasArg(true).withDescription("Timeseries analysis")
            .create(JOB_NAME);

    Option inputOpt = OptionBuilder.withArgName("input-path").hasArg(true)
            .withDescription("Timeseries file path (required)").create(INPUT_OPT);

    Option outputOpt = OptionBuilder.withArgName("output-path").hasArg(true)
            .withDescription("output file path (required)").create(OUTPUT_OPT);

    Option reduceOpt = OptionBuilder.withArgName("reduce-no").hasArg(true)
            .withDescription("number of reducer nodes").create(REDUCE_NO);

    Option rmOpt = OptionBuilder.withArgName("remove-out").hasArg(false)
            .withDescription("remove the output then create again before writing files onto it")
            .create(REMOVE_OUTPUT);

    Option cOpt = OptionBuilder.withArgName("compress-option").hasArg(true)
            .withDescription("compression option").create(COMPRESS_OPT);

    opts.addOption(jnameOpt);
    opts.addOption(inputOpt);
    opts.addOption(reduceOpt);
    opts.addOption(outputOpt);
    opts.addOption(rmOpt);
    opts.addOption(cOpt);
    CommandLine cl;
    CommandLineParser parser = new GnuParser();
    try {
        cl = parser.parse(opts, args);
    } catch (ParseException e) {
        System.err.println("Error parsing command line: " + e.getMessage());
        return -1;
    }

    if (!cl.hasOption(INPUT_OPT) || !cl.hasOption(OUTPUT_OPT)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(getClass().getName(), opts);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    int reduceNo = DEFAULT_REDUCER_NO;
    if (cl.hasOption(REDUCE_NO)) {
        try {
            reduceNo = Integer.parseInt(cl.getOptionValue(REDUCE_NO));
        } catch (NumberFormatException e) {
            System.err.println("Error parsing reducer number: " + e.getMessage());
        }
    }

    String jobName = "Distributed timeseries [R] correlation";
    if (cl.hasOption(JOB_NAME)) {
        jobName = cl.getOptionValue(JOB_NAME);
        jobName = jobName.replace('-', ' ');
    }

    if (cl.hasOption(REMOVE_OUTPUT)) {
        // Note: the remove-output flag is parsed but not acted upon in this example.
    }

    String input = cl.getOptionValue(INPUT_OPT);
    String output = cl.getOptionValue(OUTPUT_OPT);

    Configuration conf = getConf();
    //DistributedCache.createSymlink(conf); 
    //DistributedCache.addCacheFile(new URI("hdfs://master.hadoop:8020/user/nguyen/lib/"), conf);
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(TimeSeriesJob.class);
    job.setMapperClass(TimeSeriesMapper.class);
    job.setReducerClass(TimeSeriesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Timeseries.class);

    job.setNumReduceTasks(reduceNo);
    job.setInputFormatClass(WholeFileInputFormat.class);
    WholeFileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    return job.waitForCompletion(true) ? 0 : 1;
}
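
Here the map output value class is the project-specific Timeseries type. Any class passed to setMapOutputValueClass must implement org.apache.hadoop.io.Writable so it can be serialized across the shuffle (key classes must additionally implement WritableComparable). A minimal sketch of such a class, illustrative only and not the actual de.l3s implementation:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Minimal custom Writable sketch: serializes its fields in write() and
// restores them, in the same order, in readFields().
public class Timeseries implements Writable {
    private long timestamp;
    private double value;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(timestamp);
        out.writeDouble(value);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        timestamp = in.readLong();
        value = in.readDouble();
    }
}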

From source file: de.tuberlin.dima.aim3.HadoopJob.java

License: Open Source License

protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat) throws IOException {

    Job job = new Job(new Configuration(getConf()));
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    } else {// w w  w . j a  v a2s  .c  om
        job.setJarByClass(mapper);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setNumReduceTasks(0);

    job.setJobName(getCustomJobName(job, mapper));

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
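
This helper (similar in shape to Mahout's AbstractJob.prepareJob) configures map-only jobs in which the map output classes double as the final output classes. A hypothetical call site, where MyMapper and the paths are placeholders:

// Hypothetical usage of the helper above; MyMapper stands in for a real
// Mapper subclass emitting Text/IntWritable pairs.
Job job = prepareJob(new Path("/input"), new Path("/output"),
        SequenceFileInputFormat.class,
        MyMapper.class, Text.class, IntWritable.class,
        SequenceFileOutputFormat.class);
job.waitForCompletion(true);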

From source file: de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

public static boolean runJob1(String inDir, String outDir) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx1200M");
    conf.set("mapred.job.map.memory.mb", "1280");
    conf.set("mapreduce.job.queuename", "smalljob");
    Job job = Job.getInstance(conf);
    job.setJarByClass(ResourceInlinkCount.class);
    FileInputFormat.addInputPath(job, new Path(inDir));
    FileOutputFormat.setOutputPath(job, new Path(outDir));
    job.setMapperClass(Map.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    return job.waitForCompletion(true);
}

From source file: de.tudarmstadt.lt.wiki.statistics.ResourceInlinkCount.java

public static boolean runJob2(String inDir, String outDir) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapreduce.job.queuename", "smalljob");
    Job job = Job.getInstance(conf);
    job.setJarByClass(ResourceInlinkCount.class);
    FileInputFormat.addInputPath(job, new Path(inDir));
    FileOutputFormat.setOutputPath(job, new Path(outDir));
    job.setMapperClass(Map2.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    return job.waitForCompletion(true);
}
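
In both jobs of this file the map output classes are identical to the final output classes. In that case the explicit setMapOutputKeyClass/setMapOutputValueClass calls are redundant: when the map output classes are left unset, Hadoop falls back to the job's output classes for the shuffle. The configuration for runJob2 could therefore be trimmed to:

// Equivalent, slimmer configuration: the map output classes default to
// the final output classes when not set explicitly.
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);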

From source file: de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License: Apache License

/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);

        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");

        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);

        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");

        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);
    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
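
This job pairs setMapOutputKeyClass with a custom partitioner and grouping comparator, the standard secondary-sort arrangement: the partitioner routes every key that shares a prefix to the same reducer, and the grouping comparator makes those keys arrive in a single reduce() call. A generic sketch of that pair (illustrative only; Mahout's actual GramKeyPartitioner and GramKeyGroupComparator operate on GramKey bytes, not on Text):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes every key sharing the same prefix (before '|') to one partition.
public class PrefixPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String prefix = key.toString().split("\\|", 2)[0];
        return (prefix.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Groups keys by the same prefix so they share one reduce() invocation.
class PrefixGroupingComparator extends WritableComparator {
    protected PrefixGroupingComparator() {
        super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String pa = a.toString().split("\\|", 2)[0];
        String pb = b.toString().split("\\|", 2)[0];
        return pa.compareTo(pb);
    }
}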

From source file: de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License: Apache License

/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);
    // Defines an additional named text output 'contingency' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);

    // Defines additional named text outputs, one per association measure
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
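
The named outputs registered above are written from inside the reducer through a MultipleOutputs instance created in setup() and closed in cleanup(). A generic sketch of that wiring (hedged: the real AssocReducer computes the llr/pmi/chi/dice scores; the key/value types below match the 'llr' declaration, and the score is a placeholder):

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

// Generic sketch of a reducer feeding a named output declared with
// MultipleOutputs.addNamedOutput; not the actual AssocReducer.
public class NamedOutputReducer extends Reducer<Text, Text, Text, DoubleWritable> {
    private MultipleOutputs<Text, DoubleWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        double score = 0.0; // placeholder for a real association score
        // Route the record to the named output registered as "llr".
        mos.write("llr", key, new DoubleWritable(score));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // flushes all named outputs
    }
}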