Example usage for org.apache.hadoop.mapreduce Job setJarByClass

Introduction

This page collects usage examples for org.apache.hadoop.mapreduce.Job.setJarByClass.

Prototype

public void setJarByClass(Class<?> cls) 

Document

Set the Jar by finding where a given class came from.
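
Before the project examples below, here is a minimal, self-contained sketch of the typical call pattern. The driver class SetJarByClassExample is hypothetical, and Hadoop's identity Mapper and Reducer stand in for real implementations; the point is that passing the driver class to setJarByClass lets Hadoop find the JAR that contains it and ship that JAR to the cluster.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetJarByClassExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setJarByClass example");

        // Hadoop inspects the classpath, finds the JAR containing
        // SetJarByClassExample, and ships that JAR to the task nodes.
        job.setJarByClass(SetJarByClassExample.class);

        // The identity mapper/reducer keep the sketch self-contained;
        // a real job would plug in its own Mapper and Reducer here.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Without some way of pointing the job at your JAR, tasks launched on remote nodes typically fail with ClassNotFoundException for the user's mapper and reducer classes, which is why nearly every driver in the examples below makes this call.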

Usage

From source file: com.daleway.training.hadoop.pagerank.PageRankSecondarySort.java

License: Apache License

public static Job createJob(Configuration conf, String inputPath, String outputPath) throws IOException {
    Job job = new Job(conf, "pair wise count");
    job.setJarByClass(PageRankSecondarySort.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);
    job.setMaxReduceAttempts(1);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job;
}

From source file: com.daleway.training.hadoop.pagerank.PageRankSimple.java

License: Apache License

public static Job createJob(Configuration conf, String inputPath, String outputPath) throws IOException {
    Job job = new Job(conf, "pair wise count");
    job.setJarByClass(PageRankSimple.class);
    job.setMapperClass(PageRankMapper.class);
    //job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(PageRankReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job;
}

From source file: com.datasalt.pangool.benchmark.secondarysort.HadoopSecondarySort.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Hadoop Secondary Sort");
    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[1]), true);

    job.setJarByClass(HadoopSecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setPartitionerClass(KeyPartitioner.class);
    job.setGroupingComparatorClass(GroupingComparator.class);

    job.setMapOutputKeyClass(ComplexType.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file: com.datasalt.pangool.benchmark.wordcount.HadoopWordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    //conf.setBoolean("hadoop.security.authorization", false);
    //conf.set("hadoop.security.authentication","simple");
    Job job = new Job(conf, "word count");
    job.setJarByClass(HadoopWordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    HadoopUtils.deleteIfExists(FileSystem.get(conf), new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file: com.datasalt.pangool.tuplemr.MapOnlyJobBuilder.java

License: Apache License

public Job createJob() throws IOException, TupleMRException, URISyntaxException {
    // perform a deep copy of the configuration
    this.conf = new Configuration(this.conf);

    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        InstancesDistributor.distribute(outputFormat, uniqueName, conf);
        instanceFilesCreated.add(uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }

    Job job;
    if (jobName == null) {
        job = new Job(conf);
    } else {
        job = new Job(conf, jobName);
    }
    job.setNumReduceTasks(0);

    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);

    if (outputKeyClass == null) {
        throw new TupleMRException("Output spec must be defined, use setOutput()");
    }
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);

    Input lastInput = null;

    for (Input input : multipleInputs.getMultiInputs()) {
        if (input.inputProcessor == null) {
            input.inputProcessor = mapOnlyMapper;
            if (input.inputProcessor == null) {
                throw new TupleMRException("Either mapOnlyMapper property or full Input spec must be set.");
            }
        }
        lastInput = input;
    }

    if (lastInput == null) {
        throw new TupleMRException("At least one input must be specified");
    }
    job.setJarByClass((jarByClass != null) ? jarByClass : lastInput.inputProcessor.getClass());

    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));

    return job;
}

From source file: com.datasalt.pangool.tuplemr.TupleMRBuilder.java

License: Apache License

public Job createJob() throws IOException, TupleMRException {

    failIfNull(tupleReducer, "Need to set a group handler");
    failIfEmpty(multipleInputs.getMultiInputs(), "Need to add at least one input");
    failIfNull(outputFormat, "Need to set output format");
    failIfNull(outputKeyClass, "Need to set outputKeyClass");
    failIfNull(outputValueClass, "Need to set outputValueClass");
    failIfNull(outputPath, "Need to set outputPath");

    // perform a deep copy of the Configuration
    this.conf = new Configuration(this.conf);

    TupleMRConfig tupleMRConf = buildConf();
    // Serialize PangoolConf in Hadoop Configuration
    instanceFilesCreated.addAll(TupleMRConfig.set(tupleMRConf, conf));
    Job job = (jobName == null) ? new Job(conf) : new Job(conf, jobName);
    if (tupleMRConf.getRollupFrom() != null) {
        job.setReducerClass(RollupReducer.class);
    } else {
        job.setReducerClass(SimpleReducer.class);
    }

    if (tupleCombiner != null) {
        job.setCombinerClass(SimpleCombiner.class); // not rollup by now
        // Set Combiner Handler
        String uniqueName = UUID.randomUUID().toString() + '.' + "combiner-handler.dat";
        try {
            InstancesDistributor.distribute(tupleCombiner, uniqueName, job.getConfiguration());
            instanceFilesCreated.add(uniqueName);
            job.getConfiguration().set(SimpleCombiner.CONF_COMBINER_HANDLER, uniqueName);
        } catch (URISyntaxException e1) {
            throw new TupleMRException(e1);
        }
    }

    // Set Tuple Reducer
    try {
        String uniqueName = UUID.randomUUID().toString() + '.' + "group-handler.dat";
        InstancesDistributor.distribute(tupleReducer, uniqueName, job.getConfiguration());
        instanceFilesCreated.add(uniqueName);
        job.getConfiguration().set(SimpleReducer.CONF_REDUCER_HANDLER, uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }

    // Enabling serialization
    TupleSerialization.enableSerialization(job.getConfiguration());

    job.setJarByClass((jarByClass != null) ? jarByClass : tupleReducer.getClass());
    job.setMapOutputKeyClass(DatumWrapper.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setPartitionerClass(TupleHashPartitioner.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);
    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));
    // Configure a {@link ProxyOutputFormat} for Pangool's Multiple Outputs to
    // work: {@link PangoolMultipleOutput}
    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        InstancesDistributor.distribute(outputFormat, uniqueName, conf);
        instanceFilesCreated.add(uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }
    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);

    return job;
}

From source file: com.datasalt.utils.mapred.counter.MapRedCounter.java

License: Apache License

protected static Job buildMapRedCounterJobWithoutCombiner(String name,
        @SuppressWarnings("rawtypes") Class<? extends OutputFormat> outputFormat, String outPath,
        Configuration conf) throws IOException {

    Job job = new Job(conf, name);

    Path output = new Path(outPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), output);
    job.setJarByClass(MapRedCounter.class);

    job.setReducerClass(MapRedCountReducer.class);
    job.setMapOutputKeyClass(CounterKey.class);
    job.setMapOutputValueClass(CounterValue.class);
    job.setOutputFormatClass(outputFormat);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Secondary sorting configuration.
    job.setGroupingComparatorClass(CounterKey.IdGroupComparator.class);
    job.setPartitionerClass(CounterKey.IdGroupPartitioner.class);

    FileOutputFormat.setOutputPath(job, output);

    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        DCUtils.serializeToDC(new HadoopOutputFormat(SequenceFileOutputFormat.class), uniqueName, conf);
        job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
        job.setOutputFormatClass(ProxyOutputFormat.class);
        // Multioutput configuration
        PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTFILE.toString(),
                new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterKey.class, LongWritable.class);
        PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTDISTINCTFILE.toString(),
                new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterDistinctKey.class,
                LongPairWritable.class);
    } catch (URISyntaxException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
    return job;
}

From source file: com.datasalt.utils.mapred.counter.MapRedCounter.java

License: Apache License

/**
 * Adds an input file and a {@link MapRedCounterMapper} that emits the groups and items to be
 * counted. Remember that you have to provide your own {@link MapRedCounterMapper} implementation here.
 */
@SuppressWarnings({ "rawtypes" })
public static void addInput(Job job, Path location, Class<? extends InputFormat> inputFormat,
        Class<? extends MapRedCounterMapper> mapper) throws IOException {

    MultipleInputs.addInputPath(job, location, inputFormat, mapper);
    job.setJarByClass(mapper);
}

From source file: com.declum.squzer.example.hbase.table2file.Export.java

License: Apache License

/**
 * Sets up the actual job.
 * 
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    Path outputDir = new Path(args[1]);

    Job job = Job.getInstance(conf);
    job.setJobName(tableName);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Exporter.class);
    // TODO: Allow passing filter and subset of rows/columns.
    Scan s = new Scan();
    // Optional arguments.
    int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
    s.setMaxVersions(versions);
    long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
    long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
    s.setTimeRange(startTime, endTime);
    s.setCacheBlocks(false);
    if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
        s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
    }
    LOG.info("versions=" + versions + ", starttime=" + startTime + ", endtime=" + endTime);
    TableMapReduceUtil.initTableMapperJob(tableName, s, Exporter.class, null, null, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    return job;
}

From source file: com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java

License: Apache License

/**
 * Converts the input documents into token arrays using
 * {@link StringTuple}. The input documents have to be in
 * {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document has to be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the features holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of
    // conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::BehemothTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}