Example usage for org.apache.hadoop.mapred JobConf setJobName

Introduction

This page presents example usage of org.apache.hadoop.mapred.JobConf.setJobName.

Prototype

public void setJobName(String name) 

Document

Set the user-specified job name.
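
A minimal, self-contained sketch of where setJobName fits in a classic mapred-API driver is shown below. The "in"/"out" paths and the class name are hypothetical placeholders rather than part of the examples that follow, and the job leans on the old API's identity mapper/reducer defaults.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class SetJobNameSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetJobNameSketch.class);
        conf.setJobName("identity copy"); // the user-specified name shown in the cluster UI and job history

        FileInputFormat.addInputPath(conf, new Path("in"));    // hypothetical input directory
        FileOutputFormat.setOutputPath(conf, new Path("out")); // hypothetical output directory

        // No mapper/reducer is set: the classic API falls back to IdentityMapper and
        // IdentityReducer, so the job simply copies its input under the given name.
        JobClient.runJob(conf);
    }
}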

Usage

From source file: crunch.MaxTemperature.java

License: Apache License

@Override
    public int run(String[] args) throws Exception {

        if (args.length != 3) {
            System.err.printf("Usage: %s [generic options] <input> <output> <schema-file>\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        String input = args[0];
        String output = args[1];
        String schemaFile = args[2];

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Avro projection");

        FileInputFormat.addInputPath(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        Schema schema = new Schema.Parser().parse(new File(schemaFile));
        AvroJob.setInputSchema(conf, schema);
        AvroJob.setMapOutputSchema(conf, schema);
        AvroJob.setOutputSchema(conf, schema);
        conf.setNumReduceTasks(0); // map-only job: projection needs no reduce phase

        JobClient.runJob(conf);
        return 0;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

@Override
    public int run(String[] args) throws Exception {

        if (args.length != 3) {
            System.err.printf("Usage: %s [generic options] <input> <output> <schema-file>\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        String input = args[0];
        String output = args[1];
        String schemaFile = args[2];

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Avro sort");

        FileInputFormat.addInputPath(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        Schema schema = new Schema.Parser().parse(new File(schemaFile));
        AvroJob.setInputSchema(conf, schema);
        Schema intermediateSchema = Pair.getPairSchema(schema, schema); // the whole record serves as both sort key and value
        AvroJob.setMapOutputSchema(conf, intermediateSchema);
        AvroJob.setOutputSchema(conf, schema);

        AvroJob.setMapperClass(conf, SortMapper.class);
        AvroJob.setReducerClass(conf, SortReducer.class);

        JobClient.runJob(conf);
        return 0;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Max temperature");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
        AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(Schema.create(Schema.Type.INT), WeatherRecord.SCHEMA$));
        AvroJob.setOutputSchema(conf, WeatherRecord.SCHEMA$);

        conf.setInputFormat(AvroUtf8InputFormat.class);

        AvroJob.setMapperClass(conf, MaxTemperatureMapper.class);
        AvroJob.setCombinerClass(conf, MaxTemperatureCombiner.class);
        AvroJob.setReducerClass(conf, MaxTemperatureReducer.class);

        JobClient.runJob(conf);
        return 0;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            JobBuilder.printUsage(this, "<ncdc input> <metoffice input> <output>");
            return -1;
        }

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Max temperature with multiple input formats");

        Path ncdcInputPath = new Path(args[0]);
        Path metOfficeInputPath = new Path(args[1]);
        Path outputPath = new Path(args[2]);

        MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, MaxTemperatureMapper.class);
        MultipleInputs.addInputPath(conf, metOfficeInputPath, TextInputFormat.class,
                MetOfficeMaxTemperatureMapper.class);
        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setCombinerClass(MaxTemperatureReducer.class);
        conf.setReducerClass(MaxTemperatureReducer.class);

        JobClient.runJob(conf);
        return 0;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

@Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            JobBuilder.printUsage(this, "<ncdc input> <station input> <output>");
            return -1;
        }

        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Join record with station name");

        Path ncdcInputPath = new Path(args[0]);
        Path stationInputPath = new Path(args[1]);
        Path outputPath = new Path(args[2]);

        MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class);
        MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class);
        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setPartitionerClass(KeyPartitioner.class);
        conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);

        conf.setMapOutputKeyClass(TextPair.class);

        conf.setReducerClass(JoinReducer.class);

        conf.setOutputKeyClass(Text.class);

        JobClient.runJob(conf);
        return 0;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), getClass());
        conf.setJobName("Max temperature");

        Path inputDir = new Path("widgets");
        FileInputFormat.addInputPath(conf, inputDir);
        FileOutputFormat.setOutputPath(conf, new Path("maxwidget"));

        Schema schema = readSchema(inputDir, conf);

        conf.setInputFormat(AvroInputFormat.class);
        conf.setOutputFormat(AvroOutputFormat.class);

        AvroJob.setInputSchema(conf, schema);
        AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(Schema.create(Schema.Type.LONG), schema));
        AvroJob.setOutputSchema(conf, schema);

        AvroJob.setMapperClass(conf, MaxWidgetMapper.class);
        AvroJob.setReducerClass(conf, MaxWidgetReducer.class);

        conf.setNumReduceTasks(1); // a single reducer yields one global maximum

        JobClient.runJob(conf);
        return 0;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

/**
     * Create a JobConf for a Job that will calculate the number of unique listeners per track.
     *
     * @param inputDir The path to the folder containing the raw listening data files.
     * @return The unique listeners JobConf.
     */
    private JobConf getUniqueListenersJobConf(Path inputDir) {
        log.info("Creating configuration for unique listeners Job");

        // output results to a temporary intermediate folder; it is deleted by the start() method
        Path uniqueListenersOutput = new Path("uniqueListeners");

        JobConf conf = new JobConf(TrackStatisticsProgram.class);
        conf.setOutputKeyClass(IntWritable.class); // track id
        conf.setOutputValueClass(IntWritable.class); // number of unique listeners
        conf.setInputFormat(TextInputFormat.class); // raw listening data
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setMapperClass(UniqueListenersMapper.class);
        conf.setCombinerClass(UniqueListenersCombiner.class);
        conf.setReducerClass(UniqueListenersReducer.class);

        FileInputFormat.addInputPath(conf, inputDir);
        FileOutputFormat.setOutputPath(conf, uniqueListenersOutput);
        conf.setJobName("uniqueListeners");
        return conf;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

/**
     * Creates a JobConf for a Job that will sum up the TrackStatistics per track.
     *
     * @param inputDir The path to the folder containing the raw input data files.
     * @return The sum JobConf.
     */
    private JobConf getSumJobConf(Path inputDir) {
        log.info("Creating configuration for sum job");
        // output results to a temporary intermediate folder; it is deleted by the start() method
        Path playsOutput = new Path("sum");

        JobConf conf = new JobConf(TrackStatisticsProgram.class);
        conf.setOutputKeyClass(IntWritable.class); // track id
        conf.setOutputValueClass(TrackStats.class); // statistics for a track
        conf.setInputFormat(TextInputFormat.class); // raw listening data
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setMapperClass(SumMapper.class);
        conf.setCombinerClass(SumReducer.class);
        conf.setReducerClass(SumReducer.class);

        FileInputFormat.addInputPath(conf, inputDir);
        FileOutputFormat.setOutputPath(conf, playsOutput);
        conf.setJobName("sum");
        return conf;
    }

From source file: crunch.MaxTemperature.java

License: Apache License

/**
     * Creates a JobConf for a Job that will merge the unique listeners and track statistics.
     *
     * @param outputPath The path for the results to be output to.
     * @param sumInputDir The path containing the data from the sum Job.
     * @param listenersInputDir The path containing the data from the unique listeners job.
     * @return The merge JobConf.
     */
    private JobConf getMergeConf(Path outputPath, Path sumInputDir, Path listenersInputDir) {
        log.info("Creating configuration for merge job");
        JobConf conf = new JobConf(TrackStatisticsProgram.class);
        conf.setOutputKeyClass(IntWritable.class); // track id
        conf.setOutputValueClass(TrackStats.class); // overall track statistics
        conf.setCombinerClass(SumReducer.class); // safe to re-use reducer as a combiner here
        conf.setReducerClass(SumReducer.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileOutputFormat.setOutputPath(conf, outputPath);

        MultipleInputs.addInputPath(conf, sumInputDir, SequenceFileInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(conf, listenersInputDir, SequenceFileInputFormat.class,
                MergeListenersMapper.class);
        conf.setJobName("merge");
        return conf;
    }

From source file: de.l3s.streamcorpus.mapreduce.TerrierIndexing.java

License: Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: hard-code the Terrier home for a quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }

    /*else
    {
       logger.fatal(usage());
       return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJarByClass(TerrierIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true) ;

    // MapReduce v2 / YARN puts its own (older) jars first on the classpath by default; set this so the job's user classpath takes precedence
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try{
       RunningJob rj = JobClient.runJob(conf);
       jobId = rj.getID();
       HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) { 
       logger.error("Problem running job", e);
       e.printStackTrace();
       ranOK = false;
    }
    if (jobId != null)
    {
       deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }  */

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}