Example usage for org.apache.hadoop.mapred JobConf setInt

List of usage examples for org.apache.hadoop.mapred JobConf setInt

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setInt.

Prototype

public void setInt(String name, int value) 

Source Link

Document

Set the value of the name property to an int.

Usage

From source file:cascading.flow.FlowStep.java

License:Open Source License

protected JobConf getJobConf(JobConf parentConf) throws IOException {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf);

    // set values first so they can't break things downstream
    if (hasProperties()) {
        for (Map.Entry entry : getProperties().entrySet())
            conf.set(entry.getKey().toString(), entry.getValue().toString());
    }//from   w  w  w  .j  a  va2 s  . co  m

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepName());

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapperClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(conf);

    initFromSink(conf);

    initFromTraps(conf);

    if (sink.getScheme().getNumSinkParts() != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(sink.getScheme().getNumSinkParts());
        else
            conf.setNumMapTasks(sink.getScheme().getNumSinkParts());
    }

    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors());

        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    conf.setInt("cascading.flow.step.id", id);
    conf.set("cascading.flow.step", Util.serializeBase64(this));

    return conf;
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

private void addComparators(JobConf conf, String property, Map<String, Fields> map) throws IOException {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if (!fieldsIterator.hasNext())
        return;//from w w  w  .j av  a2s . c  om

    Fields fields = fieldsIterator.next();

    if (fields.hasComparators()) {
        conf.set(property, Util.serializeBase64(fields));
        return;
    }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = getPreviousScopes(getGroup());

    fields = previousScopes.iterator().next().getOutValuesFields();

    if (fields.size() != 0) // allows fields.UNKNOWN to be used
        conf.setInt(property + ".size", fields.size());

    return;
}

From source file:com.benchmark.mapred.SleepJob.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJob.class);
    job.setNumMapTasks(numMapper);/*from w  ww.ja va  2s.com*/
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJob.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJob.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJob.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:com.benchmark.mapred.terasort.TeraSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    if (args.length != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 3.");
    }/* w  w w.j  av  a 2 s .  c o  m*/
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(Integer.parseInt(args[2]));
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    Date startIteration = new Date();
    JobClient.runJob(job);
    Date endIteration = new Date();
    System.out.println(
            "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds.");
    LOG.info("done");
    return 0;
}

From source file:com.cloudera.circus.test.TestXTest.java

License:Open Source License

@Test
@TestHadoop//  w  w w.ja v a 2s  . co  m
public void testHadoopMapReduce() throws Exception {
    JobConf conf = getHadoopConf();
    FileSystem fs = FileSystem.get(conf);
    JobClient jobClient = new JobClient(conf);
    try {
        Path inputDir = new Path(getHadoopTestDir(), "input");
        Path outputDir = new Path(getHadoopTestDir(), "output");

        fs.mkdirs(inputDir);
        Writer writer = new OutputStreamWriter(fs.create(new Path(inputDir, "data.txt")));
        writer.write("a\n");
        writer.write("b\n");
        writer.write("c\n");
        writer.close();

        JobConf jobConf = getHadoopConf();
        jobConf.setInt("mapred.map.tasks", 1);
        jobConf.setInt("mapred.map.max.attempts", 1);
        jobConf.setInt("mapred.reduce.max.attempts", 1);
        jobConf.set("mapred.input.dir", inputDir.toString());
        jobConf.set("mapred.output.dir", outputDir.toString());
        final RunningJob runningJob = jobClient.submitJob(jobConf);
        waitFor(60 * 1000, true, new Predicate() {
            @Override
            public boolean evaluate() throws Exception {
                return runningJob.isComplete();
            }
        });
        Assert.assertTrue(runningJob.isSuccessful());
        Assert.assertTrue(fs.exists(new Path(outputDir, "part-00000")));
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(outputDir, "part-00000"))));
        Assert.assertTrue(reader.readLine().trim().endsWith("a"));
        Assert.assertTrue(reader.readLine().trim().endsWith("b"));
        Assert.assertTrue(reader.readLine().trim().endsWith("c"));
        Assert.assertNull(reader.readLine());
        reader.close();
    } finally {
        fs.close();
        jobClient.close();
    }
}

From source file:com.cloudera.recordservice.avro.mapred.ColorCount.java

License:Apache License

/**
 * Run the MR1 color count with generic records, and return a map of favorite colors to
 * the number of users.//from w w w.ja  v a 2  s  .  c  o  m
 */
public static java.util.Map<String, Integer> countColors() throws IOException {
    String output = TestUtil.getTempDirectory();
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(ColorCount.class);
    conf.setJobName("MR1 Color Count With Generic Records");
    conf.setInt("mapreduce.job.reduces", 1);

    conf.setBoolean(com.cloudera.recordservice.avro.AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
    com.cloudera.recordservice.avro.AvroJob.setInputFormat(conf, org.apache.avro.mapred.AvroInputFormat.class);

    RecordServiceConfig.setInputTable(conf, "rs", "users");
    FileOutputFormat.setOutputPath(conf, outputPath);

    AvroJob.setMapperClass(conf, Map.class);
    AvroJob.setReducerClass(conf, Reduce.class);
    AvroJob.setOutputSchema(conf,
            Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)));

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    SeekableInput input = new FsInput(new Path(output + "/part-00000.avro"), conf);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
    for (GenericRecord datum : fileReader) {
        colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
    }
    return colorMap;
}

From source file:com.cloudera.recordservice.avro.mapreduce.ColorCount.java

License:Apache License

/**
 * Run the MR2 color count with generic records, and return a map of favorite colors to
 * the number of users.//from   w w  w. j  ava 2 s .c  o  m
 */
public static java.util.Map<String, Integer> countColors()
        throws IOException, ClassNotFoundException, InterruptedException {
    String output = TestUtil.getTempDirectory();
    Path outputPath = new Path(output);
    JobConf conf = new JobConf(ColorCount.class);
    conf.setInt("mapreduce.job.reduces", 1);

    Job job = Job.getInstance(conf);
    job.setJarByClass(ColorCount.class);
    job.setJobName("MR2 Color Count With Generic Records");

    RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users");
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(Reduce.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    job.waitForCompletion(false);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
    for (GenericRecord datum : fileReader) {
        colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
    }
    return colorMap;
}

From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java

License:Apache License

public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");

    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);

    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);/*from w  ww  .  j  av a 2 s  .  c o m*/

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}

From source file:com.finderbots.miner.MinerWorkflow.java

License:Apache License

public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, MinerOptions options, BaseUrlFilter crawlUrlFilter, BaseUrlFilter mineUrlFilter)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(MinerWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }//w w w . j  a  v  a  2s.  c o m

    //Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath.toString());
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString(), true);
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster, 
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries 
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    //add a regex url filter to filter outlinks
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
    if (crawlUrlFilter != null) {
        outlinksPipe = new Each(outlinksPipe, new UrlFilter(crawlUrlFilter));
    }

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    // Status, 
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}

From source file:com.github.gaoyangthu.demo.mapred.dancing.DistributedPentomino.java

License:Apache License

public int run(String[] args) throws Exception {
    JobConf conf;
    int depth = 5;
    int width = 9;
    int height = 10;
    Class<? extends Pentomino> pentClass;
    if (args.length == 0) {
        System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }//from w w w  .ja v a  2  s .co  m

    conf = new JobConf(getConf());

    // Pick up the parameters, should the user set these
    width = conf.getInt("pent.width", width);
    height = conf.getInt("pent.height", height);
    depth = conf.getInt("pent.depth", depth);
    pentClass = conf.getClass("pent.class", OneSidedPentomino.class, Pentomino.class);

    for (int i = 0; i < args.length; i++) {
        if (args[i].equalsIgnoreCase("-depth")) {
            depth = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-height")) {
            height = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-width")) {
            width = Integer.parseInt(args[++i].trim());
        }
    }

    // Set parameters for MR tasks to pick up either which way the user sets
    // them or not
    conf.setInt("pent.width", width);
    conf.setInt("pent.height", height);
    conf.setInt("pent.depth", depth);

    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        FileInputFormat.setInputPaths(conf, input);
        FileOutputFormat.setOutputPath(conf, output);
        conf.setJarByClass(PentMap.class);

        conf.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        createInputDirectory(fileSys, input, pent, depth);

        // the keys are the prefix strings
        conf.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(PentMap.class);
        conf.setReducerClass(IdentityReducer.class);

        conf.setNumMapTasks(2000);
        conf.setNumReduceTasks(1);

        JobClient.runJob(conf);
    } finally {
        fileSys.delete(input, true);
    }
    return 0;
}