List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(JobConf conf) throws IOException
From source file:cmd.sampler.java
License:Apache License
/**
 * Driver for InputSampler from the command line.
 * Configures a Job instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new SplitSampler<K, V>(1000, 10);
    }
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
}
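Note that the Job(Configuration) constructor used throughout these examples is deprecated on Hadoop 2.x and later. A minimal sketch of the equivalent construction with the Job.getInstance factory (the class and job name below are purely illustrative, not part of the original driver):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class JobCreationSketch {
    public static Job newSamplerJob(Configuration conf) throws IOException {
        // Job.getInstance copies the Configuration, just as new Job(conf) does.
        Job job = Job.getInstance(conf);
        job.setJobName("input-sampler"); // illustrative name only
        return job;
    }
}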
From source file:cn.edu.hfut.dmic.webcollector.crawldb.DBReader.java
public static void main(String[] args) throws Exception {
    Path crawlPath = new Path("task2");
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path output = new Path("output");
    Configuration config = CrawlerConfiguration.create();
    FileSystem fs = FileSystem.get(config);
    if (fs.exists(output)) {
        fs.delete(output);
    }
    Job job = new Job(config);
    job.setJobName("dbreader " + crawlPath.toString());
    job.setMapperClass(DBReaderMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, output);
    job.waitForCompletion(true);
}
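The mapper wired in above (DBReaderMapper) is part of WebCollector and is not shown here. The following is only a hypothetical sketch of the shape such a mapper could take, assuming the crawldb/current SequenceFile holds <Text, CrawlDatum> pairs, which matches the output types of the Generator and Merge jobs below:

public static class DBReaderMapperSketch extends Mapper<Text, CrawlDatum, Text, Text> {
    @Override
    protected void map(Text url, CrawlDatum datum, Context context)
            throws IOException, InterruptedException {
        // Emit the URL together with a human-readable dump of its crawl status.
        context.write(url, new Text(datum.toString()));
    }
}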
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Generator.java
public static String generate(Path crawlPath, Configuration conf) throws Exception {
    SegmentUtil.initSegments(crawlPath, conf);
    String segmentName = SegmentUtil.createSegment(crawlPath, conf);
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path generatePath = new Path(crawlPath, "segments/" + segmentName + "/generate");
    Job job = new Job(conf);
    job.setJobName("generate " + crawlPath.toString());
    job.setJarByClass(Generator.class);
    job.setReducerClass(GeneratorReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, generatePath);
    job.waitForCompletion(true);
    long count = job.getCounters().findCounter("generator", "count").getValue();
    System.out.println("total generate:" + count);
    if (count == 0) {
        return null;
    } else {
        return segmentName;
    }
}
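The counter read after waitForCompletion is plain Hadoop counter usage. GeneratorReducer itself is not shown on this page, but a reducer increments such a counter roughly like this (hypothetical sketch, not the real GeneratorReducer):

public static class CountingReducerSketch extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    @Override
    protected void reduce(Text key, Iterable<CrawlDatum> values, Context context)
            throws IOException, InterruptedException {
        for (CrawlDatum datum : values) {
            context.write(key, datum);
            // Matches the value read back via job.getCounters().findCounter("generator", "count").
            context.getCounter("generator", "count").increment(1);
        }
    }
}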
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java
public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName)
        throws Exception {
    Job job = new Job(conf);
    job.setJobName(jobName + " " + crawlPath.toString());
    job.setJarByClass(Merge.class);
    // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    for (Path mergePath : mergePaths) {
        FileInputFormat.addInputPath(job, mergePath);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);
}
From source file:cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java
public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception {
    Path segmentPath = new Path(crawlPath, "segments/" + segmentName);
    Path generatePath = new Path(segmentPath, "generate");
    Job job = new Job(conf);
    job.setJobName("fetch " + crawlPath.toString());
    job.setJarByClass(Fetcher.class);
    job.setReducerClass(FetcherReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    FileInputFormat.addInputPath(job, generatePath);
    FileOutputFormat.setOutputPath(job, segmentPath);
    job.waitForCompletion(true);
}
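Taken together, generate, fetch, and merge form one crawl round. A hedged usage sketch of how the three static helpers above could be chained (the segment subdirectory name passed to merge is an assumption, not taken from WebCollector):

Configuration conf = CrawlerConfiguration.create();
Path crawlPath = new Path("task2");
String segmentName = Generator.generate(crawlPath, conf);
if (segmentName != null) {
    Fetcher.fetch(crawlPath, segmentName, conf);
    // The exact layout written by FetcherOutputFormat is not shown on this page; "fetch" is a guess.
    Path fetchedDatums = new Path(crawlPath, "segments/" + segmentName + "/fetch");
    Merge.merge(crawlPath, new Path[] { fetchedDatums }, conf, "merge");
}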
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException {
    Job job = new Job(conf);
    // job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
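Unlike the merge method shown earlier, this factory returns the configured Job without submitting it, so the caller adds the paths it wants folded in and then runs it. A hedged usage sketch (the extra segment input path is hypothetical):

Configuration conf = CrawlerConfiguration.create();
Job mergeJob = Merge.createJob(conf, new Path("crawldb"));
FileInputFormat.addInputPath(mergeJob, new Path("segments/20140101/fetch")); // hypothetical segment path
boolean succeeded = mergeJob.waitForCompletion(true);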
From source file:co.cask.cdap.hive.datasets.DatasetInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }
        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new TaggedInputSplit(inputName, split, confCopy, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }//w w w . java 2 s. c om } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultiInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); //some input format need a jobId to getSplits jobCopy.setJobID(new JobID(inputName, inputName.hashCode())); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a MultiInputTaggedSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new MultiInputTaggedSplit(split, confCopy, inputName, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }/* w ww. j a v a 2s . co m*/ } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitionerWriterWrapper.java
License:Apache License
private TaskAttemptContext getTaskAttemptContext(TaskAttemptContext context, String newOutputName)
        throws IOException {
    Job job = new Job(context.getConfiguration());
    DynamicPartitioningOutputFormat.setOutputName(job, newOutputName);
    // CDAP-4806 We must set this parameter in addition to calling FileOutputFormat#setOutputName,
    // because AvroKeyOutputFormat/AvroKeyValueOutputFormat use a different parameter for the
    // output name than FileOutputFormat.
    if (isAvroOutputFormat(getFileOutputFormat(context))) {
        job.getConfiguration().set("avro.mo.config.namedOutput", newOutputName);
    }
    Path jobOutputPath = DynamicPartitioningOutputFormat
            .createJobSpecificPath(FileOutputFormat.getOutputPath(job), context);
    DynamicPartitioningOutputFormat.setOutputPath(job, jobOutputPath);
    return new TaskAttemptContextImpl(job.getConfiguration(), context.getTaskAttemptID());
}
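A hedged sketch of how such a per-partition context is typically consumed inside the same class (generic Hadoop usage, not code from CDAP; the partition output name is hypothetical):

TaskAttemptContext partitionContext = getTaskAttemptContext(context, "partition_2017_01_01");
// The delegate FileOutputFormat then opens a writer under the new output name.
RecordWriter<K, V> recordWriter = getFileOutputFormat(context).getRecordWriter(partitionContext);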