Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job#setInputFormatClass, drawn from the source files listed below.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
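
Before the project examples, here is a minimal, self-contained sketch of the typical call pattern (not taken from any of the source files below). The class name SetInputFormatClassExample and the "in"/"out" paths are placeholders; the job uses the built-in identity Mapper together with TextInputFormat and TextOutputFormat.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatClassExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setInputFormatClass example");
        job.setJarByClass(SetInputFormatClassExample.class);

        // The input format controls how input files are split and read;
        // TextInputFormat produces LongWritable offsets and Text lines.
        // setInputFormatClass must be called before the job is submitted,
        // otherwise it throws IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Identity mapper, no reducer: map output is written directly to the output format.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // "in" and "out" are placeholder paths.
        FileInputFormat.addInputPath(job, new Path("in"));
        FileOutputFormat.setOutputPath(job, new Path("out"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}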

Usage

From source file:cmd.sampler.java

License:Apache License

/**
 * Driver for InputSampler from the command line. Configures a Job
 * instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new SplitSampler<K, V>(1000, 10);
    }

    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);

    return 0;
}

From source file:cn.edu.hfut.dmic.webcollector.crawldb.DBReader.java

public static void main(String[] args) throws Exception {
    Path crawlPath = new Path("task2");
    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path output = new Path("output");

    Configuration config = CrawlerConfiguration.create();
    FileSystem fs = FileSystem.get(config);

    if (fs.exists(output)) {
        fs.delete(output);
    }

    Job job = new Job(config);
    job.setJobName("dbreader " + crawlPath.toString());
    job.setMapperClass(DBReaderMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, output);

    job.waitForCompletion(true);

}

From source file:cn.edu.hfut.dmic.webcollector.crawldb.Generator.java

public static String generate(Path crawlPath, Configuration conf) throws Exception {
    SegmentUtil.initSegments(crawlPath, conf);
    String segmentName = SegmentUtil.createSegment(crawlPath, conf);

    Path currentPath = new Path(crawlPath, "crawldb/current");
    Path generatePath = new Path(crawlPath, "segments/" + segmentName + "/generate");

    Job job = new Job(conf);
    job.setJobName("generate " + crawlPath.toString());
    job.setJarByClass(Generator.class);

    job.setReducerClass(GeneratorReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileInputFormat.addInputPath(job, currentPath);
    FileOutputFormat.setOutputPath(job, generatePath);
    job.waitForCompletion(true);
    long count = job.getCounters().findCounter("generator", "count").getValue();
    System.out.println("total generate:" + count);
    if (count == 0) {
        return null;
    } else {
        return segmentName;
    }

}

From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java

public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName)
        throws Exception {

    Job job = new Job(conf);
    job.setJobName(jobName + "  " + crawlPath.toString());
    job.setJarByClass(Merge.class);
    // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }

    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    for (Path mergePath : mergePaths) {
        FileInputFormat.addInputPath(job, mergePath);
    }
    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);

}

From source file:cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java

public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception {
    Path segmentPath = new Path(crawlPath, "segments/" + segmentName);
    Path generatePath = new Path(segmentPath, "generate");

    Job job = new Job(conf);
    job.setJobName("fetch " + crawlPath.toString());
    job.setJarByClass(Fetcher.class);

    job.setReducerClass(FetcherReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    FileInputFormat.addInputPath(job, generatePath);
    FileOutputFormat.setOutputPath(job, segmentPath);

    job.waitForCompletion(true);
}

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java

public static Job createJob(Configuration conf, Path crawldb) throws IOException {

    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }

    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }

    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job;
}

From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java

License:Apache License

/** Create and setup a job */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}

From source file:cn.lhfei.hbase.ch04.SampleUploader.java

License:Apache License

/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    //Job job = new Job(conf, NAME + "_" + tableName);

    Job job = Job.getInstance(conf);

    //job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:co.cask.cdap.data.stream.StreamInputFormatTest.java

License:Apache License

private void runMR(File inputDir, File outputDir, long startTime, long endTime, long splitSize, long ttl)
        throws Exception {

    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    StreamInputFormat.setTTL(conf, ttl);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    StreamInputFormat.setTimeRange(conf, startTime, endTime);
    StreamInputFormat.setMaxSplitSize(conf, splitSize);
    job.setInputFormatClass(TestStreamInputFormat.class);

    TextOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setJarByClass(StreamInputFormatTest.class);
    job.setMapperClass(TokenizeMapper.class);
    job.setReducerClass(AggregateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.waitForCompletion(true);
}

From source file:co.cask.cdap.hbase.wd.RowKeyDistributorTestBase.java

License:Apache License

private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = new Configuration(testingUtility.getConfiguration());
    conf.set("fs.defaultFS", "file:///");
    conf.set("fs.default.name", "file:///");
    conf.setInt("mapreduce.local.map.tasks.maximum", 16);
    conf.setInt("mapreduce.local.reduce.tasks.maximum", 16);
    Job job = Job.getInstance(conf, "testMapReduceInternal()-Job");
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);

    // Need to kill the job after completion, as it could otherwise leave MRAppMaster running
    // without terminating. Not sure what is causing this; it may be a problem in MiniYarnCluster.
    job.killJob();
}