Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Source Link

Document

Set the InputFormat for the job.

Usage

From source file:com.neu.cs6240.Xml2csvComments.java

License:Apache License

/**
 * Entry point of the Xml2csvComments driver: configures and submits the
 * MapReduce job that uses {@code CommentsMapper} and {@code CommentsReducer}.
 *
 * <p>After generic Hadoop options are stripped, exactly two arguments must
 * remain: the input path and the output path. Otherwise a usage message is
 * printed and the process exits with status 2.
 *
 * @param args command-line arguments (generic options, then in and out paths)
 * @throws Exception if job setup or execution fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Setting up the xml tag configurator for splitter
    conf.set("xmlinput.start", "<row ");
    conf.set("xmlinput.end", " />");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        // Name this driver, not the sibling Xml2csvPosts it was copied from
        System.err.println("Usage: Xml2csvComments <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Converts Comments.xml to .csv");
    job.setJarByClass(Xml2csvComments.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setMapperClass(CommentsMapper.class);
    job.setReducerClass(CommentsReducer.class);
    job.setPartitionerClass(PostsPartitioner.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    // Set as per your file size
    job.setNumReduceTasks(10);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    // Exit status mirrors job success: 0 on success, 1 on failure
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.neu.cs6240.Xml2csvPosts.java

License:Apache License

/**
 * Entry point of the Xml2csvPosts driver: configures and submits the
 * MapReduce job that uses {@code PostsMapper} and {@code PostsReducer}.
 *
 * <p>After generic Hadoop options are stripped, exactly two arguments must
 * remain (input path, output path); otherwise a usage message is printed and
 * the process exits with status 2. The final exit status is 0 when the job
 * succeeds and 1 otherwise.
 *
 * @param args command-line arguments (generic options, then in and out paths)
 * @throws Exception if job setup or execution fails
 */
public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    // Start and end markers handed to the XML splitter
    configuration.set("xmlinput.start", "<row ");
    configuration.set("xmlinput.end", " />");

    GenericOptionsParser optionsParser = new GenericOptionsParser(configuration, args);
    String[] paths = optionsParser.getRemainingArgs();
    if (paths.length != 2) {
        System.err.println("Usage: Xml2csvPosts <in> <out>");
        System.exit(2);
    }

    Job postsJob = new Job(configuration, "Converts Posts.xml to .csv");
    postsJob.setJarByClass(Xml2csvPosts.class);

    // Processing pipeline: input format, mapper, partitioner, reducer
    postsJob.setInputFormatClass(XmlInputFormat.class);
    postsJob.setMapperClass(PostsMapper.class);
    postsJob.setPartitionerClass(PostsPartitioner.class);
    postsJob.setReducerClass(PostsReducer.class);

    // Output record types
    postsJob.setOutputKeyClass(IntWritable.class);
    postsJob.setOutputValueClass(Text.class);

    // Reducer count; tune to the size of the input file
    postsJob.setNumReduceTasks(15);

    FileInputFormat.addInputPath(postsJob, new Path(paths[0]));
    FileOutputFormat.setOutputPath(postsJob, new Path(paths[1]));

    boolean succeeded = postsJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}

From source file:com.nikoo28.excel.mapreduce.ExcelDriver.java

License:Apache License

/**
 * Main entry point for the example.
 *
 * <p>Configures a map-only job ({@code ExcelMapper}, zero reduce tasks) that
 * reads its input through {@code ExcelInputFormat}. The process exits with
 * status 2 when fewer than two arguments are supplied, 0 when the job
 * succeeds and 1 when it fails.
 *
 * @param args arguments; {@code args[0]} is the input path and
 *             {@code args[1]} is the output path
 * @throws Exception when something goes wrong
 */
public static void main(String[] args) throws Exception {
    logger.info("Driver started");

    // Fail fast with a usage hint instead of an ArrayIndexOutOfBoundsException
    if (args.length < 2) {
        System.err.println("Usage: ExcelDriver <in> <out>");
        System.exit(2);
    }

    Job job = new Job();
    job.setJarByClass(ExcelDriver.class);
    job.setJobName("Excel Record Reader");

    job.setMapperClass(ExcelMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(ExcelInputFormat.class);

    // Propagate job failure to the caller through the exit status
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.nistfortunetellers.cleaning.NISTClean.java

License:Apache License

/**
 * Runs a job whose output keys and values are {@code Text}, reading with
 * {@code TextInputFormat} and writing with {@code TextOutputFormat}.
 *
 * <p>Failures do not propagate: exceptions are printed to standard error and
 * an unsuccessful job is reported there as well. If the wait is interrupted,
 * the thread's interrupt flag is restored before returning.
 *
 * @param jobName    name given to the job
 * @param jobConfig  configuration the job is created from
 * @param inputPath  input path added to the job
 * @param outputPath output path set on the job
 * @param mapper     mapper class
 * @param reducer    reducer class
 */
@SuppressWarnings({ "deprecation", "rawtypes" })
static void runTextJob(String jobName, Configuration jobConfig, String inputPath, String outputPath,
        Class<? extends Mapper> mapper, Class<? extends Reducer> reducer) {
    try {
        Job genericJob = new Job(jobConfig, jobName);
        // DEBUG
        //genericJob.setNumReduceTasks(0);
        // END DEBUG
        genericJob.setJarByClass(NISTClean.class);
        genericJob.setOutputKeyClass(Text.class);
        genericJob.setOutputValueClass(Text.class);
        genericJob.setMapperClass(mapper);
        genericJob.setReducerClass(reducer);
        genericJob.setInputFormatClass(TextInputFormat.class);
        genericJob.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(genericJob, new Path(inputPath));
        FileOutputFormat.setOutputPath(genericJob, new Path(outputPath));
        // Do not drop a failed job silently; the method has no return value to carry it
        if (!genericJob.waitForCompletion(true)) {
            System.err.println("Job '" + jobName + "' did not complete successfully");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}

From source file:com.pagerankcalculator.TwitterPageRank.java

/**
 * Graph Parsing.
 * Ingests the raw data and initializes the pagerank values.
 *
 * <p>Any existing output path is deleted recursively before the job is run.
 *
 * @param in input data file
 * @param out output directory
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws IOException if job setup or file system access fails
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be found
 */
public int parseGraph(String in, String out) throws IOException, InterruptedException, ClassNotFoundException {

    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#1 Parsing Graph");
    job.setJarByClass(TwitterPageRank.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(GraphParsingMapper.class);
    job.setReducerClass(GraphParsingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    // newInstance() hands back a fresh FileSystem object each call, so it
    // must be closed here or it leaks on every invocation.
    FileSystem fs = FileSystem.newInstance(getConf());
    try {
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }
    } finally {
        fs.close();
    }

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.pagerankcalculator.TwitterPageRank.java

/**
 * Runs one page rank calculation job (job #2) for the given iteration.
 *
 * <p>Any existing output path is deleted recursively before the job is run.
 *
 * @param in input path for this iteration
 * @param out output path for this iteration
 * @param iteration iteration number, used only in the job name
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws IOException if job setup or file system access fails
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be found
 */
public int calculatePagerank(String in, String out, int iteration)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#2 Iteration-" + iteration + " Calculating Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankCalculationMapper.class);
    job.setReducerClass(PageRankCalculationReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    // newInstance() hands back a fresh FileSystem object each call, so it
    // must be closed here or it leaks on every iteration.
    FileSystem fs = FileSystem.newInstance(getConf());
    try {
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }
    } finally {
        fs.close();
    }

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.pagerankcalculator.TwitterPageRank.java

/**
 * Runs the page rank sorting job (job #3).
 *
 * <p>Map output is keyed by {@code DoubleWritable} and ordered with
 * {@code DoubleSortDescComparator} (presumably descending, per its name),
 * and a single reducer is used. Any existing output path is deleted
 * recursively before the job is run.
 *
 * @param in input path
 * @param out output path
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws IOException if job setup or file system access fails
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws ClassNotFoundException if a job class cannot be found
 */
public int sortPagerank(String in, String out)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#3 Sorting Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankSortingMapper.class);
    job.setReducerClass(PageRankSortingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(1);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    job.setSortComparatorClass(DoubleSortDescComparator.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    // newInstance() hands back a fresh FileSystem object each call, so it
    // must be closed here or it leaks on every invocation.
    FileSystem fs = FileSystem.newInstance(getConf());
    try {
        if (fs.exists(outputFilePath)) {
            fs.delete(outputFilePath, true);
        }
    } finally {
        fs.close();
    }

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.peer2gear.nutch.xquery.ParseResult.java

License:Apache License

/**
 * Configures and runs a map-only job over the {@code ParseData} directories
 * of the given segments.
 *
 * <p>Every argument except the last names an input: either a segment path,
 * or {@code -dir <segments>} to add each directory listed under
 * {@code <segments>}. The last argument is the output path.
 *
 * @param args segment arguments followed by the output path
 * @return -1 on a usage error, 0 if the job succeeded, 1 if it failed
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("Usage: %s [generic options] (<segment> ... | -dir <segments>) <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf());
    final int outputIndex = args.length - 1;
    for (int i = 0; i < outputIndex; i++) {
        if ("-dir".equals(args[i])) {
            if (i + 1 >= outputIndex) {
                // "-dir" has no value of its own: the next token is the output
                // path, which must not double as the segments directory.
                System.err.printf("Usage: %s [generic options] (<segment> ... | -dir <segments>) <output>\n",
                        getClass().getSimpleName());
                ToolRunner.printGenericCommandUsage(System.err);
                return -1;
            }
            Path dir = new Path(args[++i]);
            FileSystem fs = dir.getFileSystem(getConf());
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] segments = HadoopFSUtil.getPaths(fstats);
            for (Path segment : segments) {
                FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            }
        } else {
            FileInputFormat.addInputPath(job, new Path(args[i], ParseData.DIR_NAME));
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(args[outputIndex]));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(GetResultMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.phantom.hadoop.examples.BaileyBorweinPlouffe.java

License:Apache License

/**
 * Builds a job named {@code NAME + "_" + name} and wires up its input
 * format, mapper and single reducer. The job is returned without being
 * submitted.
 *
 * @param name suffix appended to {@code NAME} to form the job name
 * @param conf configuration the job is created from
 * @return the configured job
 * @throws IOException if the job cannot be created
 */
private static Job createJob(String name, Configuration conf) throws IOException {
    final String jobName = NAME + "_" + name;
    final Job job = new Job(conf, jobName);
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // input side
    job.setInputFormatClass(BbpInputFormat.class);

    // map side
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // reduce side, with exactly one reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // Settings written straight into the job's own configuration
    final Configuration settings = job.getConfiguration();

    // task timeout of 0 disables it
    settings.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // speculative execution off for both map and reduce tasks
    settings.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    settings.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);

    return job;
}

From source file:com.phantom.hadoop.examples.Grep.java

License:Apache License

/**
 * Runs the two-stage grep: a search job that writes matches and counts to a
 * temporary directory, then a sort job that reads it and writes a single
 * output file ordered by decreasing frequency.
 *
 * <p>The temporary directory is deleted whether or not the jobs succeed.
 *
 * @param args {@code <inDir> <outDir> <regex> [<group>]}
 * @return 2 on a usage error, 1 if either job fails, 0 on success
 * @throws Exception if job setup or execution fails
 */
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);
    if (args.length == 4) {
        conf.set(RegexMapper.GROUP, args[3]);
    }

    Job grepJob = new Job(conf);

    try {

        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        // Do not sort the output of a failed search; report the failure instead
        if (!grepJob.waitForCompletion(true)) {
            return 1;
        }

        Job sortJob = new Job(conf);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setSortComparatorClass( // sort by decreasing freq
                LongWritable.DecreasingComparator.class);

        if (!sortJob.waitForCompletion(true)) {
            return 1;
        }
    } finally {
        // Runs on every exit path, including the early failure returns above
        FileSystem.get(conf).delete(tempDir, true);
    }
    return 0;
}