Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Source Link

Document

Set the InputFormat for the job.

Usage

From source file:DateExample_Month.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);//from  w w w .  j a va 2 s  .c o m
    }
    Job job = new Job(conf, "word count fs");
    job.setJarByClass(DateExample_Month.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(IsValidKeyFormat.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:WordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    ///start/*  w  ww  .  j  a v a 2 s.  co m*/
    final long startTime = System.currentTimeMillis();
    String outputReducerType = "filesystem";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    // use a smaller page size that doesn't divide the row count evenly to exercise the paging logic better
    ConfigHelper.setRangeBatchSize(getConf(), 99);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "userId";
        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
            job.getConfiguration().set(CONF_COLUMN_NAME, "sum");
        }
        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");

        //Change partitioner here
        ConfigHelper.setInputPartitioner(job.getConfiguration(), "RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);

        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        // this will cause the predicate to be ignored in favor of scanning everything as a wide row
        //Son degisiklik Super Column Support ?
        // ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true);

        ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setOutputPartitioner(job.getConfiguration(), "RandomPartitioner");
        job.waitForCompletion(true);
    }

    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println();
    System.out.println("Job Finished in " + duration + " seconds");
    System.out.println();

    return 0;
}

From source file:FindMaxPageRankNodes.java

License:Apache License

/**
 * Runs this tool./*from   w  w  w. jav a 2s  .c  o m*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("top n").create(TOP));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(TOP)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(TOP));

    LOG.info("Tool name: " + FindMaxPageRankNodes.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    LOG.info(" - top: " + n);

    Configuration conf = getConf();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setInt("n", n);

    Job job = Job.getInstance(conf);
    job.setJobName(FindMaxPageRankNodes.class.getName() + ":" + inputPath);
    job.setJarByClass(FindMaxPageRankNodes.class);

    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(FloatWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:GetUserInfoGivenMovieId.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("movieId", args[2]);
    Job job = new Job(conf, "GetUserInfoGivenMovieId");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setJarByClass(GetUserInfoGivenMovieId.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[3]));

    boolean flag1 = job.waitForCompletion(true);

    if (flag1) {/*from  ww w .ja v  a 2s.  co  m*/
        Configuration conf2 = new Configuration();
        //FileSystem fs = FileSystem.get(conf2);
        //Path Intermediate = new Path(args[1]);
        //DistributedCache.addCacheFile(Intermediate.toUri(), conf2);
        //DistributedCache.addCacheFile(new URI(args[1]),conf2);

        Job job2 = new Job(conf2, "UserInfo");
        /*Job job2 = new Job(new Configuration());
        Configuration conf2 = job.getConfiguration();
        job2.setJobName("Join with Cache");
        DistributedCache.addCacheFile(new URI(args[1]), conf2);*/
        job2.addCacheFile(new URI(args[1]));
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        job2.setJarByClass(GetUserInfoGivenMovieId.class);
        job2.setMapperClass(MapWithJoin.class);
        //job2.setCombinerClass(Reduce.class);
        job2.setReducerClass(ReduceFinal.class);

        job2.setInputFormatClass(TextInputFormat.class);
        job2.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job2, new Path(args[3]));
        FileOutputFormat.setOutputPath(job2, new Path(args[4]));
        job2.waitForCompletion(true);
    }
}

From source file:CrimenCasosTotales.java

public static void main(String args[]) throws Exception {

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "casostotales");
    job.setJarByClass(WordCount.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:DumpPageRankRecordsToPlainText.java

License:Apache License

/**
 * Runs this tool./*from w  w w  .j av  a 2  s  .c  o  m*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);

    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Configuration conf = new Configuration();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:MaleUsersBelow7Years.java

public static void main(String args[]) throws Exception {
    Configuration configuration = new Configuration();

    Job job = new Job(configuration, "CountMaleUsersLessThan7");
    job.setJarByClass(MaleUsersBelow7Years.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reducer.class);
    job.setCombinerClass(Reducer.class);

    //set output and input formats;mapper-input reducer-output
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0])); //path for input file
    FileOutputFormat.setOutputPath(job, new Path(args[1])); // Path for output file
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:WordCount.java

License:Apache License

public int run(String[] args) throws Exception {

    ///start// ww  w .jav  a  2  s  .c o m
    final long startTime = System.currentTimeMillis();
    String outputReducerType = "filesystem";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    // use a smaller page size that doesn't divide the row count evenly to exercise the paging logic better
    ConfigHelper.setRangeBatchSize(getConf(), 99);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "userId";
        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
            job.getConfiguration().set(CONF_COLUMN_NAME, "sum");
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setInputPartitioner(job.getConfiguration(), "RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);

        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        // this will cause the predicate to be ignored in favor of scanning everything as a wide row
        //Son degisiklik
        // ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true);
        //System.out.println("tessssssaaat");

        ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setOutputPartitioner(job.getConfiguration(), "RandomPartitioner");

        job.waitForCompletion(true);
    }
    //print
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0; // after
    System.out.println();
    System.out.println("Job Finished in " + duration + " seconds");
    System.out.println();

    return 0;
}

From source file:WordCount.java

License:Apache License

public int run(String[] args) throws Exception {

    ///start//from w w w  .  ja  v  a 2s .  c  o  m
    final long startTime = System.currentTimeMillis();
    String outputReducerType = "filesystem";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }

    logger.info("output reducer type: " + outputReducerType);

    // use a smaller page size that doesn't divide the row count evenly to exercise the paging logic better
    ConfigHelper.setRangeBatchSize(getConf(), 99);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "userId";

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);
            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
            job.getConfiguration().set(CONF_COLUMN_NAME, "sum");
        }
        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setInputPartitioner(job.getConfiguration(), "RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);

        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        // this will cause the predicate to be ignored in favor of scanning everything as a wide row          
        //Son degisiklik
        // ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true);
        //System.out.println("tessssssaaat");

        ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setOutputPartitioner(job.getConfiguration(), "RandomPartitioner");

        job.waitForCompletion(true);
    }

    //print
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0; // after
    System.out.println();
    System.out.println("Job Finished in " + duration + " seconds");
    System.out.println();

    return 0;
}

From source file:ExtractTopPersonalizedPageRankNodes.java

License:Apache License

/**
 * Runs this tool./*from  w w w  .jav  a 2  s  .  c o  m*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("top n").create(TOP));
    options.addOption(OptionBuilder.withArgName("src").hasArg().withDescription("source node").create(SRC));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(TOP)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = "abc";//cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(TOP));

    //LOG.info("Tool name: " + ExtractTopPersonalizedPageRankNodes.class.getSimpleName());
    //LOG.info(" - input: " + inputPath);
    //LOG.info(" - output: " + outputPath);
    //LOG.info(" - top: " + n);

    Configuration conf = getConf();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setInt(TOP_PG, n);

    Job job = Job.getInstance(conf);
    job.setJobName(ExtractTopPersonalizedPageRankNodes.class.getName() + ":" + inputPath);
    job.setJarByClass(ExtractTopPersonalizedPageRankNodes.class);

    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(PairOfIntFloat.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    FileSystem fileSystem = FileSystem.get(conf);
    Path path = new Path(outputPath + "/part-r-00000");
    ;
    //MapFile.Reader reader = new MapFile.Reader(new Path(outputPath+ "/part-r-00000"),conf);

    // InputStream fis=new FileInputStream(outputPath+"/part-r-00000");
    BufferedReader br = new BufferedReader(new InputStreamReader(fileSystem.open(path)));
    String s;
    float key;//=new FloatWritable();
    int value;//=new IntWritable();
    while ((s = br.readLine()) != null) {
        String[] sources = s.split("\\s+");
        key = Float.parseFloat(sources[0]);
        value = Integer.parseInt(sources[1]);
        if (key == 0.0f) {
            System.out.print("\n" + "Source: " + value + "\n");
        } else {
            System.out.print(String.format("%.5f %d", key, value) + "\n");
        }
    }
    //reader.close();
    br.close();

    //while(!SysOut.isEmpty())
    //{
    //   System.out.print(SysOut.poll());
    //}

    return 0;
}