Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat setMinInputSplitSize

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize.

Prototype

public static void setMinInputSplitSize(Job job, long size) 

Document

Set the minimum input split size.
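
As a quick, self-contained sketch of the call in context (the class name, argument paths, and the 64 MB/256 MB bounds below are illustrative assumptions, not values taken from the sources that follow), a map-only driver might bound its split sizes like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SplitSizeExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "split size example");
        job.setJarByClass(SplitSizeExample.class);

        // Identity mapper, map-only job: output mirrors the input records.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Keep every split between 64 MB and 256 MB (sizes are in bytes).
        FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}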

Usage

From source file: bb.BranchAndBound.java

License: Apache License

public static void main(String[] args) throws Exception {
    /*Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
       System.err.println("Usage: branchandbound <input> <output>");
       System.exit(2);
    }
    Job job = new Job(conf, "branch and bound");
    job.setJarByClass(BranchAndBound.class);
    job.setMapperClass(BBMapper.class);
    //      job.setCombinerClass(IntSumReducer.class);
    //      job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);*/
    int n;
    String[] inputargs = new GenericOptionsParser(new Configuration(), args).getRemainingArgs();
    if (inputargs.length != 2) {
        System.err.println("Usage: branchandbound <data directory> <n>");
        System.exit(2);
    }
    n = Integer.parseInt(inputargs[1]);
    String dataDir = inputargs[0];
    String prev_output = dataDir + "/input";
    /*
    for (int i = 1; i <= n; i++) {
        for (int j = 0; j < 2; j++) {
            String input = prev_output;
            String output = inputargs[1] + "/iteration" + i + "_" + j;
            Job job = getJob(input, output, i, j);
            job.waitForCompletion(true); // if failed ????
            prev_output = output;
        }
    }
    */
    //prev_output = dataDir + "/output" + "/iteration" + 17;
    long totalNodes = 0;
    long searchedNodes = 0;
    long cutbyDEE = 0;
    int mapTotal = 768;
    for (int i = 0; i <= n; i++) {
        iterRound = i;
        String input = prev_output;
        String output = dataDir + "/output" + "/iteration" + i;
        Job job = getJob(input, output, dataDir, i);
        if (i == n) {
            numReduceTasks = 1;
        }
        //job.setNumMapTasks(200);
        if (numOutput > mapTotal) {
            FileInputFormat.setMaxInputSplitSize(job, 10 * (8 * n + 10) + numOutput * (8 * n + 10) / 3000);
            FileInputFormat.setMinInputSplitSize(job, Math.max((8 * n + 10), numOutput * (8 * n + 10) / 5000));
        } else {
            FileInputFormat.setMaxInputSplitSize(job, (8 * n + 10));
        }
        /*
        if( i == 0 ) {
        job.setNumReduceTasks(1);
        } else {
        job.setNumReduceTasks(0);
        }
         */
        job.setNumReduceTasks(0);
        job.waitForCompletion(true); // TODO: check the return value and handle a failed iteration
        prev_output = output;
        Counters counters = job.getCounters();
        Counter counter = counters.findCounter("MyCounter", "Map Output Counter");
        numOutput = counter.getValue();
        totalNodes += numOutput;
        cutbyDEE += counters.findCounter("MyCounter", "Cut By DEE").getValue();
        searchedNodes += totalNodes + cutbyDEE + counters.findCounter("MyCounter", "Cut By Bound").getValue();
        System.out.println(numOutput + " " + (8 * n + 10) + " " + (numOutput * (8 * n + 10) / 768));
    }
    System.out.println("searchedNodes " + searchedNodes);
    System.out.println("totalNodes " + totalNodes);
    System.out.println("cut by dee " + cutbyDEE);
}
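
Note the dynamic sizing in this driver: 8 * n + 10 appears to be the per-record size in bytes (it is the quantity scaled and printed throughout), and the previous iteration's map-output counter (numOutput) rescales the split bounds each round, presumably to hold the number of map tasks near the mapTotal = 768 target as the search tree grows.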

From source file: bigmodel.AutoCoderLocal.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT) + "/part-r-00000";
    String outputPath = cmdline.getOptionValue(OUTPUT);
    String dataPath = cmdline.getOptionValue(INPUT) + "/common";
    //String inputPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/part*";
    //String outputPath = "output";
    //String dataPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/common";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoderLocal.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();
    initialParameters(conf);

    conf.set("dataPath", dataPath);

    Job job = Job.getInstance(conf);
    job.setJobName(AutoCoderLocal.class.getSimpleName());
    job.setJarByClass(AutoCoderLocal.class);
    // set the path of the information of k clusters in this iteration
    job.getConfiguration().set("sidepath", inputPath + "/side_output");
    job.setNumReduceTasks(reduceTasks);

    dataShuffle();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024);
    FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ModelNode.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SuperModel.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);

    return 0;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraValidate.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    Job job = Job.getInstance(getConf());
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}
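
The Long.MAX_VALUE trick above is the standard idiom for suppressing splitting: FileInputFormat never produces a split smaller than the configured minimum, so each input file is handed to a single mapper, which, combined with the single reducer, lets the validation observe record order across the whole input.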

From source file: com.google.cloud.bigtable.dataflowimport.HadoopFileSource.java

License: Apache License

private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
        throws IOException, IllegalAccessException, InstantiationException {
    Job job = Job.getInstance(getDeserializerConfiguration());
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
}
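
Pinning the minimum and maximum split sizes to the same desiredBundleSizeBytes steers Hadoop's split computation toward splits of exactly the requested bundle size (subject to file boundaries); the Dataflow contrib and Beam HDFS sources below use the identical pattern.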

From source file: com.google.cloud.dataflow.contrib.hadoop.HadoopFileSource.java

License: Apache License

private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
        throws IOException, IllegalAccessException, InstantiationException {
    Job job = Job.getInstance();
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
}

From source file: com.phantom.hadoop.examples.terasort.TeraValidate.java

License: Apache License

public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 1;
    }
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    job.setInputFormatClass(TeraInputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: info.halo9pan.word2vec.hadoop.terasort.TeraValidate.java

License: Apache License

public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 1;
    }
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split 
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    job.setInputFormatClass(TeraInputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: model.AutoCoder.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT) + "/part*";
    String outputPath = cmdline.getOptionValue(OUTPUT);
    //String inputPath = "mingled_v2/part*";
    //String outputPath = "output";
    String dataPath = cmdline.getOptionValue(INPUT) + "/common";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();
    initialParameters(conf);

    conf.set("dataPath", dataPath);

    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(conf);
    job.setJobName(AutoCoder.class.getSimpleName());
    job.setJarByClass(AutoCoder.class);
    // set the path of the information of k clusters in this iteration
    job.getConfiguration().set("sidepath", inputPath + "/side_output");
    job.setNumReduceTasks(reduceTasks);

    dataShuffle();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024);
    FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ModelNode.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SuperModel.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);

    return 0;
}

From source file: org.apache.beam.sdk.io.hdfs.HDFSFileSource.java

License: Apache License

private List<InputSplit> computeSplits(long desiredBundleSizeBytes,
        SerializableConfiguration serializableConfiguration)
        throws IOException, IllegalAccessException, InstantiationException {
    Job job = SerializableConfiguration.newJob(serializableConfiguration);
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
}

From source file: org.apache.ignite.internal.processors.hadoop.GridHadoopPopularWordsTest.java

License: Apache License

/**
 * Configures the Hadoop MapReduce job.
 *
 * @return Instance of the Hadoop MapRed job.
 * @throws IOException If failed.
 */
private Job createConfigBasedHadoopJob() throws IOException {
    Job jobCfg = new Job();

    Configuration cfg = jobCfg.getConfiguration();

    // Use explicit configuration of distributed file system, if provided.
    if (DFS_CFG != null)
        cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

    jobCfg.setJobName("HadoopPopularWordExample");
    jobCfg.setJarByClass(GridHadoopPopularWordsTest.class);
    jobCfg.setInputFormatClass(TextInputFormat.class);
    jobCfg.setOutputKeyClass(Text.class);
    jobCfg.setOutputValueClass(IntWritable.class);
    jobCfg.setMapperClass(TokenizingMapper.class);
    jobCfg.setReducerClass(TopNWordsReducer.class);

    FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
    FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

    // The local job tracker allows only one task per wave, but the text input
    // format overrides this with a value calculated from the split size options.
    if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
        // Split job into tasks using 32MB split size.
        FileInputFormat.setMinInputSplitSize(jobCfg, 32 * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
    }

    return jobCfg;
}