Usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#setMinInputSplitSize
public static void setMinInputSplitSize(Job job, long size)
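setMinInputSplitSize sets the job's minimum input split size (the mapreduce.input.fileinputformat.split.minsize property). FileInputFormat computes each split as max(minSize, min(maxSize, blockSize)), so raising the minimum produces fewer, larger splits per file, and passing Long.MAX_VALUE (as in the TeraValidate examples below) effectively forces a single split per file. Before the collected examples, here is a minimal, self-contained sketch of the call in context; the class name, paths, and job name are illustrative assumptions, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinSplitSizeDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "min-split-size-demo");
        job.setJarByClass(MinSplitSizeDemo.class);
        job.setInputFormatClass(TextInputFormat.class);
        // Hypothetical input and output paths, for illustration only.
        FileInputFormat.setInputPaths(job, new Path("/user/demo/input"));
        FileOutputFormat.setOutputPath(job, new Path("/user/demo/output"));
        // Floor every split at 128 MB: each file is cut into fewer, larger
        // splits, so the job launches fewer map tasks.
        FileInputFormat.setMinInputSplitSize(job, 128L * 1024 * 1024);
        // Identity map/reduce defaults apply, so the job runs as-is.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The companion setMaxInputSplitSize bounds splits from above; several examples below set both to the same value to pin the split size exactly.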
From source file:bb.BranchAndBound.java
License:Apache License
public static void main(String[] args) throws Exception {
    /*
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: branchandbound <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "branch and bound");
    job.setJarByClass(BranchAndBound.class);
    job.setMapperClass(BBMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    // job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
    */
    int n;
    String[] inputargs = new GenericOptionsParser(new Configuration(), args).getRemainingArgs();
    if (inputargs.length != 2) {
        System.err.println("Usage: branchandbound <data directory> <n>");
        System.exit(2);
    }
    n = Integer.parseInt(inputargs[1]);
    String dataDir = inputargs[0];
    String prev_output = dataDir + "/input";
    /*
    for (int i = 1; i <= n; i++) {
        for (int j = 0; j < 2; j++) {
            String input = prev_output;
            String output = inputargs[1] + "/iteration" + i + "_" + j;
            Job job = getJob(input, output, i, j);
            job.waitForCompletion(true); // if failed ????
            prev_output = output;
        }
    }
    */
    //prev_output = dataDir + "/output" + "/iteration" + 17;
    long totalNodes = 0;
    long searchedNodes = 0;
    long cutbyDEE = 0;
    int mapTotal = 768;
    for (int i = 0; i <= n; i++) {
        iterRound = i;
        String input = prev_output;
        String output = dataDir + "/output" + "/iteration" + i;
        Job job = getJob(input, output, dataDir, i);
        if (i == n) {
            numReduceTasks = 1;
        }
        //job.setNumMapTasks(200);
        // (8 * n + 10) appears to be the size of one record in bytes; the split
        // bounds scale with numOutput, the map-output count of the previous round.
        if (numOutput > mapTotal) {
            FileInputFormat.setMaxInputSplitSize(job,
                    10 * (8 * n + 10) + numOutput * (8 * n + 10) / 3000);
            FileInputFormat.setMinInputSplitSize(job,
                    Math.max((8 * n + 10), numOutput * (8 * n + 10) / 5000));
        } else {
            FileInputFormat.setMaxInputSplitSize(job, (8 * n + 10));
        }
        /*
        if (i == 0) {
            job.setNumReduceTasks(1);
        } else {
            job.setNumReduceTasks(0);
        }
        */
        job.setNumReduceTasks(0);
        job.waitForCompletion(true); // if failed ????
        prev_output = output;
        Counters counters = job.getCounters();
        Counter counter = counters.findCounter("MyCounter", "Map Output Counter");
        numOutput = counter.getValue();
        totalNodes += numOutput;
        cutbyDEE += counters.findCounter("MyCounter", "Cut By DEE").getValue();
        searchedNodes += totalNodes + cutbyDEE
                + counters.findCounter("MyCounter", "Cut By Bound").getValue();
        System.out.println(numOutput + " " + (8 * n + 10) + " " + (numOutput * (8 * n + 10) / 768));
    }
    System.out.println("searchedNodes " + searchedNodes);
    System.out.println(totalNodes);
    System.out.println("cut by dee " + cutbyDEE);
}
From source file:bigmodel.AutoCoderLocal.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT) + "/part-r-00000";
    String outputPath = cmdline.getOptionValue(OUTPUT);
    String dataPath = cmdline.getOptionValue(INPUT) + "/common";
    //String inputPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/part*";
    //String outputPath = "output";
    //String dataPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/common";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoderLocal.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);
    conf.set("dataPath", dataPath);

    Job job = Job.getInstance(conf);
    job.setJobName(AutoCoderLocal.class.getSimpleName());
    job.setJarByClass(AutoCoderLocal.class);
    // set the path of the information of k clusters in this iteration
    job.getConfiguration().set("sidepath", inputPath + "/side_output");
    job.setNumReduceTasks(reduceTasks);

    dataShuffle();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    // Pin the split size at roughly 1 GB (min == max).
    FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024);
    FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ModelNode.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SuperModel.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);
    return 0;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraValidate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    Job job = Job.getInstance(getConf());
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
    } else {
        TeraInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(TeraInputFormat.class);
    }
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.google.cloud.bigtable.dataflowimport.HadoopFileSource.java
License:Apache License
private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
        throws IOException, IllegalAccessException, InstantiationException {
    Job job = Job.getInstance(getDeserializerConfiguration());
    // Pin Hadoop's split size to the desired Dataflow bundle size (min == max).
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
}
From source file:com.google.cloud.dataflow.contrib.hadoop.HadoopFileSource.java
License:Apache License
private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
        throws IOException, IllegalAccessException, InstantiationException {
    Job job = Job.getInstance();
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
}
From source file:com.phantom.hadoop.examples.terasort.TeraValidate.java
License:Apache License
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 1;
    }
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    job.setInputFormatClass(TeraInputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:info.halo9pan.word2vec.hadoop.terasort.TeraValidate.java
License:Apache License
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 1;
    }
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
    job.setInputFormatClass(TeraInputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:model.AutoCoder.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT) + "/part*";
    String outputPath = cmdline.getOptionValue(OUTPUT);
    //String inputPath = "mingled_v2/part*";
    //String outputPath = "output";
    String dataPath = cmdline.getOptionValue(INPUT) + "/common";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);
    conf.set("dataPath", dataPath);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(conf);
    job.setJobName(AutoCoder.class.getSimpleName());
    job.setJarByClass(AutoCoder.class);
    // set the path of the information of k clusters in this iteration
    job.getConfiguration().set("sidepath", inputPath + "/side_output");
    job.setNumReduceTasks(reduceTasks);

    dataShuffle();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    // Pin the split size at roughly 1 GB (max == min).
    FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024);
    FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ModelNode.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SuperModel.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);
    return 0;
}
From source file:org.apache.beam.sdk.io.hdfs.HDFSFileSource.java
License:Apache License
private List<InputSplit> computeSplits(long desiredBundleSizeBytes,
        SerializableConfiguration serializableConfiguration)
        throws IOException, IllegalAccessException, InstantiationException {
    Job job = SerializableConfiguration.newJob(serializableConfiguration);
    FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes);
    FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes);
    return createFormat(job).getSplits(job);
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopPopularWordsTest.java
License:Apache License
/**
 * Configures the Hadoop MapReduce job.
 *
 * @return Instance of the Hadoop MapRed job.
 * @throws IOException If failed.
 */
private Job createConfigBasedHadoopJob() throws IOException {
    Job jobCfg = new Job();
    Configuration cfg = jobCfg.getConfiguration();

    // Use explicit configuration of distributed file system, if provided.
    if (DFS_CFG != null)
        cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

    jobCfg.setJobName("HadoopPopularWordExample");
    jobCfg.setJarByClass(GridHadoopPopularWordsTest.class);
    jobCfg.setInputFormatClass(TextInputFormat.class);
    jobCfg.setOutputKeyClass(Text.class);
    jobCfg.setOutputValueClass(IntWritable.class);
    jobCfg.setMapperClass(TokenizingMapper.class);
    jobCfg.setReducerClass(TopNWordsReducer.class);

    FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
    FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

    // The local job tracker allows only one task per wave, but the text input
    // format replaces that with a value calculated from the split size options.
    if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
        // Split the job into tasks using a 32 MB minimum split size.
        FileInputFormat.setMinInputSplitSize(jobCfg, 32 * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
    }

    return jobCfg;
}