List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMaxInputSplitSize
public static void setMaxInputSplitSize(Job job, long size)
From source file:bb.BranchAndBound.java
License:Apache License
public static void main(String[] args) throws Exception { /*Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) {/*from w ww. j a va 2 s . c om*/ System.err.println("Usage: branchandbound <input> <output>"); System.exit(2); } Job job = new Job(conf, "branch and bound"); job.setJarByClass(BranchAndBound.class); job.setMapperClass(BBMapper.class); // job.setCombinerClass(IntSumReducer.class); // job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1);*/ int n; String[] inputargs = new GenericOptionsParser(new Configuration(), args).getRemainingArgs(); if (inputargs.length != 2) { System.err.println("Usage: branchandbound <data directory> <n>"); System.exit(2); } n = Integer.parseInt(inputargs[1]); String dataDir = inputargs[0]; String prev_output = dataDir + "/input"; /* for( int i = 1 ; i <= n ; i++ ) { for( int j = 0 ; j < 2 ; j++ ) { String input = prev_output ; String output = inputargs[1] + "/iteration" + i + "_" + j ; Job job = getJob(input, output, i, j) ; job.waitForCompletion(true) ; // if failed ???? 
prev_output = output; } } */ //prev_output = dataDir + "/output" + "/iteration" + 17; long totalNodes = 0; long searchedNodes = 0; long cutbyDEE = 0; int mapTotal = 768; for (int i = 0; i <= n; i++) { iterRound = i; String input = prev_output; String output = dataDir + "/output" + "/iteration" + i; Job job = getJob(input, output, dataDir, i); if (i == n) { numReduceTasks = 1; } //job.setNumMapTasks(200); if (numOutput > mapTotal) { FileInputFormat.setMaxInputSplitSize(job, 10 * (8 * n + 10) + numOutput * (8 * n + 10) / 3000); FileInputFormat.setMinInputSplitSize(job, Math.max((8 * n + 10), numOutput * (8 * n + 10) / 5000)); } else { FileInputFormat.setMaxInputSplitSize(job, (8 * n + 10)); } /* if( i == 0 ) { job.setNumReduceTasks(1); } else { job.setNumReduceTasks(0); } */ job.setNumReduceTasks(0); job.waitForCompletion(true); // if failed ???? prev_output = output; Counters counters = job.getCounters(); Counter counter = counters.findCounter("MyCounter", "Map Output Counter"); numOutput = counter.getValue(); totalNodes += numOutput; cutbyDEE += counters.findCounter("MyCounter", "Cut By DEE").getValue(); searchedNodes += totalNodes + cutbyDEE + counters.findCounter("MyCounter", "Cut By Bound").getValue(); System.out.println(numOutput + " " + (8 * n + 10) + " " + (numOutput * (8 * n + 10) / 768)); } System.out.println("searchedNodes " + searchedNodes); System.out.println(totalNodes); System.out.println("cut by dee " + cutbyDEE); }
From source file:bigmodel.AutoCoderLocal.java
License:Apache License
/** * Runs this tool.// w ww . j ava2 s. c o m */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT) + "/part-r-00000"; String outputPath = cmdline.getOptionValue(OUTPUT); String dataPath = cmdline.getOptionValue(INPUT) + "/common"; //String inputPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/part*"; //String outputPath = "output"; //String dataPath = "/home/qiwang321/mapreduce-data/data/in-mingled1-5/common"; int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? 
Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; LOG.info("Tool: " + AutoCoderLocal.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); Configuration conf = getConf(); initialParameters(conf); conf.set("dataPath", dataPath); Job job = Job.getInstance(conf); job.setJobName(AutoCoderLocal.class.getSimpleName()); job.setJarByClass(AutoCoderLocal.class); // set the path of the information of k clusters in this iteration job.getConfiguration().set("sidepath", inputPath + "/side_output"); job.setNumReduceTasks(reduceTasks); dataShuffle(); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024); FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(ModelNode.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(SuperModel.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setPartitionerClass(MyPartitioner.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks); return 0; }
From source file:co.cask.hydrator.plugin.batch.source.FileBatchSource.java
License:Apache License
@Override public void prepareRun(BatchSourceContext context) throws Exception { //SimpleDateFormat needs to be local because it is not threadsafe SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH"); //calculate date one hour ago, rounded down to the nearest hour prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1)); Calendar cal = Calendar.getInstance(); cal.setTime(prevHour);/* w w w . j a v a2 s . co m*/ cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); cal.set(Calendar.MILLISECOND, 0); prevHour = cal.getTime(); Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE); //noinspection ConstantConditions for (Map.Entry<String, String> entry : properties.entrySet()) { conf.set(entry.getKey(), entry.getValue()); } conf.set(INPUT_REGEX_CONFIG, config.fileRegex); conf.set(INPUT_NAME_CONFIG, config.path); if (config.timeTable != null) { table = context.getDataset(config.timeTable); datesToRead = Bytes.toString(table.read(LAST_TIME_READ)); if (datesToRead == null) { List<Date> firstRun = Lists.newArrayList(new Date(0)); datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE); } List<Date> attempted = Lists.newArrayList(prevHour); String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE); if (!updatedDatesToRead.equals(datesToRead)) { table.write(LAST_TIME_READ, updatedDatesToRead); } conf.set(LAST_TIME_READ, datesToRead); } conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour)); FileInputFormat.setInputPathFilter(job, BatchFileFilter.class); FileInputFormat.addInputPath(job, new Path(config.path)); if (config.maxSplitSize != null) { FileInputFormat.setMaxInputSplitSize(job, config.maxSplitSize); } context.setInput( Input.of(config.referenceName, new SourceInputFormatProvider(config.inputFormatClass, conf))); }
From source file:com.asp.tranlog.ImportTsv.java
License:Apache License
/** * Sets up the actual job.//from w ww .jav a 2s . com * * @param conf * The current configuration. * @param args * The command line parameters. * @return The newly created job. * @throws IOException * When setting up the job fails. */ public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException, ClassNotFoundException { // Support non-XML supported characters // by re-encoding the passed separator as a Base64 string. String actualSeparator = conf.get(SEPARATOR_CONF_KEY); if (actualSeparator != null) { conf.set(SEPARATOR_CONF_KEY, new String(Base64.encodeBytes(actualSeparator.getBytes()))); } // See if a non-default Mapper was set String mapperClassName = conf.get(MAPPER_CONF_KEY); Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER; String tableName = args[0]; Path inputDir = new Path(args[1]); Job job = new Job(conf, NAME + "_" + tableName); job.setJarByClass(mapperClass); FileInputFormat.setInputPaths(job, inputDir); String inputCodec = conf.get(INPUT_LZO_KEY); if (inputCodec == null) { FileInputFormat.setMaxInputSplitSize(job, 67108864l); // max split // size = // 64m job.setInputFormatClass(TextInputFormat.class); } else { if (inputCodec.equalsIgnoreCase("lzo")) job.setInputFormatClass(LzoTextInputFormat.class); else { usage("not supported compression codec!"); System.exit(-1); } } job.setMapperClass(mapperClass); String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY); if (hfileOutPath != null) { HTable table = new HTable(conf, tableName); job.setReducerClass(PutSortReducer.class); Path outputDir = new Path(hfileOutPath); FileOutputFormat.setOutputPath(job, outputDir); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(Put.class); HFileOutputFormat.configureIncrementalLoad(job, table); } else { // No reducers. Just write straight to table. Call // initTableReducerJob // to set up the TableOutputFormat. 
TableMapReduceUtil.initTableReducerJob(tableName, null, job); job.setNumReduceTasks(0); } TableMapReduceUtil.addDependencyJars(job); TableMapReduceUtil.addDependencyJars(job.getConfiguration(), com.google.common.base.Function.class /* * Guava used by TsvParser */); return job; }
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/** * Creates a new {@link MRPipeline} instance that contains common configuration * settings./*from ww w . j ava2 s .co m*/ * * @return a new {@link MRPipeline} instance, suitably configured */ protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException { Configuration conf = OryxConfiguration.get(getConf()); conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true); conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class); conf.setBoolean("mapred.output.compress", true); conf.set("mapred.output.compression.type", "BLOCK"); conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class); // Set old-style equivalents for Avro/Crunch's benefit conf.set("avro.output.codec", "snappy"); conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true); conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true); conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true); conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1); //conf.setBoolean("crunch.disable.deep.copy", true); // Giving one mapper a lot of data can cause issues in some stages, so default to disable this conf.setBoolean("crunch.disable.combine.file", true); Config appConfig = ConfigUtils.getDefaultConfig(); conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir")); int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb"); log.info("Mapper memory: {}", mapMemoryMB); int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB); if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) { log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS)); } conf.set(MRJobConfig.MAP_JAVA_OPTS, "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC"); log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS)); // 
See comment below on CM conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB); int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb"); log.info("Reducer memory: {}", reduceMemoryMB); if (isHighMemoryStep()) { reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor"); log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB); } conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB); int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB); if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) { log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS)); } conf.set(MRJobConfig.REDUCE_JAVA_OPTS, "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC"); log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS)); // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in // -Xmx appended to opts above, which is at worst redundant conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB); conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128); conf.setInt("yarn.app.mapreduce.am.resource.mb", 384); // Pass total config state conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render()); // Make sure to set any args to conf above this line! 
setConf(conf); Job job = Job.getInstance(conf); // Basic File IO settings FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); log.info("Created pipeline configuration {}", job.getConfiguration()); return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration()); }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java
License:Apache License
public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException, IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows); FileSystem fS = FileSystem.get(getConf()); Random r = new Random(1); Schema schema = new Schema("schema", Fields.parse("i:int,s:string")); ITuple tuple = new Tuple(schema); Path outPath = new Path(OUT); TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema); for (int i = 0; i < generatedRows; i++) { tuple.set("i", r.nextInt()); tuple.set("s", r.nextLong() + ""); writer.append(tuple);/*from w ww . j ava2 s . c o m*/ } writer.close(); TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf()); Job job = new Job(getConf()); FileInputFormat.setInputPaths(job, outPath); logger.info("Using max input split size: " + maxSplitSize); FileInputFormat.setMaxInputSplitSize(job, maxSplitSize); job.setInputFormatClass(FileInputFormat.class); // Read all the splits and count. The number of read rows must // be the same than the written ones. int count = 0; for (InputSplit split : format.getSplits(job)) { TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1); TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId); logger.info("Sampling split: " + split); RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext); reader.initialize(split, attemptContext); while (reader.nextKeyValue()) { tuple = reader.getCurrentKey(); count++; } reader.close(); } assertEquals(generatedRows, count); HadoopUtils.deleteIfExists(fS, outPath); }
From source file:com.google.cloud.bigtable.dataflowimport.HadoopFileSource.java
License:Apache License
/**
 * Computes the input splits for a throwaway job whose minimum and maximum split sizes are
 * both set to the requested bundle size.
 *
 * @param desiredBundleSizeBytes value applied as both the minimum and the maximum split size
 * @return the splits reported by the input format created for that job
 */
private List<InputSplit> computeSplits(long desiredBundleSizeBytes)
        throws IOException, IllegalAccessException, InstantiationException {
    final Job splitJob = Job.getInstance(getDeserializerConfiguration());

    // Identical lower and upper bounds pin the split size to the desired bundle size.
    FileInputFormat.setMaxInputSplitSize(splitJob, desiredBundleSizeBytes);
    FileInputFormat.setMinInputSplitSize(splitJob, desiredBundleSizeBytes);

    List<InputSplit> splits = createFormat(splitJob).getSplits(splitJob);
    return splits;
}
From source file:com.google.cloud.dataflow.contrib.hadoop.HadoopFileSource.java
License:Apache License
private List<InputSplit> computeSplits(long desiredBundleSizeBytes) throws IOException, IllegalAccessException, InstantiationException { Job job = Job.getInstance();//from w w w.ja v a 2 s . com FileInputFormat.setMinInputSplitSize(job, desiredBundleSizeBytes); FileInputFormat.setMaxInputSplitSize(job, desiredBundleSizeBytes); return createFormat(job).getSplits(job); }
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
private int runMapReduce(Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException { int chunkSizeInMB = 64; if (hasOption(CHUNK_SIZE_OPTION[0])) { chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0])); }/*w w w.j a v a2s . c om*/ String keyPrefix = null; if (hasOption(KEY_PREFIX_OPTION[0])) { keyPrefix = getOption(KEY_PREFIX_OPTION[0]); } // Prepare Job for submission. Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromDirectoryMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromDirectory"); Configuration jobConfig = job.getConfiguration(); jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix); FileSystem fs = FileSystem.get(jobConfig); FileStatus fsFileStatus = fs.getFileStatus(input); String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus); jobConfig.set(BASE_INPUT_PATH, input.toString()); long chunkSizeInBytes = chunkSizeInMB * 1024 * 1024; // set the max split locations, otherwise we get nasty debug stuff jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS)); FileInputFormat.setInputPaths(job, inputDirList); // need to set this to a multiple of the block size, or no split happens FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes); FileOutputFormat.setCompressOutput(job, true); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { return -1; } return 0; }
From source file:com.phantom.hadoop.examples.dancing.DistributedPentomino.java
License:Apache License
public int run(String[] args) throws Exception { Configuration conf = getConf(); if (args.length == 0) { System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]"); ToolRunner.printGenericCommandUsage(System.out); return 2; }/*w w w . j av a 2 s.c om*/ // check for passed parameters, otherwise use defaults int width = conf.getInt(Pentomino.WIDTH, PENT_WIDTH); int height = conf.getInt(Pentomino.HEIGHT, PENT_HEIGHT); int depth = conf.getInt(Pentomino.DEPTH, PENT_DEPTH); for (int i = 0; i < args.length; i++) { if (args[i].equalsIgnoreCase("-depth")) { depth = Integer.parseInt(args[++i].trim()); } else if (args[i].equalsIgnoreCase("-height")) { height = Integer.parseInt(args[++i].trim()); } else if (args[i].equalsIgnoreCase("-width")) { width = Integer.parseInt(args[++i].trim()); } } // now set the values within conf for M/R tasks to read, this // will ensure values are set preventing MAPREDUCE-4678 conf.setInt(Pentomino.WIDTH, width); conf.setInt(Pentomino.HEIGHT, height); conf.setInt(Pentomino.DEPTH, depth); Class<? 
extends Pentomino> pentClass = conf.getClass(Pentomino.CLASS, OneSidedPentomino.class, Pentomino.class); int numMaps = conf.getInt(MRJobConfig.NUM_MAPS, DEFAULT_MAPS); Path output = new Path(args[0]); Path input = new Path(output + "_input"); FileSystem fileSys = FileSystem.get(conf); try { Job job = new Job(conf); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(PentMap.class); job.setJobName("dancingElephant"); Pentomino pent = ReflectionUtils.newInstance(pentClass, conf); pent.initialize(width, height); long inputSize = createInputDirectory(fileSys, input, pent, depth); // for forcing the number of maps FileInputFormat.setMaxInputSplitSize(job, (inputSize / numMaps)); // the keys are the prefix strings job.setOutputKeyClass(Text.class); // the values are puzzle solutions job.setOutputValueClass(Text.class); job.setMapperClass(PentMap.class); job.setReducerClass(Reducer.class); job.setNumReduceTasks(1); return (job.waitForCompletion(true) ? 0 : 1); } finally { fileSys.delete(input, true); } }