Usage examples for org.apache.hadoop.mapred.JobConf#getNumMapTasks()
public int getNumMapTasks()
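getNumMapTasks() returns the configured map-task count for a job (the mapred.map.tasks setting, 1 if unset). In most of the examples below it is passed as the numSplits hint to InputFormat.getSplits(JobConf, int). The following is a minimal sketch of that pattern, not taken from any of the examples; the class name, input path, and choice of TextInputFormat are placeholders for illustration:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class NumMapTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        conf.setNumMapTasks(4); // stores the mapred.map.tasks hint that getNumMapTasks() reads back
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.addInputPath(conf, new Path("/tmp/input")); // placeholder path

        // The configured value is only a hint: the InputFormat decides the
        // actual number of splits (and therefore map tasks).
        InputSplit[] splits = conf.getInputFormat().getSplits(conf, conf.getNumMapTasks());
        System.out.println("hint=" + conf.getNumMapTasks() + ", actual splits=" + splits.length);
    }
}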
From source file: cascading.flow.hadoop.HadoopStepStats.java
License: Open Source License
public void captureJobStats() {
    RunningJob runningJob = getRunningJob();

    if (runningJob == null)
        return;

    JobConf ranJob = new JobConf(runningJob.getJobFile());

    setNumMapTasks(ranJob.getNumMapTasks());
    setNumReducerTasks(ranJob.getNumReduceTasks());
}
From source file: com.aerospike.hadoop.mapreduce.AerospikeInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobContext context) throws IOException {
    // Delegate to the old API.
    Configuration cfg = context.getConfiguration();
    JobConf jobconf = AerospikeConfigUtil.asJobConf(cfg);
    return Arrays.asList((InputSplit[]) getSplits(jobconf, jobconf.getNumMapTasks()));
}
From source file: com.benchmark.mapred.terasort.TeraInputFormat.java
License: Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    long records = 0;

    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        while (reader.next(key, value)) {
            sampler.addKey(key);
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
From source file: com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java
License: Apache License
@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following codes are copied from {@link InputSampler#RandomSampler},
    // but require some modifications.

    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());
        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not mobius delegating input format, so the CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader, we set them here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());
        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;

        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so we only read a bounded number of records from it.
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input.
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing
                    // elements being pushed out
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }

                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}
From source file: com.ibm.bi.dml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License: Open Source License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();

    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
            WritableComparable.class);

    //get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

    //indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();

    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;

    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; ++i) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
                .getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
                            + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }

    if (totalcount == 0) //empty input files
        sampler.addValue(new DoubleWritable(0));

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    //note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    int index0 = -1, i = 0;
    boolean lessthan0 = true;
    for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
        writer.append(splitValue, nullValue);
        if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
            index0 = i;
            lessthan0 = false;
        }
        i++;
    }
    if (lessthan0)
        index0 = partitions - 1;
    writer.close();

    return index0;
}
From source file: com.ibm.jaql.lang.expr.io.InputSplitsFn.java
License: Apache License
@Override
public JsonIterator iter(Context context) throws Exception {
    JsonValue iod = exprs[0].eval(context);

    Adapter adapter = JaqlUtil.getAdapterStore().input.getAdapter(iod);
    if (!(adapter instanceof HadoopInputAdapter)) {
        throw new ClassCastException("i/o descriptor must be for an input format");
    }
    HadoopInputAdapter hia = (HadoopInputAdapter) adapter;
    JobConf conf = new JobConf(); // TODO: allow configuration
    hia.setParallel(conf); // right thing to do?
    hia.configure(conf); // right thing to do?
    int numSplits = conf.getNumMapTasks(); // TODO: allow override
    final InputSplit[] splits = hia.getSplits(conf, numSplits);
    final MutableJsonString className = new MutableJsonString();
    final MutableJsonBinary rawSplit = new MutableJsonBinary();
    final BufferedJsonRecord rec = new BufferedJsonRecord(3);
    final BufferedJsonArray locArr = new BufferedJsonArray();
    rec.add(CLASS_TAG, className);
    rec.add(SPLIT_TAG, rawSplit);
    rec.add(LOCATIONS_TAG, locArr);

    return new JsonIterator(rec) {
        DataOutputBuffer out = new DataOutputBuffer();
        int i = 0;

        @Override
        public boolean moveNext() throws Exception {
            if (i >= splits.length) {
                return false;
            }
            InputSplit split = splits[i++];
            className.setCopy(split.getClass().getName());
            out.reset();
            split.write(out);
            rawSplit.setCopy(out.getData(), out.getLength());
            locArr.clear();
            String[] locs = split.getLocations();
            if (locs != null) {
                for (String loc : locs) {
                    locArr.add(new JsonString(loc));
                }
            }
            return true;
        }
    };
}
From source file: com.scaleoutsoftware.soss.hserver.JobScheduler.java
License: Apache License
/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job the job to run
 * @param jobId the id of the job
 * @param sortEnabled if key sorting is enabled
 * @param jobParameter user defined parameter object for the job
 * @param grid the invocation grid to run the job
 * @throws IOException if errors occurred during the job
 * @throws InterruptedException if the processing thread is interrupted
 * @throws ClassNotFoundException if the invocation grid does not contain the dependency class
 */
@SuppressWarnings("unchecked")
public void runOldApiJob(JobConf job, org.apache.hadoop.mapred.JobID jobId, boolean sortEnabled,
        Object jobParameter, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    //Initialize user credential in advance
    int jobAppId = 0xFFFFFFF & BitConverter.hashStringOneInt(jobId.toString());
    String hadoopVersion = VersionInfo.getVersion();
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);

    try {
        //Check output specs before running the job
        job.getOutputFormat().checkOutputSpecs(FileSystem.get(job), job);

        JobContext jContext = HadoopVersionSpecificCode.getInstance(hadoopVersion, job).createJobContext(job,
                jobId);

        org.apache.hadoop.mapred.OutputCommitter outputCommitter = job.getOutputCommitter();
        outputCommitter.setupJob(jContext);

        //clear all temporary objects
        DataAccessor.clearObjects(jobAppId);

        //Calculating the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition = com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping
                .getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());

        //Generating mapping of Hadoop partitions to SOSS partitions, so they are equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math
                .max(grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);

        //Set the number of splits to the number of cores
        if (NamedMapInputFormatMapred.class.isAssignableFrom(job.getInputFormat().getClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job) * numHosts
                    * numberOfSlotsPerNode;
            job.setNumMapTasks(Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }

        //Generating split to hostname map
        org.apache.hadoop.mapred.InputFormat inputFormat = job.getInputFormat();
        List<org.apache.hadoop.mapred.InputSplit> splitList = Arrays
                .asList(inputFormat.getSplits(job, job.getNumMapTasks()));
        Map<InetAddress, List<Integer>> splitToHostAddress = assignSplitsToHost(splitList, hostAddresses, null);

        //Choose the optimal number of reducers for GridOutputFormat
        if (job.getOutputFormat() instanceof NamedMapOutputFormatMapred) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            sortEnabled = false;
        }

        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());

        //Generating invocation parameters
        Class<? extends org.apache.hadoop.mapred.InputSplit> splitType = splitList.size() > 0
                ? splitList.get(0).getClass()
                : null;

        HadoopInvocationParameters hadoopParameters = new HadoopInvocationParameters(job, jobId, true);
        HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit> parameters = new HServerInvocationParameters<org.apache.hadoop.mapred.InputSplit>(
                hadoopParameters, jobAppId, partitionMapping, hostNameToPartition, numberOfSlotsPerNode,
                splitType, splitList, splitToHostAddress, false, sortEnabled, hadoopVersion, jobParameter,
                SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());

        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }

        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();

        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }

        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            //Running the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    jobAppId, TimeSpan.INFINITE_TIMEOUT.getSeconds());

            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");

            DataAccessor.clearObjects(jobAppId); //clear all temporary objects

            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(jContext);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }
}
From source file: com.TCG.Nutch_DNS.Generator.java
License: Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file: crunch.MaxTemperature.java
License: Apache License
@Test
@Ignore("Needs more investigation")
public void recordsCoincideWithBlocks() throws IOException {
    int recordLength = 1024;
    Path input = new Path("input");
    createFile(input, 12, recordLength);

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);
    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());

    assertThat(splits.length, is(3));
    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);
    checkSplit(splits[2], 8192, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 4, 8);
    checkRecordReader(inputFormat, splits[2], job, recordLength, 8, 12);
}
From source file: crunch.MaxTemperature.java
License: Apache License
@Test
public void recordsDontCoincideWithBlocks() throws IOException {
    int recordLength = 1024 + 512;
    Path input = new Path("input");
    createFile(input, 8, recordLength);

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);
    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    System.out.println(Arrays.asList(splits));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);
    checkSplit(splits[2], 8192, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 3);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 3, 6);
    checkRecordReader(inputFormat, splits[2], job, recordLength, 6, 8);
}