List of usage examples for org.apache.hadoop.io IntWritable IntWritable
public IntWritable()
From source file:com.ibm.bi.dml.runtime.matrix.data.TaggedInt.java
License:Open Source License
public TaggedInt() { tag = -1; base = new IntWritable(); }
From source file:com.ibm.bi.dml.runtime.util.MapReduceTool.java
License:Open Source License
public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average) throws IOException { long[] counts = metadata.getNumItemsArray(); long[] ranges = new long[counts.length]; ranges[0] = counts[0];//from www .ja v a 2 s . com for (int i = 1; i < counts.length; i++) ranges[i] = ranges[i - 1] + counts[i]; long total = ranges[ranges.length - 1]; // do averaging only if it is asked for; and sum_wt is even average = average && (total % 2 == 0); int currentPart = 0; double cum_weight = 0; long pos = (long) Math.ceil(total * p); while (ranges[currentPart] < pos) { currentPart++; cum_weight += ranges[currentPart]; } int offset; if (currentPart > 0) offset = (int) (pos - ranges[currentPart - 1] - 1); else offset = (int) pos - 1; FileSystem fs = FileSystem.get(_rJob); Path path = new Path(dir); FileStatus[] files = fs.listStatus(path); Path fileToRead = null; for (FileStatus file : files) if (file.getPath().toString().endsWith(Integer.toString(currentPart))) { fileToRead = file.getPath(); break; } if (fileToRead == null) throw new RuntimeException("cannot read partition " + currentPart); FSDataInputStream currentStream = fs.open(fileToRead); DoubleWritable readKey = new DoubleWritable(); IntWritable readValue = new IntWritable(); boolean contain0s = false; long numZeros = 0; if (currentPart == metadata.getPartitionOfZero()) { contain0s = true; numZeros = metadata.getNumberOfZero(); } ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros); int numRead = 0; while (numRead <= offset) { reader.readNextKeyValuePairs(readKey, readValue); numRead += readValue.get(); cum_weight += readValue.get(); } double ret = readKey.get(); if (average) { if (numRead <= offset + 1) { reader.readNextKeyValuePairs(readKey, readValue); cum_weight += readValue.get(); ret = (ret + readKey.get()) / 2; } } currentStream.close(); return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) }; }
From source file:com.jeffy.fbds.SequenceFileWriter.java
License:Apache License
public static void main(String[] args) throws IOException { // ?//from w ww. j a v a 2 s.co m String uri = args[0]; Configuration conf = new Configuration(); Path path = new Path(uri); IntWritable key = new IntWritable(); Text value = new Text(); try (SequenceFile.Writer writer = SequenceFile.createWriter(conf, Writer.file(path), Writer.keyClass(key.getClass()), Writer.valueClass(value.getClass()))) { for (int i = 0; i < 100; i++) { key.set(100 - i); value.set(DATA[i % DATA.length]); System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value); writer.append(key, value); } } }
From source file:com.jfolson.hive.serde.RBaseSerDe.java
License:Apache License
protected void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException { //LOG.info("Serializing hive type: "+oi.getTypeName()); //LOG.info("Serializing category: "+oi.getCategory().toString()); if (o == null) { tbOut.writeNull();/*from w w w.j av a2 s . c om*/ return; } switch (oi.getCategory()) { case PRIMITIVE: { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; //LOG.info("Serializing primitive: "+poi.getPrimitiveCategory().toString()); switch (poi.getPrimitiveCategory()) { case VOID: { return; } case BINARY: { BinaryObjectInspector boi = (BinaryObjectInspector) poi; TypedBytesWritable bytes = reuse == null ? new TypedBytesWritable() : (TypedBytesWritable) reuse; BytesWritable bytesWrite = boi.getPrimitiveWritableObject(o); if (bytesWrite != null) { bytes.set(bytesWrite); if (!RType.isValid(bytes)) { LOG.error("Invalid typedbytes detected with type: " + RType.getType(bytes).code); bytes.setValue(new Buffer(bytesWrite.getBytes(), 0, bytesWrite.getLength())); } //LOG.info("Writing binary primitive with class: "+bytes.getClass().getName()); tbOut.write(bytes); } return; } case BOOLEAN: { BooleanObjectInspector boi = (BooleanObjectInspector) poi; BooleanWritable r = reuse == null ? new BooleanWritable() : (BooleanWritable) reuse; r.set(boi.get(o)); tbOut.write(r); return; } case BYTE: { ByteObjectInspector boi = (ByteObjectInspector) poi; ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse; r.set(boi.get(o)); tbOut.write(r); return; } case SHORT: { ShortObjectInspector spoi = (ShortObjectInspector) poi; ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse; r.set(spoi.get(o)); tbOut.write(r); return; } case INT: { IntObjectInspector ioi = (IntObjectInspector) poi; IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse; r.set(ioi.get(o)); tbOut.write(r); return; } case LONG: { LongObjectInspector loi = (LongObjectInspector) poi; LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse; r.set(loi.get(o)); tbOut.write(r); return; } case FLOAT: { FloatObjectInspector foi = (FloatObjectInspector) poi; FloatWritable r = reuse == null ? new FloatWritable() : (FloatWritable) reuse; r.set(foi.get(o)); tbOut.write(r); return; } case DOUBLE: DoubleObjectInspector doi = (DoubleObjectInspector) poi; DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse; r.set(doi.get(o)); tbOut.write(r); return; case STRING: { StringObjectInspector soi = (StringObjectInspector) poi; Text t = soi.getPrimitiveWritableObject(o); tbOut.write(t); return; } default: { throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory()); } } } case LIST: { ListObjectInspector loi = (ListObjectInspector) oi; ObjectInspector elemOI = loi.getListElementObjectInspector(); List l = loi.getList(o); // Don't use array (typecode: 144) until everything supports NA values in typedbytes if (false) {//(elemOI.getCategory()==ObjectInspector.Category.PRIMITIVE){ tbOut.writeArray(l, (PrimitiveObjectInspector) elemOI); } else { tbOut.writeVector(l, (PrimitiveObjectInspector) elemOI); } return; } case MAP: case STRUCT: { // For complex object, serialize to JSON format String s = SerDeUtils.getJSONString(o, oi); Text t = reuse == null ? new Text() : (Text) reuse; // convert to Text and write it t.set(s); tbOut.write(t); return; } default: { throw new RuntimeException("Unrecognized type: " + oi.getCategory()); } } }
From source file:com.jfolson.hive.serde.RTypedBytesSerDe.java
License:Apache License
private void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException { //LOG.info("Serializing hive type: "+oi.getTypeName()); //LOG.info("Serializing category: "+oi.getCategory().toString()); if (o == null) { tbOut.writeNull();/*from w ww . ja v a 2 s . c om*/ return; } switch (oi.getCategory()) { case PRIMITIVE: { PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; //LOG.info("Serializing primitive: "+poi.getPrimitiveCategory().toString()); switch (poi.getPrimitiveCategory()) { case VOID: { return; } case BINARY: { BinaryObjectInspector boi = (BinaryObjectInspector) poi; TypedBytesWritable bytes = reuse == null ? new TypedBytesWritable() : (TypedBytesWritable) reuse; BytesWritable bytesWrite = boi.getPrimitiveWritableObject(o); if (bytesWrite != null) { bytes.set(bytesWrite); if (!RType.isValid(bytes)) { LOG.error("Invalid typedbytes detected with type: " + RType.getType(bytes).code); bytes.setValue(new Buffer(bytesWrite.getBytes(), 0, bytesWrite.getLength())); } //LOG.info("Writing binary primitive with class: "+bytes.getClass().getName()); tbOut.write(bytes); } return; } case BOOLEAN: { BooleanObjectInspector boi = (BooleanObjectInspector) poi; BooleanWritable r = reuse == null ? new BooleanWritable() : (BooleanWritable) reuse; r.set(boi.get(o)); tbOut.write(r); return; } case BYTE: { ByteObjectInspector boi = (ByteObjectInspector) poi; ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse; r.set(boi.get(o)); tbOut.write(r); return; } case SHORT: { ShortObjectInspector spoi = (ShortObjectInspector) poi; ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse; r.set(spoi.get(o)); tbOut.write(r); return; } case INT: { IntObjectInspector ioi = (IntObjectInspector) poi; IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse; r.set(ioi.get(o)); tbOut.write(r); return; } case LONG: { LongObjectInspector loi = (LongObjectInspector) poi; LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse; r.set(loi.get(o)); tbOut.write(r); return; } case FLOAT: { FloatObjectInspector foi = (FloatObjectInspector) poi; FloatWritable r = reuse == null ? new FloatWritable() : (FloatWritable) reuse; r.set(foi.get(o)); tbOut.write(r); return; } case DOUBLE: DoubleObjectInspector doi = (DoubleObjectInspector) poi; DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse; r.set(doi.get(o)); tbOut.write(r); return; case STRING: { StringObjectInspector soi = (StringObjectInspector) poi; Text t = soi.getPrimitiveWritableObject(o); tbOut.write(t); return; } default: { throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory()); } } } case LIST: { ListObjectInspector loi = (ListObjectInspector) oi; ObjectInspector elemOI = loi.getListElementObjectInspector(); List l = loi.getList(o); if (false) {//(elemOI.getCategory()==ObjectInspector.Category.PRIMITIVE){ tbOut.writeArray(l, (PrimitiveObjectInspector) elemOI); } else { tbOut.writeVector(l, (PrimitiveObjectInspector) elemOI); } return; } case MAP: case STRUCT: { // For complex object, serialize to JSON format String s = SerDeUtils.getJSONString(o, oi); Text t = reuse == null ? new Text() : (Text) reuse; // convert to Text and write it t.set(s); tbOut.write(t); return; } default: { throw new RuntimeException("Unrecognized type: " + oi.getCategory()); } } }
From source file:com.jfolson.hive.serde.RTypedBytesWritableInput.java
License:Apache License
public IntWritable readInt(IntWritable iw) throws IOException { if (iw == null) { iw = new IntWritable(); }/*from ww w . j a va 2 s . co m*/ int val = in.readInt(); if (val == RType.NA_INTEGER) { return null; } iw.set(val); return iw; }
From source file:com.linkedin.cubert.io.CompactWritablesDeserializer.java
License:Open Source License
private static final WritableComparable<?> createWritable(DataType type) { switch (type) { case BOOLEAN: return new BooleanWritable(); case BYTE://from www .ja va 2 s . c o m return new ByteWritable(); case INT: return new IntWritable(); case LONG: return new LongWritable(); case FLOAT: return new FloatWritable(); case DOUBLE: return new DoubleWritable(); case STRING: return new Text(); default: return null; } }
From source file:com.liveramp.hank.hadoop.KeyAndPartitionWritable.java
License:Apache License
public KeyAndPartitionWritable() { key = new BytesWritable(); partition = new IntWritable(); }
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
void writeDirs() throws IOException { print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n"); FileStatus status = fs.getFileStatus(srcDir); Path tmpIn = new Path(tmpDir, "in"); bucketFiles = new Path(tmpIn, "dirs"); partitionMap = new Path(tmpIn, "partition-map"); counters = new Path(tmpIn, "counters"); skippedFiles = new HashSet<String>(); removableFiles = new HashSet<String>(); /*/*from w w w .java 2s .c om*/ * Prefer the path returned by the status because it is always fully qualified. */ List<Path> dirs = asList(status.getPath()); Text key = new Text(); Text value = new Text(); Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false); partitionBucketer.reset("partition-map"); jobCounters = new Counters(); int fileCount = 0; //Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++); Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK); try { while (!dirs.isEmpty()) { List<Path> nextLevel = new LinkedList<Path>(); for (Path dir : dirs) { String dirPath = dir.toUri().getPath(); print(Verbosity.INFO, "\n\n[" + dirPath + "]"); jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); FileStatus[] contents = fs.listStatus(dir, new PathFilter() { @Override public boolean accept(Path testPath) { if (ignoredFilesMatcher == null) return true; ignoredFilesMatcher.reset(testPath.toUri().getPath()); boolean ignores = ignoredFilesMatcher.matches(); if (ignores) LOG.info("Ignoring file " + testPath); return !ignores; } }); if (contents == null || contents.length == 0) { print(Verbosity.INFO, "\n Directory is empty"); jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); } else { List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length); Set<String> uncrushedFiles = new HashSet<String>(contents.length); long crushableBytes = 0; /* * Queue sub directories for subsequent inspection and examine the files in this directory. */ for (FileStatus content : contents) { Path path = content.getPath(); if (content.isDir()) { nextLevel.add(path); } else { String filePath = path.toUri().getPath(); boolean skipFile = false; if (skippedFilesMatcher != null) { skippedFilesMatcher.reset(filePath); if (skippedFilesMatcher.matches()) { skipFile = true; } } boolean changed = uncrushedFiles.add(filePath); assert changed : path.toUri().getPath(); long fileLength = content.getLen(); if (!skipFile && fileLength <= maxEligibleSize) { if (removeEmptyFiles && fileLength == 0) removableFiles.add(filePath); else { crushables.add(content); crushableBytes += fileLength; } } } } /* * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the * number of files we found. */ if (!uncrushedFiles.isEmpty()) { if (-1 == findMatcher(dir)) { throw new IllegalArgumentException( "Could not find matching regex for directory: " + dir); } jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size()); } if (0 == crushableBytes) { print(Verbosity.INFO, "\n Directory has no crushable files"); jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); } else { /* * We found files to consider for crushing. */ long nBlocks = crushableBytes / dfsBlockSize; if (nBlocks * dfsBlockSize != crushableBytes) { nBlocks++; } /* * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory. */ long dirBuckets = nBlocks / maxFileBlocks; if (dirBuckets * maxFileBlocks != nBlocks) { dirBuckets++; } if (dirBuckets > Integer.MAX_VALUE) { throw new AssertionError("Too many buckets: " + dirBuckets); } Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs); directoryBucketer.reset(getPathPart(dir)); for (FileStatus file : crushables) { directoryBucketer.add(new FileStatusHasSize(file)); } List<Bucket> crushFiles = directoryBucketer.createBuckets(); if (crushFiles.isEmpty()) { jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); print(Verbosity.INFO, "\n Directory skipped"); } else { nBuckets += crushFiles.size(); jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); print(Verbosity.INFO, "\n Generating " + crushFiles.size() + " output files"); /* * Write out the mapping between a bucket and a file. */ for (Bucket crushFile : crushFiles) { String bucketId = crushFile.name(); List<String> filesInBucket = crushFile.contents(); print(Verbosity.INFO, format("\n Output %s will include %,d input bytes from %,d files", bucketId, crushFile.size(), filesInBucket.size())); key.set(bucketId); for (String f : filesInBucket) { boolean changed = uncrushedFiles.remove(f); assert changed : f; pathMatcher.reset(f); pathMatcher.matches(); value.set(pathMatcher.group(5)); /* * Write one row per file to maximize the number of mappers */ writer.append(key, value); /* * Print the input file with four leading spaces. */ print(Verbosity.VERBOSE, "\n " + f); } jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size()); partitionBucketer.add(crushFile); } } } if (!removableFiles.isEmpty()) { print(Verbosity.INFO, "\n Marked " + removableFiles.size() + " files for removal"); for (String removable : removableFiles) { uncrushedFiles.remove(removable); print(Verbosity.VERBOSE, "\n " + removable); } jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size()); } if (!uncrushedFiles.isEmpty()) { print(Verbosity.INFO, "\n Skipped " + uncrushedFiles.size() + " files"); for (String uncrushed : uncrushedFiles) { print(Verbosity.VERBOSE, "\n " + uncrushed); } jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size()); } skippedFiles.addAll(uncrushedFiles); } } dirs = nextLevel; } } finally { writer.close(); } /* * Now that we have processed all the directories, write the partition map. */ List<Bucket> partitions = partitionBucketer.createBuckets(); assert partitions.size() <= maxTasks; writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class); IntWritable partNum = new IntWritable(); int totalReducers = 0; for (Bucket partition : partitions) { String partitionName = partition.name(); int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)); partNum.set(p); if (partition.contents().size() > 0) totalReducers++; for (String bucketId : partition.contents()) { key.set(bucketId); writer.append(key, partNum); } } writer.close(); print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers); job.setInt("mapreduce.job.reduces", totalReducers); DataOutputStream countersStream = fs.create(this.counters); jobCounters.write(countersStream); countersStream.close(); }
From source file:com.m6d.filecrush.crush.CrushTest.java
License:Apache License
@Test public void bucketing() throws Exception { File in = tmp.newFolder("in"); Counters expectedCounters = new Counters(); List<String> expectedBucketFiles = new ArrayList<String>(); /*/*from ww w . j a va 2 s. com*/ * Create a hierarchy of directories. Directories are distinguished by a trailing slash in these comments. * * 1/ * 1.1/ * file1 10 bytes * file2 20 bytes * file3 30 bytes * file4 41 bytes * file5 15 bytes * file6 30 bytes * file7 20 bytes * 1.2/ * file1 20 bytes * file2 10 bytes * 1.3/ * 2/ * file1 70 bytes * file2 30 bytes * file3 25 bytes * file4 30 bytes * file5 35 bytes * 2.1/ * file1 10 bytes * 2.2/ * file1 25 bytes * file2 15 bytes * file3 35 bytes * 2.3/ * file1 41 bytes * file2 10 bytes * 2.4/ * 2.4.1/ * file1 100 bytes * file2 30 bytes * 2.4.2/ * file1 20 bytes * file2 20 bytes * file3 10 bytes */ /* * in contains 2 dirs and no files so it is skipped. * * in/ * 1/ * 2/ */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); tmp.newFolder("in/1"); File dir2 = tmp.newFolder("in/2"); /* * in/1 contains three dirs and no files so it is skipped. * * in/ * 1/ * 1.1/ * 1.2/ * 1.3/ */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); File dir1_1 = tmp.newFolder("in/1/1.1"); File dir1_2 = tmp.newFolder("in/1/1.2"); tmp.newFolder("in/1/1.3"); /* * in/2 contains five files and four dirs. * * in/ * 2/ * file1 70 bytes * file2 30 bytes * file3 25 bytes * file4 30 bytes * file5 35 bytes * 2.1/ * 2.2/ * 2.3/ * 2.4/ * * 0 1 2 * file5 35 file2 30 file4 30 * file3 25 * * Buckets 0 and 2 have a single file each so they are ignored. */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 5); expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 3); File dir2_1 = tmp.newFolder("in/2/2.1"); File dir2_2 = tmp.newFolder("in/2/2.2"); File dir2_3 = tmp.newFolder("in/2/2.3"); tmp.newFolder("in/2/2.4"); createFile(dir2, "file1", 70); createFile(dir2, "file2", 30); createFile(dir2, "file3", 25); createFile(dir2, "file4", 30); createFile(dir2, "file5", 35); expectedBucketFiles .add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file2").getAbsolutePath())); expectedBucketFiles .add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file3").getAbsolutePath())); /* * in/1/1.1 contains seven files and no dirs. * * in/ * 1/ * 1.1/ * file1 10 bytes * file2 20 bytes * file3 30 bytes * file4 41 bytes * file5 15 bytes * file6 30 bytes * file7 20 bytes * * 0 1 2 * file3 30 file6 30 file2 20 * file5 15 file1 10 file7 20 * * file4 is > 50 * 0.8 so it is ignored. */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 7); expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 6); expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); createFile(dir1_1, "file1", 10); createFile(dir1_1, "file2", 20); createFile(dir1_1, "file3", 30); createFile(dir1_1, "file4", 41); createFile(dir1_1, "file5", 15); createFile(dir1_1, "file6", 30); createFile(dir1_1, "file7", 20); expectedBucketFiles.add( format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file3").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file5").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file6").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file1").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file2").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file7").getAbsolutePath())); /* * in/1/1.2 contains to files. * * in/ * 1/ * 1.2/ * file1 20 bytes * file2 10 bytes */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); createFile(dir1_2, "file1", 20); createFile(dir1_2, "file2", 10); expectedBucketFiles.add( format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file1").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file2").getAbsolutePath())); /* * in/1/1.3 is empty. * * in/ * 1/ * 1.3/ */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); tmp.newFolder("in/1/1.3"); /* * in/2/2.1 contains on file. * * in/ * 2/ * 2.1/ * file1 10 bytes * * Single file dirs are ignored. */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 1); expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); createFile(dir2_1, "file1", 10); /* * in/2/2.2 contains three files. * * in/ * 2/ * 2.2/ * file1 25 bytes * file2 15 bytes * file3 35 bytes * * 0 1 * file3 35 file1 25 * file2 15 * * Bucket 0 with a single file is ignored. */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3); expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2); expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1); createFile(dir2_2, "file1", 25); createFile(dir2_2, "file2", 15); createFile(dir2_2, "file3", 35); expectedBucketFiles.add( format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file1").getAbsolutePath())); expectedBucketFiles.add( format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file2").getAbsolutePath())); /* * in/2/2.3 contains 2 files. * * in/ * 2/ * 2.3/ * file1 41 bytes * file2 10 bytes * * file1 is too big and leaving file2 as a single file, which is also ignored. */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2); createFile(dir2_3, "file1", 41); createFile(dir2_3, "file2", 10); /* * in/2/2.4 contains two sub directories and no files. * * in/ * 2/ * 2.4/ * 2.4.1/ * 2.4.2/ */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); tmp.newFolder("in/2/2.4"); File dir2_4_1 = tmp.newFolder("in/2/2.4/2.4.1"); File dir2_4_2 = tmp.newFolder("in/2/2.4/2.4.2"); /* * in/ * 2/ * 2.4/ * 2.4.1/ * file1 100 bytes * file2 30 bytes */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2); expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2); createFile(dir2_4_1, "file1", 100); createFile(dir2_4_1, "file2", 30); /* * in/ * 2/ * 2.4/ * 2.4.2/ * file1 20 bytes * file2 20 bytes * file3 10 bytes * 0 * file1 20 * file2 20 * file3 10 */ expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1); expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1); expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3); expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 3); createFile(dir2_4_2, "file1", 20); createFile(dir2_4_2, "file2", 20); createFile(dir2_4_2, "file3", 10); expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file1").getAbsolutePath())); expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file2").getAbsolutePath())); expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file3").getAbsolutePath())); Crush crush = new Crush(); crush.setConf(job); crush.setFileSystem(fileSystem); /* * Call these in the same order that run() does. */ crush.createJobConfAndParseArgs("--compress=none", "--max-file-blocks=1", in.getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "20101124171730"); crush.writeDirs(); /* * Verify bucket contents. */ List<String> actualBucketFiles = new ArrayList<String>(); Text key = new Text(); Text value = new Text(); Reader reader = new Reader(FileSystem.get(job), crush.getBucketFiles(), job); while (reader.next(key, value)) { actualBucketFiles.add(format("%s\t%s", key, value)); } reader.close(); Collections.sort(expectedBucketFiles); Collections.sort(actualBucketFiles); assertThat(actualBucketFiles, equalTo(expectedBucketFiles)); /* * Verify the partition map. */ Reader partitionMapReader = new Reader(FileSystem.get(job), crush.getPartitionMap(), job); IntWritable partNum = new IntWritable(); Map<String, Integer> actualPartitions = new HashMap<String, Integer>(); while (partitionMapReader.next(key, partNum)) { actualPartitions.put(key.toString(), partNum.get()); } partitionMapReader.close(); /* * These crush files need to allocated into 5 partitions: * * in/2-1 55 bytes * in/1/1.1-0 45 bytes * in/1/1.1-2 40 bytes * in/1/1.1-1 40 bytes * in/1/1.2-0 30 bytes * in/2/2.2-1 40 bytes * in/2/2.4/2.4.2-0 50 bytes * * 0 1 2 3 4 * in/2-1 55 in/2/2.4/2.4.2-0 50 in/1/1.1-0 45 in/1/1.1-2 40 in/1/1.1-1 40 * in/2/2.2-1 40 in/1/1.2-0 39 */ Map<String, Integer> expectedPartitions = new HashMap<String, Integer>(); //TODO: this may not be deterministic due to jvm/hashmap/filesystem expectedPartitions.put(dir2.getAbsolutePath() + "-1", 0); expectedPartitions.put(dir2_4_2.getAbsolutePath() + "-0", 1); expectedPartitions.put(dir1_1.getAbsolutePath() + "-0", 2); expectedPartitions.put(dir1_1.getAbsolutePath() + "-2", 4); expectedPartitions.put(dir2_2.getAbsolutePath() + "-1", 3); expectedPartitions.put(dir1_1.getAbsolutePath() + "-1", 3); expectedPartitions.put(dir1_2.getAbsolutePath() + "-0", 4); assertThat(actualPartitions, equalTo(expectedPartitions)); /* * Verify counters. */ Counters actualCounters = new Counters(); DataInputStream countersStream = FileSystem.get(job).open(crush.getCounters()); actualCounters.readFields(countersStream); countersStream.close(); assertThat(actualCounters, equalTo(expectedCounters)); }