List of usage examples for org.apache.hadoop.io SequenceFile createWriter
@Deprecated public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass, CompressionType compressionType, CompressionCodec codec) throws IOException
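Note: this overload is deprecated in favor of the Writer.Option-based factory. A minimal sketch of the replacement call, assuming Hadoop 2.x or later (the output path and key/value classes below are placeholders, not taken from any example on this page):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

Configuration conf = new Configuration();
Path path = new Path("/tmp/example.seq"); // hypothetical output path
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(path),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(Text.class),
        SequenceFile.Writer.compression(CompressionType.BLOCK, new DefaultCodec()))) {
    // One Writer.Option per former positional argument.
    writer.append(new Text("key"), new Text("value"));
}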
From source file:com.google.cloud.bigtable.beam.sequencefiles.SequenceFileSourceTest.java
License:Open Source License
private File generateTextData(int recordCount, int syncInterval) throws IOException {
    Configuration config = new Configuration(false);
    List<KV<Text, Text>> data = Lists.newArrayList();
    for (int i = 0; i < recordCount; i++) {
        data.add(KV.of(new Text(String.format("key-%010d", i)),
                new Text(String.format("value-%010d", i))));
    }

    // Write data to read
    File targetFile = workDir.newFile();

    try (Writer writer = SequenceFile.createWriter(config,
            Writer.file(new org.apache.hadoop.fs.Path(targetFile.toString())),
            Writer.keyClass(Text.class), Writer.valueClass(Text.class),
            Writer.blockSize(1), Writer.compression(CompressionType.NONE))) {
        int noSyncCount = 0;
        for (KV<Text, Text> kv : data) {
            writer.append(kv.getKey(), kv.getValue());
            noSyncCount++;
            if (noSyncCount >= syncInterval) {
                writer.sync();
                noSyncCount = 0;
            }
        }
    }
    return targetFile;
}
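The periodic writer.sync() calls above matter because a reader starting at an arbitrary byte offset must first seek forward to a sync marker before it can decode records. A minimal reader-side sketch, assuming the Hadoop 2.x Reader options API (someOffset is a hypothetical byte position, not taken from the test):

try (SequenceFile.Reader reader = new SequenceFile.Reader(config,
        SequenceFile.Reader.file(new org.apache.hadoop.fs.Path(targetFile.toString())))) {
    reader.sync(someOffset); // advance to the first sync marker at or after the offset
    Text k = new Text();
    Text v = new Text();
    while (reader.next(k, v)) {
        // processes records starting at the first record after the sync point
    }
}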
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null) return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output
                     * file and then increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;
                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();
                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();
                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();
            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);
                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}
From source file:com.inmobi.conduit.distcp.ConduitDistCp.java
License:Apache License
@Override
protected Path createInputFileListing(Job job) throws IOException {
    // Get the file path where the copy listing file has to be saved.
    Path fileListingPath = getFileListingPath();
    Configuration config = job.getConfiguration();

    SequenceFile.Writer fileListWriter = null;
    try {
        fileListWriter = SequenceFile.createWriter(fileListingPath.getFileSystem(config), config,
                fileListingPath, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);

        for (Map.Entry<String, FileStatus> entry : fileListingMap.entrySet()) {
            FileStatus status = FileUtil.getFileStatus(entry.getValue(), buffer, in);
            fileListWriter.append(new Text(entry.getKey()), status);

            // Create a sync point after each entry. This ensures that a SequenceFile
            // reader can work at file-entry granularity, given that it starts reading
            // from the nearest sync point.
            fileListWriter.sync();

            totalBytesToCopy += entry.getValue().getLen();
            totalPaths++;
        }
    } finally {
        if (fileListWriter != null) {
            fileListWriter.close();
        }
    }

    LOG.info("Number of paths considered for copy: " + totalPaths);
    LOG.info("Number of bytes considered for copy: " + totalBytesToCopy
            + " (Actual number of bytes copied depends on whether any files are skipped or overwritten.)");

    // Set distcp configurations.
    config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, fileListingPath.toString());
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalBytesToCopy);
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, totalPaths);

    return fileListingPath;
}
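Because the writer syncs after every entry, each (path, FileStatus) record sits in its own sync-delimited region. A minimal sketch of reading the listing back sequentially, assuming the same config object and a Hadoop 1/2-era FileStatus (which implements Writable); the print statement is illustrative only:

SequenceFile.Reader reader = new SequenceFile.Reader(
        fileListingPath.getFileSystem(config), fileListingPath, config);
try {
    Text relPath = new Text();
    FileStatus status = new FileStatus();
    while (reader.next(relPath, status)) {
        System.out.println(relPath + " -> " + status.getLen()); // one record per entry
    }
} finally {
    reader.close();
}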
From source file:com.inmobi.conduit.distcp.tools.mapred.lib.DynamicInputChunk.java
License:Apache License
private void openForWrite() throws IOException {
    writer = SequenceFile.createWriter(chunkSet.getFileSystem(), chunkSet.getConf(), chunkFilePath,
            Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);
}
From source file:com.inmobi.conduit.distcp.tools.SimpleCopyListing.java
License:Apache License
private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
    return SequenceFile.createWriter(pathToListFile.getFileSystem(getConf()), getConf(), pathToListFile,
            Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);
}
From source file:com.inmobi.messaging.consumer.util.MessageUtil.java
License:Apache License
public static void createMessageSequenceFile(String fileName, FileSystem fs, Path parent, int msgIndex,
        Configuration conf) throws IOException {
    Path file = new Path(parent, fileName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class,
            CompressionType.NONE);
    for (int i = 0; i < 100; i++) {
        writer.append(new IntWritable(i), new Text(constructMessage(msgIndex).getBytes()));
        msgIndex++;
    }
    writer.close();
    TestUtil.LOG.debug("Created sequence data file:" + file);
}
From source file:com.inmobi.messaging.consumer.util.MessageUtil.java
License:Apache License
public static void createEmptySequenceFile(String fileName, FileSystem fs, Path parent, Configuration conf)
        throws IOException {
    Path file = new Path(parent, fileName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class,
            CompressionType.NONE);
    writer.close();
    TestUtil.LOG.debug("Created empty sequence file:" + file);
}
From source file:com.kadwa.hadoop.DistExec.java
License:Open Source License
/**
 * Initialize ExecFilesMapper specific job-configuration.
 *
 * @param conf    : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args    Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
    jobConf.set(EXEC_CMD_LABEL, args.execCmd);

    // Set boolean values.
    jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname,
            args.flags.contains(Options.REDIRECT_ERROR_TO_OUT));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }

    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // Get tokens for all the required FileSystems.
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // Default logPath.
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_" + NAME + "_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // Create src list and dst list.
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist,
            LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist,
            Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist,
            Text.class, FilePair.class, SequenceFile.CompressionType.NONE);

    // Handle the case where the destination directory doesn't exist
    // and we have only a single src directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists);

    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());

                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());

                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // Create the destination directory if copying more than one file.
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create " + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_" + NAME + "_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToExecCount=" + fileCount);
    LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount));

    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(fileCount, jobConf);

    return fileCount > 0;
}
From source file:com.linkedin.camus.etl.kafka.common.StringKafkaRecordWriterProvider.java
@Override
public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(final TaskAttemptContext context,
        final String fileName, CamusWrapper data, FileOutputCommitter committer)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();

    Path file = committer.getWorkPath();
    file = new Path(file, EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension()));

    CompressionCodec codec = null;
    SequenceFile.CompressionType compressionType = SequenceFile.CompressionType.NONE;

    final SequenceFile.Writer out = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Text.class),
            SequenceFile.Writer.valueClass(Text.class),
            SequenceFile.Writer.compression(compressionType, codec),
            SequenceFile.Writer.progressable(context));

    return new RecordWriter<IEtlKey, CamusWrapper>() {
        @Override
        public void write(IEtlKey iEtlKey, CamusWrapper camusWrapper) throws IOException {
            String record = (String) camusWrapper.getRecord() + recordDelimiter;
            out.append(new Text(String.valueOf(iEtlKey.getOffset())), new Text(record.getBytes()));
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException {
            out.close();
        }
    };
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n");

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();
    removableFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    int fileCount = 0;

    //Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++);
    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                String dirPath = dir.toUri().getPath();
                print(Verbosity.INFO, "\n\n[" + dirPath + "]");

                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFilesMatcher == null) return true;

                        ignoredFilesMatcher.reset(testPath.toUri().getPath());
                        boolean ignores = ignoredFilesMatcher.matches();
                        if (ignores) LOG.info("Ignoring file " + testPath);
                        return !ignores;
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, "\n  Directory is empty");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            String filePath = path.toUri().getPath();
                            boolean skipFile = false;
                            if (skippedFilesMatcher != null) {
                                skippedFilesMatcher.reset(filePath);
                                if (skippedFilesMatcher.matches()) {
                                    skipFile = true;
                                }
                            }

                            boolean changed = uncrushedFiles.add(filePath);
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (!skipFile && fileLength <= maxEligibleSize) {
                                if (removeEmptyFiles && fileLength == 0)
                                    removableFiles.add(filePath);
                                else {
                                    crushables.add(content);
                                    crushableBytes += fileLength;
                                }
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output
                     * file and then increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, "\n  Directory has no crushable files");
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;
                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                            print(Verbosity.INFO, "\n  Directory skipped");
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, "\n  Generating " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();
                                List<String> filesInBucket = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), filesInBucket.size()));

                                key.set(bucketId);

                                for (String f : filesInBucket) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();
                                    value.set(pathMatcher.group(5));

                                    /*
                                     * Write one row per file to maximize the number of mappers.
                                     */
                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!removableFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Marked " + removableFiles.size() + " files for removal");

                        for (String removable : removableFiles) {
                            uncrushedFiles.remove(removable);
                            print(Verbosity.VERBOSE, "\n    " + removable);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size());
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        writer.close();
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= maxTasks;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();
    int totalReducers = 0;

    for (Bucket partition : partitions) {
        String partitionName = partition.name();
        int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1));
        partNum.set(p);

        if (partition.contents().size() > 0)
            totalReducers++;

        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
    writer.close();

    print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers);
    job.setInt("mapreduce.job.reduces", totalReducers);

    DataOutputStream countersStream = fs.create(this.counters);
    jobCounters.write(countersStream);
    countersStream.close();
}