Example usage for org.apache.hadoop.io SequenceFile createWriter

List of usage examples for org.apache.hadoop.io SequenceFile createWriter

Introduction

On this page you can find example usage for org.apache.hadoop.io SequenceFile createWriter.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
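
As a quick orientation, here is a minimal, hypothetical sketch of calling the deprecated stream-based overload shown in the prototype above. The output path, key and value classes, codec, and method name are illustrative assumptions rather than code from the projects listed under Usage; newer code would generally prefer the builder-style createWriter(Configuration, Writer.Option...) variant that several of the examples below use.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CreateWriterSketch {
    public static void writeSample(Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical output location; adjust to your environment.
        Path path = new Path("/tmp/sample.seq");
        FSDataOutputStream out = fs.create(path);

        // Deprecated overload from the prototype: the caller supplies the open stream.
        // A writer built on a caller-supplied stream typically does not close it,
        // so both the writer and the stream are closed below.
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, Text.class, IntWritable.class,
                CompressionType.RECORD, new DefaultCodec());
        try {
            writer.append(new Text("key-0"), new IntWritable(0));
        } finally {
            writer.close();
            out.close();
        }
    }
}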

Usage

From source file:com.google.cloud.bigtable.beam.sequencefiles.SequenceFileSourceTest.java

License:Open Source License

private File generateTextData(int recordCount, int syncInterval) throws IOException {
    Configuration config = new Configuration(false);

    List<KV<Text, Text>> data = Lists.newArrayList();
    for (int i = 0; i < recordCount; i++) {
        data.add(KV.of(new Text(String.format("key-%010d", i)), new Text(String.format("value-%010d", i))));
    }

    // Write data to read
    File targetFile = workDir.newFile();

    try (Writer writer = SequenceFile.createWriter(config,
            Writer.file(new org.apache.hadoop.fs.Path(targetFile.toString())), Writer.keyClass(Text.class),
            Writer.valueClass(Text.class), Writer.blockSize(1), Writer.compression(CompressionType.NONE))) {
        int noSyncCount = 0;
        for (KV<Text, Text> kv : data) {
            writer.append(kv.getKey(), kv.getValue());
            noSyncCount++;
            if (noSyncCount >= syncInterval) {
                writer.sync();
                noSyncCount = 0;
            }
        }
    }

    return targetFile;
}

From source file:com.hdfs.concat.crush.Crush.java

License:Apache License

void writeDirs() throws IOException {

    print(Verbosity.INFO, "\n\nUsing temporary directory " + tmpDir.toUri().getPath());

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));

    Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFiles == null)
                            return true;
                        ignoredFiles.reset(testPath.toUri().getPath());
                        return !ignoredFiles.matches();
                    }

                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, " is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            boolean changed = uncrushedFiles.add(path.toUri().getPath());

                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();

                            if (fileLength <= maxEligibleSize) {
                                crushables.add(content);
                                crushableBytes += fileLength;
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
                     * number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, " has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;

                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);

                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();

                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                        } else {
                            nBuckets += crushFiles.size();

                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

                            print(Verbosity.INFO, " => " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> bucketFiles = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), bucketFiles.size()));

                                key.set(bucketId);

                                for (String f : bucketFiles) {
                                    boolean changed = uncrushedFiles.remove(f);

                                    assert changed : f;

                                    pathMatcher.reset(f);

                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + bucketFiles, e);
        }
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();

    assert partitions.size() <= numPartitions;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);

    IntWritable partNum = new IntWritable();

    try {
        for (Bucket partition : partitions) {
            String partitionName = partition.name();

            partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));

            for (String bucketId : partition.contents()) {
                key.set(bucketId);

                writer.append(key, partNum);
            }
        }
    } finally {
        try {
            writer.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }

    DataOutputStream countersStream = fs.create(this.counters);

    try {
        jobCounters.write(countersStream);
    } finally {
        try {
            countersStream.close();
        } catch (Exception e) {
            LOG.error("Trapped exception during close: " + partitionMap, e);
        }
    }
}

From source file:com.inmobi.conduit.distcp.ConduitDistCp.java

License:Apache License

@Override
protected Path createInputFileListing(Job job) throws IOException {
    // get the file path where copy listing file has to be saved
    Path fileListingPath = getFileListingPath();
    Configuration config = job.getConfiguration();

    SequenceFile.Writer fileListWriter = null;
    try {
        fileListWriter = SequenceFile.createWriter(fileListingPath.getFileSystem(config), config,
                fileListingPath, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);

        for (Map.Entry<String, FileStatus> entry : fileListingMap.entrySet()) {
            FileStatus status = FileUtil.getFileStatus(entry.getValue(), buffer, in);
            fileListWriter.append(new Text(entry.getKey()), status);

            // Create a sync point after each entry. This ensures that a SequenceFile.Reader
            // can work at file-entry-level granularity, since it reads from the start of a
            // sync point.
            fileListWriter.sync();

            totalBytesToCopy += entry.getValue().getLen();
            totalPaths++;
        }
    } finally {
        if (fileListWriter != null) {
            fileListWriter.close();
        }
    }

    LOG.info("Number of paths considered for copy: " + totalPaths);
    LOG.info("Number of bytes considered for copy: " + totalBytesToCopy
            + " (Actual number of bytes copied depends on whether any files are " + "skipped or overwritten.)");

    // set distcp configurations
    config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, fileListingPath.toString());
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalBytesToCopy);
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, totalPaths);

    return fileListingPath;
}

From source file:com.inmobi.conduit.distcp.tools.mapred.lib.DynamicInputChunk.java

License:Apache License

private void openForWrite() throws IOException {
    writer = SequenceFile.createWriter(chunkSet.getFileSystem(), chunkSet.getConf(), chunkFilePath, Text.class,
            FileStatus.class, SequenceFile.CompressionType.NONE);

}

From source file:com.inmobi.conduit.distcp.tools.SimpleCopyListing.java

License:Apache License

private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
    return SequenceFile.createWriter(pathToListFile.getFileSystem(getConf()), getConf(), pathToListFile,
            Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);
}

From source file:com.inmobi.messaging.consumer.util.MessageUtil.java

License:Apache License

public static void createMessageSequenceFile(String fileName, FileSystem fs, Path parent, int msgIndex,
        Configuration conf) throws IOException {
    Path file = new Path(parent, fileName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class,
            CompressionType.NONE);

    for (int i = 0; i < 100; i++) {
        writer.append(new IntWritable(i), new Text(constructMessage(msgIndex).getBytes()));
        msgIndex++;
    }
    writer.close();
    TestUtil.LOG.debug("Created sequence data file:" + file);
}

From source file:com.inmobi.messaging.consumer.util.MessageUtil.java

License:Apache License

public static void createEmptySequenceFile(String fileName, FileSystem fs, Path parent, Configuration conf)
        throws IOException {
    Path file = new Path(parent, fileName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class,
            CompressionType.NONE);
    writer.close();
    TestUtil.LOG.debug("Created empty sequence file:" + file);
}

From source file:com.kadwa.hadoop.DistExec.java

License:Open Source License

/**
 * Initialize ExecFilesMapper specific job-configuration.
 *
 * @param conf    : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args    Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
    jobConf.set(EXEC_CMD_LABEL, args.execCmd);

    //set boolean values
    jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname,
            args.flags.contains(Options.REDIRECT_ERROR_TO_OUT));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }

    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_" + NAME + "_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists);
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_" + NAME + "_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToExecCount=" + fileCount);
    LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount));
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(fileCount, jobConf);
    return fileCount > 0;
}

From source file:com.linkedin.camus.etl.kafka.common.StringKafkaRecordWriterProvider.java

@Override
public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(final TaskAttemptContext context,
        final String fileName, CamusWrapper data, FileOutputCommitter committer)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();

    Path file = committer.getWorkPath();
    file = new Path(file, EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension()));

    CompressionCodec codec = null;
    SequenceFile.CompressionType compressionType = SequenceFile.CompressionType.NONE;

    final SequenceFile.Writer out = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class),
            SequenceFile.Writer.compression(compressionType, codec), SequenceFile.Writer.progressable(context));

    return new RecordWriter<IEtlKey, CamusWrapper>() {

        @Override
        public void write(IEtlKey iEtlKey, CamusWrapper camusWrapper) throws IOException {
            String record = (String) camusWrapper.getRecord() + recordDelimiter;
            out.append(new Text(String.valueOf(iEtlKey.getOffset())), new Text(record.getBytes()));
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException {
            out.close();
        }
    };
}

From source file:com.m6d.filecrush.crush.Crush.java

License:Apache License

void writeDirs() throws IOException {

    print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n");

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();
    removableFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();
    int fileCount = 0;

    //Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++);
    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                String dirPath = dir.toUri().getPath();
                print(Verbosity.INFO, "\n\n[" + dirPath + "]");

                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFilesMatcher == null)
                            return true;
                        ignoredFilesMatcher.reset(testPath.toUri().getPath());
                        boolean ignores = ignoredFilesMatcher.matches();
                        if (ignores)
                            LOG.info("Ignoring file " + testPath);
                        return !ignores;
                    }

                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, "\n  Directory is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            String filePath = path.toUri().getPath();
                            boolean skipFile = false;
                            if (skippedFilesMatcher != null) {
                                skippedFilesMatcher.reset(filePath);
                                if (skippedFilesMatcher.matches()) {
                                    skipFile = true;
                                }
                            }

                            boolean changed = uncrushedFiles.add(filePath);
                            assert changed : path.toUri().getPath();
                            long fileLength = content.getLen();

                            if (!skipFile && fileLength <= maxEligibleSize) {
                                if (removeEmptyFiles && fileLength == 0)
                                    removableFiles.add(filePath);
                                else {
                                    crushables.add(content);
                                    crushableBytes += fileLength;
                                }
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
                     * number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, "\n  Directory has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;
                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();
                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                            print(Verbosity.INFO, "\n  Directory skipped");
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, "\n  Generating " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> filesInBucket = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), filesInBucket.size()));

                                key.set(bucketId);

                                for (String f : filesInBucket) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    /*
                                     * Write one row per file to maximize the number of mappers
                                     */
                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!removableFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Marked " + removableFiles.size() + " files for removal");

                        for (String removable : removableFiles) {
                            uncrushedFiles.remove(removable);
                            print(Verbosity.VERBOSE, "\n    " + removable);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size());
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        writer.close();
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= maxTasks;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();
    int totalReducers = 0;
    for (Bucket partition : partitions) {
        String partitionName = partition.name();

        int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1));
        partNum.set(p);

        if (partition.contents().size() > 0)
            totalReducers++;

        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
    writer.close();

    print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers);
    job.setInt("mapreduce.job.reduces", totalReducers);

    DataOutputStream countersStream = fs.create(this.counters);
    jobCounters.write(countersStream);
    countersStream.close();
}