List of usage examples for org.apache.mahout.common HadoopUtil buildDirList
public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException
From source file: my.mahout.SequenceFilesFromDirectory.java
License: Apache License
private int runMapReduce(Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException { int chunkSizeInMB = 64; if (hasOption(CHUNK_SIZE_OPTION[0])) { chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0])); }/*from w w w. ja va 2s. c om*/ String keyPrefix = null; if (hasOption(KEY_PREFIX_OPTION[0])) { keyPrefix = getOption(KEY_PREFIX_OPTION[0]); } String fileFilterClassName = null; if (hasOption(FILE_FILTER_CLASS_OPTION[0])) { fileFilterClassName = getOption(FILE_FILTER_CLASS_OPTION[0]); } PathFilter pathFilter = null; // Prefix Addition is presently handled in the Mapper and unlike // runsequential() // need not be done via a pathFilter if (!StringUtils.isBlank(fileFilterClassName) && !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) { try { pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance(); } catch (InstantiationException e) { throw new IllegalStateException(e); } catch (IllegalAccessException e) { throw new IllegalStateException(e); } } // Prepare Job for submission. 
Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromDirectoryMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromDirectory"); Configuration jobConfig = job.getConfiguration(); jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix); jobConfig.set(FILE_FILTER_CLASS_OPTION[0], fileFilterClassName); FileSystem fs = FileSystem.get(jobConfig); FileStatus fsFileStatus = fs.getFileStatus(input); String inputDirList; if (pathFilter != null) { inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus, pathFilter); } else { inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus); } jobConfig.set(BASE_INPUT_PATH, input.toString()); long chunkSizeInBytes = chunkSizeInMB * 1024 * 1024; // set the max split locations, otherwise we get nasty debug stuff jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS)); FileInputFormat.setInputPaths(job, inputDirList); // need to set this to a multiple of the block size, or no split happens FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes); FileOutputFormat.setCompressOutput(job, true); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { return -1; } return 0; }