Example usage for org.apache.mahout.common HadoopUtil buildDirList

List of usage examples for org.apache.mahout.common HadoopUtil buildDirList

Introduction

On this page you can find example usages of org.apache.mahout.common.HadoopUtil.buildDirList.

Prototype

public static String buildDirList(FileSystem fs, FileStatus fileStatus) throws IOException 

Source Link

Document

Builds a comma-separated list of input splits

Usage

From source file: com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

/**
 * Configures and runs the "SequenceFilesFromDirectory" MapReduce job, blocking until it finishes.
 *
 * <p>The chunk size (in MB, default 64) and the optional key prefix are read from the
 * command-line options. The input paths are the directory list built by
 * {@code HadoopUtil.buildDirList} for {@code input}, and the job output is compressed.
 *
 * @param input  root path whose contents are fed to the job
 * @param output path the job writes its output to
 * @return {@code 0} if the job completed successfully, {@code -1} otherwise
 * @throws IOException            if the file system or job submission fails
 * @throws ClassNotFoundException propagated from job preparation or execution
 * @throws InterruptedException   if the thread is interrupted while waiting for the job
 */
private int runMapReduce(Path input, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {

    int chunkSizeInMB = 64;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
        chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
    }

    String keyPrefix = null;
    if (hasOption(KEY_PREFIX_OPTION[0])) {
        keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
    }

    // Prepare Job for submission.
    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
            SequenceFilesFromDirectoryMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class,
            "SequenceFilesFromDirectory");

    Configuration jobConfig = job.getConfiguration();
    // Configuration.set does not accept null values, so only record the prefix
    // when one was actually supplied on the command line.
    if (keyPrefix != null) {
        jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix);
    }
    FileSystem fs = FileSystem.get(jobConfig);
    FileStatus fsFileStatus = fs.getFileStatus(input);
    String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
    jobConfig.set(BASE_INPUT_PATH, input.toString());

    // Multiply in long arithmetic: an int product overflows for chunk sizes >= 2048 MB.
    long chunkSizeInBytes = chunkSizeInMB * 1024L * 1024L;

    // set the max split locations, otherwise we get nasty debug stuff
    jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

    FileInputFormat.setInputPaths(job, inputDirList);
    // need to set this to a multiple of the block size, or no split happens
    FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
    FileOutputFormat.setCompressOutput(job, true);

    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : -1;
}