List of usage examples for org.apache.mahout.common.HadoopUtil#buildDirList
public static String buildDirList(FileSystem fs, FileStatus fileStatus) throws IOException
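Before the full map-reduce example below, here is a minimal standalone sketch of calling buildDirList directly. The Configuration setup, the /data/input path, and the class name are illustrative assumptions, not taken from the source file:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.HadoopUtil;

public class BuildDirListExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Hypothetical input directory; replace with a path that exists on your filesystem.
    FileStatus rootStatus = fs.getFileStatus(new Path("/data/input"));
    // buildDirList walks the directory tree and returns a comma-separated
    // list of directories, in a form FileInputFormat.setInputPaths accepts.
    String dirList = HadoopUtil.buildDirList(fs, rootStatus);
    System.out.println(dirList);
  }
}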
From source file: com.netease.news.text.SequenceFilesFromDirectory.java
License: Apache License
private int runMapReduce(Path input, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  int chunkSizeInMB = 64;
  if (hasOption(CHUNK_SIZE_OPTION[0])) {
    chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
  }

  String keyPrefix = null;
  if (hasOption(KEY_PREFIX_OPTION[0])) {
    keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
  }

  // Prepare the job for submission.
  Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
      SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
      SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");

  Configuration jobConfig = job.getConfiguration();
  // Only set the prefix when one was supplied: Configuration.set(...)
  // throws a NullPointerException on a null value.
  if (keyPrefix != null) {
    jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix);
  }

  FileSystem fs = FileSystem.get(jobConfig);
  FileStatus fsFileStatus = fs.getFileStatus(input);

  // Build a comma-separated list of the input directory and its
  // subdirectories so every file under the tree becomes job input.
  String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
  jobConfig.set(BASE_INPUT_PATH, input.toString());

  // Multiply as long to avoid int overflow for large chunk sizes.
  long chunkSizeInBytes = chunkSizeInMB * 1024L * 1024L;

  // Set the max split locations, otherwise we get nasty debug stuff.
  jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

  FileInputFormat.setInputPaths(job, inputDirList);
  // Needs to be a multiple of the block size, or no split happens.
  FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
  FileOutputFormat.setCompressOutput(job, true);

  boolean succeeded = job.waitForCompletion(true);
  return succeeded ? 0 : -1;
}
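A note on the design: the example enumerates directories with buildDirList and passes the resulting list to FileInputFormat.setInputPaths rather than passing the root path alone. This is likely because older FileInputFormat implementations do not descend into subdirectories by default, so listing each directory explicitly ensures nested files are picked up as input.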