List of usage examples for org.apache.mahout.common HadoopUtil buildDirList
public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException
From source file: my.mahout.SequenceFilesFromDirectory.java
License: Apache License
private int runMapReduce(Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException { int chunkSizeInMB = 64; if (hasOption(CHUNK_SIZE_OPTION[0])) { chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0])); }/*from w w w. ja va 2s. c om*/ String keyPrefix = null; if (hasOption(KEY_PREFIX_OPTION[0])) { keyPrefix = getOption(KEY_PREFIX_OPTION[0]); } String fileFilterClassName = null; if (hasOption(FILE_FILTER_CLASS_OPTION[0])) { fileFilterClassName = getOption(FILE_FILTER_CLASS_OPTION[0]); } PathFilter pathFilter = null; // Prefix Addition is presently handled in the Mapper and unlike // runsequential() // need not be done via a pathFilter if (!StringUtils.isBlank(fileFilterClassName) && !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) { try { pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance(); } catch (InstantiationException e) { throw new IllegalStateException(e); } catch (IllegalAccessException e) { throw new IllegalStateException(e); } } // Prepare Job for submission. 
Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromDirectoryMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromDirectory"); Configuration jobConfig = job.getConfiguration(); jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix); jobConfig.set(FILE_FILTER_CLASS_OPTION[0], fileFilterClassName); FileSystem fs = FileSystem.get(jobConfig); FileStatus fsFileStatus = fs.getFileStatus(input); String inputDirList; if (pathFilter != null) { inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus, pathFilter); } else { inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus); } jobConfig.set(BASE_INPUT_PATH, input.toString()); long chunkSizeInBytes = chunkSizeInMB * 1024 * 1024; // set the max split locations, otherwise we get nasty debug stuff jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS)); FileInputFormat.setInputPaths(job, inputDirList); // need to set this to a multiple of the block size, or no split happens FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes); FileOutputFormat.setCompressOutput(job, true); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { return -1; } return 0; }