List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files.
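Before the project-specific examples, here is a minimal self-contained sketch of the typical call pattern (the directory name and glob are illustrative placeholders, not taken from any of the projects below): get a FileSystem for the path, glob for the reducer output files, and guard against the null result that globStatus returns when the path does not exist.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical job output directory; pass a real path as the first argument.
        Path outputDir = new Path(args.length > 0 ? args[0] : "/tmp/job-output");
        FileSystem fs = FileSystem.get(outputDir.toUri(), conf);

        // Match the per-reducer output files; checksum (.crc) files are never returned.
        FileStatus[] parts = fs.globStatus(new Path(outputDir, "part-*"));
        if (parts == null || parts.length == 0) { // null when the path does not exist
            System.out.println("No matching files under " + outputDir);
            return;
        }
        for (FileStatus part : parts) {
            System.out.println(part.getPath() + "\t" + part.getLen() + " bytes");
        }
    }
}

The examples that follow use this same shape: glob for the part-* outputs of a MapReduce job, then open each matched file.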
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the parallel counting job.
 *
 * @param params
 * @return feature frequency list
 * @throws IOException
 */
public static List<Pair<String, Long>> readFList(Parameters params) throws IOException {
    Writable key = new Text();
    LongWritable value = new LongWritable();
    int minSupport = Integer.valueOf(params.get("minSupport", "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get("output"), "parallelcounting");
    FileSystem fs = FileSystem.get(parallelCountingPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(parallelCountingPath, "part-*"));

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {
                @Override
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (value.get() >= minSupport) {
                queue.add(new Pair<String, Long>(key.toString(), value.get()));
            }
        }
    }

    List<Pair<String, Long>> fList = new ArrayList<Pair<String, Long>>();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Read the frequent patterns generated from text.
 *
 * @param params
 * @return list of top-k patterns for each frequent string feature
 * @throws IOException
 */
public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Parameters params) throws IOException {
    Configuration conf = new Configuration();

    Path frequentPatternsPath = new Path(params.get("output"), "frequentPatterns");
    FileSystem fs = FileSystem.get(frequentPatternsPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(frequentPatternsPath, "part-*"));

    List<Pair<String, TopKStringPatterns>> ret = new ArrayList<Pair<String, TopKStringPatterns>>();
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        ret.addAll(FPGrowth.readFrequentPattern(fs, conf, path));
    }
    return ret;
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtDenseOutJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path xiPath, Path sqPath,
        Path sbPath, Path outputPath, int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight,
        int numReduceTasks, boolean broadcastBInput)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtDenseOutJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(DenseBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
        job.getConfiguration().set(PROP_SB_PATH, sbPath.toString());
        job.getConfiguration().set(PROP_SQ_PATH, sqPath.toString());
    }

    job.setNumReduceTasks(numReduceTasks);

    // Broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not enabled yet in our dependencies at this time,
                 * still using the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.ABtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath, int aBlockRows,
        int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks, boolean broadcastBInput)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    // MultipleOutputs
    //     .addNamedOutput(oldApiJob,
    //                     QJob.OUTPUT_QHAT,
    //                     org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                     SplitPartitionedWritable.class,
    //                     DenseBlockWritable.class);
    //
    // MultipleOutputs
    //     .addNamedOutput(oldApiJob,
    //                     QJob.OUTPUT_RHAT,
    //                     org.apache.hadoop.mapred.SequenceFileOutputFormat.class,
    //                     SplitPartitionedWritable.class,
    //                     VectorWritable.class);

    Job job = new Job(oldApiJob);
    job.setJobName("ABt-job");
    job.setJarByClass(ABtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputAPaths);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(SplitPartitionedWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(SplitPartitionedWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(ABtMapper.class);
    job.setCombinerClass(BtJob.OuterProductCombiner.class);
    job.setReducerClass(QRReducer.class);

    job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows);
    job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight);
    job.getConfiguration().setInt(QRFirstStep.PROP_K, k);
    job.getConfiguration().setInt(QRFirstStep.PROP_P, p);
    job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString());

    // The number of reduce tasks doesn't matter; we don't actually
    // send anything to reducers.
    job.setNumReduceTasks(numReduceTasks);

    // Broadcast Bt files if required.
    if (broadcastBInput) {
        job.getConfiguration().set(PROP_BT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(inputBtGlob);
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not enabled yet in our dependencies at this time,
                 * still using the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), conf);
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("ABt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPathA, Path inputPathQJob, Path xiPath, Path outputPath,
        int minSplitSize, int k, int p, int btBlockHeight, int numReduceTasks, boolean broadcast,
        Class<? extends Writable> labelClass, boolean outputBBtProducts)
    throws ClassNotFoundException, InterruptedException, IOException {

    JobConf oldApiJob = new JobConf(conf);

    MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_Q,
            org.apache.hadoop.mapred.SequenceFileOutputFormat.class, labelClass, VectorWritable.class);

    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_BBT,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        /*
         * MAHOUT-1067: if we are asked to output BBt products then named vector
         * names should be propagated to Q too so that UJob could pick them up
         * from there.
         */
        oldApiJob.setBoolean(PROP_NV, true);
    }

    if (xiPath != null) {
        // Compute PCA-related outputs as well.
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SQ,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
        MultipleOutputs.addNamedOutput(oldApiJob, OUTPUT_SB,
                org.apache.hadoop.mapred.SequenceFileOutputFormat.class, IntWritable.class, VectorWritable.class);
    }

    /*
     * HACK: we use the old-api MultipleOutputs since they are not available in the
     * new api of either 0.20.2 or 0.20.203, but wrap it into a new-api job so we
     * can use new-api interfaces.
     */
    Job job = new Job(oldApiJob);
    job.setJobName("Bt-job");
    job.setJarByClass(BtJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathA);
    if (minSplitSize > 0) {
        FileInputFormat.setMinInputSplitSize(job, minSplitSize);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // WARN: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_BT);

    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(QJob.PROP_P, p);
    job.getConfiguration().set(PROP_QJOB_PATH, inputPathQJob.toString());
    job.getConfiguration().setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.getConfiguration().setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, btBlockHeight);

    job.setNumReduceTasks(numReduceTasks);

    /*
     * PCA-related options, MAHOUT-817
     */
    if (xiPath != null) {
        job.getConfiguration().set(PROP_XI_PATH, xiPath.toString());
    }

    /*
     * We can broadcast the Rhat files since all of them are required by each task,
     * but not the Q files, which correspond to splits of A (each split of A
     * requires only its particular Q file, each time a different one).
     */
    if (broadcast) {
        job.getConfiguration().set(PROP_RHAT_BROADCAST, "y");

        FileSystem fs = FileSystem.get(inputPathQJob.toUri(), conf);
        FileStatus[] fstats = fs.globStatus(new Path(inputPathQJob, QJob.OUTPUT_RHAT + "-*"));
        if (fstats != null) {
            for (FileStatus fstat : fstats) {
                /*
                 * The new API is not enabled yet in our dependencies at this time,
                 * still using the deprecated one.
                 */
                DistributedCache.addCacheFile(fstat.getPath().toUri(), job.getConfiguration());
            }
        }
    }

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("Bt job unsuccessful.");
    }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.SSVDHelper.java
License:Apache License
/**
 * Sniff the label type in the input files.
 */
static Class<? extends Writable> sniffInputLabelType(Path[] inputPath, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (Path p : inputPath) {
        FileStatus[] fstats = fs.globStatus(p);
        if (fstats == null || fstats.length == 0) {
            continue;
        }

        FileStatus firstSeqFile;
        if (fstats[0].isDir()) {
            firstSeqFile = fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
        } else {
            firstSeqFile = fstats[0];
        }

        SequenceFile.Reader r = null;
        try {
            r = new SequenceFile.Reader(fs, firstSeqFile.getPath(), conf);
            return r.getKeyClass().asSubclass(Writable.class);
        } finally {
            Closeables.close(r, true);
        }
    }
    throw new IOException("Unable to open input files to determine input label type.");
}
From source file:org.apache.mahout.utils.ConcatenateVectorsJob.java
License:Apache License
private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
    // This works both for a part* file and for a directory containing part* files.
    Path pathPattern = new Path(path, "part*");
    FileStatus[] paths = fs.globStatus(pathPattern);
    Preconditions.checkArgument(paths.length > 0, path.getName() + " is a file, should be a directory");

    Path file = paths[0].getPath();
    SequenceFile.Reader reader = null;
    try {
        reader = new SequenceFile.Reader(fs, file, fs.getConf());
        return reader.getKeyClass().asSubclass(Writable.class);
    } finally {
        Closeables.close(reader, true);
    }
}
From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java
License:Apache License
private Matrix readMatrix(Path dir) throws IOException {
    Matrix matrix = new SparseMatrix(new int[] { Integer.MAX_VALUE, Integer.MAX_VALUE });

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, getConf());
            IntWritable key = new IntWritable();
            VectorWritable value = new VectorWritable();
            while (reader.next(key, value)) {
                int row = key.get();
                Iterator<Vector.Element> elementsIterator = value.get().iterateNonZero();
                while (elementsIterator.hasNext()) {
                    Vector.Element element = elementsIterator.next();
                    matrix.set(row, element.index(), element.get());
                }
            }
        } finally {
            IOUtils.quietClose(reader);
        }
    }
    return matrix;
}
From source file:org.apache.mahout.utils.eval.InMemoryFactorizationEvaluator.java
License:Apache License
private List<Preference> readProbePreferences(Path dir) throws IOException {
    List<Preference> preferences = new LinkedList<Preference>();

    FileSystem fs = dir.getFileSystem(getConf());
    for (FileStatus seqFile : fs.globStatus(new Path(dir, "part-*"))) {
        Path path = seqFile.getPath();
        InputStream in = null;
        try {
            in = fs.open(path);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = TasteHadoopUtils.splitPrefTokens(line);
                long userID = Long.parseLong(tokens[0]);
                long itemID = Long.parseLong(tokens[1]);
                float value = Float.parseFloat(tokens[2]);
                preferences.add(new GenericPreference(userID, itemID, value));
            }
        } finally {
            IOUtils.quietClose(in);
        }
    }
    return preferences;
}
From source file:org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the word count job and assign ids to the
 * features. This uses constant memory and runs at the speed of your disk read.
 *
 * @param minSupport
 * @param wordCountPath
 * @param dictionaryPathBase
 * @throws IOException
 */
private static List<Path> createDictionaryChunks(int minSupport, Path wordCountPath, Path dictionaryPathBase,
        int chunkSizeInMegabytes, Writable value, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = new ArrayList<Path>();

    Writable key = new Text();
    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
    FileStatus[] outputFiles = fs.globStatus(new Path(wordCountPath, OUTPUT_FILES_PATTERN));

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);

    long currentChunkSize = 0;
    int i = 0;
    for (FileStatus fileStatus : outputFiles) {
        Path path = fileStatus.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        // key is the feature, value is its count
        while (reader.next(key, value)) {
            if (currentChunkSize > chunkSizeLimit) {
                dictWriter.close();
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
    }
    maxTermDimension[0] = i;
    dictWriter.close();

    return chunkPaths;
}