List of usage examples for org.apache.hadoop.fs.Path.getParent()
public Path getParent()
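getParent() returns the path with its last component removed, and returns null when called on a root path, so a null check is needed before chaining calls. A minimal sketch of that behavior before the project examples below (the file paths and class name here are illustrative, not taken from any of the sources):

import org.apache.hadoop.fs.Path;

public class GetParentDemo {
    public static void main(String[] args) {
        // Illustrative path; not from the example sources below.
        Path file = new Path("/user/data/output/part-r-00000");

        // Strips the last component: prints /user/data/output
        Path parent = file.getParent();
        System.out.println(parent);

        // A pattern that recurs in the examples below: resolving a
        // sibling path next to an existing output path.
        Path sibling = new Path(file.getParent(), "diagonal");
        System.out.println(sibling); // /user/data/output/diagonal

        // getParent() returns null at the root, so guard before chaining.
        Path root = new Path("/");
        System.out.println(root.getParent()); // null
    }
}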
From source file:org.apache.mahout.clustering.minhash.LastfmDataConverter.java
License:Apache License
/**
 * Converts each record in the (item, features) map into Mahout vector format and
 * writes it into a sequence file for minhash clustering.
 */
public static boolean writeToSequenceFile(Map<String, List<Integer>> itemFeaturesMap, Path outputPath)
        throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(outputPath.getParent());
    long totalRecords = itemFeaturesMap.size();
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputPath, Text.class,
            VectorWritable.class);
    try {
        String msg = "Now writing vectorized data in sequence file format: ";
        System.out.print(msg);
        Text itemWritable = new Text();
        VectorWritable featuresWritable = new VectorWritable();
        int doneRecords = 0;
        int prevPercentDone = 1;
        for (Map.Entry<String, List<Integer>> itemFeature : itemFeaturesMap.entrySet()) {
            int numFeatures = itemFeature.getValue().size();
            itemWritable.set(itemFeature.getKey());
            Vector featureVector = new SequentialAccessSparseVector(numFeatures);
            int i = 0;
            for (Integer feature : itemFeature.getValue()) {
                featureVector.setQuick(i++, feature);
            }
            featuresWritable.set(featureVector);
            writer.append(itemWritable, featuresWritable);
            // Update the progress
            double percentDone = ++doneRecords * 100.0 / totalRecords;
            if (percentDone > prevPercentDone) {
                System.out.print('\r' + msg + percentDone + "% " + (percentDone >= 100 ? "Completed\n" : ""));
                prevPercentDone++;
            }
        }
    } finally {
        Closeables.closeQuietly(writer);
    }
    return true;
}
From source file:org.apache.mahout.clustering.spectral.common.MatrixDiagonalizeJob.java
License:Apache License
public static Vector runJob(Path affInput, int dimensions)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up all the job tasks
    Configuration conf = new Configuration();
    Path diagOutput = new Path(affInput.getParent(), "diagonal");
    HadoopUtil.delete(conf, diagOutput);
    conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, dimensions);

    Job job = new Job(conf, "MatrixDiagonalizeJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(IntDoublePairWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MatrixDiagonalizeMapper.class);
    job.setReducerClass(MatrixDiagonalizeReducer.class);

    FileInputFormat.addInputPath(job, affInput);
    FileOutputFormat.setOutputPath(job, diagOutput);

    job.setJarByClass(MatrixDiagonalizeJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // read the results back from the path
    return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
}
From source file:org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob.java
License:Apache License
public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up the serialization of the diagonal vector
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
    markovPath = fs.makeQualified(markovPath);
    outputPath = fs.makeQualified(outputPath);
    Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
    VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);

    // set up the job itself
    Job job = new Job(conf, "VectorMatrixMultiplication");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, markovPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setJarByClass(VectorMatrixMultiplicationJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}
From source file:org.apache.mahout.clustering.spectral.eigencuts.EigencutsSensitivityJob.java
License:Apache License
/**
 * Initializes the configuration tasks, loads the needed data into
 * the HDFS cache, and executes the job.
 *
 * @param eigenvalues Vector of eigenvalues
 * @param diagonal Vector representing the diagonal matrix
 * @param eigenvectors Path to the DRM of eigenvectors
 * @param output Path to the output matrix (will have between n and full-rank
 *               non-zero elements)
 */
public static void runJob(Vector eigenvalues, Vector diagonal, Path eigenvectors, double beta, double tau,
        double delta, double epsilon, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {

    // save the two vectors to the distributed cache
    Configuration jobConfig = new Configuration();
    Path eigenOutputPath = new Path(output.getParent(), "eigenvalues");
    Path diagOutputPath = new Path(output.getParent(), "diagonal");
    jobConfig.set(EigencutsKeys.VECTOR_CACHE_BASE, output.getParent().getName());
    VectorCache.save(new IntWritable(EigencutsKeys.EIGENVALUES_CACHE_INDEX), eigenvalues, eigenOutputPath,
            jobConfig);
    VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diagonal, diagOutputPath, jobConfig);

    // set up the rest of the job
    jobConfig.set(EigencutsKeys.BETA, Double.toString(beta));
    jobConfig.set(EigencutsKeys.EPSILON, Double.toString(epsilon));
    jobConfig.set(EigencutsKeys.DELTA, Double.toString(delta));
    jobConfig.set(EigencutsKeys.TAU, Double.toString(tau));

    Job job = new Job(jobConfig, "EigencutsSensitivityJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(EigencutsSensitivityNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(EigencutsSensitivityMapper.class);
    job.setReducerClass(EigencutsSensitivityReducer.class);

    FileInputFormat.addInputPath(job, eigenvectors);
    FileOutputFormat.setOutputPath(job, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.mahout.clustering.spectral.MatrixDiagonalizeJob.java
License:Apache License
public static Vector runJob(Path affInput, int dimensions)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up all the job tasks
    Configuration conf = new Configuration();
    Path diagOutput = new Path(affInput.getParent(), "diagonal");
    HadoopUtil.delete(conf, diagOutput);
    conf.setInt(Keys.AFFINITY_DIMENSIONS, dimensions);

    Job job = new Job(conf, "MatrixDiagonalizeJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(IntDoublePairWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MatrixDiagonalizeMapper.class);
    job.setReducerClass(MatrixDiagonalizeReducer.class);

    FileInputFormat.addInputPath(job, affInput);
    FileOutputFormat.setOutputPath(job, diagOutput);

    job.setJarByClass(MatrixDiagonalizeJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // read the results back from the path
    return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
}
From source file:org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob.java
License:Apache License
public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up the serialization of the diagonal vector
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
    markovPath = fs.makeQualified(markovPath);
    outputPath = fs.makeQualified(outputPath);
    Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
    VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);

    // set up the job itself
    Job job = new Job(conf, "VectorMatrixMultiplication");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, markovPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setJarByClass(VectorMatrixMultiplicationJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}
From source file:org.apache.mahout.clustering.topdown.postprocessor.ClusterOutputPostProcessorDriver.java
License:Apache License
/**
 * Uses the {@link FileSystem} rename method to move the file.
 */
private static void renameFile(Writable key, FileStatus fileStatus, Configuration conf) throws IOException {
    Path path = fileStatus.getPath();
    FileSystem fileSystem = path.getFileSystem(conf);
    Path subDir = new Path(key.toString());
    Path renameTo = new Path(path.getParent(), subDir);
    fileSystem.mkdirs(renameTo);
    fileSystem.rename(path, renameTo);
}
From source file:org.apache.mahout.freqtermsets.ParallelFPStreamReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    Parameters params = new Parameters(conf.get(PFPGrowth.PFP_PARAMETERS, ""));

    intervalStart = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    intervalEnd = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
    windowSize = Long
            .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(intervalEnd - intervalStart)));
    endTimestamp = Math.min(intervalEnd, intervalStart + windowSize - 1);

    PFPGrowth.loadEarlierFHashMaps(context, params, intervalStart, idStringMap, stringIdMap);

    maxHeapSize = Integer.valueOf(params.get(PFPGrowth.MAX_HEAPSIZE, "50"));
    minSupport = Integer.valueOf(params.get(PFPGrowth.MIN_SUPPORT, "3"));
    numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);

    minWordsForLangDetection = params.getInt(MIN_WORDS_FOR_LANG_ID, MIN_WORDS_FOR_LANG_ID_DEFAULT);
    repeatHashTag = Boolean.parseBoolean(params.get(TokenIterator.PARAM_REPEAT_HASHTAG, "false"));

    long maxPatternLoadLag = Long.parseLong(
            params.get(PFPGrowth.PARAM_MAX_PATTERN_LOAD_LAG, PFPGrowth.DEFAULT_MAX_PATTERN_LOAD_LAG));

    Path mostRecentPath = null;
    Path outPath = new Path(params.get(PFPGrowth.OUTPUT));
    Path timeRoot = outPath.getParent().getParent();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] otherWindows = fs.listStatus(timeRoot);
    // List<IndexReader> earlierIndexes = Lists
    //     .<IndexReader> newArrayListWithCapacity(otherWindows.length - 1);

    for (int f = otherWindows.length - 1; f >= 0; --f) {
        Path p = otherWindows[f].getPath();
        long pathStartTime = Long.parseLong(p.getName());
        // should have used end time, but it doesn't make a difference,
        // AS LONG AS windows don't overlap
        // long timeDifference = intervalStart - pathStartTime;
        // if (timeDifference > 0 && timeDifference <= maxPatternLoadLag) {
        if (pathStartTime < intervalStart && pathStartTime > mostRecentTime) {
            p = fs.listStatus(p)[0].getPath();
            p = new Path(p, "index");
            if (fs.exists(p)) {
                mostRecentTime = pathStartTime;
                mostRecentPath = p;
                // File indexDir = FileUtils.toFile(p.toUri().toURL());
                // // FIXME: this will work only on local filesystem.. like many other parts of the code
                // Directory fisdir = new MMapDirectory(indexDir);
                // IndexReader fisIxReader = IndexReader.open(fisdir);
                // earlierIndexes.add(fisIxReader);
            }
        }
    }

    if (mostRecentPath != null) {
        // if (!earlierIndexes.isEmpty()) {
        //   fisIxMultiReader = new MultiReader(earlierIndexes.toArray(new IndexReader[0]));
        Directory fisdir = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
        fisIxReader = IndexReader.open(fisdir);
        // fisSearcher = new IndexSearcher(fisIxMultiReader);
        fisSearcher = new IndexSearcher(fisIxReader);

        fisSimilarity = new ItemSetSimilarity();
        fisSearcher.setSimilarity(fisSimilarity);

        fisQparser = new QueryParser(Version.LUCENE_36, ItemSetIndexBuilder.AssocField.ITEMSET.name, ANALYZER);
        fisQparser.setDefaultOperator(Operator.AND);

        timeWeigth = TimeWeightFunction.getDefault(params);
    }
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * Generates the fList from the serialized string representation.
 *
 * @return the deserialized feature frequency list
 */
public static OpenObjectLongHashMap<String> readOlderCachedFLists(Configuration conf, long currWindowStart,
        TimeWeightFunction weightFunction) throws IOException {
    OpenObjectLongHashMap<String> list = new OpenObjectLongHashMap<String>();
    Path[] files = DistributedCache.getLocalCacheFiles(conf);
    if (files == null) {
        throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    for (int i = 0; i < files.length; ++i) {
        FileSystem fs = FileSystem.getLocal(conf);
        Path fListLocalPath = fs.makeQualified(files[i]);
        // Fall back if we are running locally.
        if (!fs.exists(fListLocalPath)) {
            URI[] filesURIs = DistributedCache.getCacheFiles(conf);
            if (filesURIs == null) {
                throw new IOException("Cannot read Frequency list from Distributed Cache");
            }
            fListLocalPath = new Path(filesURIs[i].getPath());
        }
        long listWindowStart = Long.parseLong(fListLocalPath.getParent().getParent().getName());
        for (Pair<Text, LongWritable> record :
                new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true, conf)) {
            String token = record.getFirst().toString();
            list.put(token, Math.round(list.get(token)
                    + weightFunction.apply(record.getSecond().get(), listWindowStart, currWindowStart)));
        }
    }
    return list;
}