List of usage examples for org.apache.mahout.common.iterator.sequencefile PathFilters partFilter
public static PathFilter partFilter()
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/** * @param topicModelStateTemp/*w w w . ja v a 2 s.c om*/ * @param iteration * @return {@code double[2]} where first value is perplexity and second is model weight of those * documents sampled during perplexity computation, or {@code null} if no perplexity data * exists for the given iteration. * @throws IOException */ public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration) throws IOException { Path perplexityPath = perplexityPath(topicModelStateTemp, iteration); FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf); if (!fs.exists(perplexityPath)) { log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath); return Double.NaN; } double perplexity = 0; double modelWeight = 0; long n = 0; for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>( perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) { modelWeight += pair.getFirst().get(); perplexity += pair.getSecond().get(); n++; } log.info("Read {} entries with total perplexity {} and model weight {}", new Object[] { n, perplexity, modelWeight }); return perplexity / modelWeight; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setMapperClass(CVB0DocInferenceMapper.class); job.setNumReduceTasks(0);//from ww w. j a v a 2s .c om job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileSystem fs = FileSystem.get(corpus.toUri(), conf); if (modelInput != null && fs.exists(modelInput)) { FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter()); URI[] modelUris = new URI[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelUris[i] = statuses[i].getPath().toUri(); } DistributedCache.setCacheFiles(modelUris, conf); } setModelPaths(job, modelInput);//bug:mahout-1147 FileInputFormat.addInputPath(job, corpus); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(CVB0Driver.class); job.submit(); return job; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static void setModelPaths(Job job, Path modelPath) throws IOException { Configuration conf = job.getConfiguration(); if (modelPath == null || !FileSystem.get(modelPath.toUri(), conf).exists(modelPath)) { return;//from w ww . java 2 s . c o m } FileStatus[] statuses = FileSystem.get(modelPath.toUri(), conf).listStatus(modelPath, PathFilters.partFilter()); Preconditions.checkState(statuses.length > 0, "No part files found in model path '%s'", modelPath.toString()); String[] modelPaths = new String[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelPaths[i] = statuses[i].getPath().toUri().toString(); } conf.setStrings(MODEL_PATHS, modelPaths); }
From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/* w w w .ja v a 2 s. c o m*/ addOutputOption(); addOption(addOption(DefaultOptionCreator.overwriteOption().create())); addOption("model", "m", "The path to the model built during training", true); addOption( buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false))); addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false))); addOption("labelIndex", "l", "The path to the location of the label index", true); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), getOutputPath()); } boolean complementary = hasOption("testComplementary"); boolean sequential = hasOption("runSequential"); if (sequential) { FileSystem fs = FileSystem.get(getConf()); NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf()); AbstractNaiveBayesClassifier classifier; if (complementary) { classifier = new ComplementaryNaiveBayesClassifier(model); } else { classifier = new StandardNaiveBayesClassifier(model); } SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class, VectorWritable.class); SequenceFile.Reader reader = new SequenceFile.Reader(fs, getInputPath(), getConf()); Text key = new Text(); VectorWritable vw = new VectorWritable(); while (reader.next(key, vw)) { writer.append(new Text(SLASH.split(key.toString())[1]), new VectorWritable(classifier.classifyFull(vw.get()))); } writer.close(); reader.close(); } else { boolean succeeded = runMapReduce(parsedArgs); if (!succeeded) { return -1; } } //load the labels Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex"))); //loop over the results and create the confusion matrix SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>( getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf()); ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT"); analyzeResults(labelMap, dirIterable, analyzer); log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer); return 0; }
From source file:com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*w ww. java 2s . c o m*/ addOutputOption(); addOption(addOption(DefaultOptionCreator.overwriteOption().create())); addOption("model", "m", "The path to the model built during training", true); addOption( buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false))); addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false))); addOption("labelIndex", "l", "The path to the location of the label index", true); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), getOutputPath()); } boolean complementary = hasOption("testComplementary"); boolean sequential = hasOption("runSequential"); if (sequential) { FileSystem fs = FileSystem.get(getConf()); NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf()); AbstractNaiveBayesClassifier classifier; if (complementary) { classifier = new ComplementaryNaiveBayesClassifier(model); } else { classifier = new StandardNaiveBayesClassifier(model); } SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class, VectorWritable.class); Reader reader = new Reader(fs, getInputPath(), getConf()); Text key = new Text(); VectorWritable vw = new VectorWritable(); while (reader.next(key, vw)) { writer.append(new Text(SLASH.split(key.toString())[1]), new VectorWritable(classifier.classifyFull(vw.get()))); } writer.close(); reader.close(); } else { boolean succeeded = runMapReduce(parsedArgs); if (!succeeded) { return -1; } } //load the labels Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex"))); //loop over the results and create the confusion matrix SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>( getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf()); ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT"); analyzeResults(labelMap, dirIterable, analyzer); log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer); return 0; }
From source file:com.pocketx.gravity.common.TasteHadoopUtils.java
License:Apache License
/** * Reads a binary mapping file/* www . j a va2s.co m*/ */ public static OpenIntLongHashMap readItemIDIndexMap(String itemIDIndexPathStr, Configuration conf) { OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap(); Path itemIDIndexPath = new Path(itemIDIndexPathStr); for (Pair<VarIntWritable, VarLongWritable> record : new SequenceFileDirIterable<VarIntWritable, VarLongWritable>( itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) { indexItemIDMap.put(record.getFirst().get(), record.getSecond().get()); } return indexItemIDMap; }
From source file:de.tuberlin.dima.recsys.ssnmm.ratingprediction.Evaluate.java
License:Apache License
public static void main(String[] args) throws IOException { int numUsers = 1823179; int numItems = 136736; double mu = 3.157255412010664; String distributedSimilarityMatrixPath = "/home/ssc/Desktop/yahoo/similarityMatrix/"; String itemBiasesFilePath = "/home/ssc/Desktop/yahoo/itemBiases.tsv"; String userBiasesFilePath = "/home/ssc/Desktop/yahoo/userBiases.tsv"; String trainingSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/songs.tsv"; String holdoutSetPath = "home/ssc/Entwicklung/datasets/yahoo-songs/holdout.tsv"; Matrix similarities = new SparseRowMatrix(numItems, numItems); System.out.println("Reading similarities..."); int similaritiesRead = 0; Configuration conf = new Configuration(); for (Pair<IntWritable, VectorWritable> pair : new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(distributedSimilarityMatrixPath), PathType.LIST, PathFilters.partFilter(), conf)) { int item = pair.getFirst().get(); Iterator<Vector.Element> elements = pair.getSecond().get().iterateNonZero(); while (elements.hasNext()) { Vector.Element elem = elements.next(); similarities.setQuick(item, elem.index(), elem.get()); similaritiesRead++;/*w ww . j a v a2 s . com*/ } } System.out.println("Found " + similaritiesRead + " similarities"); Pattern sep = Pattern.compile("\t"); double[] itemBiases = new double[numItems]; double[] userBiases = new double[numUsers]; System.out.println("Reading item biases"); for (String line : new FileLineIterable(new File(itemBiasesFilePath))) { String[] parts = sep.split(line); itemBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]); } System.out.println("Reading user biases"); for (String line : new FileLineIterable(new File(userBiasesFilePath))) { String[] parts = sep.split(line); userBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]); } Iterator<Rating> trainRatings = new RatingsIterable(new File(trainingSetPath)).iterator(); Iterator<Rating> heldOutRatings = new RatingsIterable(new File(holdoutSetPath)).iterator(); int currentUser = 0; OpenIntDoubleHashMap prefs = new OpenIntDoubleHashMap(); int usersProcessed = 0; RunningAverage rmse = new FullRunningAverage(); RunningAverage mae = new FullRunningAverage(); RunningAverage rmseBase = new FullRunningAverage(); RunningAverage maeBase = new FullRunningAverage(); while (trainRatings.hasNext()) { Rating rating = trainRatings.next(); if (rating.user() != currentUser) { for (int n = 0; n < 10; n++) { Rating heldOutRating = heldOutRatings.next(); Preconditions.checkState(heldOutRating.user() == currentUser); double preference = 0.0; double totalSimilarity = 0.0; int count = 0; Iterator<Vector.Element> similarItems = similarities.viewRow(heldOutRating.item()) .iterateNonZero(); while (similarItems.hasNext()) { Vector.Element similarity = similarItems.next(); int similarItem = similarity.index(); if (prefs.containsKey(similarItem)) { preference += similarity.get() * (prefs.get(similarItem) - (mu + userBiases[currentUser] + itemBiases[similarItem])); totalSimilarity += Math.abs(similarity.get()); count++; } } double baselineEstimate = mu + userBiases[currentUser] + itemBiases[heldOutRating.item()]; double estimate = baselineEstimate; if (count > 1) { estimate += preference / totalSimilarity; } double baseError = Math.abs(heldOutRating.rating() - baselineEstimate); maeBase.addDatum(baseError); rmseBase.addDatum(baseError * baseError); double error = Math.abs(heldOutRating.rating() - estimate); mae.addDatum(error); rmse.addDatum(error * error); } if (++usersProcessed % 10000 == 0) { System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage())); } currentUser = rating.user(); prefs.clear(); } prefs.put(rating.item(), rating.rating()); } System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage())); }
From source file:edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java
License:Apache License
/** Configure the mapper with the cluster info */ public static void configureWithClusterInfo(Configuration conf, Path clusterPath, Collection<Cluster> clusters) { for (Writable value : new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST, PathFilters.partFilter(), conf)) { Class<? extends Writable> valueClass = value.getClass(); if (valueClass.equals(Cluster.class)) { // get the cluster info clusters.add((Cluster) value); } else if (valueClass.equals(Canopy.class)) { // get the cluster info Canopy canopy = (Canopy) value; clusters.add(new Cluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure())); } else {// w w w. j a v a2s .c om throw new IllegalStateException("Bad value class: " + valueClass); } } }
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) { float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f); // read feature sums and label sums Vector scoresPerLabel = null; Vector scoresPerFeature = null; for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>( new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) { String key = record.getFirst().toString(); VectorWritable value = record.getSecond(); if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) { scoresPerFeature = value.get(); } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) { scoresPerLabel = value.get(); }//from w w w . j a v a 2 s . c o m } // Preconditions.checkNotNull(scoresPerFeature); // Preconditions.checkNotNull(scoresPerLabel); Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size()); for (Pair<IntWritable, VectorWritable> entry : new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(), conf)) { scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get()); } Vector perlabelThetaNormalizer = scoresPerLabel.like(); /* * for (Pair<Text,VectorWritable> entry : new * SequenceFileDirIterable<Text,VectorWritable>( new Path(base, * TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), * conf)) { if (entry.getFirst().toString().equals(TrainNaiveBayesJob. * LABEL_THETA_NORMALIZER)) { perlabelThetaNormalizer = * entry.getSecond().get(); } } * * Preconditions.checkNotNull(perlabelThetaNormalizer); */ return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel, perlabelThetaNormalizer, alphaI, false); }
From source file:mlbench.bayes.test.BayesTest.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);//from w w w .ja v a2 s.co m HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, config); classifier = new StandardNaiveBayesClassifier(model); MPI_D.COMM_BIPARTITE_O.Barrier(); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); Text key = kvrr.createKey(); VectorWritable value = kvrr.createValue(); while (kvrr.next(key, value)) { Vector result = classifier.classifyFull(value.get()); MPI_D.Send(new Text(SLASH.split(key.toString())[1]), new VectorWritable(result)); } } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDir); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(Text.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<Text, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } Text key = null; VectorWritable point = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { key = (Text) vals[0]; point = (VectorWritable) vals[1]; if (key != null && point != null) { vector = point.get(); outrw.write(key, new VectorWritable(vector)); } vals = MPI_D.Recv(); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } MPI_D.COMM_BIPARTITE_A.Barrier(); if (rank == 0) { // load the labels Map<Integer, String> labelMap = BayesUtils.readLabelIndex(config, labPath); // loop over the results and create the confusion matrix SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>( output, PathType.LIST, PathFilters.partFilter(), config); ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT"); analyzeResults(labelMap, dirIterable, analyzer); } } MPI_D.Finalize(); }