List of usage examples for the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable constructor
public SequenceFileDirIterable(Path path, PathType pathType, PathFilter filter, Configuration conf)
From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java
License:Open Source License
public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields) throws SAXException, IOException, ParserConfigurationException { BasicDBList dbl = new BasicDBList(); PropertiesManager props = new PropertiesManager(); Configuration conf = getConfiguration(props); Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false); @SuppressWarnings({ "unchecked", "rawtypes" }) SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable( pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf); // Very basic, only allow top level, 1 level of nesting, and field removal HashSet<String> fieldLookup = null; if (null != fields) { fieldLookup = new HashSet<String>(); String[] fieldArray = fields.split(","); for (String field : fieldArray) { String[] fieldDecomp = field.split(":"); fieldLookup.add(fieldDecomp[0]); }/*from ww w.j a va 2 s. co m*/ } //TOTEST int nRecords = 0; for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) { BasicDBObject element = new BasicDBObject(); // KEY Writable key = record.getFirst(); if (key instanceof org.apache.hadoop.io.Text) { org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key; element.put("key", writable.toString()); } else if (key instanceof org.apache.hadoop.io.DoubleWritable) { org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key; element.put("key", Double.toString(writable.get())); } else if (key instanceof org.apache.hadoop.io.IntWritable) { org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key; element.put("key", Integer.toString(writable.get())); } else if (key instanceof org.apache.hadoop.io.LongWritable) { org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key; element.put("key", Long.toString(writable.get())); } else if (key instanceof BSONWritable) { element.put("key", MongoDbUtil.convert((BSONWritable) key)); } // VALUE 
Writable value = record.getSecond(); if (value instanceof org.apache.hadoop.io.Text) { org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value; element.put("value", writable.toString()); } else if (value instanceof org.apache.hadoop.io.DoubleWritable) { org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value; element.put("value", Double.toString(writable.get())); } else if (value instanceof org.apache.hadoop.io.IntWritable) { org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value; element.put("value", Integer.toString(writable.get())); } else if (value instanceof org.apache.hadoop.io.LongWritable) { org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value; element.put("value", Long.toString(writable.get())); } else if (value instanceof BSONWritable) { element.put("value", MongoDbUtil.convert((BSONWritable) value)); } else if (value instanceof org.apache.mahout.math.VectorWritable) { Vector vec = ((org.apache.mahout.math.VectorWritable) value).get(); BasicDBList dbl2 = listFromMahoutVector(vec, "value", element); element.put("value", dbl2); } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) { org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value; element.put("valueWeight", vecW.getWeight()); BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element); element.put("value", dbl2); } else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) { Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue(); BasicDBObject clusterVal = new BasicDBObject(); clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal)); clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal)); element.put("value", clusterVal); } else { 
element.put("unknownValue", value.getClass().toString()); } // Check the fields settings: // Only handle a few... if (null != fieldLookup) { for (String fieldToRemove : fieldLookup) { if (fieldToRemove.startsWith("value.")) { fieldToRemove = fieldToRemove.substring(6); BasicDBObject nested = (BasicDBObject) element.get("value."); if (null != nested) { nested.remove(fieldToRemove); } } else { element.remove(fieldToRemove); } } //TOTEST } dbl.add(element); nRecords++; if ((nLimit > 0) && (nRecords >= nLimit)) { break; } } return dbl; }
From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java
License:Apache License
/**
 * Classifies the input vectors with a trained naive Bayes model (sequentially or
 * via map-reduce), then reads the classification output back and logs the
 * result analysis.
 *
 * @param args command-line arguments; see the options registered below
 * @return 0 on success, -1 if argument parsing or the map-reduce job fails
 * @throws Exception on I/O or job failures
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(addOption(DefaultOptionCreator.overwriteOption().create()));
    addOption("model", "m", "The path to the model built during training", true);
    addOption(
            buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        // Close both streams even when classification or append fails part-way through.
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, getInputPath(), getConf());
            try {
                Text key = new Text();
                VectorWritable vw = new VectorWritable();
                while (reader.next(key, vw)) {
                    // The output key is the second SLASH-separated component of the input key.
                    writer.append(new Text(SLASH.split(key.toString())[1]),
                            new VectorWritable(classifier.classifyFull(vw.get())));
                }
            } finally {
                reader.close();
            }
        } finally {
            writer.close();
        }
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    //load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    //loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable =
            new SequenceFileDirIterable<Text, VectorWritable>(
                    getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}
From source file:com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/* ww w . j a v a 2s. c o m*/ addOutputOption(); addOption(addOption(DefaultOptionCreator.overwriteOption().create())); addOption("model", "m", "The path to the model built during training", true); addOption( buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false))); addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false))); addOption("labelIndex", "l", "The path to the location of the label index", true); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), getOutputPath()); } boolean complementary = hasOption("testComplementary"); boolean sequential = hasOption("runSequential"); if (sequential) { FileSystem fs = FileSystem.get(getConf()); NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf()); AbstractNaiveBayesClassifier classifier; if (complementary) { classifier = new ComplementaryNaiveBayesClassifier(model); } else { classifier = new StandardNaiveBayesClassifier(model); } SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class, VectorWritable.class); Reader reader = new Reader(fs, getInputPath(), getConf()); Text key = new Text(); VectorWritable vw = new VectorWritable(); while (reader.next(key, vw)) { writer.append(new Text(SLASH.split(key.toString())[1]), new VectorWritable(classifier.classifyFull(vw.get()))); } writer.close(); reader.close(); } else { boolean succeeded = runMapReduce(parsedArgs); if (!succeeded) { return -1; } } //load the labels Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex"))); //loop over the results and create the confusion matrix SequenceFileDirIterable<Text, VectorWritable> dirIterable = new 
SequenceFileDirIterable<Text, VectorWritable>( getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf()); ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT"); analyzeResults(labelMap, dirIterable, analyzer); log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer); return 0; }
From source file:com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java
License:Apache License
/**
 * Writes the label index to {@code labPath}, taking the labels either from the
 * LABELS option (comma-separated) or, if EXTRACT_LABELS is set, from the
 * sequence files under the input path.
 *
 * @param labPath destination of the label index
 * @return the value reported by {@code BayesUtils.writeLabelIndex}, or 0 when
 *         neither option is present
 * @throws IOException if reading the input or writing the index fails
 */
private long createLabelIndex(Path labPath) throws IOException {
    if (hasOption(LABELS)) {
        Iterable<String> explicitLabels = Splitter.on(",").split(getOption(LABELS));
        return BayesUtils.writeLabelIndex(getConf(), explicitLabels, labPath);
    }
    if (hasOption(EXTRACT_LABELS)) {
        Iterable<Pair<Text, IntWritable>> inputRecords = new SequenceFileDirIterable<Text, IntWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), getConf());
        return BayesUtils.writeLabelIndex(getConf(), labPath, inputRecords);
    }
    return 0;
}
From source file:de.tuberlin.dima.recsys.ssnmm.ratingprediction.Evaluate.java
License:Apache License
public static void main(String[] args) throws IOException { int numUsers = 1823179; int numItems = 136736; double mu = 3.157255412010664; String distributedSimilarityMatrixPath = "/home/ssc/Desktop/yahoo/similarityMatrix/"; String itemBiasesFilePath = "/home/ssc/Desktop/yahoo/itemBiases.tsv"; String userBiasesFilePath = "/home/ssc/Desktop/yahoo/userBiases.tsv"; String trainingSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/songs.tsv"; String holdoutSetPath = "home/ssc/Entwicklung/datasets/yahoo-songs/holdout.tsv"; Matrix similarities = new SparseRowMatrix(numItems, numItems); System.out.println("Reading similarities..."); int similaritiesRead = 0; Configuration conf = new Configuration(); for (Pair<IntWritable, VectorWritable> pair : new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(distributedSimilarityMatrixPath), PathType.LIST, PathFilters.partFilter(), conf)) { int item = pair.getFirst().get(); Iterator<Vector.Element> elements = pair.getSecond().get().iterateNonZero(); while (elements.hasNext()) { Vector.Element elem = elements.next(); similarities.setQuick(item, elem.index(), elem.get()); similaritiesRead++;//www .j a v a2 s.c om } } System.out.println("Found " + similaritiesRead + " similarities"); Pattern sep = Pattern.compile("\t"); double[] itemBiases = new double[numItems]; double[] userBiases = new double[numUsers]; System.out.println("Reading item biases"); for (String line : new FileLineIterable(new File(itemBiasesFilePath))) { String[] parts = sep.split(line); itemBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]); } System.out.println("Reading user biases"); for (String line : new FileLineIterable(new File(userBiasesFilePath))) { String[] parts = sep.split(line); userBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]); } Iterator<Rating> trainRatings = new RatingsIterable(new File(trainingSetPath)).iterator(); Iterator<Rating> heldOutRatings = new RatingsIterable(new 
File(holdoutSetPath)).iterator(); int currentUser = 0; OpenIntDoubleHashMap prefs = new OpenIntDoubleHashMap(); int usersProcessed = 0; RunningAverage rmse = new FullRunningAverage(); RunningAverage mae = new FullRunningAverage(); RunningAverage rmseBase = new FullRunningAverage(); RunningAverage maeBase = new FullRunningAverage(); while (trainRatings.hasNext()) { Rating rating = trainRatings.next(); if (rating.user() != currentUser) { for (int n = 0; n < 10; n++) { Rating heldOutRating = heldOutRatings.next(); Preconditions.checkState(heldOutRating.user() == currentUser); double preference = 0.0; double totalSimilarity = 0.0; int count = 0; Iterator<Vector.Element> similarItems = similarities.viewRow(heldOutRating.item()) .iterateNonZero(); while (similarItems.hasNext()) { Vector.Element similarity = similarItems.next(); int similarItem = similarity.index(); if (prefs.containsKey(similarItem)) { preference += similarity.get() * (prefs.get(similarItem) - (mu + userBiases[currentUser] + itemBiases[similarItem])); totalSimilarity += Math.abs(similarity.get()); count++; } } double baselineEstimate = mu + userBiases[currentUser] + itemBiases[heldOutRating.item()]; double estimate = baselineEstimate; if (count > 1) { estimate += preference / totalSimilarity; } double baseError = Math.abs(heldOutRating.rating() - baselineEstimate); maeBase.addDatum(baseError); rmseBase.addDatum(baseError * baseError); double error = Math.abs(heldOutRating.rating() - estimate); mae.addDatum(error); rmse.addDatum(error * error); } if (++usersProcessed % 10000 == 0) { System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage())); } currentUser = rating.user(); prefs.clear(); } prefs.put(rating.item(), rating.rating()); } System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + 
Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage())); }
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) { float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f); // read feature sums and label sums Vector scoresPerLabel = null; Vector scoresPerFeature = null; for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>( new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) { String key = record.getFirst().toString(); VectorWritable value = record.getSecond(); if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) { scoresPerFeature = value.get(); } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) { scoresPerLabel = value.get(); }/*w w w . ja v a 2 s . co m*/ } // Preconditions.checkNotNull(scoresPerFeature); // Preconditions.checkNotNull(scoresPerLabel); Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size()); for (Pair<IntWritable, VectorWritable> entry : new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(), conf)) { scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get()); } Vector perlabelThetaNormalizer = scoresPerLabel.like(); /* * for (Pair<Text,VectorWritable> entry : new * SequenceFileDirIterable<Text,VectorWritable>( new Path(base, * TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), * conf)) { if (entry.getFirst().toString().equals(TrainNaiveBayesJob. * LABEL_THETA_NORMALIZER)) { perlabelThetaNormalizer = * entry.getSecond().get(); } } * * Preconditions.checkNotNull(perlabelThetaNormalizer); */ return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel, perlabelThetaNormalizer, alphaI, false); }
From source file:mlbench.bayes.test.BayesTest.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException { parseArgs(args);//from w w w . j ava2 s. c o m HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, config); classifier = new StandardNaiveBayesClassifier(model); MPI_D.COMM_BIPARTITE_O.Barrier(); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, (JobConf) config, inDir, rank); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config, fsplit); Text key = kvrr.createKey(); VectorWritable value = kvrr.createValue(); while (kvrr.next(key, value)) { Vector result = classifier.classifyFull(value.get()); MPI_D.Send(new Text(SLASH.split(key.toString())[1]), new VectorWritable(result)); } } } else if (MPI_D.COMM_BIPARTITE_A != null) { int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); config.set(MAPRED_OUTPUT_DIR, outDir); config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString()); ((JobConf) config).setOutputKeyClass(Text.class); ((JobConf) config).setOutputValueClass(VectorWritable.class); TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, DataMPIUtil.getHadoopTaskAttemptID()); SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>(); FileSystem fs = FileSystem.get(config); Path output = new Path(config.get(MAPRED_OUTPUT_DIR)); FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext); RecordWriter<Text, VectorWritable> outrw = null; try { fcommitter.setupJob(taskContext); outrw = outfile.getRecordWriter(fs, (JobConf) config, 
getOutputName(rank), null); } catch (IOException e) { e.printStackTrace(); System.err.println("ERROR: Please set the HDFS configuration properly\n"); System.exit(-1); } Text key = null; VectorWritable point = null; Vector vector = null; Object[] vals = MPI_D.Recv(); while (vals != null) { key = (Text) vals[0]; point = (VectorWritable) vals[1]; if (key != null && point != null) { vector = point.get(); outrw.write(key, new VectorWritable(vector)); } vals = MPI_D.Recv(); } outrw.close(null); if (fcommitter.needsTaskCommit(taskContext)) { fcommitter.commitTask(taskContext); } MPI_D.COMM_BIPARTITE_A.Barrier(); if (rank == 0) { // load the labels Map<Integer, String> labelMap = BayesUtils.readLabelIndex(config, labPath); // loop over the results and create the confusion matrix SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>( output, PathType.LIST, PathFilters.partFilter(), config); ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT"); analyzeResults(labelMap, dirIterable, analyzer); } } MPI_D.Finalize(); }
From source file:mlbench.bayes.train.IndexInstances.java
License:Apache License
/**
 * Writes the label index to {@code labPath} from the (Text, IntWritable)
 * records found in the sequence files under {@code inPath}.
 *
 * @param labPath destination of the label index
 * @return the value reported by {@code BayesUtils.writeLabelIndex}
 * @throws IOException if reading the input or writing the index fails
 */
private static long createLabelIndex(Path labPath) throws IOException {
    Iterable<Pair<Text, IntWritable>> labelledRecords = new SequenceFileDirIterable<Text, IntWritable>(
            inPath, PathType.LIST, PathFilters.logsCRCFilter(), config);
    return BayesUtils.writeLabelIndex(config, labPath, labelledRecords);
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.ALS.java
License:Apache License
/**
 * Loads every (row index, row vector) record from the part files in {@code dir}
 * into a map keyed by row index.
 *
 * @param dir  directory containing the sequence-file parts of the matrix
 * @param conf Hadoop configuration used to read the files
 * @return a map from row index to row vector; a later record with the same
 *         index replaces an earlier one
 */
public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) {
    OpenIntObjectHashMap<Vector> rowsByIndex = new OpenIntObjectHashMap<>();
    SequenceFileDirIterable<IntWritable, VectorWritable> rowRecords =
            new SequenceFileDirIterable<IntWritable, VectorWritable>(dir, PathType.LIST,
                    PathFilters.partFilter(), conf);
    for (Pair<IntWritable, VectorWritable> rowRecord : rowRecords) {
        rowsByIndex.put(rowRecord.getFirst().get(), rowRecord.getSecond().get());
    }
    return rowsByIndex;
}
From source file:org.gpfvic.mahout.cf.taste.hadoop.als.FactorizationEvaluator.java
License:Apache License
/**
 * Computes the root-mean-square of the error values stored as keys in the
 * sequence files under {@code errors}.
 *
 * @param errors directory of (DoubleWritable, NullWritable) records, one per error
 * @return the square root of the average squared error
 */
double computeRmse(Path errors) {
    RunningAverage meanSquaredError = new FullRunningAverage();
    SequenceFileDirIterable<DoubleWritable, NullWritable> errorRecords =
            new SequenceFileDirIterable<DoubleWritable, NullWritable>(
                    errors, PathType.LIST, PathFilters.logsCRCFilter(), getConf());
    for (Pair<DoubleWritable, NullWritable> errorRecord : errorRecords) {
        double err = errorRecord.getFirst().get();
        meanSquaredError.addDatum(err * err);
    }
    double mse = meanSquaredError.getAverage();
    return Math.sqrt(mse);
}