List of usage examples for the org.apache.mahout.math.RandomAccessSparseVector constructor
public RandomAccessSparseVector(int cardinality, int initialCapacity)
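Before the full examples below, a minimal self-contained sketch of this constructor in isolation. The class name RandomAccessSparseVectorDemo and the cardinality/capacity values are illustrative assumptions, not taken from any of the source files:

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class RandomAccessSparseVectorDemo {
    public static void main(String[] args) {
        // cardinality: number of addressable dimensions;
        // initialCapacity: a sizing hint for the expected number of non-default entries
        Vector v = new RandomAccessSparseVector(10000, 8);
        v.set(7, 3.5);       // bounds-checked write
        v.setQuick(42, 1.0); // write without bounds checking
        System.out.println(v.getNumNondefaultElements()); // 2
        System.out.println(v.get(42));                    // 1.0
    }
}

Note the pattern used throughout the examples below: a large (often Integer.MAX_VALUE) cardinality with a small initial capacity is cheap, because only non-default entries are stored.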
From source file: cn.edu.bjtu.cit.recommender.Recommender.java
License: Apache License
@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println();
        System.err.println("Usage: " + this.getClass().getName()
                + " [generic options] input output [profiling] [estimation] [clustersize]");
        System.err.println();
        printUsage();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }
    OptionParser parser = new OptionParser(args);
    Pipeline pipeline = new MRPipeline(Recommender.class, getConf());
    if (parser.hasOption(CLUSTER_SIZE)) {
        pipeline.getConfiguration().setInt(ClusterOracle.CLUSTER_SIZE,
                Integer.parseInt(parser.getOption(CLUSTER_SIZE).getValue()));
    }
    if (parser.hasOption(PROFILING)) {
        pipeline.getConfiguration().setBoolean(Profiler.IS_PROFILE, true);
        this.profileFilePath = parser.getOption(PROFILING).getValue();
    }
    if (parser.hasOption(ESTIMATION)) {
        estFile = parser.getOption(ESTIMATION).getValue();
        est = new Estimator(estFile, clusterSize);
    }
    if (parser.hasOption(OPT_REDUCE)) {
        pipeline.getConfiguration().setBoolean(OPT_REDUCE, true);
    }
    if (parser.hasOption(OPT_MSCR)) {
        pipeline.getConfiguration().setBoolean(OPT_MSCR, true);
    }
    if (parser.hasOption(ACTIVE_THRESHOLD)) {
        threshold = Integer.parseInt(parser.getOption("at").getValue());
    }
    if (parser.hasOption(TOP)) {
        top = Integer.parseInt(parser.getOption("top").getValue());
    }
    profiler = new Profiler(pipeline);
    /*
     * input node
     */
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    if (profiler.isProfiling() && lines.getSize() > 10 * 1024 * 1024) {
        lines = lines.sample(0.1);
    }
    /*
     * S0 + GBK
     */
    PGroupedTable<Long, Long> userWithPrefs = lines.parallelDo(new MapFn<String, Pair<Long, Long>>() {
        @Override
        public Pair<Long, Long> map(String input) {
            String[] split = input.split(Estimator.DELM);
            long userID = Long.parseLong(split[0]);
            long itemID = Long.parseLong(split[1]);
            return Pair.of(userID, itemID);
        }

        @Override
        public float scaleFactor() {
            return est.getScaleFactor("S0").sizeFactor;
        }

        @Override
        public float scaleFactorByRecord() {
            return est.getScaleFactor("S0").recsFactor;
        }
    }, Writables.tableOf(Writables.longs(), Writables.longs())).groupByKey(est.getClusterSize());
    /*
     * S1
     */
    PTable<Long, Vector> userVector = userWithPrefs
            .parallelDo(new MapFn<Pair<Long, Iterable<Long>>, Pair<Long, Vector>>() {
                @Override
                public Pair<Long, Vector> map(Pair<Long, Iterable<Long>> input) {
                    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (long itemPref : input.second()) {
                        userVector.set((int) itemPref, 1.0f);
                    }
                    return Pair.of(input.first(), userVector);
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S1").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S1").recsFactor;
                }
            }, Writables.tableOf(Writables.longs(), Writables.vectors()));
    userVector = profiler.profile("S0-S1", pipeline, userVector, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));
    /*
     * S2
     */
    PTable<Long, Vector> filteredUserVector = userVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Long, Vector>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Long, Vector>> emitter) {
                    if (input.second().getNumNondefaultElements() > threshold) {
                        emitter.emit(input);
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S2").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S2").recsFactor;
                }
            }, Writables.tableOf(Writables.longs(), Writables.vectors()));
    filteredUserVector = profiler.profile("S2", pipeline, filteredUserVector, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));
    /*
     * S3 + GBK
     */
    PGroupedTable<Integer, Integer> coOccurencePairs = filteredUserVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, Integer>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, Integer>> emitter) {
                    Iterator<Vector.Element> it = input.second().iterateNonZero();
                    while (it.hasNext()) {
                        int index1 = it.next().index();
                        Iterator<Vector.Element> it2 = input.second().iterateNonZero();
                        while (it2.hasNext()) {
                            int index2 = it2.next().index();
                            emitter.emit(Pair.of(index1, index2));
                        }
                    }
                }

                @Override
                public float scaleFactor() {
                    float size = est.getScaleFactor("S3").sizeFactor;
                    return size;
                }

                @Override
                public float scaleFactorByRecord() {
                    float recs = est.getScaleFactor("S3").recsFactor;
                    return recs;
                }
            }, Writables.tableOf(Writables.ints(), Writables.ints())).groupByKey(est.getClusterSize());
    /*
     * S4
     */
    PTable<Integer, Vector> coOccurenceVector = coOccurencePairs
            .parallelDo(new MapFn<Pair<Integer, Iterable<Integer>>, Pair<Integer, Vector>>() {
                @Override
                public Pair<Integer, Vector> map(Pair<Integer, Iterable<Integer>> input) {
                    Vector cooccurrenceRow = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (int itemIndex2 : input.second()) {
                        cooccurrenceRow.set(itemIndex2, cooccurrenceRow.get(itemIndex2) + 1.0);
                    }
                    return Pair.of(input.first(), cooccurrenceRow);
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S4").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S4").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), Writables.vectors()));
    coOccurenceVector = profiler.profile("S3-S4", pipeline, coOccurenceVector, ProfileConverter.int_vector(),
            Writables.tableOf(Writables.ints(), Writables.vectors()));
    /*
     * S5 Wrapping co-occurrence columns
     */
    PTable<Integer, VectorOrPref> wrappedCooccurrence = coOccurenceVector
            .parallelDo(new MapFn<Pair<Integer, Vector>, Pair<Integer, VectorOrPref>>() {
                @Override
                public Pair<Integer, VectorOrPref> map(Pair<Integer, Vector> input) {
                    return Pair.of(input.first(), new VectorOrPref(input.second()));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S5").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S5").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));
    wrappedCooccurrence = profiler.profile("S5", pipeline, wrappedCooccurrence, ProfileConverter.int_vopv(),
            Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));
    /*
     * S6 Splitting user vectors
     */
    PTable<Integer, VectorOrPref> userVectorSplit = filteredUserVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, VectorOrPref>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, VectorOrPref>> emitter) {
                    long userID = input.first();
                    Vector userVector = input.second();
                    Iterator<Vector.Element> it = userVector.iterateNonZero();
                    while (it.hasNext()) {
                        Vector.Element e = it.next();
                        int itemIndex = e.index();
                        float preferenceValue = (float) e.get();
                        emitter.emit(Pair.of(itemIndex, new VectorOrPref(userID, preferenceValue)));
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S6").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S6").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));
    userVectorSplit = profiler.profile("S6", pipeline, userVectorSplit, ProfileConverter.int_vopp(),
            Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));
    /*
     * S7 Combine VectorOrPrefs
     */
    PTable<Integer, VectorAndPrefs> combinedVectorOrPref = wrappedCooccurrence.union(userVectorSplit)
            .groupByKey(est.getClusterSize())
            .parallelDo(new DoFn<Pair<Integer, Iterable<VectorOrPref>>, Pair<Integer, VectorAndPrefs>>() {
                @Override
                public void process(Pair<Integer, Iterable<VectorOrPref>> input,
                        Emitter<Pair<Integer, VectorAndPrefs>> emitter) {
                    Vector vector = null;
                    List<Long> userIDs = Lists.newArrayList();
                    List<Float> values = Lists.newArrayList();
                    for (VectorOrPref vop : input.second()) {
                        if (vector == null) {
                            vector = vop.getVector();
                        }
                        long userID = vop.getUserID();
                        if (userID != Long.MIN_VALUE) {
                            userIDs.add(vop.getUserID());
                        }
                        float value = vop.getValue();
                        if (!Float.isNaN(value)) {
                            values.add(vop.getValue());
                        }
                    }
                    emitter.emit(Pair.of(input.first(), new VectorAndPrefs(vector, userIDs, values)));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S7").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S7").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorAndPrefs.vectorAndPrefs()));
    combinedVectorOrPref = profiler.profile("S5+S6-S7", pipeline, combinedVectorOrPref, ProfileConverter.int_vap(),
            Writables.tableOf(Writables.ints(), VectorAndPrefs.vectorAndPrefs()));
    /*
     * S8 Computing partial recommendation vectors
     */
    PTable<Long, Vector> partialMultiply = combinedVectorOrPref
            .parallelDo(new DoFn<Pair<Integer, VectorAndPrefs>, Pair<Long, Vector>>() {
                @Override
                public void process(Pair<Integer, VectorAndPrefs> input, Emitter<Pair<Long, Vector>> emitter) {
                    Vector cooccurrenceColumn = input.second().getVector();
                    List<Long> userIDs = input.second().getUserIDs();
                    List<Float> prefValues = input.second().getValues();
                    for (int i = 0; i < userIDs.size(); i++) {
                        long userID = userIDs.get(i);
                        if (userID != Long.MIN_VALUE) {
                            float prefValue = prefValues.get(i);
                            Vector partialProduct = cooccurrenceColumn.times(prefValue);
                            emitter.emit(Pair.of(userID, partialProduct));
                        }
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S8").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S8").recsFactor;
                }
            }, Writables.tableOf(Writables.longs(), Writables.vectors())).groupByKey(est.getClusterSize())
            .combineValues(new CombineFn<Long, Vector>() {
                @Override
                public void process(Pair<Long, Iterable<Vector>> input, Emitter<Pair<Long, Vector>> emitter) {
                    Vector partial = null;
                    for (Vector vector : input.second()) {
                        partial = partial == null ? vector : partial.plus(vector);
                    }
                    emitter.emit(Pair.of(input.first(), partial));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("combine").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("combine").recsFactor;
                }
            });
    partialMultiply = profiler.profile("S8-combine", pipeline, partialMultiply, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));
    /*
     * S9 Producing recommendations from vectors
     */
    PTable<Long, RecommendedItems> recommendedItems = partialMultiply
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Long, RecommendedItems>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Long, RecommendedItems>> emitter) {
                    Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(11,
                            Collections.reverseOrder(BY_PREFERENCE_VALUE));
                    Iterator<Vector.Element> recommendationVectorIterator = input.second().iterateNonZero();
                    while (recommendationVectorIterator.hasNext()) {
                        Vector.Element element = recommendationVectorIterator.next();
                        int index = element.index();
                        float value = (float) element.get();
                        if (topItems.size() < top) {
                            topItems.add(new GenericRecommendedItem(index, value));
                        } else if (value > topItems.peek().getValue()) {
                            topItems.add(new GenericRecommendedItem(index, value));
                            topItems.poll();
                        }
                    }
                    List<RecommendedItem> recommendations = new ArrayList<RecommendedItem>(topItems.size());
                    recommendations.addAll(topItems);
                    Collections.sort(recommendations, BY_PREFERENCE_VALUE);
                    emitter.emit(Pair.of(input.first(), new RecommendedItems(recommendations)));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S9").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S9").recsFactor;
                }
            }, Writables.tableOf(Writables.longs(), RecommendedItems.recommendedItems()));
    recommendedItems = profiler.profile("S9", pipeline, recommendedItems, ProfileConverter.long_ri(),
            Writables.tableOf(Writables.longs(), RecommendedItems.recommendedItems()));
    /*
     * Profiling
     */
    if (profiler.isProfiling()) {
        profiler.writeResultToFile(profileFilePath);
        profiler.cleanup(pipeline.getConfiguration());
        return 0;
    }
    /*
     * asText
     */
    pipeline.writeTextFile(recommendedItems, args[1]);
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
}
From source file: com.elex.dmp.vectorizer.TFPartialVectorReducer.java
License: Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset(); // Lucene token streams must be reset before consumption
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.closeQuietly(sf);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }
    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }
    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file: com.mozilla.grouperfish.pig.storage.DocumentVectorStorage.java
License: Apache License
@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple tuple) throws IOException {
    outputKey.set((String) tuple.get(0));
    Tuple vectorTuple = (Tuple) tuple.get(1);
    Vector vector = new NamedVector(new RandomAccessSparseVector(dimensions, vectorTuple.size()),
            outputKey.toString());
    for (int i = 0; i < vectorTuple.size(); i++) {
        Object o = vectorTuple.get(i);
        switch (vectorTuple.getType(i)) {
        case DataType.INTEGER:
            // If this is just an integer then we just want to set the index to 1.0
            vector.set((Integer) o, 1.0);
            break;
        case DataType.TUPLE:
            // If this is a tuple then we want to set the index and the weight
            Tuple subt = (Tuple) o;
            vector.set((Integer) subt.get(0), (Double) subt.get(1));
            break;
        default:
            throw new RuntimeException("Unexpected tuple form");
        }
    }
    outputValue.set(vector);
    try {
        writer.write(outputKey, outputValue);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while writing", e);
    }
}
From source file: com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License: Apache License
@Override
public void putNext(Tuple t) throws IOException {
    IntWritable outputKey = new IntWritable();
    VectorWritable outputValue = new VectorWritable();
    outputKey.set((Integer) t.get(0));
    Tuple currRow = (Tuple) t.get(1);
    Vector currRowVector;
    if (dimensions == 0) {
        throw new IllegalArgumentException("Trying to create 0 dimension vector");
    }
    if (STORE_AS_DENSE) {
        currRowVector = new NamedVector(new DenseVector(dimensions), outputKey.toString());
    } else if (STORE_AS_SEQUENTIAL) {
        currRowVector = new NamedVector(new SequentialAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    } else {
        currRowVector = new NamedVector(new RandomAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    }
    for (int ii = 0; ii < currRow.size(); ii++) {
        Object o = currRow.get(ii);
        switch (currRow.getType(ii)) {
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
            // cast through Number so INTEGER/LONG/FLOAT cells do not throw ClassCastException
            currRowVector.set(ii, ((Number) o).doubleValue());
            break;
        case DataType.TUPLE:
            // If this is a tuple then we want to set column and element
            Tuple subt = (Tuple) o;
            currRowVector.set((Integer) subt.get(0), (Double) subt.get(1));
            break;
        default:
            throw new RuntimeException("Unexpected tuple form");
        }
    }
    outputValue.set(currRowVector);
    try {
        writer.write(outputKey, outputValue);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while writing", e);
    }
}
From source file: com.netease.news.classifier.naivebayes.WeightsMapper.java
License: Apache License
@Override
protected void map(IntWritable index, VectorWritable value, Context ctx)
        throws IOException, InterruptedException {
    Vector instance = value.get();
    if (weightsPerFeature == null) {
        weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements());
    }
    int label = index.get();
    weightsPerFeature.assign(instance, Functions.PLUS);
    weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
}
From source file: com.pocketx.gravity.recommender.cf.similarity.mapreduce.ToItemVectorsMapper.java
License: Apache License
@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
        throws IOException, InterruptedException {
    Vector userRatings = vectorWritable.get();
    int numElementsBeforeSampling = userRatings.getNumNondefaultElements();
    userRatings = Vectors.maybeSample(userRatings, sampleSize);
    int numElementsAfterSampling = userRatings.getNumNondefaultElements();
    int column = TasteHadoopUtils.idToIndex(rowIndex.get());
    VectorWritable itemVector = new VectorWritable(new RandomAccessSparseVector(Integer.MAX_VALUE, 1));
    itemVector.setWritesLaxPrecision(true);
    Iterator<Vector.Element> iterator = userRatings.nonZeroes().iterator();
    while (iterator.hasNext()) {
        Vector.Element elem = iterator.next();
        itemVector.get().setQuick(column, elem.get());
        ctx.write(new IntWritable(elem.index()), itemVector);
    }
    ctx.getCounter(Elements.USER_RATINGS_USED).increment(numElementsAfterSampling);
    ctx.getCounter(Elements.USER_RATINGS_NEGLECTED)
            .increment(numElementsBeforeSampling - numElementsAfterSampling);
}
From source file: edu.rosehulman.mahout.math.VectorWritable.java
License: Apache License
@Override
public void readFields(DataInput in) throws IOException {
    int flags = in.readByte();
    //Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2));
    boolean dense = (flags & FLAG_DENSE) != 0;
    boolean sequential = (flags & FLAG_SEQUENTIAL) != 0;
    boolean named = (flags & FLAG_NAMED) != 0;
    boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;
    int size = Varint.readUnsignedVarInt(in);
    Vector v;
    if (dense) {
        double[] values = new double[size];
        for (int i = 0; i < size; i++) {
            values[i] = laxPrecision ? in.readFloat() : in.readDouble();
        }
        v = new DenseVector(values);
    } else {
        int numNonDefaultElements = Varint.readUnsignedVarInt(in);
        v = sequential ? new SequentialAccessSparseVector(size, numNonDefaultElements)
                : new RandomAccessSparseVector(size, numNonDefaultElements);
        if (sequential) {
            // sequential vectors store ascending indices delta-encoded against the previous index
            int lastIndex = 0;
            for (int i = 0; i < numNonDefaultElements; i++) {
                int delta = Varint.readUnsignedVarInt(in);
                int index = lastIndex + delta;
                lastIndex = index;
                double value = laxPrecision ? in.readFloat() : in.readDouble();
                v.setQuick(index, value);
            }
        } else {
            for (int i = 0; i < numNonDefaultElements; i++) {
                int index = Varint.readUnsignedVarInt(in);
                double value = laxPrecision ? in.readFloat() : in.readDouble();
                v.setQuick(index, value);
            }
        }
    }
    if (named) {
        String name = in.readUTF();
        v = new NamedVector(v, name);
    }
    vector = v;
}
From source file: edu.rosehulman.TFPartialVectorReducer.java
License: Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }
    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }
    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file: hadoop.api.AggregateAndRecommendReducer.java
License: Apache License
private void reduceNonBooleanData(VarLongWritable userID, Iterable<PrefAndSimilarityColumnWritable> values,
        Context context) throws IOException, InterruptedException {
    /* each entry here is the sum in the numerator of the prediction formula */
    Vector numerators = null;
    /* each entry here is the sum in the denominator of the prediction formula */
    Vector denominators = null;
    /* each entry here is the number of similar items used in the prediction formula */
    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
        Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
        float prefValue = prefAndSimilarityColumn.getPrefValue();
        /* count the number of items used for each prediction */
        for (Element e : simColumn.nonZeroes()) {
            int itemIDIndex = e.index();
            numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
        }
        if (denominators == null) {
            denominators = simColumn.clone();
        } else {
            denominators.assign(simColumn, Functions.PLUS_ABS);
        }
        if (numerators == null) {
            numerators = simColumn.clone();
            if (prefValue != BOOLEAN_PREF_VALUE) {
                numerators.assign(Functions.MULT, prefValue);
            }
        } else {
            if (prefValue != BOOLEAN_PREF_VALUE) {
                simColumn.assign(Functions.MULT, prefValue);
            }
            numerators.assign(simColumn, Functions.PLUS);
        }
    }
    if (numerators == null) {
        return;
    }
    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (Element element : numerators.nonZeroes()) {
        int itemIDIndex = element.index();
        /* preference estimations must be based on at least 2 datapoints */
        if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
            /* compute normalized prediction */
            double prediction = element.get() / denominators.getQuick(itemIDIndex);
            recommendationVector.setQuick(itemIDIndex, prediction);
        }
    }
    writeRecommendedItems(userID, recommendationVector, context);
}
From source file: hk.newsRecommender.Classify.java
License: Open Source License
public static void genNaiveBayesModel(Configuration conf, int labelIndex, String trainFile, String trainSeqFile,
        boolean hasHeader) {
    CSVReader reader = null;
    try {
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(trainSeqFile)))
            fs.delete(new Path(trainSeqFile), true);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, new Path(trainSeqFile), Text.class,
                VectorWritable.class);
        FileSystem fsopen = FileSystem.get(conf);
        FSDataInputStream in = fsopen.open(new Path(trainFile));
        reader = new CSVReader(new InputStreamReader(in));
        String[] header = null;
        if (hasHeader)
            header = reader.readNext();
        String[] line = null;
        Long l = 0L;
        while ((line = reader.readNext()) != null) {
            if (labelIndex > line.length)
                break;
            l++;
            List<String> tmpList = Lists.newArrayList(line);
            String label = tmpList.get(labelIndex);
            if (!strLabelList.contains(label))
                strLabelList.add(label);
            // Text key = new Text("/" + label + "/" + l);
            Text key = new Text("/" + label + "/");
            tmpList.remove(labelIndex);
            VectorWritable vectorWritable = new VectorWritable();
            Vector vector = new RandomAccessSparseVector(tmpList.size(), tmpList.size());
            for (int i = 0; i < tmpList.size(); i++) {
                String tmpStr = tmpList.get(i);
                if (StringUtils.isNumeric(tmpStr))
                    vector.set(i, Double.parseDouble(tmpStr));
                else
                    vector.set(i, parseStrCell(tmpStr));
            }
            vectorWritable.set(vector);
            writer.append(key, vectorWritable);
        }
        writer.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}