Example usage for org.apache.mahout.math RandomAccessSparseVector RandomAccessSparseVector

Introduction

This page collects example usages of the org.apache.mahout.math RandomAccessSparseVector constructor, drawn from open-source projects.

Prototype

public RandomAccessSparseVector(int cardinality, int initialCapacity)
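
Before the project examples, a minimal self-contained sketch of the constructor may help; the indices and values below are illustrative:

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class Example {
    public static void main(String[] args) {
        // cardinality is the logical dimensionality; the second argument is only a
        // sizing hint for the backing hash map, which grows on demand
        Vector v = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
        v.set(42, 1.0);
        v.setQuick(7, v.getQuick(7) + 1.0);   // setQuick/getQuick skip bounds checks
        System.out.println(v.getNumNondefaultElements()); // prints 2
    }
}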

Usage

From source file:cn.edu.bjtu.cit.recommender.Recommender.java

License:Apache License

@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println();
        System.err.println("Usage: " + this.getClass().getName()
                + " [generic options] input output [profiling] [estimation] [clustersize]");
        System.err.println();
        printUsage();
        GenericOptionsParser.printGenericCommandUsage(System.err);

        return 1;
    }
    OptionParser parser = new OptionParser(args);

    Pipeline pipeline = new MRPipeline(Recommender.class, getConf());

    if (parser.hasOption(CLUSTER_SIZE)) {
        pipeline.getConfiguration().setInt(ClusterOracle.CLUSTER_SIZE,
                Integer.parseInt(parser.getOption(CLUSTER_SIZE).getValue()));
    }

    if (parser.hasOption(PROFILING)) {
        pipeline.getConfiguration().setBoolean(Profiler.IS_PROFILE, true);
        this.profileFilePath = parser.getOption(PROFILING).getValue();

    }

    if (parser.hasOption(ESTIMATION)) {
        estFile = parser.getOption(ESTIMATION).getValue();
        est = new Estimator(estFile, clusterSize);
    }

    if (parser.hasOption(OPT_REDUCE)) {
        pipeline.getConfiguration().setBoolean(OPT_REDUCE, true);
    }

    if (parser.hasOption(OPT_MSCR)) {
        pipeline.getConfiguration().setBoolean(OPT_MSCR, true);
    }

    if (parser.hasOption(ACTIVE_THRESHOLD)) {
        threshold = Integer.parseInt(parser.getOption("at").getValue());
    }

    if (parser.hasOption(TOP)) {
        top = Integer.parseInt(parser.getOption("top").getValue());
    }

    profiler = new Profiler(pipeline);
    /*
     * input node
     */
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    if (profiler.isProfiling() && lines.getSize() > 10 * 1024 * 1024) {
        lines = lines.sample(0.1);
    }

    /*
     * S0 + GBK
     */
    PGroupedTable<Long, Long> userWithPrefs = lines.parallelDo(new MapFn<String, Pair<Long, Long>>() {

        @Override
        public Pair<Long, Long> map(String input) {
            String[] split = input.split(Estimator.DELM);
            long userID = Long.parseLong(split[0]);
            long itemID = Long.parseLong(split[1]);
            return Pair.of(userID, itemID);
        }

        @Override
        public float scaleFactor() {
            return est.getScaleFactor("S0").sizeFactor;
        }

        @Override
        public float scaleFactorByRecord() {
            return est.getScaleFactor("S0").recsFactor;
        }
    }, Writables.tableOf(Writables.longs(), Writables.longs())).groupByKey(est.getClusterSize());

    /*
     * S1
     */
    PTable<Long, Vector> userVector = userWithPrefs
            .parallelDo(new MapFn<Pair<Long, Iterable<Long>>, Pair<Long, Vector>>() {
                @Override
                public Pair<Long, Vector> map(Pair<Long, Iterable<Long>> input) {
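                    // Integer.MAX_VALUE cardinality with a small initial capacity: the sparse
                    // map only materializes the items this user actually touched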
                    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (long itemPref : input.second()) {
                        userVector.set((int) itemPref, 1.0f);
                    }
                    return Pair.of(input.first(), userVector);
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S1").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S1").recsFactor;
                }
            }, Writables.tableOf(Writables.longs(), Writables.vectors()));

    userVector = profiler.profile("S0-S1", pipeline, userVector, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S2
     */
    PTable<Long, Vector> filteredUserVector = userVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Long, Vector>>() {

                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Long, Vector>> emitter) {
                    if (input.second().getNumNondefaultElements() > threshold) {
                        emitter.emit(input);
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S2").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S2").recsFactor;
                }

            }, Writables.tableOf(Writables.longs(), Writables.vectors()));

    filteredUserVector = profiler.profile("S2", pipeline, filteredUserVector, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S3 + GBK
     */
    PGroupedTable<Integer, Integer> coOccurencePairs = filteredUserVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, Integer>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, Integer>> emitter) {
                    Iterator<Vector.Element> it = input.second().iterateNonZero();
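                    // emit every ordered pair of items in the user's vector, including
                    // (i, i) self-pairs, which become the diagonal co-occurrence counts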
                    while (it.hasNext()) {
                        int index1 = it.next().index();
                        Iterator<Vector.Element> it2 = input.second().iterateNonZero();
                        while (it2.hasNext()) {
                            int index2 = it2.next().index();
                            emitter.emit(Pair.of(index1, index2));
                        }
                    }
                }

                @Override
                public float scaleFactor() {
                    float size = est.getScaleFactor("S3").sizeFactor;
                    return size;
                }

                @Override
                public float scaleFactorByRecord() {
                    float recs = est.getScaleFactor("S3").recsFactor;
                    return recs;
                }
            }, Writables.tableOf(Writables.ints(), Writables.ints())).groupByKey(est.getClusterSize());

    /*
     * S4
     */
    PTable<Integer, Vector> coOccurenceVector = coOccurencePairs
            .parallelDo(new MapFn<Pair<Integer, Iterable<Integer>>, Pair<Integer, Vector>>() {
                @Override
                public Pair<Integer, Vector> map(Pair<Integer, Iterable<Integer>> input) {
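                    // accumulate counts: each occurrence of itemIndex2 under this key
                    // increments one cell of the co-occurrence row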
                    Vector cooccurrenceRow = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (int itemIndex2 : input.second()) {
                        cooccurrenceRow.set(itemIndex2, cooccurrenceRow.get(itemIndex2) + 1.0);
                    }
                    return Pair.of(input.first(), cooccurrenceRow);
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S4").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S4").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), Writables.vectors()));

    coOccurenceVector = profiler.profile("S3-S4", pipeline, coOccurenceVector, ProfileConverter.int_vector(),
            Writables.tableOf(Writables.ints(), Writables.vectors()));

    /*
     * S5 Wrapping co-occurrence columns
     */
    PTable<Integer, VectorOrPref> wrappedCooccurrence = coOccurenceVector
            .parallelDo(new MapFn<Pair<Integer, Vector>, Pair<Integer, VectorOrPref>>() {

                @Override
                public Pair<Integer, VectorOrPref> map(Pair<Integer, Vector> input) {
                    return Pair.of(input.first(), new VectorOrPref(input.second()));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S5").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S5").recsFactor;
                }

            }, Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    wrappedCooccurrence = profiler.profile("S5", pipeline, wrappedCooccurrence, ProfileConverter.int_vopv(),
            Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    /*
     * S6 Splitting user vectors
     */
    PTable<Integer, VectorOrPref> userVectorSplit = filteredUserVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, VectorOrPref>>() {

                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, VectorOrPref>> emitter) {
                    long userID = input.first();
                    Vector userVector = input.second();
                    Iterator<Vector.Element> it = userVector.iterateNonZero();
                    while (it.hasNext()) {
                        Vector.Element e = it.next();
                        int itemIndex = e.index();
                        float preferenceValue = (float) e.get();
                        emitter.emit(Pair.of(itemIndex, new VectorOrPref(userID, preferenceValue)));
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S6").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S6").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    userVectorSplit = profiler.profile("S6", pipeline, userVectorSplit, ProfileConverter.int_vopp(),
            Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    /*
     * S7 Combine VectorOrPrefs
     */
    PTable<Integer, VectorAndPrefs> combinedVectorOrPref = wrappedCooccurrence.union(userVectorSplit)
            .groupByKey(est.getClusterSize())
            .parallelDo(new DoFn<Pair<Integer, Iterable<VectorOrPref>>, Pair<Integer, VectorAndPrefs>>() {

                @Override
                public void process(Pair<Integer, Iterable<VectorOrPref>> input,
                        Emitter<Pair<Integer, VectorAndPrefs>> emitter) {
                    Vector vector = null;
                    List<Long> userIDs = Lists.newArrayList();
                    List<Float> values = Lists.newArrayList();
                    for (VectorOrPref vop : input.second()) {
                        if (vector == null) {
                            vector = vop.getVector();
                        }
                        long userID = vop.getUserID();
                        if (userID != Long.MIN_VALUE) {
                            userIDs.add(vop.getUserID());
                        }
                        float value = vop.getValue();
                        if (!Float.isNaN(value)) {
                            values.add(vop.getValue());
                        }
                    }
                    emitter.emit(Pair.of(input.first(), new VectorAndPrefs(vector, userIDs, values)));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S7").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S7").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorAndPrefs.vectorAndPrefs()));

    combinedVectorOrPref = profiler.profile("S5+S6-S7", pipeline, combinedVectorOrPref,
            ProfileConverter.int_vap(), Writables.tableOf(Writables.ints(), VectorAndPrefs.vectorAndPrefs()));
    /*
     * S8 Computing partial recommendation vectors
     */
    PTable<Long, Vector> partialMultiply = combinedVectorOrPref
            .parallelDo(new DoFn<Pair<Integer, VectorAndPrefs>, Pair<Long, Vector>>() {
                @Override
                public void process(Pair<Integer, VectorAndPrefs> input, Emitter<Pair<Long, Vector>> emitter) {
                    Vector cooccurrenceColumn = input.second().getVector();
                    List<Long> userIDs = input.second().getUserIDs();
                    List<Float> prefValues = input.second().getValues();
                    for (int i = 0; i < userIDs.size(); i++) {
                        long userID = userIDs.get(i);
                        if (userID != Long.MIN_VALUE) {
                            float prefValue = prefValues.get(i);
                            Vector partialProduct = cooccurrenceColumn.times(prefValue);
                            emitter.emit(Pair.of(userID, partialProduct));
                        }
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S8").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S8").recsFactor;
                }

            }, Writables.tableOf(Writables.longs(), Writables.vectors())).groupByKey(est.getClusterSize())
            .combineValues(new CombineFn<Long, Vector>() {

                @Override
                public void process(Pair<Long, Iterable<Vector>> input, Emitter<Pair<Long, Vector>> emitter) {
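                    // vector addition is associative and commutative, so partial
                    // products can safely be pre-summed in this combiner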
                    Vector partial = null;
                    for (Vector vector : input.second()) {
                        partial = partial == null ? vector : partial.plus(vector);
                    }
                    emitter.emit(Pair.of(input.first(), partial));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("combine").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("combine").recsFactor;
                }
            });

    partialMultiply = profiler.profile("S8-combine", pipeline, partialMultiply, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S9 Producing recommendations from vectors
     */
    PTable<Long, RecommendedItems> recommendedItems = partialMultiply
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Long, RecommendedItems>>() {

                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Long, RecommendedItems>> emitter) {
                    Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(11,
                            Collections.reverseOrder(BY_PREFERENCE_VALUE));
                    Iterator<Vector.Element> recommendationVectorIterator = input.second().iterateNonZero();
                    while (recommendationVectorIterator.hasNext()) {
                        Vector.Element element = recommendationVectorIterator.next();
                        int index = element.index();
                        float value = (float) element.get();
                        if (topItems.size() < top) {
                            topItems.add(new GenericRecommendedItem(index, value));
                        } else if (value > topItems.peek().getValue()) {
                            topItems.add(new GenericRecommendedItem(index, value));
                            topItems.poll();
                        }
                    }
                    List<RecommendedItem> recommendations = new ArrayList<RecommendedItem>(topItems.size());
                    recommendations.addAll(topItems);
                    Collections.sort(recommendations, BY_PREFERENCE_VALUE);
                    emitter.emit(Pair.of(input.first(), new RecommendedItems(recommendations)));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S9").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S9").recsFactor;
                }

            }, Writables.tableOf(Writables.longs(), RecommendedItems.recommendedItems()));

    recommendedItems = profiler.profile("S9", pipeline, recommendedItems, ProfileConverter.long_ri(),
            Writables.tableOf(Writables.longs(), RecommendedItems.recommendedItems()));

    /*
     * Profiling
     */
    if (profiler.isProfiling()) {
        profiler.writeResultToFile(profileFilePath);
        profiler.cleanup(pipeline.getConfiguration());
        return 0;
    }
    /*
     * asText
     */
    pipeline.writeTextFile(recommendedItems, args[1]);
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
}

From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java

License:Apache License

@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset(); // the TokenStream contract requires reset() before incrementToken()
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());

            sf.end();
        } finally {
            Closeables.closeQuietly(sf);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
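    // once building is done, optionally re-pack into a sequential layout,
    // which downstream consumers can iterate more cheaply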
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:com.mozilla.grouperfish.pig.storage.DocumentVectorStorage.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple tuple) throws IOException {
    outputKey.set((String) tuple.get(0));
    Tuple vectorTuple = (Tuple) tuple.get(1);
    Vector vector = new NamedVector(new RandomAccessSparseVector(dimensions, vectorTuple.size()),
            outputKey.toString());
    for (int i = 0; i < vectorTuple.size(); i++) {
        Object o = vectorTuple.get(i);
        switch (vectorTuple.getType(i)) {
        case DataType.INTEGER:
            // If this is just an integer then we just want to set the index to 1.0
            vector.set((Integer) o, 1.0);
            break;
        case DataType.TUPLE:
            // If this is a tuple then we want to set the index and the weight
            Tuple subt = (Tuple) o;
            vector.set((Integer) subt.get(0), (Double) subt.get(1));
            break;
        default:
            throw new RuntimeException("Unexpected tuple form");
        }

    }
    outputValue.set(vector);
    try {
        writer.write(outputKey, outputValue);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while writing", e);
    }
}

From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java

License:Apache License

@Override
public void putNext(Tuple t) throws IOException {
    IntWritable outputKey = new IntWritable();
    VectorWritable outputValue = new VectorWritable();
    outputKey.set((Integer) t.get(0));
    Tuple currRow = (Tuple) t.get(1);
    Vector currRowVector;
    if (dimensions == 0) {
        throw new IllegalArgumentException("Trying to create 0 dimension vector");
    }
    if (STORE_AS_DENSE) {
        currRowVector = new NamedVector(new DenseVector(dimensions), outputKey.toString());
    } else if (STORE_AS_SEQUENTIAL) {
        currRowVector = new NamedVector(new SequentialAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    } else {
        currRowVector = new NamedVector(new RandomAccessSparseVector(dimensions, currRow.size()),
                outputKey.toString());
    }
    for (int ii = 0; ii < currRow.size(); ii++) {
        Object o = currRow.get(ii);
        switch (currRow.getType(ii)) {
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
            // cast through Number: a direct (Double) cast throws ClassCastException
            // for Integer, Long, and Float cells
            currRowVector.set(ii, ((Number) o).doubleValue());
            break;
        case DataType.TUPLE:
            // If this is a tuple then we want to set column and element
            Tuple subt = (Tuple) o;
            currRowVector.set((Integer) subt.get(0), (Double) subt.get(1));
            break;
        default:
            throw new RuntimeException("Unexpected tuple form");
        }
    }
    outputValue.set(currRowVector);
    try {
        writer.write(outputKey, outputValue);
    } catch (InterruptedException e) {
        LOG.error("Interrupted while writing", e);
    }
}
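
The three-way branch above captures the usual trade-off between Mahout's vector implementations. A minimal, self-contained sketch of the same decision (the mostlyDense and buildThenIterate flags are illustrative, not part of the original code):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;

public class VectorChoice {
    public static Vector pick(int dimensions, int expectedNonZeros,
            boolean mostlyDense, boolean buildThenIterate) {
        if (mostlyDense) {
            return new DenseVector(dimensions);            // O(1) access, O(dimensions) memory
        }
        if (buildThenIterate) {
            // best when the vector is written once and then iterated in index order
            return new SequentialAccessSparseVector(dimensions, expectedNonZeros);
        }
        // best for incremental random-access updates while building
        return new RandomAccessSparseVector(dimensions, expectedNonZeros);
    }
}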

From source file:com.netease.news.classifier.naivebayes.WeightsMapper.java

License:Apache License

@Override
protected void map(IntWritable index, VectorWritable value, Context ctx)
        throws IOException, InterruptedException {
    Vector instance = value.get();
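    // lazily size the per-feature accumulator from the first instance seen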
    if (weightsPerFeature == null) {
        weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements());
    }

    int label = index.get();
    weightsPerFeature.assign(instance, Functions.PLUS);
    weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
}

From source file:com.pocketx.gravity.recommender.cf.similarity.mapreduce.ToItemVectorsMapper.java

License:Apache License

@Override
protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
        throws IOException, InterruptedException {
    Vector userRatings = vectorWritable.get();

    int numElementsBeforeSampling = userRatings.getNumNondefaultElements();
    userRatings = Vectors.maybeSample(userRatings, sampleSize);
    int numElementsAfterSampling = userRatings.getNumNondefaultElements();

    int column = TasteHadoopUtils.idToIndex(rowIndex.get());
    VectorWritable itemVector = new VectorWritable(new RandomAccessSparseVector(Integer.MAX_VALUE, 1));
    itemVector.setWritesLaxPrecision(true);
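    // the writable wraps a single-element vector (initial capacity 1) that is
    // re-pointed at the user's column for every rating emitted below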
    Iterator<Vector.Element> iterator = userRatings.nonZeroes().iterator();
    while (iterator.hasNext()) {
        Vector.Element elem = iterator.next();
        itemVector.get().setQuick(column, elem.get());
        ctx.write(new IntWritable(elem.index()), itemVector);
    }

    ctx.getCounter(Elements.USER_RATINGS_USED).increment(numElementsAfterSampling);
    ctx.getCounter(Elements.USER_RATINGS_NEGLECTED)
            .increment(numElementsBeforeSampling - numElementsAfterSampling);
}

From source file:edu.rosehulman.mahout.math.VectorWritable.java

License:Apache License

@Override
public void readFields(DataInput in) throws IOException {
    int flags = in.readByte();
    //Preconditions.checkArgument(flags >> NUM_FLAGS == 0, "Unknown flags set: %d", Integer.toString(flags, 2));
    boolean dense = (flags & FLAG_DENSE) != 0;
    boolean sequential = (flags & FLAG_SEQUENTIAL) != 0;
    boolean named = (flags & FLAG_NAMED) != 0;
    boolean laxPrecision = (flags & FLAG_LAX_PRECISION) != 0;

    int size = Varint.readUnsignedVarInt(in);
    Vector v;
    if (dense) {
        double[] values = new double[size];
        for (int i = 0; i < size; i++) {
            values[i] = laxPrecision ? in.readFloat() : in.readDouble();
        }
        v = new DenseVector(values);
    } else {
        int numNonDefaultElements = Varint.readUnsignedVarInt(in);
        v = sequential ? new SequentialAccessSparseVector(size, numNonDefaultElements)
                : new RandomAccessSparseVector(size, numNonDefaultElements);
        if (sequential) {
            int lastIndex = 0;
            for (int i = 0; i < numNonDefaultElements; i++) {
                int delta = Varint.readUnsignedVarInt(in);
                int index = lastIndex + delta;
                lastIndex = index;
                double value = laxPrecision ? in.readFloat() : in.readDouble();
                v.setQuick(index, value);
            }
        } else {
            for (int i = 0; i < numNonDefaultElements; i++) {
                int index = Varint.readUnsignedVarInt(in);
                double value = laxPrecision ? in.readFloat() : in.readDouble();
                v.setQuick(index, value);
            }
        }
    }
    if (named) {
        String name = in.readUTF();
        v = new NamedVector(v, name);
    }
    vector = v;
}
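
readFields above fully determines the wire format: a flag byte, a varint size, and for sparse vectors a varint element count followed by (index, value) pairs, with indices delta-encoded in the sequential case. A sketch of the matching write side for that sequential case, reconstructed from the reader alone (assume the same FLAG_* constants and Varint helper; this is illustrative, not the canonical Mahout writer):

private static void writeSequential(DataOutput out, Vector v, boolean laxPrecision) throws IOException {
    int flags = FLAG_SEQUENTIAL | (laxPrecision ? FLAG_LAX_PRECISION : 0);
    out.writeByte(flags);
    Varint.writeUnsignedVarInt(v.size(), out);
    Varint.writeUnsignedVarInt(v.getNumNondefaultElements(), out);
    int lastIndex = 0;
    for (Vector.Element e : v.nonZeroes()) {                    // index order for sequential vectors
        Varint.writeUnsignedVarInt(e.index() - lastIndex, out); // delta-encode indices
        lastIndex = e.index();
        if (laxPrecision) {
            out.writeFloat((float) e.get());
        } else {
            out.writeDouble(e.get());
        }
    }
}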

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());

            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:hadoop.api.AggregateAndRecommendReducer.java

License:Apache License

private void reduceNonBooleanData(VarLongWritable userID, Iterable<PrefAndSimilarityColumnWritable> values,
        Context context) throws IOException, InterruptedException {
    /* each entry here is the sum in the numerator of the prediction formula */
    Vector numerators = null;
    /* each entry here is the sum in the denominator of the prediction formula */
    Vector denominators = null;
    /* each entry here is the number of similar items used in the prediction formula */
    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
        Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
        float prefValue = prefAndSimilarityColumn.getPrefValue();
        /* count the number of items used for each prediction */
        for (Element e : simColumn.nonZeroes()) {
            int itemIDIndex = e.index();
            numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
        }

        if (denominators == null) {
            denominators = simColumn.clone();
        } else {
            denominators.assign(simColumn, Functions.PLUS_ABS);
        }

        if (numerators == null) {
            numerators = simColumn.clone();
            if (prefValue != BOOLEAN_PREF_VALUE) {
                numerators.assign(Functions.MULT, prefValue);
            }
        } else {
            if (prefValue != BOOLEAN_PREF_VALUE) {
                simColumn.assign(Functions.MULT, prefValue);
            }
            numerators.assign(simColumn, Functions.PLUS);
        }

    }

    if (numerators == null) {
        return;
    }

    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (Element element : numerators.nonZeroes()) {
        int itemIDIndex = element.index();
        /* preference estimations must be based on at least 2 datapoints */
        if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
            /* compute normalized prediction */
            double prediction = element.get() / denominators.getQuick(itemIDIndex);
            recommendationVector.setQuick(itemIDIndex, prediction);
        }
    }
    writeRecommendedItems(userID, recommendationVector, context);
}
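
For orientation: the code above implements the standard weighted-average prediction of item-based collaborative filtering, prediction(u, i) = sum_j sim(i, j) * pref(u, j) / sum_j |sim(i, j)|, where numerators and denominators accumulate the two sums and numberOfSimilarItemsUsed gates out estimates supported by fewer than two data points.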

From source file:hk.newsRecommender.Classify.java

License:Open Source License

public static void genNaiveBayesModel(Configuration conf, int labelIndex, String trainFile, String trainSeqFile,
        boolean hasHeader) {
    CSVReader reader = null;
    try {
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(trainSeqFile)))
            fs.delete(new Path(trainSeqFile), true);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, new Path(trainSeqFile), Text.class,
                VectorWritable.class);
        FileSystem fsopen = FileSystem.get(conf);
        FSDataInputStream in = fsopen.open(new Path(trainFile));
        reader = new CSVReader(new InputStreamReader(in));

        String[] header = null;
        if (hasHeader)
            header = reader.readNext();
        String[] line = null;
        Long l = 0L;
        while ((line = reader.readNext()) != null) {
            if (labelIndex >= line.length)
                break;
            l++;
            List<String> tmpList = Lists.newArrayList(line);
            String label = tmpList.get(labelIndex);
            if (!strLabelList.contains(label))
                strLabelList.add(label);
            //            Text key = new Text("/" + label + "/" + l);
            Text key = new Text("/" + label + "/");
            tmpList.remove(labelIndex);

            VectorWritable vectorWritable = new VectorWritable();
            Vector vector = new RandomAccessSparseVector(tmpList.size(), tmpList.size()); // cardinality and capacity both equal the number of feature columns

            for (int i = 0; i < tmpList.size(); i++) {
                String tmpStr = tmpList.get(i);
                if (StringUtils.isNumeric(tmpStr))
                    vector.set(i, Double.parseDouble(tmpStr));
                else
                    vector.set(i, parseStrCell(tmpStr));
            }
            vectorWritable.set(vector);
            writer.append(key, vectorWritable);
        }
        writer.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}