List of usage examples for org.apache.mahout.common.StringTuple.getEntries()
public List<String> getEntries()
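For orientation before the full Hadoop examples below, a minimal standalone sketch of the call. The token values are invented; note that in Mahout's implementation getEntries() returns an unmodifiable view, so copy the list before mutating it.

import java.util.List;
import org.apache.mahout.common.StringTuple;

public class GetEntriesSketch {
    public static void main(String[] args) {
        // Build a tuple of tokens (invented values) and read them back.
        StringTuple tuple = new StringTuple();
        tuple.add("click");
        tuple.add("and");
        tuple.add("clack");
        List<String> tokens = tuple.getEntries(); // ["click", "and", "clack"]
        System.out.println(tokens);
    }
}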
From source file: com.elex.dmp.vectorizer.TFPartialVectorReducer.java
License: Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset(); // required in Lucene 4+ before the first incrementToken()
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.closeQuietly(sf);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }
    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }
    // If the vector has no non-zero entries (nothing matched the dictionary),
    // don't waste space writing it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
From source file: com.gsvic.csmr.io.InputData.java
License: Apache License
/**
 * Reads a tokenized document.
 *
 * @param conf  the Hadoop configuration
 * @param input the path to the sequence file of tokens
 * @return the document tokens (StringTuples) in a HashMap
 * @throws IOException
 */
public HashMap<Text, StringTuple> readTokenizedDocument(Configuration conf, Path input) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, conf);
    Text key = new Text();
    StringTuple value = new StringTuple();
    HashMap<Text, StringTuple> tokensMap = new HashMap<>();
    try {
        while (reader.next(key, value)) {
            // Copy key and entries: the reader reuses the same Writable instances.
            tokensMap.put(new Text(key), new StringTuple(value.getEntries()));
        }
    } finally {
        reader.close();
    }
    return tokensMap;
}
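A possible call site for the reader above, as a hedged sketch: the input path and the no-arg construction of InputData are assumptions for illustration, not taken from the source.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.StringTuple;
import com.gsvic.csmr.io.InputData;

public class ReadTokensSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical output of a Mahout tokenization job.
        Path input = new Path("tokenized-documents/part-m-00000");
        HashMap<Text, StringTuple> tokens = new InputData().readTokenizedDocument(conf, input);
        for (Map.Entry<Text, StringTuple> e : tokens.entrySet()) {
            System.out.println(e.getKey() + " -> " + e.getValue().getEntries());
        }
    }
}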
From source file: edu.rosehulman.CollocMapper.java
License: Apache License
/**
 * Collocation finder: pass 1 map phase.
 * <p/>
 * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers
 * ngrams of the appropriate size, which are then decomposed into head and tail subgrams and collected
 * in the following manner:
 * <p/>
 * <pre>
 * k:head_key,           v:head_subgram
 * k:head_key,ngram_key, v:ngram
 * k:tail_key,           v:tail_subgram
 * k:tail_key,ngram_key, v:ngram
 * </pre>
 * <p/>
 * The 'head' or 'tail' prefix specifies whether the subgram in question is the head or tail of the
 * ngram. In this implementation the head of the ngram is an (n-1)gram and the tail is a (1)gram.
 * <p/>
 * For example, given 'click and clack' and an ngram length of 3:
 * <pre>
 * k: head_'click and',                          v: head_'click and'
 * k: head_'click and',ngram_'click and clack',  v: ngram_'click and clack'
 * k: tail_'clack',                              v: tail_'clack'
 * k: tail_'clack',ngram_'click and clack',      v: ngram_'click and clack'
 * </pre>
 * <p/>
 * Also counts the total number of ngrams encountered and adds it to the counter
 * CollocDriver.Count.NGRAM_TOTAL.
 *
 * @throws IOException if there's a problem with the ShingleFilter reading data or the collector
 *                     collecting output.
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
        throws IOException, InterruptedException {
    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
            maxShingleSize);
    sf.reset();
    try {
        int count = 0; // ngram count
        OpenObjectIntHashMap<String> ngrams =
                new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1));
        OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());
        do {
            String term = sf.getAttribute(CharTermAttribute.class).toString();
            String type = sf.getAttribute(TypeAttribute.class).type();
            if ("shingle".equals(type)) {
                count++;
                ngrams.adjustOrPutValue(term, 1, 1);
            } else if (emitUnigrams && !term.isEmpty()) { // unigram
                unigrams.adjustOrPutValue(term, 1, 1);
            }
        } while (sf.incrementToken());

        final GramKey gramKey = new GramKey();
        ngrams.forEachPair(new ObjectIntProcedure<String>() {
            public boolean apply(String term, int frequency) {
                // Obtain components: the leading (n-1)gram and the trailing unigram.
                int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace-delimited languages.
                if (i != -1) { // bigram, trigram, etc.
                    try {
                        Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
                        Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
                        Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);
                        gramKey.set(head, EMPTY);
                        context.write(gramKey, head);
                        gramKey.set(head, ngram.getBytes());
                        context.write(gramKey, ngram);
                        gramKey.set(tail, EMPTY);
                        context.write(gramKey, tail);
                        gramKey.set(tail, ngram.getBytes());
                        context.write(gramKey, ngram);
                    } catch (IOException e) {
                        throw new IllegalStateException(e);
                    } catch (InterruptedException e) {
                        throw new IllegalStateException(e);
                    }
                }
                return true;
            }
        });
        unigrams.forEachPair(new ObjectIntProcedure<String>() {
            public boolean apply(String term, int frequency) {
                try {
                    Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
                    gramKey.set(unigram, EMPTY);
                    context.write(gramKey, unigram);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                } catch (InterruptedException e) {
                    throw new IllegalStateException(e);
                }
                return true;
            }
        });
        context.getCounter(Count.NGRAM_TOTAL).increment(count);
        sf.end();
    } finally {
        Closeables.close(sf, true);
    }
}
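The head/tail decomposition inside the forEachPair callback is plain string manipulation; a standalone illustration using the 'click and clack' example from the Javadoc above:

public class HeadTailSketch {
    public static void main(String[] args) {
        // Split a shingle at its last token boundary, as the mapper does.
        String term = "click and clack";
        int i = term.lastIndexOf(' ');
        String head = term.substring(0, i);  // "click and" -> the (n-1)gram HEAD
        String tail = term.substring(i + 1); // "clack"     -> the unigram TAIL
        System.out.println(head + " | " + tail);
    }
}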
From source file: edu.rosehulman.TFPartialVectorReducer.java
License: Apache License
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();
    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        sf.reset();
        try {
            do {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            } while (sf.incrementToken());
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }
    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }
    // If the vector has no non-zero entries (nothing matched the dictionary),
    // don't waste space writing it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}
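Both reducers above share the same core pattern: wrap getEntries() in Mahout's IteratorTokenStream and run it through Lucene's ShingleFilter. A minimal self-contained sketch of that pattern, assuming a Mahout 0.8-era layout where IteratorTokenStream lives in org.apache.mahout.common.lucene and a compatible Lucene version is on the classpath:

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.lucene.IteratorTokenStream;

public class ShingleSketch {
    public static void main(String[] args) throws IOException {
        // Invented token values; a real job would receive these from a tokenizer.
        StringTuple value = new StringTuple(Arrays.asList("click", "and", "clack"));
        ShingleFilter sf = new ShingleFilter(
                new IteratorTokenStream(value.getEntries().iterator()), 2);
        sf.reset(); // required before the first incrementToken()
        while (sf.incrementToken()) {
            // ShingleFilter emits unigrams and bigrams by default:
            // "click", "click and", "and", "and clack", "clack"
            System.out.println(sf.getAttribute(CharTermAttribute.class).toString());
        }
        sf.end();
        sf.close();
    }
}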