Example usage for org.apache.mahout.common StringTuple getEntries

List of usage examples for org.apache.mahout.common StringTuple getEntries

Introduction

This page collects usage examples for the getEntries() method of org.apache.mahout.common.StringTuple.

Prototype

public List<String> getEntries() 

Source Link

Document

Fetch the list of entries from the tuple

Usage

From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java

License:Apache License

/**
 * Builds a term-frequency vector from the first token tuple of a key and writes it to the context.
 * <p>
 * Only the first {@link StringTuple} returned by {@code values} is consumed; any further values are
 * ignored. When {@code maxNGramSize >= 2} the tokens are passed through a {@link ShingleFilter} and
 * every term it produces that is present in {@code dictionary} is counted; otherwise each entry of the
 * tuple is looked up directly. A vector with no counted term is not written; the
 * {@code emptyVectorCount} counter is incremented instead.
 *
 * @param key     key of the document; also used as the vector name when {@code namedVector} is set
 * @param values  token tuples for the key
 * @param context task context that receives the vector or the counter increment
 * @throws IOException          propagated from the token stream or from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return;
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        try {
            // A token stream must be reset before the first incrementToken() call.
            sf.reset();
            // Attributes are only meaningful after incrementToken() has returned true.
            while (sf.incrementToken()) {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            }

            sf.end();
        } finally {
            // swallow close failures so they cannot mask an exception from the loop above
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:com.gsvic.csmr.io.InputData.java

License:Apache License

/**
 * Reads the tokenized document/* w ww  .ja v a 2  s  .c  o m*/
 * @param conf
 * @param input
 * @return Returns the document tokens (StringTuples) in a HashMap
 * @throws IOException 
 */
public HashMap<Text, StringTuple> readTokenizedDocument(Configuration conf, Path input) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader;
    reader = new SequenceFile.Reader(fs, input, conf);
    Text key = new Text();
    StringTuple value = new StringTuple();
    HashMap<Text, StringTuple> tokensMap = new HashMap<>();

    while (reader.next(key, value)) {
        tokensMap.put(new Text(key), new StringTuple(value.getEntries()));
    }

    return tokensMap;
}

From source file:edu.rosehulman.CollocMapper.java

License:Apache License

/**
 * Collocation finder: pass 1 map phase.
 * <p>
 * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of
 * the appropriate size which are then decomposed into head and tail subgrams which are collected in the
 * following manner
 * <pre>
 * k:head_key,           v:head_subgram
 * k:head_key,ngram_key, v:ngram
 * k:tail_key,           v:tail_subgram
 * k:tail_key,ngram_key, v:ngram
 * </pre>
 * <p>
 * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the
 * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
 * <p>
 * For example, given 'click and clack' and an ngram length of 3:
 * <pre>
 * k: head_'click and'                         v:head_'click and'
 * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack'
 * k: tail_'clack',                            v:tail_'clack'
 * k: tail_'clack',ngram_'click and clack'     v:ngram_'click and clack'
 * </pre>
 * <p>
 * When {@code emitUnigrams} is set, every non-empty non-shingle term is additionally written as a unigram.
 * Also counts the total number of shingles encountered and adds it to the {@code Count.NGRAM_TOTAL} counter.
 *
 * @param key     key of the input record; not read by this method
 * @param value   tuple whose entries form the token stream
 * @param context task context that receives the grams and the counter increment
 * @throws IOException if there's a problem with the ShingleFilter reading data
 * @throws InterruptedException declared by the overridden method
 * @throws IllegalStateException if writing a gram to the context fails; wraps the underlying
 *                               {@link IOException} or {@link InterruptedException}
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
        throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
            maxShingleSize);
    try {
        // reset inside the try so the filter is still closed when reset() fails
        sf.reset();

        int count = 0; // ngram count

        final int tokenCount = value.getEntries().size();
        OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
                tokenCount * (maxShingleSize - 1));
        OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(tokenCount);

        // Attributes are only meaningful after incrementToken() has returned true.
        while (sf.incrementToken()) {
            String term = sf.getAttribute(CharTermAttribute.class).toString();
            String type = sf.getAttribute(TypeAttribute.class).type();
            if ("shingle".equals(type)) {
                count++;
                ngrams.adjustOrPutValue(term, 1, 1);
            } else if (emitUnigrams && !term.isEmpty()) { // unigram
                unigrams.adjustOrPutValue(term, 1, 1);
            }
        }

        // one mutable key instance is reused for every record written below
        final GramKey gramKey = new GramKey();

        ngrams.forEachPair(new ObjectIntProcedure<String>() {
            public boolean apply(String term, int frequency) {
                // obtain components, the leading (n-1)gram and the trailing unigram.
                int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
                if (i != -1) { // bigram, trigram etc

                    try {
                        Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
                        Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
                        Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

                        gramKey.set(head, EMPTY);
                        context.write(gramKey, head);

                        gramKey.set(head, ngram.getBytes());
                        context.write(gramKey, ngram);

                        gramKey.set(tail, EMPTY);
                        context.write(gramKey, tail);

                        gramKey.set(tail, ngram.getBytes());
                        context.write(gramKey, ngram);

                    } catch (IOException e) {
                        // the callback signature cannot throw checked exceptions
                        throw new IllegalStateException(e);
                    } catch (InterruptedException e) {
                        // keep the interrupt visible to the caller although the exception is wrapped
                        Thread.currentThread().interrupt();
                        throw new IllegalStateException(e);
                    }
                }
                return true; // continue with the next pair
            }
        });

        unigrams.forEachPair(new ObjectIntProcedure<String>() {
            public boolean apply(String term, int frequency) {
                try {
                    Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
                    gramKey.set(unigram, EMPTY);
                    context.write(gramKey, unigram);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                } catch (InterruptedException e) {
                    // keep the interrupt visible to the caller although the exception is wrapped
                    Thread.currentThread().interrupt();
                    throw new IllegalStateException(e);
                }
                return true; // continue with the next pair
            }
        });

        context.getCounter(Count.NGRAM_TOTAL).increment(count);
        sf.end();
    } finally {
        Closeables.close(sf, true);
    }
}

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

/**
 * Converts the first token tuple of a key into a term-frequency vector and emits it.
 * <p>
 * Values after the first one are not read. With {@code maxNGramSize} below 2 the tuple entries are
 * looked up in {@code dictionary} one by one; otherwise they are fed through a {@link ShingleFilter}
 * first and the terms it yields are looked up. If nothing was counted, no vector is written and the
 * {@code emptyVectorCount} counter is incremented instead.
 *
 * @param key     key of the document; doubles as the vector name when {@code namedVector} is set
 * @param values  token tuples for the key
 * @param context task context that receives the vector or the counter increment
 * @throws IOException          propagated from the token stream or from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> tuples = values.iterator();
    if (!tuples.hasNext()) {
        return;
    }
    StringTuple tokens = tuples.next();

    // the tuple length is only a guess at the initial size
    Vector termFrequencies = new RandomAccessSparseVector(dimension, tokens.length());

    if (maxNGramSize < 2) {
        // unigrams: look up every entry as-is
        for (String token : tokens.getEntries()) {
            if (token.isEmpty() || !dictionary.containsKey(token)) {
                continue;
            }
            int id = dictionary.get(token);
            termFrequencies.setQuick(id, termFrequencies.getQuick(id) + 1);
        }
    } else {
        // ngrams: look up every term produced by the shingle filter
        ShingleFilter shingles = new ShingleFilter(
                new IteratorTokenStream(tokens.getEntries().iterator()), maxNGramSize);
        shingles.reset();
        try {
            do {
                String gram = shingles.getAttribute(CharTermAttribute.class).toString();
                if (!gram.isEmpty() && dictionary.containsKey(gram)) {
                    int id = dictionary.get(gram);
                    termFrequencies.setQuick(id, termFrequencies.getQuick(id) + 1);
                }
            } while (shingles.incrementToken());

            shingles.end();
        } finally {
            Closeables.close(shingles, true);
        }
    }

    if (sequentialAccess) {
        termFrequencies = new SequentialAccessSparseVector(termFrequencies);
    }
    if (namedVector) {
        termFrequencies = new NamedVector(termFrequencies, key.toString());
    }

    // a vector without non-default entries (nothing in the dictionary) is not worth sending to disk
    if (termFrequencies.getNumNondefaultElements() <= 0) {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
        return;
    }
    context.write(key, new VectorWritable(termFrequencies));
}