Example usage for org.apache.lucene.analysis.shingle ShingleFilter close

List of usage examples for org.apache.lucene.analysis.shingle ShingleFilter close

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.shingle ShingleFilter close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

NOTE: The default implementation chains the call to the input TokenStream, so be sure to call super.close() when overriding this method.

Usage

From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.java

License:Apache License

/**
 * Collocation finder: pass 1 map phase.
 * <p/>/* w  w w.  ja v a2  s.  c om*/
 * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of
 * the appropriate size which are then decomposed into head and tail subgrams which are collected in the
 * following manner
 * 
 * <pre>
 * k:head_key,           v:head_subgram
 * k:head_key,ngram_key, v:ngram
 * k:tail_key,           v:tail_subgram
 * k:tail_key,ngram_key, v:ngram
 * </pre>
 * 
 * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the
 * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
 * <p/>
 * For example, given 'click and clack' and an ngram length of 3: 
 * <pre>
 * k: head_'click and'                         v:head_'click and'
 * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack'
 * k: tail_'clack',                            v:tail_'clack'
 * k: tail_'clack',ngram_'click and clack'     v:ngram_'click and clack'
 * </pre>
 * 
 * Also counts the total number of ngrams encountered and adds it to the counter
 * CollocDriver.Count.NGRAM_TOTAL
 * </p>
 * 
 * @throws IOException
 *           if there's a problem with the ShingleFilter reading data or the collector collecting output.
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
        throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
            maxShingleSize);
    int count = 0; // ngram count

    OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
            value.getEntries().size() * (maxShingleSize - 1));
    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

    do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        String type = (sf.getAttribute(TypeAttribute.class)).type();
        if ("shingle".equals(type)) {
            count++;
            ngrams.adjustOrPutValue(term, 1, 1);
        } else if (emitUnigrams && term.length() > 0) { // unigram
            unigrams.adjustOrPutValue(term, 1, 1);
        }
    } while (sf.incrementToken());

    try {
        final GramKey gramKey = new GramKey();

        ngrams.forEachPair(new ObjectIntProcedure<String>() {
            @Override
            public boolean apply(String term, int frequency) {
                // obtain components, the leading (n-1)gram and the trailing unigram.
                int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
                if (i != -1) { // bigram, trigram etc

                    try {
                        Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
                        Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
                        Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

                        gramKey.set(head, EMPTY);
                        context.write(gramKey, head);

                        gramKey.set(head, ngram.getBytes());
                        context.write(gramKey, ngram);

                        gramKey.set(tail, EMPTY);
                        context.write(gramKey, tail);

                        gramKey.set(tail, ngram.getBytes());
                        context.write(gramKey, ngram);

                    } catch (IOException e) {
                        throw new IllegalStateException(e);
                    } catch (InterruptedException e) {
                        throw new IllegalStateException(e);
                    }
                }
                return true;
            }
        });

        unigrams.forEachPair(new ObjectIntProcedure<String>() {
            @Override
            public boolean apply(String term, int frequency) {
                try {
                    Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
                    gramKey.set(unigram, EMPTY);
                    context.write(gramKey, unigram);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                } catch (InterruptedException e) {
                    throw new IllegalStateException(e);
                }
                return true;
            }
        });
    } catch (IllegalStateException ise) {
        // catch an re-throw original exceptions from the procedures.
        if (ise.getCause() instanceof IOException) {
            throw (IOException) ise.getCause();
        } else {
            // wasn't what was expected, so re-throw
            throw ise;
        }
    }

    context.getCounter(Count.NGRAM_TOTAL).increment(count);

    sf.end();
    sf.close();
}

From source file:org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer.java

License:Apache License

/**
 * Builds a term-frequency vector for one document, optionally expanding the
 * token stream into ngrams (up to {@code maxNGramSize}) via a Lucene
 * ShingleFilter. Terms not present in {@code dictionary} are ignored.
 *
 * @param key     document id; used to name the emitted vector.
 * @param values  expected to hold a single StringTuple of tokens; empty groups are skipped.
 * @param context Hadoop context used to emit the (key, VectorWritable) pair.
 * @throws IOException          on ShingleFilter or output failure.
 * @throws InterruptedException if the Hadoop framework interrupts the write.
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return; // nothing emitted for an empty group
    }
    // Only the first tuple is consumed; any further values for this key are ignored.
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        try {
            // Per the TokenStream contract, attributes are only valid after
            // incrementToken() returns true, so advance the stream BEFORE
            // reading them (the original do/while read them one call early).
            while (sf.incrementToken()) {
                String term = sf.getAttribute(TermAttribute.class).term();
                if (term.length() > 0 && dictionary.containsKey(term)) { // ngram in dictionary
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            }
            sf.end();
        } finally {
            sf.close(); // always release the underlying stream, even on error
        }
    } else {
        for (String term : value.getEntries()) {
            if (term.length() > 0 && dictionary.containsKey(term)) { // unigram in dictionary
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }
    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(new NamedVector(vector, key.toString()));
        context.write(key, vectorWritable);
    } else {
        // NOTE(review): the "TFPartical..." misspelling is the counter group name
        // shipped with the original code; preserved because external tooling may
        // key on it.
        context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
    }
}