Example usage for org.apache.lucene.analysis.shingle ShingleFilter ShingleFilter

List of usage examples for org.apache.lucene.analysis.shingle ShingleFilter ShingleFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.shingle ShingleFilter ShingleFilter.

Prototype

public ShingleFilter(TokenStream input, String tokenType) 

Source Link

Document

Construct a ShingleFilter with the specified token type for shingle tokens and the default shingle size: 2

Usage

From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java

License:Apache License

/**
 * Builds a term-frequency vector for one document from its token StringTuple,
 * counting only terms present in the dictionary. When {@code maxNGramSize >= 2}
 * the tokens are run through a Lucene ShingleFilter so n-grams are counted too.
 *
 * @param key     document id
 * @param values  token tuples for this document (only the first is used)
 * @param context Hadoop context the resulting vector is written to
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return; // no tokens for this key; nothing to emit
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        try {
            // Lucene TokenStream contract: reset() must be called before the
            // first incrementToken(). (The sibling copy of this reducer does
            // this; it was missing here.)
            sf.reset();
            while (sf.incrementToken()) {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            }
            sf.end();
        } finally {
            Closeables.closeQuietly(sf);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        // NOTE(review): "TFParticalVectorReducer" looks like a typo of
        // "TFPartialVectorReducer" (compare the other copy of this class), but it is
        // the emitted counter group name, so it is deliberately left unchanged.
        context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.CharacterShingleAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Tokenize into character shingles, then normalize: standard filter,
    // ASCII folding, lower-casing, and finally word shingles of size up to 3.
    final Tokenizer tokenizer = new CharacterShingleTokenizer();
    TokenStream stream = new ShingleFilter(
            new LowerCaseFilter(new ASCIIFoldingFilter(new StandardFilter(tokenizer))), 3);

    //        stream = new WordDelimiterFilter(stream, WordDelimiterFilter.DIGIT, null);

    return new TokenStreamComponents(tokenizer, stream);
}

From source file:edu.isi.pfindr.learn.model.Shingles.java

License:Apache License

/**
 * Lower-cases {@code data}, tokenizes it, and returns its 4-word shingles
 * (word n-grams); unigrams are excluded via {@code setOutputUnigrams(false)}.
 * On any failure the exception is logged to stderr and whatever shingles were
 * collected so far are returned (best-effort).
 *
 * @param data input text to shingle
 * @return list of shingle strings, possibly empty
 */
public static List<String> computeShingles(String data) {

    data = data.toLowerCase();
    List<String> shingleList = new ArrayList<String>();
    try {
        // NOTE(review): Lucene's Tokenizer is abstract — this likely needs a concrete
        // tokenizer/analyzer (e.g. StandardAnalyzer) for Lucene 3.0; confirm against
        // the version actually on the classpath.
        Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
        ShingleFilter filter = new ShingleFilter(tokenStream, 4);
        filter.setOutputUnigrams(false);
        TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);

        // Pull every shingle off the stream.
        // NOTE(review): the filter/stream are never closed and reset() is never
        // called — acceptable for Lucene 3.0-era streams, but worth verifying.
        while (filter.incrementToken()) {
            shingleList.add(termAtt.term().trim()); //.replaceAll("_", " ").replaceAll("\\s+", " ").trim());
        }
    } catch (Exception e) {
        // Deliberate best-effort: swallow and return the partial list.
        e.printStackTrace();
    }
    logger.info("Shingle List size returned: " + shingleList.size());
    return shingleList;
}

From source file:edu.rosehulman.CollocMapper.java

License:Apache License

/**
 * Collocation finder: pass 1 map phase.
 * <p/>/*  w ww.j a  v a 2s . c  o  m*/
 * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of
 * the appropriate size which are then decomposed into head and tail subgrams which are collected in the
 * following manner
 * <p/>
 * <pre>
 * k:head_key,           v:head_subgram
 * k:head_key,ngram_key, v:ngram
 * k:tail_key,           v:tail_subgram
 * k:tail_key,ngram_key, v:ngram
 * </pre>
 * <p/>
 * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the
 * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
 * <p/>
 * For example, given 'click and clack' and an ngram length of 3:
 * <pre>
 * k: head_'click and'                         v:head_'click and'
 * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack'
 * k: tail_'clack',                            v:tail_'clack'
 * k: tail_'clack',ngram_'click and clack'     v:ngram_'click and clack'
 * </pre>
 * <p/>
 * Also counts the total number of ngrams encountered and adds it to the counter
 * CollocDriver.Count.NGRAM_TOTAL
 * </p>
 *
 * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output.
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
        throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
            maxShingleSize);
    sf.reset();
    try {
        int count = 0; // ngram count

        OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
                value.getEntries().size() * (maxShingleSize - 1));
        OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

        do {
            String term = sf.getAttribute(CharTermAttribute.class).toString();
            String type = sf.getAttribute(TypeAttribute.class).type();
            if ("shingle".equals(type)) {
                count++;
                ngrams.adjustOrPutValue(term, 1, 1);
            } else if (emitUnigrams && !term.isEmpty()) { // unigram
                unigrams.adjustOrPutValue(term, 1, 1);
            }
        } while (sf.incrementToken());

        final GramKey gramKey = new GramKey();

        ngrams.forEachPair(new ObjectIntProcedure<String>() {
            public boolean apply(String term, int frequency) {
                // obtain components, the leading (n-1)gram and the trailing unigram.
                int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
                if (i != -1) { // bigram, trigram etc

                    try {
                        Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
                        Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
                        Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

                        gramKey.set(head, EMPTY);
                        context.write(gramKey, head);

                        gramKey.set(head, ngram.getBytes());
                        context.write(gramKey, ngram);

                        gramKey.set(tail, EMPTY);
                        context.write(gramKey, tail);

                        gramKey.set(tail, ngram.getBytes());
                        context.write(gramKey, ngram);

                    } catch (IOException e) {
                        throw new IllegalStateException(e);
                    } catch (InterruptedException e) {
                        throw new IllegalStateException(e);
                    }
                }
                return true;
            }
        });

        unigrams.forEachPair(new ObjectIntProcedure<String>() {
            public boolean apply(String term, int frequency) {
                try {
                    Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
                    gramKey.set(unigram, EMPTY);
                    context.write(gramKey, unigram);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                } catch (InterruptedException e) {
                    throw new IllegalStateException(e);
                }
                return true;
            }
        });

        context.getCounter(Count.NGRAM_TOTAL).increment(count);
        sf.end();
    } finally {
        Closeables.close(sf, true);
    }
}

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

/**
 * Builds a term-frequency vector for one document from its token StringTuple,
 * counting only terms present in the dictionary. When {@code maxNGramSize >= 2}
 * the tokens are run through a Lucene ShingleFilter so n-grams are counted too.
 *
 * @param key     document id
 * @param values  token tuples for this document (only the first is used)
 * @param context Hadoop context the resulting vector is written to
 */
@Override
protected void reduce(Text key, Iterable<StringTuple> values, Context context)
        throws IOException, InterruptedException {
    Iterator<StringTuple> it = values.iterator();
    if (!it.hasNext()) {
        return; // no tokens for this key; nothing to emit
    }
    StringTuple value = it.next();

    Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size

    if (maxNGramSize >= 2) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
                maxNGramSize);
        try {
            // reset() is required before incrementToken(); it now lives inside the
            // try so the filter is closed even if reset() throws. The while loop
            // (rather than do/while) avoids reading attribute state before the
            // first token; the set of terms processed is identical.
            sf.reset();
            while (sf.incrementToken()) {
                String term = sf.getAttribute(CharTermAttribute.class).toString();
                if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram
                    int termId = dictionary.get(term);
                    vector.setQuick(termId, vector.getQuick(termId) + 1);
                }
            }
            sf.end();
        } finally {
            Closeables.close(sf, true);
        }
    } else {
        for (String term : value.getEntries()) {
            if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
                int termId = dictionary.get(term);
                vector.setQuick(termId, vector.getQuick(termId) + 1);
            }
        }
    }
    if (sequentialAccess) {
        vector = new SequentialAccessSparseVector(vector);
    }

    if (namedVector) {
        vector = new NamedVector(vector, key.toString());
    }

    // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
    if (vector.getNumNondefaultElements() > 0) {
        VectorWritable vectorWritable = new VectorWritable(vector);
        context.write(key, vectorWritable);
    } else {
        context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1);
    }
}

From source file:indexer.TweetAnalyzer.java

@Override
protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // URL/email-aware tokenization, then case folding, stopword removal and
    // URL removal; optionally bigram shingles and Porter stemming on top.
    final Tokenizer tokenizer = new UAX29URLEmailTokenizer(matchVersion, reader);

    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopSet);
    stream = new URLFilter(matchVersion, stream); // remove URLs

    if (!uniGramsOnly) {
        stream = new ShingleFilter(stream, 2); // index bigrams as well...
    }

    if (toStem) {
        stream = new PorterStemFilter(stream);
    }

    return new Analyzer.TokenStreamComponents(tokenizer, stream);
}

From source file:it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.java

License:Apache License

/**
 * Builds the analysis chain: strip punctuation and HTML, tokenize, standard
 * filter, lower-case, ASCII-fold, and (when {@code shingles} is set) emit
 * shingles of size up to 5.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {

    // FIX: the original class "[*-!`{}~[]='<>:/;.&%|=+_]" was malformed —
    // "*-!" is a descending character range and the unescaped "[]" ends the
    // class early, so Pattern.compile threw PatternSyntaxException at runtime.
    // The same characters are now listed literally, with '[' and ']' escaped
    // and '-' placed last so it is literal.
    CharFilter cf = new PatternReplaceCharFilter(Pattern.compile("[*!`{}~\\[\\]='<>:/;.&%|=+_-]"), "",
            reader);

    cf = new HTMLStripCharFilter(cf);

    final StandardTokenizer analyzer = new StandardTokenizer(Version.LUCENE_41, cf);
    TokenStream tok = new StandardFilter(Version.LUCENE_41, analyzer);
    tok = new LowerCaseFilter(Version.LUCENE_41, tok);
    tok = new ASCIIFoldingFilter(tok);
    if (shingles) {
        tok = new ShingleFilter(tok, 5);
    }
    return new TokenStreamComponents(analyzer, tok) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            super.setReader(reader);
        }
    };
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper.java

License:Apache License

/**
 * We need to count the number of times we've seen a term with a given label and we need to output that. But
 * this Mapper does more than just outputing the count. It first does weight normalisation. Secondly, it
 * outputs for each unique word in a document value 1 for summing up as the Term Document Frequency. Which
 * later is used to calculate the Idf Thirdly, it outputs for each label the number of times a document was
 * seen(Also used in Idf Calculation)/*from   w w  w .ja  v  a2s.c o m*/
 * 
 * @param key
 *          The label
 * @param value
 *          the features (all unique) associated w/ this label in stringtuple format
 * @param output
 *          The OutputCollector to write the results to
 * @param reporter
 *          Not used
 */
@Override
public void map(Text key, Text value, final OutputCollector<StringTuple, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // String line = value.toString();
    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);

    if (gramSize > 1) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)),
                gramSize);
        do {
            String term = (sf.getAttribute(TermAttribute.class)).term();
            if (term.length() > 0) {
                if (wordList.containsKey(term)) {
                    wordList.put(term, 1 + wordList.get(term));
                } else {
                    wordList.put(term, 1);
                }
            }
        } while (sf.incrementToken());
    } else {
        for (String term : tokens) {
            if (wordList.containsKey(term)) {
                wordList.put(term, 1 + wordList.get(term));
            } else {
                wordList.put(term, 1);
            }
        }
    }
    final MutableDouble lengthNormalisationMut = new MutableDouble(0.0);
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String word, int dKJ) {
            lengthNormalisationMut.add(dKJ * dKJ);
            return true;
        }
    });

    final double lengthNormalisation = Math.sqrt(lengthNormalisationMut.doubleValue());

    // Output Length Normalized + TF Transformed Frequency per Word per Class
    // Log(1 + D_ij)/SQRT( SIGMA(k, D_kj) )
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String token, int dKJ) {
            try {
                StringTuple tuple = new StringTuple();
                tuple.add(BayesConstants.WEIGHT);
                tuple.add(label);
                tuple.add(token);
                DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation);
                output.collect(tuple, f);
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            return true;
        }
    });
    reporter.setStatus("Bayes Feature Mapper: Document Label: " + label);

    // Output Document Frequency per Word per Class
    // Corpus Document Frequency (FEATURE_COUNT)
    // Corpus Term Frequency (FEATURE_TF)
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String token, int dKJ) {
            try {
                StringTuple dfTuple = new StringTuple();
                dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY);
                dfTuple.add(label);
                dfTuple.add(token);
                output.collect(dfTuple, ONE);

                StringTuple tokenCountTuple = new StringTuple();
                tokenCountTuple.add(BayesConstants.FEATURE_COUNT);
                tokenCountTuple.add(token);
                output.collect(tokenCountTuple, ONE);

                StringTuple tokenTfTuple = new StringTuple();
                tokenTfTuple.add(BayesConstants.FEATURE_TF);
                tokenTfTuple.add(token);
                output.collect(tokenTfTuple, new DoubleWritable(dKJ));
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            return true;
        }
    });

    // output that we have seen the label to calculate the Count of Document per
    // class
    StringTuple labelCountTuple = new StringTuple();
    labelCountTuple.add(BayesConstants.LABEL_COUNT);
    labelCountTuple.add(label);
    output.collect(labelCountTuple, ONE);
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** Shingled analysis: only tokens matching the whitelist are kept. */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
    Analyzer whitespace = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream tokens = whitespace.tokenStream(null, new StringReader(input));
    tokens.reset();
    TokenStream filtered = new BloomTokenFilter(getFilter(shingleKeepTokens), true,
            new ShingleFilter(tokens, 3));
    validateTokens(expectedShingleTokens, filtered);
    tokens.end();
    tokens.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.java

License:Apache License

/**
 * Collocation finder: pass 1 map phase.
 * <p/>/*w ww .j a  va  2 s  . c  o m*/
 * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of
 * the appropriate size which are then decomposed into head and tail subgrams which are collected in the
 * following manner
 * 
 * <pre>
 * k:head_key,           v:head_subgram
 * k:head_key,ngram_key, v:ngram
 * k:tail_key,           v:tail_subgram
 * k:tail_key,ngram_key, v:ngram
 * </pre>
 * 
 * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the
 * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
 * <p/>
 * For example, given 'click and clack' and an ngram length of 3: 
 * <pre>
 * k: head_'click and'                         v:head_'click and'
 * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack'
 * k: tail_'clack',                            v:tail_'clack'
 * k: tail_'clack',ngram_'click and clack'     v:ngram_'click and clack'
 * </pre>
 * 
 * Also counts the total number of ngrams encountered and adds it to the counter
 * CollocDriver.Count.NGRAM_TOTAL
 * </p>
 * 
 * @throws IOException
 *           if there's a problem with the ShingleFilter reading data or the collector collecting output.
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
        throws IOException, InterruptedException {

    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
            maxShingleSize);
    int count = 0; // ngram count

    OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
            value.getEntries().size() * (maxShingleSize - 1));
    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());

    do {
        String term = (sf.getAttribute(TermAttribute.class)).term();
        String type = (sf.getAttribute(TypeAttribute.class)).type();
        if ("shingle".equals(type)) {
            count++;
            ngrams.adjustOrPutValue(term, 1, 1);
        } else if (emitUnigrams && term.length() > 0) { // unigram
            unigrams.adjustOrPutValue(term, 1, 1);
        }
    } while (sf.incrementToken());

    try {
        final GramKey gramKey = new GramKey();

        ngrams.forEachPair(new ObjectIntProcedure<String>() {
            @Override
            public boolean apply(String term, int frequency) {
                // obtain components, the leading (n-1)gram and the trailing unigram.
                int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages.
                if (i != -1) { // bigram, trigram etc

                    try {
                        Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM);
                        Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD);
                        Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL);

                        gramKey.set(head, EMPTY);
                        context.write(gramKey, head);

                        gramKey.set(head, ngram.getBytes());
                        context.write(gramKey, ngram);

                        gramKey.set(tail, EMPTY);
                        context.write(gramKey, tail);

                        gramKey.set(tail, ngram.getBytes());
                        context.write(gramKey, ngram);

                    } catch (IOException e) {
                        throw new IllegalStateException(e);
                    } catch (InterruptedException e) {
                        throw new IllegalStateException(e);
                    }
                }
                return true;
            }
        });

        unigrams.forEachPair(new ObjectIntProcedure<String>() {
            @Override
            public boolean apply(String term, int frequency) {
                try {
                    Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM);
                    gramKey.set(unigram, EMPTY);
                    context.write(gramKey, unigram);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                } catch (InterruptedException e) {
                    throw new IllegalStateException(e);
                }
                return true;
            }
        });
    } catch (IllegalStateException ise) {
        // catch an re-throw original exceptions from the procedures.
        if (ise.getCause() instanceof IOException) {
            throw (IOException) ise.getCause();
        } else {
            // wasn't what was expected, so re-throw
            throw ise;
        }
    }

    context.getCounter(Count.NGRAM_TOTAL).increment(count);

    sf.end();
    sf.close();
}