Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass)

Source Link

Document

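The caller must pass in a Class<? extends Attribute> value. This method first checks whether an instance of that class is already present in this AttributeSource and, if so, returns it; otherwise a new instance is created, added to this AttributeSource, and returned.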

Usage

From source file:com.mathworks.xzheng.analysis.positional.PositionalStopFilter.java

License:Apache License

/**
 * Creates a stop filter wrapping {@code in}.
 *
 * @param in        the upstream token stream; it is handed to the superclass
 *                  constructor and is also the object the attributes are
 *                  requested from
 * @param stopWords the stop-word set kept on this filter
 */
public PositionalStopFilter(TokenStream in, CharArraySet stopWords) {
    super(in);
    // Request the attributes in the same order as before: position increment
    // first, then the term text.
    this.posIncrAttr = in.addAttribute(PositionIncrementAttribute.class);
    this.termAttr = in.addAttribute(CharTermAttribute.class);
    this.stopWords = stopWords;
}

From source file:com.mathworks.xzheng.analysis.synonym.SynonymAnalyzerTest.java

License:Apache License

/**
 * Checks that analyzing the text "jumps" with {@code synonymAnalyzer} produces
 * exactly three tokens: "jumps", "hops" and "leaps", where the first token has
 * a position increment of 1 and the following ones a position increment of 0.
 *
 * @throws Exception if the analyzer or the token stream fails
 */
public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", new StringReader("jumps"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    // Expected terms, in the order the stream must emit them.
    String[] expected = new String[] { "jumps", "hops", "leaps" };
    int i = 0;

    try {
        // The TokenStream workflow requires reset() before the first incrementToken().
        stream.reset();
        while (stream.incrementToken()) {
            // Compare term text, not the backing char[]: buffer() returns the raw
            // (possibly oversized) array, which never equals a String.
            assertEquals(expected[i], term.toString());

            // Only the first token advances the position; the others are stacked on it.
            int expectedPos = (i == 0) ? 1 : 0;
            assertEquals(expectedPos, posIncr.getPositionIncrement());
            i++;
        }
        stream.end();
    } finally {
        stream.close();
    }
    assertEquals(3, i);
}

From source file:com.memonews.mahout.sentiment.SentimentModelHelper.java

License:Apache License

/**
 * Tokenizes {@code in} with {@code analyzer}, appends the text of every token
 * to {@code words}, and finally adds the entire content of {@code words} to
 * {@code overallCounts}.
 *
 * <p>Note that everything in {@code words} is added to {@code overallCounts},
 * including entries that were already present before this call.
 *
 * <p>The token stream is ended and closed before returning.
 * NOTE(review): closing the stream is expected to close {@code in} as well;
 * confirm that no caller keeps reading from the reader afterwards.
 *
 * @param analyzer      analyzer used to obtain the token stream for field "text"
 * @param words         collection receiving one entry per token
 * @param in            the text to tokenize
 * @param overallCounts multiset receiving all of {@code words}
 * @throws IOException if tokenizing or closing the stream fails
 */
private static void countWords(final Analyzer analyzer, final Collection<String> words, final Reader in,
        final Multiset<String> overallCounts) throws IOException {
    final TokenStream ts = analyzer.reusableTokenStream("text", in);
    try {
        // Keep the attribute returned by addAttribute instead of looking it up
        // again on every token.
        final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            words.add(termAtt.toString());
        }
        ts.end();
    } finally {
        ts.close();
    }
    overallCounts.addAll(words);
}

From source file:com.mhs.qsol.proximity.ProximityVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate span query.
 *
 * <p>The token is stripped of escape characters, analyzed with
 * {@code analyzer} for {@code field}, and the resulting tokens decide the
 * query shape:
 * <ul>
 * <li>no tokens: {@code null};</li>
 * <li>one token: a boosted {@link SpanTermQuery};</li>
 * <li>several tokens all at one position: a {@link SpanOrQuery} over the terms;</li>
 * <li>several positions with stacked tokens: an ordered {@link SpanNearQuery}
 * whose clauses are a term query or a {@link SpanOrQuery} per position;</li>
 * <li>otherwise: an ordered {@link SpanNearQuery} with one term per token.</li>
 * </ul>
 *
 * @param token the raw token text
 * @return the query for the analyzed token, or {@code null} if analysis
 *         produced no tokens
 */
protected Query tokenToQuery(String token) {
    if (logger.isLoggable(Level.FINE)) {
        logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    // addAttribute returns the existing attribute or registers a new one, whereas
    // getAttribute throws IllegalArgumentException when the stream lacks it.
    CharTermAttribute charTermAtrib = source.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            // Best effort: stop reading and build the query from the tokens
            // collected so far.
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            // A zero increment means this token shares the previous token's position.
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore: all tokens have already been read
    }

    if (v.size() == 0) {
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        stq.setBoost(this.boost);
        return stq;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query: every token sits at the same position, so any
                // one of them may match.
                SpanQuery[] spanQueries = new SpanQuery[v.size()];

                for (int i = 0; i < v.size(); i++) {
                    // Use the token's own text; previously every clause was built
                    // from an empty string and could never match the analyzed terms.
                    Token alternative = v.get(i);
                    spanQueries[i] = new SpanTermQuery(
                            new Term(field, new String(alternative.buffer(), 0, alternative.length())));
                    // NOTE(review): unlike the other branches, no boost is applied
                    // here; confirm whether that is intended.
                }

                return new SpanOrQuery(spanQueries);
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    // A positive increment (or the very first token) opens a new position.
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    // One term at this position: use it directly; several: OR them.
                    int size = curTermQueries.size();
                    if (size <= 0) {
                        continue;
                    } else if (size == 1) {
                        spanNearSubclauses.add(curTermQueries.get(0));
                    } else {
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                    }
                }

                SpanNearQuery query = new SpanNearQuery(spanNearSubclauses.toArray(new SpanQuery[0]), slop,
                        true);

                return query;
            }
        } else {
            // One token per position: an ordered proximity query over all terms.
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length())));
            }

            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.mhs.qsol.QsolToQueryVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * <p>The token is stripped of escape characters, analyzed with
 * {@code analyzer} for {@code field}, and the resulting tokens decide the
 * query shape:
 * <ul>
 * <li>no tokens: {@code null};</li>
 * <li>one token: a boosted {@link TermQuery};</li>
 * <li>several tokens all at one position: a {@link BooleanQuery} of optional
 * boosted term queries;</li>
 * <li>several positions with stacked tokens: an ordered {@link SpanNearQuery}
 * whose clauses are a term query or a {@link SpanOrQuery} per position;</li>
 * <li>otherwise: an ordered {@link SpanNearQuery} with one boosted term per token.</li>
 * </ul>
 *
 * @param token the raw token text
 * @return the query for the analyzed token, or {@code null} if analysis
 *         produced no tokens
 */
protected Query tokenToQuery(String token) {

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    // addAttribute returns the existing attribute or registers a new one, whereas
    // getAttribute throws IllegalArgumentException when the stream lacks it.
    CharTermAttribute charTermAtrib = source.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            // Best effort: stop reading and build the query from the tokens
            // collected so far.
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            // A zero increment means this token shares the previous token's position.
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore: all tokens have already been read
    }

    if (v.size() == 0) {
        // null's will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {

        t = v.get(0);

        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);

        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query: every token sits at the same position, so any
                // one of them may match.
                BooleanQuery q = new BooleanQuery(true);

                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);

                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);

                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }

                return q;
            } else {
                // All the Tokens in each sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    // A positive increment (or the very first token) opens a new position.
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    // One term at this position: use it directly; several: OR them.
                    int size = curTermQueries.size();
                    if (size <= 0) {
                        continue;
                    } else if (size == 1) {
                        spanNearSubclauses.add(curTermQueries.get(0));
                    } else {
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                    }
                }

                SpanNearQuery query = new SpanNearQuery(spanNearSubclauses.toArray(new SpanQuery[0]), slop,
                        true);

                return query;
            }
        } else {
            // One token per position: an ordered proximity query over all terms.
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }

            // Note: There's a bug here (not by me) where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java

License:Apache License

/**
 * Tokenizes {@code value} with {@code analyzer} (using the key's string form
 * as the field name), collects every non-empty term into a
 * {@link StringTuple}, removes stop words through
 * {@code StopWordsHandler.dropStopWords}, and writes the result under the
 * original key.
 *
 * @param key     record key; also used as the analyzer field name
 * @param value   the text to tokenize
 * @param context output collector
 * @throws IOException          if tokenizing or writing fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    StringTuple document = new StringTuple();
    try {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        // reset() must be called exactly once before consuming; a second call
        // violates the TokenStream contract.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                document.add(new String(termAtt.buffer(), 0, termAtt.length()));
            }
        }
        stream.end();
    } finally {
        // Close even when tokenization fails; close-time IOExceptions are swallowed.
        Closeables.close(stream, true);
    }

    //drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}

From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

/**
 * Demo entry point: loads a stop-word dictionary from a fixed local path,
 * analyzes a sample sentence with {@code NGramEnglishAnalyzer}, and prints
 * every non-empty term to standard output.
 *
 * @param args unused
 * @throws IOException if the dictionary cannot be loaded or tokenizing fails
 */
public static void main(String[] args) throws IOException {
    Set<String> stopwords = Dictionary
            .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt"));
    NGramEnglishAnalyzer analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(
            Version.LUCENE_31, stopwords, false, true);
    TokenStream stream = analyzer.tokenStream("",
            new StringReader("When I was growing up this was so much fun."));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        // Follow the documented consumer workflow: reset, consume, end, close.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                System.out.println(termAttr.toString());
                termAttr.setEmpty();
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
}

From source file:com.mozilla.grouperfish.pig.eval.text.NGramTokenize.java

License:Apache License

/**
 * Tokenizes the first field of {@code input} into n-grams and returns one
 * single-field tuple per non-empty term.
 *
 * <p>On the first call (while {@code analyzer} is still {@code null}) the
 * analyzer is configured from the optional extra fields:
 * <ol start="1">
 * <li>argument passed to {@code loadDictionary};</li>
 * <li>stemming flag (default {@code false});</li>
 * <li>output-unigrams flag (default {@code false});</li>
 * <li>minimum n-gram size (default 2);</li>
 * <li>maximum n-gram size (default 3).</li>
 * </ol>
 * A seventh field (index 6) was previously read as a language code but never
 * used; it is now ignored.
 *
 * @param input tuple whose field 0 is the text to tokenize
 * @return a bag of term tuples, or {@code null} when {@code input} is
 *         {@code null}, empty, or its first field is {@code null}
 * @throws IOException if reading the tuple or tokenizing fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        boolean outputUnigrams = false;
        if (input.size() > 3) {
            outputUnigrams = Boolean.parseBoolean((String) input.get(3));
        }
        int minNGram = 2;
        if (input.size() > 4) {
            minNGram = Integer.parseInt((String) input.get(4));
        }
        int maxNGram = 3;
        if (input.size() > 5) {
            maxNGram = Integer.parseInt((String) input.get(5));
        }

        // Fall back to the standard stop-word set when no dictionary was loaded.
        if (stopwords != null && stopwords.size() != 0) {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    stopwords, stem, outputUnigrams, minNGram, maxNGram);
        } else {
            analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31,
                    StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram);
        }
    }

    // A null text field would make StringReader throw a NullPointerException.
    String text = (String) input.get(0);
    if (text == null) {
        return null;
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader(text));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        // Follow the documented consumer workflow: reset, consume, end, close.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                Tuple t = tupleFactory.newTuple(termAttr.toString());
                output.add(t);
                termAttr.setEmpty();
            }
        }
        stream.end();
    } finally {
        stream.close();
    }

    return output;
}

From source file:com.mozilla.grouperfish.pig.eval.text.Tokenize.java

License:Apache License

/**
 * Tokenizes the first field of {@code input} and returns one single-field
 * tuple per non-empty term.
 *
 * <p>On the first call (while {@code analyzer} is still {@code null}) the
 * analyzer is chosen from the optional extra fields:
 * <ol start="1">
 * <li>argument passed to {@code loadDictionary};</li>
 * <li>stemming flag (default {@code false}), used by the English analyzer only;</li>
 * <li>language code (default {@code "en"}): prefixes {@code zh}/{@code ja}
 * select the CJK analyzer, {@code de} the German one, {@code es} the Spanish
 * one, anything else the English one.</li>
 * </ol>
 *
 * @param input tuple whose field 0 is the text to tokenize
 * @return a bag of term tuples, or {@code null} when {@code input} is
 *         {@code null}, empty, or its first field is {@code null}
 * @throws IOException if reading the tuple or tokenizing fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    if (analyzer == null) {
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        if (input.size() > 3) {
            langCode = (String) input.get(3);
        }

        if (langCode.startsWith("zh") || langCode.startsWith("ja")) {
            analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("de")) {
            analyzer = new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("es")) {
            analyzer = new org.apache.lucene.analysis.es.SpanishAnalyzer(Version.LUCENE_31);
        } else {
            // English: use the loaded dictionary as stop words when there is one.
            if (stopwords != null && stopwords.size() > 0) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stopwords, stem);
            } else {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stem);
            }
        }
    }

    // A null text field would make StringReader throw a NullPointerException.
    String text = (String) input.get(0);
    if (text == null) {
        return null;
    }

    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader(text));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        // Follow the documented consumer workflow: reset, consume, end, close.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                Tuple t = tupleFactory.newTuple(termAttr.toString());
                output.add(t);
                termAttr.setEmpty();
            }
        }
        stream.end();
    } finally {
        stream.close();
    }

    return output;
}

From source file:com.mozilla.grouperfish.transforms.coclustering.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

/**
 * Demo entry point: loads a stop-word dictionary from a fixed local path,
 * analyzes a sample sentence with {@code NGramEnglishAnalyzer}, and prints
 * every non-empty term to standard output.
 *
 * @param args unused
 * @throws IOException if the dictionary cannot be loaded or tokenizing fails
 */
public static void main(String[] args) throws IOException {
    // TODO: SMELLY: de-system-ify
    Set<String> stopwords = Dictionary
            .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt"));
    NGramEnglishAnalyzer analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, false, true);
    TokenStream stream = analyzer.tokenStream("",
            new StringReader("When I was growing up this was so much fun."));
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        // Follow the documented consumer workflow: reset, consume, end, close.
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() > 0) {
                System.out.println(termAttr.toString());
                termAttr.setEmpty();
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
}