Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

This page lists example usages for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value; the stream returns its existing instance of that attribute type if one is already registered, otherwise it creates, registers, and returns a new one.
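
Before the examples, here is a minimal, self-contained sketch of the usual addAttribute pattern: register the attributes you need on the TokenStream, then drive the stream with reset(), incrementToken(), and end(). It is an illustrative sketch, not taken from the examples below; the field name "body", the sample text, and the use of StandardAnalyzer (no-arg constructor, Lucene 5.x or later) are placeholder assumptions.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws Exception {
        // Analyzer and TokenStream are both Closeable, so try-with-resources handles cleanup.
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("body", "some example text")) {
            // Attributes are typically registered before reset(); addAttribute returns the
            // stream's existing instance of the requested type, or creates and registers a new one.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "]");
            }
            ts.end();
        }
    }
}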

Usage

From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java

License:Apache License

/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param reader a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }

    TokenStream ts = analyzer.tokenStream(fieldName, reader);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (word.trim().length() == 0) {
                continue;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java

License:Apache License

private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
        Reader reader = new StringReader(suggestion);
        ts = this.suggestionAnalyzer.tokenStream("", reader);

        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            if (word != null && word.length() > 0) {
                return word;
            }
        }
    } catch (Exception ex) {
        if (this.field != null) {
            LOG.error(
                    String.format("Error executing analyzer for field: {0} in DiceSuggester on suggestion: {1}",
                            this.field, suggestion),
                    ex);
        } else if (this.fieldTypeName != null) {
            LOG.error(String.format(
                    "Error executing analyzer for field type: {0} in DiceSuggester on suggestion: {1}",
                    this.fieldTypeName, suggestion), ex);
        }
    } finally {
        if (ts != null) {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
    return null;
}

From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java

License:Apache License

/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java

License:Open Source License

/**
 * Parses the name, removing unwanted characters from it.
 *
 * @param field
 * @param term
 * @param name
 * @return Query
 */
public static Query analyze(String field, Term term, String name) {
    TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));

    BooleanQuery bQuery = new BooleanQuery();
    WildcardQuery wQuery;

    Set<String> tokens = new HashSet<String>(); // avoids repeated terms.

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("IOException analyzing string", e);
    }

    for (String text : tokens) {
        wQuery = new WildcardQuery(term.createTerm(text));
        bQuery.add(wQuery, BooleanClause.Occur.MUST);
    }

    return bQuery;
}

From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java

License:Open Source License

@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT)
                : RESULT_LIMIT;
        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();

                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);

                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);

                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    for (boolean next = tokenStream.incrementToken(); next
                            && (offsetAtt.startOffset() < 1_000_000); next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();

                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(
                                    new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }

            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;

        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult().withResults(results).withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression).withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}

From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java

License:Apache License

@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
    IndexService indexService = null;
    if (request.index() != null) {
        indexService = indicesService.indexServiceSafe(request.index());
    }
    Analyzer analyzer = null;
    boolean closeAnalyzer = false;
    String field = null;
    if (request.field() != null) {
        if (indexService == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "No index provided, and trying to analyzer based on a specific field which requires the index parameter");
        }
        FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
        if (fieldMapper != null) {
            if (fieldMapper.isNumeric()) {
                throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                        + "], Analysis requests are not supported on numeric fields");
            }
            analyzer = fieldMapper.indexAnalyzer();
            field = fieldMapper.names().indexName();

        }
    }
    if (field == null) {
        if (indexService != null) {
            field = indexService.queryParserService().defaultField();
        } else {
            field = AllFieldMapper.NAME;
        }
    }
    if (analyzer == null && request.analyzer() != null) {
        if (indexService == null) {
            analyzer = indicesAnalysisService.analyzer(request.analyzer());
        } else {
            analyzer = indexService.analysisService().analyzer(request.analyzer());
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "failed to find analyzer [" + request.analyzer() + "]");
        }
    } else if (request.tokenizer() != null) {
        TokenizerFactory tokenizerFactory;
        if (indexService == null) {
            TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                    .tokenizerFactoryFactory(request.tokenizer());
            if (tokenizerFactoryFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find global tokenizer under [" + request.tokenizer() + "]");
            }
            tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                    ImmutableSettings.Builder.EMPTY_SETTINGS);
        } else {
            tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
            if (tokenizerFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find tokenizer under [" + request.tokenizer() + "]");
            }
        }
        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
            for (int i = 0; i < request.tokenFilters().length; i++) {
                String tokenFilterName = request.tokenFilters()[i];
                if (indexService == null) {
                    TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                            .tokenFilterFactoryFactory(tokenFilterName);
                    if (tokenFilterFactoryFactory == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find global token filter under [" + request.tokenizer() + "]");
                    }
                    tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                            ImmutableSettings.Builder.EMPTY_SETTINGS);
                } else {
                    tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + request.tokenizer() + "]");
                    }
                }
                if (tokenFilterFactories[i] == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find token filter under [" + request.tokenizer() + "]");
                }
            }
        }
        analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
        closeAnalyzer = true;
    } else if (analyzer == null) {
        if (indexService == null) {
            analyzer = Lucene.STANDARD_ANALYZER;
        } else {
            analyzer = indexService.analysisService().defaultIndexAnalyzer();
        }
    }
    if (analyzer == null) {
        throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
    }

    List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, request.text());
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                    offset.endOffset(), type.type()));
        }
        stream.end();
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                // ignore
            }
        }
        if (closeAnalyzer) {
            analyzer.close();
        }
    }

    return new AnalyzeResponse(tokens);
}

From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java

License:Apache License

@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input -> new EdgeNGramTokenFilter(input,
            EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(
            PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(
            PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(
            PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(
            PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                    | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}

From source file:org.elasticsearch.analysis.common.CompoundAnalysisTests.java

License:Apache License

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = createAnalysisModule(settings);
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", text);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}

From source file:org.elasticsearch.analysis.common.SynonymsAnalysisTests.java

License:Apache License

private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", source);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }

    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}

From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java

License:Apache License

public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
}