List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file: org.elasticsearch.index.analysis.RSLPTokenFilterTests.java
License: Apache License
@Test public void testRSLPPhrases() throws Exception { Index index = new Index("test"); Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir()).put("index.analysis.analyzer.myAnalyzer.type", "custom") .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard") .put("index.analysis.analyzer.myAnalyzer.filter", "br_rslp").build(); AnalysisService analysisService = createAnalysisService(index, settings); Analyzer analyzer = analysisService.analyzer("myAnalyzer"); Map<String, List<String>> phrases = buildPhraseList(); for (String phrase : phrases.keySet()) { List<String> outputWords = phrases.get(phrase); TokenStream ts = analyzer.tokenStream("test", phrase); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//www. j a va2s. co m for (String expected : outputWords) { assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); } ts.close(); } }
From source file: org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java
License: Apache License
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class); CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class); stream1.reset();//ww w . j a v a2s.c om stream2.reset(); assertThat(stream1.incrementToken(), equalTo(true)); assertThat(stream2.incrementToken(), equalTo(true)); assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); assertThat(stream1.incrementToken(), equalTo(false)); assertThat(stream2.incrementToken(), equalTo(false)); stream1.end(); stream2.end(); stream1.close(); stream2.close(); }
From source file: org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java
License: Apache License
/**
 * Feeds {@code source} through a keyword tokenizer and the "polish_stem" filter
 * and asserts the single resulting term equals {@code expected}.
 *
 * @param source   input text (treated as one keyword token)
 * @param expected expected stemmed term
 * @throws IOException if tokenization fails
 */
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir())
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings);
    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
        // TokenStream contract: end() after consumption; close() releases the reader.
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java
License: Apache License
private void testAnalyzer(String source, String... expected_terms) throws IOException { Index index = new Index("test"); Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", createTempDir()).build(); AnalysisService analysisService = createAnalysisService(index, settings); Analyzer analyzer = analysisService.analyzer("polish").analyzer(); TokenStream ts = analyzer.tokenStream("test", source); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w w w .j ava 2s . c o m for (String expected : expected_terms) { assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); } }
From source file: org.elasticsearch.index.analysis.SimpleUkrainianAnalyzerTests.java
License: Apache License
/**
 * Analyzes {@code source} with the "ukrainian" analyzer from the plugin and
 * asserts the emitted terms match {@code expected_terms} exactly, with no
 * trailing tokens.
 *
 * @param source         input text to analyze
 * @param expected_terms expected terms, in emission order
 * @throws IOException if analysis fails
 */
private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new AnalysisUkrainianPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : expected_terms) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        // Stream must be exhausted: no tokens beyond the expected ones.
        assertThat(ts.incrementToken(), equalTo(false));
        // TokenStream contract: end() after consumption; close() in finally.
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: org.elasticsearch.index.analysis.SimpleVietnameseAnalyzerTests.java
License: Apache License
private static void testAnalyzer(String source, String... expected_terms) throws IOException { TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, new AnalysisVietnamesePlugin()); Analyzer analyzer = analysis.indexAnalyzers.get("vietnamese").analyzer(); TokenStream ts = analyzer.tokenStream("test", source); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset();//from w w w . j a va 2 s . c o m for (String expected : expected_terms) { assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); } assertThat(ts.incrementToken(), equalTo(false)); }
From source file: org.elasticsearch.index.analysis.SmartChineseAnalysisTests.java
License: Apache License
/**
 * Analyzes a Chinese sentence with the "smartcn" analyzer and asserts it is
 * segmented into 4 word-level tokens (the SmartAnalyzer keeps multi-character
 * words such as "China" as single tokens, unlike the default per-character
 * Chinese analyzers which would produce 5).
 *
 * NOTE(review): the original CJK string literals were garbled to "" during
 * extraction and cannot be recovered here — restore them from the upstream
 * test before relying on this method.
 */
@Test
public void analyzeSomeChineseText() throws Exception {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder()
            .add(new SettingsModule(EMPTY_SETTINGS),
                    new EnvironmentModule(new Environment(EMPTY_SETTINGS)),
                    new IndicesAnalysisModule())
            .createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, EMPTY_SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new SmartChineseAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);
    Analyzer analyzer = analysisService.analyzer("smartcn").analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("message", "", 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    try {
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        // TokenStream contract requires reset() before the first incrementToken().
        stream.reset();
        List<String> terms = new ArrayList<String>();
        while (stream.incrementToken()) {
            terms.add(termAtt.term());
        }
        stream.end();
        MatcherAssert.assertThat(terms.size(), equalTo(4));
        MatcherAssert.assertThat(terms, hasItems("", "", "", ""));
    } finally {
        stream.close();
    }
}
From source file: org.elasticsearch.index.analysis.synonyms.SynonymsAnalysisTest.java
License: Apache License
/**
 * Analyzes {@code source} with the named analyzer and asserts the
 * space-joined terms equal {@code target}.
 *
 * @param analyzerName name of the analyzer registered with the analysis service
 * @param source       input text to analyze
 * @param target       expected space-separated term string
 * @throws IOException if analysis fails
 */
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field", source, 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    try {
        stream.reset();
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        StringBuilder sb = new StringBuilder();
        while (stream.incrementToken()) {
            sb.append(termAtt.toString()).append(" ");
        }
        // TokenStream contract: end() after the last token; close() in finally.
        stream.end();

        MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
    } finally {
        stream.close();
    }
}
From source file: org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java
License: Apache License
/** * Count position increments in a token stream. Package private for testing. * @param tokenStream token stream to count * @return number of position increments in a token stream * @throws IOException if tokenStream throws it *///from www .ja v a2 s . c om static int countPositions(TokenStream tokenStream) throws IOException { try { int count = 0; PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { count += position.getPositionIncrement(); } tokenStream.end(); count += position.getPositionIncrement(); return count; } finally { tokenStream.close(); } }
From source file: org.elasticsearch.index.mapper.date.LegacyDateMappingTests.java
License: Apache License
private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA, String fieldB) throws IOException { assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue());// w w w .java 2 s.c o m assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null), notNullValue()); TokenStream tokenStream = doc.rootDoc().getField(fieldA) .tokenStream(defaultMapper.mappers().indexAnalyzer(), null); tokenStream.reset(); LegacyNumericTermAttribute nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class); List<Long> values = new ArrayList<>(); while (tokenStream.incrementToken()) { values.add(nta.getRawValue()); } tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null); tokenStream.reset(); nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class); int pos = 0; while (tokenStream.incrementToken()) { assertThat(values.get(pos++), equalTo(nta.getRawValue())); } assertThat(pos, equalTo(values.size())); }