Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class<? extends Attribute> value.
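
Before the usage excerpts below, here is a minimal, self-contained sketch of the standard consume loop (not taken from any of the sources below; the field name and text are placeholders). Note that addAttribute must be called before reset(), and the returned instance is reused: incrementToken() refills the same attribute object for each token.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("field", "Lucene token streams")) {
            // Register interest in the term text; returns the shared instance.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                       // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // prints: lucene, token, streams
            }
            ts.end();                         // record final offset state
        }                                     // try-with-resources closes the stream
        analyzer.close();
    }
}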

Usage

From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java

License:Apache License

@Test
public void testRSLPPhrases() throws Exception {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).put("index.analysis.analyzer.myAnalyzer.type", "custom")
            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.myAnalyzer.filter", "br_rslp").build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    Analyzer analyzer = analysisService.analyzer("myAnalyzer");

    Map<String, List<String>> phrases = buildPhraseList();

    for (String phrase : phrases.keySet()) {
        List<String> outputWords = phrases.get(phrase);

        TokenStream ts = analyzer.tokenStream("test", phrase);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();

        for (String expected : outputWords) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        ts.close();

    }
}

From source file:org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java

License:Apache License

private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);

    stream1.reset();
    stream2.reset();

    assertThat(stream1.incrementToken(), equalTo(true));
    assertThat(stream2.incrementToken(), equalTo(true));
    assertThat(Integer.signum(term1.toString().compareTo(term2.toString())),
            equalTo(Integer.signum(comparison)));
    assertThat(stream1.incrementToken(), equalTo(false));
    assertThat(stream2.incrementToken(), equalTo(false));

    stream1.end();
    stream2.end();

    stream1.close();
    stream2.close();
}
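
A hypothetical caller of this helper, assuming Lucene's ICUCollationKeyAnalyzer (each stream then emits exactly one collation-key token, so the signum comparison above is well defined):

// Hypothetical usage sketch; the field name and inputs are made up.
Collator collator = Collator.getInstance(ULocale.ENGLISH);
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
assertCollation(analyzer.tokenStream("field", "apple"),
        analyzer.tokenStream("field", "banana"), -1); // "apple" collates first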

From source file:org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java

License:Apache License

private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));

    assertThat(term1.toString(), equalTo(expected));
}

From source file:org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java

License:Apache License

private void testAnalyzer(String source, String... expected_terms) throws IOException {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    Analyzer analyzer = analysisService.analyzer("polish").analyzer();

    TokenStream ts = analyzer.tokenStream("test", source);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();

    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
}

From source file:org.elasticsearch.index.analysis.SimpleUkrainianAnalyzerTests.java

License:Apache License

private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new AnalysisUkrainianPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
    assertThat(ts.incrementToken(), equalTo(false));
}

From source file:org.elasticsearch.index.analysis.SimpleVietnameseAnalyzerTests.java

License:Apache License

private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new AnalysisVietnamesePlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("vietnamese").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expected_terms) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(expected));
    }
    assertThat(ts.incrementToken(), equalTo(false));
}

From source file:org.elasticsearch.index.analysis.SmartChineseAnalysisTests.java

License:Apache License

/**
 * With the default analyzer or the default Chinese analyzer, you would get:
 * { "我", "是", "中", "国", "人" }.
 * <p>
 * But the SmartAnalyzer gets you the more desirable:
 * { "我", "是", "中国", "人" }.
 * That is, "中国" (China) is one token.
 */
@Test
public void analyzeSomeChineseText() throws Exception {
    Index index = new Index("test");

    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS),
            new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule())
            .createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, EMPTY_SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new SmartChineseAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    Analyzer analyzer = analysisService.analyzer("smartcn").analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("message", "", 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

    List<String> terms = new ArrayList<String>();
    while (stream.incrementToken()) {
        String tokText = termAtt.term();
        terms.add(tokText);
    }

    MatcherAssert.assertThat(terms.size(), equalTo(4));
    MatcherAssert.assertThat(terms, hasItems("我", "是", "中国", "人"));
}

From source file:org.elasticsearch.index.analysis.synonyms.SynonymsAnalysisTest.java

License:Apache License

private void match(String analyzerName, String source, String target) throws IOException {

    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field", source, 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }

    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}
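
A hypothetical call, assuming an analyzer registered as "synonymAnalyzer" whose synonym filter contains the contraction rule "fast => quick":

// Hypothetical usage sketch; the analyzer name and synonym rule are assumptions.
match("synonymAnalyzer", "the fast car", "the quick car");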

From source file:org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java

License:Apache License

/**
 * Count position increments in a token stream.  Package private for testing.
 * @param tokenStream token stream to count
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(TokenStream tokenStream) throws IOException {
    try {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    } finally {
        tokenStream.close();
    }
}
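
A hypothetical caller (this helper consumes, ends, and closes the stream itself, so a fresh stream is passed in), assuming Lucene's StandardAnalyzer:

// Hypothetical usage sketch; field name and text are made up.
Analyzer analyzer = new StandardAnalyzer();
int count = countPositions(analyzer.tokenStream("body", "one two three"));
// count == 3: three tokens, each with a position increment of 1,
// plus a final increment of 0 applied after end()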

From source file:org.elasticsearch.index.mapper.date.LegacyDateMappingTests.java

License:Apache License

private void assertNumericTokensEqual(ParsedDocument doc, DocumentMapper defaultMapper, String fieldA,
        String fieldB) throws IOException {
    assertThat(doc.rootDoc().getField(fieldA).tokenStream(defaultMapper.mappers().indexAnalyzer(), null),
            notNullValue());
    assertThat(doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null),
            notNullValue());

    TokenStream tokenStream = doc.rootDoc().getField(fieldA)
            .tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    LegacyNumericTermAttribute nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    List<Long> values = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        values.add(nta.getRawValue());
    }

    tokenStream = doc.rootDoc().getField(fieldB).tokenStream(defaultMapper.mappers().indexAnalyzer(), null);
    tokenStream.reset();
    nta = tokenStream.addAttribute(LegacyNumericTermAttribute.class);
    int pos = 0;
    while (tokenStream.incrementToken()) {
        assertThat(values.get(pos++), equalTo(nta.getRawValue()));
    }
    assertThat(pos, equalTo(values.size()));
}