Example usage for org.apache.lucene.analysis TokenStream reset

List of usage examples for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of the reset() method of org.apache.lucene.analysis.TokenStream.

Prototype

public void reset() throws IOException 

Source Link

Document

This method is called by a consumer before it begins consumption using incrementToken().

Usage

From source file:org.elasticsearch.index.analysis.PatternTokenizerTests.java

License:Apache License

/**
 * Consumes {@code in} completely and returns its terms joined by single spaces.
 * <p>
 * Before the stream is reset, and again after every token, the attributes are cleared and the
 * term is overwritten with a bogus value. A stream that does not set its own term on each
 * {@code incrementToken()} call therefore shows up as "bogusTerm" in the result.
 * <p>
 * TODO: rewrite tests not to use string comparison.
 *
 * @param in the stream to consume; it is closed before this method returns, even on failure
 * @return the space-separated terms, or an empty string if the stream produced no tokens
 * @throws IOException if the stream fails while being consumed or closed
 */
private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    try {
        CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
        // extra safety to enforce, that the state is not preserved and also
        // assign bogus values
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        in.reset();
        while (in.incrementToken()) {
            if (out.length() > 0) {
                out.append(' ');
            }
            out.append(termAtt.toString());
            in.clearAttributes();
            termAtt.setEmpty().append("bogusTerm");
        }
        // The stream is exhausted here, so finish the consumer workflow before closing.
        in.end();
    } finally {
        // Release the stream even when consumption fails part-way through.
        in.close();
    }
    return out.toString();
}

From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java

License:Apache License

/**
 * Checks that the DEFAULT and STANDARD pre-built analyzers are the same instance for
 * 1.0.0.Beta1, then analyzes a fixed sentence with the DEFAULT analyzer of several versions
 * (the current one first, random ones afterwards) and verifies the emitted terms.
 * <p>
 * For versions on or after 1.0.0.Beta1 all four lower-cased words are expected; for earlier
 * versions only "dude" is expected.
 */
@Test
public void testThatDefaultAndStandardAnalyzerChangedIn10Beta1() throws IOException {
    Analyzer currentStandardAnalyzer = PreBuiltAnalyzers.STANDARD.getAnalyzer(Version.V_1_0_0_Beta1);
    Analyzer currentDefaultAnalyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1);

    // special case, these two are the same instance
    assertThat(currentDefaultAnalyzer, is(currentStandardAnalyzer));
    final int n = atLeast(10);
    Version version = Version.CURRENT;
    for (int i = 0; i < n; i++) {
        if (version.equals(Version.V_1_0_0_Beta1)) {
            assertThat(currentDefaultAnalyzer, is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version)));
        } else {
            assertThat(currentDefaultAnalyzer, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
        }
        Analyzer analyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(version);
        List<String> list = new ArrayList<String>();
        TokenStream ts = analyzer.tokenStream("foo", "This is it Dude");
        try {
            ts.reset();
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                list.add(charTermAttribute.toString());
            }
            // The stream is exhausted, so complete the consumer workflow.
            ts.end();
        } finally {
            // Always release the stream so a failure here cannot leave it open.
            ts.close();
        }
        if (version.onOrAfter(Version.V_1_0_0_Beta1)) {
            assertThat(list.size(), is(4));
            assertThat(list, contains("this", "is", "it", "dude"));
        } else {
            assertThat(list.size(), is(1));
            assertThat(list, contains("dude"));
        }
        version = randomVersion();
    }
}

From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java

License:Apache License

/**
 * Analyzes a fixed sentence with either the PATTERN or the STANDARD_HTML_STRIP pre-built
 * analyzer (chosen at random) for several versions (the current one first, random ones
 * afterwards) and verifies the emitted terms.
 * <p>
 * For versions on or after 1.0.0.RC1 all four lower-cased words are expected; for earlier
 * versions only "dude" is expected.
 */
@Test
public void testAnalyzerChangedIn10RC1() throws IOException {
    Analyzer pattern = PreBuiltAnalyzers.PATTERN.getAnalyzer(Version.V_1_0_0_RC1);
    Analyzer standardHtml = PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(Version.V_1_0_0_RC1);
    final int n = atLeast(10);
    Version version = Version.CURRENT;
    for (int i = 0; i < n; i++) {
        if (version.equals(Version.V_1_0_0_RC1)) {
            assertThat(pattern, is(PreBuiltAnalyzers.PATTERN.getAnalyzer(version)));
            assertThat(standardHtml, is(PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version)));
        } else {
            // NOTE(review): these compare against DEFAULT rather than PATTERN /
            // STANDARD_HTML_STRIP, which looks like a copy-paste slip - confirm the
            // intended analyzers before tightening the assertion.
            assertThat(pattern, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
            assertThat(standardHtml, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
        }
        Analyzer analyzer = randomBoolean() ? PreBuiltAnalyzers.PATTERN.getAnalyzer(version)
                : PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version);
        List<String> list = new ArrayList<String>();
        TokenStream ts = analyzer.tokenStream("foo", "This is it Dude");
        try {
            ts.reset();
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                list.add(charTermAttribute.toString());
            }
            // The stream is exhausted, so complete the consumer workflow.
            ts.end();
        } finally {
            // Always release the stream so a failure here cannot leave it open.
            ts.close();
        }
        if (version.onOrAfter(Version.V_1_0_0_RC1)) {
            assertThat(list.toString(), list.size(), is(4));
            assertThat(list, contains("this", "is", "it", "dude"));
        } else {
            assertThat(list.size(), is(1));
            assertThat(list, contains("dude"));
        }
        version = randomVersion();
    }
}

From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java

License:Apache License

/**
 * Feeds every input word returned by {@code buildWordList()} through the "br_rslp" token
 * filter (on top of a keyword tokenizer) and checks that the first emitted term equals the
 * value mapped to that word.
 */
@Test
public void testRSLPRules() throws Exception {
    Index testIndex = new Index("test");
    Settings indexSettings = Settings.settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir())
            .put("index.analysis.filter.myStemmer.type", "br_rslp")
            .build();
    AnalysisService service = createAnalysisService(testIndex, indexSettings);
    TokenFilterFactory stemmerFactory = service.tokenFilter("myStemmer");
    Tokenizer keywordTokenizer = new KeywordTokenizer();
    Map<String, String> expectedByWord = buildWordList();

    for (Map.Entry<String, String> entry : expectedByWord.entrySet()) {
        // The single tokenizer is re-pointed at each word in turn.
        keywordTokenizer.setReader(new StringReader(entry.getKey()));
        TokenStream stemmed = stemmerFactory.create(keywordTokenizer);
        CharTermAttribute term = stemmed.addAttribute(CharTermAttribute.class);

        stemmed.reset();
        boolean hasToken = stemmed.incrementToken();
        assertThat(hasToken, equalTo(true));
        assertThat(term.toString(), equalTo(entry.getValue()));
        stemmed.close();
    }
}

From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java

License:Apache License

/**
 * Analyzes every phrase returned by {@code buildPhraseList()} with a custom analyzer
 * (standard tokenizer followed by the "br_rslp" filter) and checks that the leading terms
 * match the expected words mapped to that phrase, in order.
 */
@Test
public void testRSLPPhrases() throws Exception {
    Index testIndex = new Index("test");
    Settings indexSettings = Settings.settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir())
            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.myAnalyzer.filter", "br_rslp")
            .build();
    AnalysisService service = createAnalysisService(testIndex, indexSettings);
    Analyzer stemmingAnalyzer = service.analyzer("myAnalyzer");
    Map<String, List<String>> expectedByPhrase = buildPhraseList();

    for (Map.Entry<String, List<String>> entry : expectedByPhrase.entrySet()) {
        List<String> expectedTerms = entry.getValue();
        TokenStream stream = stemmingAnalyzer.tokenStream("test", entry.getKey());
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        for (String expectedTerm : expectedTerms) {
            boolean hasToken = stream.incrementToken();
            assertThat(hasToken, equalTo(true));
            assertThat(term.toString(), equalTo(expectedTerm));
        }
        stream.close();
    }
}

From source file:org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java

License:Apache License

/**
 * Asserts that each stream yields exactly one token and that the two terms compare (as
 * strings) with the same sign as {@code comparison}.
 *
 * @param stream1    first stream; reset, consumed, ended and closed by this method
 * @param stream2    second stream; reset, consumed, ended and closed by this method
 * @param comparison expected ordering of the first term relative to the second; only its
 *                   sign is used
 * @throws IOException if either stream fails
 */
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute firstTerm = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute secondTerm = stream2.addAttribute(CharTermAttribute.class);

    stream1.reset();
    stream2.reset();

    // Each stream must produce its single token...
    assertThat(stream1.incrementToken(), equalTo(true));
    assertThat(stream2.incrementToken(), equalTo(true));

    int actualSign = Integer.signum(firstTerm.toString().compareTo(secondTerm.toString()));
    int expectedSign = Integer.signum(comparison);
    assertThat(actualSign, equalTo(expectedSign));

    // ...and nothing after it.
    assertThat(stream1.incrementToken(), equalTo(false));
    assertThat(stream2.incrementToken(), equalTo(false));

    stream1.end();
    stream2.end();

    stream1.close();
    stream2.close();
}

From source file:org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java

License:Apache License

/**
 * Runs {@code source} as a single keyword token through the "polish_stem" token filter and
 * asserts that the first emitted term equals {@code expected}.
 *
 * @param source   the text to stem
 * @param expected the term the filter is expected to emit first
 * @throws IOException if the token stream fails
 */
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));

        assertThat(term1.toString(), equalTo(expected));
    } finally {
        // The stream was previously never closed; release it even when an assertion fails.
        ts.close();
    }
}

From source file:org.elasticsearch.index.analysis.SimplePolishTokenFilterTests.java

License:Apache License

/**
 * Analyzes {@code source} with the "polish" analyzer and asserts that the leading terms it
 * emits equal {@code expected_terms}, in order. Terms after the expected ones are not checked.
 *
 * @param source         the text to analyze
 * @param expected_terms the terms expected at the start of the token stream
 * @throws IOException if the token stream fails
 */
private void testAnalyzer(String source, String... expected_terms) throws IOException {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    Analyzer analyzer = analysisService.analyzer("polish").analyzer();

    TokenStream ts = analyzer.tokenStream("test", source);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();

        for (String expected : expected_terms) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
    } finally {
        // The stream was previously never closed; release it even when an assertion fails.
        ts.close();
    }
}

From source file:org.elasticsearch.index.analysis.SimpleUkrainianAnalyzerTests.java

License:Apache License

/**
 * Analyzes {@code source} with the "ukrainian" analyzer and asserts that it emits exactly
 * {@code expected_terms}, in order, and nothing more.
 *
 * @param source         the text to analyze
 * @param expected_terms the complete, ordered list of expected terms
 * @throws IOException if the token stream fails
 */
private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new AnalysisUkrainianPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : expected_terms) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        assertThat(ts.incrementToken(), equalTo(false));
        // The stream is exhausted, so complete the consumer workflow.
        ts.end();
    } finally {
        // The stream was previously never closed; release it even when an assertion fails.
        ts.close();
    }
}

From source file:org.elasticsearch.index.analysis.SimpleVietnameseAnalyzerTests.java

License:Apache License

/**
 * Analyzes {@code source} with the "vietnamese" analyzer and asserts that it emits exactly
 * {@code expected_terms}, in order, and nothing more.
 *
 * @param source         the text to analyze
 * @param expected_terms the complete, ordered list of expected terms
 * @throws IOException if the token stream fails
 */
private static void testAnalyzer(String source, String... expected_terms) throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new AnalysisVietnamesePlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("vietnamese").analyzer();
    TokenStream ts = analyzer.tokenStream("test", source);
    try {
        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : expected_terms) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        assertThat(ts.incrementToken(), equalTo(false));
        // The stream is exhausted, so complete the consumer workflow.
        ts.end();
    } finally {
        // The stream was previously never closed; release it even when an assertion fails.
        ts.close();
    }
}