Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.reset().

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using incrementToken().
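
The documented contract is: add the attributes you need, call reset(), iterate with incrementToken() until it returns false, then call end() and close(). Below is a minimal, self-contained sketch of that lifecycle; the field name, sample text, and the use of StandardAnalyzer (which has a no-argument constructor in recent Lucene versions) are illustrative assumptions, not part of the examples that follow.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources guarantees close() even if consumption throws
        try (TokenStream stream = analyzer.tokenStream("body", "a quick token stream demo")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // must be called before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end(); // records the end-of-stream attribute state
        }
    }
}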

Usage

From source file: org.elasticsearch.analysis.common.CompoundAnalysisTests.java

License: Apache License

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = createAnalysisModule(settings);
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", text);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}

From source file: org.elasticsearch.analysis.common.SynonymsAnalysisTests.java

License: Apache License

private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", source);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }

    MatcherAssert.assertThat(sb.toString().trim(), equalTo(target));
}

From source file: org.elasticsearch.analysis.common.UniqueTokenFilterTests.java

License: Apache License

public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
}

From source file: org.elasticsearch.common.lucene.all.SimpleAllTests.java

License: Apache License

@Test
public void testBoostOnEagerTokenizer() throws Exception {
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", "all", 2.0f);
    allEntries.addText("field2", "your", 1.0f);
    allEntries.addText("field1", "boosts", 0.5f);
    allEntries.reset();
    // unlike the standard tokenizer, the whitespace analyzer's tokenizer reads characters eagerly
    final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries,
            new WhitespaceAnalyzer(Lucene.VERSION));
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    for (int i = 0; i < 3; ++i) {
        assertTrue(ts.incrementToken());
        final String term;
        final float boost;
        switch (i) {
        case 0:
            term = "all";
            boost = 2;
            break;
        case 1:
            term = "your";
            boost = 1;
            break;
        case 2:
            term = "boosts";
            boost = 0.5f;
            break;
        default:
            throw new AssertionError();
        }
        assertEquals(term, termAtt.toString());
        final BytesRef payload = payloadAtt.getPayload();
        if (payload == null || payload.length == 0) {
            assertEquals(boost, 1f, 0.001f);
        } else {
            assertEquals(4, payload.length);
            final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            assertEquals(boost, b, 0.001f);
        }
    }
    assertFalse(ts.incrementToken());
}

From source file: org.elasticsearch.docvalues.string.DVStringFieldMapper.java

License: Apache License

@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
    // luckily this is single-threaded access, so we don't need a thread local
    hasDocValsNow = false;
    super.parseCreateField(context, fields);
    hasDocValsNow = true;
    String value = null;
    if (context.externalValueSet()) {
        value = (String) context.externalValue();
    } else {
        for (Field f : fields) {
            Class<?> fClass = f.getClass();
            if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) {
                value = f.stringValue();
                break;
            }
        }
    }
    if (value != null) {
        TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value));
        CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String token = cattr.toString();
            // take the first token and make it a doc value
            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token)));
            break;
        }
        stream.end();
        stream.close();
    }
}
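
Note that this example completes the full lifecycle that most of the test snippets above omit: after the consume loop it calls end(), which records the end-of-stream attribute state, and then close(), which releases the stream so the analyzer can hand out a fresh one for the next value.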

From source file: org.elasticsearch.index.analysis.AnalysisRegistryTests.java

License: Apache License

public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }

    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}
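
The two assertions differ because of the word_delimiter configuration: custom_analyzer uses the wordDelimiter filter defined with split_on_numerics set to false, so "J2SE j2ee" lowercases to the two tokens j2se and j2ee, while custom_analyzer_1 uses the default word_delimiter filter, which splits each term into its letter and digit runs.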

From source file: org.elasticsearch.index.analysis.CompoundAnalysisTests.java

License: Apache License

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
            new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, settings), new IndexNameModule(index),
                    new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<String>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}

From source file: org.elasticsearch.index.analysis.CustomWBAnalysisTests.java

License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertThat(termAttr, notNullValue());
    int i = 0;
    while (stream.incrementToken()) {
        assertThat(expected.length, greaterThan(i));
        assertThat("expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
    }
    assertThat("not all tokens produced", i, equalTo(expected.length));
}

From source file: org.elasticsearch.index.analysis.morphology.SimpleMorphologyAnalysisTests.java

License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue("got extra term: " + termAttr.toString(), i < expected.length);
        Assert.assertEquals("expected different term at index " + i, termAttr.toString(), expected[i]);
        i++;
    }
    Assert.assertEquals("not all tokens produced", i, expected.length);
}

From source file: org.elasticsearch.index.analysis.NumericAnalyzerTests.java

License: Apache License

@Test
public void testAttributeEqual() throws IOException {
    final int precisionStep = 8;
    final double value = randomDouble();
    NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);

    final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value));
    final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
    ts2.setDoubleValue(value);
    final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
    final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
    final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
    final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);
    ts1.reset();
    ts2.reset();
    while (ts1.incrementToken()) {
        assertThat(ts2.incrementToken(), is(true));
        assertThat(posInc1, equalTo(posInc2));
        // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
        assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
        assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
    }
    assertThat(ts2.incrementToken(), is(false));
    ts1.end();
    ts2.end();
}