Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. If an instance of that attribute class is already present in the AttributeSource, it is returned; otherwise a new instance is created, added, and returned.
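
Below is a minimal, self-contained sketch of the call pattern the examples on this page share (it is not taken from any of them): obtain the attribute handle with addAttribute, reset the stream, read the attribute after each incrementToken, then end and close. The field name "field", the sample text, and the class name AddAttributeSketch are placeholders; the no-argument WhitespaceAnalyzer constructor assumes a recent Lucene version, while older versions (as in several Elasticsearch examples below) take a Version argument.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new WhitespaceAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", "Hello addAttribute example")) {
            // addAttribute either registers a new attribute instance on the stream
            // or returns the instance already registered for this class.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()); // current term text
            }
            ts.end();
        } // try-with-resources closes the stream and the analyzer
    }
}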

Usage

From source file:org.elasticsearch.common.lucene.all.SimpleAllTests.java

License:Apache License

@Test
public void testBoostOnEagerTokenizer() throws Exception {
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", "all", 2.0f);
    allEntries.addText("field2", "your", 1.0f);
    allEntries.addText("field1", "boosts", 0.5f);
    allEntries.reset();
    // the whitespace analyzer's tokenizer reads characters eagerly, in contrast to the standard tokenizer
    final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries,
            new WhitespaceAnalyzer(Lucene.VERSION));
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    for (int i = 0; i < 3; ++i) {
        assertTrue(ts.incrementToken());
        final String term;
        final float boost;
        switch (i) {
        case 0:
            term = "all";
            boost = 2;
            break;
        case 1:
            term = "your";
            boost = 1;
            break;
        case 2:
            term = "boosts";
            boost = 0.5f;
            break;
        default:
            throw new AssertionError();
        }
        assertEquals(term, termAtt.toString());
        final BytesRef payload = payloadAtt.getPayload();
        if (payload == null || payload.length == 0) {
            assertEquals(boost, 1f, 0.001f);
        } else {
            assertEquals(4, payload.length);
            final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            assertEquals(boost, b, 0.001f);
        }
    }
    assertFalse(ts.incrementToken());
}

From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java

License:Apache License

@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
    // luckily this is single-threaded access, so we don't need a thread local.
    hasDocValsNow = false;
    super.parseCreateField(context, fields);
    hasDocValsNow = true;
    String value = null;
    if (context.externalValueSet()) {
        value = (String) context.externalValue();
    } else {
        for (Field f : fields) {
            Class<?> fClass = f.getClass();
            if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) {
                value = f.stringValue();
                break;
            }
        }
    }
    if (value != null) {
        TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value));
        CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String token = cattr.toString();
            // take the first token and make it a doc value
            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token)));
            break;
        }
        stream.end();
        stream.close();
    }
}

From source file:org.elasticsearch.index.analysis.Analysis.java

License:Apache License

/**
 * Check whether the provided token stream is able to provide character
 * terms.
 * <p>Although most analyzers generate character terms (CharTermAttribute),
 * some token streams only contain binary terms (BinaryTermAttribute,
 * CharTermAttribute being a special type of BinaryTermAttribute), such as
 * {@link NumericTokenStream}, and are unsuitable for highlighting and
 * more-like-this queries which expect character terms.</p>
 */
public static boolean isCharacterTokenStream(TokenStream tokenStream) {
    try {
        tokenStream.addAttribute(CharTermAttribute.class);
        return true;
    } catch (IllegalArgumentException e) {
        return false;
    }
}

From source file:org.elasticsearch.index.analysis.AnalysisRegistryTests.java

License:Apache License

public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();

    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }

    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> token = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            token.add(charTermAttribute.toString());
        }
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}

From source file:org.elasticsearch.index.analysis.CompoundAnalysisTests.java

License:Apache License

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
            new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, settings), new IndexNameModule(index),
                    new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)))
            .createChildInjector(parentInjector);

    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    allEntries.reset();

    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<String>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}

From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java

License:Apache License

@Test
public void testAttributeEqual() throws IOException {
    final int precisionStep = 8;
    final double value = randomDouble();
    NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);

    final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value));
    final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
    ts2.setDoubleValue(value);
    final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
    final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
    final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
    final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);
    ts1.reset();
    ts2.reset();
    while (ts1.incrementToken()) {
        assertThat(ts2.incrementToken(), is(true));
        assertThat(posInc1, equalTo(posInc2));
        // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
        assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
        assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
    }
    assertThat(ts2.incrementToken(), is(false));
    ts1.end();
    ts2.end();
}

From source file:org.elasticsearch.index.analysis.PatternTokenizerTests.java

License:Apache License

/** 
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce that the state is not preserved, and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
        if (out.length() > 0)
            out.append(' ');
        out.append(termAtt.toString());
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
}

From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java

License:Apache License

@Test
public void testThatDefaultAndStandardAnalyzerChangedIn10Beta1() throws IOException {
    Analyzer currentStandardAnalyzer = PreBuiltAnalyzers.STANDARD.getAnalyzer(Version.V_1_0_0_Beta1);
    Analyzer currentDefaultAnalyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1);

    // special case, these two are the same instance
    assertThat(currentDefaultAnalyzer, is(currentStandardAnalyzer));
    PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1);
    final int n = atLeast(10);
    Version version = Version.CURRENT;
    for (int i = 0; i < n; i++) {
        if (version.equals(Version.V_1_0_0_Beta1)) {
            assertThat(currentDefaultAnalyzer, is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version)));
        } else {
            assertThat(currentDefaultAnalyzer, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
        }
        Analyzer analyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(version);
        TokenStream ts = analyzer.tokenStream("foo", "This is it Dude");
        ts.reset();
        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        List<String> list = new ArrayList<String>();
        while (ts.incrementToken()) {
            list.add(charTermAttribute.toString());
        }
        if (version.onOrAfter(Version.V_1_0_0_Beta1)) {
            assertThat(list.size(), is(4));
            assertThat(list, contains("this", "is", "it", "dude"));

        } else {
            assertThat(list.size(), is(1));
            assertThat(list, contains("dude"));
        }
        ts.close();
        version = randomVersion();
    }
}

From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java

License:Apache License

@Test
public void testAnalyzerChangedIn10RC1() throws IOException {
    Analyzer pattern = PreBuiltAnalyzers.PATTERN.getAnalyzer(Version.V_1_0_0_RC1);
    Analyzer standardHtml = PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(Version.V_1_0_0_RC1);
    final int n = atLeast(10);
    Version version = Version.CURRENT;
    for (int i = 0; i < n; i++) {
        if (version.equals(Version.V_1_0_0_RC1)) {
            assertThat(pattern, is(PreBuiltAnalyzers.PATTERN.getAnalyzer(version)));
            assertThat(standardHtml, is(PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version)));
        } else {
            assertThat(pattern, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
            assertThat(standardHtml, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
        }
        Analyzer analyzer = randomBoolean() ? PreBuiltAnalyzers.PATTERN.getAnalyzer(version)
                : PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version);
        TokenStream ts = analyzer.tokenStream("foo", "This is it Dude");
        ts.reset();
        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        List<String> list = new ArrayList<String>();
        while (ts.incrementToken()) {
            list.add(charTermAttribute.toString());
        }
        if (version.onOrAfter(Version.V_1_0_0_RC1)) {
            assertThat(list.toString(), list.size(), is(4));
            assertThat(list, contains("this", "is", "it", "dude"));

        } else {
            assertThat(list.size(), is(1));
            assertThat(list, contains("dude"));
        }
        ts.close();
        version = randomVersion();
    }
}

From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java

License:Apache License

@Test
public void testRSLPRules() throws Exception {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).put("index.analysis.filter.myStemmer.type", "br_rslp").build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();

    Map<String, String> words = buildWordList();

    Set<String> inputWords = words.keySet();
    for (String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}