List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:org.elasticsearch.common.lucene.all.SimpleAllTests.java
License:Apache License
@Test public void testBoostOnEagerTokenizer() throws Exception { AllEntries allEntries = new AllEntries(); allEntries.addText("field1", "all", 2.0f); allEntries.addText("field2", "your", 1.0f); allEntries.addText("field1", "boosts", 0.5f); allEntries.reset();/* ww w. j ava2 s.co m*/ // whitespace analyzer's tokenizer reads characters eagerly on the contrary to the standard tokenizer final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries, new WhitespaceAnalyzer(Lucene.VERSION)); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); ts.reset(); for (int i = 0; i < 3; ++i) { assertTrue(ts.incrementToken()); final String term; final float boost; switch (i) { case 0: term = "all"; boost = 2; break; case 1: term = "your"; boost = 1; break; case 2: term = "boosts"; boost = 0.5f; break; default: throw new AssertionError(); } assertEquals(term, termAtt.toString()); final BytesRef payload = payloadAtt.getPayload(); if (payload == null || payload.length == 0) { assertEquals(boost, 1f, 0.001f); } else { assertEquals(4, payload.length); final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset); assertEquals(boost, b, 0.001f); } } assertFalse(ts.incrementToken()); }
From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java
License:Apache License
@Override protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException { // luckily this is single thread access and we dont need a thread local. hasDocValsNow = false;// w w w .j a va2 s . c o m super.parseCreateField(context, fields); hasDocValsNow = true; String value = null; if (context.externalValueSet()) { value = (String) context.externalValue(); } else { for (Field f : fields) { Class<?> fClass = f.getClass(); if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) { value = f.stringValue(); break; } } } if (value != null) { TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String token = cattr.toString(); // take the first token and make it a doc value fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token))); break; } stream.end(); stream.close(); } }
From source file:org.elasticsearch.index.analysis.Analysis.java
License:Apache License
/**
 * Check whether the provided token stream is able to provide character terms.
 * <p>Although most analyzers generate character terms (CharTermAttribute),
 * some token streams only contain binary terms (BinaryTermAttribute,
 * CharTermAttribute being a special type of BinaryTermAttribute), such as
 * {@link NumericTokenStream}, and are unsuitable for highlighting and
 * more-like-this queries which expect character terms.</p>
 */
public static boolean isCharacterTokenStream(TokenStream tokenStream) {
    boolean providesCharacterTerms;
    try {
        // attribute factories for binary-only streams reject CharTermAttribute
        tokenStream.addAttribute(CharTermAttribute.class);
        providesCharacterTerms = true;
    } catch (IllegalArgumentException e) {
        providesCharacterTerms = false;
    }
    return providesCharacterTerms;
}
From source file:org.elasticsearch.index.analysis.AnalysisRegistryTests.java
License:Apache License
/**
 * Checks that a custom word_delimiter filter configured with
 * split_on_numerics=false keeps tokens like "j2se" whole, while the default
 * word_delimiter filter splits them into letter/number parts.
 *
 * Fix: the duplicated tokenization loop was extracted into a helper, and the
 * token streams are now closed (try-with-resources) with end() called.
 */
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        // split_on_numerics=false: camel-case/number tokens stay intact
        List<String> token = tokenize(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        // default word_delimiter splits letter/number boundaries
        List<String> token = tokenize(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}

/** Runs {@code text} through {@code analyzer} and collects the emitted terms in order. */
private static List<String> tokenize(NamedAnalyzer analyzer, String text) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("foo", text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        List<String> tokens = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        tokenStream.end();
        return tokens;
    }
}
From source file:org.elasticsearch.index.analysis.CompoundAnalysisTests.java
License:Apache License
/**
 * Builds an AnalysisService from the given settings, runs {@code text} through
 * the named analyzer via an "_all" token stream, and returns the emitted terms
 * in order.
 *
 * Fix: the token stream is now ended and closed (previously leaked).
 *
 * @param settings     index/analysis settings to build the injectors from
 * @param analyzerName name of the analyzer to look up in the AnalysisService
 * @param text         input text to analyze
 * @return the analyzed terms in emission order
 * @throws IOException if tokenization fails
 */
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
            new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, settings), new IndexNameModule(index),
                    new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);
    Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer();
    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);
    allEntries.reset();
    TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer);
    try {
        stream.reset();
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        List<String> terms = new ArrayList<String>();
        while (stream.incrementToken()) {
            terms.add(termAtt.toString());
        }
        stream.end();
        return terms;
    } finally {
        stream.close(); // fix: the stream was never closed before
    }
}
From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java
License:Apache License
@Test public void testAttributeEqual() throws IOException { final int precisionStep = 8; final double value = randomDouble(); NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep); final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value)); final NumericTokenStream ts2 = new NumericTokenStream(precisionStep); ts2.setDoubleValue(value);/*w ww .j a va2 s. co m*/ final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class); final NumericTermAttribute numTerm2 = ts1.addAttribute(NumericTermAttribute.class); final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class); final PositionIncrementAttribute posInc2 = ts1.addAttribute(PositionIncrementAttribute.class); ts1.reset(); ts2.reset(); while (ts1.incrementToken()) { assertThat(ts2.incrementToken(), is(true)); assertThat(posInc1, equalTo(posInc2)); // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070) assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue())); assertThat(numTerm2.getShift(), equalTo(numTerm2.getShift())); } assertThat(ts2.incrementToken(), is(false)); ts1.end(); ts2.end(); }
From source file:org.elasticsearch.index.analysis.PatternTokenizerTests.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *///from w w w. j av a 2 s . c o m private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); in.reset(); while (in.incrementToken()) { if (out.length() > 0) out.append(' '); out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); } in.close(); return out.toString(); }
From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java
License:Apache License
@Test public void testThatDefaultAndStandardAnalyzerChangedIn10Beta1() throws IOException { Analyzer currentStandardAnalyzer = PreBuiltAnalyzers.STANDARD.getAnalyzer(Version.V_1_0_0_Beta1); Analyzer currentDefaultAnalyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1); // special case, these two are the same instance assertThat(currentDefaultAnalyzer, is(currentStandardAnalyzer)); PreBuiltAnalyzers.DEFAULT.getAnalyzer(Version.V_1_0_0_Beta1); final int n = atLeast(10); Version version = Version.CURRENT;/*from w w w. ja v a 2 s.com*/ for (int i = 0; i < n; i++) { if (version.equals(Version.V_1_0_0_Beta1)) { assertThat(currentDefaultAnalyzer, is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))); } else { assertThat(currentDefaultAnalyzer, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version)))); } Analyzer analyzer = PreBuiltAnalyzers.DEFAULT.getAnalyzer(version); TokenStream ts = analyzer.tokenStream("foo", "This is it Dude"); ts.reset(); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); List<String> list = new ArrayList<String>(); while (ts.incrementToken()) { list.add(charTermAttribute.toString()); } if (version.onOrAfter(Version.V_1_0_0_Beta1)) { assertThat(list.size(), is(4)); assertThat(list, contains("this", "is", "it", "dude")); } else { assertThat(list.size(), is(1)); assertThat(list, contains("dude")); } ts.close(); version = randomVersion(); } }
From source file:org.elasticsearch.index.analysis.PreBuiltAnalyzerTests.java
License:Apache License
/**
 * Checks that the PATTERN and STANDARD_HTML_STRIP pre-built analyzers return
 * their cached 1.0.0.RC1 instances for that version, differ from DEFAULT for
 * other versions, and produce version-dependent token output.
 */
@Test
public void testAnalyzerChangedIn10RC1() throws IOException {
    Analyzer pattern = PreBuiltAnalyzers.PATTERN.getAnalyzer(Version.V_1_0_0_RC1);
    Analyzer standardHtml = PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(Version.V_1_0_0_RC1);
    final int iterations = atLeast(10);
    Version version = Version.CURRENT;
    for (int iteration = 0; iteration < iterations; iteration++) {
        if (version.equals(Version.V_1_0_0_RC1)) {
            assertThat(pattern, is(PreBuiltAnalyzers.PATTERN.getAnalyzer(version)));
            assertThat(standardHtml, is(PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version)));
        } else {
            assertThat(pattern, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
            assertThat(standardHtml, not(is(PreBuiltAnalyzers.DEFAULT.getAnalyzer(version))));
        }
        // either of the two analyzers under test, chosen at random
        Analyzer analyzer = randomBoolean() ? PreBuiltAnalyzers.PATTERN.getAnalyzer(version)
                : PreBuiltAnalyzers.STANDARD_HTML_STRIP.getAnalyzer(version);
        TokenStream tokenStream = analyzer.tokenStream("foo", "This is it Dude");
        tokenStream.reset();
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        List<String> terms = new ArrayList<String>();
        while (tokenStream.incrementToken()) {
            terms.add(termAttribute.toString());
        }
        if (version.onOrAfter(Version.V_1_0_0_RC1)) {
            assertThat(terms.toString(), terms.size(), is(4));
            assertThat(terms, contains("this", "is", "it", "dude"));
        } else {
            assertThat(terms.size(), is(1));
            assertThat(terms, contains("dude"));
        }
        tokenStream.close();
        version = randomVersion();
    }
}
From source file:org.elasticsearch.index.analysis.RSLPTokenFilterTests.java
License:Apache License
/**
 * Feeds every word of the test word list through a br_rslp stemmer token
 * filter built from index settings, asserting each word stems to the expected
 * form from the list.
 */
@Test
public void testRSLPRules() throws Exception {
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir())
            .put("index.analysis.filter.myStemmer.type", "br_rslp")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings);
    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");
    Tokenizer tokenizer = new KeywordTokenizer();
    Map<String, String> expectedStems = buildWordList();
    for (String input : expectedStems.keySet()) {
        // reuse the keyword tokenizer for each word; the filter wraps it anew
        tokenizer.setReader(new StringReader(input));
        TokenStream stream = filterFactory.create(tokenizer);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        assertThat(stream.incrementToken(), equalTo(true));
        assertThat(term.toString(), equalTo(expectedStems.get(input)));
        stream.close();
    }
}