List of usage examples for the method org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
From source file:org.elasticsearch.analysis.common.SynonymsAnalysisTests.java
License:Apache License
/**
 * Analyzes {@code source} with the named analyzer and asserts that the
 * space-joined sequence of produced terms equals {@code target}.
 *
 * @param analyzerName key into {@code indexAnalyzers}
 * @param source       text to analyze
 * @param target       expected terms joined by single spaces
 * @throws IOException if token-stream consumption fails
 */
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
    // try-with-resources releases the stream even when an assertion or I/O
    // error interrupts consumption (the original never closed it).
    try (TokenStream stream = analyzer.tokenStream("", source)) {
        // Per the TokenStream consumer contract, attributes are added
        // before reset().
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        StringBuilder sb = new StringBuilder();
        while (stream.incrementToken()) {
            sb.append(termAtt.toString()).append(" ");
        }
        stream.end(); // required final step of the consumer workflow
        MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
    }
}
From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java
License:Apache License
/**
 * Verifies that UniqueTokenFilter removes duplicate terms:
 * "this test with test" must produce exactly this, test, with.
 */
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };
    // try-with-resources closes the stream even on assertion failure
    // (the original leaked it and never called end()).
    try (TokenStream test = analyzer.tokenStream("test", "this test with test")) {
        CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
        test.reset();
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("this"));
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("test"));
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("with"));
        // the duplicate "test" is filtered out, so the stream is exhausted
        assertThat(test.incrementToken(), equalTo(false));
        test.end();
    }
}
From source file:org.elasticsearch.common.lucene.all.SimpleAllTests.java
License:Apache License
@Test public void testBoostOnEagerTokenizer() throws Exception { AllEntries allEntries = new AllEntries(); allEntries.addText("field1", "all", 2.0f); allEntries.addText("field2", "your", 1.0f); allEntries.addText("field1", "boosts", 0.5f); allEntries.reset();//from w w w . j a v a2s . c o m // whitespace analyzer's tokenizer reads characters eagerly on the contrary to the standard tokenizer final TokenStream ts = AllTokenStream.allTokenStream("any", allEntries, new WhitespaceAnalyzer(Lucene.VERSION)); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); ts.reset(); for (int i = 0; i < 3; ++i) { assertTrue(ts.incrementToken()); final String term; final float boost; switch (i) { case 0: term = "all"; boost = 2; break; case 1: term = "your"; boost = 1; break; case 2: term = "boosts"; boost = 0.5f; break; default: throw new AssertionError(); } assertEquals(term, termAtt.toString()); final BytesRef payload = payloadAtt.getPayload(); if (payload == null || payload.length == 0) { assertEquals(boost, 1f, 0.001f); } else { assertEquals(4, payload.length); final float b = PayloadHelper.decodeFloat(payload.bytes, payload.offset); assertEquals(boost, b, 0.001f); } } assertFalse(ts.incrementToken()); }
From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java
License:Apache License
@Override protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException { // luckily this is single thread access and we dont need a thread local. hasDocValsNow = false;// w w w . j a v a 2 s . c om super.parseCreateField(context, fields); hasDocValsNow = true; String value = null; if (context.externalValueSet()) { value = (String) context.externalValue(); } else { for (Field f : fields) { Class<?> fClass = f.getClass(); if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) { value = f.stringValue(); break; } } } if (value != null) { TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String token = cattr.toString(); // take the first token and make it a doc value fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token))); break; } stream.end(); stream.close(); } }
From source file:org.elasticsearch.index.analysis.AnalysisRegistryTests.java
License:Apache License
/**
 * Checks that a custom "word_delimiter" filter configured with
 * split_on_numerics=false keeps tokens like "j2se" intact, while the default
 * "word_delimiter" filter splits them into letter/digit runs.
 */
public void testConfigureCamelCaseTokenFilter() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter")
            .build();
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
    IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList())
            .getAnalysisRegistry().build(idxSettings);
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
        assertNotNull(custom_analyser);
        // split_on_numerics=false: camel-case/number tokens survive intact
        List<String> token = analyzeToTerms(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 2, token.size());
        assertEquals("j2se", token.get(0));
        assertEquals("j2ee", token.get(1));
    }
    try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
        assertNotNull(custom_analyser);
        // default word_delimiter: tokens split on letter/digit boundaries
        List<String> token = analyzeToTerms(custom_analyser, "J2SE j2ee");
        assertEquals(token.toString(), 6, token.size());
        assertEquals("j", token.get(0));
        assertEquals("2", token.get(1));
        assertEquals("se", token.get(2));
        assertEquals("j", token.get(3));
        assertEquals("2", token.get(4));
        assertEquals("ee", token.get(5));
    }
}

/**
 * Runs {@code text} through {@code analyzer} and returns all produced terms
 * in order. Closes the token stream (the original test leaked it and never
 * called end()).
 */
private static List<String> analyzeToTerms(Analyzer analyzer, String text) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("foo", text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        List<String> terms = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        return terms;
    }
}
From source file:org.elasticsearch.index.analysis.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException { Index index = new Index("test"); Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); Injector injector = new ModulesBuilder() .add(new IndexSettingsModule(index, settings), new IndexNameModule(index), new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))) .createChildInjector(parentInjector); AnalysisService analysisService = injector.getInstance(AnalysisService.class); Analyzer analyzer = analysisService.analyzer(analyzerName).analyzer(); AllEntries allEntries = new AllEntries(); allEntries.addText("field1", text, 1.0f); allEntries.reset();// www .j av a 2 s . c o m TokenStream stream = AllTokenStream.allTokenStream("_all", allEntries, analyzer); stream.reset(); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); List<String> terms = new ArrayList<String>(); while (stream.incrementToken()) { String tokText = termAtt.toString(); terms.add(tokText); } return terms; }
From source file:org.elasticsearch.index.analysis.CustomWBAnalysisTests.java
License:Apache License
/**
 * Drains {@code stream} and asserts that its terms match {@code expected},
 * in order and in full. The caller owns the stream's lifecycle.
 */
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    assertThat(term, notNullValue());
    int index = 0;
    while (stream.incrementToken()) {
        // guard against the stream producing more tokens than expected
        assertThat(expected.length, greaterThan(index));
        assertThat("expected different term at index " + index, expected[index], equalTo(term.toString()));
        index++;
    }
    // and against it producing fewer
    assertThat("not all tokens produced", index, equalTo(expected.length));
}
From source file:org.elasticsearch.index.analysis.morphology.SimpleMorphologyAnalysisTests.java
License:Apache License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { stream.reset();// w ww .j a va 2s . c o m CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); Assert.assertNotNull(termAttr); int i = 0; while (stream.incrementToken()) { Assert.assertTrue("got extra term: " + termAttr.toString(), i < expected.length); Assert.assertEquals("expected different term at index " + i, termAttr.toString(), expected[i]); i++; } Assert.assertEquals("not all tokens produced", i, expected.length); }
From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java
License:Apache License
@Test public void testAttributeEqual() throws IOException { final int precisionStep = 8; final double value = randomDouble(); NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep); final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value)); final NumericTokenStream ts2 = new NumericTokenStream(precisionStep); ts2.setDoubleValue(value);/*from w w w . j a va 2 s.co m*/ final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class); final NumericTermAttribute numTerm2 = ts1.addAttribute(NumericTermAttribute.class); final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class); final PositionIncrementAttribute posInc2 = ts1.addAttribute(PositionIncrementAttribute.class); ts1.reset(); ts2.reset(); while (ts1.incrementToken()) { assertThat(ts2.incrementToken(), is(true)); assertThat(posInc1, equalTo(posInc2)); // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070) assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue())); assertThat(numTerm2.getShift(), equalTo(numTerm2.getShift())); } assertThat(ts2.incrementToken(), is(false)); ts1.end(); ts2.end(); }
From source file:org.elasticsearch.index.analysis.PaodingAnalysisTests.java
License:Apache License
public List getname(String param) throws IOException { System.setProperty("paoding.dic.home.config-first", "D:/Projects/Java Related/ElasticSearch/plugins/elasticsearch-analysis-paoding/config/paoding/dic"); //?(??)//from w w w. j a va2s . c o m Analyzer ika = new PaodingAnalyzer(); List<String> keys = new ArrayList<String>(); TokenStream ts = null; try { Reader r = new StringReader(param); ts = ika.tokenStream("TestField", r); CharTermAttribute termAtt = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); String key = null; while (ts.incrementToken()) { if ("word".equals(typeAtt.type())) { key = termAtt.toString(); if (key.length() >= 2) { keys.add(key); } } } } catch (IOException e) { e.printStackTrace(); } finally { if (ts != null) { ts.close(); } } Map<String, Integer> keyMap = new HashMap<String, Integer>(); Integer $ = null; //?? for (String key : keys) { keyMap.put(key, ($ = keyMap.get(key)) == null ? 1 : $ + 1); } List<Map.Entry<String, Integer>> keyList = new ArrayList<Map.Entry<String, Integer>>(keyMap.entrySet()); //? Collections.sort(keyList, new Comparator<Map.Entry<String, Integer>>() { public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) { return (o2.getValue() - o1.getValue()); } }); //?? String id = null; String str = ""; List list = new ArrayList(); if (keyList.size() > 0) { for (int i = 0; i < keyList.size(); i++) { id = keyList.get(i).toString(); String[] strs = id.split("\\="); str = strs[0]; list.add(strs[0]); System.out.println("id:" + id); } } return list; }