Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

List of usage examples for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer.

Prototype

public StandardTokenizer() 

Source Link

Document

Creates a new instance of the org.apache.lucene.analysis.standard.StandardTokenizer class.

Usage

From source file:org.elasticsearch.index.analysis.CustomWordBoundaryStandardTokenizerFactory.java

License:Apache License

/**
 * Builds the tokenizer for this factory.
 *
 * @return a custom word-boundary tokenizer when the index was created on
 *         Lucene 5.5.0 or later, otherwise the plain {@link StandardTokenizer}
 */
@Override
public Tokenizer create() {
    // Custom word-boundary handling is only available from Lucene 5.5.0 onward.
    return version.onOrAfter(Version.LUCENE_5_5_0)
            ? new CustomWordBoundaryStandardTokenizer(characterMappings)
            : new StandardTokenizer();
}

From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java

License:Apache License

/**
 * Verifies that the lenient Solr synonym parser applies synonym rules while
 * tolerating entries (like "&") that the analyzer cannot tokenize.
 */
public void testLenientParser() throws IOException, ParseException {
    ESSolrSynonymParser synonymParser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer());
    String ruleText = "&,and\n" + "come,advance,approach\n";
    synonymParser.parse(new StringReader(ruleText));
    SynonymMap map = synonymParser.build();

    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader("approach quietly then advance & destroy"));
    TokenStream stream = new SynonymFilter(source, map, false);

    assertTokenStreamContents(stream, new String[] { "come", "quietly", "then", "come", "destroy" });
}

From source file:org.elasticsearch.index.analysis.ESSolrSynonymParserTests.java

License:Apache License

/**
 * Verifies that the lenient Solr synonym parser still produces usable
 * mappings when some rule terms (here "bar") are removed by the analyzer's
 * stop set.
 */
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopWords = new CharArraySet(1, true);
    stopWords.add("bar");

    ESSolrSynonymParser synonymParser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopWords));
    synonymParser.parse(new StringReader("foo,bar,baz"));
    SynonymMap map = synonymParser.build();

    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream stream = new SynonymFilter(new StopFilter(source, stopWords), map, false);

    assertTokenStreamContents(stream,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}

From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java

License:Apache License

/**
 * Verifies that the lenient WordNet synonym parser applies synonym rules
 * while tolerating entries (like '&') that the analyzer cannot tokenize.
 */
public void testLenientParser() throws IOException, ParseException {
    ESWordnetSynonymParser synonymParser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer());
    String ruleText = "s(100000001,1,'&',a,1,0).\n" + "s(100000001,2,'and',a,1,0).\n"
            + "s(100000002,1,'come',v,1,0).\n" + "s(100000002,2,'advance',v,1,0).\n"
            + "s(100000002,3,'approach',v,1,0).";
    synonymParser.parse(new StringReader(ruleText));
    SynonymMap map = synonymParser.build();

    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader("approach quietly then advance & destroy"));
    TokenStream stream = new SynonymFilter(source, map, false);

    assertTokenStreamContents(stream, new String[] { "come", "quietly", "then", "come", "destroy" });
}

From source file:org.elasticsearch.index.analysis.ESWordnetSynonymParserTests.java

License:Apache License

/**
 * Verifies that the lenient WordNet synonym parser still produces usable
 * mappings when some rule terms (here "bar") are removed by the analyzer's
 * stop set.
 */
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
    CharArraySet stopWords = new CharArraySet(1, true);
    stopWords.add("bar");

    ESWordnetSynonymParser synonymParser = new ESWordnetSynonymParser(true, false, true,
            new StandardAnalyzer(stopWords));
    String ruleText = "s(100000001,1,'foo',v,1,0).\n" + "s(100000001,2,'bar',v,1,0).\n"
            + "s(100000001,3,'baz',v,1,0).";
    synonymParser.parse(new StringReader(ruleText));
    SynonymMap map = synonymParser.build();

    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader("first word is foo, then bar and lastly baz"));
    TokenStream stream = new SynonymFilter(new StopFilter(source, stopWords), map, false);

    assertTokenStreamContents(stream,
            new String[] { "first", "word", "is", "foo", "then", "and", "lastly", "foo" });
}

From source file:org.elasticsearch.index.analysis.KeepTypesFilterFactoryTests.java

License:Apache License

/**
 * Verifies that a keep_types filter configured for numeric tokens retains
 * only the {@code <NUM>} token ("123") from a mixed word/number input.
 */
@Test
public void testKeepTypes() throws IOException {
    Settings indexSettings = Settings.settingsBuilder().put("path.home", createTempDir().toString())
            .put("index.analysis.filter.keep_numbers.type", "keep_types")
            .putArray("index.analysis.filter.keep_numbers.types", new String[] { "<NUM>", "<SOMETHINGELSE>" })
            .build();
    AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(indexSettings);
    TokenFilterFactory filterFactory = analysisService.tokenFilter("keep_numbers");
    assertThat(filterFactory, instanceOf(KeepTypesFilterFactory.class));

    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader("Hello 123 world"));
    // Only the number survives; its position increment of 2 accounts for the dropped "Hello".
    assertTokenStreamContents(filterFactory.create(source), new String[] { "123" }, new int[] { 2 });
}

From source file:org.elasticsearch.index.analysis.SnowballAnalyzer.java

License:Apache License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link LowerCaseFilter}
 * (or {@link TurkishLowerCaseFilter}), an optional {@link StopFilter}, and a
 * {@link SnowballFilter} for the configured stemmer {@code name}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    // Pick the tokenizer implementation that matches the Lucene version in use.
    final Tokenizer source = getVersion().onOrAfter(Version.LUCENE_4_7_0)
            ? new StandardTokenizer()
            : new StandardTokenizer40();

    TokenStream stream = source;

    // English-family stemmers expect the possessive 's to be stripped first.
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) {
        stream = new EnglishPossessiveFilter(stream);
    }

    // Turkish needs dotted/dotless-i aware lowercasing; all other languages
    // use the standard lowercase filter.
    if (name.equals("Turkish")) {
        stream = new TurkishLowerCaseFilter(stream);
    } else {
        stream = new LowerCaseFilter(stream);
    }

    if (stopSet != null) {
        stream = new StopFilter(stream, stopSet);
    }

    stream = new SnowballFilter(stream, name);
    return new TokenStreamComponents(source, stream);
}

From source file:org.elasticsearch.index.query.MockRepeatAnalyzer.java

License:Apache License

/**
 * Builds the analysis chain: a {@link StandardTokenizer} wrapped by a
 * {@link MockRepeatFilter} that duplicates each token.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    return new TokenStreamComponents(source, new MockRepeatFilter(source));
}

From source file:org.lambda3.indra.core.IndraAnalyzer.java

License:Open Source License

/**
 * Builds an analyzer for the given language using the supplied model metadata.
 *
 * @param lang     language code for the analysis chain; must not be null
 * @param metadata model preprocessing configuration; must not be null
 * @throws IllegalArgumentException if either argument is null
 */
public IndraAnalyzer(String lang, ModelMetadata metadata) {
    if (lang == null || metadata == null) {
        throw new IllegalArgumentException("all parameters are mandatory.");
    }

    logger.debug("Creating analyzer, lang={}, preprocessing={}", lang, metadata);
    this.tokenizer = new StandardTokenizer();
    this.tokenStream = createStream(lang, metadata, this.tokenizer);
}

From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java

License:Open Source License

/**
 * Creates an iterator that tokenizes {@code text} and runs it through the
 * pre-processing stream configured from {@code metadata}.
 *
 * @param metadata corpus configuration driving the stream and transformers; must not be null
 * @param text     the raw input to tokenize; must not be null
 * @throws NullPointerException if {@code metadata} or {@code text} is null
 */
StandardPreProcessorIterator(CorpusMetadata metadata, String text) {
    // Fixed: removed a stray empty statement (lone ';') left after the null check.
    this.metadata = Objects.requireNonNull(metadata);
    this.tokenizer = new StandardTokenizer();
    this.tokenStream = createStream(metadata, tokenizer);
    setTransformers();
    initialize(Objects.requireNonNull(text));
}