Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

On this page you can find example usage for the org.apache.lucene.analysis.standard.StandardTokenizer constructor.

Prototype

public StandardTokenizer() 

Document

Creates a new instance of org.apache.lucene.analysis.standard.StandardTokenizer.
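
The no-argument constructor creates a tokenizer with no input attached: you hand it text via setReader(...) and then drive it through the usual TokenStream lifecycle (reset, incrementToken, end, close), as all of the examples below do. A minimal standalone sketch (the demo class name and input string are illustrative, not taken from any of the sources below):

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Lucene is a search library."));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                       // required before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // Lucene, is, a, search, library
        }
        tokenizer.end();                         // records the offset of the final token
        tokenizer.close();
    }
}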

Usage

From source file: org.elasticsearch.action.termvectors.AbstractTermVectorsTestCase.java

License: Apache License

protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
    Map<String, Analyzer> mapping = new HashMap<>();
    for (TestFieldSetting field : testDocs[0].fieldSettings) {
        if (field.storedPayloads) {
            mapping.put(field.name, new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new StandardTokenizer();
                    TokenFilter filter = new LowerCaseFilter(tokenizer);
                    filter = new TypeAsPayloadTokenFilter(filter);
                    return new TokenStreamComponents(tokenizer, filter);
                }

            });
        }
    }
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            mapping);

    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);

    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, conf);

    for (TestDoc doc : testDocs) {
        Document d = new Document();
        d.add(new Field("id", doc.id, StringField.TYPE_STORED));
        for (int i = 0; i < doc.fieldContent.length; i++) {
            FieldType type = new FieldType(TextField.TYPE_STORED);
            TestFieldSetting fieldSetting = doc.fieldSettings[i];

            type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
            type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
            type.setStoreTermVectorPositions(
                    fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
            type.setStoreTermVectors(true);
            type.freeze();
            d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
        }
        writer.updateDocument(new Term("id", doc.id), d);
        writer.commit();
    }
    writer.close();

    return DirectoryReader.open(dir);
}
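
Note that only the fields flagged with storedPayloads get the custom payload analyzer; PerFieldAnalyzerWrapper falls back to the StandardAnalyzer passed as its first argument for every field not present in the mapping.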

From source file: org.elasticsearch.analysis.common.CJKFilterFactoryTests.java

License: Apache License

public void testDefault() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験",
            "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

From source file: org.elasticsearch.analysis.common.CJKFilterFactoryTests.java

License: Apache License

public void testNoFlags() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験",
            "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

From source file: org.elasticsearch.analysis.common.CJKFilterFactoryTests.java

License: Apache License

public void testHanOnly() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち",
            "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

From source file: org.elasticsearch.analysis.common.CJKFilterFactoryTests.java

License: Apache License

public void testHanUnigramOnly() throws IOException {
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験",
            "験", "に", "落", "ち", "た" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

From source file: org.elasticsearch.analysis.common.CJKFilterFactoryTests.java

License: Apache License

public void testDisableGraph() throws IOException {
    TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
    TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");

    String source = "多くの学生が試験に落ちた。";
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    try (TokenStream tokenStream = allFlagsFactory.create(tokenizer)) {
        // This config outputs different size of ngrams so graph analysis is disabled
        assertTrue(tokenStream.hasAttribute(DisableGraphAttribute.class));
    }

    tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    try (TokenStream tokenStream = hanOnlyFactory.create(tokenizer)) {
        // This config uses only bigrams so graph analysis is enabled
        assertFalse(tokenStream.hasAttribute(DisableGraphAttribute.class));
    }
}

From source file: org.elasticsearch.analysis.common.KeepTypesFilterFactoryTests.java

License: Apache License

public void testKeepTypes() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.keep_numbers.type", "keep_types")
            .putList("index.analysis.filter.keep_numbers.types", new String[] { "<NUM>", "<SOMETHINGELSE>" })
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings,
            new CommonAnalysisPlugin());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
    assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
    String source = "Hello 123 world";
    String[] expected = new String[] { "123" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
}

From source file: org.elasticsearch.analysis.common.SnowballAnalyzer.java

License: Apache License

/** Constructs a {@link StandardTokenizer} filtered by a {@link
    StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
    and a {@link SnowballFilter} */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // remove the possessive 's for english stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins"))
        result = new EnglishPossessiveFilter(result);
    // Use a special lowercase filter for turkish, the stemmer expects it.
    if (name.equals("Turkish"))
        result = new TurkishLowerCaseFilter(result);
    else
        result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
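
As with any Analyzer, the components built here are consumed through Analyzer.tokenStream(...). A hedged sketch of driving this analyzer (the snowballAnalyzer variable and sample text are assumptions for illustration):

try (TokenStream ts = snowballAnalyzer.tokenStream("body", "running quickly")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // roughly "run", "quick" for the English stemmer
    }
    ts.end();
}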

From source file: org.elasticsearch.analysis.hunspell.cs.CzechHunspellAnalyzer.java

License: Apache License

@Override
protected TokenStreamComponents createComponents(String field) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StopFilter(source, stopwords);
    if (!this.stemExclusionTable.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionTable);
    }
    result = new HunspellStemFilter(result, dictionary);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}

From source file: org.elasticsearch.analysis.hunspell.fr.FrenchHunspellAnalyzer.java

License: Apache License

@Override
protected TokenStreamComponents createComponents(String field) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new ElisionFilter(source, FrenchAnalyzer.DEFAULT_ARTICLES);
    result = new StopFilter(result, stopwords);
    if (!this.stemExclusionTable.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionTable);
    }
    result = new HunspellStemFilter(result, dictionary);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}