Example usage for the org.apache.lucene.analysis.core.KeywordTokenizer constructor

List of usage examples for org.apache.lucene.analysis.core KeywordTokenizer KeywordTokenizer

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.core.KeywordTokenizer constructor.

Prototype

public KeywordTokenizer(int bufferSize) 

Source Link

Usage

From source file:at.ac.univie.mminf.luceneSKOS.analysis.MeSHAnalyzer.java

License: Apache License

/**
 * Builds the per-field analysis chain for MeSH expansion.
 *
 * For {@code ExpansionType.URI} the entire input is emitted as a single
 * keyword token, expanded via {@code MeSHURIFilter}, then lower-cased.
 * Otherwise the input is standard-tokenized, standard-filtered,
 * label-expanded, lower-cased, stop-filtered and de-duplicated.
 *
 * NOTE(review): the first parameter is the field name despite being named
 * "fileName" — consider renaming for clarity.
 */
@Override
protected TokenStreamComponents createComponents(String fileName, Reader reader) {
    if (expansionType.equals(ExpansionType.URI)) {
        // Whole value as one token: URIs must not be split.
        final KeywordTokenizer src = new KeywordTokenizer(reader);
        TokenStream tok = new MeSHURIFilter(src, skosEngine, new StandardAnalyzer(matchVersion), types);
        tok = new LowerCaseFilter(matchVersion, tok);
        return new TokenStreamComponents(src, tok);
    } else {
        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new StandardFilter(matchVersion, src);
        // prior to this we get the classic behavior, standardfilter does it for
        // us.
        tok = new MeSHLabelFilter(tok, skosEngine, new StandardAnalyzer(matchVersion), bufferSize, types);
        tok = new LowerCaseFilter(matchVersion, tok);
        tok = new StopFilter(matchVersion, tok, stopwords);
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the configured token-length limit every time the
                // components are reused with a new reader.
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}

From source file:at.molindo.esi4j.util.NullAnalyzer.java

License:Apache License

/** Emits the entire input as one unmodified keyword token (no analysis). */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final KeywordTokenizer keyword = new KeywordTokenizer(reader);
    return new TokenStreamComponents(keyword);
}

From source file:au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer.java

License:Open Source License

/**
 * Treats the whole field value as a single token, then lower-cases it.
 *
 * The previous implementation returned an anonymous
 * {@code TokenStreamComponents} subclass whose {@code setReader} override
 * only delegated to {@code super} — a no-op override — so a plain instance
 * is behaviorally identical and simpler.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    KeywordTokenizer src = new KeywordTokenizer(reader);
    TokenStream result = new LowerCaseFilter(Version.LUCENE_34, src);
    return new TokenStreamComponents(src, result);
}

From source file:com.tuplejump.stargate.lucene.CaseInsensitiveKeywordAnalyzer.java

License:Apache License

/** Whole-value keyword token, normalized to lower case. */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final KeywordTokenizer keywordSource = new KeywordTokenizer(reader);
    return new TokenStreamComponents(keywordSource, new LowerCaseFilter(version, keywordSource));
}

From source file:de.jetsli.lumeo.util.KeywordAnalyzerLowerCase.java

License:Apache License

/** Keyword-tokenizes the entire value, then applies lower-casing. */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final Tokenizer source = new KeywordTokenizer(reader);
    final LowerCaseFilter lowerCased = new LowerCaseFilter(version, source);
    return new TokenStreamComponents(source, lowerCased);
}

From source file:edu.rpi.tw.linkipedia.search.index.analyzer.EntropyAnalyzer.java

License:Open Source License

/**
 * Whole-value keyword token, lower-cased, with '|'-delimited payloads
 * split off by {@code DelimitedPayloadTokenFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new KeywordTokenizer(reader);
    TokenStream chain = new LowerCaseFilter(Version.LUCENE_47, source);
    chain = new DelimitedPayloadTokenFilter(chain, '|', encoder);
    return new TokenStreamComponents(source, chain);
}

From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java

License:Open Source License

@Test
public void testEmptyTerm() throws IOException {
    // CJKFoldingFilter must pass an empty term through unchanged, and the
    // analyzer must survive reuse.
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source, new CJKFoldingFilter(source));
        }
    };
    checkOneTermReuse(analyzer, "", "");
}

From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerFactory.java

License:Apache License

/**
 * Returns a keyword-tokenizing analyzer; when {@code ignoreCase} is true
 * the single token is additionally lower-cased.
 */
public static Analyzer getAnalyzer(final boolean ignoreCase) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
            final Tokenizer source = new KeywordTokenizer(reader);
            @SuppressWarnings("resource")
            final TokenStream sink;
            if (ignoreCase) {
                sink = new LowerCaseFilter(source);
            } else {
                sink = source;
            }
            return new TokenStreamComponents(source, sink);
        }
    };
}

From source file:org.apache.jackrabbit.oak.plugins.index.solr.configuration.DefaultAnalyzersConfigurationTest.java

License:Apache License

/**
 * Wires up the analyzers under test.
 *
 * Four of them share the same trivial shape — a bare KeywordTokenizer that
 * emits the whole path as a single token — so their construction is
 * factored into {@link #newKeywordAnalyzer()}; the rest build real filter
 * chains.
 */
@Before
public void setUp() throws Exception {
    // Exact path matching: the whole path is one opaque token.
    this.exactPathAnalyzer = newKeywordAnalyzer();
    // Parent paths are indexed verbatim as a single token...
    this.parentPathIndexingAnalyzer = newKeywordAnalyzer();
    // ...and at search time the last path segment is stripped: reverse the
    // path, remove the first "segment/" of the reversed string, reverse back.
    this.parentPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };

    // Direct children indexing: same reverse/replace/reverse trick, with a
    // minimum-length guard before the replacements.
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new LengthFilter(Version.LUCENE_47, filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Searching for direct children uses the raw path as one token.
    this.directChildrenPathSearchingAnalyzer = newKeywordAnalyzer();

    // All descendants indexing: emit every path prefix
    // (PathHierarchyTokenizer), capture each "/..." suffix group, and
    // de-duplicate the resulting tokens.
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new PathHierarchyTokenizer(reader);
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false,
                    Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Searching over all descendants uses the raw path as one token.
    this.allChildrenPathSearchingAnalyzer = newKeywordAnalyzer();
}

/** Creates an analyzer that emits the entire input as a single keyword token. */
private static Analyzer newKeywordAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
}

From source file:org.apache.solr.analysis.TestCapitalizationFilterFactory.java

License:Apache License

/**
 * Exercises CapitalizationFilterFactory across its option surface:
 * keep-words, onlyFirstWord, forceFirstLetter, minWordLength, okPrefix,
 * and numeric tokens.
 *
 * NOTE: this test mutates factory fields directly between assertions, so
 * statement order matters throughout.
 */
public void testCapitalization() throws Exception {
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put(CapitalizationFilterFactory.KEEP, "and the it BIG");
    args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");

    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
    factory.init(args);
    // Ordinary token is capitalized (first letter up, rest down).
    assertTokenStreamContents(
            factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))),
            new String[] { "Kitten" });

    factory.forceFirstLetter = true;

    // "and" is a keep word, but forcing the first letter still yields "And".
    assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))),
            new String[] { "And" });

    //first is forced, but it's not a keep word, either
    assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
            new String[] { "And" });

    factory.forceFirstLetter = false;

    //first is not forced, but it's not a keep word, either
    assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
            new String[] { "And" });

    factory.forceFirstLetter = true;

    assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))),
            new String[] { "Big" });

    // "BIG" is in the keep list, so its original casing survives.
    assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))),
            new String[] { "BIG" });

    // With KeywordTokenizer the whole phrase is a single token; with
    // onlyFirstWord=true everything after the first word is lower-cased.
    assertTokenStreamContents(
            factory.create(new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))),
            new String[] { "Hello there my name is ryan" });

    // now each token
    factory.onlyFirstWord = false;
    assertTokenStreamContents(
            factory.create(
                    new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
            new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });

    // now only the long words
    factory.minWordLength = 3;
    assertTokenStreamContents(
            factory.create(
                    new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
            new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });

    // without prefix
    assertTokenStreamContents(
            factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
            new String[] { "Mckinley" });

    // Now try some prefixes
    factory = new CapitalizationFilterFactory();
    args.put("okPrefix", "McK"); // all words
    factory.init(args);
    assertTokenStreamContents(
            factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
            new String[] { "McKinley" });

    // now try some stuff with numbers
    factory.forceFirstLetter = false;
    factory.onlyFirstWord = false;
    assertTokenStreamContents(
            factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))),
            new String[] { "1st", "2nd", "Third" });

    factory.forceFirstLetter = true;
    // Single keyword token: only the very first word gets its letter forced.
    assertTokenStreamContents(factory.create(new KeywordTokenizer(new StringReader("the The the"))),
            new String[] { "The The the" });
}