List of usage examples for the org.apache.lucene.analysis.core.KeywordTokenizer constructor
public KeywordTokenizer(int bufferSize)
From source file:at.ac.univie.mminf.luceneSKOS.analysis.MeSHAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fileName, Reader reader) { if (expansionType.equals(ExpansionType.URI)) { final KeywordTokenizer src = new KeywordTokenizer(reader); TokenStream tok = new MeSHURIFilter(src, skosEngine, new StandardAnalyzer(matchVersion), types); tok = new LowerCaseFilter(matchVersion, tok); return new TokenStreamComponents(src, tok); } else {/*from w w w . ja va2 s . c o m*/ final StandardTokenizer src = new StandardTokenizer(matchVersion, reader); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new StandardFilter(matchVersion, src); // prior to this we get the classic behavior, standardfilter does it for // us. tok = new MeSHLabelFilter(tok, skosEngine, new StandardAnalyzer(matchVersion), bufferSize, types); tok = new LowerCaseFilter(matchVersion, tok); tok = new StopFilter(matchVersion, tok, stopwords); tok = new RemoveDuplicatesTokenFilter(tok); return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) throws IOException { src.setMaxTokenLength(maxTokenLength); super.setReader(reader); } }; } }
From source file:at.molindo.esi4j.util.NullAnalyzer.java
License:Apache License
/**
 * Pass-through analysis: the entire field value is emitted as a single token.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new KeywordTokenizer(reader);
    return new TokenStreamComponents(source);
}
From source file:au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer.java
License:Open Source License
/**
 * Single-token, lower-cased analysis: the whole input becomes one keyword
 * token which is then lower-cased.
 *
 * @param fieldName name of the field being analyzed (unused)
 * @param reader source of the text to analyze
 * @return keyword tokenizer followed by a lower-case filter
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    KeywordTokenizer src = new KeywordTokenizer(reader);
    TokenStream result = new LowerCaseFilter(Version.LUCENE_34, src);
    // The original wrapped this in an anonymous subclass whose setReader()
    // merely delegated to super — a no-op override, now removed.
    return new TokenStreamComponents(src, result);
}
From source file:com.tuplejump.stargate.lucene.CaseInsensitiveKeywordAnalyzer.java
License:Apache License
/**
 * One token per field value, normalized to lower case so matching is
 * case-insensitive.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(version, tokenizer));
}
From source file:de.jetsli.lumeo.util.KeywordAnalyzerLowerCase.java
License:Apache License
/**
 * Keyword analysis with lower-casing: the whole input is one token, folded
 * to lower case.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer source = new KeywordTokenizer(reader);
    LowerCaseFilter lowercased = new LowerCaseFilter(version, source);
    return new TokenStreamComponents(source, lowercased);
}
From source file:edu.rpi.tw.linkipedia.search.index.analyzer.EntropyAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { // System.out.println("compoent"); Tokenizer tk = new KeywordTokenizer(reader); TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, tk); filter = new DelimitedPayloadTokenFilter(filter, '|', encoder); TokenStreamComponents components = new TokenStreamComponents(tk, filter); return components; }
From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java
License:Open Source License
@Test public void testEmptyTerm() throws IOException { Analyzer a = new Analyzer() { @Override//from ww w . jav a2s . co m protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KeywordTokenizer(reader); return new TokenStreamComponents(tokenizer, new CJKFoldingFilter(tokenizer)); } }; checkOneTermReuse(a, "", ""); }
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerFactory.java
License:Apache License
public static Analyzer getAnalyzer(final boolean ignoreCase) { return new Analyzer() { @Override//from w ww . java2 s. co m protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { final Tokenizer tokenizer = new KeywordTokenizer(reader); @SuppressWarnings("resource") final TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer; return new TokenStreamComponents(tokenizer, stream); } }; }
From source file:org.apache.jackrabbit.oak.plugins.index.solr.configuration.DefaultAnalyzersConfigurationTest.java
License:Apache License
/**
 * Wires up the analyzers under test. Each is an anonymous Analyzer over a
 * KeywordTokenizer (whole path = one token) except the all-children indexing
 * analyzer, which uses a PathHierarchyTokenizer.
 */
@Before
public void setUp() throws Exception {
    // Exact path match: the whole path is a single keyword token.
    this.exactPathAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer);
        }
    };
    // Parent-path indexing: plain keyword token, no filtering.
    this.parentPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer);
        }
    };
    // Parent-path searching: reverse the path, strip the now-leading
    // "segment/" pair, reverse back — i.e. drop the last path segment.
    this.parentPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            TokenStream chain = new ReverseStringFilter(Version.LUCENE_47, tokenizer);
            chain = new PatternReplaceFilter(chain, Pattern.compile("[^\\/]+\\/"), "", false);
            chain = new ReverseStringFilter(Version.LUCENE_47, chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
    };
    // Direct-children indexing: reverse, drop too-short tokens, rewrite the
    // reversed path with the pattern replacements below, reverse back.
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            TokenStream chain = new ReverseStringFilter(Version.LUCENE_47, tokenizer);
            chain = new LengthFilter(Version.LUCENE_47, chain, 2, Integer.MAX_VALUE);
            chain = new PatternReplaceFilter(chain, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            chain = new PatternReplaceFilter(chain, Pattern.compile("(\\/)(.+)"), "$2", false);
            chain = new ReverseStringFilter(Version.LUCENE_47, chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
    };
    // Direct-children searching: plain keyword token.
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer);
        }
    };
    // All-children indexing: hierarchy tokenizer plus capture groups for
    // every "/..." suffix, with duplicates removed.
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new PathHierarchyTokenizer(reader);
            TokenStream chain = new PatternCaptureGroupTokenFilter(tokenizer, false, Pattern.compile("((\\/).*)"));
            chain = new RemoveDuplicatesTokenFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
    };
    // All-children searching: plain keyword token.
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer);
        }
    };
}
From source file:org.apache.solr.analysis.TestCapitalizationFilterFactory.java
License:Apache License
public void testCapitalization() throws Exception { Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM); args.put(CapitalizationFilterFactory.KEEP, "and the it BIG"); args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true"); CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); factory.init(args);//from w w w . jav a2 s .c o m assertTokenStreamContents( factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))), new String[] { "Kitten" }); factory.forceFirstLetter = true; assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))), new String[] { "And" }); //first is forced, but it's not a keep word, either assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))), new String[] { "And" }); factory.forceFirstLetter = false; //first is not forced, but it's not a keep word, either assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))), new String[] { "And" }); factory.forceFirstLetter = true; assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))), new String[] { "Big" }); assertTokenStreamContents(factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))), new String[] { "BIG" }); assertTokenStreamContents( factory.create(new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))), new String[] { "Hello there my name is ryan" }); // now each token factory.onlyFirstWord = false; assertTokenStreamContents( factory.create( new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))), new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); // now only the long words factory.minWordLength = 3; assertTokenStreamContents( factory.create( new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))), 
new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); // without prefix assertTokenStreamContents( factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))), new String[] { "Mckinley" }); // Now try some prefixes factory = new CapitalizationFilterFactory(); args.put("okPrefix", "McK"); // all words factory.init(args); assertTokenStreamContents( factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))), new String[] { "McKinley" }); // now try some stuff with numbers factory.forceFirstLetter = false; factory.onlyFirstWord = false; assertTokenStreamContents( factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))), new String[] { "1st", "2nd", "Third" }); factory.forceFirstLetter = true; assertTokenStreamContents(factory.create(new KeywordTokenizer(new StringReader("the The the"))), new String[] { "The The the" }); }