List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer
public StandardTokenizer(AttributeFactory factory)
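Before the examples, here is a minimal, self-contained sketch of driving this constructor directly. It assumes a Lucene 5.x-era classpath (where AttributeFactory lives in org.apache.lucene.util and a Tokenizer is fed input via setReader rather than a constructor argument); the class name StandardTokenizerDemo and the sample sentence are invented for illustration. Note that most of the examples below instead use the older StandardTokenizer(Reader) constructor from the 2.x/3.x line.

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(new StringReader("Lucene is a full-text search library."));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset(); // mandatory before the first incrementToken()
        while (tokenizer.incrementToken()) {
            // UAX#29 word-break rules split on the hyphen and drop punctuation:
            // Lucene, is, a, full, text, search, library
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}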
From source file: net.sf.jtmt.summarizers.SummaryAnalyzer.java
License: Apache License

@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Chain: StandardTokenizer -> StandardFilter -> NumericTokenFilter
    //        -> LowerCaseFilter -> StopFilter -> PorterStemFilter
    return new PorterStemFilter(
            new StopFilter(false, // enablePositionIncrements == false, for backward compatibility
                    new LowerCaseFilter(
                            new NumericTokenFilter(
                                    new StandardFilter(
                                            new StandardTokenizer(reader)))),
                    stopset));
}
From source file: org.alfresco.repo.search.impl.lucene.analysis.AlfrescoStandardAnalyser.java
License: Open Source License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new AlfrescoStandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    result = new ISOLatin1AccentFilter(result);
    return result;
}
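As a consumption sketch (not from the Alfresco source): with the Lucene 2.9/3.0-era attribute API that these chain-style analyzers target, the analyzer above would be driven roughly like this. The no-arg constructor, field name, and input text are assumptions for illustration.

Analyzer analyzer = new AlfrescoStandardAnalyser();
TokenStream ts = analyzer.tokenStream("content", new StringReader("Café menus"));
TermAttribute term = ts.addAttribute(TermAttribute.class);
while (ts.incrementToken()) {
    // "Café" comes out as "cafe": lower-cased, then accent-folded by ISOLatin1AccentFilter
    System.out.println(term.term());
}
ts.close();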
From source file: org.apache.solr.analysis.HTMLStripStandardTokenizerFactory.java
License: Apache License

public Tokenizer create(Reader input) {
    return new StandardTokenizer(new HTMLStripReader(input)) {
        @Override
        public void reset(Reader reader) throws IOException {
            // re-wrap on reset so a reused tokenizer keeps stripping HTML
            super.reset(new HTMLStripReader(reader));
        }
    };
}
From source file: org.apache.uima.lucas.indexer.Tokenizer.java
License: Apache License

public TokenStream tokenize(TokenStream tokenStream, AnnotationDescription description) throws IOException {
    String tokenizer = description.getTokenizer();
    if (tokenizer != null && !tokenizer.equals(TOKENIZER_CAS)) {
        // re-tokenize: flatten the existing stream into a delimited string,
        // then run the requested tokenizer over it
        String tokenStreamAsString =
                tokenStreamStringConcatenator.tokenStreamToStringWithDelimiter(tokenStream, " ");
        StringReader stringReader = new StringReader(tokenStreamAsString);
        if (tokenizer.equals(TOKENIZER_WHITESPACE))
            tokenStream = new WhitespaceTokenizer(stringReader);
        else if (tokenizer.equals(TOKENIZER_STANDARD))
            tokenStream = new StandardTokenizer(stringReader);
    }
    return tokenStream;
}
From source file: org.eurekastreams.commons.search.analysis.HashTagTextStemmerIndexingAnalyzer.java
License: Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param inReader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader inReader) {
    // collection to hold hashtagged keywords
    List<String> hashTaggedKeywords = new ArrayList<String>();

    // this reader will replace all hashtags with our marker text
    Reader reader = CharacterReplacementStreamBuilder.buildReplacementReader(inReader, '#',
            INDEXED_HASHTAG_PREFIX);

    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);

    // now remove our hashtag prefixes and store the hashtagged keywords in a set
    result = new PrefixedTokenRemoverDuplicatorAndExtractorTokenizer(result, INDEXED_HASHTAG_PREFIX, "#",
            hashTaggedKeywords);

    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new EnglishPorterFilterFactory().create(result);
    result = new WordListInjectionTokenizer(hashTaggedKeywords, result);
    return result;
}
From source file: org.eurekastreams.commons.search.analysis.HashTagTextStemmerSearchAnalyzer.java
License: Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param inReader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader inReader) {
    // collection to hold hashtagged keywords
    List<String> hashTaggedKeywords = new ArrayList<String>();

    // this reader will replace all hashtags with our marker text
    Reader reader = CharacterReplacementStreamBuilder.buildReplacementReader(inReader, '#',
            INDEXED_HASHTAG_PREFIX);

    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);

    // now remove our hashtag prefixes and store the hashtagged keywords in a set
    result = new PrefixedTokenRemoverAndExtractorTokenizer(result, INDEXED_HASHTAG_PREFIX, "#",
            hashTaggedKeywords);

    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new EnglishPorterFilterFactory().create(result);
    result = new WordListInjectionTokenizer(hashTaggedKeywords, result);
    return result;
}
From source file: org.eurekastreams.commons.search.analysis.TextStemmerAnalyzer.java
License: Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param reader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    TokenStream tokenStream = new StandardTokenizer(reader);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new EnglishPorterFilterFactory().create(result);
    return result;
}
From source file: org.eurekastreams.commons.search.analysis.TextSynonymStemmerAnalyzer.java
License: Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param reader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    TokenStream tokenStream = new StandardTokenizer(reader);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new SynonymTokenFilter(result, SynonymMapFactory.getSynonymMap(), MAX_NUMBER_OF_SYNONYMS);
    result = new EnglishPorterFilterFactory().create(result);
    return result;
}
From source file: org.exist.indexing.lucene.DMLBSAccentAnalyzer.java
License: Apache License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new org.exist.indexing.lucene.DMLBSAccentFilter(result);
    //result = new ASCIIFoldingFilter(result);
    result = new StopFilter(result, stopSet);
    return result;
}
From source file: org.lexevs.dao.index.indexer.LuceneLoaderCode.java
License: Open Source License

public static PerFieldAnalyzerWrapper getAnaylzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    // add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);

    // treat as string field by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
            //
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }

    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));

    Analyzer qualifierAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }
    };

    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}
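A short sketch (not from the LexEVS source) of exercising the per-field dispatch above, assuming the same Lucene 5.x-era API the method itself uses; the input text is invented, and getAnaylzer keeps its original spelling from the source. Fields registered in analyzerPerField get their dedicated analyzer, while any other field falls through to the default StandardAnalyzer.

Analyzer analyzer = LuceneLoaderCode.getAnaylzer();
try (TokenStream ts = analyzer.tokenStream("qualifiers", new StringReader("Source-Code {RRF}"))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // the "qualifiers" field is routed to qualifierAnalyzer:
        // StandardTokenizer splits the input, LowerCaseFilter lower-cases it,
        // so this prints: source, code, rrf
        System.out.println(term.toString());
    }
    ts.end();
}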