List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer.setMaxTokenLength
public void setMaxTokenLength(int length)
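Before the project examples, here is a minimal, self-contained sketch of the call, assuming a recent Lucene release (7.x or later), where StandardTokenizer has a no-argument constructor and createComponents takes only the field name; the class name MaxTokenLengthExample and the field name "content" are illustrative only. Most of the examples that follow target older Lucene versions, where the tokenizer constructor also takes a Version and a Reader.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MaxTokenLengthExample {

    public static void main(String[] args) throws IOException {
        // Build an analyzer whose tokenizer caps the allowed token length at
        // StandardAnalyzer's default (255 characters).
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                StandardTokenizer source = new StandardTokenizer();
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new LowerCaseFilter(source);
                return new TokenStreamComponents(source, filter);
            }
        };

        // Print the tokens produced for a sample field value.
        try (TokenStream stream = analyzer.tokenStream("content", "Lucene StandardTokenizer example")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();
        }
        analyzer.close();
    }
}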
From source file: org.lexevs.dao.index.indexer.LuceneLoaderCode.java
License: Open Source License
public static PerFieldAnalyzerWrapper getAnaylzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    //add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);

    //treat as string field by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
            //
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }

    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));

    Analyzer qualifierAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }
    };

    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}
From source file: org.lexevs.dao.index.metadata.BaseMetaDataLoader.java
License: Open Source License
public static Analyzer getMetadataAnalyzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, true);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(normPrefix_ + "propertyValue", temp);
        } catch (NoClassDefFoundError e) {
            // norm is not available
            normEnabled_ = false;
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp);
    }

    // these fields just get simple analyzing.
    List<String> dividerList = new ArrayList<String>();
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    analyzerPerField.put("parentContainers", sa);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}
From source file: org.lexevs.dao.indexer.lucene.analyzers.SnowballAnalyzerTest.java
License: Open Source License
@Test
public void testDontKeepOrigional() throws Exception {
    Analyzer temp = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new StandardFilter(source);
            filter = new LowerCaseFilter(filter);
            filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
            filter = new SnowballFilter(filter, "English");
            return new TokenStreamComponents(source, filter);
        }
    };

    String input = new String("The trees have Leaves!");
    String[] output = { "tree", "have", "leav" };
    BaseTokenStreamTestCase.assertAnalyzesTo(temp, input, output);
}
From source file: org.opencms.search.galleries.CmsGallerySearchAnalyzer.java
License: Open Source License
/**
 * @see org.apache.lucene.analysis.ReusableAnalyzerBase#createComponents(java.lang.String, java.io.Reader)
 *
 * This is taken from the Lucene StandardAnalyzer, which is final since 3.1
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected boolean reset(final Reader r) throws IOException {
            src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
            return super.reset(r);
        }
    };
}
From source file: org.segrada.search.lucene.LuceneSegradaAnalyzer.java
License: Apache License
@Override
protected TokenStreamComponents createComponents(String s, Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(this.matchVersion, reader);
    src.setMaxTokenLength(255);

    StandardFilter tok = new StandardFilter(this.matchVersion, src);
    LowerCaseFilter tok1 = new LowerCaseFilter(this.matchVersion, tok);
    StopFilter tok2 = new StopFilter(this.matchVersion, tok1, this.stopwords);
    final ASCIIFoldingFilter tok3 = new ASCIIFoldingFilter(tok2);

    return new TokenStreamComponents(src, tok3) {
        protected void setReader(Reader reader) throws IOException {
            src.setMaxTokenLength(255);
            super.setReader(reader);
        }
    };
}
From source file: org.vskm.text.topic.CaseSensitiveAnalyzer.java
License: Apache License
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            src.setMaxTokenLength(CaseSensitiveAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
From source file: pl.litwiniuk.rowicki.collocations.CollocationAnalyzer.java
License: Apache License
/**
 * Creates a new {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} instance for this analyzer.
 *
 * @param fieldName the name of the fields content passed to the
 *                  {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} sink as a reader
 * @param reader    the reader passed to the {@link org.apache.lucene.analysis.Tokenizer} constructor
 * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} for this analyzer.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    tok = new MorfologikFilter(tok, PolishStemmer.DICTIONARY.MORFOLOGIK, Version.LUCENE_43);
    tok = new CollocationFilter(tok, engine);
    try {
        tok = new ModificatedSynonymFilter(tok, loadSolrSynonyms(), true);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    }
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            src.setMaxTokenLength(CollocationAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
From source file: text_analyzer.StandardAnalyzer1.java
License: Apache License
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    src.setReplaceInvalidAcronym(replaceInvalidAcronym);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    tok = new PorterStemFilter(tok);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected boolean reset(final Reader reader) throws IOException {
            src.setMaxTokenLength(StandardAnalyzer1.this.maxTokenLength);
            return super.reset(reader);
        }
    };
}