List of usage examples for the org.apache.lucene.analysis.core.WhitespaceTokenizer constructor
public WhitespaceTokenizer(AttributeFactory factory, int maxTokenLen)
From source file:cc.twittertools.index.TweetAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { Tokenizer source = new WhitespaceTokenizer(matchVersion, reader); TokenStream filter = new LowerCaseEntityPreservingFilter(source); if (stemming) { // Porter stemmer ignores words which are marked as keywords filter = new PorterStemFilter(filter); }/*from w w w . j av a2s. co m*/ return new TokenStreamComponents(source, filter); }
From source file:ci6226.LowcaseAnalyzer.java
/**
 * Builds a lower-casing analysis chain: whitespace tokenization followed by
 * Lucene's LowerCaseFilter.
 *
 * @param arg0   the field being analyzed (unused)
 * @param reader the character stream to tokenize
 * @return the tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declared as Tokenizer so no downcast is needed when building the components.
    final Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    final TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source);
    return new TokenStreamComponents(source, filter);
}
From source file:ci6226.myAnalyzer.java
@Override protected myAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) { TokenStream source = new WhitespaceTokenizer(Version.LUCENE_47, reader); //TokenStream source = new LetterFilter(Version.LUCENE_47, reader); TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source); filter = new PorterStemFilter(filter); //TokenStream filter = new StopFilter(Version.LUCENE_47, source, StopAnalyzer.ENGLISH_STOP_WORDS_SET); //ilter = new StandardFilter(Version.LUCENE_47, source); //TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source); filter = new StopFilter(Version.LUCENE_47, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET); return new TokenStreamComponents((Tokenizer) source, filter); }
From source file:ci6226.StemmingAnalyzer.java
/**
 * Builds a stemming analysis chain: whitespace tokenization, lower-casing,
 * then Porter stemming.
 *
 * @param arg0   the field being analyzed (unused)
 * @param reader the character stream to tokenize
 * @return the tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declared as Tokenizer so no downcast is needed when building the components.
    final Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source);
    filter = new PorterStemFilter(filter);
    return new TokenStreamComponents(source, filter);
}
From source file:ci6226.StopWordsAnalyzer.java
/**
 * Builds a stop-word-removing analysis chain: whitespace tokenization
 * followed by English stop-word filtering (no lower-casing, so only
 * exact-case stop words are removed).
 *
 * @param arg0   the field being analyzed (unused)
 * @param reader the character stream to tokenize
 * @return the tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declared as Tokenizer so no downcast is needed when building the components.
    final Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    final TokenStream filter = new StopFilter(Version.LUCENE_47, source, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(source, filter);
}
From source file:com.globalsight.ling.tm2.lucene.GsPerFieldAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { TokenStreamComponents result = null; if (TuvDocument.TARGET_LOCALES_FIELD.equalsIgnoreCase(fieldName)) { result = new TokenStreamComponents(new WhitespaceTokenizer(LuceneUtil.VERSION, reader)); } else {/*from ww w. j a va2 s . c o m*/ try { Tokenizer t = new GsTokenizer(reader, m_locale); TokenStream tok = new GsStopFilter(t, m_locale); tok = new GsStemFilter(tok, m_locale); result = new TokenStreamComponents(t, tok); } catch (Exception e) { // can't throw checked exception c_logger.error("An error occured in tokenStream", e); throw new RuntimeException(e); } } return result; }
From source file:com.sindicetech.siren.analysis.AnyURIAnalyzer.java
License:Open Source License
/**
 * Assembles the URI analysis chain: whitespace tokenization, percent-decoding
 * (UTF-8), URI normalisation, mailto handling, lower-casing, stop-word
 * removal, and finally a 2..256 token-length gate.
 *
 * @param fieldName the field being analyzed (unused)
 * @param reader    the character stream to tokenize
 * @return the tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(matchVersion, reader);
    TokenStream stream = new URIDecodingFilter(tokenizer, "UTF-8");
    stream = this.applyURINormalisation(stream);
    stream = new MailtoFilter(stream);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopSet);
    stream = new LengthFilter(matchVersion, true, stream, 2, 256);
    return new TokenStreamComponents(tokenizer, stream);
}
From source file:com.sindicetech.siren.analysis.filter.TestASCIIFoldingExpansionFilter.java
License:Open Source License
@Test public void testTokenTypeFilter1() throws Exception { final Reader reader = new StringReader("aaa cls caf"); final TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); final ASCIIFoldingExpansionFilter filter = new ASCIIFoldingExpansionFilter(stream); final CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); final PositionIncrementAttribute posAtt = filter.getAttribute(PositionIncrementAttribute.class); filter.reset(); // prepare stream this.assertTermEquals("aaa", 1, filter, termAtt, posAtt); this.assertTermEquals("cles", 1, filter, termAtt, posAtt); this.assertTermEquals("cls", 0, filter, termAtt, posAtt); this.assertTermEquals("cafe", 1, filter, termAtt, posAtt); this.assertTermEquals("caf", 0, filter, termAtt, posAtt); }
From source file:com.sindicetech.siren.qparser.keyword.BasicSyntaxTest.java
License:Open Source License
/**
 * Checks that terms expanded by the DatatypeAnalyzerProcessor (accent
 * folding) are generated at the same position and combined under the
 * expected boolean operators.
 */
@Test
public void testQueryTermAtSamePosition() throws Exception {
    final HashMap<ConfigurationKey, Object> config = new HashMap<ConfigurationKey, Object>();
    // Analyzer for the "exp" datatype: whitespace tokens plus ASCII-folding expansion.
    final Analyzer analyser = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
            final WhitespaceTokenizer t = new WhitespaceTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader);
            final TokenStream ts = new ASCIIFoldingExpansionFilter(t);
            return new TokenStreamComponents(t, ts);
        }
    };
    config.put(ConfigurationKeys.DEFAULT_OPERATOR, Operator.OR);
    final HashMap<String, Analyzer> dts = new HashMap<String, Analyzer>();
    dts.put("exp", analyser);
    dts.put(XSDDatatype.XSD_STRING, new WhitespaceAnalyzer(LuceneTestCase.TEST_VERSION_CURRENT));
    config.put(KeywordConfigurationKeys.DATATYPES_ANALYZERS, dts);

    /*
     * Here we cannot parse the toString output, because the query
     * has been expanded by DatatypeAnalyzerProcessor
     */
    // NOTE(review): the scraped source had accents stripped from the query
    // strings ("caf", "mat"); restored to "café"/"maté" so the expansion
    // into (cafe|café) and (mate|maté) pairs matches the expected trees.
    Query q = bq(should(ntq("latte")),
            must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            should("the")).getQuery();
    assertEquals(q, parse(config, "latte +exp(café) the"));

    q = bq(must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))))
            .getQuery();
    assertEquals(q, parse(config, "+exp(café)"));

    q = bq(must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            must(bq(should(ntq("mate").setDatatype("exp")), should(ntq("maté").setDatatype("exp")))))
            .getQuery();
    assertEquals(q, parse(config, "exp(+café +maté)"));

    q = bq(must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            not(bq(should(ntq("mate").setDatatype("exp")), should(ntq("maté").setDatatype("exp")))))
            .getQuery();
    assertEquals(q, parse(config, "exp(+café -maté)"));

    q = bq(should(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            should(bq(should(ntq("mate").setDatatype("exp")), should(ntq("maté").setDatatype("exp")))))
            .getQuery();
    assertEquals(q, parse(config, "exp(café maté)"));
}
From source file:com.sindicetech.siren.qparser.keyword.BasicSyntaxTest.java
License:Open Source License
@Test(expected = QueryNodeException.class) public void testMultiPhraseQuery() throws Exception { final HashMap<ConfigurationKey, Object> config = new HashMap<ConfigurationKey, Object>(); final Analyzer analyser = new Analyzer() { @Override//from w w w. j av a 2 s . c o m protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { final WhitespaceTokenizer t = new WhitespaceTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader); final TokenStream ts = new ASCIIFoldingExpansionFilter(t); return new TokenStreamComponents(t, ts); } }; final HashMap<String, Analyzer> dts = new HashMap<String, Analyzer>(); dts.put("exp", analyser); config.put(KeywordConfigurationKeys.DATATYPES_ANALYZERS, dts); this.parse(config, "exp(\"caf coffe\")"); }