Example usage for org.apache.lucene.analysis.core WhitespaceTokenizer WhitespaceTokenizer

List of usage examples for org.apache.lucene.analysis.core WhitespaceTokenizer WhitespaceTokenizer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.core WhitespaceTokenizer WhitespaceTokenizer.

Prototype

public WhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) 

Source Link

Document

Construct a new WhitespaceTokenizer using a given org.apache.lucene.util.AttributeFactory.

Usage

From source file:cc.twittertools.index.TweetAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    // Tokenize on whitespace, then lower-case while preserving entities
    // (mentions, hashtags, URLs) found in tweets.
    final Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion, reader);
    TokenStream stream = new LowerCaseEntityPreservingFilter(tokenizer);

    // Optional Porter stemming; the stemmer skips tokens marked as keywords.
    if (stemming) {
        stream = new PorterStemFilter(stream);
    }
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:ci6226.LowcaseAnalyzer.java

@Override
protected LowcaseAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declare the tokenizer with its concrete supertype Tokenizer up front,
    // so no downcast is needed when building the components (the original
    // declared it as TokenStream and cast back to Tokenizer below).
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);

    // Lower-case every whitespace-delimited token.
    TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source);

    return new Analyzer.TokenStreamComponents(source, filter);
}

From source file:ci6226.myAnalyzer.java

@Override
protected myAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declare as Tokenizer so no downcast is needed below (the original used
    // TokenStream and cast back to Tokenizer); dead commented-out pipeline
    // variants have been removed.
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);

    // Pipeline: lower-case -> Porter stemming -> English stop-word removal.
    TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source);
    filter = new PorterStemFilter(filter);
    filter = new StopFilter(Version.LUCENE_47, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    return new TokenStreamComponents(source, filter);
}

From source file:ci6226.StemmingAnalyzer.java

@Override
protected StemmingAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declare as Tokenizer so no downcast is needed below (the original
    // declared TokenStream and cast back to Tokenizer in the return).
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);

    // Pipeline: lower-case, then Porter stemming.
    TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source);
    filter = new PorterStemFilter(filter);

    return new Analyzer.TokenStreamComponents(source, filter);
}

From source file:ci6226.StopWordsAnalyzer.java

@Override
protected StopWordsAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declare as Tokenizer so no downcast is needed below (the original
    // declared TokenStream and cast back to Tokenizer in the return).
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    // Remove the default English stop words from the whitespace tokens.
    TokenStream filter = new StopFilter(Version.LUCENE_47, source, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    return new TokenStreamComponents(source, filter);
}

From source file:com.globalsight.ling.tm2.lucene.GsPerFieldAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // The target-locales field is a plain whitespace-separated list, so it only
    // needs whitespace tokenization; every other field goes through the
    // locale-aware GlobalSight tokenizer / stop / stem chain.
    if (TuvDocument.TARGET_LOCALES_FIELD.equalsIgnoreCase(fieldName)) {
        return new TokenStreamComponents(new WhitespaceTokenizer(LuceneUtil.VERSION, reader));
    }

    try {
        Tokenizer tokenizer = new GsTokenizer(reader, m_locale);
        TokenStream stream = new GsStopFilter(tokenizer, m_locale);
        stream = new GsStemFilter(stream, m_locale);
        return new TokenStreamComponents(tokenizer, stream);
    } catch (Exception e) {
        // createComponents cannot declare a checked exception, so log and
        // rethrow as unchecked with the cause preserved.
        c_logger.error("An error occured in tokenStream", e);
        throw new RuntimeException(e);
    }
}

From source file:com.sindicetech.siren.analysis.AnyURIAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    // Split URIs on whitespace, then push them through the normalisation chain.
    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(matchVersion, reader);
    TokenStream stream = new URIDecodingFilter(tokenizer, "UTF-8"); // percent-decode as UTF-8
    stream = this.applyURINormalisation(stream);
    stream = new MailtoFilter(stream);
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, stopSet);
    // Keep only tokens between 2 and 256 characters long.
    stream = new LengthFilter(matchVersion, true, stream, 2, 256);
    return new TokenStreamComponents(tokenizer, stream);
}

From source file:com.sindicetech.siren.analysis.filter.TestASCIIFoldingExpansionFilter.java

License:Open Source License

@Test
public void testTokenTypeFilter1() throws Exception {
    // NOTE(review): the accented characters in the original listing were
    // stripped ("cls"/"caf"); restored to "clés"/"café" so the expansion
    // assertions below are consistent with their ASCII-folded counterparts.
    final Reader reader = new StringReader("aaa clés café");
    final TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    final ASCIIFoldingExpansionFilter filter = new ASCIIFoldingExpansionFilter(stream);

    final CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posAtt = filter.getAttribute(PositionIncrementAttribute.class);

    filter.reset(); // prepare stream

    // Each accented token yields its ASCII-folded form (posInc 1) followed by
    // the original form at the same position (posInc 0).
    this.assertTermEquals("aaa", 1, filter, termAtt, posAtt);
    this.assertTermEquals("cles", 1, filter, termAtt, posAtt);
    this.assertTermEquals("clés", 0, filter, termAtt, posAtt);
    this.assertTermEquals("cafe", 1, filter, termAtt, posAtt);
    this.assertTermEquals("café", 0, filter, termAtt, posAtt);
}

From source file:com.sindicetech.siren.qparser.keyword.BasicSyntaxTest.java

License:Open Source License

@Test
public void testQueryTermAtSamePosition() throws Exception {
    final HashMap<ConfigurationKey, Object> config = new HashMap<ConfigurationKey, Object>();

    // Analyzer that expands each accented token into its ASCII-folded form
    // plus the original form at the same position.
    final Analyzer analyser = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
            final WhitespaceTokenizer t = new WhitespaceTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader);
            final TokenStream ts = new ASCIIFoldingExpansionFilter(t);
            return new TokenStreamComponents(t, ts);
        }
    };
    config.put(ConfigurationKeys.DEFAULT_OPERATOR, Operator.OR);
    final HashMap<String, Analyzer> dts = new HashMap<String, Analyzer>();
    dts.put("exp", analyser);
    dts.put(XSDDatatype.XSD_STRING, new WhitespaceAnalyzer(LuceneTestCase.TEST_VERSION_CURRENT));
    config.put(KeywordConfigurationKeys.DATATYPES_ANALYZERS, dts);

    // NOTE(review): the accented characters ("café", "maté") were stripped in
    // the published listing ("caf", "mat"); restored so each expansion pair
    // (folded + original at the same position) matches the parsed input.

    /*
     * Here we cannot parse the toString output, because the query
     * has been expanded by DatatypeAnalyzerProcessor
     */
    Query q = bq(should(ntq("latte")),
            must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            should("the")).getQuery();
    assertEquals(q, parse(config, "latte +exp(café) the"));

    q = bq(must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))))
            .getQuery();
    assertEquals(q, parse(config, "+exp(café)"));

    q = bq(must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            must(bq(should(ntq("mate").setDatatype("exp")), should(ntq("maté").setDatatype("exp")))))
                    .getQuery();
    assertEquals(q, parse(config, "exp(+café +maté)"));

    q = bq(must(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            not(bq(should(ntq("mate").setDatatype("exp")), should(ntq("maté").setDatatype("exp")))))
                    .getQuery();
    assertEquals(q, parse(config, "exp(+café -maté)"));

    q = bq(should(bq(should(ntq("cafe").setDatatype("exp")), should(ntq("café").setDatatype("exp")))),
            should(bq(should(ntq("mate").setDatatype("exp")), should(ntq("maté").setDatatype("exp")))))
                    .getQuery();
    assertEquals(q, parse(config, "exp(café maté)"));
}

From source file:com.sindicetech.siren.qparser.keyword.BasicSyntaxTest.java

License:Open Source License

@Test(expected = QueryNodeException.class)
public void testMultiPhraseQuery() throws Exception {
    final HashMap<ConfigurationKey, Object> config = new HashMap<ConfigurationKey, Object>();

    // Analyzer that expands accented tokens into folded + original forms at
    // the same position.
    final Analyzer analyser = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
            final WhitespaceTokenizer t = new WhitespaceTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader);
            final TokenStream ts = new ASCIIFoldingExpansionFilter(t);
            return new TokenStreamComponents(t, ts);
        }
    };
    final HashMap<String, Analyzer> dts = new HashMap<String, Analyzer>();
    dts.put("exp", analyser);
    config.put(KeywordConfigurationKeys.DATATYPES_ANALYZERS, dts);

    // NOTE(review): "café" restored (accent stripped in the listing as "caf").
    // Expanding an accented term inside a quoted phrase yields several terms
    // at one position, which the parser rejects with QueryNodeException.
    // "coffe" left as-is — TODO confirm it was not also garbled.
    this.parse(config, "exp(\"café coffe\")");
}