Example usage for org.apache.lucene.analysis.standard.StandardTokenizer (constructor)

Introduction

On this page you can find example usage of the org.apache.lucene.analysis.standard.StandardTokenizer constructor.

Prototype

public StandardTokenizer(AttributeFactory factory) 

Document

Creates a new StandardTokenizer with a given org.apache.lucene.util.AttributeFactory
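
Before the collected examples below, here is a minimal, self-contained sketch of the factory-based constructor shown in the prototype. It assumes a recent Lucene release (5.x or later), in which a Tokenizer receives its input via setReader() rather than a constructor argument; the class name and sample text are made up for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws IOException {
        // Create the tokenizer with an explicit AttributeFactory, as in the prototype above.
        StandardTokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);

        // In recent Lucene versions the input is supplied via setReader(), not the constructor.
        tokenizer.setReader(new StringReader("The quick brown fox jumps over the lazy dog"));

        // Standard token-stream consumption loop.
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}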

Usage

From source file:net.sf.jtmt.summarizers.SummaryAnalyzer.java

License:Apache License

@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new PorterStemFilter(new StopFilter(false, // enable_position_increment_default == false, for backward compat
            new LowerCaseFilter(new NumericTokenFilter(new StandardFilter(new StandardTokenizer(reader)))),
            stopset));
}

From source file:org.alfresco.repo.search.impl.lucene.analysis.AlfrescoStandardAnalyser.java

License:Open Source License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new AlfrescoStandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    result = new ISOLatin1AccentFilter(result);
    return result;
}

From source file:org.apache.solr.analysis.HTMLStripStandardTokenizerFactory.java

License:Apache License

public Tokenizer create(Reader input) {
    return new StandardTokenizer(new HTMLStripReader(input)) {
        @Override
        public void reset(Reader reader) throws IOException {
            super.reset(new HTMLStripReader(reader));
        }
    };
}

From source file:org.apache.uima.lucas.indexer.Tokenizer.java

License:Apache License

public TokenStream tokenize(TokenStream tokenStream, AnnotationDescription description) throws IOException {
    String tokenizer = description.getTokenizer();
    if (tokenizer != null && !tokenizer.equals(TOKENIZER_CAS)) {
        String tokenStreamAsString = tokenStreamStringConcatenator.tokenStreamToStringWithDelimiter(tokenStream,
                " ");
        StringReader stringReader = new StringReader(tokenStreamAsString);
        if (tokenizer.equals(TOKENIZER_WHITESPACE))
            tokenStream = new WhitespaceTokenizer(stringReader);
        else if (tokenizer.equals(TOKENIZER_STANDARD))
            tokenStream = new StandardTokenizer(stringReader);
    }

    return tokenStream;
}

From source file:org.eurekastreams.commons.search.analysis.HashTagTextStemmerIndexingAnalyzer.java

License:Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param inReader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader inReader) {
    // collection to hold hashtagged keywords
    List<String> hashTaggedKeywords = new ArrayList<String>();

    // this reader will replace all hashtags with our marker text
    Reader reader = CharacterReplacementStreamBuilder.buildReplacementReader(inReader, '#',
            INDEXED_HASHTAG_PREFIX);
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);

    // now remove our hashtag prefixes and store the hashtagged keywords in a set
    result = new PrefixedTokenRemoverDuplicatorAndExtractorTokenizer(result, INDEXED_HASHTAG_PREFIX, "#",
            hashTaggedKeywords);
    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new EnglishPorterFilterFactory().create(result);
    result = new WordListInjectionTokenizer(hashTaggedKeywords, result);

    return result;
}

From source file:org.eurekastreams.commons.search.analysis.HashTagTextStemmerSearchAnalyzer.java

License:Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param inReader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader inReader) {
    // collection to hold hashtagged keywords
    List<String> hashTaggedKeywords = new ArrayList<String>();

    // this reader will replace all hashtags with our marker text
    Reader reader = CharacterReplacementStreamBuilder.buildReplacementReader(inReader, '#',
            INDEXED_HASHTAG_PREFIX);
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);

    // now remove our hashtag prefixes and store the hashtagged keywords in a set
    result = new PrefixedTokenRemoverAndExtractorTokenizer(result, INDEXED_HASHTAG_PREFIX, "#",
            hashTaggedKeywords);
    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new EnglishPorterFilterFactory().create(result);
    result = new WordListInjectionTokenizer(hashTaggedKeywords, result);

    return result;
}

From source file:org.eurekastreams.commons.search.analysis.TextStemmerAnalyzer.java

License:Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param reader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    TokenStream tokenStream = new StandardTokenizer(reader);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new EnglishPorterFilterFactory().create(result);
    return result;
}

From source file:org.eurekastreams.commons.search.analysis.TextSynonymStemmerAnalyzer.java

License:Apache License

/**
 * Tokenize the stream.
 *
 * @param fieldName
 *            the name of the field
 * @param reader
 *            the reader
 * @return the stream
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    TokenStream tokenStream = new StandardTokenizer(reader);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
    result = new SynonymTokenFilter(result, SynonymMapFactory.getSynonymMap(), MAX_NUMBER_OF_SYNONYMS);
    result = new EnglishPorterFilterFactory().create(result);
    return result;
}

From source file:org.exist.indexing.lucene.DMLBSAccentAnalyzer.java

License:Apache License

/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new org.exist.indexing.lucene.DMLBSAccentFilter(result);
    //result = new ASCIIFoldingFilter(result);
    result = new StopFilter(result, stopSet);
    return result;
}

From source file:org.lexevs.dao.index.indexer.LuceneLoaderCode.java

License:Open Source License

public static PerFieldAnalyzerWrapper getAnaylzer() {

    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    //add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);

    //treat as string field by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
                // swallow the error: the norm analyzer is optional and simply is not registered
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }

    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    Analyzer qualifierAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }

    };
    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}