Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.standard.StandardTokenizer constructor StandardTokenizer(AttributeFactory).

Prototype

public StandardTokenizer(AttributeFactory factory) 

Document

Creates a new StandardTokenizer with a given org.apache.lucene.util.AttributeFactory
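
Before the project examples below, here is a minimal, self-contained sketch of this constructor in action. It assumes a Lucene 5.x+ style API, where the input text is supplied via setReader rather than a constructor argument; the class name StandardTokenizerDemo and the sample text are placeholders, not taken from any project listed here.

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Create the tokenizer with the default attribute factory (the argument shown in the prototype above).
        StandardTokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(new StringReader("Lucene StandardTokenizer example text"));

        // Attribute holding the term text of the current token.
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);

        // Standard TokenStream consumption contract: reset, incrementToken loop, end, close.
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}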

Usage

From source file:org.lexevs.dao.index.metadata.BaseMetaDataLoader.java

License:Open Source License

public static Analyzer getMetadataAnalyzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, true);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(normPrefix_ + "propertyValue", temp);
        } catch (NoClassDefFoundError e) {
            // norm is not available
            normEnabled_ = false;
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp);
    }

    // these fields just get simple analysis.
    List<String> dividerList = new ArrayList<String>();
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    analyzerPerField.put("parentContainers", sa);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);

    return analyzer;
}

From source file:org.lexevs.dao.indexer.lucene.analyzers.SnowballAnalyzerTest.java

License:Open Source License

@Test
public void testDontKeepOrigional() throws Exception {
    Analyzer temp = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new StandardFilter(source);
            filter = new LowerCaseFilter(filter);
            filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
            filter = new SnowballFilter(filter, "English");
            return new TokenStreamComponents(source, filter);
        }
    };

    String input = "The trees have Leaves!";
    String[] output = { "tree", "have", "leav" };
    BaseTokenStreamTestCase.assertAnalyzesTo(temp, input, output);
}

From source file:org.meresco.lucene.suggestion.ShingleAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream src = new LowerCaseFilter(source);
    ShingleFilter filter = new ShingleFilter(src, this.minShingleSize, this.maxShingleSize);
    return new TokenStreamComponents(source, filter);
}

From source file:org.xbib.elasticsearch.index.analysis.skos.SKOSAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fileName, Reader reader) {
    if (expansionType.equals(ExpansionType.URI)) {
        final KeywordTokenizer src = new KeywordTokenizer(reader);
        TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types);
        tok = new LowerCaseFilter(tok);
        return new TokenStreamComponents(src, tok);
    } else {
        final StandardTokenizer src = new StandardTokenizer(reader);
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new StandardFilter(src);
        // prior to this we get the classic behavior; StandardFilter does it for us.
        tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, stopwords);
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}

From source file:phoneticsearch.lucene.DefaultAnalyzer.java

License:Apache License

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    /* initialize the tokenizer */
    final Tokenizer source = new StandardTokenizer(reader);
    //final Tokenizer source = new NGramTokenizer(reader, 2, 12);
    //---------------------------------------------------------------------
    /* remove elisions */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove articles and adjectives (stop words) */
    filter = new StopFilter(filter, stopWords);
    /* remove accents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase the tokens */
    filter = new LowerCaseFilter(filter);

    if (withFrPhonetic || withMetaphone) {
        //final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList("any")));
        //filter = new BeiderMorseFilter(filter, new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true), languages);
        //filter = new DoubleMetaphoneFilter(filter, 8, true);
        filter = new FrDoubleMetaphoneFilter(filter, 8, true, withFrPhonetic, withMetaphone);
    }
    filter = new PrefixTokenFilter(filter, 6);
    return new TokenStreamComponents(source, filter);
}

From source file:ro.calin.snowball.SnowballAnalyzer.java

License:Apache License

/** Constructs a {@link StandardTokenizer} filtered by a {@link
    StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return result;
}

From source file:uk.nhs.cfh.dsp.yasb.indexgenerator.analyser.SynonymAnalyser.java

License:Apache License

@Override
public TokenStream tokenStream(String fieldName, Reader reader) {

    TokenStream stream = new SynonymFilter(
            new StopFilter(new LowerCaseFilter(new StandardFilter(new StandardTokenizer(reader))),
                    StandardAnalyzer.STOP_WORDS),
            synonymEngine);

    return stream;
}

From source file:uoc.dedup.document.fingerprintCharikar.java

License:Open Source License

/**
 * Calculates the fingerprint.
 * Splits the text into shingles, generates a Rabin hash for each token, and
 * builds the final fingerprint vector from the results.
 * @return the fingerprint as a string
 */
public String calculateFingerprint() {
    totalTokens = 0;
    totalnGramTokens = 0;
    TokenStream tk = null;

    if (this.useStemming()) {
        this.analyzer = analyzerCache.newAnalyzer(this.language);
        tk = this.analyzer.tokenStream("fingerprint", reader);
    } else {
        tk = new StandardTokenizer(reader);
    }
    ShingleMatrixFilter tokens = new ShingleMatrixFilter(tk, 1, this.getMAXGRAMS(), new Character(' '));

    //Put the tokens in a map and select the most important terms.
    try {
        while (true) {
            Token token = tokens.next();
            if (token == null) {
                break;
            }
            int numtokens = token.term().split(" ").length;
            if (numtokens == 1) {
                this.add(token.term(), this.m); // add the token to the token frequency list
                //System.out.println(token.term());
                totalTokens++;
            } else if (numtokens >= this.MIMGRAMS) {
                //System.out.println(token.term());
                this.add(token.term(), this.nGrams);
                totalnGramTokens++; //Count the ngram tokens            
            }
        }
        tokens.close();
        this.createTopTerms(this.m, this.getTokensTop(), this.totalTokens);
        //Calculate the fingerprint vector
        this.calculateVectorFingerprint(this.nGrams, this.totalnGramTokens);
        tk.close();
    } catch (IOException e) {
        System.out.println("Error getTokens: " + e.getMessage());
    }
    vFingerprint = this.simHash.getFingerprint();
    this.fingerprint2String();
    return this.getFingerprint();
}

From source file:uoc.language.SpanishAnalyzer.java

License:Apache License

/** Constructs a {@link StandardTokenizer} filtered by a {@link
 * StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}
 * and a {@link SpanishStemFilter}. */
public final TokenStream tokenStream(String mode, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new LengthFilter(result, 3, 30);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new StandardFilter(result);
    //if (this.reader != null && !mode.equalsIgnoreCase("fingerprint")) {        
    result = new uocSpanishSteemer(result, this.reader);
    //} // dictionary-based stemmer
    return result;
}