Example usage for org.apache.lucene.analysis.standard StandardTokenizer StandardTokenizer

Introduction

On this page you can find example usage of the org.apache.lucene.analysis.standard.StandardTokenizer constructor StandardTokenizer(AttributeFactory).

Prototype

public StandardTokenizer(AttributeFactory factory) 

Document

Creates a new StandardTokenizer with a given org.apache.lucene.util.AttributeFactory
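
Before the collected examples, here is a minimal, self-contained sketch of this constructor in use. It assumes a Lucene 5.x-or-later API, where the input is supplied via setReader(Reader) rather than through the constructor; the input text is made up for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws IOException {
        // Create the tokenizer with the default attribute factory.
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(new StringReader("Hello, Lucene world!"));

        // The term attribute exposes the text of the current token.
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);

        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}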

Usage

From source file:fi.nationallibrary.ndl.solrvoikko2.TestApp.java

License:Open Source License

public static void main(String[] args) throws IOException {
    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    Voikko voikko = null;
    try {
        ConcurrentMap<String, List<CompoundToken>> cache = new ConcurrentLinkedHashMap.Builder<String, List<CompoundToken>>()
                .maximumWeightedCapacity(100).build();

        voikko = new Voikko("fi-x-morphoid");

        StringReader reader = new StringReader("");
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(reader);
        tokenizer.reset();

        VoikkoFilter voikkoFilter = new VoikkoFilter(tokenizer, voikko, true,
                VoikkoFilter.DEFAULT_MIN_WORD_SIZE, VoikkoFilter.DEFAULT_MIN_SUBWORD_SIZE,
                VoikkoFilter.DEFAULT_MAX_SUBWORD_SIZE, true, cache, 0);

        String text;
        System.out.println();
        System.out.println("Enter word or phrase");
        while ((text = stdin.readLine()) != null) {
            List<Analysis> analysisList = voikko.analyze(text);
            if (analysisList.isEmpty()) {
                System.out.println("No analysis available");
            }
            for (Analysis analysis : analysisList) {
                System.out.println("Analysis:");
                if (analysis.containsKey(BASEFORM)) {
                    WordComponent component = new WordComponent();
                    component.component = analysis.get(BASEFORM);
                    component.startInOriginal = 0;
                    component.lengthInOriginal = text.length();
                    print(component);
                }
                if (analysis.containsKey(WORDBASES)) {
                    System.out.println(analysis.get(WORDBASES));
                }
            }

            // Close and reset through the whole filter chain before reusing it,
            // so VoikkoFilter's state is cleared along with the tokenizer's.
            voikkoFilter.close();
            reader = new StringReader(text);
            tokenizer.setReader(reader);
            voikkoFilter.reset();

            System.out.println("\nVoikkoFilter results:");
            while (voikkoFilter.incrementToken()) {
                System.out.println(
                        voikkoFilter.termAtt.toString() + " [" + voikkoFilter.posIncAtt.getPositionIncrement()
                                + ":" + voikkoFilter.offsetAtt.startOffset() + ":"
                                + voikkoFilter.offsetAtt.endOffset() + "]");
            }

            System.out.println();
            System.out.println("Enter word or phrase");
        }
        voikkoFilter.close();
    } finally {
        // Guard against a failed Voikko construction.
        if (voikko != null) {
            voikko.terminate();
        }
    }
}
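
A note on the close/setReader/reset sequence in the loop above: Lucene token streams follow a fixed lifecycle, and reuse is only defined in terms of that lifecycle. The sketch below restates the contract in isolation; it assumes the Lucene 5.x+ API, and the input strings are made up for illustration.

// Consume/reuse cycle per the TokenStream workflow:
// reset() -> incrementToken()* -> end() -> close(), then setReader() to reuse.
Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
TokenStream stream = tokenizer; // any filters would wrap the tokenizer here
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

for (String input : new String[] { "first input", "second input" }) {
    tokenizer.setReader(new StringReader(input));
    stream.reset();                  // required before the first incrementToken()
    while (stream.incrementToken()) {
        System.out.println(termAtt); // read token attributes here
    }
    stream.end();                    // records the final offset state
    stream.close();                  // releases resources; setReader() may follow
}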

From source file:fi.nationallibrary.ndl.solrvoikko2.VoikkoTest.java

License:Open Source License

/**
 * Execute Voikko analysis and return the results as a string.
 *
 * @param term String to analyze
 * @return Comma-separated list of results
 * @throws IOException
 */
protected final String getVoikkoWords(String term) throws IOException {
    ConcurrentMap<String, List<CompoundToken>> cache = new ConcurrentLinkedHashMap.Builder<String, List<CompoundToken>>()
            .maximumWeightedCapacity(100).build();

    Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader(term));

    Voikko voikko = new Voikko("fi-x-morphoid");
    VoikkoFilter voikkoFilter = new VoikkoFilter(tokenizer, voikko, true, VoikkoFilter.DEFAULT_MIN_WORD_SIZE,
            VoikkoFilter.DEFAULT_MIN_SUBWORD_SIZE, VoikkoFilter.DEFAULT_MAX_SUBWORD_SIZE, true, cache, 0);

    String results = "";

    // Reset the whole chain (filter plus tokenizer) before consuming it.
    voikkoFilter.reset();
    while (voikkoFilter.incrementToken()) {
        if (!results.isEmpty()) {
            results += ",";
        }
        results += voikkoFilter.termAtt.toString() + " [" + voikkoFilter.posIncAtt.getPositionIncrement() + ":"
                + voikkoFilter.offsetAtt.startOffset() + ":" + voikkoFilter.offsetAtt.endOffset() + "]";
    }
    voikkoFilter.end();
    voikkoFilter.close();

    return results;
}
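
Note: the examples from this point on come from projects built against older Lucene versions, where StandardTokenizer took its Reader directly in the constructor (and StandardFilter, StopFilter and friends had correspondingly older signatures). They illustrate the same tokenizer in older filter chains rather than the AttributeFactory constructor shown in the prototype above.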

From source file:fr.xebia.demo.hibernate.search.analysis.SimpleEnglishAnalyzer.java

License:Apache License

@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {

    if (fieldName == null)
        throw new IllegalArgumentException("fieldName must not be null");
    if (reader == null)
        throw new IllegalArgumentException("reader must not be null");

    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PorterStemFilter(result);
    return result;
}

From source file:ie.cmrc.smtx.lucene.analysis.EnglishKeywordAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, EnglishAnalyzer.getDefaultStopSet());
    filter = new KStemFilter(filter);
    //filter = new PorterStemFilter(filter);
    filter = new ASCIIFoldingFilter(filter);
    filter = new ConcatFilter(filter);
    return new Analyzer.TokenStreamComponents(source, filter);
}

From source file:ie.cmrc.smtx.lucene.analysis.SmartKeywordAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName, String language, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, this.getStopWordsSet(language));
    filter = getMinimalStemFilter(language, filter);
    filter = new ASCIIFoldingFilter(filter);
    filter = new ConcatFilter(filter);
    return new TokenStreamComponents(source, filter);
}

From source file:ie.cmrc.smtx.lucene.analysis.StandardEuropeanAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName, String language, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, this.getStopWordsSet(language));
    filter = getStemFilter(language, filter);
    filter = new ASCIIFoldingFilter(filter);
    return new TokenStreamComponents(source, filter);
}

From source file:magoffin.matt.lucene.BaseAnalyzer.java

License:Open Source License

@Override
public TokenStream tokenStream(String field, Reader reader) {
    char fieldChar = field.charAt(0);
    TokenStream result = null;
    switch (fieldChar) {
    case FIELD_GENERAL_TEXT:
        result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new RegexpSplitFilter(result, "[@.]"); // tokenize emails
        result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
        // result = new PorterStemFilter(result);
        break;

    default:
        result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        break;
    }
    return result;
}

From source file:magoffin.matt.ma2.lucene.StandardMatteAnalyzer.java

License:Open Source License

private TokenStream standardFilters(Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);

    // split words with periods, which StandardTokenizer does not do
    result = new TokenFilter(result) {

        Queue<Token> queue = new LinkedList<Token>();

        @SuppressWarnings("deprecation")
        @Override
        public Token next() throws IOException {
            if (queue.size() > 0) {
                return queue.poll();
            }
            Token t = input.next();
            if (t == null) {
                return null;
            }
            if (!WORD_WITH_PERIOD.matcher(t.term()).find()) {
                return t;
            }
            String[] split = t.term().split("\\.");
            int startPos = t.startOffset();
            for (int i = 0; i < split.length; i++) {
                Token next = new Token(split[i], startPos, startPos + split[i].length());
                queue.offer(next);
                startPos = startPos + split[i].length() + 1;
            }
            return queue.poll();
        }
    };
    result = new LowerCaseFilter(result);
    return result;
}

From source file:magoffin.matt.tidbits.lucene.StandardTidbitsAnalyzer.java

License:Open Source License

private TokenStream standardFilters(Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}

From source file:net.mumie.cocoon.search.GermanEntityAnalyzer.java

License:Open Source License

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         StandardFilter, EntityFilter, LowerCaseFilter, StopFilter, GermanStemFilter
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new EntityFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    result = new GermanStemFilter(result, exclusionSet);
    return result;
}