List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer
Constructor: public StandardTokenizer(AttributeFactory factory)
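Before the source-file examples, here is a minimal, self-contained sketch of this constructor in isolation; the input text and class name are illustrative, not taken from any of the sources below. In Lucene 5+ a Tokenizer no longer takes a Reader: input is supplied through setReader(), the stream must be reset() before the first incrementToken(), and end()/close() complete the consumption contract.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerSketch {
    public static void main(String[] args) throws Exception {
        // The factory argument controls how attribute instances are created;
        // DEFAULT_ATTRIBUTE_FACTORY is the same factory the no-arg constructor uses.
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);

        tokenizer.setReader(new StringReader("Hello, StandardTokenizer!"));
        tokenizer.reset(); // mandatory before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt + " [" + offsetAtt.startOffset() + ":" + offsetAtt.endOffset() + "]");
        }
        tokenizer.end();   // records the final offset state
        tokenizer.close();
    }
}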
From source file:fi.nationallibrary.ndl.solrvoikko2.TestApp.java
License:Open Source License
public static void main(String[] args) throws IOException {
    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    Voikko voikko = null;
    try {
        ConcurrentMap<String, List<CompoundToken>> cache = new ConcurrentLinkedHashMap.Builder<String, List<CompoundToken>>()
                .maximumWeightedCapacity(100).build();
        voikko = new Voikko("fi-x-morphoid");
        StringReader reader = new StringReader("");
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(reader);
        tokenizer.reset();
        VoikkoFilter voikkoFilter = new VoikkoFilter(tokenizer, voikko, true, VoikkoFilter.DEFAULT_MIN_WORD_SIZE,
                VoikkoFilter.DEFAULT_MIN_SUBWORD_SIZE, VoikkoFilter.DEFAULT_MAX_SUBWORD_SIZE, true, cache, 0);
        String text;
        System.out.println();
        System.out.println("Enter word or phrase");
        while ((text = stdin.readLine()) != null) {
            List<Analysis> analysisList = voikko.analyze(text);
            if (analysisList.isEmpty()) {
                System.out.println("No analysis available");
            }
            for (Analysis analysis : analysisList) {
                System.out.println("Analysis:");
                if (analysis.containsKey(BASEFORM)) {
                    WordComponent component = new WordComponent();
                    component.component = analysis.get(BASEFORM);
                    component.startInOriginal = 0;
                    component.lengthInOriginal = text.length();
                    print(component);
                }
                if (analysis.containsKey(WORDBASES)) {
                    System.out.println(analysis.get(WORDBASES));
                }
            }
            // feed the current input line to the tokenizer for the filter pass
            tokenizer.close();
            reader = new StringReader(text);
            tokenizer.setReader(reader);
            tokenizer.reset();
            System.out.println("\nVoikkoFilter results:");
            while (voikkoFilter.incrementToken()) {
                System.out.println(voikkoFilter.termAtt.toString() + " ["
                        + voikkoFilter.posIncAtt.getPositionIncrement() + ":"
                        + voikkoFilter.offsetAtt.startOffset() + ":"
                        + voikkoFilter.offsetAtt.endOffset() + "]");
            }
            System.out.println();
            System.out.println("Enter word or phrase");
        }
        voikkoFilter.close();
    } finally {
        if (voikko != null) { // construction may have failed before assignment
            voikko.terminate();
        }
    }
}
From source file:fi.nationallibrary.ndl.solrvoikko2.VoikkoTest.java
License:Open Source License
/**
 * Execute Voikko analysis and return the results in a string.
 *
 * @param term string to analyze
 * @return comma-separated list of results
 * @throws IOException
 */
protected final String getVoikkoWords(String term) throws IOException {
    ConcurrentMap<String, List<CompoundToken>> cache = new ConcurrentLinkedHashMap.Builder<String, List<CompoundToken>>()
            .maximumWeightedCapacity(100).build();
    Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader(term));
    tokenizer.reset();
    Voikko voikko = new Voikko("fi-x-morphoid");
    VoikkoFilter voikkoFilter = new VoikkoFilter(tokenizer, voikko, true, VoikkoFilter.DEFAULT_MIN_WORD_SIZE,
            VoikkoFilter.DEFAULT_MIN_SUBWORD_SIZE, VoikkoFilter.DEFAULT_MAX_SUBWORD_SIZE, true, cache, 0);
    StringBuilder results = new StringBuilder();
    //voikkoFilter.reset();
    while (voikkoFilter.incrementToken()) {
        if (results.length() > 0) {
            results.append(',');
        }
        results.append(voikkoFilter.termAtt.toString()).append(" [")
                .append(voikkoFilter.posIncAtt.getPositionIncrement()).append(':')
                .append(voikkoFilter.offsetAtt.startOffset()).append(':')
                .append(voikkoFilter.offsetAtt.endOffset()).append(']');
    }
    voikkoFilter.close();
    return results.toString();
}
From source file:fr.xebia.demo.hibernate.search.analysis.SimpleEnglishAnalyzer.java
License:Apache License
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    if (fieldName == null) {
        throw new IllegalArgumentException("fieldName must not be null");
    }
    if (reader == null) {
        throw new IllegalArgumentException("reader must not be null");
    }
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new PorterStemFilter(result);
    return result;
}
From source file:ie.cmrc.smtx.lucene.analysis.EnglishKeywordAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, EnglishAnalyzer.getDefaultStopSet());
    filter = new KStemFilter(filter);
    //filter = new PorterStemFilter(filter);
    filter = new ASCIIFoldingFilter(filter);
    filter = new ConcatFilter(filter);
    return new Analyzer.TokenStreamComponents(source, filter);
}
From source file:ie.cmrc.smtx.lucene.analysis.SmartKeywordAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String fieldName, String language, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, this.getStopWordsSet(language));
    filter = getMinimalStemFilter(language, filter);
    filter = new ASCIIFoldingFilter(filter);
    filter = new ConcatFilter(filter);
    return new TokenStreamComponents(source, filter);
}
From source file:ie.cmrc.smtx.lucene.analysis.StandardEuropeanAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String fieldName, String language, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream filter = new StandardFilter(source);
    filter = new LowerCaseFilter(filter);
    filter = new StopFilter(filter, this.getStopWordsSet(language));
    filter = getStemFilter(language, filter);
    filter = new ASCIIFoldingFilter(filter);
    return new TokenStreamComponents(source, filter);
}
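The three analyzers above target the Lucene 4.x createComponents(String, Reader) signature. For comparison, here is a sketch of a similar chain against the Lucene 5+ signature, where the Reader parameter is gone and the framework wires the input via setReader(). The class name ModernKeywordAnalyzer is invented, the imports follow the Lucene 5/6 package layout, StandardFilter is omitted (it became a no-op and was later removed), and the project-local ConcatFilter is left out.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class ModernKeywordAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Lucene 5+: no Reader parameter; the Analyzer calls setReader() on the source.
        Tokenizer source = new StandardTokenizer();
        TokenStream filter = new LowerCaseFilter(source);
        filter = new StopFilter(filter, EnglishAnalyzer.getDefaultStopSet());
        filter = new ASCIIFoldingFilter(filter);
        return new TokenStreamComponents(source, filter);
    }
}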
From source file:magoffin.matt.lucene.BaseAnalyzer.java
License:Open Source License
@Override
public TokenStream tokenStream(String field, Reader reader) {
    char fieldChar = field.charAt(0);
    TokenStream result = null;
    switch (fieldChar) {
    case FIELD_GENERAL_TEXT:
        result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new RegexpSplitFilter(result, "[@.]"); // tokenize emails
        result = new StopFilter(result, StopAnalyzer.ENGLISH_STOP_WORDS);
        // result = new PorterStemFilter(result);
        break;
    default:
        result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        break;
    }
    return result;
}
From source file:magoffin.matt.ma2.lucene.StandardMatteAnalyzer.java
License:Open Source License
private TokenStream standardFilters(Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    // split words with periods, which StandardTokenizer does not do
    result = new TokenFilter(result) {
        Queue<Token> queue = new LinkedList<Token>();

        @SuppressWarnings("deprecation")
        @Override
        public Token next() throws IOException {
            if (queue.size() > 0) {
                return queue.poll();
            }
            Token t = input.next();
            if (t == null) {
                return null;
            }
            if (!WORD_WITH_PERIOD.matcher(t.term()).find()) {
                return t;
            }
            String[] split = t.term().split("\\.");
            int startPos = t.startOffset();
            for (int i = 0; i < split.length; i++) {
                Token next = new Token(split[i], startPos, startPos + split[i].length());
                queue.offer(next);
                startPos = startPos + split[i].length() + 1;
            }
            return queue.poll();
        }
    };
    result = new LowerCaseFilter(result);
    return result;
}
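The anonymous filter above uses the long-deprecated Token/next() API. On the attribute-based API (Lucene 4+), the same splitting idea might look like the sketch below; the class name PeriodSplitFilter is invented here, and the WORD_WITH_PERIOD regex test is replaced with a plain indexOf check.

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Queue;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class PeriodSplitFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final Queue<String> parts = new ArrayDeque<>();
    private int partStart; // start offset of the next queued part in the original text

    PeriodSplitFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!parts.isEmpty()) {
            posIncAtt.setPositionIncrement(1); // queued parts occupy consecutive positions
            emit(parts.poll());
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        String term = termAtt.toString();
        if (term.indexOf('.') < 0) {
            return true; // no period: pass the token through unchanged
        }
        partStart = offsetAtt.startOffset();
        Collections.addAll(parts, term.split("\\."));
        emit(parts.poll()); // first part keeps the original position increment
        return true;
    }

    private void emit(String part) {
        termAtt.setEmpty().append(part);
        offsetAtt.setOffset(partStart, partStart + part.length());
        partStart += part.length() + 1; // +1 skips the period itself
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        parts.clear();
    }
}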
From source file:magoffin.matt.tidbits.lucene.StandardTidbitsAnalyzer.java
License:Open Source License
private TokenStream standardFilters(Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    return result;
}
From source file:net.mumie.cocoon.search.GermanEntityAnalyzer.java
License:Open Source License
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return a TokenStream built from a StandardTokenizer filtered with
 *         StandardFilter, LowerCaseFilter, StopFilter, and GermanStemFilter
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new EntityFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    result = new GermanStemFilter(result, exclusionSet);
    return result;
}
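For reference, a sketch of a comparable German chain on Lucene 5+, where the Analyzer contract moved from tokenStream() to createComponents() and stem exclusions are expressed with SetKeywordMarkerFilter instead of a GermanStemFilter constructor argument. ModernGermanAnalyzer is an invented name, the imports assume the Lucene 5/6 package layout, and the project-specific EntityFilter is left out.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

public class ModernGermanAnalyzer extends Analyzer {
    private final CharArraySet exclusionSet;

    public ModernGermanAnalyzer(CharArraySet exclusionSet) {
        this.exclusionSet = exclusionSet;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream filter = new LowerCaseFilter(source);
        filter = new StopFilter(filter, GermanAnalyzer.getDefaultStopSet());
        // Tokens marked as keywords are skipped by the stemmer, replacing the
        // old GermanStemFilter(result, exclusionSet) exclusion mechanism.
        filter = new SetKeywordMarkerFilter(filter, exclusionSet);
        filter = new GermanStemFilter(filter);
        return new TokenStreamComponents(source, filter);
    }
}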