List of usage examples for the StandardTokenizer constructor of org.apache.lucene.analysis.standard.StandardTokenizer
public StandardTokenizer(AttributeFactory factory)
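Before the per-project examples below, a minimal, self-contained sketch of this factory-based constructor. This snippet is illustrative only, not taken from any of the listed source files; it assumes Lucene 5.x or later, where AttributeFactory lives in org.apache.lucene.util and the input is supplied via setReader:

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class StandardTokenizerExample {
    public static void main(String[] args) throws Exception {
        // Build the tokenizer with the default attribute factory
        // (the same factory several examples below pass in explicitly).
        StandardTokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(new StringReader("The quick brown fox."));

        // Tokens are read back through the CharTermAttribute.
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}

The examples that follow wrap the same tokenizer inside Analyzer.createComponents (or the older tokenStream API) and chain additional TokenFilters behind it.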
From source file:org.lexevs.dao.index.metadata.BaseMetaDataLoader.java
License:Open Source License
public static Analyzer getMetadataAnalyzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, true);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(normPrefix_ + "propertyValue", temp);
        } catch (NoClassDefFoundError e) {
            // norm is not available
            normEnabled_ = false;
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(
                        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp);
    }

    // these fields just get simple analyzing.
    List<String> dividerList = new ArrayList<String>();
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    analyzerPerField.put("parentContainers", sa);

    // no stop words, default character removal set.
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET),
            analyzerPerField);
    return analyzer;
}
From source file:org.lexevs.dao.indexer.lucene.analyzers.SnowballAnalyzerTest.java
License:Open Source License
@Test
public void testDontKeepOrigional() throws Exception {
    Analyzer temp = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new StandardFilter(source);
            filter = new LowerCaseFilter(filter);
            filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
            filter = new SnowballFilter(filter, "English");
            return new TokenStreamComponents(source, filter);
        }
    };

    String input = new String("The trees have Leaves!");
    String[] output = { "tree", "have", "leav" };
    BaseTokenStreamTestCase.assertAnalyzesTo(temp, input, output);
}
From source file:org.meresco.lucene.suggestion.ShingleAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(reader);
    TokenStream src = new LowerCaseFilter(source);
    ShingleFilter filter = new ShingleFilter(src, this.minShingleSize, this.maxShingleSize);
    return new TokenStreamComponents(source, filter);
}
From source file:org.xbib.elasticsearch.index.analysis.skos.SKOSAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(String fileName, Reader reader) {
    if (expansionType.equals(ExpansionType.URI)) {
        final KeywordTokenizer src = new KeywordTokenizer(reader);
        TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types);
        tok = new LowerCaseFilter(tok);
        return new TokenStreamComponents(src, tok);
    } else {
        final StandardTokenizer src = new StandardTokenizer(reader);
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new StandardFilter(src);
        // prior to this we get the classic behavior, standardfilter does it for us.
        tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, stopwords);
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
From source file:phoneticsearch.lucene.DefaultAnalyzer.java
License:Apache License
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    /* token initialization */
    final Tokenizer source = new StandardTokenizer(reader);
    //final Tokenizer source = new NGramTokenizer(reader, 2, 12);
    //---------------------------------------------------------------------
    /* remove elisions */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove articles and adjectives */
    filter = new StopFilter(filter, stopWords);
    /* remove accents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    if (withFrPhonetic || withMetaphone) {
        //final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList("any")));
        //filter = new BeiderMorseFilter(filter, new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true), languages);
        //filter = new DoubleMetaphoneFilter(filter, 8, true);
        filter = new FrDoubleMetaphoneFilter(filter, 8, true, withFrPhonetic, withMetaphone);
    }
    filter = new PrefixTokenFilter(filter, 6);
    return new TokenStreamComponents(source, filter);
}
From source file:ro.calin.snowball.SnowballAnalyzer.java
License:Apache License
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter} and a {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return result;
}
From source file:uk.nhs.cfh.dsp.yasb.indexgenerator.analyser.SynonymAnalyser.java
License:Apache License
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new SynonymFilter(
            new StopFilter(new LowerCaseFilter(new StandardFilter(new StandardTokenizer(reader))),
                    StandardAnalyzer.STOP_WORDS),
            synonymEngine);
    return stream;
}
From source file:uoc.dedup.document.fingerprintCharikar.java
License:Open Source License
/**
 * Calculate the fingerprint.
 * Split the text into shingles and generate a Rabin hash for each token; from the result
 * we generate a final fingerprint vector.
 * @return fingerprint in a string
 */
public String calculateFingerprint() {
    totalTokens = 0;
    totalnGramTokens = 0;
    TokenStream tk = null;
    if (this.useStemming()) {
        this.analyzer = analyzerCache.newAnalyzer(this.language);
        tk = this.analyzer.tokenStream("fingerprint", reader);
    } else {
        tk = new StandardTokenizer(reader);
    }
    ShingleMatrixFilter tokens = new ShingleMatrixFilter(tk, 1, this.getMAXGRAMS(), new Character(' '));

    // Put the tokens in a map and select the most important terms.
    try {
        while (true) {
            Token token = tokens.next();
            if (token == null) {
                break;
            }
            int numtokens = token.term().split(" ").length;
            if (numtokens == 1) {
                this.add(token.term(), this.m); // Add a token to the token frequency list
                //System.out.println(token.term());
                totalTokens++;
            } else if (numtokens >= this.MIMGRAMS) {
                //System.out.println(token.term());
                this.add(token.term(), this.nGrams);
                totalnGramTokens++; // Count the n-gram tokens
            }
        }
        tokens.close();
        this.createTopTerms(this.m, this.getTokensTop(), this.totalTokens);
        // Calculate the fingerprint vector
        this.calculateVectorFingerprint(this.nGrams, this.totalnGramTokens);
        tk.close();
    } catch (IOException e) {
        System.out.println("Error getTokens: " + e.getMessage());
    }
    vFingerprint = this.simHash.getFingerprint();
    this.fingerprint2String();
    return this.getFingerprint();
}
From source file:uoc.language.SpanishAnalyzer.java
License:Apache License
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter}, a {@link StopFilter} and a {@link SpanishStemFilter}.
 */
public final TokenStream tokenStream(String mode, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new LengthFilter(result, 3, 30);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopTable);
    result = new StandardFilter(result);
    //if (this.reader != null && !mode.equalsIgnoreCase("fingerprint")) {
    result = new uocSpanishSteemer(result, this.reader); // dictionary-based stemmer
    //}
    return result;
}