List of usage examples for org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal)
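The four-argument constructor shown above is the modern signature (Lucene 7.3+), where preserveOriginal keeps the untruncated token alongside its grams; most of the examples below target older releases that still took a Version and/or Side argument. A minimal usage sketch against a recent Lucene (the class name EdgeNGramDemo is ours, for illustration):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("lucene"));
        // front grams of length 1..4, plus the original token (preserveOriginal = true)
        TokenStream grams = new EdgeNGramTokenFilter(source, 1, 4, true);
        CharTermAttribute term = grams.addAttribute(CharTermAttribute.class);
        grams.reset();
        while (grams.incrementToken()) {
            System.out.println(term.toString()); // l, lu, luc, luce, lucene
        }
        grams.end();
        grams.close();
    }
}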
From source file:com.jaeksoft.searchlib.analysis.filter.EdgeNGramFilter.java
License:Open Source License
@Override
public TokenStream create(TokenStream input) {
    return new EdgeNGramTokenFilter(input, side, min, max);
}
From source file:es.eucm.ead.editor.indexes.Index.java
License:Open Source License
/** Purges the contents of this modelIndex */
public final void clear() {
    idsToNodes.clear();
    modelsToIds.clear();
    searchIndex = new RAMDirectory();
    searchAnalyzer = new ReusableAnalyzerBase() {
        @Override
        protected TokenStreamComponents createComponents(String s, Reader reader) {
            KeywordTokenizer source = new KeywordTokenizer(reader);
            TokenFilter filter = new LowerCaseFilter(Version.LUCENE_36, source);
            filter = new EdgeNGramTokenFilter(filter, EdgeNGramTokenFilter.Side.BACK, 2, 50);
            return new TokenStreamComponents(source, filter);
        }
    };
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, searchAnalyzer);
    try {
        indexWriter = new IndexWriter(searchIndex, config);
    } catch (IOException e) {
        Gdx.app.error("index", "Could not initialize search index", e);
    }
}
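Note the Side.BACK, 2, 50 arguments: against Lucene 3.6 this indexes the trailing 2- to 50-character substrings of each lowercased keyword token, so a search can match the end of an identifier as well as its start. Later Lucene versions drop Side.BACK; the Elasticsearch example at the bottom of this page shows the ReverseStringFilter workaround.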
From source file:org.ala.lucene.Autocompleter.java
License:Open Source License
@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete, boolean createNewIndex)
        throws CorruptIndexException, IOException {
    // build a dictionary (from the spell package)
    IndexReader sourceReader = IndexReader.open(sourceDirectory);
    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);

    // code from org.apache.lucene.search.spell.SpellChecker.indexDictionary(Dictionary)
    IndexWriter.unlock(autoCompleteDirectory);

    // use a custom analyzer so we can do EdgeNGramFiltering
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(SolrUtils.BIE_LUCENE_VERSION, new Analyzer() {
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final StandardTokenizer src = new StandardTokenizer(SolrUtils.BIE_LUCENE_VERSION, reader);
            // the filter chain must start from src, the tokenizer handed to TokenStreamComponents
            TokenStream result = new StandardFilter(SolrUtils.BIE_LUCENE_VERSION, src);
            result = new LowerCaseFilter(SolrUtils.BIE_LUCENE_VERSION, result);
            result = new StopFilter(SolrUtils.BIE_LUCENE_VERSION, result,
                    new CharArraySet(SolrUtils.BIE_LUCENE_VERSION,
                            new HashSet<String>(Arrays.asList(ENGLISH_STOP_WORDS)), true));
            result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
            return new TokenStreamComponents(src, result) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    super.setReader(reader);
                }
            };
        }
    });
    if (createNewIndex) {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    } else {
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    }
    indexWriterConfig.setMaxBufferedDocs(150);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, indexWriterConfig);
    // writer.setMergeFactor(300);

    // go through every word, storing the original word (incl. n-grams)
    // and the number of times it occurs
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();

    Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();
        int len = word.length();
        if (len < 3) {
            continue; // too short; we bail, but "too long" is fine...
        }
        if (wordsMap.containsKey(word)) {
            throw new IllegalStateException("This should never happen in Lucene 2.3.2");
            // wordsMap.put(word, wordsMap.get(word) + 1);
        } else {
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
        }
    }

    for (String word : wordsMap.keySet()) {
        // ok index the word
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED)); // grammed
        doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO,
                Field.Index.NOT_ANALYZED)); // count
        writer.addDocument(doc);
    }

    sourceReader.close();

    // close writer
    writer.forceMerge(1);
    writer.close();

    // re-open our reader
    reOpenReader();
}
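Because each word is stored with its docFreq in COUNT_FIELD, the query side of this autocompleter can rank suggestions by how many source documents contain the word rather than by raw term frequency.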
From source file:org.apache.solr.analysis.EdgeNGramFilterFactory.java
License:Apache License
public EdgeNGramTokenFilter create(TokenStream input) {
    return new EdgeNGramTokenFilter(input, side, minGramSize, maxGramSize);
}
From source file:org.codesearch.searcher.server.util.STAutocompleteLuceneAnalyzer.java
License:Open Source License
/** {@inheritDoc} */
@Override
public TokenStream tokenStream(String string, Reader reader) {
    TokenStream result = new StandardTokenizer(IndexConstants.LUCENE_VERSION, reader);
    result = new StandardFilter(IndexConstants.LUCENE_VERSION, result);
    result = new LowerCaseFilter(IndexConstants.LUCENE_VERSION, result);
    result = new EdgeNGramTokenFilter(result, Side.FRONT, 1, 20);
    return result;
}
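Overriding Analyzer.tokenStream(String, Reader) like this is the Lucene 3.x contract; from 4.0 on, tokenStream is final and custom analyzers override createComponents instead, as in the two examples above.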
From source file:org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory.java
License:Apache License
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_43) && esVersion.onOrAfter(org.elasticsearch.Version.V_0_90_2)) {
        /*
         * We added this in 0.90.2, but 0.90.1 already used LUCENE_43, so we cannot rely on the
         * Lucene version alone. Yet if somebody uses 0.90.2 or higher with an earlier Lucene
         * version, we should also use the deprecated version.
         */
        final Version version = this.version == Version.LUCENE_43 ? Version.LUCENE_44 : this.version; // always use 4.4 or higher
        TokenStream result = tokenStream;
        // side=BACK is not supported anymore, but applying ReverseStringFilter up-front
        // and again after the token filter has the same effect
        if (side == Side.BACK) {
            result = new ReverseStringFilter(version, result);
        }
        result = new EdgeNGramTokenFilter(version, result, minGram, maxGram);
        if (side == Side.BACK) {
            result = new ReverseStringFilter(version, result);
        }
        return result;
    }
    return new EdgeNGramTokenFilter(version, tokenStream, side, minGram, maxGram);
}
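Since Side.BACK is unsupported from Lucene 4.4 onward, the reverse-wrap trick in the Elasticsearch factory above is the standard way to get back-grams. A standalone sketch of the same idea, assuming a recent Lucene (7.3+) where ReverseStringFilter takes only its input stream and EdgeNGramTokenFilter uses the four-argument constructor from the top of this page (the class name BackGramsDemo is ours, for illustration):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class BackGramsDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("lucene"));
        // reverse, take FRONT grams, reverse again: the net effect is BACK grams
        TokenStream stream = new ReverseStringFilter(source);
        stream = new EdgeNGramTokenFilter(stream, 2, 3, false);
        stream = new ReverseStringFilter(stream);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // ne, ene
        }
        stream.end();
        stream.close();
    }
}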