List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
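addAttribute registers (or retrieves) the requested attribute on the stream and returns the single shared instance that incrementToken() refills for each token; every example below follows the same lifecycle of add attributes, reset(), loop on incrementToken(), then end() and close. A minimal self-contained sketch of that pattern follows; the StandardAnalyzer, the field name "body", and the sample text are placeholders chosen for illustration, not taken from any of the source files below.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // addAttribute must be called before reset()/incrementToken(); it returns the
        // shared attribute instance that the stream updates on every token.
        try (TokenStream ts = analyzer.tokenStream("body", "Hello attribute-based token streams")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            ts.end();
        }
        analyzer.close();
    }
}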
From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param reader        a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, reader);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (word.trim().length() == 0) {
                continue;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java
License:Apache License
private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
        Reader reader = new StringReader(suggestion);
        ts = this.suggestionAnalyzer.tokenStream("", reader);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            if (word != null && word.length() > 0) {
                return word;
            }
        }
    } catch (Exception ex) {
        if (this.field != null) {
            LOG.error(String.format("Error executing analyzer for field: %s in DiceSuggester on suggestion: %s",
                    this.field, suggestion), ex);
        } else if (this.fieldTypeName != null) {
            LOG.error(String.format("Error executing analyzer for field type: %s in DiceSuggester on suggestion: %s",
                    this.fieldTypeName, suggestion), ex);
        }
    } finally {
        if (ts != null) {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
    return null;
}
From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param r             a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java
License:Open Source License
/**
 * Parses the name removing unwanted chars from it.
 *
 * @param field
 * @param term
 * @param name
 * @return Query
 */
public static Query analyze(String field, Term term, String name) {
    TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));
    BooleanQuery bQuery = new BooleanQuery();
    WildcardQuery wQuery;
    Set<String> tokens = new HashSet<String>(); // avoids repeated terms.

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("IOException analyzing string", e);
    }

    for (String text : tokens) {
        wQuery = new WildcardQuery(term.createTerm(text));
        bQuery.add(wQuery, BooleanClause.Occur.MUST);
    }

    return bQuery;
}
From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java
License:Open Source License
@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT) : RESULT_LIMIT;

        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();

                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);

                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);

                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    for (boolean next = tokenStream.incrementToken(); next
                            && (offsetAtt.startOffset() < 1_000_000); next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();
                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }
            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;
        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult()
                .withResults(results)
                .withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression)
                .withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}
From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java
License:Apache License
@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
    IndexService indexService = null;
    if (request.index() != null) {
        indexService = indicesService.indexServiceSafe(request.index());
    }
    Analyzer analyzer = null;
    boolean closeAnalyzer = false;
    String field = null;
    if (request.field() != null) {
        if (indexService == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "No index provided, and trying to analyzer based on a specific field which requires the index parameter");
        }
        FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
        if (fieldMapper != null) {
            if (fieldMapper.isNumeric()) {
                throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                        + "], Analysis requests are not supported on numeric fields");
            }
            analyzer = fieldMapper.indexAnalyzer();
            field = fieldMapper.names().indexName();
        }
    }
    if (field == null) {
        if (indexService != null) {
            field = indexService.queryParserService().defaultField();
        } else {
            field = AllFieldMapper.NAME;
        }
    }
    if (analyzer == null && request.analyzer() != null) {
        if (indexService == null) {
            analyzer = indicesAnalysisService.analyzer(request.analyzer());
        } else {
            analyzer = indexService.analysisService().analyzer(request.analyzer());
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "failed to find analyzer [" + request.analyzer() + "]");
        }
    } else if (request.tokenizer() != null) {
        TokenizerFactory tokenizerFactory;
        if (indexService == null) {
            TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                    .tokenizerFactoryFactory(request.tokenizer());
            if (tokenizerFactoryFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find global tokenizer under [" + request.tokenizer() + "]");
            }
            tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                    ImmutableSettings.Builder.EMPTY_SETTINGS);
        } else {
            tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
            if (tokenizerFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find tokenizer under [" + request.tokenizer() + "]");
            }
        }

        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
            for (int i = 0; i < request.tokenFilters().length; i++) {
                String tokenFilterName = request.tokenFilters()[i];
                if (indexService == null) {
                    TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                            .tokenFilterFactoryFactory(tokenFilterName);
                    if (tokenFilterFactoryFactory == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find global token filter under [" + tokenFilterName + "]");
                    }
                    tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                            ImmutableSettings.Builder.EMPTY_SETTINGS);
                } else {
                    tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + tokenFilterName + "]");
                    }
                }
                if (tokenFilterFactories[i] == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find token filter under [" + tokenFilterName + "]");
                }
            }
        }

        analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
        closeAnalyzer = true;
    } else if (analyzer == null) {
        if (indexService == null) {
            analyzer = Lucene.STANDARD_ANALYZER;
        } else {
            analyzer = indexService.analysisService().defaultIndexAnalyzer();
        }
    }
    if (analyzer == null) {
        throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
    }

    List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, request.text());
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                    offset.endOffset(), type.type()));
        }
        stream.end();
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                // ignore
            }
        }
        if (closeAnalyzer) {
            analyzer.close();
        }
    }

    return new AnalyzeResponse(tokens);
}
From source file:org.elasticsearch.analysis.common.CommonAnalysisPlugin.java
License:Apache License
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
            input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false,
            input -> new DelimitedPayloadTokenFilter(input, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false,
            input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false,
            input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    // TODO deprecate edgeNGram
    filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false,
            input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false,
            input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false,
            input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton("limit", false,
            input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
    // TODO deprecate nGram
    filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false,
            input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true,
            ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false,
            input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false,
            input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false,
            input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false,
            input -> new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false,
            input -> new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                            | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE,
                    null)));
    return filters;
}
From source file:org.elasticsearch.analysis.common.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = createAnalysisModule(settings);
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", text);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}
From source file:org.elasticsearch.analysis.common.SynonymsAnalysisTests.java
License:Apache License
private void match(String analyzerName, String source, String target) throws IOException {
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", source);
    stream.reset();

    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    StringBuilder sb = new StringBuilder();
    while (stream.incrementToken()) {
        sb.append(termAtt.toString()).append(" ");
    }

    MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}
From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java
License:Apache License
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
}