List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
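All of the examples below follow the consumer workflow documented for TokenStream: obtain the attributes, call reset(), loop on incrementToken() until it returns false, call end(), and finally close() the stream. A minimal sketch of that pattern (the analyzer instance, field name, and input text are placeholders, not taken from any of the examples):

// Minimal consumer sketch, assuming an Analyzer instance named "analyzer";
// the field name "body" and the input text are placeholders.
TokenStream ts = analyzer.tokenStream("body", "some text to tokenize");
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
try {
    ts.reset();                      // mandatory before the first incrementToken()
    while (ts.incrementToken()) {    // returns false once the stream is exhausted
        System.out.println(termAtt.toString());
    }
    ts.end();                        // records end-of-stream state (e.g. final offset)
} finally {
    ts.close();                      // releases resources held by the stream
}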
From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param reader        a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, reader);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (word.trim().length() == 0) {
                continue;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java
License:Apache License
private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
        Reader reader = new StringReader(suggestion);
        ts = this.suggestionAnalyzer.tokenStream("", reader);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            if (word != null && word.length() > 0) {
                return word;
            }
        }
    } catch (Exception ex) {
        // String.format uses %s placeholders
        if (this.field != null) {
            LOG.error(String.format(
                    "Error executing analyzer for field: %s in DiceSuggester on suggestion: %s",
                    this.field, suggestion), ex);
        } else if (this.fieldTypeName != null) {
            LOG.error(String.format(
                    "Error executing analyzer for field type: %s in DiceSuggester on suggestion: %s",
                    this.fieldTypeName, suggestion), ex);
        }
    } finally {
        if (ts != null) {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
    return null;
}
From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param r             a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java
License:Open Source License
/**
 * Parses the name, removing unwanted chars from it.
 *
 * @param field
 * @param term
 * @param name
 * @return Query
 */
public static Query analyze(String field, Term term, String name) {
    TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));
    BooleanQuery bQuery = new BooleanQuery();
    WildcardQuery wQuery;
    Set<String> tokens = new HashSet<String>(); // avoids repeated terms.

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("IOException analyzing string", e);
    }

    for (String text : tokens) {
        wQuery = new WildcardQuery(term.createTerm(text));
        bQuery.add(wQuery, BooleanClause.Occur.MUST);
    }

    return bQuery;
}
From source file:org.easynet.resource.queryparser.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null) {
        analyzerIn = getAnalyzer();
    }

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java
License:Open Source License
@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT) : RESULT_LIMIT;
        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();
                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);
                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);
                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    for (boolean next = tokenStream.incrementToken();
                            next && (offsetAtt.startOffset() < 1_000_000);
                            next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();
                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }
            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;
        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult()
                .withResults(results)
                .withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression)
                .withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}
From source file:org.eclipse.help.internal.search.QueryBuilder.java
License:Open Source License
/**
 * Get a list of tokens corresponding to a search word or phrase
 *
 * @return List of String
 */
private List<String> analyzeText(Analyzer analyzer, String fieldName, String text) {
    List<String> words = new ArrayList<String>(1);
    Reader reader = new StringReader(text);
    TokenStream tStream = analyzer.tokenStream(fieldName, reader);
    CharTermAttribute termAttribute = (CharTermAttribute) tStream.getAttribute(CharTermAttribute.class);
    try {
        while (tStream.incrementToken()) {
            String term = termAttribute.toString();
            words.add(term);
        }
        reader.close();
    } catch (IOException ioe) {
        // ignore
    }
    return words;
}
From source file:org.eclipse.recommenders.test.codesearch.rcp.indexer.analyzer.AnalysisTestBase.java
License:Open Source License
private List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = Lists.newArrayList();
    TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));
    try {
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(TermAttribute.class).term());
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
    }
    return result;
}
From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java
License:Apache License
@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
    IndexService indexService = null;
    if (request.index() != null) {
        indexService = indicesService.indexServiceSafe(request.index());
    }
    Analyzer analyzer = null;
    boolean closeAnalyzer = false;
    String field = null;
    if (request.field() != null) {
        if (indexService == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "No index provided, and trying to analyze based on a specific field which requires the index parameter");
        }
        FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
        if (fieldMapper != null) {
            if (fieldMapper.isNumeric()) {
                throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                        + "], Analysis requests are not supported on numeric fields");
            }
            analyzer = fieldMapper.indexAnalyzer();
            field = fieldMapper.names().indexName();
        }
    }
    if (field == null) {
        if (indexService != null) {
            field = indexService.queryParserService().defaultField();
        } else {
            field = AllFieldMapper.NAME;
        }
    }
    if (analyzer == null && request.analyzer() != null) {
        if (indexService == null) {
            analyzer = indicesAnalysisService.analyzer(request.analyzer());
        } else {
            analyzer = indexService.analysisService().analyzer(request.analyzer());
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "failed to find analyzer [" + request.analyzer() + "]");
        }
    } else if (request.tokenizer() != null) {
        TokenizerFactory tokenizerFactory;
        if (indexService == null) {
            TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                    .tokenizerFactoryFactory(request.tokenizer());
            if (tokenizerFactoryFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find global tokenizer under [" + request.tokenizer() + "]");
            }
            tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                    ImmutableSettings.Builder.EMPTY_SETTINGS);
        } else {
            tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
            if (tokenizerFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find tokenizer under [" + request.tokenizer() + "]");
            }
        }

        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
            for (int i = 0; i < request.tokenFilters().length; i++) {
                String tokenFilterName = request.tokenFilters()[i];
                if (indexService == null) {
                    TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                            .tokenFilterFactoryFactory(tokenFilterName);
                    if (tokenFilterFactoryFactory == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find global token filter under [" + tokenFilterName + "]");
                    }
                    tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                            ImmutableSettings.Builder.EMPTY_SETTINGS);
                } else {
                    tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + tokenFilterName + "]");
                    }
                }
                if (tokenFilterFactories[i] == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find token filter under [" + tokenFilterName + "]");
                }
            }
        }
        analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
        closeAnalyzer = true;
    } else if (analyzer == null) {
        if (indexService == null) {
            analyzer = Lucene.STANDARD_ANALYZER;
        } else {
            analyzer = indexService.analysisService().defaultIndexAnalyzer();
        }
    }
    if (analyzer == null) {
        throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
    }

    List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, request.text());
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                    offset.endOffset(), type.type()));
        }
        stream.end();
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                // ignore
            }
        }
        if (closeAnalyzer) {
            analyzer.close();
        }
    }

    return new AnalyzeResponse(tokens);
}
From source file:org.elasticsearch.analysis.common.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = createAnalysisModule(settings);
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", text);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}