List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
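Before the project-specific examples below, here is a minimal sketch of the consume contract every caller follows: reset() before the first incrementToken(), end() after the last token, then close(). It assumes a recent Lucene release where StandardAnalyzer has a no-argument constructor; the analyzer, field name, and sample text are illustrative only, not taken from any of the projects listed here.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        // Illustrative analyzer and field name; any Analyzer is consumed the same way.
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("body", new StringReader("breaking news, world news"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // mandatory before the first incrementToken()
            while (stream.incrementToken()) {  // advance to the next token
                System.out.println(term.toString());
            }
            stream.end();                      // record end-of-stream state (final offsets)
        }                                      // try-with-resources closes the TokenStream
    }
}

In current Lucene versions, skipping reset() typically surfaces as an IllegalStateException reporting a TokenStream contract violation, which is why every example below calls it before the incrementToken() loop.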
From source file:org.cosmo.common.util.WordUtil.java
License:Apache License
public static void main(String[] args) throws Exception {
    StringReader reader = new StringReader(
            "CNN, CNN news, CNN.com, CNN TV, news, news online, breaking news, U.S. news, world news, weather, business, CNN Money, sports, politics, law, technology, entertainment, education, travel, health, special reports, autos, developing story, news video, CNN Intl");
    /*
    LetterTokenizer tokenizer = new LetterTokenizer(reader);
    AttributeSource filter = new StopFilter(true, tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    while (filter.hasAttributes()) {
        Attribute attribute = filter.captureState();
        System.out.println(attribute);
    }
    */
    StopAnalyzer analyzer = new StopAnalyzer(Index.Version);
    Set<String> uniqueTerms = new HashSet();
    TokenStream tokenStream = analyzer.reusableTokenStream("anyting", reader);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
        uniqueTerms.add(term.term());
    }
    tokenStream.end();
    tokenStream.close();
    System.out.println(Arrays.toString(uniqueTerms.toArray()));
}
From source file:org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new NGramAnalyzer(3, 3);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    // TokenStream stream = new NGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 1, 2);
    stream.reset();
    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }
    stream.end();
    stream.close();
}
From source file:org.dbpedia.spotlight.lucene.analysis.PhoneticAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    String myString = "cancer";
    Analyzer analyzer = new PhoneticAnalyzer(Version.LUCENE_36, SpotlightConfiguration.DEFAULT_STOPWORDS);
    System.out.println("Analyzing: \"" + myString + "\"");
    StringReader reader = new StringReader(myString);
    TokenStream stream = analyzer.tokenStream("field", reader);
    stream.reset();
    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println("token: " + stream);
    }
    stream.end();
    stream.close();
}
From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param reader        a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, reader);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (word.trim().length() == 0) {
                continue;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java
License:Apache License
private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
        Reader reader = new StringReader(suggestion);
        ts = this.suggestionAnalyzer.tokenStream("", reader);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            if (word != null && word.length() > 0) {
                return word;
            }
        }
    } catch (Exception ex) {
        if (this.field != null) {
            LOG.error(String.format("Error executing analyzer for field: {0} in DiceSuggester on suggestion: {1}",
                    this.field, suggestion), ex);
        } else if (this.fieldTypeName != null) {
            LOG.error(String.format("Error executing analyzer for field type: {0} in DiceSuggester on suggestion: {1}",
                    this.fieldTypeName, suggestion), ex);
        }
    } finally {
        if (ts != null) {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
    return null;
}
From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param r             a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java
License:Open Source License
/**
 * Parses the name removing unwanted chars from it.
 *
 * @param field
 * @param term
 * @param name
 * @return Query
 */
public static Query analyze(String field, Term term, String name) {
    TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));
    BooleanQuery bQuery = new BooleanQuery();
    WildcardQuery wQuery;
    Set<String> tokens = new HashSet<String>(); // avoids repeated terms.
    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("IOException analyzing string", e);
    }
    for (String text : tokens) {
        wQuery = new WildcardQuery(term.createTerm(text));
        bQuery.add(wQuery, BooleanClause.Occur.MUST);
    }
    return bQuery;
}
From source file:org.easynet.resource.queryparser.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null)
        analyzerIn = getAnalyzer();

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java
License:Open Source License
@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT) : RESULT_LIMIT;

        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();

        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();
                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);

                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);
                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    for (boolean next = tokenStream.incrementToken(); next
                            && (offsetAtt.startOffset() < 1_000_000); next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();
                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }
            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;
        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult()
                .withResults(results)
                .withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression)
                .withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}
From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java
License:Apache License
@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
    IndexService indexService = null;
    if (request.index() != null) {
        indexService = indicesService.indexServiceSafe(request.index());
    }
    Analyzer analyzer = null;
    boolean closeAnalyzer = false;
    String field = null;
    if (request.field() != null) {
        if (indexService == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "No index provided, and trying to analyzer based on a specific field which requires the index parameter");
        }
        FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
        if (fieldMapper != null) {
            if (fieldMapper.isNumeric()) {
                throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                        + "], Analysis requests are not supported on numeric fields");
            }
            analyzer = fieldMapper.indexAnalyzer();
            field = fieldMapper.names().indexName();
        }
    }
    if (field == null) {
        if (indexService != null) {
            field = indexService.queryParserService().defaultField();
        } else {
            field = AllFieldMapper.NAME;
        }
    }
    if (analyzer == null && request.analyzer() != null) {
        if (indexService == null) {
            analyzer = indicesAnalysisService.analyzer(request.analyzer());
        } else {
            analyzer = indexService.analysisService().analyzer(request.analyzer());
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException("failed to find analyzer [" + request.analyzer() + "]");
        }
    } else if (request.tokenizer() != null) {
        TokenizerFactory tokenizerFactory;
        if (indexService == null) {
            TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                    .tokenizerFactoryFactory(request.tokenizer());
            if (tokenizerFactoryFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find global tokenizer under [" + request.tokenizer() + "]");
            }
            tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                    ImmutableSettings.Builder.EMPTY_SETTINGS);
        } else {
            tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
            if (tokenizerFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find tokenizer under [" + request.tokenizer() + "]");
            }
        }
        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
            for (int i = 0; i < request.tokenFilters().length; i++) {
                String tokenFilterName = request.tokenFilters()[i];
                if (indexService == null) {
                    TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                            .tokenFilterFactoryFactory(tokenFilterName);
                    if (tokenFilterFactoryFactory == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find global token filter under [" + request.tokenizer() + "]");
                    }
                    tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                            ImmutableSettings.Builder.EMPTY_SETTINGS);
                } else {
                    tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + request.tokenizer() + "]");
                    }
                }
                if (tokenFilterFactories[i] == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find token filter under [" + request.tokenizer() + "]");
                }
            }
        }
        analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
        closeAnalyzer = true;
    } else if (analyzer == null) {
        if (indexService == null) {
            analyzer = Lucene.STANDARD_ANALYZER;
        } else {
            analyzer = indexService.analysisService().defaultIndexAnalyzer();
        }
    }
    if (analyzer == null) {
        throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
    }

    List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, request.text());
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                    offset.endOffset(), type.type()));
        }
        stream.end();
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                // ignore
            }
        }
        if (closeAnalyzer) {
            analyzer.close();
        }
    }
    return new AnalyzeResponse(tokens);
}