List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, that is, after incrementToken() returned false (using the new TokenStream API).
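The listings below all follow the same consumption contract around end(): reset the stream, call incrementToken() until it returns false, call end() so final state (such as the final offset) is recorded, then close the stream. A minimal, self-contained sketch of that pattern is shown here; the field name, text, and choice of StandardAnalyzer are illustrative only, and it assumes a Lucene version whose StandardAnalyzer has a no-argument constructor.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenStreamEndSketch {
        public static void main(String[] args) throws IOException {
            Analyzer analyzer = new StandardAnalyzer();
            // Field name and text are illustrative.
            TokenStream ts = analyzer.tokenStream("body", new StringReader("some example text"));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            try {
                ts.reset();                       // mandatory before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                ts.end();                         // called once incrementToken() has returned false
            } finally {
                ts.close();                       // release resources; safe to call after end()
            }
            analyzer.close();
        }
    }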
From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License:Apache License
    /**
     * Adds term weights found by tokenizing text from reader into the Map words
     *
     * @param reader a source of text to be tokenized
     * @param termWeightMap a Map of terms and their weights
     * @param fieldName Used by analyzer for any special per-field analysis
     */
    private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
        if (analyzer == null) {
            throw new UnsupportedOperationException(
                    "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
        }
        TokenStream ts = analyzer.tokenStream(fieldName, reader);
        try {
            int tokenCount = 0;
            // for every token
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
            TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                String word = termAtt.toString();
                tokenCount++;
                if (tokenCount > maxNumTokensParsedPerField) {
                    break;
                }
                if (word.trim().length() == 0) {
                    continue;
                }
                if (isNoiseWord(word)) {
                    continue;
                }

                BytesRef payload = payloadAttr.getPayload();
                float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
                if (isPayloadField(fieldName) && payload != null) {
                    tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
                }
                // increment frequency
                Flt termWeight = termWeightMap.get(word);
                if (termWeight == null) {
                    termWeightMap.put(word, new Flt(tokenWeight));
                } else {
                    termWeight.x += tokenWeight;
                }
            }
            ts.end();
        } finally {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License:Apache License
    /**
     * Adds term weights found by tokenizing text from reader into the Map words
     *
     * @param r a source of text to be tokenized
     * @param termWeightMap a Map of terms and their weights
     * @param fieldName Used by analyzer for any special per-field analysis
     */
    private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
        if (analyzer == null) {
            throw new UnsupportedOperationException(
                    "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
        }
        TokenStream ts = analyzer.tokenStream(fieldName, r);
        try {
            int tokenCount = 0;
            // for every token
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                String word = termAtt.toString();
                tokenCount++;
                if (tokenCount > maxNumTokensParsedPerField) {
                    break;
                }
                if (isNoiseWord(word)) {
                    continue;
                }

                BytesRef payload = payloadAttr.getPayload();
                float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
                if (isPayloadField(fieldName) && payload != null) {
                    tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
                }
                // increment frequency
                Flt termWeight = termWeightMap.get(word);
                if (termWeight == null) {
                    termWeightMap.put(word, new Flt(tokenWeight));
                } else {
                    termWeight.x += tokenWeight;
                }
            }
            ts.end();
        } finally {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java
License:Open Source License
    /**
     * Parses the name removing unwanted chars from it.
     *
     * @param field
     * @param term
     * @param name
     * @return Query
     */
    public static Query analyze(String field, Term term, String name) {
        TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));
        BooleanQuery bQuery = new BooleanQuery();
        WildcardQuery wQuery;
        Set<String> tokens = new HashSet<String>(); // avoids repeated terms.

        // get the CharTermAttribute from the TokenStream
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

        try {
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(termAtt.toString());
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            logger.error("IOException analyzing string", e);
        }

        for (String text : tokens) {
            wQuery = new WildcardQuery(term.createTerm(text));
            bQuery.add(wQuery, BooleanClause.Occur.MUST);
        }

        return bQuery;
    }
From source file:org.easynet.resource.queryparser.QueryParserBase.java
License:Apache License
    protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
        if (analyzerIn == null)
            analyzerIn = getAnalyzer();

        TokenStream source = null;
        try {
            source = analyzerIn.tokenStream(field, part);
            source.reset();

            TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
            BytesRef bytes = termAtt.getBytesRef();

            if (!source.incrementToken())
                throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
            termAtt.fillBytesRef();
            if (source.incrementToken())
                throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
            source.end();
            return BytesRef.deepCopyOf(bytes);
        } catch (IOException e) {
            throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
        } finally {
            IOUtils.closeWhileHandlingException(source);
        }
    }
From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java
License:Apache License
    @Override
    protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
        IndexService indexService = null;
        if (request.index() != null) {
            indexService = indicesService.indexServiceSafe(request.index());
        }
        Analyzer analyzer = null;
        boolean closeAnalyzer = false;
        String field = null;
        if (request.field() != null) {
            if (indexService == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "No index provided, and trying to analyzer based on a specific field which requires the index parameter");
            }
            FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
            if (fieldMapper != null) {
                if (fieldMapper.isNumeric()) {
                    throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                            + "], Analysis requests are not supported on numeric fields");
                }
                analyzer = fieldMapper.indexAnalyzer();
                field = fieldMapper.names().indexName();
            }
        }
        if (field == null) {
            if (indexService != null) {
                field = indexService.queryParserService().defaultField();
            } else {
                field = AllFieldMapper.NAME;
            }
        }
        if (analyzer == null && request.analyzer() != null) {
            if (indexService == null) {
                analyzer = indicesAnalysisService.analyzer(request.analyzer());
            } else {
                analyzer = indexService.analysisService().analyzer(request.analyzer());
            }
            if (analyzer == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find analyzer [" + request.analyzer() + "]");
            }
        } else if (request.tokenizer() != null) {
            TokenizerFactory tokenizerFactory;
            if (indexService == null) {
                TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                        .tokenizerFactoryFactory(request.tokenizer());
                if (tokenizerFactoryFactory == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find global tokenizer under [" + request.tokenizer() + "]");
                }
                tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                        ImmutableSettings.Builder.EMPTY_SETTINGS);
            } else {
                tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
                if (tokenizerFactory == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find tokenizer under [" + request.tokenizer() + "]");
                }
            }
            TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
            if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
                tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
                for (int i = 0; i < request.tokenFilters().length; i++) {
                    String tokenFilterName = request.tokenFilters()[i];
                    if (indexService == null) {
                        TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                                .tokenFilterFactoryFactory(tokenFilterName);
                        if (tokenFilterFactoryFactory == null) {
                            throw new ElasticsearchIllegalArgumentException(
                                    "failed to find global token filter under [" + request.tokenizer() + "]");
                        }
                        tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                                ImmutableSettings.Builder.EMPTY_SETTINGS);
                    } else {
                        tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                        if (tokenFilterFactories[i] == null) {
                            throw new ElasticsearchIllegalArgumentException(
                                    "failed to find token filter under [" + request.tokenizer() + "]");
                        }
                    }
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + request.tokenizer() + "]");
                    }
                }
            }
            analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
            closeAnalyzer = true;
        } else if (analyzer == null) {
            if (indexService == null) {
                analyzer = Lucene.STANDARD_ANALYZER;
            } else {
                analyzer = indexService.analysisService().defaultIndexAnalyzer();
            }
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
        }

        List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(field, request.text());
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            int position = 0;
            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    position = position + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                        offset.endOffset(), type.type()));
            }
            stream.end();
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    // ignore
                }
            }
            if (closeAnalyzer) {
                analyzer.close();
            }
        }

        return new AnalyzeResponse(tokens);
    }
From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java
License:Apache License
    @Override
    protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
        // luckily this is single thread access and we dont need a thread local.
        hasDocValsNow = false;
        super.parseCreateField(context, fields);
        hasDocValsNow = true;
        String value = null;
        if (context.externalValueSet()) {
            value = (String) context.externalValue();
        } else {
            for (Field f : fields) {
                Class<?> fClass = f.getClass();
                if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) {
                    value = f.stringValue();
                    break;
                }
            }
        }
        if (value != null) {
            TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value));
            CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String token = cattr.toString();
                // take the first token and make it a doc value
                fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token)));
                break;
            }
            stream.end();
            stream.close();
        }
    }
From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java
License:Apache License
    @Test
    public void testAttributeEqual() throws IOException {
        final int precisionStep = 8;
        final double value = randomDouble();
        NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);

        final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value));
        final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
        ts2.setDoubleValue(value);

        // take each attribute from its own stream so the comparisons below are meaningful
        final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
        final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
        final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
        final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);

        ts1.reset();
        ts2.reset();
        while (ts1.incrementToken()) {
            assertThat(ts2.incrementToken(), is(true));
            assertThat(posInc1, equalTo(posInc2));
            // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
            assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
            assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
        }
        assertThat(ts2.incrementToken(), is(false));
        ts1.end();
        ts2.end();
    }
From source file:org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java
License:Apache License
    private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
        CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
        CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);

        stream1.reset();
        stream2.reset();

        assertThat(stream1.incrementToken(), equalTo(true));
        assertThat(stream2.incrementToken(), equalTo(true));
        assertThat(Integer.signum(term1.toString().compareTo(term2.toString())),
                equalTo(Integer.signum(comparison)));
        assertThat(stream1.incrementToken(), equalTo(false));
        assertThat(stream2.incrementToken(), equalTo(false));

        stream1.end();
        stream2.end();

        stream1.close();
        stream2.close();
    }
From source file:org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java
License:Apache License
    /**
     * Count position increments in a token stream. Package private for testing.
     * @param tokenStream token stream to count
     * @return number of position increments in a token stream
     * @throws IOException if tokenStream throws it
     */
    static int countPositions(TokenStream tokenStream) throws IOException {
        try {
            int count = 0;
            PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                count += position.getPositionIncrement();
            }
            tokenStream.end();
            count += position.getPositionIncrement();
            return count;
        } finally {
            tokenStream.close();
        }
    }
From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License:Apache License
    @Override
    public Query fieldQuery(String value, @Nullable QueryParseContext context) {
        // Use HashSplitterSearch* analysis and post-process it to create the real query
        TokenStream tok = null;
        try {
            tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
            tok.reset();
        } catch (IOException e) {
            return null;
        }
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        BooleanQuery q = new BooleanQuery();
        try {
            while (tok.incrementToken()) {
                Term term = names().createIndexNameTerm(termAtt.toString());
                q.add(new TermQuery(term), BooleanClause.Occur.MUST);
            }
            tok.end();
            tok.close();
        } catch (IOException e) {
            e.printStackTrace();
            q = null;
        }
        return q;
    }