Example usage for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream.end().

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() has returned false (using the new TokenStream API).
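
Before the examples below, here is a minimal sketch of the standard consumer workflow around end(): reset(), then incrementToken() until it returns false, then end(), then close(). This is an illustrative sketch, not taken from the examples on this page; it assumes a recent Lucene version in which StandardAnalyzer has a no-argument constructor, and the field name "body" and the sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndExample {
    public static void main(String[] args) throws IOException {
        // Illustrative analyzer and input; any Analyzer/field/text combination follows the same pattern.
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Some sample text")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                      // must be called before the first incrementToken()
            while (ts.incrementToken()) {    // consume every token
                System.out.println(termAtt.toString());
            }
            ts.end();                        // signals end of stream, e.g. so the final offset is recorded
        }                                    // try-with-resources closes the stream
        analyzer.close();
    }
}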

Usage

From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java

License:Apache License

/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param reader a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }

    TokenStream ts = analyzer.tokenStream(fieldName, reader);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (word.trim().length() == 0) {
                continue;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java

License:Apache License

/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java

License:Open Source License

/**
 * Parses the name, removing unwanted characters from it.
 *
 * @param field
 * @param term
 * @param name
 * @return Query
 */
public static Query analyze(String field, Term term, String name) {
    TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));

    BooleanQuery bQuery = new BooleanQuery();
    WildcardQuery wQuery;

    Set<String> tokens = new HashSet<String>(); // avoids repeated terms.

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("IOException analyzing string", e);
    }

    for (String text : tokens) {
        wQuery = new WildcardQuery(term.createTerm(text));
        bQuery.add(wQuery, BooleanClause.Occur.MUST);
    }

    return bQuery;
}

From source file:org.easynet.resource.queryparser.QueryParserBase.java

License:Apache License

protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null)
        analyzerIn = getAnalyzer();

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        termAtt.fillBytesRef();
        if (source.incrementToken())
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java

License:Apache License

@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
    IndexService indexService = null;
    if (request.index() != null) {
        indexService = indicesService.indexServiceSafe(request.index());
    }
    Analyzer analyzer = null;
    boolean closeAnalyzer = false;
    String field = null;
    if (request.field() != null) {
        if (indexService == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "No index provided, and trying to analyzer based on a specific field which requires the index parameter");
        }
        FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
        if (fieldMapper != null) {
            if (fieldMapper.isNumeric()) {
                throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                        + "], Analysis requests are not supported on numeric fields");
            }
            analyzer = fieldMapper.indexAnalyzer();
            field = fieldMapper.names().indexName();

        }
    }
    if (field == null) {
        if (indexService != null) {
            field = indexService.queryParserService().defaultField();
        } else {
            field = AllFieldMapper.NAME;
        }
    }
    if (analyzer == null && request.analyzer() != null) {
        if (indexService == null) {
            analyzer = indicesAnalysisService.analyzer(request.analyzer());
        } else {
            analyzer = indexService.analysisService().analyzer(request.analyzer());
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "failed to find analyzer [" + request.analyzer() + "]");
        }
    } else if (request.tokenizer() != null) {
        TokenizerFactory tokenizerFactory;
        if (indexService == null) {
            TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                    .tokenizerFactoryFactory(request.tokenizer());
            if (tokenizerFactoryFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find global tokenizer under [" + request.tokenizer() + "]");
            }
            tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                    ImmutableSettings.Builder.EMPTY_SETTINGS);
        } else {
            tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
            if (tokenizerFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find tokenizer under [" + request.tokenizer() + "]");
            }
        }
        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
            for (int i = 0; i < request.tokenFilters().length; i++) {
                String tokenFilterName = request.tokenFilters()[i];
                if (indexService == null) {
                    TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                            .tokenFilterFactoryFactory(tokenFilterName);
                    if (tokenFilterFactoryFactory == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find global token filter under [" + request.tokenizer() + "]");
                    }
                    tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                            ImmutableSettings.Builder.EMPTY_SETTINGS);
                } else {
                    tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + request.tokenizer() + "]");
                    }
                }
                if (tokenFilterFactories[i] == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find token filter under [" + request.tokenizer() + "]");
                }
            }
        }
        analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
        closeAnalyzer = true;
    } else if (analyzer == null) {
        if (indexService == null) {
            analyzer = Lucene.STANDARD_ANALYZER;
        } else {
            analyzer = indexService.analysisService().defaultIndexAnalyzer();
        }
    }
    if (analyzer == null) {
        throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
    }

    List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, request.text());
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                    offset.endOffset(), type.type()));
        }
        stream.end();
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                // ignore
            }
        }
        if (closeAnalyzer) {
            analyzer.close();
        }
    }

    return new AnalyzeResponse(tokens);
}

From source file:org.elasticsearch.docvalues.string.DVStringFieldMapper.java

License:Apache License

@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
    // luckily this is single thread access and we dont need a thread local.
    hasDocValsNow = false;
    super.parseCreateField(context, fields);
    hasDocValsNow = true;
    String value = null;
    if (context.externalValueSet()) {
        value = (String) context.externalValue();
    } else {
        for (Field f : fields) {
            Class<?> fClass = f.getClass();
            if (fClass == Field.class || fClass == TextField.class || fClass == StringField.class) {
                value = f.stringValue();
                break;
            }
        }
    }
    if (value != null) {
        TokenStream stream = docValuesAnalyzer.analyzer().tokenStream(null, new StringReader(value));
        CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String token = cattr.toString();
            // take the first token and make it a doc value
            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(token)));
            break;
        }
        stream.end();
        stream.close();
    }
}

From source file:org.elasticsearch.index.analysis.NumericAnalyzerTests.java

License:Apache License

@Test
public void testAttributeEqual() throws IOException {
    final int precisionStep = 8;
    final double value = randomDouble();
    NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);

    final TokenStream ts1 = analyzer.tokenStream("dummy", String.valueOf(value));
    final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
    ts2.setDoubleValue(value);
    final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
    final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
    final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
    final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);
    ts1.reset();
    ts2.reset();
    while (ts1.incrementToken()) {
        assertThat(ts2.incrementToken(), is(true));
        assertThat(posInc1, equalTo(posInc2));
        // can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
        assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
        assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
    }
    assertThat(ts2.incrementToken(), is(false));
    ts1.end();
    ts2.end();
}

From source file:org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java

License:Apache License

private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);

    stream1.reset();
    stream2.reset();

    assertThat(stream1.incrementToken(), equalTo(true));
    assertThat(stream2.incrementToken(), equalTo(true));
    assertThat(Integer.signum(term1.toString().compareTo(term2.toString())),
            equalTo(Integer.signum(comparison)));
    assertThat(stream1.incrementToken(), equalTo(false));
    assertThat(stream2.incrementToken(), equalTo(false));

    stream1.end();
    stream2.end();

    stream1.close();
    stream2.close();
}

From source file:org.elasticsearch.index.mapper.core.TokenCountFieldMapper.java

License:Apache License

/**
 * Count position increments in a token stream.  Package private for testing.
 * @param tokenStream token stream to count
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(TokenStream tokenStream) throws IOException {
    try {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    } finally {
        tokenStream.close();
    }
}

From source file:org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java

License:Apache License

@Override
public Query fieldQuery(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = indexAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            Term term = names().createIndexNameTerm(termAtt.toString());
            q.add(new TermQuery(term), BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}