List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
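All of the examples below follow the consumer workflow documented for TokenStream: obtain the attributes, call reset(), loop on incrementToken() until it returns false, call end(), and finally close() the stream. A minimal sketch of that pattern (the analyzer instance, field name, and input text are placeholders, not taken from any of the examples):

// Minimal consumer sketch, assuming an Analyzer instance named "analyzer";
// the field name "body" and the input text are placeholders.
TokenStream ts = analyzer.tokenStream("body", "some text to tokenize");
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
try {
    ts.reset();                      // mandatory before the first incrementToken()
    while (ts.incrementToken()) {    // returns false once the stream is exhausted
        System.out.println(termAtt.toString());
    }
    ts.end();                        // records end-of-stream state (e.g. final offset)
} finally {
    ts.close();                      // releases resources held by the stream
}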
From source file:org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param reader        a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader reader, Map<String, Flt> termWeightMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, reader);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (word.trim().length() == 0) {
                continue;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java
License:Apache License
private String getAnalyzerResult(String suggestion) {
    TokenStream ts = null;
    try {
        Reader reader = new StringReader(suggestion);
        ts = this.suggestionAnalyzer.tokenStream("", reader);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            if (word != null && word.length() > 0) {
                return word;
            }
        }
    } catch (Exception ex) {
        // String.format uses %s placeholders
        if (this.field != null) {
            LOG.error(String.format(
                    "Error executing analyzer for field: %s in DiceSuggester on suggestion: %s",
                    this.field, suggestion), ex);
        } else if (this.fieldTypeName != null) {
            LOG.error(String.format(
                    "Error executing analyzer for field type: %s in DiceSuggester on suggestion: %s",
                    this.fieldTypeName, suggestion), ex);
        }
    } finally {
        if (ts != null) {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
    return null;
}
From source file:org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License:Apache License
/**
 * Adds term weights found by tokenizing text from reader into the Map words
 *
 * @param r             a source of text to be tokenized
 * @param termWeightMap a Map of terms and their weights
 * @param fieldName     Used by analyzer for any special per-field analysis
 */
private void addTermWeights(Reader r, Map<String, Flt> termWeightMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttr = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsedPerField) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            BytesRef payload = payloadAttr.getPayload();
            float tokenWeight = 1.0f; // 1.0 or payload if set and a payload field
            if (isPayloadField(fieldName) && payload != null) {
                tokenWeight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
            }
            // increment frequency
            Flt termWeight = termWeightMap.get(word);
            if (termWeight == null) {
                termWeightMap.put(word, new Flt(tokenWeight));
            } else {
                termWeight.x += tokenWeight;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.drftpd.vfs.index.lucene.LuceneUtils.java
License:Open Source License
/**
 * Parses the name, removing unwanted chars from it.
 *
 * @param field
 * @param term
 * @param name
 * @return Query
 */
public static Query analyze(String field, Term term, String name) {
    TokenStream ts = LuceneEngine.ANALYZER.tokenStream(field, new StringReader(name));
    BooleanQuery bQuery = new BooleanQuery();
    WildcardQuery wQuery;
    Set<String> tokens = new HashSet<String>(); // avoids repeated terms.

    // get the CharTermAttribute from the TokenStream
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("IOException analyzing string", e);
    }

    for (String text : tokens) {
        wQuery = new WildcardQuery(term.createTerm(text));
        bQuery.add(wQuery, BooleanClause.Occur.MUST);
    }

    return bQuery;
}
From source file:org.easynet.resource.queryparser.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null) {
        analyzerIn = getAnalyzer();
    }

    TokenStream source = null;
    try {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java
License:Open Source License
@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT) : RESULT_LIMIT;
        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();
                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);
                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);
                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    for (boolean next = tokenStream.incrementToken();
                            next && (offsetAtt.startOffset() < 1_000_000);
                            next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();
                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }
            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;
        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult()
                .withResults(results)
                .withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression)
                .withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}
From source file:org.eclipse.help.internal.search.QueryBuilder.java
License:Open Source License
/**
 * Get a list of tokens corresponding to a search word or phrase
 *
 * @return List of String
 */
private List<String> analyzeText(Analyzer analyzer, String fieldName, String text) {
    List<String> words = new ArrayList<String>(1);
    Reader reader = new StringReader(text);
    TokenStream tStream = analyzer.tokenStream(fieldName, reader);
    CharTermAttribute termAttribute = (CharTermAttribute) tStream.getAttribute(CharTermAttribute.class);
    try {
        while (tStream.incrementToken()) {
            String term = termAttribute.toString();
            words.add(term);
        }
        reader.close();
    } catch (IOException ioe) {
        // ignore
    }
    return words;
}
From source file:org.eclipse.recommenders.test.codesearch.rcp.indexer.analyzer.AnalysisTestBase.java
License:Open Source License
private List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = Lists.newArrayList();
    TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));
    try {
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(TermAttribute.class).term());
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
    }
    return result;
}
From source file:org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction.java
License:Apache License
@Override
protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) throws ElasticsearchException {
    IndexService indexService = null;
    if (request.index() != null) {
        indexService = indicesService.indexServiceSafe(request.index());
    }
    Analyzer analyzer = null;
    boolean closeAnalyzer = false;
    String field = null;
    if (request.field() != null) {
        if (indexService == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "No index provided, and trying to analyze based on a specific field which requires the index parameter");
        }
        FieldMapper<?> fieldMapper = indexService.mapperService().smartNameFieldMapper(request.field());
        if (fieldMapper != null) {
            if (fieldMapper.isNumeric()) {
                throw new ElasticsearchIllegalArgumentException("Can't process field [" + request.field()
                        + "], Analysis requests are not supported on numeric fields");
            }
            analyzer = fieldMapper.indexAnalyzer();
            field = fieldMapper.names().indexName();
        }
    }
    if (field == null) {
        if (indexService != null) {
            field = indexService.queryParserService().defaultField();
        } else {
            field = AllFieldMapper.NAME;
        }
    }
    if (analyzer == null && request.analyzer() != null) {
        if (indexService == null) {
            analyzer = indicesAnalysisService.analyzer(request.analyzer());
        } else {
            analyzer = indexService.analysisService().analyzer(request.analyzer());
        }
        if (analyzer == null) {
            throw new ElasticsearchIllegalArgumentException(
                    "failed to find analyzer [" + request.analyzer() + "]");
        }
    } else if (request.tokenizer() != null) {
        TokenizerFactory tokenizerFactory;
        if (indexService == null) {
            TokenizerFactoryFactory tokenizerFactoryFactory = indicesAnalysisService
                    .tokenizerFactoryFactory(request.tokenizer());
            if (tokenizerFactoryFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find global tokenizer under [" + request.tokenizer() + "]");
            }
            tokenizerFactory = tokenizerFactoryFactory.create(request.tokenizer(),
                    ImmutableSettings.Builder.EMPTY_SETTINGS);
        } else {
            tokenizerFactory = indexService.analysisService().tokenizer(request.tokenizer());
            if (tokenizerFactory == null) {
                throw new ElasticsearchIllegalArgumentException(
                        "failed to find tokenizer under [" + request.tokenizer() + "]");
            }
        }

        TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
        if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
            for (int i = 0; i < request.tokenFilters().length; i++) {
                String tokenFilterName = request.tokenFilters()[i];
                if (indexService == null) {
                    TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService
                            .tokenFilterFactoryFactory(tokenFilterName);
                    if (tokenFilterFactoryFactory == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find global token filter under [" + tokenFilterName + "]");
                    }
                    tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName,
                            ImmutableSettings.Builder.EMPTY_SETTINGS);
                } else {
                    tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                    if (tokenFilterFactories[i] == null) {
                        throw new ElasticsearchIllegalArgumentException(
                                "failed to find token filter under [" + tokenFilterName + "]");
                    }
                }
                if (tokenFilterFactories[i] == null) {
                    throw new ElasticsearchIllegalArgumentException(
                            "failed to find token filter under [" + tokenFilterName + "]");
                }
            }
        }
        analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
        closeAnalyzer = true;
    } else if (analyzer == null) {
        if (indexService == null) {
            analyzer = Lucene.STANDARD_ANALYZER;
        } else {
            analyzer = indexService.analysisService().defaultIndexAnalyzer();
        }
    }
    if (analyzer == null) {
        throw new ElasticsearchIllegalArgumentException("failed to find analyzer");
    }

    List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(field, request.text());
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(),
                    offset.endOffset(), type.type()));
        }
        stream.end();
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                // ignore
            }
        }
        if (closeAnalyzer) {
            analyzer.close();
        }
    }

    return new AnalyzeResponse(tokens);
}
From source file:org.elasticsearch.analysis.common.CompoundAnalysisTests.java
License:Apache License
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = createAnalysisModule(settings);
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    TokenStream stream = analyzer.tokenStream("", text);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}