Example usage for org.apache.lucene.queries.mlt MoreLikeThis setBoost

Introduction

On this page you can find example usage for org.apache.lucene.queries.mlt.MoreLikeThis#setBoost.

Prototype

public void setBoost(boolean boost) 

Document

Sets whether to boost terms in the query based on their "score" or not.
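
The following minimal sketch is not taken from any of the projects listed below; it only illustrates how setBoost is typically combined with setBoostFactor when building a MoreLikeThis query. The index path and the field name "body" are placeholders, and the directory/reader calls assume Lucene 5 or later.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class MoreLikeThisSetBoostExample {
    public static void main(String[] args) throws IOException {
        // Open an existing index (the path is a placeholder).
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);

            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.setAnalyzer(new StandardAnalyzer());
            mlt.setFieldNames(new String[] { "body" }); // placeholder field name
            mlt.setBoost(true);       // boost each generated term by its MLT score
            mlt.setBoostFactor(2.0f); // optional extra multiplier applied to those boosts

            // Build a query for documents similar to the document with internal id 0.
            Query like = mlt.like(0);
            TopDocs hits = searcher.search(like, 10);
            System.out.println("similar documents: " + hits.totalHits);
        }
    }
}

When boost is enabled, terms with a higher MLT score contribute more strongly to the generated query; setBoostFactor then scales those boosts uniformly.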

Usage

From source file: com.qwazr.search.query.MoreLikeThisQuery.java

License: Apache License

@Override
final public Query getQuery(QueryContext queryContext) throws IOException, ParseException {
    Objects.requireNonNull(doc_num, "The doc_num field is missing");
    final MoreLikeThis mlt = new MoreLikeThis(queryContext.indexSearcher.getIndexReader());
    if (is_boost != null)
        mlt.setBoost(is_boost);
    if (boost_factor != null)
        mlt.setBoostFactor(boost_factor);
    if (fieldnames != null)
        mlt.setFieldNames(fieldnames);
    if (max_doc_freq != null)
        mlt.setMaxDocFreq(max_doc_freq);
    if (max_doc_freq_pct != null)
        mlt.setMaxDocFreqPct(max_doc_freq_pct);
    if (max_num_tokens_parsed != null)
        mlt.setMaxNumTokensParsed(max_num_tokens_parsed);
    if (max_query_terms != null)
        mlt.setMaxQueryTerms(max_query_terms);
    if (max_word_len != null)
        mlt.setMaxWordLen(max_word_len);
    if (min_doc_freq != null)
        mlt.setMinDocFreq(min_doc_freq);
    if (min_term_freq != null)
        mlt.setMinTermFreq(min_term_freq);
    if (min_word_len != null)
        mlt.setMinWordLen(min_word_len);
    if (stop_words != null)
        mlt.setStopWords(stop_words);
    mlt.setAnalyzer(queryContext.analyzer);
    return mlt.like(doc_num);
}

From source file: org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java

License: Apache License

public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            } else {
                if ("stream.body".equals(keyValuePair[0])) {
                    text = keyValuePair[1];
                } else if ("mlt.fl".equals(keyValuePair[0])) {
                    fields = keyValuePair[1].split(",");
                } else if ("mlt.mindf".equals(keyValuePair[0])) {
                    mlt.setMinDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.mintf".equals(keyValuePair[0])) {
                    mlt.setMinTermFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.boost".equals(keyValuePair[0])) {
                    mlt.setBoost(Boolean.parseBoolean(keyValuePair[1]));
                } else if ("mlt.qf".equals(keyValuePair[0])) {
                    mlt.setBoostFactor(Float.parseFloat(keyValuePair[1]));
                } else if ("mlt.maxdf".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxdfp".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreqPct(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxntp".equals(keyValuePair[0])) {
                    mlt.setMaxNumTokensParsed(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxqt".equals(keyValuePair[0])) {
                    mlt.setMaxQueryTerms(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxwl".equals(keyValuePair[0])) {
                    mlt.setMaxWordLen(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.minwl".equals(keyValuePair[0])) {
                    mlt.setMinWordLen(Integer.parseInt(keyValuePair[1]));
                }
            }
        }
        if (text != null) {
            if (FieldNames.PATH.equals(fields[0])) {
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        throw new RuntimeException("could not handle MLT query " + mltQueryString, e);
    }
}

From source file: org.apache.solr.handler.RedbubbleMoreLikeThisHandler.java

License: Apache License

private void setMLTparams(SolrParams params, String[] similarityFields, MoreLikeThis mlt) {
    mlt.setMinTermFreq(params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
    mlt.setMinDocFreq(params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
    mlt.setMaxDocFreq(params.getInt(MoreLikeThisParams.MAX_DOC_FREQ, MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
    mlt.setMinWordLen(params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
    mlt.setMaxWordLen(params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
    mlt.setMaxQueryTerms(
            params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
    mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED,
            MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
    mlt.setBoost(params.getBool(MoreLikeThisParams.BOOST, false));
    mlt.setFieldNames(similarityFields);
}

From source file: org.cee.store.lucene.LuceneArticleStore.java

License: Apache License

private Query createRelatedArticlesQuery(List<EntityKey> sites, ArticleKey reference, IndexSearcher searcher,
        String language) throws IOException {
    Query articleQuery = createArticleQuery(reference);
    TopDocs topDocs = searcher.search(articleQuery, 1);
    if (topDocs.totalHits == 0) {
        return new BooleanQuery(true);
    }
    MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
    mlt.setFieldNames(LuceneConstants.ARTICLE_RELATED_SEARCH_FIELDS);
    mlt.setMaxQueryTerms(20);
    mlt.setBoost(true);
    mlt.setMinTermFreq(0);
    mlt.setMinDocFreq(0);
    Query relatedQuery = boostRelatedQuery(mlt.like(topDocs.scoreDocs[0].doc));

    BooleanQuery query = new BooleanQuery();
    query.add(new BooleanClause(relatedQuery, Occur.MUST));
    query.add(new BooleanClause(createQueryArticlesOfSites(sites), Occur.MUST));
    return query;
}

From source file: org.elasticsearch.common.lucene.search.MoreLikeThisQuery.java

License: Apache License

@Override
public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);

    mlt.setFieldNames(moreLikeFields);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(minTermFrequency);
    mlt.setMinDocFreq(minDocFreq);
    mlt.setMaxDocFreq(maxDocFreq);
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setMinWordLen(minWordLen);
    mlt.setMaxWordLen(maxWordLen);
    mlt.setStopWords(stopWords);
    mlt.setBoost(boostTerms);
    mlt.setBoostFactor(boostTermsFactor);
    // LUCENE 4 UPGRADE: this maps the 3.6 behavior (only use the first field)
    BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]);
    BooleanClause[] clauses = bq.getClauses();

    bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));

    bq.setBoost(getBoost());
    return bq;
}

From source file: uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java

License: Mozilla Public License

/**
 * Runs the More Like This process on a document by comparing its description to
 * the descriptions of all items in the training set.<br/>
 * Currently a fixed number of the top results is returned.
 * @param document
 *            document being tested
 * @return
 * @throws IOException
 */
public List<TSetBasedCategorisationResult> runMlt(Document document) {

    Map<String, TSetBasedCategorisationResult> result = null;
    IndexSearcher searcher = null;
    try {
        trainingSetSearcherManager.maybeRefresh();
        // Boolean wasRefreshed = trainingSetSearcherManager.maybeRefresh();
        // if (wasRefreshed) {
        // logger.debug(".runMlt: training set searcher had to be refreshed");
        // }
        searcher = trainingSetSearcherManager.acquire();

        // TODO TSETBASED refresh reader/searcher: Use readermanager and
        // refresh it?
        MoreLikeThis moreLikeThis = new MoreLikeThis(this.trainingSetIndexReader);
        moreLikeThis.setMinTermFreq(minTermFreq);
        moreLikeThis.setMinDocFreq(minDocFreq);
        moreLikeThis.setAnalyzer(this.trainingSetAnalyser);
        moreLikeThis.setFieldNames(fieldsToAnalyse.split(","));
        moreLikeThis.setBoost(true);

        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();

        for (String fieldName : fieldsToAnalyse.split(",")) {
            String value = document.get(fieldName);
            if (value != null && !"null".equals(value)) {

                switch (InformationAssetViewFields.valueOf(fieldName)) {
                case DESCRIPTION:
                    moreLikeThis.setBoostFactor(descBoostingFactor);
                    break;
                case TITLE:
                    moreLikeThis.setBoostFactor(titleBoostingFactor);
                    break;
                case CONTEXTDESCRIPTION:
                    moreLikeThis.setBoostFactor(contextDescBoostingFactor);
                    break;
                default:
                case SUBJECTS:
                case CORPBODYS:
                case PERSON_FULLNAME:
                case PLACE_NAME:
                    moreLikeThis.setBoostFactor(1);
                    break;
                }
                Query query = moreLikeThis.like(fieldName, new StringReader(value));
                queryBuilder.add(query, Occur.SHOULD);
            }
        }
        BooleanQuery fullQuery = queryBuilder.build();

        TopDocs topDocs = searcher.search(fullQuery, this.maximumSimilarElements);
        logger.debug(".runMlt: found {} total hits, processed at maximum {} hits", topDocs.totalHits,
                this.maximumSimilarElements);

        result = new LinkedHashMap<String, TSetBasedCategorisationResult>();

        int size = 0;
        if (topDocs.totalHits <= this.maximumSimilarElements) {
            size = topDocs.totalHits - 1;
        } else {
            size = this.maximumSimilarElements - 1;
        }

        for (int i = 0; i < size; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            Float currrentScore = scoreDoc.score;

            if (currrentScore < this.mimimumScoreForMlt) {
                break;
            }

            Document hitDoc = searcher.doc(scoreDoc.doc);
            String category = hitDoc.get(InformationAssetViewFields.TAXONOMY.toString());
            String docReference = hitDoc.get(InformationAssetViewFields.DOCREFERENCE.toString());
            logger.debug(".runMlt: found doc, category: {}, score: {}, docreference: {}", category,
                    currrentScore, docReference);

            TSetBasedCategorisationResult existingCategorisationResult = result.get(category);
            Float scoreToSet = currrentScore;
            Integer numberOfFoundDocuments = 1;
            // k nearest neighbour algorithm
            if (existingCategorisationResult != null) {
                scoreToSet += existingCategorisationResult.getScore();
                numberOfFoundDocuments += existingCategorisationResult.getNumberOfFoundDocuments();
            }
            result.put(category,
                    new TSetBasedCategorisationResult(category, scoreToSet, numberOfFoundDocuments));

        }

    } catch (IOException e) {
        throw new TaxonomyException(TaxonomyErrorType.LUCENE_IO_EXCEPTION, e);
    } finally {
        LuceneHelperTools.releaseSearcherManagerQuietly(trainingSetSearcherManager, searcher);
    }

    List<TSetBasedCategorisationResult> sortedResults = sortCategorisationResultsByScoreDescAndFilterByGlobalScore(
            new ArrayList<TSetBasedCategorisationResult>(result.values()));

    return sortedResults;
}