Example usage for org.apache.lucene.queries.mlt MoreLikeThis setMaxWordLen

List of usage examples for org.apache.lucene.queries.mlt MoreLikeThis setMaxWordLen

Introduction

In this page you can find the example usage for org.apache.lucene.queries.mlt MoreLikeThis setMaxWordLen.

Prototype

public void setMaxWordLen(int maxWordLen) 

Source Link

Document

Sets the maximum word length above which words will be ignored.

Usage

From source file:com.qwazr.search.query.MoreLikeThisQuery.java

License:Apache License

/**
 * Builds a Lucene {@link MoreLikeThis} query seeded from the document
 * identified by {@code doc_num}. Only the configuration fields that were
 * explicitly set (non-null) are forwarded to the MoreLikeThis instance;
 * everything else keeps the Lucene defaults.
 */
@Override
final public Query getQuery(QueryContext queryContext) throws IOException, ParseException {
    Objects.requireNonNull(doc_num, "The doc_num field is missing");
    final MoreLikeThis mlt = new MoreLikeThis(queryContext.indexSearcher.getIndexReader());
    // Term selection / boosting options.
    if (is_boost != null) {
        mlt.setBoost(is_boost);
    }
    if (boost_factor != null) {
        mlt.setBoostFactor(boost_factor);
    }
    if (fieldnames != null) {
        mlt.setFieldNames(fieldnames);
    }
    // Frequency bounds for candidate terms.
    if (min_doc_freq != null) {
        mlt.setMinDocFreq(min_doc_freq);
    }
    if (max_doc_freq != null) {
        mlt.setMaxDocFreq(max_doc_freq);
    }
    if (max_doc_freq_pct != null) {
        mlt.setMaxDocFreqPct(max_doc_freq_pct);
    }
    if (min_term_freq != null) {
        mlt.setMinTermFreq(min_term_freq);
    }
    // Word-length bounds and query-size limits.
    if (min_word_len != null) {
        mlt.setMinWordLen(min_word_len);
    }
    if (max_word_len != null) {
        mlt.setMaxWordLen(max_word_len);
    }
    if (max_num_tokens_parsed != null) {
        mlt.setMaxNumTokensParsed(max_num_tokens_parsed);
    }
    if (max_query_terms != null) {
        mlt.setMaxQueryTerms(max_query_terms);
    }
    if (stop_words != null) {
        mlt.setStopWords(stop_words);
    }
    mlt.setAnalyzer(queryContext.analyzer);
    return mlt.like(doc_num);
}

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java

License:Apache License

/**
 * Parses a Solr-style native MLT query string (e.g.
 * {@code stream.body=...&mlt.fl=title,text&mlt.mindf=1}) and builds a
 * Lucene {@link MoreLikeThis} query from it.
 *
 * @param reader         index reader the MLT query is built against
 * @param analyzer       analyzer applied to the seed text
 * @param mltQueryString ampersand-separated {@code key=value} parameters
 * @return the MLT query, or {@code null} when no {@code stream.body} text was supplied
 * @throws RuntimeException if the query string cannot be parsed or query construction fails
 */
public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            } else {
                if ("stream.body".equals(keyValuePair[0])) {
                    text = keyValuePair[1];
                } else if ("mlt.fl".equals(keyValuePair[0])) {
                    fields = keyValuePair[1].split(",");
                } else if ("mlt.mindf".equals(keyValuePair[0])) {
                    mlt.setMinDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.mintf".equals(keyValuePair[0])) {
                    mlt.setMinTermFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.boost".equals(keyValuePair[0])) {
                    mlt.setBoost(Boolean.parseBoolean(keyValuePair[1]));
                } else if ("mlt.qf".equals(keyValuePair[0])) {
                    mlt.setBoostFactor(Float.parseFloat(keyValuePair[1]));
                } else if ("mlt.maxdf".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxdfp".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreqPct(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxntp".equals(keyValuePair[0])) {
                    mlt.setMaxNumTokensParsed(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxqt".equals(keyValuePair[0])) {
                    mlt.setMaxQueryTerms(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxwl".equals(keyValuePair[0])) {
                    mlt.setMaxWordLen(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.minwl".equals(keyValuePair[0])) {
                    mlt.setMinWordLen(Integer.parseInt(keyValuePair[1]));
                }
                // Unknown keys are silently ignored (matches Solr's leniency).
            }
        }
        if (text != null) {
            // NOTE(review): if mlt.fl was not supplied, fields is empty and
            // fields[0] below throws; the catch turns that into a RuntimeException.
            if (FieldNames.PATH.equals(fields[0])) {
                // The seed text is a repository path: look the document up and
                // build the query from that document's own fields.
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    // Path not found: fall back to treating the text as plain content.
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    // Use every stored field except the path itself.
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        // Fix: preserve the cause so the original failure is not lost.
        throw new RuntimeException("could not handle MLT query " + mltQueryString, e);
    }
}

From source file:org.apache.solr.handler.RedbubbleMoreLikeThisHandler.java

License:Apache License

/**
 * Copies MoreLikeThis tuning parameters from the request onto the given
 * {@link MoreLikeThis} instance, falling back to the Lucene defaults for
 * any parameter the request does not supply.
 */
private void setMLTparams(SolrParams params, String[] similarityFields, MoreLikeThis mlt) {
    // Fields the similarity is computed over.
    mlt.setFieldNames(similarityFields);
    // Term boosting is off unless explicitly requested.
    mlt.setBoost(params.getBool(MoreLikeThisParams.BOOST, false));
    // Term/document frequency bounds.
    mlt.setMinTermFreq(params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
    mlt.setMinDocFreq(params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
    mlt.setMaxDocFreq(params.getInt(MoreLikeThisParams.MAX_DOC_FREQ, MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
    // Word-length bounds.
    mlt.setMinWordLen(params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
    mlt.setMaxWordLen(params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
    // Query-size limits.
    mlt.setMaxQueryTerms(
            params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
    mlt.setMaxNumTokensParsed(
            params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
}

From source file:org.apache.solr.search.mlt.CloudMLTQParser.java

License:Apache License

/**
 * Builds a MoreLikeThis query seeded from a document fetched via Real Time Get.
 * The seed fields come from the {@code qf} local param when present, otherwise
 * from every stored field of the document that has an explicit analyzer.
 *
 * @return the MLT query built from the (possibly qf-filtered) document fields
 * @throws SolrException with BAD_REQUEST when query construction fails
 */
public Query parse() {
    String id = localParams.get(QueryParsing.V);
    // Do a Real Time Get for the seed document.
    SolrDocument doc = getDocument(id);

    MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());
    // TODO: Are the mintf and mindf defaults ok at 1/0 ?
    mlt.setMinTermFreq(localParams.getInt("mintf", 1));
    mlt.setMinDocFreq(localParams.getInt("mindf", 0));
    if (localParams.get("minwl") != null)
        mlt.setMinWordLen(localParams.getInt("minwl"));

    if (localParams.get("maxwl") != null)
        mlt.setMaxWordLen(localParams.getInt("maxwl"));

    mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());

    String[] qf = localParams.getParams("qf");
    // Fix: parameterize instead of the raw-typed `new HashMap()`.
    Map<String, Collection<Object>> filteredDocument = new HashMap<>();

    if (qf != null) {
        mlt.setFieldNames(qf);
        for (String field : qf) {
            filteredDocument.put(field, doc.getFieldValues(field));
        }
    } else {
        Map<String, SchemaField> fields = req.getSchema().getFields();
        List<String> fieldNames = new ArrayList<>();
        for (String field : doc.getFieldNames()) {
            // Only use fields that are stored and have an explicit analyzer.
            // This makes sense as the query uses tf/idf/.. for query construction.
            // We might want to relook and change this in the future though.
            SchemaField schemaField = fields.get(field);
            if (schemaField.stored() && schemaField.getType().isExplicitAnalyzer()) {
                fieldNames.add(field);
                filteredDocument.put(field, doc.getFieldValues(field));
            }
        }
        mlt.setFieldNames(fieldNames.toArray(new String[fieldNames.size()]));
    }

    try {
        return mlt.like(filteredDocument);
    } catch (IOException e) {
        // Fix: attach the cause to the SolrException instead of dumping the
        // stack trace to stderr and losing it.
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad Request", e);
    }
}

From source file:org.apache.solr.search.mlt.SimpleMLTQParser.java

License:Apache License

/**
 * Builds a MoreLikeThis query seeded from the single document whose unique
 * key matches the {@code v} local param. The seed fields come from the
 * {@code qf} local param when present, otherwise from every indexed, stored,
 * non-numeric field in the schema.
 *
 * @return the MLT query for the matched document
 * @throws SolrException with BAD_REQUEST when the document cannot be fetched
 *                       or query construction fails
 */
public Query parse() {

    String defaultField = req.getSchema().getUniqueKeyField().getName();
    String uniqueValue = localParams.get(QueryParsing.V);
    String[] qf = localParams.getParams("qf");

    SolrIndexSearcher searcher = req.getSearcher();
    Query docIdQuery = createIdQuery(defaultField, uniqueValue);

    try {
        TopDocs td = searcher.search(docIdQuery, 1);
        if (td.totalHits != 1)
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                    "Error completing MLT request. Could not fetch " + "document with id [" + uniqueValue
                            + "]");
        ScoreDoc[] scoreDocs = td.scoreDocs;
        MoreLikeThis mlt = new MoreLikeThis(req.getSearcher().getIndexReader());
        // TODO: Are the mintf and mindf defaults ok at '1' ?
        mlt.setMinTermFreq(localParams.getInt("mintf", 1));
        mlt.setMinDocFreq(localParams.getInt("mindf", 1));
        if (localParams.get("minwl") != null)
            mlt.setMinWordLen(localParams.getInt("minwl"));

        if (localParams.get("maxwl") != null)
            mlt.setMaxWordLen(localParams.getInt("maxwl"));

        // Fix: parameterize instead of the raw-typed `new ArrayList()`.
        List<String> fields = new ArrayList<>();

        if (qf != null) {
            mlt.setFieldNames(qf);
        } else {
            // No qf given: use every indexed + stored, non-numeric schema field.
            Map<String, SchemaField> fieldNames = req.getSearcher().getSchema().getFields();
            for (String fieldName : fieldNames.keySet()) {
                if (fieldNames.get(fieldName).indexed() && fieldNames.get(fieldName).stored())
                    if (fieldNames.get(fieldName).getType().getNumericType() == null)
                        fields.add(fieldName);
            }
            mlt.setFieldNames(fields.toArray(new String[fields.size()]));
        }

        mlt.setAnalyzer(req.getSchema().getIndexAnalyzer());

        return mlt.like(scoreDocs[0].doc);

    } catch (IOException e) {
        // Fix: separate the message from e.getMessage() and keep the cause.
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "Error completing MLT request: " + e.getMessage(), e);
    }
}

From source file:org.elasticsearch.common.lucene.search.MoreLikeThisQuery.java

License:Apache License

/**
 * Rewrites this query into a concrete Lucene {@link BooleanQuery} produced by
 * {@link MoreLikeThis} from the configured like-text and field settings.
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis moreLikeThis = new MoreLikeThis(reader,
            similarity == null ? new DefaultSimilarity() : similarity);

    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setFieldNames(moreLikeFields);
    moreLikeThis.setStopWords(stopWords);
    moreLikeThis.setMinTermFreq(minTermFrequency);
    moreLikeThis.setMinDocFreq(minDocFreq);
    moreLikeThis.setMaxDocFreq(maxDocFreq);
    moreLikeThis.setMinWordLen(minWordLen);
    moreLikeThis.setMaxWordLen(maxWordLen);
    moreLikeThis.setMaxQueryTerms(maxQueryTerms);
    moreLikeThis.setBoost(boostTerms);
    moreLikeThis.setBoostFactor(boostTermsFactor);

    // LUCENE 4 UPGRADE: this maps the 3.6 behavior (only the first field is used).
    BooleanQuery booleanQuery = (BooleanQuery) moreLikeThis.like(new FastStringReader(likeText),
            moreLikeFields[0]);
    BooleanClause[] clauses = booleanQuery.getClauses();

    // Require a percentage of the generated term clauses to match.
    booleanQuery.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
    booleanQuery.setBoost(getBoost());
    return booleanQuery;
}

From source file:org.ohdsi.usagi.UsagiSearchEngine.java

License:Apache License

/**
 * Searches the concept index for the given term and returns scored candidate
 * concepts, optionally restricted by concept id, domain, concept class,
 * vocabulary, and validity filters.
 *
 * <p>When {@code useMlt} is true the term is expanded with a MoreLikeThis
 * query over the TERM field (with effectively no pruning); otherwise the term
 * is parsed as a keyword query.
 *
 * <p>Best-effort: any failure is logged to stderr and whatever results were
 * collected so far are returned.
 */
public List<ScoredConcept> search(String searchTerm, boolean useMlt, Collection<Integer> filterConceptIds,
        String filterDomain, String filterConceptClass, String filterVocabulary, boolean filterInvalid) {
    List<ScoredConcept> results = new ArrayList<ScoredConcept>();
    try {
        Query query;
        if (useMlt) {
            // MoreLikeThis with effectively unlimited bounds so that every
            // token of the search term contributes to the query.
            MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);
            mlt.setMaxDocFreq(9999);
            mlt.setMinWordLen(1);
            mlt.setMaxWordLen(9999);
            mlt.setMaxDocFreqPct(100);
            mlt.setMaxNumTokensParsed(9999);
            mlt.setMaxQueryTerms(9999);
            mlt.setStopWords(null);
            mlt.setFieldNames(new String[] { "TERM" });
            mlt.setAnalyzer(analyzer);

            query = mlt.like("TERM", new StringReader(searchTerm));
        } else {
            try {
                query = keywordsQueryParser.parse(searchTerm);
            } catch (ParseException e) {
                // Unparsable user input: return no results rather than failing.
                return results;
            }
        }

        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query, Occur.SHOULD);
        booleanQuery.add(conceptQuery, Occur.MUST);

        // Optional restriction clauses, one per supplied filter.
        if (filterConceptIds != null && !filterConceptIds.isEmpty()) {
            Query conceptIdQuery = conceptIdQueryParser.parse(StringUtilities.join(filterConceptIds, " OR "));
            booleanQuery.add(conceptIdQuery, Occur.MUST);
        }
        if (filterDomain != null) {
            Query domainQuery = domainQueryParser.parse("\"" + filterDomain + "\"");
            booleanQuery.add(domainQuery, Occur.MUST);
        }
        if (filterConceptClass != null) {
            Query conceptClassQuery = conceptClassQueryParser.parse("\"" + filterConceptClass + "\"");
            booleanQuery.add(conceptClassQuery, Occur.MUST);
        }
        if (filterVocabulary != null) {
            Query vocabularyQuery = vocabularyQueryParser.parse("\"" + filterVocabulary + "\"");
            booleanQuery.add(vocabularyQuery, Occur.MUST);
        }
        if (filterInvalid) {
            Query invalidQuery = invalidQueryParser.parse("\"\"");
            booleanQuery.add(invalidQuery, Occur.MUST);
        }
        TopDocs topDocs = searcher.search(booleanQuery, 100);

        recomputeScores(topDocs.scoreDocs, query);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = reader.document(scoreDoc.doc);
            int conceptId = Integer.parseInt(document.get("CONCEPT_ID"));
            // If matchscore = 0 but it was the one concept that was automatically selected, still allow it:
            if (scoreDoc.score > 0 || (filterConceptIds != null && filterConceptIds.size() == 1
                    && filterConceptIds.contains(conceptId))) {
                results.add(new ScoredConcept(scoreDoc.score, toTargetConcept(document, conceptId)));
            }
        }
        reorderTies(results);
        removeDuplicateConcepts(results);
    } catch (Exception e) {
        // Best-effort search: log and return whatever was collected so far.
        System.err.println(e.getMessage());
        e.printStackTrace();
    }

    return results;
}

/** Maps a stored Lucene document onto a {@link TargetConcept} value object. */
private static TargetConcept toTargetConcept(Document document, int conceptId) {
    TargetConcept targetConcept = new TargetConcept();
    targetConcept.term = document.get("TERM");
    targetConcept.conceptId = conceptId;
    targetConcept.conceptName = document.get("CONCEPT_NAME");
    targetConcept.conceptClass = document.get("CONCEPT_CLASS");
    targetConcept.vocabulary = document.get("VOCABULARY");
    targetConcept.conceptCode = document.get("CONCEPT_CODE");
    targetConcept.validStartDate = document.get("VALID_START_DATE");
    targetConcept.validEndDate = document.get("VALID_END_DATE");
    targetConcept.invalidReason = document.get("INVALID_REASON");
    // DOMAINS is stored as a newline-separated list.
    for (String domain : document.get("DOMAINS").split("\n"))
        targetConcept.domains.add(domain);
    targetConcept.additionalInformation = document.get("ADDITIONAL_INFORMATION");
    return targetConcept;
}