Example usage for org.apache.lucene.queries.mlt MoreLikeThis like

Introduction

This page collects example usages of the like method of org.apache.lucene.queries.mlt.MoreLikeThis.

Prototype

public Query like(String fieldName, Reader... readers) throws IOException 

Document

Return a query that will return docs like the passed Readers.
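
Before the quoted examples, here is a minimal, hedged sketch of calling this overload directly. The index path, the "contents" field name, the StandardAnalyzer, and the sample text are assumptions chosen for illustration, not taken from any of the sources below. Note also that some of the examples further down come from older Lucene versions and use the like(Reader, String...) or like(int docNum) overloads instead of this one.

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class MoreLikeThisSketch {
    public static void main(String[] args) throws IOException {
        // Assumed: an existing index at /tmp/index with an analyzed "contents" field.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            MoreLikeThis mlt = new MoreLikeThis(reader);
            mlt.setAnalyzer(new StandardAnalyzer());
            mlt.setFieldNames(new String[] { "contents" });
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);

            // The varargs overload builds one query from one or more Readers for the named field.
            Query query = mlt.like("contents", new StringReader("free text to find similar documents for"));

            TopDocs topDocs = new IndexSearcher(reader).search(query, 10);
            System.out.println("hits: " + topDocs.totalHits);
        }
    }
}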

Usage

From source file:fr.univ_tours.etu.searcher.LikeThisTest.java

private void findSilimar(String searchForSimilar) throws IOException {
    IndexReader reader = DirectoryReader.open(indexDir);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setMinTermFreq(0);
    mlt.setMinDocFreq(0);
    mlt.setFieldNames(new String[] { "title", "content" });
    mlt.setAnalyzer(analyzer);

    Reader sReader = new StringReader(searchForSimilar);
    Query query = mlt.like("content", sReader);

    TopDocs topDocs = indexSearcher.search(query, 10);

    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        Document aSimilar = indexSearcher.doc(scoreDoc.doc);
        String similarTitle = aSimilar.get("title");
        String similarContent = aSimilar.get("content");

        System.out.println("====similar finded====");
        System.out.println("title: " + similarTitle);
        System.out.println("content: " + similarContent);
    }

}
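
The test above relies on class-level indexDir and analyzer fields and on documents indexed with "title" and "content" fields. A hedged sketch of that setup (the directory path, analyzer choice, and sample document are placeholders; the needed imports come from org.apache.lucene.document, org.apache.lucene.index, org.apache.lucene.store, and org.apache.lucene.analysis.standard) could look like this:

private Directory indexDir;
private Analyzer analyzer;

private void buildSampleIndex() throws IOException {
    // Placeholder setup: a filesystem index with one document carrying the
    // "title" and "content" fields the MLT query above draws terms from.
    indexDir = FSDirectory.open(Paths.get("/tmp/mlt-test-index"));
    analyzer = new StandardAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    try (IndexWriter writer = new IndexWriter(indexDir, config)) {
        Document doc = new Document();
        doc.add(new TextField("title", "Apache Lucene", Field.Store.YES));
        doc.add(new TextField("content", "Lucene is a full-text search library written in Java", Field.Store.YES));
        writer.addDocument(doc);
    }
}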

From source file:fr.univ_tours.etu.searcher.Searcher.java

public List<ResultObject> search(SearchQueriesRequest query) throws IOException, ParseException {

    Map<String, String> queriesDictionary = query.getQueriesDictionary();
    boolean useQueryExpansion = query.isUseQueryExpansion();
    List<Integer> docsToExpand = (useQueryExpansion) ? new ArrayList<>() : null;

    List<String> fsa = new ArrayList<>();
    List<String> qsa = new ArrayList<>();
    String contentLemmas = "";
    if (queriesDictionary.containsKey(DocFields.CONTENTS)) {
        regularTokenizer.tokenize(queriesDictionary.get(DocFields.CONTENTS), true);
        caselessTokenizer.tokenize(queriesDictionary.get(DocFields.CONTENTS), true);
        contentLemmas = caselessTokenizer.getLemmaString();
        System.out.println("Lemmas: " + caselessTokenizer.getLemmaList());
        String neString = "";
        if (caselessTokenizer.getNeList() != null && caselessTokenizer.getNeList().size() != 0) {
            neString = caselessTokenizer.getNeString(";", true);
            System.out.println("NE caseless: " + neString);
        }
        if (regularTokenizer.getNeList() != null && regularTokenizer.getNeList().size() != 0) {
            neString += ";" + regularTokenizer.getNeString(";", true);
            System.out.println("NE all: " + neString);
        }
        if (!"".equals(neString)) {
            fsa.add(DocFields.NAMED_ENTITIES);
            qsa.add(neString);
        }

    }

    for (Map.Entry<String, String> entry : queriesDictionary.entrySet()) {
        fsa.add(entry.getKey());
        if (entry.getKey().equals(DocFields.CONTENTS) || entry.getKey().equals(DocFields.SYNONYMS)) {
            qsa.add(contentLemmas);
        } else {
            qsa.add(entry.getValue());
        }
    }

    Query q = MultiFieldQueryParser.parse(qsa.toArray(new String[qsa.size()]),
            fsa.toArray(new String[fsa.size()]), analyzer);

    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs docs = searcher.search(q, this.numRetrievedDocs);
    ScoreDoc[] hits = docs.scoreDocs;

    List<ResultObject> resultObjects = new ArrayList<>();

    String result = "";
    for (int i = 0; i < hits.length; ++i) {
        int docId = hits[i].doc;
        if (useQueryExpansion) {
            docsToExpand.add(docId);
        }
        Document d = searcher.doc(docId);
        resultObjects.add(new ResultObject(docId, i, d.get(DocFields.TITLE), d.get(DocFields.AUTHOR),
                d.get(DocFields.FILE_PATH), d.get(DocFields.SUMMARY), d.get(DocFields.FILE_NAME)));
        result = d.get(DocFields.SUMMARY);
    }

    if (useQueryExpansion) {
        reader.close();

        this.reader = DirectoryReader.open(FSDirectory.open(new File(this.indexDir).toPath()));
        searcher = new IndexSearcher(reader);
        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.setMinTermFreq(0);
        mlt.setMinDocFreq(0);
        mlt.setAnalyzer(analyzer);
        for (int i = 0; i < Math.min(docsToExpand.size(), 5); i++) {

            Reader r = new StringReader(resultObjects.get(i).getSummary());
            Query expandedQuery = mlt.like(DocFields.CONTENTS, r);

            TopDocs topDocs = searcher.search(expandedQuery, 5);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                if (!docsToExpand.contains(scoreDoc.doc)) {
                    docsToExpand.add(scoreDoc.doc);
                    Document aSimilar = searcher.doc(scoreDoc.doc);

                    resultObjects.add(new ResultObject(1, resultObjects.size(), aSimilar.get(DocFields.TITLE),
                            aSimilar.get(DocFields.AUTHOR), aSimilar.get(DocFields.FILE_PATH),
                            aSimilar.get(DocFields.SUMMARY), aSimilar.get(DocFields.FILE_NAME)));
                } else {
                }

            }
        }
    }

    return resultObjects;
}

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper.java

License:Apache License

public static Query getMoreLikeThis(IndexReader reader, Analyzer analyzer, String mltQueryString) {
    Query moreLikeThisQuery = null;
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    try {
        String text = null;
        String[] fields = {};
        for (String param : mltQueryString.split("&")) {
            String[] keyValuePair = param.split("=");
            if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) {
                throw new RuntimeException("Unparsable native Lucene MLT query: " + mltQueryString);
            } else {
                if ("stream.body".equals(keyValuePair[0])) {
                    text = keyValuePair[1];
                } else if ("mlt.fl".equals(keyValuePair[0])) {
                    fields = keyValuePair[1].split(",");
                } else if ("mlt.mindf".equals(keyValuePair[0])) {
                    mlt.setMinDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.mintf".equals(keyValuePair[0])) {
                    mlt.setMinTermFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.boost".equals(keyValuePair[0])) {
                    mlt.setBoost(Boolean.parseBoolean(keyValuePair[1]));
                } else if ("mlt.qf".equals(keyValuePair[0])) {
                    mlt.setBoostFactor(Float.parseFloat(keyValuePair[1]));
                } else if ("mlt.maxdf".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreq(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxdfp".equals(keyValuePair[0])) {
                    mlt.setMaxDocFreqPct(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxntp".equals(keyValuePair[0])) {
                    mlt.setMaxNumTokensParsed(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxqt".equals(keyValuePair[0])) {
                    mlt.setMaxQueryTerms(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.maxwl".equals(keyValuePair[0])) {
                    mlt.setMaxWordLen(Integer.parseInt(keyValuePair[1]));
                } else if ("mlt.minwl".equals(keyValuePair[0])) {
                    mlt.setMinWordLen(Integer.parseInt(keyValuePair[1]));
                }
            }
        }
        if (text != null) {
            if (FieldNames.PATH.equals(fields[0])) {
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery q = new TermQuery(new Term(FieldNames.PATH, text));
                TopDocs top = searcher.search(q, 1);
                if (top.totalHits == 0) {
                    mlt.setFieldNames(fields);
                    moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
                } else {
                    ScoreDoc d = top.scoreDocs[0];
                    Document doc = reader.document(d.doc);
                    List<String> fieldNames = new ArrayList<String>();
                    for (IndexableField f : doc.getFields()) {
                        if (!FieldNames.PATH.equals(f.name())) {
                            fieldNames.add(f.name());
                        }
                    }
                    String[] docFields = fieldNames.toArray(new String[fieldNames.size()]);
                    mlt.setFieldNames(docFields);
                    moreLikeThisQuery = mlt.like(d.doc);
                }
            } else {
                mlt.setFieldNames(fields);
                moreLikeThisQuery = mlt.like(new StringReader(text), mlt.getFieldNames()[0]);
            }
        }
        return moreLikeThisQuery;
    } catch (Exception e) {
        throw new RuntimeException("could not handle MLT query " + mltQueryString);
    }
}
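
The helper above parses a Solr-style parameter string (stream.body, mlt.fl, mlt.mindf, and so on). A hedged example of invoking it, with made-up field names and an already-open reader and analyzer, might be:

// Hypothetical call; the field names and text are illustrative only.
String mltQueryString = "stream.body=open source search engine&mlt.fl=title,description&mlt.mindf=1&mlt.mintf=1";
Query mltQuery = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString);
if (mltQuery != null) {
    TopDocs similar = new IndexSearcher(reader).search(mltQuery, 10);
    System.out.println("similar docs: " + similar.totalHits);
}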

From source file:org.elasticsearch.common.lucene.search.morelikethis.XMoreLikeThisTests.java

License:Apache License

@Test
public void testTopN() throws Exception {
    int numDocs = 100;
    int topN = 25;

    // add series of docs with terms of decreasing df
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
        addDoc(writer, generateStrSeq(0, i + 1));
    }
    IndexReader reader = writer.getReader();
    writer.close();

    // setup MLT query
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    mlt.setMaxQueryTerms(topN);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] { "text" });

    // perform MLT query
    String likeText = "";
    for (String text : generateStrSeq(0, numDocs)) {
        likeText += text + " ";
    }
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));

    // check best terms are topN of highest idf
    List<BooleanClause> clauses = query.clauses();
    assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());

    Term[] expectedTerms = new Term[topN];
    int idx = 0;
    for (String text : generateStrSeq(numDocs - topN, topN)) {
        expectedTerms[idx++] = new Term("text", text);
    }
    for (BooleanClause clause : clauses) {
        Term term = ((TermQuery) clause.getQuery()).getTerm();
        assertTrue(Arrays.asList(expectedTerms).contains(term));
    }

    // clean up
    reader.close();
    dir.close();
}

From source file:org.elasticsearch.common.lucene.search.MoreLikeThisQuery.java

License:Apache License

@Override
public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);

    mlt.setFieldNames(moreLikeFields);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(minTermFrequency);
    mlt.setMinDocFreq(minDocFreq);
    mlt.setMaxDocFreq(maxDocFreq);
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setMinWordLen(minWordLen);
    mlt.setMaxWordLen(maxWordLen);
    mlt.setStopWords(stopWords);
    mlt.setBoost(boostTerms);
    mlt.setBoostFactor(boostTermsFactor);
    //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
    BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]);
    BooleanClause[] clauses = bq.getClauses();

    bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));

    bq.setBoost(getBoost());
    return bq;
}

From source file:org.eu.bitzone.Leia.java

License:Apache License

/** More Like this query from the current doc (or selected fields) */
public void actionMLT(final Object docNum, final Object docTable) {
    if (ir == null) {
        errorMsg(MSG_NOINDEX);
        return;
    }
    int id = 0;
    try {
        id = Integer.parseInt(getString(docNum, "text"));
    } catch (final NumberFormatException nfe) {
        errorMsg("Invalid document number");
        return;
    }
    final MoreLikeThis mlt = new MoreLikeThis(ir);
    try {
        mlt.setFieldNames(Util.fieldNames(ir, true).toArray(new String[0]));
    } catch (final Exception e) {
        errorMsg("Exception collecting field names: " + e.toString());
        return;
    }
    mlt.setMinTermFreq(1);
    mlt.setMaxQueryTerms(50);
    final Analyzer a = createAnalyzer(find("srchOptTabs"));
    if (a == null) {
        return;
    }
    mlt.setAnalyzer(a);
    final Object[] rows = getSelectedItems(docTable);
    BooleanQuery similar = null;
    if (rows != null && rows.length > 0) {
        // collect text from fields
        final StringBuilder sb = new StringBuilder();
        for (int i = 0; i < rows.length; i++) {
            final Field f = (Field) getProperty(rows[i], "field");
            if (f == null) {
                continue;
            }
            final String s = f.stringValue();
            if (s == null || s.trim().length() == 0) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(s);
        }
        try {
            similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field");
        } catch (final Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    } else {
        try {
            similar = (BooleanQuery) mlt.like(id);
        } catch (final Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    }
    if (similar.clauses() != null && similar.clauses().size() > 0) {
        // System.err.println("SIMILAR: " + similar);
        final Object tabpane = find("maintpane");
        setInteger(tabpane, "selected", 2);
        final Object qField = find("qField");
        setString(qField, "text", similar.toString());
    } else {
        showStatus("WARN: empty query - check Analyzer settings");
    }
}

From source file:org.getopt.luke.Luke.java

License:Apache License

/** More Like this query from the current doc (or selected fields) */
public void actionMLT(Object docNum, Object docTable) {
    if (ir == null) {
        errorMsg(MSG_NOINDEX);
        return;
    }
    int id = 0;
    try {
        id = Integer.parseInt(getString(docNum, "text"));
    } catch (NumberFormatException nfe) {
        errorMsg("Invalid document number");
        return;
    }
    MoreLikeThis mlt = new MoreLikeThis(ir);
    try {
        mlt.setFieldNames((String[]) Util.fieldNames(ir, true).toArray(new String[0]));
    } catch (Exception e) {
        errorMsg("Exception collecting field names: " + e.toString());
        return;
    }
    mlt.setMinTermFreq(1);
    mlt.setMaxQueryTerms(50);
    Analyzer a = createAnalyzer(find("srchOptTabs"));
    if (a == null) {
        return;
    }
    mlt.setAnalyzer(a);
    Object[] rows = getSelectedItems(docTable);
    BooleanQuery similar = null;
    if (rows != null && rows.length > 0) {
        // collect text from fields
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < rows.length; i++) {
            Field f = (Field) getProperty(rows[i], "field");
            if (f == null) {
                continue;
            }
            String s = f.stringValue();
            if (s == null || s.trim().length() == 0) {
                continue;
            }
            if (sb.length() > 0)
                sb.append(" ");
            sb.append(s);
        }
        try {
            similar = (BooleanQuery) mlt.like(new StringReader(sb.toString()), "field");
        } catch (Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    } else {
        try {
            similar = (BooleanQuery) mlt.like(id);
        } catch (Exception e) {
            e.printStackTrace();
            errorMsg("FAILED: " + e.getMessage());
            return;
        }
    }
    if (similar.clauses() != null && similar.clauses().size() > 0) {
        //System.err.println("SIMILAR: " + similar);
        Object tabpane = find("maintpane");
        setInteger(tabpane, "selected", 2);
        Object qField = find("qField");
        setString(qField, "text", similar.toString());
    } else {
        showStatus("WARN: empty query - check Analyzer settings");
    }
}

From source file:org.ohdsi.usagi.UsagiSearchEngine.java

License:Apache License

public List<ScoredConcept> search(String searchTerm, boolean useMlt, Collection<Integer> filterConceptIds,
        String filterDomain, String filterConceptClass, String filterVocabulary, boolean filterInvalid) {
    List<ScoredConcept> results = new ArrayList<ScoredConcept>();
    try {
        Query query;
        if (useMlt) {
            MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
            mlt.setMinTermFreq(1);
            mlt.setMinDocFreq(1);
            mlt.setMaxDocFreq(9999);
            mlt.setMinWordLen(1);
            mlt.setMaxWordLen(9999);
            mlt.setMaxDocFreqPct(100);
            mlt.setMaxNumTokensParsed(9999);
            mlt.setMaxQueryTerms(9999);
            mlt.setStopWords(null);
            mlt.setFieldNames(new String[] { "TERM" });
            mlt.setAnalyzer(analyzer);

            query = mlt.like("TERM", new StringReader(searchTerm));
        } else {
            try {
                query = keywordsQueryParser.parse(searchTerm);
                // if (query instanceof BooleanQuery) {
                // List<BooleanClause> clauses = ((BooleanQuery) query).clauses();
                // BooleanClause lastClause = clauses.get(clauses.size() - 1);
                // lastClause.setQuery(new PrefixQuery(((TermQuery) lastClause.getQuery()).getTerm()));
                // } else if (query instanceof TermQuery) {// It's a single term
                // query = new PrefixQuery(((TermQuery) query).getTerm());
                // }

            } catch (ParseException e) {
                return results;
            }
        }

        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query, Occur.SHOULD);
        booleanQuery.add(conceptQuery, Occur.MUST);

        if (filterConceptIds != null && filterConceptIds.size() > 0) {
            Query conceptIdQuery = conceptIdQueryParser.parse(StringUtilities.join(filterConceptIds, " OR "));
            booleanQuery.add(conceptIdQuery, Occur.MUST);
        }

        if (filterDomain != null) {
            Query domainQuery = domainQueryParser.parse("\"" + filterDomain + "\"");
            booleanQuery.add(domainQuery, Occur.MUST);
        }
        if (filterConceptClass != null) {
            Query conceptClassQuery = conceptClassQueryParser
                    .parse("\"" + filterConceptClass.toString() + "\"");
            booleanQuery.add(conceptClassQuery, Occur.MUST);
        }
        if (filterVocabulary != null) {
            Query vocabularyQuery = vocabularyQueryParser.parse("\"" + filterVocabulary.toString() + "\"");
            booleanQuery.add(vocabularyQuery, Occur.MUST);
        }
        if (filterInvalid) {
            Query invalidQuery = invalidQueryParser.parse("\"\"");
            booleanQuery.add(invalidQuery, Occur.MUST);
        }
        TopDocs topDocs = searcher.search(booleanQuery, 100);

        recomputeScores(topDocs.scoreDocs, query);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = reader.document(scoreDoc.doc);
            int conceptId = Integer.parseInt(document.get("CONCEPT_ID"));
            // If matchscore = 0 but it was the one concept that was automatically selected, still allow it:
            if (scoreDoc.score > 0 || (filterConceptIds != null && filterConceptIds.size() == 1
                    && filterConceptIds.contains(conceptId))) {
                TargetConcept targetConcept = new TargetConcept();
                targetConcept.term = document.get("TERM");
                targetConcept.conceptId = conceptId;
                targetConcept.conceptName = document.get("CONCEPT_NAME");
                targetConcept.conceptClass = document.get("CONCEPT_CLASS");
                targetConcept.vocabulary = document.get("VOCABULARY");
                targetConcept.conceptCode = document.get("CONCEPT_CODE");
                targetConcept.validStartDate = document.get("VALID_START_DATE");
                targetConcept.validEndDate = document.get("VALID_END_DATE");
                targetConcept.invalidReason = document.get("INVALID_REASON");
                for (String domain : document.get("DOMAINS").split("\n"))
                    targetConcept.domains.add(domain);
                targetConcept.additionalInformation = document.get("ADDITIONAL_INFORMATION");
                results.add(new ScoredConcept(scoreDoc.score, targetConcept));
            }
        }
        reorderTies(results);
        removeDuplicateConcepts(results);
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }

    return results;
}

From source file:uk.gov.nationalarchives.discovery.taxonomy.common.service.impl.TSetBasedCategoriserServiceImpl.java

License:Mozilla Public License

/**
 * run More Like This process on a document by comparing its description to
 * the description of all items of the training set<br/>
 * currently we get a fixed number of the top results
 * @param document
 *            document being tested
 * @return
 * @throws IOException
 */
public List<TSetBasedCategorisationResult> runMlt(Document document) {

    Map<String, TSetBasedCategorisationResult> result = null;
    IndexSearcher searcher = null;
    try {
        trainingSetSearcherManager.maybeRefresh();
        // Boolean wasRefreshed = trainingSetSearcherManager.maybeRefresh();
        // if (wasRefreshed) {
        // logger.debug(".runMlt: training set searcher had to be refreshed");
        // }
        searcher = trainingSetSearcherManager.acquire();

        // TODO TSETBASED refresh reader/searcher: Use readermanager and
        // refresh it?
        MoreLikeThis moreLikeThis = new MoreLikeThis(this.trainingSetIndexReader);
        moreLikeThis.setMinTermFreq(minTermFreq);
        moreLikeThis.setMinDocFreq(minDocFreq);
        moreLikeThis.setAnalyzer(this.trainingSetAnalyser);
        moreLikeThis.setFieldNames(fieldsToAnalyse.split(","));
        moreLikeThis.setBoost(true);

        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();

        for (String fieldName : fieldsToAnalyse.split(",")) {
            String value = document.get(fieldName);
            if (value != null && !"null".equals(value)) {

                switch (InformationAssetViewFields.valueOf(fieldName)) {
                case DESCRIPTION:
                    moreLikeThis.setBoostFactor(descBoostingFactor);
                    break;
                case TITLE:
                    moreLikeThis.setBoostFactor(titleBoostingFactor);
                    break;
                case CONTEXTDESCRIPTION:
                    moreLikeThis.setBoostFactor(contextDescBoostingFactor);
                    break;
                default:
                case SUBJECTS:
                case CORPBODYS:
                case PERSON_FULLNAME:
                case PLACE_NAME:
                    moreLikeThis.setBoostFactor(1);
                    break;
                }
                Query query = moreLikeThis.like(fieldName, new StringReader(value));
                queryBuilder.add(query, Occur.SHOULD);
            }
        }
        BooleanQuery fullQuery = queryBuilder.build();

        TopDocs topDocs = searcher.search(fullQuery, this.maximumSimilarElements);
        logger.debug(".runMlt: found {} total hits, processed at maximum {} hits", topDocs.totalHits,
                this.maximumSimilarElements);

        result = new LinkedHashMap<String, TSetBasedCategorisationResult>();

        int size = 0;
        if (topDocs.totalHits <= this.maximumSimilarElements) {
            size = topDocs.totalHits - 1;
        } else {
            size = this.maximumSimilarElements - 1;
        }

        for (int i = 0; i < size; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            Float currrentScore = scoreDoc.score;

            if (currrentScore < this.mimimumScoreForMlt) {
                break;
            }

            Document hitDoc = searcher.doc(scoreDoc.doc);
            String category = hitDoc.get(InformationAssetViewFields.TAXONOMY.toString());
            String docReference = hitDoc.get(InformationAssetViewFields.DOCREFERENCE.toString());
            logger.debug(".runMlt: found doc, category: {}, score: {}, docreference: {}", category,
                    currrentScore, docReference);

            TSetBasedCategorisationResult existingCategorisationResult = result.get(category);
            Float scoreToSet = currrentScore;
            Integer numberOfFoundDocuments = 1;
            // k nearest neighbour algorithm
            if (existingCategorisationResult != null) {
                scoreToSet += existingCategorisationResult.getScore();
                numberOfFoundDocuments += existingCategorisationResult.getNumberOfFoundDocuments();
            }
            result.put(category,
                    new TSetBasedCategorisationResult(category, scoreToSet, numberOfFoundDocuments));

        }

    } catch (IOException e) {
        throw new TaxonomyException(TaxonomyErrorType.LUCENE_IO_EXCEPTION, e);
    } finally {
        LuceneHelperTools.releaseSearcherManagerQuietly(trainingSetSearcherManager, searcher);
    }

    List<TSetBasedCategorisationResult> sortedResults = sortCategorisationResultsByScoreDescAndFilterByGlobalScore(
            new ArrayList<TSetBasedCategorisationResult>(result.values()));

    return sortedResults;
}