approxnn.DocIdSim.java Source code

Java tutorial

Introduction

Here is the source code for approxnn.DocIdSim.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package approxnn;

import indexer.DocVector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

/**
 * Wraps a Lucene {@link ScoreDoc} so that it can be sorted by score and stored in
 * hash-based collections keyed on its document id.
 *
 * <p>The natural ordering (descending score) is not consistent with {@code equals}
 * (document id only): two instances with equal scores but different document ids
 * compare as 0 yet are not equal.
 *
 * @author Debasis
 */

class DocIdSim implements Comparable<DocIdSim> {
    ScoreDoc sd;

    /**
     * @param sd the scored document to wrap; the reference is stored as-is
     */
    public DocIdSim(ScoreDoc sd) {
        this.sd = sd;
    }

    /**
     * Orders instances by descending score.
     *
     * @param that the instance to compare against
     * @return a negative value when this score is higher, positive when lower, 0 when equal
     */
    @Override
    public int compareTo(DocIdSim that) {
        // Operands are swapped so that the higher score sorts first.
        return Float.compare(that.sd.score, this.sd.score);
    }

    /**
     * Two instances are equal when they are of the same class and wrap the same document id;
     * the score plays no part.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        DocIdSim candidate = (DocIdSim) obj;
        return candidate.sd.doc == sd.doc;
    }

    /** Hashes on the document id only, matching {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return Integer.hashCode(sd.doc);
    }
}

/**
 * A set of candidate nearest neighbours, each stored as a {@link DocIdSim}.
 *
 * <p>Because {@link DocIdSim} equality is based on the document id alone, the set holds at
 * most one entry per document id, whatever its score.
 */
public class ANNList {
    Set<DocIdSim> neighbors;

    /**
     * Builds the candidate set from the hits of a Lucene search.
     *
     * @param topdocs search result whose {@code scoreDocs} become the candidates
     */
    public ANNList(TopDocs topdocs) {
        neighbors = new HashSet<>();
        for (ScoreDoc sd : topdocs.scoreDocs) {
            neighbors.add(new DocIdSim(sd));
        }
    }

    /**
     * Wraps an existing candidate set.
     *
     * @param neighbors the candidates; the reference is stored without copying, so later
     *                  changes to the given set are visible through this list
     */
    public ANNList(Set<DocIdSim> neighbors) {
        this.neighbors = neighbors;
    }

    /**
     * Builds a single-candidate list from a vector, using its id as the document id and a
     * score of 0.
     *
     * @param vec the vector whose id becomes the only candidate
     */
    public ANNList(DocVector vec) {
        this.neighbors = new HashSet<>();
        ScoreDoc sd = new ScoreDoc(vec.getId(), 0);
        this.neighbors.add(new DocIdSim(sd));
    }

    /**
     * Builds a candidate list from several vectors, using each vector's id as the document
     * id and a score of 0.
     *
     * @param vecs the vectors whose ids become the candidates
     */
    public ANNList(List<DocVector> vecs) {
        this.neighbors = new HashSet<>();
        for (DocVector vec : vecs) {
            ScoreDoc sd = new ScoreDoc(vec.getId(), 0);
            this.neighbors.add(new DocIdSim(sd));
        }
    }

    /**
     * Returns a new list holding the candidates of {@code a} whose document id also occurs
     * in {@code b}. Neither input is modified; the retained entries are the ones from
     * {@code a}.
     *
     * @param a first candidate list
     * @param b second candidate list
     * @return a new list with the common candidates
     */
    static public ANNList getIntersection(ANNList a, ANNList b) {
        HashSet<DocIdSim> x = new HashSet<>(a.neighbors);
        x.retainAll(b.neighbors);
        return new ANNList(x);
    }

    /**
     * Returns a new list holding the candidates of both inputs. Neither input is modified.
     * When a document id occurs in both, the entry from {@code a} is the one kept.
     *
     * @param a first candidate list
     * @param b second candidate list
     * @return a new list with the combined candidates
     */
    static public ANNList getUnion(ANNList a, ANNList b) {
        HashSet<DocIdSim> x = new HashSet<>(a.neighbors);
        x.addAll(b.neighbors);
        return new ANNList(x);
    }

    /**
     * Loads every candidate document, computes its distance from the query vector and
     * returns the first {@code k} vectors in {@link DocVector}'s natural ordering.
     *
     * <p>NOTE(review): the ordering comes from {@code DocVector.compareTo}, which is not
     * visible here; presumably it ranks by the distance set via {@code setDistWithQry},
     * closest first - confirm.
     *
     * @param qvec   the query vector
     * @param reader index reader used to load each candidate document
     * @param k      maximum number of results; values at or below 0 give an empty list
     * @return at most {@code k} vectors, in the order they leave the priority queue
     * @throws Exception if loading a document or building its vector fails
     */
    public List<DocVector> selectTopK(DocVector qvec, IndexReader reader, int k) throws Exception {

        // Typed queue (the raw type produced an unchecked warning).
        PriorityQueue<DocVector> nearest = new PriorityQueue<>();

        for (DocIdSim docIdSim : this.neighbors) {
            Document d = reader.document(docIdSim.sd.doc);
            DocVector dvec = new DocVector(d, qvec.numDimensions, DocVector.numIntervals);
            float dist = qvec.getDist(dvec);
            dvec.setDistWithQry(dist);
            nearest.add(dvec);
        }

        List<DocVector> topDocsList = new ArrayList<>();
        k = Math.min(k, nearest.size());
        for (int i = 0; i < k; i++) {
            topDocsList.add(nearest.poll());
        }

        return topDocsList;
    }

    /**
     * Selects the {@code k} candidates with the highest similarity scores, using the scores
     * already stored in the candidates (no documents are loaded).
     *
     * <p>The returned set iterates in descending score order.
     *
     * @param qvec   unused; kept for signature compatibility
     * @param reader unused; kept for signature compatibility
     * @param k      maximum number of results; values at or below 0 give an empty set
     * @return a new, mutable set of at most {@code k} candidates, best score first
     * @throws Exception never thrown by this implementation; kept for signature compatibility
     */
    // Use similarity scores
    public Set<DocIdSim> selectTopKSim(DocVector qvec, IndexReader reader, int k) throws Exception {

        // Re-sort the doc-score objects; DocIdSim orders by descending score.
        List<DocIdSim> sortedDocIdSims = new ArrayList<>(this.neighbors);
        Collections.sort(sortedDocIdSims);

        // Clamp at 0 so a negative k yields an empty result, as selectTopK does.
        int limit = Math.max(0, Math.min(k, sortedDocIdSims.size()));

        // LinkedHashSet keeps the ranking; a plain HashSet would scramble it.
        return new LinkedHashSet<>(sortedDocIdSims.subList(0, limit));
    }

    /**
     * Loads every candidate document, computes its distance from the query vector and
     * returns the single candidate with the smallest distance.
     *
     * <p>The result always has exactly one element. That element is {@code null} when no
     * candidate has a distance below {@link Float#MAX_VALUE}, which includes the case of an
     * empty candidate set. On ties the first candidate encountered is kept.
     *
     * @param qvec   the query vector
     * @param reader index reader used to load each candidate document
     * @return a one-element list holding the closest vector, or {@code null} if none qualified
     * @throws Exception if loading a document or building its vector fails
     */
    public List<DocVector> selectTop(DocVector qvec, IndexReader reader) throws Exception {
        float minDist = Float.MAX_VALUE;
        DocVector ann = null;

        for (DocIdSim docIdSim : this.neighbors) {
            Document d = reader.document(docIdSim.sd.doc);
            DocVector dvec = new DocVector(d, qvec.numDimensions, DocVector.numIntervals, true);
            float dist = qvec.getDist(dvec);
            dvec.setDistWithQry(dist);
            if (dist < minDist) {
                minDist = dist;
                ann = dvec;
            }
        }

        List<DocVector> topList = new ArrayList<>(1);
        topList.add(ann);
        return topList;
    }
}