Example usage for org.apache.lucene.search.similarities TFIDFSimilarity idf

List of usage examples for org.apache.lucene.search.similarities TFIDFSimilarity idf

Introduction

On this page you can find usage examples for the idf method of org.apache.lucene.search.similarities.TFIDFSimilarity.

Prototype

public abstract float idf(long docFreq, long docCount);

Source Link

Document

Computes a score factor based on a term's document frequency (the number of documents which contain the term).

Usage

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java

License:Open Source License

/**
 * uses custom similarity to compute idf, use this if you want to implement
 * IDF(numDocs,docFreq)/*from   w  w  w .  j  a  v a  2s .  com*/
 * 
 * @param reader
 * @param field
 * @param tfidfSIM
 * @return
 * @throws IOException
 */
public static Map<String, Float> getIdfs(IndexReader reader, String field, TFIDFSimilarity tfidfSIM)
        throws IOException {
    Map<String, Float> docFrequencies = new HashMap<>();

    TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator();
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        if (termEnum.seekExact(bytesRef)) {
            String term = bytesRef.utf8ToString();

            float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs());
            docFrequencies.put(term, idf);
        }
    }

    return docFrequencies;
}

From source file:org.apache.solr.search.function.TestFunctionQuery.java

License:Apache License

/**
 * Runs a broad set of {@code {!func}} function queries against a small index that is
 * built up over several commits, and checks the returned scores, match counts and
 * sort order.
 *
 * <p>Covered in order: numdocs/maxdoc/docfreq/termfreq, idf/tf/norm (expected values
 * are computed through a {@code DefaultSimilarity}), ord/rord, ms() date arithmetic,
 * query() weighting, frange, parameter dereferencing, multi-value parsing, rejection
 * of trailing input, and sorting by a function.
 *
 * <p>Statement order matters: later assertions depend on the documents and commits
 * added earlier in the method.
 *
 * @throws Exception if any of the index or query helpers fails
 */
@Test
public void testGeneral() throws Exception {
    clearIndex();

    // index ids 1-6, committing three times along the way
    assertU(adoc("id", "1", "a_tdt", "2009-08-31T12:10:10.123Z", "b_tdt", "2009-08-31T12:10:10.124Z"));
    assertU(adoc("id", "2", "a_t", "how now brown cow"));
    assertU(commit()); // create more than one segment
    assertU(adoc("id", "3", "a_t", "brown cow"));
    assertU(adoc("id", "4"));
    assertU(commit()); // create more than one segment
    assertU(adoc("id", "5"));
    assertU(adoc("id", "6", "a_t", "cow cow cow cow cow"));
    assertU(commit());

    // test relevancy functions
    assertQ(req("fl", "*,score", "q", "{!func}numdocs()", "fq", "id:6"), "//float[@name='score']='6.0'");
    assertQ(req("fl", "*,score", "q", "{!func}maxdoc()", "fq", "id:6"), "//float[@name='score']='6.0'");
    assertQ(req("fl", "*,score", "q", "{!func}docfreq(a_t,cow)", "fq", "id:6"), "//float[@name='score']='3.0'");
    assertQ(req("fl", "*,score", "q", "{!func}docfreq('a_t','cow')", "fq", "id:6"),
            "//float[@name='score']='3.0'");
    assertQ(req("fl", "*,score", "q", "{!func}docfreq($field,$value)", "fq", "id:6", "field", "a_t", "value",
            "cow"), "//float[@name='score']='3.0'");
    assertQ(req("fl", "*,score", "q", "{!func}termfreq(a_t,cow)", "fq", "id:6"),
            "//float[@name='score']='5.0'");

    // expected idf/tf/norm values below are computed with this similarity
    // rather than hard-coded
    TFIDFSimilarity similarity = new DefaultSimilarity();

    // make sure it doesn't get a NPE if no terms are present in a field.
    assertQ(req("fl", "*,score", "q", "{!func}termfreq(nofield_t,cow)", "fq", "id:6"),
            "//float[@name='score']='0.0'");
    assertQ(req("fl", "*,score", "q", "{!func}docfreq(nofield_t,cow)", "fq", "id:6"),
            "//float[@name='score']='0.0'");
    assertQ(req("fl", "*,score", "q", "{!func}idf(nofield_t,cow)", "fq", "id:6"),
            "//float[@name='score']='" + similarity.idf(0, 6) + "'");
    assertQ(req("fl", "*,score", "q", "{!func}tf(nofield_t,cow)", "fq", "id:6"),
            "//float[@name='score']='" + similarity.tf(0) + "'");

    // "cow" occurs in 3 of the 6 documents, and 5 times in document 6
    assertQ(req("fl", "*,score", "q", "{!func}idf(a_t,cow)", "fq", "id:6"),
            "//float[@name='score']='" + similarity.idf(3, 6) + "'");
    assertQ(req("fl", "*,score", "q", "{!func}tf(a_t,cow)", "fq", "id:6"),
            "//float[@name='score']='" + similarity.tf(5) + "'");
    // build the expected norm for a 4-term a_t field (document 2 holds
    // "how now brown cow") through the same similarity
    FieldInvertState state = new FieldInvertState("a_t");
    state.setBoost(1.0f);
    state.setLength(4);
    long norm = similarity.computeNorm(state);
    float nrm = similarity.decodeNormValue((byte) norm);
    assertQ(req("fl", "*,score", "q", "{!func}norm(a_t)", "fq", "id:2"),
            "//float[@name='score']='" + nrm + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte

    // test that ord and rord are working on a global index basis, not just
    // at the segment level (since Lucene 2.9 has switched to per-segment searching)
    assertQ(req("fl", "*,score", "q", "{!func}ord(id)", "fq", "id:6"), "//float[@name='score']='5.0'");
    assertQ(req("fl", "*,score", "q", "{!func}top(ord(id))", "fq", "id:6"), "//float[@name='score']='5.0'");
    assertQ(req("fl", "*,score", "q", "{!func}rord(id)", "fq", "id:1"), "//float[@name='score']='5.0'");
    assertQ(req("fl", "*,score", "q", "{!func}top(rord(id))", "fq", "id:1"), "//float[@name='score']='5.0'");

    // test that we can subtract dates to millisecond precision
    assertQ(req("fl", "*,score", "q", "{!func}ms(a_tdt,b_tdt)", "fq", "id:1"), "//float[@name='score']='-1.0'");
    assertQ(req("fl", "*,score", "q", "{!func}ms(b_tdt,a_tdt)", "fq", "id:1"), "//float[@name='score']='1.0'");
    assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.125Z,2009-08-31T12:10:10.124Z)", "fq",
            "id:1"), "//float[@name='score']='1.0'");
    assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.124Z,a_tdt)", "fq", "id:1"),
            "//float[@name='score']='1.0'");
    assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.125Z,b_tdt)", "fq", "id:1"),
            "//float[@name='score']='1.0'");

    // both operands are rounded with /SECOND, so the expected difference is zero
    assertQ(req("fl", "*,score", "q",
            "{!func}ms(2009-08-31T12:10:10.125Z/SECOND,2009-08-31T12:10:10.124Z/SECOND)", "fq", "id:1"),
            "//float[@name='score']='0.0'");

    // test that we can specify "NOW"
    assertQ(req("fl", "*,score", "q", "{!func}ms(NOW)", "NOW", "1000"), "//float[@name='score']='1000.0'");

    // add twelve "batman" documents (ids 100-111) in one commit, then
    // ids 120 and 121 in a second commit
    for (int i = 100; i < 112; i++) {
        assertU(adoc("id", "" + i, "text", "batman"));
    }
    assertU(commit());
    assertU(adoc("id", "120", "text", "batman superman")); // in a smaller segment
    assertU(adoc("id", "121", "text", "superman"));
    assertU(commit());

    // superman has a higher df (thus lower idf) in one segment, but reversed in the complete index
    String q = "{!func}query($qq)";
    String fq = "id:120";
    assertQ(req("fl", "*,score", "q", q, "qq", "text:batman", "fq", fq), "//float[@name='score']<'1.0'");
    assertQ(req("fl", "*,score", "q", q, "qq", "text:superman", "fq", fq), "//float[@name='score']>'1.0'");

    // test weighting through a function range query
    assertQ(req("fl", "*,score", "fq", fq, "q", "{!frange l=1 u=10}query($qq)", "qq", "text:superman"),
            "//*[@numFound='1']");

    // test weighting through a complex function
    q = "{!func}sub(div(sum(0.0,product(1,query($qq))),1),0)";
    assertQ(req("fl", "*,score", "q", q, "qq", "text:batman", "fq", fq), "//float[@name='score']<'1.0'");
    assertQ(req("fl", "*,score", "q", q, "qq", "text:superman", "fq", fq), "//float[@name='score']>'1.0'");

    // test full param dereferencing
    assertQ(req("fl", "*,score", "q", "{!func}add($v1,$v2)", "v1", "add($v3,$v4)", "v2", "1", "v3", "2", "v4",
            "5", "fq", "id:1"), "//float[@name='score']='8.0'");

    // test ability to parse multiple values
    assertQ(req("fl", "*,score", "q", "{!func}dist(2,vector(1,1),$pt)", "pt", "3,1", "fq", "id:1"),
            "//float[@name='score']='2.0'");

    // test that extra stuff after a function causes an error
    try {
        assertQ(req("fl", "*,score", "q", "{!func}10 wow dude ignore_exception"));
        fail();
    } catch (Exception e) {
        // OK
    }

    // test that sorting by function weights correctly.  superman should sort higher than batman due to idf of the whole index

    assertQ(req("q", "*:*", "fq", "id:120 OR id:121", "sort", "{!func v=$sortfunc} desc", "sortfunc",
            "query($qq)", "qq", "text:(batman OR superman)"), "*//doc[1]/float[.='120.0']",
            "*//doc[2]/float[.='121.0']");

    FieldCache.DEFAULT.purgeAllCaches(); // avoid FC insanity
}