Example usage for org.apache.lucene.search.similarities TFIDFSimilarity tf

List of usage examples for org.apache.lucene.search.similarities TFIDFSimilarity tf

Introduction

On this page you can find example usage of the tf method of org.apache.lucene.search.similarities.TFIDFSimilarity.

Prototype

public abstract float tf(float freq);

Source Link

Document

Computes a score factor based on the frequency (freq) of a term or phrase in a document.

Usage

From source file: org.apache.solr.search.function.TestFunctionQuery.java

License:Apache License

/**
 * End-to-end check of a broad set of function queries against a small index
 * that is deliberately spread over several commits (and therefore segments).
 *
 * <p>Covered, in order: index statistics ({@code numdocs}, {@code maxdoc},
 * {@code docfreq}, {@code termfreq}), similarity-backed functions
 * ({@code idf}, {@code tf}, {@code norm}) compared against values computed
 * directly from a {@link TFIDFSimilarity}, {@code ord}/{@code rord},
 * millisecond date arithmetic via {@code ms}, {@code query()} weighting,
 * {@code frange}, parameter dereferencing, multi-value parsing, rejection of
 * trailing garbage after a function, and sorting by function.
 *
 * <p>The statement order matters: later assertions depend on the documents
 * added and committed by earlier statements.
 *
 * @throws Exception if any indexing or query helper fails
 */
@Test
public void testGeneral() throws Exception {
    try {
        clearIndex();

        assertU(adoc("id", "1", "a_tdt", "2009-08-31T12:10:10.123Z", "b_tdt", "2009-08-31T12:10:10.124Z"));
        assertU(adoc("id", "2", "a_t", "how now brown cow"));
        assertU(commit()); // create more than one segment
        assertU(adoc("id", "3", "a_t", "brown cow"));
        assertU(adoc("id", "4"));
        assertU(commit()); // create more than one segment
        assertU(adoc("id", "5"));
        assertU(adoc("id", "6", "a_t", "cow cow cow cow cow"));
        assertU(commit());

        // test relevancy functions
        assertQ(req("fl", "*,score", "q", "{!func}numdocs()", "fq", "id:6"),
                "//float[@name='score']='6.0'");
        assertQ(req("fl", "*,score", "q", "{!func}maxdoc()", "fq", "id:6"),
                "//float[@name='score']='6.0'");
        assertQ(req("fl", "*,score", "q", "{!func}docfreq(a_t,cow)", "fq", "id:6"),
                "//float[@name='score']='3.0'");
        assertQ(req("fl", "*,score", "q", "{!func}docfreq('a_t','cow')", "fq", "id:6"),
                "//float[@name='score']='3.0'");
        assertQ(req("fl", "*,score", "q", "{!func}docfreq($field,$value)", "fq", "id:6", "field", "a_t",
                "value", "cow"), "//float[@name='score']='3.0'");
        assertQ(req("fl", "*,score", "q", "{!func}termfreq(a_t,cow)", "fq", "id:6"),
                "//float[@name='score']='5.0'");

        // Expected idf/tf/norm values are computed from this similarity rather than hard-coded.
        TFIDFSimilarity similarity = new DefaultSimilarity();

        // make sure it doesn't get a NPE if no terms are present in a field.
        assertQ(req("fl", "*,score", "q", "{!func}termfreq(nofield_t,cow)", "fq", "id:6"),
                "//float[@name='score']='0.0'");
        assertQ(req("fl", "*,score", "q", "{!func}docfreq(nofield_t,cow)", "fq", "id:6"),
                "//float[@name='score']='0.0'");
        assertQ(req("fl", "*,score", "q", "{!func}idf(nofield_t,cow)", "fq", "id:6"),
                "//float[@name='score']='" + similarity.idf(0, 6) + "'");
        assertQ(req("fl", "*,score", "q", "{!func}tf(nofield_t,cow)", "fq", "id:6"),
                "//float[@name='score']='" + similarity.tf(0) + "'");

        // "cow" occurs in 3 of the 6 documents and 5 times in document 6.
        assertQ(req("fl", "*,score", "q", "{!func}idf(a_t,cow)", "fq", "id:6"),
                "//float[@name='score']='" + similarity.idf(3, 6) + "'");
        assertQ(req("fl", "*,score", "q", "{!func}tf(a_t,cow)", "fq", "id:6"),
                "//float[@name='score']='" + similarity.tf(5) + "'");

        // Rebuild the norm for a 4-term, unboosted field (document 2: "how now brown cow").
        FieldInvertState state = new FieldInvertState("a_t");
        state.setBoost(1.0f);
        state.setLength(4);
        long norm = similarity.computeNorm(state);
        float nrm = similarity.decodeNormValue((byte) norm);
        assertQ(req("fl", "*,score", "q", "{!func}norm(a_t)", "fq", "id:2"),
                "//float[@name='score']='" + nrm + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte

        // test that ord and rord are working on a global index basis, not just
        // at the segment level (since Lucene 2.9 has switched to per-segment searching)
        assertQ(req("fl", "*,score", "q", "{!func}ord(id)", "fq", "id:6"),
                "//float[@name='score']='5.0'");
        assertQ(req("fl", "*,score", "q", "{!func}top(ord(id))", "fq", "id:6"),
                "//float[@name='score']='5.0'");
        assertQ(req("fl", "*,score", "q", "{!func}rord(id)", "fq", "id:1"),
                "//float[@name='score']='5.0'");
        assertQ(req("fl", "*,score", "q", "{!func}top(rord(id))", "fq", "id:1"),
                "//float[@name='score']='5.0'");

        // test that we can subtract dates to millisecond precision
        assertQ(req("fl", "*,score", "q", "{!func}ms(a_tdt,b_tdt)", "fq", "id:1"),
                "//float[@name='score']='-1.0'");
        assertQ(req("fl", "*,score", "q", "{!func}ms(b_tdt,a_tdt)", "fq", "id:1"),
                "//float[@name='score']='1.0'");
        assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.125Z,2009-08-31T12:10:10.124Z)", "fq",
                "id:1"), "//float[@name='score']='1.0'");
        assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.124Z,a_tdt)", "fq", "id:1"),
                "//float[@name='score']='1.0'");
        assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.125Z,b_tdt)", "fq", "id:1"),
                "//float[@name='score']='1.0'");

        // Both operands are rounded down to the same second, so the difference is zero.
        assertQ(req("fl", "*,score", "q",
                "{!func}ms(2009-08-31T12:10:10.125Z/SECOND,2009-08-31T12:10:10.124Z/SECOND)", "fq", "id:1"),
                "//float[@name='score']='0.0'");

        // test that we can specify "NOW"
        assertQ(req("fl", "*,score", "q", "{!func}ms(NOW)", "NOW", "1000"),
                "//float[@name='score']='1000.0'");

        // Twelve "batman" documents in one segment ...
        for (int i = 100; i < 112; i++) {
            assertU(adoc("id", "" + i, "text", "batman"));
        }
        assertU(commit());
        assertU(adoc("id", "120", "text", "batman superman")); // in a smaller segment
        assertU(adoc("id", "121", "text", "superman"));
        assertU(commit());

        // superman has a higher df (thus lower idf) in one segment, but reversed in the complete index
        String q = "{!func}query($qq)";
        String fq = "id:120";
        assertQ(req("fl", "*,score", "q", q, "qq", "text:batman", "fq", fq),
                "//float[@name='score']<'1.0'");
        assertQ(req("fl", "*,score", "q", q, "qq", "text:superman", "fq", fq),
                "//float[@name='score']>'1.0'");

        // test weighting through a function range query
        assertQ(req("fl", "*,score", "fq", fq, "q", "{!frange l=1 u=10}query($qq)", "qq", "text:superman"),
                "//*[@numFound='1']");

        // test weighting through a complex function
        q = "{!func}sub(div(sum(0.0,product(1,query($qq))),1),0)";
        assertQ(req("fl", "*,score", "q", q, "qq", "text:batman", "fq", fq),
                "//float[@name='score']<'1.0'");
        assertQ(req("fl", "*,score", "q", q, "qq", "text:superman", "fq", fq),
                "//float[@name='score']>'1.0'");

        // test full param dereferencing
        assertQ(req("fl", "*,score", "q", "{!func}add($v1,$v2)", "v1", "add($v3,$v4)", "v2", "1", "v3", "2",
                "v4", "5", "fq", "id:1"), "//float[@name='score']='8.0'");

        // test ability to parse multiple values
        assertQ(req("fl", "*,score", "q", "{!func}dist(2,vector(1,1),$pt)", "pt", "3,1", "fq", "id:1"),
                "//float[@name='score']='2.0'");

        // test that extra stuff after a function causes an error
        try {
            assertQ(req("fl", "*,score", "q", "{!func}10 wow dude ignore_exception"));
            // NOTE(review): relies on fail() signalling with something that is not an
            // Exception subclass, so the catch below does not swallow it — confirm
            // against the test framework in use.
            fail("expected an error for trailing text after a function");
        } catch (Exception expected) {
            // OK: the malformed function query was rejected, which is what this case checks.
        }

        // test that sorting by function weights correctly.  superman should sort higher than batman due to idf of the whole index

        assertQ(req("q", "*:*", "fq", "id:120 OR id:121", "sort", "{!func v=$sortfunc} desc", "sortfunc",
                "query($qq)", "qq", "text:(batman OR superman)"), "*//doc[1]/float[.='120.0']",
                "*//doc[2]/float[.='121.0']");
    } finally {
        // Purge unconditionally so a failing assertion above does not leave
        // cache entries behind for whatever runs next.
        FieldCache.DEFAULT.purgeAllCaches(); // avoid FC insanity
    }
}