List of usage examples for org.apache.lucene.search.similarities.TFIDFSimilarity#idf
/**
 * Computes the inverse-document-frequency factor from a document frequency
 * and a document count.
 *
 * @param docFreq  presumably the number of documents containing the term (the
 *                 example below passes {@code TermsEnum.docFreq()}) — confirm
 *                 against the declaring class
 * @param docCount presumably the total number of documents considered (the
 *                 example below passes {@code IndexReader.numDocs()}) — confirm
 *                 against the declaring class
 * @return the IDF value as a float
 */
public abstract float idf(long docFreq, long docCount);
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java
License:Open Source License
/** * uses custom similarity to compute idf, use this if you want to implement * IDF(numDocs,docFreq)/*from w w w . j a v a 2s . com*/ * * @param reader * @param field * @param tfidfSIM * @return * @throws IOException */ public static Map<String, Float> getIdfs(IndexReader reader, String field, TFIDFSimilarity tfidfSIM) throws IOException { Map<String, Float> docFrequencies = new HashMap<>(); TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator(); BytesRef bytesRef; while ((bytesRef = termEnum.next()) != null) { if (termEnum.seekExact(bytesRef)) { String term = bytesRef.utf8ToString(); float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs()); docFrequencies.put(term, idf); } } return docFrequencies; }
From source file:org.apache.solr.search.function.TestFunctionQuery.java
License:Apache License
@Test public void testGeneral() throws Exception { clearIndex();//from ww w .jav a 2 s .c o m assertU(adoc("id", "1", "a_tdt", "2009-08-31T12:10:10.123Z", "b_tdt", "2009-08-31T12:10:10.124Z")); assertU(adoc("id", "2", "a_t", "how now brown cow")); assertU(commit()); // create more than one segment assertU(adoc("id", "3", "a_t", "brown cow")); assertU(adoc("id", "4")); assertU(commit()); // create more than one segment assertU(adoc("id", "5")); assertU(adoc("id", "6", "a_t", "cow cow cow cow cow")); assertU(commit()); // test relevancy functions assertQ(req("fl", "*,score", "q", "{!func}numdocs()", "fq", "id:6"), "//float[@name='score']='6.0'"); assertQ(req("fl", "*,score", "q", "{!func}maxdoc()", "fq", "id:6"), "//float[@name='score']='6.0'"); assertQ(req("fl", "*,score", "q", "{!func}docfreq(a_t,cow)", "fq", "id:6"), "//float[@name='score']='3.0'"); assertQ(req("fl", "*,score", "q", "{!func}docfreq('a_t','cow')", "fq", "id:6"), "//float[@name='score']='3.0'"); assertQ(req("fl", "*,score", "q", "{!func}docfreq($field,$value)", "fq", "id:6", "field", "a_t", "value", "cow"), "//float[@name='score']='3.0'"); assertQ(req("fl", "*,score", "q", "{!func}termfreq(a_t,cow)", "fq", "id:6"), "//float[@name='score']='5.0'"); TFIDFSimilarity similarity = new DefaultSimilarity(); // make sure it doesn't get a NPE if no terms are present in a field. 
assertQ(req("fl", "*,score", "q", "{!func}termfreq(nofield_t,cow)", "fq", "id:6"), "//float[@name='score']='0.0'"); assertQ(req("fl", "*,score", "q", "{!func}docfreq(nofield_t,cow)", "fq", "id:6"), "//float[@name='score']='0.0'"); assertQ(req("fl", "*,score", "q", "{!func}idf(nofield_t,cow)", "fq", "id:6"), "//float[@name='score']='" + similarity.idf(0, 6) + "'"); assertQ(req("fl", "*,score", "q", "{!func}tf(nofield_t,cow)", "fq", "id:6"), "//float[@name='score']='" + similarity.tf(0) + "'"); assertQ(req("fl", "*,score", "q", "{!func}idf(a_t,cow)", "fq", "id:6"), "//float[@name='score']='" + similarity.idf(3, 6) + "'"); assertQ(req("fl", "*,score", "q", "{!func}tf(a_t,cow)", "fq", "id:6"), "//float[@name='score']='" + similarity.tf(5) + "'"); FieldInvertState state = new FieldInvertState("a_t"); state.setBoost(1.0f); state.setLength(4); long norm = similarity.computeNorm(state); float nrm = similarity.decodeNormValue((byte) norm); assertQ(req("fl", "*,score", "q", "{!func}norm(a_t)", "fq", "id:2"), "//float[@name='score']='" + nrm + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte // test that ord and rord are working on a global index basis, not just // at the segment level (since Lucene 2.9 has switched to per-segment searching) assertQ(req("fl", "*,score", "q", "{!func}ord(id)", "fq", "id:6"), "//float[@name='score']='5.0'"); assertQ(req("fl", "*,score", "q", "{!func}top(ord(id))", "fq", "id:6"), "//float[@name='score']='5.0'"); assertQ(req("fl", "*,score", "q", "{!func}rord(id)", "fq", "id:1"), "//float[@name='score']='5.0'"); assertQ(req("fl", "*,score", "q", "{!func}top(rord(id))", "fq", "id:1"), "//float[@name='score']='5.0'"); // test that we can subtract dates to millisecond precision assertQ(req("fl", "*,score", "q", "{!func}ms(a_tdt,b_tdt)", "fq", "id:1"), "//float[@name='score']='-1.0'"); assertQ(req("fl", "*,score", "q", "{!func}ms(b_tdt,a_tdt)", "fq", "id:1"), "//float[@name='score']='1.0'"); assertQ(req("fl", "*,score", "q", 
"{!func}ms(2009-08-31T12:10:10.125Z,2009-08-31T12:10:10.124Z)", "fq", "id:1"), "//float[@name='score']='1.0'"); assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.124Z,a_tdt)", "fq", "id:1"), "//float[@name='score']='1.0'"); assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.125Z,b_tdt)", "fq", "id:1"), "//float[@name='score']='1.0'"); assertQ(req("fl", "*,score", "q", "{!func}ms(2009-08-31T12:10:10.125Z/SECOND,2009-08-31T12:10:10.124Z/SECOND)", "fq", "id:1"), "//float[@name='score']='0.0'"); // test that we can specify "NOW" assertQ(req("fl", "*,score", "q", "{!func}ms(NOW)", "NOW", "1000"), "//float[@name='score']='1000.0'"); for (int i = 100; i < 112; i++) { assertU(adoc("id", "" + i, "text", "batman")); } assertU(commit()); assertU(adoc("id", "120", "text", "batman superman")); // in a smaller segment assertU(adoc("id", "121", "text", "superman")); assertU(commit()); // superman has a higher df (thus lower idf) in one segment, but reversed in the complete index String q = "{!func}query($qq)"; String fq = "id:120"; assertQ(req("fl", "*,score", "q", q, "qq", "text:batman", "fq", fq), "//float[@name='score']<'1.0'"); assertQ(req("fl", "*,score", "q", q, "qq", "text:superman", "fq", fq), "//float[@name='score']>'1.0'"); // test weighting through a function range query assertQ(req("fl", "*,score", "fq", fq, "q", "{!frange l=1 u=10}query($qq)", "qq", "text:superman"), "//*[@numFound='1']"); // test weighting through a complex function q = "{!func}sub(div(sum(0.0,product(1,query($qq))),1),0)"; assertQ(req("fl", "*,score", "q", q, "qq", "text:batman", "fq", fq), "//float[@name='score']<'1.0'"); assertQ(req("fl", "*,score", "q", q, "qq", "text:superman", "fq", fq), "//float[@name='score']>'1.0'"); // test full param dereferencing assertQ(req("fl", "*,score", "q", "{!func}add($v1,$v2)", "v1", "add($v3,$v4)", "v2", "1", "v3", "2", "v4", "5", "fq", "id:1"), "//float[@name='score']='8.0'"); // test ability to parse multiple values 
assertQ(req("fl", "*,score", "q", "{!func}dist(2,vector(1,1),$pt)", "pt", "3,1", "fq", "id:1"), "//float[@name='score']='2.0'"); // test that extra stuff after a function causes an error try { assertQ(req("fl", "*,score", "q", "{!func}10 wow dude ignore_exception")); fail(); } catch (Exception e) { // OK } // test that sorting by function weights correctly. superman should sort higher than batman due to idf of the whole index assertQ(req("q", "*:*", "fq", "id:120 OR id:121", "sort", "{!func v=$sortfunc} desc", "sortfunc", "query($qq)", "qq", "text:(batman OR superman)"), "*//doc[1]/float[.='120.0']", "*//doc[2]/float[.='121.0']"); FieldCache.DEFAULT.purgeAllCaches(); // avoid FC insanity }