Change semantics of rank lead.
author Adam Dickmeiss <adam@indexdata.dk>
Mon, 24 Sep 2012 17:28:54 +0000 (19:28 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Mon, 24 Sep 2012 17:28:54 +0000 (19:28 +0200)
NEWS
src/pazpar2_config.c
src/pazpar2_config.h
src/relevance.c
src/relevance.h

diff --git a/NEWS b/NEWS
index 0611b78..a73f2f9 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -4,9 +4,10 @@ occur next to each other; number-1 if they are one term apart , .. 0
 if they are number a part (all in order). Default is 0 (following
 terms has no effect).
 
-Rank tweak: lead=number will increase mult by number if term is first
-term in field, number-1 if second, ... 0 if term is at offset
-number of more. Default value is 0 (position irrelevant).
+Rank tweak: lead=k will divide mult by 1 + log2(1+k*l), where k is the
+value given by lead and l is the offset from the beginning of the field
+at which the term occurs (l=0 for the first term, l=1 for the second
+term, ...). The default value of k is 0.0 (position irrelevant).
 
 Rank tweak: length=strategy. length="linear" if mult is to be divided
 by length (existing, default behavior), length="log" if mult is to be 
index 73176b8..b45d825 100644 (file)
@@ -135,7 +135,7 @@ struct conf_service *service_init(struct conf_server *server,
     service->rank_cluster = 1;
     service->rank_debug = 0;
     service->rank_follow = 0;
-    service->rank_lead = 0;
+    service->rank_lead = 0.0;
     service->rank_length = 2;
 
     service->charsets = 0;
@@ -658,7 +658,7 @@ static struct conf_service *service_create_static(struct conf_server *server,
             }
             if (rank_lead)
             {
-                service->rank_lead = atoi(rank_lead);
+                service->rank_lead = atof(rank_lead);
             }
             if (rank_length)
             {
index af416a7..86900fc 100644 (file)
@@ -119,7 +119,7 @@ struct conf_service
     int rank_cluster;
     int rank_debug;
     int rank_follow;
-    int rank_lead;
+    double rank_lead;
     int rank_length;
     char *default_sort;
 
index a002a7b..10e8cc4 100644 (file)
@@ -37,7 +37,7 @@ struct relevance
     pp2_charset_token_t prt;
     int rank_cluster;
     int follow_boost;
-    int lead_boost;
+    double lead_decay;
     int length_divide;
     NMEM nmem;
 };
@@ -51,8 +51,9 @@ struct word_entry {
     struct word_entry *next;
 };
 
-static int word_entry_match(struct relevance *r, const char *norm_str,
-                            const char *rank, int *mult)
+static struct word_entry *word_entry_match(struct relevance *r,
+                                           const char *norm_str,
+                                           const char *rank, int *mult)
 {
     int i = 1;
     struct word_entry *entries = r->entries;
@@ -79,7 +80,7 @@ static int word_entry_match(struct relevance *r, const char *norm_str,
             {
                 e_follow->follow_boost = extra--;
             }
-            return entries->termno;
+            return entries;
         }
         entries->follow_boost = 0;
     }
@@ -93,7 +94,7 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
     int *mult = r->term_frequency_vec_tmp;
     const char *norm_str;
     int i, length = 0;
-    int lead_mult = r->lead_boost;
+    double lead_decay = r->lead_decay;
     struct word_entry *e;
     WRBUF w = cluster->relevance_explain1;
 
@@ -108,14 +109,14 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
     while ((norm_str = pp2_charset_token_next(r->prt)))
     {
         int local_mult = 0;
-        int res = word_entry_match(r, norm_str, rank, &local_mult);
-        if (res)
+        e = word_entry_match(r, norm_str, rank, &local_mult);
+        if (e)
         {
+            int res = e->termno;
             assert(res < r->vec_len);
-            mult[res] += local_mult + lead_mult;
+            mult[res] += local_mult / (1 + log2(1 + lead_decay * length));
+            wrbuf_printf(w, "%s: mult[%d] += local_mult(%d) / (1+log2(1+lead_decay(%f) * length(%d)));\n", e->display_str, res, local_mult, lead_decay, length);
         }
-        if (lead_mult > 0)
-            --lead_mult;
         length++;
     }
 
@@ -123,8 +124,8 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
     {
         if (length == 0 || mult[i] == 0)
             continue;
-        wrbuf_printf(w, "%s: field=%s vecf[%d] += mult(%d)",
-                     e->display_str, name, i, mult[i]);
+        wrbuf_printf(w, "%s: field=%s vecf[%d] += mult[%d](%d)",
+                     e->display_str, name, i, i, mult[i]);
         switch (r->length_divide)
         {
         case 0:
@@ -193,7 +194,7 @@ static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
                                        struct ccl_rpn_node *query,
                                        int rank_cluster,
-                                       int follow_boost, int lead_boost,
+                                       int follow_boost, double lead_decay,
                                        int length_divide)
 {
     NMEM nmem = nmem_create();
@@ -205,7 +206,7 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
     res->vec_len = 1;
     res->rank_cluster = rank_cluster;
     res->follow_boost = follow_boost;
-    res->lead_boost = lead_boost;
+    res->lead_decay = lead_decay;
     res->length_divide = length_divide;
     res->prt = pp2_charset_token_create(pft, "relevance");
 
index f585899..8b868bc 100644 (file)
@@ -31,7 +31,7 @@ struct reclist;
 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
                                        struct ccl_rpn_node *query,
                                        int rank_cluster, int follow_boost,
-                                       int lead_boost, int length_divide);
+                                       double lead_decay, int length_divide);
 void relevance_destroy(struct relevance **rp);
 void relevance_newrec(struct relevance *r, struct record_cluster *cluster);
 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,