Follow rank algorithm altered
authorAdam Dickmeiss <adam@indexdata.dk>
Tue, 25 Sep 2012 10:03:36 +0000 (12:03 +0200)
committerAdam Dickmeiss <adam@indexdata.dk>
Tue, 25 Sep 2012 10:03:36 +0000 (12:03 +0200)
mult += mult * follow / (1 + log2(d))
where d is the distance. follow is a float (factor).

src/pazpar2_config.c
src/pazpar2_config.h
src/relevance.c
src/relevance.h
test/test_http_81.res

index b45d825..052b457 100644 (file)
@@ -134,7 +134,7 @@ struct conf_service *service_init(struct conf_server *server,
     service->z3950_operation_timeout = 30;
     service->rank_cluster = 1;
     service->rank_debug = 0;
-    service->rank_follow = 0;
+    service->rank_follow = 0.0;
     service->rank_lead = 0.0;
     service->rank_length = 2;
 
@@ -654,7 +654,7 @@ static struct conf_service *service_create_static(struct conf_server *server,
             }
             if (rank_follow)
             {
-                service->rank_follow = atoi(rank_follow);
+                service->rank_follow = atof(rank_follow);
             }
             if (rank_lead)
             {
index 86900fc..11cec52 100644 (file)
@@ -118,7 +118,7 @@ struct conf_service
     int z3950_operation_timeout;
     int rank_cluster;
     int rank_debug;
-    int rank_follow;
+    double rank_follow;
     double rank_lead;
     int rank_length;
     char *default_sort;
index 10e8cc4..51d33ac 100644 (file)
@@ -32,11 +32,12 @@ struct relevance
 {
     int *doc_frequency_vec;
     int *term_frequency_vec_tmp;
+    int *term_pos;
     int vec_len;
     struct word_entry *entries;
     pp2_charset_token_t prt;
     int rank_cluster;
-    int follow_boost;
+    double follow_factor;
     double lead_decay;
     int length_divide;
     NMEM nmem;
@@ -46,7 +47,6 @@ struct word_entry {
     const char *norm_str;
     const char *display_str;
     int termno;
-    int follow_boost;
     char *ccl_field;
     struct word_entry *next;
 };
@@ -61,8 +61,6 @@ static struct word_entry *word_entry_match(struct relevance *r,
     {
         if (*norm_str && !strcmp(norm_str, entries->norm_str))
         {
-            int extra = r->follow_boost;
-            struct word_entry *e_follow = entries;
             const char *cp = 0;
             int no_read = 0;
             sscanf(rank, "%d%n", mult, &no_read);
@@ -75,14 +73,8 @@ static struct word_entry *word_entry_match(struct relevance *r,
                     memcmp(entries->ccl_field, rank, cp - rank) == 0)
                     *mult = atoi(cp + 1);
             }
-            (*mult) += entries->follow_boost;
-            while ((e_follow = e_follow->next) != 0 && extra > 0)
-            {
-                e_follow->follow_boost = extra--;
-            }
             return entries;
         }
-        entries->follow_boost = 0;
     }
     return 0;
 }
@@ -102,7 +94,7 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
     {
         mult[i] = 0;
-        e->follow_boost = 0;
+        r->term_pos[i] = 0;
     }
 
     assert(rank);
@@ -113,9 +105,25 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
         if (e)
         {
             int res = e->termno;
+            int j;
+
             assert(res < r->vec_len);
             mult[res] += local_mult / (1 + log2(1 + lead_decay * length));
-            wrbuf_printf(w, "%s: mult[%d] += local_mult(%d) / (1+log2(1+lead_decay(%f) * length(%d)));\n", e->display_str, res, local_mult, lead_decay, length);
+            wrbuf_printf(w, "%s: mult[%d] += local_mult(%d) / "
+                         "(1+log2(1+lead_decay(%f) * length(%d)));\n",
+                         e->display_str, res, local_mult, lead_decay, length);
+            j = res - 1;
+            if (j > 0 && r->term_pos[j])
+            {
+                int d = length + 1 - r->term_pos[j];
+                mult[res] += mult[res] * r->follow_factor / (1 + log2(d));
+                wrbuf_printf(w, "%s: mult[%d] += mult[%d](%d) * follow(%f) / "
+                             "(1+log2(d(%d));\n",
+                             e->display_str, res, res, mult[res],
+                             r->follow_factor, d);
+            }
+            for (j = 0; j < r->vec_len; j++)
+                r->term_pos[j] = j < res ? 0 : length + 1;
         }
         length++;
     }
@@ -194,7 +202,7 @@ static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
                                        struct ccl_rpn_node *query,
                                        int rank_cluster,
-                                       int follow_boost, double lead_decay,
+                                       double follow_factor, double lead_decay,
                                        int length_divide)
 {
     NMEM nmem = nmem_create();
@@ -205,7 +213,7 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
     res->entries = 0;
     res->vec_len = 1;
     res->rank_cluster = rank_cluster;
-    res->follow_boost = follow_boost;
+    res->follow_factor = follow_factor;
     res->lead_decay = lead_decay;
     res->length_divide = length_divide;
     res->prt = pp2_charset_token_create(pft, "relevance");
@@ -220,6 +228,10 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
     res->term_frequency_vec_tmp =
         nmem_malloc(res->nmem,
                     res->vec_len * sizeof(*res->term_frequency_vec_tmp));
+
+    res->term_pos =
+        nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
+
     return res;
 }
 
index 8b868bc..125f50e 100644 (file)
@@ -30,7 +30,7 @@ struct reclist;
 
 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
                                        struct ccl_rpn_node *query,
-                                       int rank_cluster, int follow_boost,
+                                       int rank_cluster, double follow_factor,
                                        double lead_decay, int length_divide);
 void relevance_destroy(struct relevance **rp);
 void relevance_newrec(struct relevance *r, struct record_cluster *cluster);
index 0e5f17f..e17bbdf 100644 (file)
@@ -32,6 +32,7 @@
  <relevance_info>
 teachers: mult[1] += local_mult(6) / (1+log2(1+lead_decay(0.000000) * length(2)));
 greece: mult[2] += local_mult(6) / (1+log2(1+lead_decay(0.000000) * length(4)));
+greece: mult[2] += mult[2](6) * follow(0.000000) / (1+log2(d(2));
 teachers: field=title vecf[1] += mult[1](6) / length(5);
 greece: field=title vecf[2] += mult[2](6) / length(5);
 greece: mult[2] += local_mult(3) / (1+log2(1+lead_decay(0.000000) * length(0)));