From 446f32183265d59ee79e2859376c598fa24408e0 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 24 Sep 2012 12:48:11 +0200 Subject: [PATCH] New rank tweaks: follow, lead, length Rank tweak: follow=number will increase mult by number if two terms occur next to each other; number-1 if they are one term apart, ... 0 if they are number apart (all in order). Default is 0 (following terms have no effect). Rank tweak: lead=number will increase mult by number if term is first term in field, number-1 if second, ... 0 if term is at offset number or more. Default value is 0 (position irrelevant). Rank tweak: length=strategy. length="linear" if mult is to be divided by length (existing, default behavior), length="log" if mult is to be divided by log2(1+length), length="none" if mult is not to be affected by length. --- NEWS | 15 ++++++++++ src/client.c | 5 +++- src/pazpar2_config.c | 31 +++++++++++++++++++++ src/pazpar2_config.h | 3 ++ src/record.h | 1 - src/relevance.c | 74 ++++++++++++++++++++++++++++++++++++++------------ src/relevance.h | 3 +- 7 files changed, 111 insertions(+), 21 deletions(-) diff --git a/NEWS b/NEWS index d9165e8..0611b78 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,18 @@ + +Rank tweak: follow=number will increase mult by number if two terms +occur next to each other; number-1 if they are one term apart, ... 0 +if they are number apart (all in order). Default is 0 (following +terms have no effect). + +Rank tweak: lead=number will increase mult by number if term is first +term in field, number-1 if second, ... 0 if term is at offset +number or more. Default value is 0 (position irrelevant). + +Rank tweak: length=strategy. length="linear" if mult is to be divided +by length (existing, default behavior), length="log" if mult is to be +divided by log2(1+length), length="none" if mult is not to be affected +by length. 
+ --- 1.6.20 2012/09/21 Rank algorithm details may be printed as part of show reseponse in diff --git a/src/client.c b/src/client.c index b85195f..00d92ad 100644 --- a/src/client.c +++ b/src/client.c @@ -1311,7 +1311,10 @@ int client_parse_query(struct client *cl, const char *query, { // Initialize relevance structure with query terms se->relevance = relevance_create_ccl(se->service->charsets, cn, - se->service->rank_cluster); + se->service->rank_cluster, + se->service->rank_follow, + se->service->rank_lead, + se->service->rank_length); } ccl_rpn_delete(cn); return ret_value; diff --git a/src/pazpar2_config.c b/src/pazpar2_config.c index 00e36a6..73176b8 100644 --- a/src/pazpar2_config.c +++ b/src/pazpar2_config.c @@ -134,6 +134,9 @@ struct conf_service *service_init(struct conf_server *server, service->z3950_operation_timeout = 30; service->rank_cluster = 1; service->rank_debug = 0; + service->rank_follow = 0; + service->rank_lead = 0; + service->rank_length = 2; service->charsets = 0; @@ -622,6 +625,9 @@ static struct conf_service *service_create_static(struct conf_server *server, { char *rank_cluster = (char *) xmlGetProp(n, (xmlChar *) "cluster"); char *rank_debug = (char *) xmlGetProp(n, (xmlChar *) "debug"); + char *rank_follow = (char *) xmlGetProp(n, (xmlChar *) "follow"); + char *rank_lead = (char *) xmlGetProp(n, (xmlChar *) "lead"); + char *rank_length= (char *) xmlGetProp(n, (xmlChar *) "length"); if (rank_cluster) { if (!strcmp(rank_cluster, "yes")) @@ -646,8 +652,33 @@ static struct conf_service *service_create_static(struct conf_server *server, return 0; } } + if (rank_follow) + { + service->rank_follow = atoi(rank_follow); + } + if (rank_lead) + { + service->rank_lead = atoi(rank_lead); + } + if (rank_length) + { + if (!strcmp(rank_length, "linear")) + service->rank_length = 2; + else if (!strcmp(rank_length, "log")) + service->rank_length = 1; + else if (!strcmp(rank_length, "none")) + service->rank_length = 0; + else + { + yaz_log(YLOG_FATAL, 
"service: rank@length linear|log|none"); + return 0; + } + } xmlFree(rank_cluster); xmlFree(rank_debug); + xmlFree(rank_follow); + xmlFree(rank_lead); + xmlFree(rank_length); } else if (!strcmp((const char *) n->name, "sort-default")) { diff --git a/src/pazpar2_config.h b/src/pazpar2_config.h index 8f80f39..af416a7 100644 --- a/src/pazpar2_config.h +++ b/src/pazpar2_config.h @@ -118,6 +118,9 @@ struct conf_service int z3950_operation_timeout; int rank_cluster; int rank_debug; + int rank_follow; + int rank_lead; + int rank_length; char *default_sort; int ref_count; diff --git a/src/record.h b/src/record.h index a5afde1..fb6f8e1 100644 --- a/src/record.h +++ b/src/record.h @@ -84,7 +84,6 @@ struct record_cluster char *merge_key; int relevance_score; int *term_frequency_vec; - int *term_frequency_vec_tmp; float *term_frequency_vecf; // Set-specific ID for this record char *recid; diff --git a/src/relevance.c b/src/relevance.c index 3755b09..a002a7b 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -31,10 +31,14 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA struct relevance { int *doc_frequency_vec; + int *term_frequency_vec_tmp; int vec_len; struct word_entry *entries; pp2_charset_token_t prt; int rank_cluster; + int follow_boost; + int lead_boost; + int length_divide; NMEM nmem; }; @@ -42,17 +46,22 @@ struct word_entry { const char *norm_str; const char *display_str; int termno; + int follow_boost; char *ccl_field; struct word_entry *next; }; -static int word_entry_match(struct word_entry *entries, const char *norm_str, +static int word_entry_match(struct relevance *r, const char *norm_str, const char *rank, int *mult) { - for (; entries; entries = entries->next) + int i = 1; + struct word_entry *entries = r->entries; + for (; entries; entries = entries->next, i++) { if (*norm_str && !strcmp(norm_str, entries->norm_str)) { + int extra = r->follow_boost; + struct word_entry *e_follow = entries; const char *cp = 0; int no_read = 0; 
sscanf(rank, "%d%n", mult, &no_read); @@ -65,8 +74,14 @@ static int word_entry_match(struct word_entry *entries, const char *norm_str, memcmp(entries->ccl_field, rank, cp - rank) == 0) *mult = atoi(cp + 1); } + (*mult) += entries->follow_boost; + while ((e_follow = e_follow->next) != 0 && extra > 0) + { + e_follow->follow_boost = extra--; + } return entries->termno; } + entries->follow_boost = 0; } return 0; } @@ -75,39 +90,57 @@ void relevance_countwords(struct relevance *r, struct record_cluster *cluster, const char *words, const char *rank, const char *name) { - int *mult = cluster->term_frequency_vec_tmp; + int *mult = r->term_frequency_vec_tmp; const char *norm_str; int i, length = 0; - struct word_entry *e = r->entries; + int lead_mult = r->lead_boost; + struct word_entry *e; WRBUF w = cluster->relevance_explain1; pp2_charset_token_first(r->prt, words, 0); - for (i = 1; i < r->vec_len; i++) + for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next) + { mult[i] = 0; + e->follow_boost = 0; + } assert(rank); while ((norm_str = pp2_charset_token_next(r->prt))) { int local_mult = 0; - int res = word_entry_match(r->entries, norm_str, rank, &local_mult); + int res = word_entry_match(r, norm_str, rank, &local_mult); if (res) { assert(res < r->vec_len); - mult[res] += local_mult; + mult[res] += local_mult + lead_mult; } + if (lead_mult > 0) + --lead_mult; length++; } - for (i = 1; i < r->vec_len; i++) + for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next) { - if (length > 0 && mult[i] > 0) /* only add if non-empty */ + if (length == 0 || mult[i] == 0) + continue; + wrbuf_printf(w, "%s: field=%s vecf[%d] += mult(%d)", + e->display_str, name, i, mult[i]); + switch (r->length_divide) { - wrbuf_printf(w, "%s: field=%s vecf[%d] += mult(%d) / length(%d);\n", - e->display_str, name, i, mult[i], length); + case 0: + wrbuf_printf(w, ";\n"); + cluster->term_frequency_vecf[i] += (double) mult[i]; + break; + case 1: + wrbuf_printf(w, " / log2(1+length(%d));\n", 
length); + cluster->term_frequency_vecf[i] += + (double) mult[i] / log2(1 + length); + break; + case 2: + wrbuf_printf(w, " / length(%d);\n", length); cluster->term_frequency_vecf[i] += (double) mult[i] / length; } cluster->term_frequency_vec[i] += mult[i]; - e = e->next; } cluster->term_frequency_vec[0] += length; @@ -159,7 +192,9 @@ static void pull_terms(struct relevance *res, struct ccl_rpn_node *n) struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, struct ccl_rpn_node *query, - int rank_cluster) + int rank_cluster, + int follow_boost, int lead_boost, + int length_divide) { NMEM nmem = nmem_create(); struct relevance *res = nmem_malloc(nmem, sizeof(*res)); @@ -169,6 +204,9 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, res->entries = 0; res->vec_len = 1; res->rank_cluster = rank_cluster; + res->follow_boost = follow_boost; + res->lead_boost = lead_boost; + res->length_divide = length_divide; res->prt = pp2_charset_token_create(pft, "relevance"); pull_terms(res, query); @@ -176,6 +214,11 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int)); for (i = 0; i < res->vec_len; i++) res->doc_frequency_vec[i] = 0; + + // worker array + res->term_frequency_vec_tmp = + nmem_malloc(res->nmem, + res->vec_len * sizeof(*res->term_frequency_vec_tmp)); return res; } @@ -208,11 +251,6 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec) r->vec_len * sizeof(*rec->term_frequency_vecf)); for (i = 0; i < r->vec_len; i++) rec->term_frequency_vecf[i] = 0.0; - - // for relevance_countwords (so we don't have to xmalloc/xfree) - rec->term_frequency_vec_tmp = - nmem_malloc(r->nmem, - r->vec_len * sizeof(*rec->term_frequency_vec_tmp)); } } diff --git a/src/relevance.h b/src/relevance.h index c2ffa4a..f585899 100644 --- a/src/relevance.h +++ b/src/relevance.h @@ -30,7 +30,8 @@ struct reclist; struct relevance *relevance_create_ccl(pp2_charset_fact_t pft, 
struct ccl_rpn_node *query, - int rank_cluster); + int rank_cluster, int follow_boost, + int lead_boost, int length_divide); void relevance_destroy(struct relevance **rp); void relevance_newrec(struct relevance *r, struct record_cluster *cluster); void relevance_countwords(struct relevance *r, struct record_cluster *cluster, -- 1.7.10.4