diff --git a/src/relevance.c b/src/relevance.c
index 4d5b6e4..a4d2c4d 100644
--- a/src/relevance.c
+++ b/src/relevance.c
@@ -1,5 +1,5 @@
 /* This file is part of Pazpar2.
-   Copyright (C) 2006-2013 Index Data
+   Copyright (C) Index Data
 
 Pazpar2 is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
@@ -27,6 +27,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 
 #include "relevance.h"
 #include "session.h"
+#include "client.h"
+#include "settings.h"
 
 #ifdef WIN32
 #define log2(x) (log(x)/log(2))
@@ -45,6 +47,7 @@ struct relevance
     double lead_decay;
     int length_divide;
     NMEM nmem;
+    struct norm_client *norm;
 };
 
 struct word_entry {
@@ -55,6 +58,265 @@ struct word_entry {
     struct word_entry *next;
 };
 
+// Structure to keep data for norm_client scores from one client
+struct norm_client
+{
+    int num; // number of the client
+    float max;
+    float min;
+    int count;
+    const char *native_score;
+    int scorefield;
+    float a, b; // Rn = a*R + b
+    struct client *client;
+    struct norm_client *next;
+    struct norm_record *records;
+};
+
+const int scorefield_none = -1;     // Do not normalize anything, use tf/idf as is.
+                                    // This is the old behavior, and the default.
+const int scorefield_internal = -2; // use our tf/idf, but normalize it
+const int scorefield_position = -3; // fake a score based on the position
+
+// A structure for each (sub)record. There is one list for each client
+struct norm_record
+{
+    struct record *record;
+    float score;
+    struct record_cluster *clust;
+    struct norm_record *next;
+};
+
+// Find the norm_client entry for this client, or create one if not there
+struct norm_client *findnorm( struct relevance *rel, struct client* client)
+{
+    struct norm_client *n = rel->norm;
+    struct session_database *sdb;
+    while (n) {
+        if ( n->client == client )
+            return n;
+        n = n->next;
+    }
+    n = nmem_malloc(rel->nmem, sizeof(struct norm_client) );
+    if ( rel->norm )
+        n->num = rel->norm->num + 1;
+    else
+        n->num = 1;
+    n->count = 0;
+    n->max = 0.0;
+    n->min = 0.0;
+    n->client = client;
+    n->next = rel->norm;
+    rel->norm = n;
+    sdb = client_get_database(client);
+    n->native_score = session_setting_oneval(sdb, PZ_NATIVE_SCORE);
+    n->records = 0;
+    n->scorefield = scorefield_none;
+    yaz_log(YLOG_LOG, "Normalizing: Client %d uses '%s'", n->num, n->native_score );
+    if ( ! n->native_score || ! *n->native_score ) // not specified
+        n->scorefield = scorefield_none;
+    else if ( strcmp(n->native_score, "position") == 0 )
+        n->scorefield = scorefield_position;
+    else if ( strcmp(n->native_score, "internal") == 0 )
+        n->scorefield = scorefield_internal;
+    else
+    {   // Get the field index for the score
+        struct session *se = client_get_session(client);
+        n->scorefield = conf_service_metadata_field_id(se->service, n->native_score);
+    }
+    yaz_log(YLOG_LOG, "Normalizing: Client %d uses '%s' = %d",
+            n->num, n->native_score, n->scorefield );
+    return n;
+}
+
+
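Editor's note: a minimal standalone sketch (not part of the patch) of the dispatch
findnorm() performs on the PZ_NATIVE_SCORE setting string. parse_native_score and
the field_id callback are illustrative stand-ins, not the Pazpar2 API; field_id
takes the place of conf_service_metadata_field_id().

#include <stdio.h>
#include <string.h>

enum { SF_NONE = -1, SF_INTERNAL = -2, SF_POSITION = -3 };

static int parse_native_score(const char *setting,
                              int (*field_id)(const char *name))
{
    if (!setting || !*setting)
        return SF_NONE;       /* not specified: keep raw tf/idf */
    if (strcmp(setting, "position") == 0)
        return SF_POSITION;   /* synthesize a score from the hit position */
    if (strcmp(setting, "internal") == 0)
        return SF_INTERNAL;   /* normalize our own tf/idf score */
    return field_id(setting); /* otherwise: a metadata field, by name */
}

static int demo_field_id(const char *name)
{
    return strcmp(name, "score") == 0 ? 4 : SF_NONE; /* made-up index */
}

int main(void)
{
    printf("%d %d %d\n",
           parse_native_score(NULL, demo_field_id),       /* -1 */
           parse_native_score("position", demo_field_id), /* -3 */
           parse_native_score("score", demo_field_id));   /* 4 */
    return 0;
}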
+// Add all records from a cluster into the list for that client, for normalizing later
+static void setup_norm_record( struct relevance *rel, struct record_cluster *clust)
+{
+    struct record *record;
+    for (record = clust->records; record; record = record->next)
+    {
+        struct norm_client *norm = findnorm(rel, record->client);
+        struct norm_record *rp;
+        if ( norm->scorefield == scorefield_none)
+            continue; // not interested in normalizing this client
+        rp = nmem_malloc(rel->nmem, sizeof(struct norm_record) );
+        norm->count++;
+        rp->next = norm->records;
+        norm->records = rp;
+        rp->clust = clust;
+        rp->record = record;
+        if ( norm->scorefield == scorefield_position )
+            rp->score = 1.0 / record->position;
+        else if ( norm->scorefield == scorefield_internal )
+            rp->score = clust->relevance_score; // the tf/idf for the whole cluster
+            // TODO - Get them for each record, merge later!
+        else
+        {
+            struct record_metadata *md = record->metadata[norm->scorefield];
+            rp->score = md->data.fnumber;
+        }
+        yaz_log(YLOG_LOG, "Got score for %d/%d : %f ",
+                norm->num, record->position, rp->score );
+        record->score = rp->score;
+        if ( norm->count == 1 )
+        {
+            norm->max = rp->score;
+            norm->min = rp->score;
+        } else {
+            if ( rp->score > norm->max )
+                norm->max = rp->score;
+            if ( rp->score < norm->min )
+                norm->min = rp->score;
+        }
+    }
+}
+
+// Calculate the squared sum of residuals, that is, the difference from
+// the normalized values to the target curve, which is 1/n
+static double squaresum( struct norm_record *rp, double a, double b)
+{
+    double sum = 0.0;
+    for ( ; rp; rp = rp->next )
+    {
+        double target = 1.0 / rp->record->position;
+        double normscore = rp->score * a + b;
+        double diff = target - normscore;
+        sum += diff * diff;
+    }
+    return sum;
+}
+
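Editor's note: a standalone check (not from the patch) of the residual defined in
squaresum(): scores that already sit on the 1/position target curve give a sum of
exactly zero, so a perfect fit minimizes squaresum at 0. The sample values are
made up for illustration.

#include <stdio.h>

struct sample { int position; double score; };

/* Same computation as squaresum(), over a plain array. */
static double squaresum_sketch(const struct sample *s, int n,
                               double a, double b)
{
    double sum = 0.0;
    int i;
    for (i = 0; i < n; i++)
    {
        double target = 1.0 / s[i].position;
        double diff = target - (a * s[i].score + b);
        sum += diff * diff;
    }
    return sum;
}

int main(void)
{
    /* scores already equal to 1/position: residual is exactly 0 */
    struct sample s[] = { { 1, 1.0 }, { 2, 0.5 }, { 4, 0.25 } };
    printf("%g\n", squaresum_sketch(s, 3, 1.0, 0.0)); /* prints 0 */
    return 0;
}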
+// For each client, normalize scores
+static void normalize_scores(struct relevance *rel)
+{
+    const int maxiterations = 1000;
+    const double enough = 100.0;   // sets the number of decimals we are happy with
+    const double stepchange = 0.5; // reduction of the step size when finding middle
+                                   // 0.5 seems to be magical, much better than 0.4 or 0.6
+    struct norm_client *norm;
+    for ( norm = rel->norm; norm; norm = norm->next )
+    {
+        yaz_log(YLOG_LOG, "Normalizing client %d: scorefield=%d count=%d range=%f %f = %f",
+                norm->num, norm->scorefield, norm->count, norm->min,
+                norm->max, norm->max - norm->min);
+        norm->a = 1.0; // default normalizing factors, no change
+        norm->b = 0.0;
+        if ( norm->scorefield != scorefield_none &&
+             norm->scorefield != scorefield_position )
+        {   // have something to normalize
+            double range = norm->max - norm->min;
+            int it = 0;
+            double a, b;   // params to optimize
+            double as, bs; // step sizes
+            double chi;
+            const char *branch = "?";
+            // initial guesses for the parameters
+            // Rmax = a * rmax + b   # want this to be 1.0
+            // Rmin = a * rmin + b   # want this to be 0.0
+            // Rmax - Rmin = a ( rmax - rmin )   # subtracting the equations
+            // 1.0 - 0.0 = a ( rmax - rmin )
+            // a = 1 / range
+            // Rmin = a * rmin + b
+            // b = Rmin - a * rmin
+            //   = 0.0 - 1/range * rmin
+            //   = - rmin / range
+            if ( range < 1e-6 ) // practically zero
+                range = norm->max;
+            a = 1.0 / range;
+            b = -1.0 * norm->min / range;
+            as = a / 10;
+            bs = fabs(b) / 10;
+            chi = squaresum( norm->records, a, b);
+            yaz_log(YLOG_LOG, "Initial done: it=%d: a=%f / %f b=%f / %f chi = %f",
+                    0, a, as, b, bs, chi );
+            while (it++ < maxiterations) // safeguard against things not converging
+            {
+                double aplus  = squaresum(norm->records, a + as, b);
+                double aminus = squaresum(norm->records, a - as, b);
+                double bplus  = squaresum(norm->records, a, b + bs);
+                double bminus = squaresum(norm->records, a, b - bs);
+                double prevchi = chi;
+                if ( aplus < chi && aplus < aminus && aplus < bplus && aplus < bminus)
+                {
+                    a = a + as;
+                    chi = aplus;
+                    as = as * (1.0 + stepchange);
+                    branch = "aplus ";
+                }
+                else if ( aminus < chi && aminus < aplus && aminus < bplus && aminus < bminus)
+                {
+                    a = a - as;
+                    chi = aminus;
+                    as = as * (1.0 + stepchange);
+                    branch = "aminus";
+                }
+                else if ( bplus < chi && bplus < aplus && bplus < aminus && bplus < bminus)
+                {
+                    b = b + bs;
+                    chi = bplus;
+                    bs = bs * (1.0 + stepchange);
+                    branch = "bplus ";
+                }
+                else if ( bminus < chi && bminus < aplus && bminus < bplus && bminus < aminus)
+                {
+                    b = b - bs;
+                    chi = bminus;
+                    bs = bs * (1.0 + stepchange);
+                    branch = "bminus";
+                }
+                else
+                {   // a,b is the best so far; adjust one step size.
+                    // Which one? The one with the greater effect on chi,
+                    // that is, whose average of plus and minus is further
+                    // away from the previous chi.
+                    double adif = 0.5 * ( aplus + aminus ) - prevchi;
+                    double bdif = 0.5 * ( bplus + bminus ) - prevchi;
+                    if ( fabs(adif) > fabs(bdif) )
+                    {
+                        as = as * ( 1.0 - stepchange);
+                        branch = "step a";
+                    }
+                    else
+                    {
+                        bs = bs * ( 1.0 - stepchange);
+                        branch = "step b";
+                    }
+                }
+                yaz_log(YLOG_LOG, "Fitting %s it=%d: a=%g %g b=%g %g chi=%g ap=%g am=%g, bp=%g bm=%g p=%g",
+                        branch, it, a, as, b, bs, chi,
+                        aplus, aminus, bplus, bminus, prevchi );
+                norm->a = a;
+                norm->b = b;
+                if ( fabs(as) * enough < fabs(a) &&
+                     fabs(bs) * enough < fabs(b) )
+                    break; // not changing much any more
+            }
+            yaz_log(YLOG_LOG, "Fitting done: it=%d: a=%g / %g b=%g / %g chi = %g",
+                    it - 1, a, as, b, bs, chi );
+        }
+
+        if ( norm->scorefield != scorefield_none )
+        {   // distribute the normalized scores to the records
+            struct norm_record *nr = norm->records;
+            for ( ; nr; nr = nr->next ) {
+                double r = nr->score;
+                r = norm->a * r + norm->b;
+                nr->clust->relevance_score = 10000 * r;
+                nr->record->score = r;
+                yaz_log(YLOG_LOG, "Normalized %f * %f + %f = %f",
+                        nr->score, norm->a, norm->b, r );
+                // TODO - This keeps overwriting the cluster score in random order!
+                // Need to merge results better
+            }
+        }
+    } // client loop
+}
+
+
 static struct word_entry *word_entry_match(struct relevance *r,
                                            const char *norm_str,
                                            const char *rank, int *weight)
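Editor's note: a quick standalone check (not from the patch) of the closed-form
starting point derived in the comments of normalize_scores() above. With the
illustrative values rmin=2 and rmax=10, a=1/8 and b=-1/4 map the extremes of
the native score range exactly onto [0,1].

#include <assert.h>

int main(void)
{
    double rmin = 2.0, rmax = 10.0;
    double range = rmax - rmin; /* 8.0 */
    double a = 1.0 / range;     /* 0.125 */
    double b = -rmin / range;   /* -0.25 */

    /* The extremes land exactly on the wanted endpoints
       (all values here are exact binary fractions). */
    assert(a * rmax + b == 1.0);
    assert(a * rmin + b == 0.0);
    return 0;
}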
@@ -89,37 +351,45 @@ int relevance_snippet(struct relevance *r,
 {
     int no = 0;
     const char *norm_str;
-#if 1
-    yaz_log(YLOG_LOG, "relevance_snippet for field=%s content=%s",
-            name, words);
-#endif
-    pp2_charset_token_first(r->prt, words, 0);
+    int highlight = 0;
+    pp2_charset_token_first(r->prt, words, 0);
     while ((norm_str = pp2_charset_token_next(r->prt)))
     {
         size_t org_start, org_len;
         struct word_entry *entries = r->entries;
-        int highlight = 0;
         int i;
         pp2_get_org(r->prt, &org_start, &org_len);
         for (; entries; entries = entries->next, i++)
         {
-            yaz_log(YLOG_LOG, "Compare: %s %s", norm_str, entries->norm_str);
             if (*norm_str && !strcmp(norm_str, entries->norm_str))
+                break;
+        }
+        if (entries)
+        {
+            if (!highlight)
+            {
                 highlight = 1;
+                wrbuf_puts(w_snippet, "<match>");
+                no++;
+            }
+        }
+        else
+        {
+            if (highlight)
+            {
+                highlight = 0;
+                wrbuf_puts(w_snippet, "</match>");
+            }
         }
-        if (highlight)
-            wrbuf_puts(w_snippet, "<match>");
         wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
-        if (highlight)
-            wrbuf_puts(w_snippet, "</match>");
-        no += highlight;
     }
+    if (highlight)
+        wrbuf_puts(w_snippet, "</match>");
     if (no)
     {
-        yaz_log(YLOG_LOG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
+        yaz_log(YLOG_DEBUG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
     }
     return no;
 }
@@ -282,6 +552,7 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
     res->follow_factor = follow_factor;
     res->lead_decay = lead_decay;
     res->length_divide = length_divide;
+    res->norm = 0;
     res->prt = pp2_charset_token_create(pft, "relevance");
 
     pull_terms(res, query);
 
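Editor's note: the rewritten relevance_snippet loop wraps a whole run of adjacent
matching tokens in one <match>...</match> pair instead of tagging every token.
A standalone illustration of that state machine (made-up tokens, plain printf in
place of the WRBUF calls, and simplified whitespace handling; the patch emits the
original text via org_start/org_len offsets):

#include <stdio.h>

int main(void)
{
    const char *tok[] = { "world", "wide", "web", "conference" };
    const int match[] = { 0, 1, 1, 0 }; /* "wide web" matched the query */
    int highlight = 0, i;

    for (i = 0; i < 4; i++)
    {
        if (!match[i] && highlight) /* close a run before the gap */
        {
            highlight = 0;
            printf("</match>");
        }
        if (i)
            printf(" ");
        if (match[i] && !highlight) /* open one tag for the whole run */
        {
            highlight = 1;
            printf("<match>");
        }
        printf("%s", tok[i]);
    }
    if (highlight) /* close a run that reaches the end of the field */
        printf("</match>");
    printf("\n"); /* world <match>wide web</match> conference */
    return 0;
}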
@@ -310,26 +581,35 @@ void relevance_destroy(struct relevance **rp)
     }
 }
 
-void relevance_newrec(struct relevance *r, struct record_cluster *rec)
+void relevance_mergerec(struct relevance *r, struct record_cluster *dst,
+                        const struct record_cluster *src)
 {
-    if (!rec->term_frequency_vec)
-    {
-        int i;
+    int i;
 
-        // term frequency [1,..] . [0] is total length of all fields
-        rec->term_frequency_vec =
-            nmem_malloc(r->nmem,
-                        r->vec_len * sizeof(*rec->term_frequency_vec));
-        for (i = 0; i < r->vec_len; i++)
-            rec->term_frequency_vec[i] = 0;
+    for (i = 0; i < r->vec_len; i++)
+        dst->term_frequency_vec[i] += src->term_frequency_vec[i];
 
-        // term frequency divided by length of field [1,...]
-        rec->term_frequency_vecf =
-            nmem_malloc(r->nmem,
-                        r->vec_len * sizeof(*rec->term_frequency_vecf));
-        for (i = 0; i < r->vec_len; i++)
-            rec->term_frequency_vecf[i] = 0.0;
-    }
+    for (i = 0; i < r->vec_len; i++)
+        dst->term_frequency_vecf[i] += src->term_frequency_vecf[i];
+}
+
+void relevance_newrec(struct relevance *r, struct record_cluster *rec)
+{
+    int i;
+
+    // term frequency [1,..] . [0] is total length of all fields
+    rec->term_frequency_vec =
+        nmem_malloc(r->nmem,
+                    r->vec_len * sizeof(*rec->term_frequency_vec));
+    for (i = 0; i < r->vec_len; i++)
+        rec->term_frequency_vec[i] = 0;
+
+    // term frequency divided by length of field [1,...]
+    rec->term_frequency_vecf =
+        nmem_malloc(r->nmem,
+                    r->vec_len * sizeof(*rec->term_frequency_vecf));
+    for (i = 0; i < r->vec_len; i++)
+        rec->term_frequency_vecf[i] = 0.0;
 }
 
 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
@@ -343,6 +623,8 @@ void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 
     r->doc_frequency_vec[0]++;
 }
+
+
 // Prepare for a relevance-sorted read
 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 {
@@ -350,6 +632,7 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 
     reclist_enter(reclist);
+
     // Calculate document frequency vector for each term.
     for (i = 1; i < rel->vec_len; i++)
     {
@@ -362,7 +645,7 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
                 rel->doc_frequency_vec[i]);
         }
     }
-    // Calculate relevance for each document
+    // Calculate relevance for each document (cluster)
     while (1)
     {
         int relevance = 0;
@@ -405,9 +688,22 @@
             wrbuf_printf(w, "score = relevance(%d);\n", relevance);
         }
         rec->relevance_score = relevance;
-    }
+
+        // Build the normalizing structures
+        // List of (sub)records for each target
+        setup_norm_record( rel, rec );
+
+    } // cluster loop
+
+    normalize_scores(rel);
+
+    // TODO - Calculate the cluster scores from individual records
+    // At the moment the record scoring puts one of them in the cluster...
+    reclist_rewind(reclist);
+
     reclist_leave(reclist);
     xfree(idfvec);
+
 }
 
 /*
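Editor's note: the document-frequency loop above feeds the usual idf weighting.
A standalone sketch of that shape (not from the patch), assuming
doc_frequency_vec[0] holds the total record count as relevance_donerecord
suggests; the exact per-term weighting Pazpar2 applies is outside this excerpt.

#include <math.h>
#include <stdio.h>

/* idf[i] = log(total docs / docs containing term i);
   a term that appears in every record scores 0. */
static void compute_idf(const int *df, float *idf, int vec_len)
{
    int i;
    for (i = 1; i < vec_len; i++)
        idf[i] = df[i] ? (float) log((double) df[0] / df[i]) : 0.0f;
}

int main(void)
{
    int df[] = { 8, 8, 2, 1 }; /* 8 records; term 3 occurs in only one */
    float idf[4];
    compute_idf(df, idf, 4);
    printf("%.3f %.3f %.3f\n", idf[1], idf[2], idf[3]); /* 0.000 1.386 2.079 */
    return 0;
}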