From de6338ddb87c9c46cf1b26998cb4e340e4844395 Mon Sep 17 00:00:00 2001
From: Heikki Levanto
Date: Wed, 4 Dec 2013 11:13:56 +0100
Subject: [PATCH] Normalizing scores

---
 heikki/dbc-os/dbc-opensearch-gw.cfg |   2 +-
 heikki/dbc-os/test2.sh              |   2 +-
 heikki/solr/run.sh                  |  11 ++-
 heikki/solr/test3.sh                |  22 ++++--
 src/relevance.c                     | 148 ++++++++++++++++++++++++-----------
 5 files changed, 125 insertions(+), 60 deletions(-)

diff --git a/heikki/dbc-os/dbc-opensearch-gw.cfg b/heikki/dbc-os/dbc-opensearch-gw.cfg
index 2b09c34..8b7fedd 100644
--- a/heikki/dbc-os/dbc-opensearch-gw.cfg
+++ b/heikki/dbc-os/dbc-opensearch-gw.cfg
@@ -15,7 +15,7 @@ baseurl: http://openbibdk.addi.dk/0.8/
 objectformat: dkabm
 #constantparams: action=search&facets.numberOfTerms=10&facets.facetName=facet.creator&facets.facetName=facet.type&facets.facetName=facet.subject&agency=100200&profile=test&collectionType=work
 #constantparams: action=search&agency=100200&profile=test&collectionType=work&rank=rank_general
-constantparams: action=search&agency=100200&profile=test&collectionType=work
+constantparams: action=search&agency=100200&profile=test&collectionType=work&objectFormat=score
 fields: bibliotek.dk.fields.txt
 database: bibliotek.work
diff --git a/heikki/dbc-os/test2.sh b/heikki/dbc-os/test2.sh
index f187620..bb96855 100755
--- a/heikki/dbc-os/test2.sh
+++ b/heikki/dbc-os/test2.sh
@@ -51,7 +51,7 @@ else
 fi
 QRY=`echo $Q | sed 's/ /+/g' `
 
-SORT="sort=score"
+SORT="sort=relevance_h"
 #SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
 #SEARCH="command=search$SES&$QRY"
 #SEARCH="command=search$SES&query=$QRY&sort=relevance"
diff --git a/heikki/solr/run.sh b/heikki/solr/run.sh
index ea40894..761fe67 100755
--- a/heikki/solr/run.sh
+++ b/heikki/solr/run.sh
@@ -9,18 +9,21 @@ then
     echo "It will be in the title of all plots, together with the query"
     exit 1
 fi
+(cd ../../src; rm -f pazpar2; make; grep '###' relevance.c )
 TITLE="$1"
 OUTFILE=`echo $1.txt | sed 's/ /_/g'`
-echo "$TITLE" > $OUTFILE
+echo "Run $TITLE" > $OUTFILE
 ./test3.sh clean
+rm *.png
 function onerun() {
     QRY="$1"
     echo "" >> $OUTFILE
     echo "Query: $QRY" >> $OUTFILE
-    PNG=`echo "solr_$TITLE $QRY.png" | sed 's/ /_/g' `
+    PNG=`echo "$TITLE $QRY.png" | sed 's/ /_/g' `
     echo "Graph: $PNG" >> $OUTFILE
     ./test3.sh "$QRY" "$TITLE"
+    cat stat.line >> $OUTFILE
     grep "plotline" show.out | head -10 >> $OUTFILE
     cp plot.png $PNG
 }
@@ -28,6 +31,8 @@ function onerun() {
 onerun "harry potter"
 onerun "vietnam war"
 onerun "water or fire or ice"
+onerun "zen and motorcycle"
 
 echo "" >> $OUTFILE
-echo "client#, position, tf/idf, roundrobin, solr # database # title" >> $OUTFILE
+echo "client#, position, tf/idf, roundrobin, solr, normalized # database # title" >> $OUTFILE
+rm plot.png
diff --git a/heikki/solr/test3.sh b/heikki/solr/test3.sh
index 741a722..68bd69e 100755
--- a/heikki/solr/test3.sh
+++ b/heikki/solr/test3.sh
@@ -75,6 +75,7 @@ do
     HIT=`xml_grep --text_only "//hits" stat.out`
     REC=`xml_grep --text_only "//records" stat.out`
     echo "$ACT $HIT $REC"
+    echo "Hits/Fetched: $HIT / $REC" > stat.line
     if grep -q "0" stat.out
     then
         LOOPING=0
@@ -96,19 +97,24 @@
 echo "Client numbers"
 cat scores.data | cut -d' ' -f2 | sort -u
 head -10 scores.data
 
+T1=`grep ": 1 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T2=`grep ": 2 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T3=`grep ": 3 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T4=`grep ": 4 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T5=`grep ": 5 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T6=`grep ": 6 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+
 echo "
 set term png
 set out \"plot.png\"
 set title \"$HEADLINE\"
+  plot \"scores.data\" using 0:(\$2==1?\$6:1/0) with points title \"1: $T1\", \
+       \"scores.data\" using 0:(\$2==2?\$6:1/0) with points title \"2: $T2\", \
+       \"scores.data\" using 0:(\$2==3?\$6:1/0) with points title \"3: $T3\", \
+       \"scores.data\" using 0:(\$2==4?\$6:1/0) with points title \"4: $T4\", \
+       \"scores.data\" using 0:(\$2==5?\$6:1/0) with points title \"5: $T5\", \
+       \"scores.data\" using 0:(\$2==6?\$6:1/0) with points title \"6: $T6\"
 " > plot.cmd
-echo '
-  plot "scores.data" using 0:($2==0?$6:1/0) with points title "db-1", \
-       "scores.data" using 0:($2==1?$6:1/0) with points title "db-2", \
-       "scores.data" using 0:($2==2?$6:1/0) with points title "db-3", \
-       "scores.data" using 0:($2==3?$6:1/0) with points title "db-4", \
-       "scores.data" using 0:($2==4?$6:1/0) with points title "db-5", \
-       "scores.data" using 0:($2==5?$6:1/0) with points title "db-6" \
-' >> plot.cmd
 
 cat plot.cmd | gnuplot
diff --git a/src/relevance.c b/src/relevance.c
index 5284686..e7f8585 100644
--- a/src/relevance.c
+++ b/src/relevance.c
@@ -47,6 +47,18 @@ struct relevance
     double lead_decay;
     int length_divide;
     NMEM nmem;
+    struct normalizing *norm;
+};
+
+// Structure to keep data for normalizing scores from one client
+struct normalizing
+{
+    int num;
+    float sum;
+    float max;
+    int count;
+    struct client *client;
+    struct normalizing *next;
 };
 
 struct word_entry {
@@ -57,6 +69,29 @@ struct word_entry {
     struct word_entry *next;
 };
 
+// Find the normalizing entry for this client, or create one if not there
+struct normalizing *findnorm( struct relevance *rel, struct client* client)
+{
+    struct normalizing *n = rel->norm;
+    while (n) {
+        if (n->client == client )
+            return n;
+        n = n->next;
+    }
+    n = nmem_malloc(rel->nmem, sizeof(struct normalizing) );
+    if ( rel->norm )
+        n->num = rel->norm->num +1;
+    else
+        n->num = 1;
+    n->sum = 0.0;
+    n->count = 0;
+    n->max = 0.0;
+    n->client = client;
+    n->next = rel->norm;
+    rel->norm = n;
+    return n;
+}
+
 static struct word_entry *word_entry_match(struct relevance *r,
                                            const char *norm_str,
                                            const char *rank, int *weight)
@@ -307,6 +342,8 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
         nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
 
     relevance_clear(res);
+
+    res->norm = 0;
     return res;
 }
@@ -342,17 +379,6 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec)
     }
 }
 
-void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
-{
-    int i;
-
-    for (i = 1; i < r->vec_len; i++)
-        if (cluster->term_frequency_vec[i] > 0)
-            r->doc_frequency_vec[i]++;
-
-    r->doc_frequency_vec[0]++;
-}
-
 static const char *getfield(struct record *bestrecord, const char *tag)
 {
     struct session *se = client_get_session(bestrecord->client);
@@ -361,11 +387,39 @@ static const char *getfield(struct record *bestrecord, const char *tag)
     if (md_field_id <0)
         return "";
     md = bestrecord->metadata[md_field_id];
-    if ( md) 
+    if ( md)
         return md->data.text.disp;
     return "";
 }
 
+void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
+{
+    int i;
+
+    // Find the best record in a cluster - the one with lowest position
+    // (in this proto. Later, find a better one)
+    struct record *bestrecord = 0;
+    struct record *record;
+    struct normalizing *n;
+    float score;
+    for (record = cluster->records; record; record = record->next)
+        if ( bestrecord == 0 || record->position < bestrecord->position )
+            bestrecord = record;
+    n = findnorm(r, bestrecord->client);
+    n->count ++;
+    score = atof( getfield(bestrecord, "score") );
+    n->sum += score;
+    if ( n->max < score )
+        n->max = score;
+
+    for (i = 1; i < r->vec_len; i++)
+        if (cluster->term_frequency_vec[i] > 0)
+            r->doc_frequency_vec[i]++;
+
+    r->doc_frequency_vec[0]++;
+}
+
+
 // Prepare for a relevance-sorted read
 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
                             enum conf_sortkey_type type)
@@ -373,11 +427,6 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
     int i;
     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
     int n_clients = clients_count();
-    struct client * clients[n_clients];
-    yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
-    for (i = 0; i < n_clients; i++)
-        clients[i] = 0;
-
     reclist_enter(reclist);
 
     // Calculate document frequency vector for each term.
@@ -439,50 +488,55 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
         // get the log entries
         if (type == Metadata_sortkey_relevance_h)
         {
             struct record *record;
-            int thisclient = 0;
+            struct normalizing *norm;
             struct record *bestrecord = 0;
             int nclust = 0;
+            int tfrel = relevance; // keep the old tf/idf score
+            int robinscore;
+            int solrscore;
+            int normscore = 0; // stays 0 if we have nothing to normalize with
             // Find the best record in a cluster - the one with lowest position
             for (record = rec->records; record; record = record->next) {
                 if ( bestrecord == 0 || bestrecord->position < record->position )
                     bestrecord = record;
                 nclust++; // and count them all, for logging
             }
-            // find the client number for the record (we only have a pointer
-            while ( clients[thisclient] != 0
-                    && clients[thisclient] != bestrecord->client )
-                thisclient++;
-            if ( clients[thisclient] == 0 )
-            {
-                yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client);
-                clients[thisclient] = bestrecord->client;
-            }
+            norm = findnorm(rel, bestrecord->client);
             // Calculate a round-robin score
-            int tfrel = relevance; // keep the old tf/idf score
-            int robinscore = -(bestrecord->position * n_clients + thisclient) ;
+            robinscore = -(bestrecord->position * n_clients + norm->num) ;
             wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
-                         bestrecord->position, thisclient, nclust, tfrel, relevance );
+                         bestrecord->position, norm->num, nclust, tfrel, relevance );
             yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
-                    bestrecord->position, thisclient, nclust, relevance );
+                    bestrecord->position, norm->num, nclust, relevance );
             // Check if the record has a score field
-            const char *score = getfield(bestrecord,"score");
-            int solrscore = 10000.0 * atof(score);
-            const char *id = getfield(bestrecord, "id");
-            // clear the id, we only want the first numerical part
-            char idbuf[64];
-            i=0;
-            while( id[i] >= '0' && id[i] <= '9' ) {
-                idbuf[i] = id[i];
-                i++;
+            {
+                const char *score = getfield(bestrecord,"score");
+                const char *id = getfield(bestrecord, "id");
+                const char *title = getfield(bestrecord, "title");
+                // clear the id, we only want the first numerical part
+                char idbuf[64];
+                solrscore = 10000.0 * atof(score);
+                i=0;
+                while( id[i] >= '0' && id[i] <= '9' ) {
+                    idbuf[i] = id[i];
+                    i++;
+                }
+                idbuf[i] = '\0';
+                if ( norm->count )
+                {
+                    float avg = norm->sum / norm->count;
+                    normscore = 10000.0 * ( atof(score) / norm->max );
+                    wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
+                        score, norm->max, normscore);
+                } else
+                    yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score );
+
+                wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n",
+                    norm->num, bestrecord->position,
+                    tfrel, robinscore, solrscore, normscore, idbuf, title );
             }
-            idbuf[i] = '\0';
-
-            const char *title = getfield(bestrecord, "title");
-            wrbuf_printf(w,"plotline: %d %d %d %d %d # %s %s\n",
-                thisclient, bestrecord->position,
-                tfrel, robinscore, solrscore, idbuf, title );
-            relevance = solrscore;
+            relevance = normscore; // ###
         }
         rec->relevance_score = relevance;
     }
-- 
1.7.10.4
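
The normalization this patch introduces can be read in isolation: relevance_donerecord accumulates, per client, the count, sum and maximum of the raw scores, and relevance_prepare_read then maps each raw score to score / max * 10000 so that targets with very different score ranges end up on a common scale before merging. Below is a minimal standalone sketch of that idea, not pazpar2 code; the names (norm_entry, norm_find, norm_add, norm_score) and the sample numbers are illustrative only.

/* Sketch of per-source score normalization: first pass collects statistics,
 * second pass scales each raw score to score / max * 10000. */
#include <stdio.h>
#include <stdlib.h>

struct norm_entry {
    int id;                  /* source (client) number, 1-based */
    float sum, max;
    int count;
    struct norm_entry *next;
};

/* Find the entry for a source id, creating it on first use */
static struct norm_entry *norm_find(struct norm_entry **list, int id)
{
    struct norm_entry *n;
    for (n = *list; n; n = n->next)
        if (n->id == id)
            return n;
    n = calloc(1, sizeof(*n));  /* zeroes sum, max, count */
    n->id = id;
    n->next = *list;
    *list = n;
    return n;
}

/* First pass: accumulate statistics for one raw score */
static void norm_add(struct norm_entry **list, int id, float score)
{
    struct norm_entry *n = norm_find(list, id);
    n->count++;
    n->sum += score;
    if (score > n->max)
        n->max = score;
}

/* Second pass: scale a raw score into the common 0..10000 range */
static int norm_score(struct norm_entry **list, int id, float score)
{
    struct norm_entry *n = norm_find(list, id);
    if (n->count == 0 || n->max <= 0.0)
        return 0;            /* nothing known about this source yet */
    return (int)(10000.0 * score / n->max);
}

int main(void)
{
    /* two sources with very different raw score ranges */
    float src1[] = { 8.7f, 4.3f, 1.1f };
    float src2[] = { 0.052f, 0.031f, 0.009f };
    struct norm_entry *list = 0;
    int i;

    for (i = 0; i < 3; i++) norm_add(&list, 1, src1[i]);
    for (i = 0; i < 3; i++) norm_add(&list, 2, src2[i]);

    for (i = 0; i < 3; i++)
        printf("source 1: raw %.3f -> %d\n", src1[i], norm_score(&list, 1, src1[i]));
    for (i = 0; i < 3; i++)
        printf("source 2: raw %.3f -> %d\n", src2[i], norm_score(&list, 2, src2[i]));
    return 0;
}

Dividing by the per-source maximum is the simplest scaling choice; the patch also tracks sum and count per client, so an average-based variant would be easy to try, but only the maximum is used here.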