From de6338ddb87c9c46cf1b26998cb4e340e4844395 Mon Sep 17 00:00:00 2001
From: Heikki Levanto
Date: Wed, 4 Dec 2013 11:13:56 +0100
Subject: [PATCH] Normalizing scores

---
 heikki/dbc-os/dbc-opensearch-gw.cfg |   2 +-
 heikki/dbc-os/test2.sh              |   2 +-
 heikki/solr/run.sh                  |  11 ++-
 heikki/solr/test3.sh                |  22 ++++--
 src/relevance.c                     | 148 ++++++++++++++++++++++++-----------
 5 files changed, 125 insertions(+), 60 deletions(-)

diff --git a/heikki/dbc-os/dbc-opensearch-gw.cfg b/heikki/dbc-os/dbc-opensearch-gw.cfg
index 2b09c34..8b7fedd 100644
--- a/heikki/dbc-os/dbc-opensearch-gw.cfg
+++ b/heikki/dbc-os/dbc-opensearch-gw.cfg
@@ -15,7 +15,7 @@ baseurl: http://openbibdk.addi.dk/0.8/
 objectformat: dkabm
 #constantparams: action=search&facets.numberOfTerms=10&facets.facetName=facet.creator&facets.facetName=facet.type&facets.facetName=facet.subject&agency=100200&profile=test&collectionType=work
 #constantparams: action=search&agency=100200&profile=test&collectionType=work&rank=rank_general
-constantparams: action=search&agency=100200&profile=test&collectionType=work
+constantparams: action=search&agency=100200&profile=test&collectionType=work&objectFormat=score
 fields: bibliotek.dk.fields.txt
 database: bibliotek.work
diff --git a/heikki/dbc-os/test2.sh b/heikki/dbc-os/test2.sh
index f187620..bb96855 100755
--- a/heikki/dbc-os/test2.sh
+++ b/heikki/dbc-os/test2.sh
@@ -51,7 +51,7 @@ else
 fi
 QRY=`echo $Q | sed 's/ /+/g' `
 
-SORT="sort=score"
+SORT="sort=relevance_h"
 #SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
 #SEARCH="command=search$SES&$QRY"
 #SEARCH="command=search$SES&query=$QRY&sort=relevance"
diff --git a/heikki/solr/run.sh b/heikki/solr/run.sh
index ea40894..761fe67 100755
--- a/heikki/solr/run.sh
+++ b/heikki/solr/run.sh
@@ -9,18 +9,21 @@ then
     echo "It will be in the title of all plots, together with the query"
     exit 1
 fi
+(cd ../../src; rm -f pazpar2; make; grep '###' relevance.c )
 TITLE="$1"
 OUTFILE=`echo $1.txt | sed 's/ /_/g'`
-echo "$TITLE" > $OUTFILE
+echo "Run $TITLE" > $OUTFILE
 ./test3.sh clean
+rm *.png
 function onerun() {
     QRY="$1"
     echo "" >> $OUTFILE
     echo "Query: $QRY" >> $OUTFILE
-    PNG=`echo "solr_$TITLE $QRY.png" | sed 's/ /_/g' `
+    PNG=`echo "$TITLE $QRY.png" | sed 's/ /_/g' `
     echo "Graph: $PNG" >> $OUTFILE
     ./test3.sh "$QRY" "$TITLE"
+    cat stat.line >> $OUTFILE
     grep "plotline" show.out | head -10 >> $OUTFILE
     cp plot.png $PNG
 }
@@ -28,6 +31,8 @@ function onerun() {
 onerun "harry potter"
 onerun "vietnam war"
 onerun "water or fire or ice"
+onerun "zen and motorcycle"
 
 echo "" >> $OUTFILE
-echo "client#, position, tf/idf, roundrobin, solr # database # title" >> $OUTFILE
+echo "client#, position, tf/idf, roundrobin, solr, normalized # database # title" >> $OUTFILE
+rm plot.png
diff --git a/heikki/solr/test3.sh b/heikki/solr/test3.sh
index 741a722..68bd69e 100755
--- a/heikki/solr/test3.sh
+++ b/heikki/solr/test3.sh
@@ -75,6 +75,7 @@ do
     HIT=`xml_grep --text_only "//hits" stat.out`
     REC=`xml_grep --text_only "//records" stat.out`
     echo "$ACT $HIT $REC"
+    echo "Hits/Fetched: $HIT / $REC" > stat.line
     if grep -q "0" stat.out
     then
         LOOPING=0
@@ -96,19 +97,24 @@
 echo "Client numbers"
 cat scores.data | cut -d' ' -f2 | sort -u
 head -10 scores.data
 
+T1=`grep ": 1 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T2=`grep ": 2 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T3=`grep ": 3 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T4=`grep ": 4 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T5=`grep ": 5 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+T6=`grep ": 6 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2`
+
 echo "
 set term png
 set out \"plot.png\"
 set title \"$HEADLINE\"
+  plot \"scores.data\" using 0:(\$2==1?\$6:1/0) with points title \"1: $T1\", \
+       \"scores.data\" using 0:(\$2==2?\$6:1/0) with points title \"2: $T2\", \
+       \"scores.data\" using 0:(\$2==3?\$6:1/0) with points title \"3: $T3\", \
+       \"scores.data\" using 0:(\$2==4?\$6:1/0) with points title \"4: $T4\", \
+       \"scores.data\" using 0:(\$2==5?\$6:1/0) with points title \"5: $T5\", \
+       \"scores.data\" using 0:(\$2==6?\$6:1/0) with points title \"6: $T6\"
 " > plot.cmd
-echo '
-  plot "scores.data" using 0:($2==0?$6:1/0) with points title "db-1", \
-       "scores.data" using 0:($2==1?$6:1/0) with points title "db-2", \
-       "scores.data" using 0:($2==2?$6:1/0) with points title "db-3", \
-       "scores.data" using 0:($2==3?$6:1/0) with points title "db-4", \
-       "scores.data" using 0:($2==4?$6:1/0) with points title "db-5", \
-       "scores.data" using 0:($2==5?$6:1/0) with points title "db-6" \
-' >> plot.cmd
 
 cat plot.cmd | gnuplot
diff --git a/src/relevance.c b/src/relevance.c
index 5284686..e7f8585 100644
--- a/src/relevance.c
+++ b/src/relevance.c
@@ -47,6 +47,18 @@ struct relevance
     double lead_decay;
     int length_divide;
     NMEM nmem;
+    struct normalizing *norm;
+};
+
+// Structure to keep data for normalizing scores from one client
+struct normalizing
+{
+    int num;
+    float sum;
+    float max;
+    int count;
+    struct client *client;
+    struct normalizing *next;
 };
 
 struct word_entry {
@@ -57,6 +69,29 @@ struct word_entry {
     struct word_entry *next;
 };
 
+// Find the normalizing entry for this client, or create one if not there
+struct normalizing *findnorm( struct relevance *rel, struct client* client)
+{
+    struct normalizing *n = rel->norm;
+    while (n) {
+        if (n->client == client )
+            return n;
+        n = n->next;
+    }
+    n = nmem_malloc(rel->nmem, sizeof(struct normalizing) );
+    if ( rel->norm )
+        n->num = rel->norm->num +1;
+    else
+        n->num = 1;
+    n->sum = 0.0;
+    n->count = 0;
+    n->max = 0.0;
+    n->client = client;
+    n->next = rel->norm;
+    rel->norm = n;
+    return n;
+}
+
 static struct word_entry *word_entry_match(struct relevance *r,
                                            const char *norm_str,
                                            const char *rank, int *weight)
@@ -307,6 +342,8 @@ struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
         nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
 
     relevance_clear(res);
+
+    res->norm = 0;
     return res;
 }
@@ -342,17 +379,6 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec)
     }
 }
 
-void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
-{
-    int i;
-
-    for (i = 1; i < r->vec_len; i++)
-        if (cluster->term_frequency_vec[i] > 0)
-            r->doc_frequency_vec[i]++;
-
-    r->doc_frequency_vec[0]++;
-}
-
 static const char *getfield(struct record *bestrecord, const char *tag)
 {
     struct session *se = client_get_session(bestrecord->client);
@@ -361,11 +387,39 @@ static const char *getfield(struct record *bestrecord, const char *tag)
     if (md_field_id <0)
         return "";
     md = bestrecord->metadata[md_field_id];
-    if ( md) 
+    if ( md)
         return md->data.text.disp;
     return "";
 }
 
+void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
+{
+    int i;
+
+    // Find the best record in a cluster - the one with lowest position
+    // (in this proto. Later, find a better one)
+    struct record *bestrecord = 0;
+    struct record *record;
+    struct normalizing *n;
+    float score;
+    for (record = cluster->records; record; record = record->next)
+        if ( bestrecord == 0 || record->position < bestrecord->position )
+            bestrecord = record;
+    n = findnorm(r, bestrecord->client);
+    n->count ++;
+    score = atof( getfield(bestrecord, "score") );
+    n->sum += score;
+    if ( n->max < score )
+        n->max = score;
+
+    for (i = 1; i < r->vec_len; i++)
+        if (cluster->term_frequency_vec[i] > 0)
+            r->doc_frequency_vec[i]++;
+
+    r->doc_frequency_vec[0]++;
+}
+
+
 // Prepare for a relevance-sorted read
 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
                             enum conf_sortkey_type type)
@@ -373,11 +427,6 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
     int i;
     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
     int n_clients = clients_count();
-    struct client * clients[n_clients];
-    yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
-    for (i = 0; i < n_clients; i++)
-        clients[i] = 0;
-
     reclist_enter(reclist);
 
     // Calculate document frequency vector for each term.
@@ -439,50 +488,55 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
         // get the log entries
         if (type == Metadata_sortkey_relevance_h)
         {
             struct record *record;
-            int thisclient = 0;
+            struct normalizing *norm;
             struct record *bestrecord = 0;
             int nclust = 0;
+            int tfrel = relevance; // keep the old tf/idf score
+            int robinscore;
+            int solrscore;
+            int normscore = 0; // stays 0 if we have nothing to normalize with
             // Find the best record in a cluster - the one with lowest position
             for (record = rec->records; record; record = record->next) {
                 if ( bestrecord == 0 || bestrecord->position < record->position )
                     bestrecord = record;
                 nclust++; // and count them all, for logging
             }
-            // find the client number for the record (we only have a pointer
-            while ( clients[thisclient] != 0
-                    && clients[thisclient] != bestrecord->client )
-                thisclient++;
-            if ( clients[thisclient] == 0 )
-            {
-                yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client);
-                clients[thisclient] = bestrecord->client;
-            }
+            norm = findnorm(rel, bestrecord->client);
             // Calculate a round-robin score
-            int tfrel = relevance; // keep the old tf/idf score
-            int robinscore = -(bestrecord->position * n_clients + thisclient) ;
+            robinscore = -(bestrecord->position * n_clients + norm->num) ;
             wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
-                         bestrecord->position, thisclient, nclust, tfrel, relevance );
+                         bestrecord->position, norm->num, nclust, tfrel, relevance );
             yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
-                    bestrecord->position, thisclient, nclust, relevance );
+                    bestrecord->position, norm->num, nclust, relevance );
             // Check if the record has a score field
-            const char *score = getfield(bestrecord,"score");
-            int solrscore = 10000.0 * atof(score);
-            const char *id = getfield(bestrecord, "id");
-            // clear the id, we only want the first numerical part
-            char idbuf[64];
-            i=0;
-            while( id[i] >= '0' && id[i] <= '9' ) {
-                idbuf[i] = id[i];
-                i++;
+            {
+                const char *score = getfield(bestrecord,"score");
+                const char *id = getfield(bestrecord, "id");
+                const char *title = getfield(bestrecord, "title");
+                // clear the id, we only want the first numerical part
+                char idbuf[64];
+                solrscore = 10000.0 * atof(score);
+                i=0;
+                while( id[i] >= '0' && id[i] <= '9' ) {
+                    idbuf[i] = id[i];
+                    i++;
+                }
+                idbuf[i] = '\0';
+                if ( norm->count )
+                {
+                    float avg = norm->sum / norm->count;
+                    normscore = 10000.0 * ( atof(score) / norm->max );
+                    wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
+                        score, norm->max, normscore);
+                } else
+                    yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score );
+
+                wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n",
+                    norm->num, bestrecord->position,
+                    tfrel, robinscore, solrscore, normscore, idbuf, title );
             }
-            idbuf[i] = '\0';
-
-            const char *title = getfield(bestrecord, "title");
-            wrbuf_printf(w,"plotline: %d %d %d %d %d # %s %s\n",
-                thisclient, bestrecord->position,
-                tfrel, robinscore, solrscore, idbuf, title );
-            relevance = solrscore;
+            relevance = normscore; // ###
         }
         rec->relevance_score = relevance;
     }
-- 
1.7.10.4
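
The normalization this patch introduces can be read in isolation: relevance_donerecord accumulates, per client, the count, sum and maximum of the raw scores, and relevance_prepare_read then maps each raw score to score / max * 10000 so that targets with very different score ranges end up on a common scale before merging. Below is a minimal standalone sketch of that idea, not pazpar2 code; the names (norm_entry, norm_find, norm_add, norm_score) and the sample numbers are illustrative only.

/* Sketch of per-source score normalization: first pass collects statistics,
 * second pass scales each raw score to score / max * 10000. */
#include <stdio.h>
#include <stdlib.h>

struct norm_entry {
    int id;                  /* source (client) number, 1-based */
    float sum, max;
    int count;
    struct norm_entry *next;
};

/* Find the entry for a source id, creating it on first use */
static struct norm_entry *norm_find(struct norm_entry **list, int id)
{
    struct norm_entry *n;
    for (n = *list; n; n = n->next)
        if (n->id == id)
            return n;
    n = calloc(1, sizeof(*n));  /* zeroes sum, max, count */
    n->id = id;
    n->next = *list;
    *list = n;
    return n;
}

/* First pass: accumulate statistics for one raw score */
static void norm_add(struct norm_entry **list, int id, float score)
{
    struct norm_entry *n = norm_find(list, id);
    n->count++;
    n->sum += score;
    if (score > n->max)
        n->max = score;
}

/* Second pass: scale a raw score into the common 0..10000 range */
static int norm_score(struct norm_entry **list, int id, float score)
{
    struct norm_entry *n = norm_find(list, id);
    if (n->count == 0 || n->max <= 0.0)
        return 0;            /* nothing known about this source yet */
    return (int)(10000.0 * score / n->max);
}

int main(void)
{
    /* two sources with very different raw score ranges */
    float src1[] = { 8.7f, 4.3f, 1.1f };
    float src2[] = { 0.052f, 0.031f, 0.009f };
    struct norm_entry *list = 0;
    int i;

    for (i = 0; i < 3; i++) norm_add(&list, 1, src1[i]);
    for (i = 0; i < 3; i++) norm_add(&list, 2, src2[i]);

    for (i = 0; i < 3; i++)
        printf("source 1: raw %.3f -> %d\n", src1[i], norm_score(&list, 1, src1[i]));
    for (i = 0; i < 3; i++)
        printf("source 2: raw %.3f -> %d\n", src2[i], norm_score(&list, 2, src2[i]));
    return 0;
}

Dividing by the per-source maximum is the simplest scaling choice; the patch also tracks sum and count per client, so an average-based variant would be easy to try, but only the maximum is used here.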