Tests on SOLR, plottable data out of pazpar2, plot script
author Heikki Levanto <heikki@indexdata.dk>
Thu, 28 Nov 2013 15:19:11 +0000 (16:19 +0100)
committer Heikki Levanto <heikki@indexdata.dk>
Thu, 28 Nov 2013 15:19:11 +0000 (16:19 +0100)
heikki/README-HEIKKI
heikki/solr/run.sh [new file with mode: 0755]
heikki/solr/solr.lui.xml
heikki/solr/test3.cfg
heikki/solr/test3.sh
src/relevance.c

index 0571886..172ff99 100644 (file)
@@ -62,3 +62,13 @@ Add this to the target defs
 After this, it should be possible to get records from different databases, some
 with many records, some with a few. This is a good testing ground for merging
 rankings! Test first with a round-robin, and plot the scores.
+
+Thu 28-Nov
+OK, I can now merge a number of Solr databases (harvest jobs) and plot their rankings
+as Solr gives them, in the order given by different merge strategies.
+Next: add the normalizing merge strategy, then plot the different strategies against different queries.
+Finally, write a conclusion and consider this plotting job done.
+
+
+
+
diff --git a/heikki/solr/run.sh b/heikki/solr/run.sh
new file mode 100755 (executable)
index 0000000..ea40894
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Run test3.sh for a fixed set of queries; collect the plotline output and the graphs.
+# Usage: run.sh TITLE   (TITLE goes into the output file name and into every plot title)
+
+if [ -z "$1" ]
+then
+  echo "Need an argument, the name of this test run" >&2
+  echo "It will be in the title of all plots, together with the query" >&2
+  exit 1
+fi
+TITLE="$1"
+OUTFILE=$(echo "$1.txt" | sed 's/ /_/g')   # quoted, so no globbing or word splitting
+echo "$TITLE" > "$OUTFILE"
+./test3.sh clean
+
+# Run one query through test3.sh, append its first 10 plotlines, and keep a copy of its graph
+function onerun() {
+    QRY="$1"
+    echo "" >> "$OUTFILE"
+    echo "Query: $QRY" >> "$OUTFILE"
+    PNG=$(echo "solr_$TITLE $QRY.png" | sed 's/ /_/g')
+    echo "Graph: $PNG" >> "$OUTFILE"
+    ./test3.sh "$QRY" "$TITLE"
+    grep "plotline" show.out | head -10 >> "$OUTFILE"
+    cp plot.png "$PNG"
+}
+
+onerun "harry potter"
+onerun "vietnam war"
+onerun "water or fire or ice"
+echo "" >> "$OUTFILE"
+echo "client#, position, tf/idf, roundrobin, solr # database # title" >> "$OUTFILE"
+
index 4e5905d..a016e82 100644 (file)
@@ -1,8 +1,33 @@
 <!-- Solr target -->
-<settings target="LUI Solr Test">
-  <set name="pz:name"  value="LUI Solr Test" />
-  <set name="pz:url"   value="lui.indexdata.com/solr" />
+<!-- General settings for all the solr targets in this test -->
+<settings target="*">
 
+  <!-- Individual databases in the solr system -->
+  <set target="lui.indexdata.com/solr#3902" name="pz:name"  value="Solr base 3902 (25m)" />
+  <set target="lui.indexdata.com/solr#3902" name="pz:url"   value="lui.indexdata.com/solr#3902" />
+  <set target="lui.indexdata.com/solr#3902" name="pz:extra_args" value="fq=database:3902" />
+      
+  <set target="lui.indexdata.com/solr#5802" name="pz:name"  value="Solr base 5802 (7m)" />
+  <set target="lui.indexdata.com/solr#5802" name="pz:url"   value="lui.indexdata.com/solr#5802" />
+  <set target="lui.indexdata.com/solr#5802" name="pz:extra_args" value="fq=database:5802" />
+
+  <set target="lui.indexdata.com/solr#3602" name="pz:name"  value="Solr base 3602 (4m)" />
+  <set target="lui.indexdata.com/solr#3602" name="pz:url"   value="lui.indexdata.com/solr#3602" />
+  <set target="lui.indexdata.com/solr#3602" name="pz:extra_args" value="fq=database:3602" />
+
+  <set target="lui.indexdata.com/solr#6202" name="pz:name"  value="Solr base 6202 (1.6m)" />
+  <set target="lui.indexdata.com/solr#6202" name="pz:url"   value="lui.indexdata.com/solr#6202" />
+  <set target="lui.indexdata.com/solr#6202" name="pz:extra_args" value="fq=database:6202" />
+
+  <set target="lui.indexdata.com/solr#4905" name="pz:name"  value="Solr base 4905 (100k)" />
+  <set target="lui.indexdata.com/solr#4905" name="pz:url"   value="lui.indexdata.com/solr#4905" />
+  <set target="lui.indexdata.com/solr#4905" name="pz:extra_args" value="fq=database:4905" />
+
+  <set target="lui.indexdata.com/solr#6103" name="pz:name"  value="Solr base 6103 (1k)" />
+  <set target="lui.indexdata.com/solr#6103" name="pz:url"   value="lui.indexdata.com/solr#6103" />
+  <set target="lui.indexdata.com/solr#6103" name="pz:extra_args" value="fq=database:6103" />
+
+   <!-- General settings for them all -->
   <set name="pz:limitmap:author"  value="rpn:@attr 1=author_exact 6=3" />
   <set name="pz:limitmap:subject" value="rpn:@attr 1=subject_exact" />
   <set name="pz:limitmap:date"    value="rpn:@attr 1=date @attr 6=3" />
index 7816b85..f0ed3c0 100644 (file)
@@ -7,9 +7,13 @@
     <service>
         <timeout session="60" z3950_operation="30" z3950_session="180"/>
 
-        <!-- settings src="bibliotek.dk.xml"/-->
+        <!-- General SOLR settings -->
         <settings src="solr.lui.xml"/>
 
+        <!-- A number of databases (also includes the general settings) -->
+        <!--settings src="solr.*.xml"/-->
+
+
         <icu_chain id="relevance" locale="en">
             <transform rule="[:Control:] Any-Remove"/>
             <tokenize rule="l"/>
@@ -37,6 +41,7 @@
         <!-- rank cluster="yes" lead="1" length="log" debug="no"/  Autographics settings-->
         <rank cluster="yes" lead="1" length="log" debug="yes"/>
 
+
         <!-- we try to keep same order as in marc21.xsl -->
         <metadata name="id" brief="yes"/>
         <metadata name="lccn" merge="unique"/>
         <metadata name="due"/>
         <metadata name="thumburl" brief="yes" merge="unique"/>
 
-        <metadata name="score" brief="yes" sortkey="numeric" merge="range"/>
+        <!--metadata name="score" brief="yes" sortkey="numeric" merge="range"/-->
+        <metadata name="score" brief="yes" />
+
     </service>
 
   </server>
index f40845d..741a722 100755 (executable)
@@ -42,9 +42,18 @@ then
 else
   Q=$1
 fi
+
+if [ -z "$2" ]
+then
+  HEADLINE="$Q"
+else
+  HEADLINE="$2: $Q"
+fi
+
 QRY=`echo $Q | sed 's/ /+/g' `
 
-SORT="sort=score"
+#SORT="sort=score"
+SORT="sort=relevance_h"
 #SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
 #SEARCH="command=search$SES&$QRY"
 #SEARCH="command=search$SES&query=$QRY&sort=relevance"
@@ -80,9 +89,30 @@ echo $SHOW
 curl -s "http://localhost:9017/?$SHOW" > show.out
 #grep "relevance" show.out | grep += | grep -v "(0)"
 #grep "round-robin" show.out
-grep '^ <md-title>' show.out | head -11
-grep 'Received' dbc-opensearch-gw.log | head -1 >> titles.out
-grep '^ <md-title>' show.out >> titles.out
+
+# Plot the lines created by the code
+grep plotline show.out > scores.data
+echo "Client numbers"
+cat scores.data | cut -d' ' -f2 | sort -u
+head -10 scores.data
+
+echo "
+  set term png
+  set out \"plot.png\"
+  set title \"$HEADLINE\"
+" > plot.cmd
+echo '
+  plot "scores.data" using 0:($2==0?$6:1/0) with points title "db-1", \
+       "scores.data" using 0:($2==1?$6:1/0) with points title "db-2", \
+       "scores.data" using 0:($2==2?$6:1/0) with points title "db-3", \
+       "scores.data" using 0:($2==3?$6:1/0) with points title "db-4", \
+       "scores.data" using 0:($2==4?$6:1/0) with points title "db-5", \
+       "scores.data" using 0:($2==5?$6:1/0) with points title "db-6" \
+' >> plot.cmd
+cat plot.cmd | gnuplot
+
+
+exit 1 # The old plotting code
 
 # Plot it
 DF=`echo $QRY | sed 's/@//g' | sed 's/[+"]/_/g' | sed s"/'//g "`
index 2e5411b..5284686 100644 (file)
@@ -353,6 +353,19 @@ void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
     r->doc_frequency_vec[0]++;
 }
 
+// Return data.text.disp of metadata field 'tag' on bestrecord; "" if the id is negative or the slot is empty
+static const char *getfield(struct record *bestrecord, const char *tag)
+{
+    struct session *session = client_get_session(bestrecord->client);
+    int field_id = conf_service_metadata_field_id(session->service, tag);
+    struct record_metadata *entry;
+
+    if (field_id < 0)   // a negative id is answered with the empty string
+        return "";
+    entry = bestrecord->metadata[field_id];
+    return entry ? entry->data.text.disp : "";
+}
+
 // Prepare for a relevance-sorted read
 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
                             enum conf_sortkey_type type)
@@ -429,11 +442,13 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
             int thisclient = 0;
             struct record *bestrecord = 0;
             int nclust = 0;
+            // Pick the cluster's best record; NOTE: the test below keeps the largest position, not the lowest
             for (record = rec->records; record; record = record->next) {
                 if ( bestrecord == 0 || bestrecord->position < record->position )
                     bestrecord = record;
-                nclust++;
+                nclust++; // and count them all, for logging
             }
+            // find the client number for the record (we only have a pointer)
             while ( clients[thisclient] != 0
                     && clients[thisclient] != bestrecord->client )
                 thisclient++;
@@ -442,12 +457,32 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
                 yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client);
                 clients[thisclient] = bestrecord->client;
             }
-            int tfrel = relevance;
-            relevance = -(bestrecord->position * n_clients + thisclient) ;
+            // Calculate a round-robin score
+            int tfrel = relevance; // keep the old tf/idf score
+            int robinscore = -(bestrecord->position * n_clients + thisclient) ;
             wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
                          bestrecord->position, thisclient, nclust, tfrel, relevance );
             yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
                          bestrecord->position, thisclient, nclust, relevance );
+
+            // Check if the record has a score field
+            const char *score = getfield(bestrecord,"score");
+            int solrscore = 10000.0 * atof(score);
+            const char *id = getfield(bestrecord, "id");
+            // clear the id, we only want the first numerical part
+            char idbuf[64];
+            i=0;
+            while( id[i] >= '0' && id[i] <= '9' ) {
+                idbuf[i] = id[i];
+                i++;
+            }
+            idbuf[i] = '\0';
+            
+            const char *title = getfield(bestrecord, "title");
+            wrbuf_printf(w,"plotline: %d %d %d %d %d # %s %s\n",
+                            thisclient, bestrecord->position,
+                            tfrel, robinscore, solrscore, idbuf, title );
+            relevance = solrscore;
         }
         rec->relevance_score = relevance;
     }