throw new Error("Pz2.js: Array with parameters has to be supplied.");
//supported pazpar2's protocol version
+ this.windowid = paramArray.windowid || window.name;
this.suppProtoVer = '1';
if (typeof paramArray.pazpar2path != "undefined")
this.pz2String = paramArray.pazpar2path;
this.stop();
if ( this.resetCallback )
- this.resetCallback();
+ this.resetCallback(this.windowid);
},
init: function (sessionId, serviceId)
context.keepAlive
);
if ( context.initCallback )
- context.initCallback();
+ context.initCallback(context.windowid);
}
else
context.throwError('Init failed. Malformed WS resonse.',
var request = new pzHttpRequest(this.pz2String, this.errorHandler);
request.safeGet(
- { "command": "ping", "session": this.sessionID, "windowid" : window.name },
+ { "command": "ping", "session": this.sessionID, "windowid" : context.windowid },
function(data) {
if ( data.getElementsByTagName("status")[0]
.childNodes[0].nodeValue == "OK" ) {
"command": "search",
"query": this.currQuery,
"session": this.sessionID,
- "windowid" : window.name
+ "windowid" : this.windowid
};
if( sort !== undefined ) {
var context = this;
var request = new pzHttpRequest(this.pz2String, this.errorHandler);
request.safeGet(
- { "command": "stat", "session": this.sessionID, "windowid" : window.name },
+ { "command": "stat", "session": this.sessionID, "windowid" : context.windowid },
function(data) {
if ( data.getElementsByTagName("stat") ) {
var activeClients =
},
delay
);
- context.statCallback(stat);
+ context.statCallback(stat, context.windowid);
}
else
context.throwError('Stat failed. Malformed WS resonse.',
"sort": this.currentSort,
"block": 1,
"type": this.showResponseType,
- "windowid" : window.name
+ "windowid" : this.windowid
};
if (query_state)
requestParameters["query-state"] = query_state;
context.show();
},
delay);
- context.showCallback(show);
+ context.showCallback(show, context.windowid);
}
);
},
"command": "record",
"session": this.sessionID,
"id": this.currRecID,
- "windowid" : window.name
+ "windowid" : this.windowid
};
this.currRecOffset = null;
record = new Array();
record['xmlDoc'] = data;
record['offset'] = context.currRecOffset;
- callback(record, args);
+ callback(record, args, context.windowid);
//pz2 record
} else if ( recordNode =
data.getElementsByTagName("record")[0] ) {
},
delay
);
- callback(record, args);
+ callback(record, args, context.windowid);
}
else
context.throwError('Record failed. Malformed WS resonse.',
"command": "termlist",
"session": this.sessionID,
"name": this.termKeys,
- "windowid" : window.name,
+ "windowid" : this.windowid,
"version" : this.version
},
delay
);
- context.termlistCallback(termList);
+ context.termlistCallback(termList, context.windowid);
}
else
context.throwError('Termlist failed. Malformed WS resonse.',
"command": "bytarget",
"session": this.sessionID,
"block": 1,
- "windowid" : window.name,
+ "windowid" : this.windowid,
"version" : this.version
},
function(data) {
delay
);
- context.bytargetCallback(bytarget);
+ context.bytargetCallback(bytarget, context.windowid);
}
else
context.throwError('Bytarget failed. Malformed WS resonse.',
--- /dev/null
+
+Relevancy stuff - status 20-Jan-2014 - How to get going again?
+
+This summary is also in PAZ-917.
+
+I have done some ranking-related stuff, and now it looks like we might end up
+not continuing with it. So I write this quick summary to state what I have done,
+and what I would do next, so we can pick the ball up again, if need be.
+
+Added a new setting native_score which can be a field name for the score. If
+specified, this is the field that contains the ranking score from the back-end.
+These scores are normalized to a range that is close to 1.0 .. 0.0, minimizing
+the squared distance from the 1/position curve.
+
+This can also be a special value "position", which can be used when the target
+returns the records in relevancy order, but without a numeric value. This makes
+a guess based on 1/position. There is also another magic value "internal", which
+uses our TF/IDF ranking, but normalized the same way as before.
+
+The normalizing works fine, as long as records have scores from the back end.
+For our own TF/IDF thing, things don't work so well yet, as it works on the
+cluster level, not on individual records. I haven't quite sorted out how to make
+the TF/IDF thing on a record level, probably need to duplicate the ranking code
+and keep score vectors per record as well as per cluster, so as to keep the
+current behavior as the default... There is a dirty hack to put the cluster
+score in the records too.
+
+The record scores are supposed to be combined into cluster scores, so that
+clusters can be sorted. This is not yet done, but should not be much work. At
+the moment each cluster gets one of the record scores directly. Once this is
+done, we can define new setting(s) to adjust the cluster scoring. First by
+selecting some algorithm (max, avg, sum, some form of decaying sum (largest
+score + half the second largest + quarter of the next largest, etc)), and then
+adjustments parameters to give some targets extra weight (at least when
+averaging), or extra boost (to indicate they tend to have better results).
+
+Before starting to code anything much, we obviously need tests. There is a
+decent test framework, it should not be many days' work to make a number of test
+cases for the native ranking first, then for the normalized TF/IDF (once we get
+that coded), and then for merging record scores into cluster scores.
+
+
+* * *
+
+
+How does relevancy ranking work in pz2
+Need to understand it before I can change it to work on individual records
+
+Data structures
+
+struct relevance {
+ int *doc_frequency_vec;
+ int *term_frequency_vec_tmp;
+ int *term_pos;
+ int vec_len;
+ struct word_entry *entries;
+ ...
+ struct norm_client *norm; // my list of (sub)records for normalizing, one list per client
+}
+
+struct word_entry {
+ const char *norm_str;
+ const char *display_str;
+ int termno;
+ char *ccl_field;
+ struct word_entry *next;
+}
+
+// Find the norm_client entry for this client, or create one if not there
+struct norm_client *findnorm( struct relevance *rel, struct client* client)
+
+// Add all records from a cluster into the list for that client, for normalizing later
+static void setup_norm_record( struct relevance *rel, struct record_cluster *clust)
+
+// find the word_entry that matches the norm_str
+// if found, sets up entries->ccl_field, and weight
+static struct word_entry *word_entry_match(struct relevance *r,
+ const char *norm_str,
+ const char *rank, int *weight)
+
+// Put <match> tags around the words in the record's text
+// not called from inside relevance.c at all! Called from session.c:2051,
+// ingest_to_cluster(). Can probably be ignored for this summary.
+int relevance_snippet(struct relevance *r,
+ const char *words, const char *name,
+ WRBUF w_snippet)
+
+// not called from inside relevance.c!
+// Seems to implement the decay and follow stuff, adjusting term weights within a field
+// Called from session.c:2286, ingest_to_cluster(), in if(rank), with a comment
+// ranking of _all_ fields enabled.
+void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
+ const char *words, const char *rank,
+ const char *name)
+
+// Recurses through a RPN query, pulls out the terms we want for ranking
+// Appends each word to relevance->entries with normalized string,
+// ccl_field, termno, and display_str.
+// Ok, here we decide which terms we are interested in!
+// called from relevance_create_ccl(), (and recursively from itself)
+static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
+
+// Clears the relevance->doc_frequency_vec
+void relevance_clear(struct relevance *r)
+
+// Sets up the relevance structure. Gets lots of controlling params
+// pulls terms, which gets the vec_len. then mallocs relevance->term_frequency_vec
+// term_frequency_vec_tmp, and term_pos. Calls relevance_clear to clear the doc_frequency_vec.
+struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
+ struct ccl_rpn_node *query,
+ int rank_cluster,
+ double follow_factor, double lead_decay,
+ int length_divide)
+
+// kills the nmem, freeing all memory.
+void relevance_destroy(struct relevance **rp)
+
+// Adds the values from src into the dst, for both term_frequency_vec and
+// term_frequency_vecf. Both src and dst are clusters.
+// Called from reclists.c:419 merge_cluster()
+void relevance_mergerec(struct relevance *r, struct record_cluster *dst,
+ const struct record_cluster *src)
+
+// Adds a new cluster to the relevance stuff
+// mallocs rec->term_frequency_vec and _vecf for the cluster, and clears them to zeroes
+// Called from reclists.c: 458 new_cluster()
+void relevance_newrec(struct relevance *r, struct record_cluster *rec)
+
+// increments relevance->doc_frequency_vec[i] for each i that has something in the
+// cluster->term_frequency_vec[i], i=1..vec_len, and increments doc_frequency_vec[0].
+// called from session.c:2330, ingest_to_cluster(), near the end
+void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
+
+// Calculates a idfvec from relevance->doc_frequency_vec (basically 1/doc_frequency_vec,
+// times doc_frequency_vec[0].
+// Then loops through all clusters, and for each calculates score from each term
+// rec->term_frequency_vec[i] * idfvec[i]. Sums these as the cluster score.
+// If rank_cluster is set, divides the sum by the count, getting avg score.
+// Then calls normalize_scores.
+// Called from session.c:1319 show_range_start().
+void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
+
+
+TODO - Read through ingest_to_cluster, and summarize how the ranking actually
+works. That's a long routine, 400 lines. Quick read didn't show all that much.
+
+So, basically we have
+ - relevance->entries
+ - Set up in pull_terms, updated in word_entry_match
+ - relevance->doc_frequency_vec
+ - Set up with zeroes in relevance_create_ccl
+ - Updated in relevance_donerecord, based on the cluster->term_frequency_vec
+ - cluster->term_frequency_vec
+ - Set up and zeroed in relevance_newrec
+ - Updated in relevance_mergerec
+
+* * *
+
+
+