Work on cluster merging; part of PAZ-901
author Adam Dickmeiss <adam@indexdata.dk>
Fri, 29 Nov 2013 13:21:53 +0000 (14:21 +0100)
committer Adam Dickmeiss <adam@indexdata.dk>
Fri, 29 Nov 2013 13:21:53 +0000 (14:21 +0100)
src/reclists.c
src/reclists.h
src/relevance.c
src/session.c
src/session.h

index 73a6a64..9d25970 100644 (file)
@@ -358,8 +358,28 @@ int reclist_get_num_records(struct reclist *l)
     return 0;
 }
 
+static void merge_cluster(struct reclist *l,
+                          struct relevance *r,
+                          struct record_cluster *dst,
+                          struct record_cluster **src)
+{
+#if 0
+    dst->metadata = (*src)->metadata;
+    dst->sortkeys = (*src)->sortkeys;
+    int relevance_score;
+    int *term_frequency_vec;
+    float *term_frequency_vecf;
+    // Set-specific ID for this record
+    char *recid;
+    WRBUF relevance_explain1;
+    WRBUF relevance_explain2;
+    struct record *records;
+#endif
+}
+
 // Insert a record. Return record cluster (newly formed or pre-existing)
 struct record_cluster *reclist_insert(struct reclist *l,
+                                      struct relevance *r,
                                       struct conf_service *service,
                                       struct record *record,
                                       struct record_metadata_attr *merge_keys,
@@ -393,8 +413,7 @@ struct record_cluster *reclist_insert(struct reclist *l,
                 {
                     struct record **re;
 
-                    cluster = (*p)->record;
-                    for (re = &cluster->records; *re; re = &(*re)->next)
+                    for (re = &(*p)->record->records; *re; re = &(*re)->next)
                     {
                         if ((*re)->client == record->client &&
                             record_compare(record, *re, service))
@@ -403,14 +422,19 @@ struct record_cluster *reclist_insert(struct reclist *l,
                             return 0;
                         }
                     }
-                    *re = record;
-                    record->next = 0;
-                    goto out;
+
+                    if (!cluster)
+                    {
+                        cluster = (*p)->record;
+                        *re = record;
+                        record->next = 0;
+                    }
+                    else
+                        merge_cluster(l, r, cluster, &(*p)->record);
                 }
             }
         }
     }
-out:
     if (!cluster)
     {
         struct reclist_bucket *new =
@@ -427,7 +451,6 @@ out:
         append_merge_keys(&cluster->merge_keys, merge_keys, l->nmem);
 
         cluster->relevance_score = 0;
-        cluster->term_frequency_vec = 0;
         cluster->recid = cluster->merge_keys->value;
         (*total)++;
         cluster->metadata =
@@ -440,6 +463,7 @@ out:
         memset(cluster->sortkeys, 0,
                sizeof(union data_types*) * service->num_sortkeys);
 
+        relevance_newrec(r, cluster);
         cluster->relevance_explain1 = wrbuf_alloc();
         cluster->relevance_explain2 = wrbuf_alloc();
         /* attach to hash list */
index 769b0c2..ea4d263 100644 (file)
@@ -39,6 +39,7 @@ struct reclist *reclist_create(NMEM);
 void reclist_destroy(struct reclist *l);
 void reclist_limit(struct reclist *l, struct session *session, int lazy);
 struct record_cluster *reclist_insert(struct reclist *tl,
+                                      struct relevance *r,
                                       struct conf_service *service,
                                       struct record  *record,
                                       struct record_metadata_attr *merge_keys, 
index 08527ae..63558fb 100644 (file)
@@ -320,24 +320,21 @@ void relevance_destroy(struct relevance **rp)
 
 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 {
-    if (!rec->term_frequency_vec)
-    {
-        int i;
-
-        // term frequency [1,..] . [0] is total length of all fields
-        rec->term_frequency_vec =
-            nmem_malloc(r->nmem,
-                        r->vec_len * sizeof(*rec->term_frequency_vec));
-        for (i = 0; i < r->vec_len; i++)
-            rec->term_frequency_vec[i] = 0;
+    int i;
 
-        // term frequency divided by length of field [1,...]
-        rec->term_frequency_vecf =
-            nmem_malloc(r->nmem,
-                        r->vec_len * sizeof(*rec->term_frequency_vecf));
-        for (i = 0; i < r->vec_len; i++)
-            rec->term_frequency_vecf[i] = 0.0;
-    }
+    // term frequency [1,..] . [0] is total length of all fields
+    rec->term_frequency_vec =
+        nmem_malloc(r->nmem,
+                    r->vec_len * sizeof(*rec->term_frequency_vec));
+    for (i = 0; i < r->vec_len; i++)
+        rec->term_frequency_vec[i] = 0;
+
+    // term frequency divided by length of field [1,...]
+    rec->term_frequency_vecf =
+        nmem_malloc(r->nmem,
+                    r->vec_len * sizeof(*rec->term_frequency_vecf));
+    for (i = 0; i < r->vec_len; i++)
+        rec->term_frequency_vecf[i] = 0.0;
 }
 
 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
index 9e52d59..2039c58 100644 (file)
@@ -1728,24 +1728,50 @@ int ingest_record(struct client *cl, const char *rec,
 
     if (!strcmp((const char *) root->name, "cluster"))
     {
+        int no_merge_keys = 0;
+        int no_merge_dups = 0;
         xmlNode *sroot;
+        struct record_metadata_attr *mk = 0;
+
         for (sroot = root->children; sroot; sroot = sroot->next)
-            if (sroot->type == XML_ELEMENT_NODE)
+            if (sroot->type == XML_ELEMENT_NODE &&
+                !strcmp((const char *) sroot->name, "record"))
             {
+                struct record_metadata_attr **mkp;
                 const char *mergekey_norm =
                     get_mergekey(xdoc, sroot, cl, record_no, service, nmem,
-                         se->mergekey);
-
-                struct record_metadata_attr *mk = (struct record_metadata_attr*)
-                    nmem_malloc(nmem, sizeof(*mk));
-                mk->name = 0;
-                mk->value = nmem_strdup(nmem, mergekey_norm);
-                mk->next = 0;
-
+                                 se->mergekey);
+                if (!mergekey_norm)
+                {
+                    r = -1;
+                    break;
+                }
+                for (mkp = &mk; *mkp; mkp = &(*mkp)->next)
+                    if (!strcmp((*mkp)->value, mergekey_norm))
+                        break;
+                if (!*mkp)
+                {
+                    *mkp = (struct record_metadata_attr*)
+                        nmem_malloc(nmem, sizeof(**mkp));
+                    (*mkp)->name = 0;
+                    (*mkp)->value = nmem_strdup(nmem, mergekey_norm);
+                    (*mkp)->next = 0;
+                    no_merge_keys++;
+                }
+                else
+                    no_merge_dups++;
+            }
+        if (no_merge_keys > 1 || no_merge_dups > 0)
+        {
+            yaz_log(YLOG_LOG, "Got %d mergekeys, %d dups for position %d",
+                    no_merge_keys, no_merge_dups, record_no);
+        }
+        for (sroot = root->children; !r && sroot; sroot = sroot->next)
+            if (sroot->type == XML_ELEMENT_NODE &&
+                !strcmp((const char *) sroot->name, "record"))
+            {
                 r = ingest_sub_record(cl, xdoc, sroot, record_no, nmem, sdb,
                                       mk);
-                if (r)
-                    break;
             }
     }
     else if (!strcmp((const char *) root->name, "record"))
@@ -2038,7 +2064,7 @@ static int ingest_to_cluster(struct client *cl,
             xmlFree(value);
         return -2;
     }
-    cluster = reclist_insert(se->reclist, service, record,
+    cluster = reclist_insert(se->reclist, se->relevance, service, record,
                              merge_keys, &se->total_merged);
     if (!cluster)
         return 0; // complete match with existing record
@@ -2061,9 +2087,6 @@ static int ingest_to_cluster(struct client *cl,
         session_log(se, YLOG_LOG, "Cluster id %s from %s (#%d)", cluster->recid,
                     sdb->database->id, record_no);
 
-
-    relevance_newrec(se->relevance, cluster);
-
     // original metadata, to check if first existence of a field
     metadata0 = xmalloc(sizeof(*metadata0) * service->num_metadata);
     memcpy(metadata0, cluster->metadata,
index 85f1b8f..d58019f 100644 (file)
@@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #include <yaz/yaz-ccl.h>
 
 #include "facet_limit.h"
+#include "relevance.h"
 #include "reclists.h"
 
 struct record;