Happy new year
[pazpar2-moved-to-github.git] / src / termlists.c
index ad0529c..79e88ee 100644 (file)
@@ -1,5 +1,5 @@
 /* This file is part of Pazpar2.
-   Copyright (C) 2006-2008 Index Data
+   Copyright (C) Index Data
 
 Pazpar2 is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
@@ -17,16 +17,17 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
 */
 
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <yaz/yaz-util.h>
-
 #if HAVE_CONFIG_H
 #include <config.h>
 #endif
 
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <yaz/yaz-util.h>
+
 #include "termlists.h"
+#include "jenkins_hash.h"
 
 // Discussion:
 // As terms are found in incoming records, they are added to (or updated in) a
@@ -42,120 +43,41 @@ struct termlist_bucket
 struct termlist
 {
     struct termlist_bucket **hashtable;
-    int hashtable_size;
-    int hashmask;
-
-    struct termlist_score **highscore;
-    int highscore_size;
-    int highscore_num;
-    int highscore_min;
+    unsigned hash_size;
 
+    int no_entries;
     NMEM nmem;
 };
 
-
-// Jenkins one-at-a-time hash (from wikipedia)
-static unsigned int hash(const unsigned char *key)
+struct termlist *termlist_create(NMEM nmem)
 {
-    unsigned int hash = 0;
-
-    while (*key)
-    {
-        hash += *(key++);
-        hash += (hash << 10);
-        hash ^= (hash >> 6);
-    }
-    hash += (hash << 3);
-    hash ^= (hash >> 11);
-    hash += (hash << 15);
-    return hash;
-}
-
-struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
-{
-    int hashsize = 1;
-    int halfnumterms;
-    struct termlist *res;
-
-    // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
-    halfnumterms = numterms >> 1;
-    if (halfnumterms < 0)
-        halfnumterms = 1;
-    while (hashsize < halfnumterms)
-        hashsize <<= 1;
-    res = nmem_malloc(nmem, sizeof(struct termlist));
-    res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
-    memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*));
-    res->hashtable_size = hashsize;
+    struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist));
+    res->hash_size = 399;
+    res->hashtable =
+        nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*));
+    memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*));
     res->nmem = nmem;
-    res->hashmask = hashsize - 1; // Creates a bitmask
-
-    res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
-    res->highscore_size = highscore_size;
-    res->highscore_num = 0;
-    res->highscore_min = 0;
-
+    res->no_entries = 0;
     return res;
 }
 
-static void update_highscore(struct termlist *tl, struct termlist_score *t)
-{
-    int i;
-    int smallest;
-    int me = -1;
-
-    if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
-        return;
-
-    smallest = 0;
-    for (i = 0; i < tl->highscore_num; i++)
-    {
-        if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
-            smallest = i;
-        if (tl->highscore[i] == t)
-            me = i;
-    }
-    if (tl->highscore_num)
-        tl->highscore_min = tl->highscore[smallest]->frequency;
-    if (t->frequency < tl->highscore_min)
-        tl->highscore_min = t->frequency;
-    if (me >= 0)
-        return;
-    if (tl->highscore_num < tl->highscore_size)
-    {
-        tl->highscore[tl->highscore_num++] = t;
-        if (t->frequency < tl->highscore_min)
-            tl->highscore_min = t->frequency;
-    }
-    else
-    {
-        if (t->frequency > tl->highscore[smallest]->frequency)
-        {
-            tl->highscore[smallest] = t;
-        }
-    }
-}
-
-void termlist_insert(struct termlist *tl, const char *term)
+void termlist_insert(struct termlist *tl, const char *display_term,
+                     const char *norm_term, const char *id, size_t id_len,
+                     int freq)
 {
     unsigned int bucket;
     struct termlist_bucket **p;
-    char buf[256], *cp;
+    char buf[256];
 
-    if (strlen(term) > 255)
+    if (strlen(norm_term) > 255)
         return;
-    strcpy(buf, term);
-    /* chop right */
-    for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--)
-        cp[-1] = '\0';
-    
-    bucket = hash((unsigned char *)buf) & tl->hashmask;
+    strcpy(buf, norm_term);
+    bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size;
     for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
     {
-        if (!strcmp(buf, (*p)->term.term))
+        if (!strcmp(buf, (*p)->term.norm_term))
         {
-            (*p)->term.frequency++;
-            update_highscore(tl, &((*p)->term));
+            (*p)->term.frequency += freq;
             break;
         }
     }
@@ -163,31 +85,54 @@ void termlist_insert(struct termlist *tl, const char *term)
     {
         struct termlist_bucket *new = nmem_malloc(tl->nmem,
                 sizeof(struct termlist_bucket));
-        new->term.term = nmem_strdup(tl->nmem, buf);
-        new->term.frequency = 1;
+        new->term.norm_term = nmem_strdup(tl->nmem, buf);
+        new->term.display_term = *display_term ?
+            nmem_strdup(tl->nmem, display_term) : new->term.norm_term;
+        new->term.id = id ? nmem_strdupn(tl->nmem, id, id_len) : 0;
+        new->term.frequency = freq;
         new->next = 0;
         *p = new;
-        update_highscore(tl, &new->term);
+        tl->no_entries++;
     }
 }
 
 static int compare(const void *s1, const void *s2)
 {
-    struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
-    return (*p2)->frequency - (*p1)->frequency;
+    struct termlist_score **p1 = (struct termlist_score **) s1;
+    struct termlist_score **p2 = (struct termlist_score **) s2;
+    int d = (*p2)->frequency - (*p1)->frequency;
+    if (d)
+        return d;
+    return strcmp((*p1)->display_term, (*p2)->display_term);
 }
 
-struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
+struct termlist_score **termlist_highscore(struct termlist *tl, int *len,
+                                           NMEM nmem)
 {
-    qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
-    *len = tl->highscore_num;
-    return tl->highscore;
+    struct termlist_score **highscore =
+        (struct termlist_score **)
+        nmem_malloc(nmem, tl->no_entries * sizeof(*highscore));
+
+    int no = 0;
+    unsigned bucket;
+    for (bucket = 0; bucket < tl->hash_size; bucket++)
+    {
+        struct termlist_bucket *p;
+        for (p = tl->hashtable[bucket]; p; p = p->next)
+            highscore[no++] = &p->term;
+    }
+    assert(no == tl->no_entries);
+    qsort(highscore, tl->no_entries, sizeof(struct termlist_score*), compare);
+    *len = tl->no_entries;
+    return highscore;
 }
 
 /*
  * Local variables:
  * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
  * indent-tabs-mode: nil
  * End:
  * vim: shiftwidth=4 tabstop=8 expandtab
  */
+