/* This file is part of Pazpar2.
- Copyright (C) 2006-2008 Index Data
+ Copyright (C) 2006-2013 Index Data
Pazpar2 is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
#include <config.h>
#endif
+#include <assert.h>
#include <stdlib.h>
#include <string.h>
-#include <ctype.h>
#include <yaz/yaz-util.h>
#include "termlists.h"
+#include "jenkins_hash.h"
// Discussion:
// As terms are found in incoming records, they are added to (or updated in) a
struct termlist
{
struct termlist_bucket **hashtable;
- int hashtable_size;
- int hashmask;
-
- struct termlist_score **highscore;
- int highscore_size;
- int highscore_num;
- int highscore_min;
+ unsigned hash_size;
+ int no_entries;
NMEM nmem;
};
-
-// Jenkins one-at-a-time hash (from wikipedia)
-static unsigned int hash(const unsigned char *key)
-{
- unsigned int hash = 0;
-
- while (*key)
- {
- hash += *(key++);
- hash += (hash << 10);
- hash ^= (hash >> 6);
- }
- hash += (hash << 3);
- hash ^= (hash >> 11);
- hash += (hash << 15);
- return hash;
-}
-
-struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
+struct termlist *termlist_create(NMEM nmem)
{
- int hashsize = 1;
- int halfnumterms;
- struct termlist *res;
-
- // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
- halfnumterms = numterms >> 1;
- if (halfnumterms < 0)
- halfnumterms = 1;
- while (hashsize < halfnumterms)
- hashsize <<= 1;
- res = nmem_malloc(nmem, sizeof(struct termlist));
- res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
- memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*));
- res->hashtable_size = hashsize;
+ struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist));
+ res->hash_size = 399;
+ res->hashtable =
+ nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*));
+ memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*));
res->nmem = nmem;
- res->hashmask = hashsize - 1; // Creates a bitmask
-
- res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
- res->highscore_size = highscore_size;
- res->highscore_num = 0;
- res->highscore_min = 0;
-
+ res->no_entries = 0;
return res;
}
-static void update_highscore(struct termlist *tl, struct termlist_score *t)
-{
- int i;
- int smallest;
- int me = -1;
-
- if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
- return;
-
- smallest = 0;
- for (i = 0; i < tl->highscore_num; i++)
- {
- if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
- smallest = i;
- if (tl->highscore[i] == t)
- me = i;
- }
- if (tl->highscore_num)
- tl->highscore_min = tl->highscore[smallest]->frequency;
- if (t->frequency < tl->highscore_min)
- tl->highscore_min = t->frequency;
- if (me >= 0)
- return;
- if (tl->highscore_num < tl->highscore_size)
- {
- tl->highscore[tl->highscore_num++] = t;
- if (t->frequency < tl->highscore_min)
- tl->highscore_min = t->frequency;
- }
- else
- {
- if (t->frequency > tl->highscore[smallest]->frequency)
- {
- tl->highscore[smallest] = t;
- }
- }
-}
-
-void termlist_insert(struct termlist *tl, const char *term)
+void termlist_insert(struct termlist *tl, const char *display_term,
+ const char *norm_term, int freq)
{
unsigned int bucket;
struct termlist_bucket **p;
- char buf[256], *cp;
+ char buf[256];
- if (strlen(term) > 255)
+ if (strlen(norm_term) > 255)
return;
- strcpy(buf, term);
- /* chop right */
- for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--)
- cp[-1] = '\0';
-
- bucket = hash((unsigned char *)buf) & tl->hashmask;
+ strcpy(buf, norm_term);
+ bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size;
for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
{
- if (!strcmp(buf, (*p)->term.term))
+ if (!strcmp(buf, (*p)->term.norm_term))
{
- (*p)->term.frequency++;
- update_highscore(tl, &((*p)->term));
+ (*p)->term.frequency += freq;
break;
}
}
{
struct termlist_bucket *new = nmem_malloc(tl->nmem,
sizeof(struct termlist_bucket));
- new->term.term = nmem_strdup(tl->nmem, buf);
- new->term.frequency = 1;
+ new->term.norm_term = nmem_strdup(tl->nmem, buf);
+ new->term.display_term = *display_term ?
+ nmem_strdup(tl->nmem, display_term) : new->term.norm_term;
+ new->term.frequency = freq;
new->next = 0;
*p = new;
- update_highscore(tl, &new->term);
+ tl->no_entries++;
}
}
static int compare(const void *s1, const void *s2)
{
- struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
- return (*p2)->frequency - (*p1)->frequency;
+ struct termlist_score **p1 = (struct termlist_score **) s1;
+ struct termlist_score **p2 = (struct termlist_score **) s2;
+ int d = (*p2)->frequency - (*p1)->frequency;
+ if (d)
+ return d;
+ return strcmp((*p1)->display_term, (*p2)->display_term);
}
-struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
+struct termlist_score **termlist_highscore(struct termlist *tl, int *len,
+ NMEM nmem)
{
- qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
- *len = tl->highscore_num;
- return tl->highscore;
+ struct termlist_score **highscore =
+ (struct termlist_score **)
+ nmem_malloc(nmem, tl->no_entries * sizeof(*highscore));
+
+ int no = 0;
+ unsigned bucket;
+ for (bucket = 0; bucket < tl->hash_size; bucket++)
+ {
+ struct termlist_bucket *p;
+ for (p = tl->hashtable[bucket]; p; p = p->next)
+ highscore[no++] = &p->term;
+ }
+ assert(no == tl->no_entries);
+ qsort(highscore, tl->no_entries, sizeof(struct termlist_score*), compare);
+ *len = tl->no_entries;
+ return highscore;
}
/*
* Local variables:
* c-basic-offset: 4
+ * c-file-style: "Stroustrup"
* indent-tabs-mode: nil
* End:
* vim: shiftwidth=4 tabstop=8 expandtab
*/
+