Reorganized source tree
[pazpar2-moved-to-github.git] / src / termlists.c
1 /*
2  * $Id: termlists.c,v 1.1 2006-12-20 20:47:16 quinn Exp $
3  */
4
5 #include <stdlib.h>
6 #include <string.h>
7 #include <yaz/yaz-util.h>
8
9 #include "termlists.h"
10
11 // Discussion:
12 // As terms are found in incoming records, they are added to (or updated in) a
13 // Hash table. When term records are updated, a frequency value is updated. At
14 // the same time, a highscore is maintained for the most frequent terms.
15
16 struct termlist_bucket
17 {
18     struct termlist_score term;
19     struct termlist_bucket *next;
20 };
21
22 struct termlist
23 {
24     struct termlist_bucket **hashtable;
25     int hashtable_size;
26     int hashmask;
27
28     struct termlist_score **highscore;
29     int highscore_size;
30     int highscore_num;
31     int highscore_min;
32
33     NMEM nmem;
34 };
35
36
37 // Jenkins one-at-a-time hash (from wikipedia)
38 static unsigned int hash(const unsigned char *key)
39 {
40     unsigned int hash = 0;
41
42     while (*key)
43     {
44         hash += *(key++);
45         hash += (hash << 10);
46         hash ^= (hash >> 6);
47     }
48     hash += (hash << 3);
49     hash ^= (hash >> 11);
50     hash += (hash << 15);
51     return hash;
52 }
53
54 struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
55 {
56     int hashsize = 1;
57     int halfnumterms;
58     struct termlist *res;
59
60     // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
61     halfnumterms = numterms >> 1;
62     if (halfnumterms < 0)
63         halfnumterms = 1;
64     while (hashsize < halfnumterms)
65         hashsize <<= 1;
66     res = nmem_malloc(nmem, sizeof(struct termlist));
67     res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
68     bzero(res->hashtable, hashsize * sizeof(struct termlist_bucket*));
69     res->hashtable_size = hashsize;
70     res->nmem = nmem;
71     res->hashmask = hashsize - 1; // Creates a bitmask
72
73     res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
74     res->highscore_size = highscore_size;
75     res->highscore_num = 0;
76     res->highscore_min = 0;
77
78     return res;
79 }
80
81 static void update_highscore(struct termlist *tl, struct termlist_score *t)
82 {
83     int i;
84     int smallest;
85     int me = -1;
86
87     if (t->frequency < tl->highscore_min)
88         return;
89
90     smallest = 0;
91     for (i = 0; i < tl->highscore_num; i++)
92     {
93         if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
94             smallest = i;
95         if (tl->highscore[i] == t)
96             me = i;
97     }
98     if (tl->highscore_num)
99         tl->highscore_min = tl->highscore[smallest]->frequency;
100     if (me >= 0)
101         return;
102     if (tl->highscore_num < tl->highscore_size)
103     {
104         tl->highscore[tl->highscore_num++] = t;
105         if (t->frequency < tl->highscore_min)
106             tl->highscore_min = t->frequency;
107     }
108     else
109     {
110         if (t->frequency > tl->highscore[smallest]->frequency)
111         {
112             tl->highscore[smallest] = t;
113         }
114     }
115 }
116
117 void termlist_insert(struct termlist *tl, const char *term)
118 {
119     unsigned int bucket;
120     struct termlist_bucket **p;
121
122     bucket = hash((unsigned char *)term) & tl->hashmask;
123     for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
124     {
125         if (!strcmp(term, (*p)->term.term))
126         {
127             (*p)->term.frequency++;
128             update_highscore(tl, &((*p)->term));
129             break;
130         }
131     }
132     if (!*p) // We made it to the end of the bucket without finding match
133     {
134         struct termlist_bucket *new = nmem_malloc(tl->nmem,
135                 sizeof(struct termlist_bucket));
136         new->term.term = nmem_strdup(tl->nmem, term);
137         new->term.frequency = 1;
138         new->next = 0;
139         *p = new;
140         update_highscore(tl, &new->term);
141     }
142 }
143
144 static int compare(const void *s1, const void *s2)
145 {
146     struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
147     return (*p2)->frequency - (*p1)->frequency;
148 }
149
150 struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
151 {
152     qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
153     *len = tl->highscore_num;
154     return tl->highscore;
155 }
156
157 /*
158  * Local variables:
159  * c-basic-offset: 4
160  * indent-tabs-mode: nil
161  * End:
162  * vim: shiftwidth=4 tabstop=8 expandtab
163  */