Metadata elements dynamically created from XSLT normalization output.
[pazpar2-moved-to-github.git] / src / termlists.c
1 /*
2  * $Id: termlists.c,v 1.3 2007-01-08 18:32:35 quinn Exp $
3  */
4
5 #include <stdlib.h>
6 #include <string.h>
7 #include <yaz/yaz-util.h>
8
9 #if HAVE_CONFIG_H
10 #include <cconfig.h>
11 #endif
12
13 #include "termlists.h"
14
15 // Discussion:
16 // As terms are found in incoming records, they are added to (or updated in) a
17 // Hash table. When term records are updated, a frequency value is updated. At
18 // the same time, a highscore is maintained for the most frequent terms.
19
20 struct termlist_bucket
21 {
22     struct termlist_score term;
23     struct termlist_bucket *next;
24 };
25
26 struct termlist
27 {
28     struct termlist_bucket **hashtable;
29     int hashtable_size;
30     int hashmask;
31
32     struct termlist_score **highscore;
33     int highscore_size;
34     int highscore_num;
35     int highscore_min;
36
37     NMEM nmem;
38 };
39
40
41 // Jenkins one-at-a-time hash (from wikipedia)
42 static unsigned int hash(const unsigned char *key)
43 {
44     unsigned int hash = 0;
45
46     while (*key)
47     {
48         hash += *(key++);
49         hash += (hash << 10);
50         hash ^= (hash >> 6);
51     }
52     hash += (hash << 3);
53     hash ^= (hash >> 11);
54     hash += (hash << 15);
55     return hash;
56 }
57
58 struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
59 {
60     int hashsize = 1;
61     int halfnumterms;
62     struct termlist *res;
63
64     // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
65     halfnumterms = numterms >> 1;
66     if (halfnumterms < 0)
67         halfnumterms = 1;
68     while (hashsize < halfnumterms)
69         hashsize <<= 1;
70     res = nmem_malloc(nmem, sizeof(struct termlist));
71     res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
72     bzero(res->hashtable, hashsize * sizeof(struct termlist_bucket*));
73     res->hashtable_size = hashsize;
74     res->nmem = nmem;
75     res->hashmask = hashsize - 1; // Creates a bitmask
76
77     res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
78     res->highscore_size = highscore_size;
79     res->highscore_num = 0;
80     res->highscore_min = 0;
81
82     return res;
83 }
84
85 static void update_highscore(struct termlist *tl, struct termlist_score *t)
86 {
87     int i;
88     int smallest;
89     int me = -1;
90
91     if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
92         return;
93
94     smallest = 0;
95     for (i = 0; i < tl->highscore_num; i++)
96     {
97         if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
98             smallest = i;
99         if (tl->highscore[i] == t)
100             me = i;
101     }
102     if (tl->highscore_num)
103         tl->highscore_min = tl->highscore[smallest]->frequency;
104     if (t->frequency < tl->highscore_min)
105         tl->highscore_min = t->frequency;
106     if (me >= 0)
107         return;
108     if (tl->highscore_num < tl->highscore_size)
109     {
110         tl->highscore[tl->highscore_num++] = t;
111         if (t->frequency < tl->highscore_min)
112             tl->highscore_min = t->frequency;
113     }
114     else
115     {
116         if (t->frequency > tl->highscore[smallest]->frequency)
117         {
118             tl->highscore[smallest] = t;
119         }
120     }
121 }
122
123 void termlist_insert(struct termlist *tl, const char *term)
124 {
125     unsigned int bucket;
126     struct termlist_bucket **p;
127
128     bucket = hash((unsigned char *)term) & tl->hashmask;
129     for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
130     {
131         if (!strcmp(term, (*p)->term.term))
132         {
133             (*p)->term.frequency++;
134             update_highscore(tl, &((*p)->term));
135             break;
136         }
137     }
138     if (!*p) // We made it to the end of the bucket without finding match
139     {
140         struct termlist_bucket *new = nmem_malloc(tl->nmem,
141                 sizeof(struct termlist_bucket));
142         new->term.term = nmem_strdup(tl->nmem, term);
143         new->term.frequency = 1;
144         new->next = 0;
145         *p = new;
146         update_highscore(tl, &new->term);
147     }
148 }
149
150 static int compare(const void *s1, const void *s2)
151 {
152     struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
153     return (*p2)->frequency - (*p1)->frequency;
154 }
155
156 struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
157 {
158     qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
159     *len = tl->highscore_num;
160     return tl->highscore;
161 }
162
163 /*
164  * Local variables:
165  * c-basic-offset: 4
166  * indent-tabs-mode: nil
167  * End:
168  * vim: shiftwidth=4 tabstop=8 expandtab
169  */