Cluster merge working PAZ-901
[pazpar2-moved-to-github.git] / src / reclists.c
1 /* This file is part of Pazpar2.
2    Copyright (C) 2006-2013 Index Data
3
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #include <assert.h>
21
22 #if HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <yaz/yaz-util.h>
27
28 #include "ppmutex.h"
29 #include "session.h"
30 #include "reclists.h"
31 #include "jenkins_hash.h"
32
33 struct reclist
34 {
35     struct reclist_bucket **hashtable;
36     unsigned hash_size;
37
38     int num_records;
39     struct reclist_bucket *sorted_list;
40     struct reclist_bucket *sorted_ptr;
41     NMEM nmem;
42     YAZ_MUTEX mutex;
43 };
44
45 struct reclist_bucket
46 {
47     struct record_cluster *record;
48     struct reclist_bucket *hash_next;
49     struct reclist_bucket *sorted_next;
50     struct reclist_sortparms *sort_parms;
51 };
52
53 static void append_merge_keys(struct record_metadata_attr **p,
54                               const struct record_metadata_attr *a,
55                               NMEM nmem)
56 {
57 #if 1
58     for (; a; a = a->next)
59     {
60         struct record_metadata_attr **pi = p;
61         for (; *pi; pi = &(*pi)->next)
62             if (!strcmp((*pi)->value, a->value))
63                 break;
64         if (!*pi)
65         {
66             *pi = (struct record_metadata_attr *) nmem_malloc(nmem, sizeof(**p));
67             (*pi)->name = nmem_strdup_null(nmem, a->name);
68             (*pi)->value = nmem_strdup_null(nmem, a->value);
69             (*pi)->next = 0;
70         }
71     }
72 #else
73     while (*p)
74         p = &(*p)->next;
75     for (; a; a = a->next)
76     {
77         *p = (struct record_metadata_attr *) nmem_malloc(nmem, sizeof(**p));
78         (*p)->name = nmem_strdup_null(nmem, a->name);
79         (*p)->value = nmem_strdup_null(nmem, a->value);
80         p = &(*p)->next;
81     }
82     *p = 0;
83 #endif
84 }
85
86 struct reclist_sortparms *reclist_parse_sortparms(NMEM nmem, const char *parms,
87                                                   struct conf_service *service)
88 {
89     struct reclist_sortparms *res = 0;
90     struct reclist_sortparms **rp = &res;
91
92     if (strlen(parms) > 256)
93         return 0;
94     while (*parms)
95     {
96         char parm[256];
97         char *pp;
98         const char *cpp;
99         int increasing = 0;
100         int i;
101         int offset = 0;
102         enum conf_sortkey_type type = Metadata_sortkey_string;
103         struct reclist_sortparms *new;
104
105         if (!(cpp = strchr(parms, ',')))
106             cpp = parms + strlen(parms);
107         strncpy(parm, parms, cpp - parms);
108         parm[cpp-parms] = '\0';
109
110         if ((pp = strchr(parm, ':')))
111         {
112             if (pp[1] == '1')
113                 increasing = 1;
114             else if (pp[1] == '0')
115                 increasing = 0;
116             else
117             {
118                 yaz_log(YLOG_FATAL, "Bad sortkey modifier: %s", parm);
119                 return 0;
120             }
121
122             if (pp[2])
123             {
124                 if (pp[2] == 'p')
125                     type = Metadata_sortkey_position;
126                 else
127                     yaz_log(YLOG_FATAL, "Bad sortkey modifier: %s", parm);
128             }
129             *pp = '\0';
130         }
131         if (type != Metadata_sortkey_position)
132         {
133             if (!strcmp(parm, "relevance"))
134             {
135                 type = Metadata_sortkey_relevance;
136             }
137             else if (!strcmp(parm, "position"))
138             {
139                 type = Metadata_sortkey_position;
140             }
141             else
142             {
143                 for (i = 0; i < service->num_sortkeys; i++)
144                 {
145                     struct conf_sortkey *sk = &service->sortkeys[i];
146                     if (!strcmp(sk->name, parm))
147                     {
148                         type = sk->type;
149                         if (type == Metadata_sortkey_skiparticle)
150                             type = Metadata_sortkey_string;
151                         break;
152                     }
153                 }
154                 if (i >= service->num_sortkeys)
155                 {
156                     yaz_log(YLOG_FATAL, "Sortkey not defined in service: %s",
157                             parm);
158                     return 0;
159                 }
160                 offset = i;
161             }
162         }
163         new = *rp = nmem_malloc(nmem, sizeof(struct reclist_sortparms));
164         new->next = 0;
165         new->offset = offset;
166         new->type = type;
167         new->increasing = increasing;
168         new->name = nmem_strdup(nmem, parm);
169         rp = &new->next;
170         if (*(parms = cpp))
171             parms++;
172     }
173     return res;
174 }
175
176 static int reclist_cmp(const void *p1, const void *p2)
177 {
178     struct reclist_sortparms *sortparms =
179         (*(struct reclist_bucket **) p1)->sort_parms;
180     struct record_cluster *r1 = (*(struct reclist_bucket**) p1)->record;
181     struct record_cluster *r2 = (*(struct reclist_bucket**) p2)->record;
182     struct reclist_sortparms *s;
183     int res = 0;
184
185     for (s = sortparms; s && res == 0; s = s->next)
186     {
187         union data_types *ut1 = r1->sortkeys[s->offset];
188         union data_types *ut2 = r2->sortkeys[s->offset];
189         const char *s1, *s2;
190         switch (s->type)
191         {
192         case Metadata_sortkey_relevance:
193             res = r2->relevance_score - r1->relevance_score;
194             break;
195         case Metadata_sortkey_string:
196             s1 = ut1 ? ut1->text.sort : "";
197             s2 = ut2 ? ut2->text.sort : "";
198             res = strcmp(s2, s1);
199             if (res)
200             {
201                 if (s->increasing)
202                     res *= -1;
203             }
204             break;
205         case Metadata_sortkey_numeric:
206             if (ut1 && ut2)
207             {
208                 if (s->increasing)
209                     res = ut1->number.min  - ut2->number.min;
210                 else
211                     res = ut2->number.max  - ut1->number.max;
212             }
213             else if (ut1 && !ut2)
214                 res = -1;
215             else if (!ut1 && ut2)
216                 res = 1;
217             else
218                 res = 0;
219             break;
220         case Metadata_sortkey_position:
221             if (r1->records && r2->records)
222             {
223                 int pos1 = 0, pos2 = 0;
224                 struct record *rec;
225                 for (rec = r1->records; rec; rec = rec->next)
226                     if (pos1 == 0 || rec->position < pos1)
227                         pos1 = rec->position;
228                 for (rec = r2->records; rec; rec = rec->next)
229                     if (pos2 == 0 || rec->position < pos2)
230                         pos2 = rec->position;
231                 res = pos1 - pos2;
232             }
233             break;
234         default:
235             yaz_log(YLOG_WARN, "Bad sort type: %d", s->type);
236             res = 0;
237             break;
238         }
239     }
240     if (res == 0)
241         res = strcmp(r1->recid, r2->recid);
242     return res;
243 }
244
245 void reclist_limit(struct reclist *l, struct session *se, int lazy)
246 {
247     unsigned i;
248     int num = 0;
249     struct reclist_bucket **pp = &l->sorted_list;
250
251     reclist_enter(l);
252
253     if (!lazy || !*pp)
254     {
255         for (i = 0; i < l->hash_size; i++)
256         {
257             struct reclist_bucket *p;
258             for (p = l->hashtable[i]; p; p = p->hash_next)
259             {
260                 if (session_check_cluster_limit(se, p->record))
261                 {
262                     *pp = p;
263                     pp = &p->sorted_next;
264                     num++;
265                 }
266             }
267         }
268         *pp = 0;
269     }
270     l->num_records = num;
271     reclist_leave(l);
272 }
273
274 void reclist_sort(struct reclist *l, struct reclist_sortparms *parms)
275 {
276     struct reclist_bucket **flatlist = xmalloc(sizeof(*flatlist) * l->num_records);
277     struct reclist_bucket *ptr;
278     struct reclist_bucket **prev;
279     int i = 0;
280
281     reclist_enter(l);
282
283     ptr = l->sorted_list;
284     prev = &l->sorted_list;
285     while (ptr)
286     {
287         ptr->sort_parms = parms;
288         flatlist[i] = ptr;
289         ptr = ptr->sorted_next;
290         i++;
291     }
292     assert(i == l->num_records);
293
294     qsort(flatlist, l->num_records, sizeof(*flatlist), reclist_cmp);
295     for (i = 0; i < l->num_records; i++)
296     {
297         *prev = flatlist[i];
298         prev = &flatlist[i]->sorted_next;
299     }
300     *prev = 0;
301
302     xfree(flatlist);
303
304     reclist_leave(l);
305 }
306
307 struct record_cluster *reclist_read_record(struct reclist *l)
308 {
309     if (l && l->sorted_ptr)
310     {
311         struct record_cluster *t = l->sorted_ptr->record;
312         l->sorted_ptr = l->sorted_ptr->sorted_next;
313         return t;
314     }
315     else
316         return 0;
317 }
318
319 void reclist_enter(struct reclist *l)
320 {
321     yaz_mutex_enter(l->mutex);
322     if (l)
323         l->sorted_ptr = l->sorted_list;
324 }
325
326
327 void reclist_leave(struct reclist *l)
328 {
329     yaz_mutex_leave(l->mutex);
330     if (l)
331         l->sorted_ptr = l->sorted_list;
332 }
333
334
335 struct reclist *reclist_create(NMEM nmem)
336 {
337     struct reclist *res = nmem_malloc(nmem, sizeof(struct reclist));
338     res->hash_size = 399;
339     res->hashtable
340         = nmem_malloc(nmem, res->hash_size * sizeof(struct reclist_bucket*));
341     memset(res->hashtable, 0, res->hash_size * sizeof(struct reclist_bucket*));
342     res->nmem = nmem;
343
344     res->sorted_ptr = 0;
345     res->sorted_list = 0;
346
347     res->num_records = 0;
348     res->mutex = 0;
349     pazpar2_mutex_create(&res->mutex, "reclist");
350     return res;
351 }
352
353 void reclist_destroy(struct reclist *l)
354 {
355     if (l)
356     {
357         unsigned i;
358         for (i = 0; i < l->hash_size; i++)
359         {
360             struct reclist_bucket *p;
361             for (p = l->hashtable[i]; p; p = p->hash_next)
362             {
363                 wrbuf_destroy(p->record->relevance_explain1);
364                 wrbuf_destroy(p->record->relevance_explain2);
365                 p->record->relevance_explain1 = 0;
366                 p->record->relevance_explain2 = 0;
367             }
368         }
369         yaz_mutex_destroy(&l->mutex);
370     }
371 }
372
373 int reclist_get_num_records(struct reclist *l)
374 {
375     if (l)
376         return l->num_records;
377     return 0;
378 }
379
380 static void merge_cluster(struct reclist *l,
381                           struct relevance *r,
382                           struct record_cluster *dst,
383                           struct record_cluster *src)
384 {
385     struct record **rp = &dst->records;
386     for (; *rp; rp = &(*rp)->next)
387         ;
388     *rp = src->records;
389
390     /* not merging metadata and sortkeys yet */
391
392     relevance_mergerec(r, dst, src);
393
394     wrbuf_puts(dst->relevance_explain1, wrbuf_cstr(src->relevance_explain1));
395     wrbuf_puts(dst->relevance_explain2, wrbuf_cstr(src->relevance_explain2));
396
397     wrbuf_destroy(src->relevance_explain1);
398     src->relevance_explain1 = 0;
399     wrbuf_destroy(src->relevance_explain2);
400     src->relevance_explain2 = 0;
401
402     append_merge_keys(&dst->merge_keys, src->merge_keys, l->nmem);
403 }
404
405 static struct record_cluster *new_cluster(
406     struct reclist *l,
407     struct relevance *r,
408     struct conf_service *service,
409     struct record *record,
410     struct record_metadata_attr *merge_keys
411     )
412 {
413     struct record_cluster *cluster;
414     cluster = nmem_malloc(l->nmem, sizeof(*cluster));
415
416     record->next = 0;
417     cluster->records = record;
418     cluster->merge_keys = 0;
419     append_merge_keys(&cluster->merge_keys, merge_keys, l->nmem);
420     cluster->relevance_score = 0;
421     cluster->recid = cluster->merge_keys->value;
422     cluster->metadata =
423         nmem_malloc(l->nmem,
424                     sizeof(struct record_metadata*) * service->num_metadata);
425     memset(cluster->metadata, 0,
426            sizeof(struct record_metadata*) * service->num_metadata);
427     cluster->sortkeys =
428         nmem_malloc(l->nmem, sizeof(struct record_metadata*) * service->num_sortkeys);
429     memset(cluster->sortkeys, 0,
430            sizeof(union data_types*) * service->num_sortkeys);
431     relevance_newrec(r, cluster);
432     cluster->relevance_explain1 = wrbuf_alloc();
433     cluster->relevance_explain2 = wrbuf_alloc();
434     /* attach to hash list */
435     l->num_records++;
436     l->sorted_list = l->sorted_ptr = 0;
437     return cluster;
438 }
439
440 // Insert a record. Return record cluster (newly formed or pre-existing)
441 struct record_cluster *reclist_insert(struct reclist *l,
442                                       struct relevance *r,
443                                       struct conf_service *service,
444                                       struct record *record,
445                                       struct record_metadata_attr *merge_keys,
446                                       int *total)
447 {
448     struct record_cluster *cluster = 0;
449     struct record_metadata_attr *mkl = merge_keys;
450
451     assert(service);
452     assert(l);
453     assert(record);
454     assert(merge_keys);
455     assert(total);
456
457     yaz_mutex_enter(l->mutex);
458
459     for (; mkl; mkl = mkl->next)
460     {
461         const char *merge_key = mkl->value;
462         unsigned int bucket =
463             jenkins_hash((unsigned char*) merge_key) % l->hash_size;
464         struct reclist_bucket **p;
465         struct reclist_bucket *rb = 0;
466
467         for (p = &l->hashtable[bucket]; *p; p = &(*p)->hash_next)
468         {
469             struct record_metadata_attr *mkr = (*p)->record->merge_keys;
470             for (; mkr; mkr = mkr->next)
471             {
472                 // We found a matching record. Merge them
473                 if (!strcmp(merge_key, mkr->value))
474                 {
475                     struct record **re;
476
477                     rb = *p;
478                     for (re = &rb->record->records; *re; re = &(*re)->next)
479                     {
480                         if ((*re)->client == record->client &&
481                             record_compare(record, *re, service))
482                         {
483                             yaz_mutex_leave(l->mutex);
484                             return 0;
485                         }
486                     }
487
488                     if (!cluster)
489                     {
490                         cluster = rb->record;
491                         *re = record;
492                         record->next = 0;
493                     }
494                     else
495                     {
496                         if (cluster != rb->record)
497                         {
498                             if (!rb->record->records)
499                             {
500                                 ; /* already merged */
501                             }
502                             else
503                             {
504                                 merge_cluster(l, r, cluster, rb->record);
505
506                                 rb->record->records = 0; /* signal merged */
507                             }
508                             /* update the hash table */
509                             rb->record = cluster;
510                         }
511                     }
512                 }
513             }
514         }
515         if (!cluster)
516         {
517             (*total)++;
518             cluster = new_cluster(l, r, service, record, merge_keys);
519         }
520
521         if (!rb)
522         {
523             rb = nmem_malloc(l->nmem, sizeof(*rb));
524             rb->record = cluster;
525             rb->hash_next = 0;
526
527             *p = rb;
528         }
529     }
530     yaz_mutex_leave(l->mutex);
531     return cluster;
532 }
533
534 int reclist_sortparms_cmp(struct reclist_sortparms *sort1, struct reclist_sortparms *sort2)
535 {
536     int rc;
537     if (sort1 == sort2)
538         return 0;
539     if (sort1 == 0 || sort2 == 0)
540         return 1;
541     rc = strcmp(sort1->name, sort2->name) || sort1->increasing != sort2->increasing || sort1->type != sort2->type;
542     return rc;
543 }
544 /*
545  * Local variables:
546  * c-basic-offset: 4
547  * c-file-style: "Stroustrup"
548  * indent-tabs-mode: nil
549  * End:
550  * vim: shiftwidth=4 tabstop=8 expandtab
551  */
552