Merge remote-tracking branch 'origin/master' into ranking-h
[pazpar2-moved-to-github.git] / src / reclists.c
1 /* This file is part of Pazpar2.
2    Copyright (C) 2006-2013 Index Data
3
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #include <assert.h>
21
22 #if HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <yaz/yaz-util.h>
27
28 #include "ppmutex.h"
29 #include "session.h"
30 #include "reclists.h"
31 #include "jenkins_hash.h"
32
33 struct reclist
34 {
35     struct reclist_bucket **hashtable;
36     unsigned hash_size;
37
38     int num_records;
39     struct record_cluster *sorted_list;
40     struct record_cluster *sorted_ptr;
41     NMEM nmem;
42     YAZ_MUTEX mutex;
43 };
44
45 struct reclist_bucket
46 {
47     struct record_cluster *record;
48     struct reclist_bucket *hash_next;
49 };
50
51 static void append_merge_keys(struct record_metadata_attr **p,
52                               const struct record_metadata_attr *a,
53                               NMEM nmem)
54 {
55     for (; a; a = a->next)
56     {
57         struct record_metadata_attr **pi = p;
58         for (; *pi; pi = &(*pi)->next)
59             if (!strcmp((*pi)->value, a->value))
60                 break;
61         if (!*pi)
62         {
63             *pi = (struct record_metadata_attr *) nmem_malloc(nmem, sizeof(**p));
64             (*pi)->name = nmem_strdup_null(nmem, a->name);
65             (*pi)->value = nmem_strdup_null(nmem, a->value);
66             (*pi)->next = 0;
67         }
68     }
69 }
70
71 struct reclist_sortparms *reclist_parse_sortparms(NMEM nmem, const char *parms,
72                                                   struct conf_service *service)
73 {
74     struct reclist_sortparms *res = 0;
75     struct reclist_sortparms **rp = &res;
76
77     if (strlen(parms) > 256)
78         return 0;
79     while (*parms)
80     {
81         char parm[256];
82         char *pp;
83         const char *cpp;
84         int increasing = 0;
85         int i;
86         int offset = 0;
87         enum conf_sortkey_type type = Metadata_sortkey_string;
88         struct reclist_sortparms *new;
89
90         if (!(cpp = strchr(parms, ',')))
91             cpp = parms + strlen(parms);
92         strncpy(parm, parms, cpp - parms);
93         parm[cpp-parms] = '\0';
94
95         if ((pp = strchr(parm, ':')))
96         {
97             if (pp[1] == '1')
98                 increasing = 1;
99             else if (pp[1] == '0')
100                 increasing = 0;
101             else
102             {
103                 yaz_log(YLOG_FATAL, "Bad sortkey modifier: %s", parm);
104                 return 0;
105             }
106
107             if (pp[2])
108             {
109                 if (pp[2] == 'p')
110                     type = Metadata_sortkey_position;
111                 else
112                     yaz_log(YLOG_FATAL, "Bad sortkey modifier: %s", parm);
113             }
114             *pp = '\0';
115         }
116         if (type != Metadata_sortkey_position)
117         {
118             if (!strcmp(parm, "relevance"))
119             {
120                 type = Metadata_sortkey_relevance;
121             }
122             else if (!strcmp(parm, "relevance_h"))
123             {
124                 type = Metadata_sortkey_relevance_h;
125             }
126             else if (!strcmp(parm, "position"))
127             {
128                 type = Metadata_sortkey_position;
129             }
130             else
131             {
132                 for (i = 0; i < service->num_sortkeys; i++)
133                 {
134                     struct conf_sortkey *sk = &service->sortkeys[i];
135                     if (!strcmp(sk->name, parm))
136                     {
137                         type = sk->type;
138                         if (type == Metadata_sortkey_skiparticle)
139                             type = Metadata_sortkey_string;
140                         break;
141                     }
142                 }
143                 if (i >= service->num_sortkeys)
144                 {
145                     yaz_log(YLOG_FATAL, "Sortkey not defined in service: %s",
146                             parm);
147                     return 0;
148                 }
149                 offset = i;
150             }
151         }
152         new = *rp = nmem_malloc(nmem, sizeof(struct reclist_sortparms));
153         new->next = 0;
154         new->offset = offset;
155         new->type = type;
156         new->increasing = increasing;
157         new->name = nmem_strdup(nmem, parm);
158         rp = &new->next;
159         if (*(parms = cpp))
160             parms++;
161     }
162     return res;
163 }
164
165 static int reclist_cmp(const void *p1, const void *p2)
166 {
167     struct record_cluster *r1 = (*(struct record_cluster**) p1);
168     struct record_cluster *r2 = (*(struct record_cluster**) p2);
169     struct reclist_sortparms *sortparms = r1->sort_parms;
170     struct reclist_sortparms *s;
171     int res = 0;
172
173     for (s = sortparms; s && res == 0; s = s->next)
174     {
175         union data_types *ut1 = r1->sortkeys[s->offset];
176         union data_types *ut2 = r2->sortkeys[s->offset];
177         const char *s1, *s2;
178         switch (s->type)
179         {
180         case Metadata_sortkey_relevance:
181             res = r2->relevance_score - r1->relevance_score;
182             break;
183         case Metadata_sortkey_relevance_h:
184             res = r2->relevance_score - r1->relevance_score;
185             break;
186         case Metadata_sortkey_string:
187             s1 = ut1 ? ut1->text.sort : "";
188             s2 = ut2 ? ut2->text.sort : "";
189             res = strcmp(s2, s1);
190             if (res)
191             {
192                 if (s->increasing)
193                     res *= -1;
194             }
195             break;
196         case Metadata_sortkey_numeric:
197             if (ut1 && ut2)
198             {
199                 if (s->increasing)
200                     res = ut1->number.min  - ut2->number.min;
201                 else
202                     res = ut2->number.max  - ut1->number.max;
203             }
204             else if (ut1 && !ut2)
205                 res = -1;
206             else if (!ut1 && ut2)
207                 res = 1;
208             else
209                 res = 0;
210             break;
211         case Metadata_sortkey_position:
212             if (r1->records && r2->records)
213             {
214                 int pos1 = 0, pos2 = 0;
215                 struct record *rec;
216                 for (rec = r1->records; rec; rec = rec->next)
217                     if (pos1 == 0 || rec->position < pos1)
218                         pos1 = rec->position;
219                 for (rec = r2->records; rec; rec = rec->next)
220                     if (pos2 == 0 || rec->position < pos2)
221                         pos2 = rec->position;
222                 res = pos1 - pos2;
223             }
224             break;
225         default:
226             yaz_log(YLOG_WARN, "Bad sort type: %d", s->type);
227             res = 0;
228             break;
229         }
230     }
231     if (res == 0)
232         res = strcmp(r1->recid, r2->recid);
233     return res;
234 }
235
236 void reclist_limit(struct reclist *l, struct session *se, int lazy)
237 {
238     unsigned i;
239     int num = 0;
240     struct record_cluster **pp = &l->sorted_list;
241
242     reclist_enter(l);
243
244     if (!lazy || !*pp)
245     {
246         for (i = 0; i < l->hash_size; i++)
247         {
248             struct reclist_bucket *p;
249             for (p = l->hashtable[i]; p; p = p->hash_next)
250                 p->record->sorted_next = 0;
251         }
252         for (i = 0; i < l->hash_size; i++)
253         {
254             struct reclist_bucket *p;
255             for (p = l->hashtable[i]; p; p = p->hash_next)
256             {
257                 if (session_check_cluster_limit(se, p->record))
258                 {
259                     if (!p->record->sorted_next)
260                     {
261                         *pp = p->record;
262                         pp = &p->record->sorted_next;
263                         *pp = p->record; /* signal already in use */
264                         num++;
265                     }
266                 }
267             }
268         }
269         *pp = 0;
270     }
271     l->num_records = num;
272     reclist_leave(l);
273 }
274
275 void reclist_sort(struct reclist *l, struct reclist_sortparms *parms)
276 {
277     struct record_cluster **flatlist = xmalloc(sizeof(*flatlist) * l->num_records);
278     struct record_cluster *ptr;
279     struct record_cluster **prev;
280     int i = 0;
281
282     reclist_enter(l);
283
284     ptr = l->sorted_list;
285     prev = &l->sorted_list;
286     while (ptr)
287     {
288         ptr->sort_parms = parms;
289         flatlist[i] = ptr;
290         ptr = ptr->sorted_next;
291         i++;
292     }
293     assert(i == l->num_records);
294
295     qsort(flatlist, l->num_records, sizeof(*flatlist), reclist_cmp);
296     for (i = 0; i < l->num_records; i++)
297     {
298         *prev = flatlist[i];
299         prev = &flatlist[i]->sorted_next;
300     }
301     *prev = 0;
302
303     xfree(flatlist);
304
305     reclist_leave(l);
306 }
307
308 struct record_cluster *reclist_read_record(struct reclist *l)
309 {
310     if (l && l->sorted_ptr)
311     {
312         struct record_cluster *t = l->sorted_ptr;
313         l->sorted_ptr = l->sorted_ptr->sorted_next;
314         return t;
315     }
316     else
317         return 0;
318 }
319
320 void reclist_enter(struct reclist *l)
321 {
322     yaz_mutex_enter(l->mutex);
323     if (l)
324         l->sorted_ptr = l->sorted_list;
325 }
326
327
328 void reclist_leave(struct reclist *l)
329 {
330     yaz_mutex_leave(l->mutex);
331     if (l)
332         l->sorted_ptr = l->sorted_list;
333 }
334
335
336 struct reclist *reclist_create(NMEM nmem)
337 {
338     struct reclist *res = nmem_malloc(nmem, sizeof(struct reclist));
339     res->hash_size = 399;
340     res->hashtable
341         = nmem_malloc(nmem, res->hash_size * sizeof(struct reclist_bucket*));
342     memset(res->hashtable, 0, res->hash_size * sizeof(struct reclist_bucket*));
343     res->nmem = nmem;
344
345     res->sorted_ptr = 0;
346     res->sorted_list = 0;
347
348     res->num_records = 0;
349     res->mutex = 0;
350     pazpar2_mutex_create(&res->mutex, "reclist");
351     return res;
352 }
353
354 void reclist_destroy(struct reclist *l)
355 {
356     if (l)
357     {
358         unsigned i;
359         for (i = 0; i < l->hash_size; i++)
360         {
361             struct reclist_bucket *p;
362             for (p = l->hashtable[i]; p; p = p->hash_next)
363             {
364                 wrbuf_destroy(p->record->relevance_explain1);
365                 wrbuf_destroy(p->record->relevance_explain2);
366                 p->record->relevance_explain1 = 0;
367                 p->record->relevance_explain2 = 0;
368             }
369         }
370         yaz_mutex_destroy(&l->mutex);
371     }
372 }
373
374 int reclist_get_num_records(struct reclist *l)
375 {
376     if (l)
377         return l->num_records;
378     return 0;
379 }
380
381 static void merge_cluster(struct reclist *l,
382                           struct relevance *r,
383                           struct record_cluster *dst,
384                           struct record_cluster *src)
385 {
386     struct record_metadata_attr *mkl;
387     struct record **rp = &dst->records;
388     for (; *rp; rp = &(*rp)->next)
389         ;
390     *rp = src->records;
391
392     for (mkl = src->merge_keys; mkl; mkl = mkl->next)
393     {
394         const char *merge_key = mkl->value;
395         unsigned int bucket =
396             jenkins_hash((unsigned char*) merge_key) % l->hash_size;
397         struct reclist_bucket *p;
398
399         for (p = l->hashtable[bucket]; p; p = p->hash_next)
400             if (p->record == src)
401                 p->record = dst;
402     }
403
404     /* not merging metadata and sortkeys yet */
405
406     relevance_mergerec(r, dst, src);
407
408     wrbuf_puts(dst->relevance_explain1, wrbuf_cstr(src->relevance_explain1));
409     wrbuf_puts(dst->relevance_explain2, wrbuf_cstr(src->relevance_explain2));
410
411     wrbuf_destroy(src->relevance_explain1);
412     src->relevance_explain1 = 0;
413     wrbuf_destroy(src->relevance_explain2);
414     src->relevance_explain2 = 0;
415
416     append_merge_keys(&dst->merge_keys, src->merge_keys, l->nmem);
417 }
418
419 static struct record_cluster *new_cluster(
420     struct reclist *l,
421     struct relevance *r,
422     struct conf_service *service,
423     struct record *record,
424     struct record_metadata_attr *merge_keys
425     )
426 {
427     struct record_cluster *cluster;
428     cluster = nmem_malloc(l->nmem, sizeof(*cluster));
429
430     record->next = 0;
431     cluster->records = record;
432     cluster->merge_keys = 0;
433     append_merge_keys(&cluster->merge_keys, merge_keys, l->nmem);
434     cluster->relevance_score = 0;
435     cluster->recid = cluster->merge_keys->value;
436     cluster->metadata =
437         nmem_malloc(l->nmem,
438                     sizeof(struct record_metadata*) * service->num_metadata);
439     memset(cluster->metadata, 0,
440            sizeof(struct record_metadata*) * service->num_metadata);
441     cluster->sortkeys =
442         nmem_malloc(l->nmem, sizeof(struct record_metadata*) * service->num_sortkeys);
443     memset(cluster->sortkeys, 0,
444            sizeof(union data_types*) * service->num_sortkeys);
445     relevance_newrec(r, cluster);
446     cluster->relevance_explain1 = wrbuf_alloc();
447     cluster->relevance_explain2 = wrbuf_alloc();
448     /* attach to hash list */
449     l->num_records++;
450     l->sorted_list = l->sorted_ptr = 0;
451     return cluster;
452 }
453
454 // Insert a record. Return record cluster (newly formed or pre-existing)
455 struct record_cluster *reclist_insert(struct reclist *l,
456                                       struct relevance *r,
457                                       struct conf_service *service,
458                                       struct record *record,
459                                       struct record_metadata_attr *merge_keys,
460                                       int *total)
461 {
462     struct record_cluster *cluster = 0;
463     struct record_metadata_attr *mkl = merge_keys;
464
465     assert(service);
466     assert(l);
467     assert(record);
468     assert(merge_keys);
469     assert(total);
470
471     yaz_mutex_enter(l->mutex);
472
473     for (; mkl; mkl = mkl->next)
474     {
475         const char *merge_key = mkl->value;
476         unsigned int bucket =
477             jenkins_hash((unsigned char*) merge_key) % l->hash_size;
478         struct reclist_bucket **p;
479         struct reclist_bucket *rb = 0;
480
481         for (p = &l->hashtable[bucket]; *p; p = &(*p)->hash_next)
482         {
483             struct record_metadata_attr *mkr = (*p)->record->merge_keys;
484             for (; mkr; mkr = mkr->next)
485             {
486                 // We found a matching record. Merge them
487                 if (!strcmp(merge_key, mkr->value))
488                 {
489                     struct record **re;
490
491                     rb = *p;
492                     for (re = &rb->record->records; *re; re = &(*re)->next)
493                     {
494                         if ((*re)->client == record->client &&
495                             record_compare(record, *re, service))
496                         {
497                             yaz_mutex_leave(l->mutex);
498                             return 0;
499                         }
500                     }
501
502                     if (!cluster)
503                     {
504                         cluster = rb->record;
505                         *re = record;
506                         record->next = 0;
507                         yaz_log(YLOG_LOG, "reclist: record insert %p", cluster);
508                     }
509                     else
510                     {
511                         if (cluster != rb->record)
512                         {
513                             assert(rb->record->relevance_explain1);
514                             yaz_log(YLOG_LOG, "reclist: cluster merge %p %p", cluster, rb->record);
515                             merge_cluster(l, r, cluster, rb->record);
516                             (*total)--;
517                         }
518                     }
519                 }
520             }
521         }
522         if (!cluster)
523         {
524             (*total)++;
525             cluster = new_cluster(l, r, service, record, merge_keys);
526             yaz_log(YLOG_LOG, "reclist: new cluster p=%p", cluster);
527         }
528
529         if (!rb)
530         {
531             rb = nmem_malloc(l->nmem, sizeof(*rb));
532             rb->record = cluster;
533             rb->hash_next = 0;
534
535             *p = rb;
536         }
537     }
538     yaz_mutex_leave(l->mutex);
539     return cluster;
540 }
541
542 int reclist_sortparms_cmp(struct reclist_sortparms *sort1, struct reclist_sortparms *sort2)
543 {
544     int rc;
545     if (sort1 == sort2)
546         return 0;
547     if (sort1 == 0 || sort2 == 0)
548         return 1;
549     rc = strcmp(sort1->name, sort2->name) || sort1->increasing != sort2->increasing || sort1->type != sort2->type;
550     return rc;
551 }
552 /*
553  * Local variables:
554  * c-basic-offset: 4
555  * c-file-style: "Stroustrup"
556  * indent-tabs-mode: nil
557  * End:
558  * vim: shiftwidth=4 tabstop=8 expandtab
559  */
560