9eee590663308f34755df122c47f84354215fe51
[idzebra-moved-to-github.git] / index / zsets.c
1 /* $Id: zsets.c,v 1.125 2007-11-01 15:59:47 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23
24 #include <stdio.h>
25 #include <assert.h>
26 #ifdef WIN32
27 #include <io.h>
28 #else
29 #include <unistd.h>
30 #endif
31
32 #include "index.h"
33 #include "rank.h"
34 #include <yaz/diagbib1.h>
35 #include <rset.h>
36
37 #define ZSET_SORT_MAX_LEVEL 10
38
/* One term recorded on a result set by resultSetAddTerm(). */
struct zebra_set_term_entry {
    int reg_type;       /* register type as passed by the caller */
    char *db;           /* database name, copied into the set's nmem */
    char *index_name;   /* index name, copied into the set's nmem */
    char *term;         /* term text, copied into the set's nmem;
                           0 while the slot has not been filled */
};
45
46 struct zebra_set {
47     char *name;
48     RSET rset;
49     NMEM nmem;
50     NMEM rset_nmem; /* for creating the rsets in */
51     zint hits;
52     int num_bases;
53     char **basenames;
54     Z_RPNQuery *rpn;
55     Z_SortKeySpecList *sortSpec;
56     struct zset_sort_info *sort_info;
57     struct zebra_set_term_entry *term_entries;
58     int term_entries_max;
59     struct zebra_set *next;
60     int locked;
61     int estimated_hit_count;
62
63     zint cache_position;  /* last position */
64     RSFD cache_rfd;       /* rfd (NULL if not existing) */
65     zint cache_psysno;    /* sysno for last position */
66     zint approx_limit;    /* limit before we do approx */
67 };
68
69 struct zset_sort_entry {
70     zint sysno;
71     int score;
72 };
73
/*
 * Sorted window of a result set.  `entries' holds pointers into
 * `all_entries' in sort order; only the first num_entries are in use.
 */
struct zset_sort_info {
    int max_entries;                        /* capacity of both arrays */
    int num_entries;                        /* entries currently in use */
    struct zset_sort_entry *all_entries;    /* backing storage */
    struct zset_sort_entry **entries;       /* ordered pointers to storage */
};
80
/* Module log levels; only meaningful once log_level_set is non-zero. */
static int log_level_set = 0;
static int log_level_sort = 0;
static int log_level_searchhits = 0;
static int log_level_searchterms = 0;
static int log_level_resultsets = 0;

/* Look up the module log levels once; later calls do nothing. */
static void loglevels(void)
{
    if (!log_level_set)
    {
        log_level_sort = yaz_log_module_level("sorting");
        log_level_searchhits = yaz_log_module_level("searchhits");
        log_level_searchterms = yaz_log_module_level("searchterms");
        log_level_resultsets = yaz_log_module_level("resultsets");
        log_level_set = 1;
    }
}
97
98
99 static ZEBRA_RES resultSetSearch(ZebraHandle zh, NMEM nmem, NMEM rset_nmem,
100                                  Z_RPNQuery *rpn, ZebraSet sset)
101 {
102     RSET rset = 0;
103     Z_SortKeySpecList *sort_sequence;
104     int sort_status, i;
105     ZEBRA_RES res = ZEBRA_OK;
106
107     sort_sequence = (Z_SortKeySpecList *)
108         nmem_malloc(nmem, sizeof(*sort_sequence));
109     sort_sequence->num_specs = 10; /* FIXME - Hard-coded number */
110     sort_sequence->specs = (Z_SortKeySpec **)
111         nmem_malloc(nmem, sort_sequence->num_specs *
112                      sizeof(*sort_sequence->specs));
113     for (i = 0; i<sort_sequence->num_specs; i++)
114         sort_sequence->specs[i] = 0;
115     
116     rpn_get_top_approx_limit(zh, rpn->RPNStructure, &sset->approx_limit);
117
118     res = rpn_search_top(zh, rpn->RPNStructure, rpn->attributeSetId,
119                          nmem, rset_nmem,
120                          sort_sequence,
121                          sset->num_bases, sset->basenames,
122                          &rset);
123     if (res != ZEBRA_OK)
124     {
125         sset->rset = 0;
126         return res;
127     }
128     for (i = 0; sort_sequence->specs[i]; i++)
129         ;
130     sort_sequence->num_specs = i;
131     rset->hits_limit = sset->approx_limit;
132     if (!i)
133     {
134         res = resultSetRank(zh, sset, rset, rset_nmem);
135     }
136     else
137     {
138         res = resultSetSortSingle(zh, nmem, sset, rset,
139                                    sort_sequence, &sort_status);
140     }
141     sset->rset = rset;
142     return res;
143 }
144
145
146 ZEBRA_RES resultSetAddRPN(ZebraHandle zh, NMEM m, Z_RPNQuery *rpn,
147                           int num_bases, char **basenames,
148                           const char *setname,
149                           zint *hits, int *estimated_hit_count)
150 {
151     ZebraSet zebraSet;
152     int i;
153     ZEBRA_RES res;
154
155     *hits = 0;
156     *estimated_hit_count = 0;
157
158     zebraSet = resultSetAdd(zh, setname, 1);
159     if (!zebraSet)
160         return ZEBRA_FAIL;
161     zebraSet->locked = 1;
162     zebraSet->rpn = 0;
163     zebraSet->nmem = m;
164     zebraSet->rset_nmem = nmem_create(); 
165
166     zebraSet->num_bases = num_bases;
167     zebraSet->basenames = 
168         nmem_malloc(zebraSet->nmem, num_bases * sizeof(*zebraSet->basenames));
169     for (i = 0; i<num_bases; i++)
170         zebraSet->basenames[i] = nmem_strdup(zebraSet->nmem, basenames[i]);
171
172     res = resultSetSearch(zh, zebraSet->nmem, zebraSet->rset_nmem,
173                           rpn, zebraSet);
174     *hits = zebraSet->hits;
175     if (zebraSet->estimated_hit_count)
176         *estimated_hit_count = 1;
177
178     if (zebraSet->rset)
179         zebraSet->rpn = rpn;
180     zebraSet->locked = 0;
181     if (!zebraSet->rset)
182         return ZEBRA_FAIL;
183     return res;
184 }
185
186 void resultSetAddTerm(ZebraHandle zh, ZebraSet s, int reg_type,
187                       const char *db, const char *index_name, 
188                       const char *term)
189 {
190     assert(zh); /* compiler shut up */
191     if (!s->nmem)
192         s->nmem = nmem_create();
193     if (!s->term_entries)
194     {
195         int i;
196         s->term_entries_max = 1000;
197         s->term_entries =
198             nmem_malloc(s->nmem, s->term_entries_max * 
199                          sizeof(*s->term_entries));
200         for (i = 0; i < s->term_entries_max; i++)
201             s->term_entries[i].term = 0;
202     }
203     if (s->hits < s->term_entries_max)
204     {
205         s->term_entries[s->hits].reg_type = reg_type;
206         s->term_entries[s->hits].db = nmem_strdup(s->nmem, db);
207         s->term_entries[s->hits].index_name = nmem_strdup(s->nmem, index_name);
208         s->term_entries[s->hits].term = nmem_strdup(s->nmem, term);
209     }
210     (s->hits)++;
211 }
212
213 ZebraSet resultSetAdd(ZebraHandle zh, const char *name, int ov)
214 {
215     ZebraSet s;
216     int i;
217
218     for (s = zh->sets; s; s = s->next)
219         if (!strcmp(s->name, name))
220             break;
221     
222     if (!log_level_set)
223         loglevels();
224     if (s)
225     {
226         yaz_log(log_level_resultsets, "updating result set %s", name);
227         if (!ov || s->locked)
228             return NULL;
229         if (s->rset)
230         {
231             if (s->cache_rfd)
232                 rset_close(s->cache_rfd);
233             rset_delete(s->rset);
234         }
235         if (s->rset_nmem)
236             nmem_destroy(s->rset_nmem);
237         if (s->nmem)
238             nmem_destroy(s->nmem);
239     }
240     else
241     {
242         const char *sort_max_str = zebra_get_resource(zh, "sortmax", "1000");
243
244         yaz_log(log_level_resultsets, "adding result set %s", name);
245         s = (ZebraSet) xmalloc(sizeof(*s));
246         s->next = zh->sets;
247         zh->sets = s;
248         s->name = xstrdup(name);
249
250         s->sort_info = (struct zset_sort_info *)
251             xmalloc(sizeof(*s->sort_info));
252         s->sort_info->max_entries = atoi(sort_max_str);
253         if (s->sort_info->max_entries < 2)
254             s->sort_info->max_entries = 2;
255
256         s->sort_info->entries = (struct zset_sort_entry **)
257             xmalloc(sizeof(*s->sort_info->entries) *
258                      s->sort_info->max_entries);
259         s->sort_info->all_entries = (struct zset_sort_entry *)
260             xmalloc(sizeof(*s->sort_info->all_entries) *
261                      s->sort_info->max_entries);
262         for (i = 0; i < s->sort_info->max_entries; i++)
263             s->sort_info->entries[i] = s->sort_info->all_entries + i;
264     }
265     s->locked = 0;
266     s->term_entries = 0;
267     s->hits = 0;
268     s->rset = 0;
269     s->rset_nmem = 0;
270     s->nmem = 0;
271     s->rpn = 0;
272     s->sortSpec = 0;
273     s->cache_position = 0;
274     s->cache_rfd = 0;
275     s->approx_limit = zh->approx_limit;
276     s->estimated_hit_count = 0;
277     return s;
278 }
279
280 ZebraSet resultSetGet(ZebraHandle zh, const char *name)
281 {
282     ZebraSet s;
283
284     for (s = zh->sets; s; s = s->next)
285         if (!strcmp(s->name, name))
286         {
287             if (!s->term_entries && !s->rset && s->rpn)
288             {
289                 NMEM nmem = nmem_create();
290                 yaz_log(log_level_resultsets, "research %s", name);
291                 if (!s->rset_nmem)
292                     s->rset_nmem = nmem_create();
293                 resultSetSearch(zh, nmem, s->rset_nmem, s->rpn, s);
294                 if (s->rset && s->sortSpec)
295                 {
296                     int sort_status;
297                     yaz_log(log_level_resultsets, "resort %s", name);
298                     resultSetSortSingle(zh, nmem, s, s->rset, s->sortSpec,
299                                          &sort_status);
300                 }
301                 nmem_destroy(nmem);
302             }
303             return s;
304         }
305     return NULL;
306 }
307
308 void resultSetInvalidate(ZebraHandle zh)
309 {
310     ZebraSet s = zh->sets;
311     
312     yaz_log(log_level_resultsets, "invalidating result sets");
313     for (; s; s = s->next)
314     {
315         if (s->rset)
316         {
317             if (s->cache_rfd)
318                 rset_close(s->cache_rfd);
319             rset_delete(s->rset);
320         }
321         s->rset = 0;
322         s->cache_rfd = 0;
323         s->cache_position = 0;
324         if (s->rset_nmem)
325             nmem_destroy(s->rset_nmem);
326         s->rset_nmem=0;
327     }
328 }
329
330 void resultSetDestroy(ZebraHandle zh, int num, char **names,int *statuses)
331 {
332     ZebraSet * ss = &zh->sets;
333     int i;
334     
335     if (statuses)
336         for (i = 0; i<num; i++)
337             statuses[i] = Z_DeleteStatus_resultSetDidNotExist;
338     while (*ss)
339     {
340         int i = -1;
341         ZebraSet s = *ss;
342         if (num >= 0)
343         {
344             for (i = 0; i<num; i++)
345                 if (!strcmp(s->name, names[i]))
346                 {
347                     if (statuses)
348                         statuses[i] = Z_DeleteStatus_success;
349                     i = -1;
350                     break;
351                 }
352         }
353         if (i < 0)
354         {
355             *ss = s->next;
356             
357             xfree(s->sort_info->all_entries);
358             xfree(s->sort_info->entries);
359             xfree(s->sort_info);
360             
361             if (s->nmem)
362                 nmem_destroy(s->nmem);
363             if (s->rset)
364             {
365                 if (s->cache_rfd)
366                     rset_close(s->cache_rfd);
367                 rset_delete(s->rset);
368             }
369             if (s->rset_nmem)
370                 nmem_destroy(s->rset_nmem);
371             xfree(s->name);
372             xfree(s);
373         }
374         else
375             ss = &s->next;
376     }
377 }
378
379 ZebraMetaRecord *zebra_meta_records_create_range(ZebraHandle zh,
380                                                  const char *name, 
381                                                  zint start, int num)
382 {
383     zint pos_small[10];
384     zint *pos = pos_small;
385     ZebraMetaRecord *mr;
386     int i;
387
388     if (num > 10000 || num <= 0)
389         return 0;
390
391     if (num > 10)
392         pos = xmalloc(sizeof(*pos) * num);
393     
394     for (i = 0; i<num; i++)
395         pos[i] = start+i;
396
397     mr = zebra_meta_records_create(zh, name, num, pos);
398     
399     if (num > 10)
400         xfree(pos);
401     return mr;
402 }
403
404 ZebraMetaRecord *zebra_meta_records_create(ZebraHandle zh, const char *name, 
405                                            int num, zint *positions)
406 {
407     ZebraSet sset;
408     ZebraMetaRecord *sr = 0;
409     RSET rset;
410     int i;
411     struct zset_sort_info *sort_info;
412     size_t sysno_mem_index = 0;
413
414     if (zh->m_staticrank)
415         sysno_mem_index = 1;
416
417     if (!log_level_set)
418         loglevels();
419     if (!(sset = resultSetGet(zh, name)))
420         return NULL;
421     if (!(rset = sset->rset))
422     {
423         if (!sset->term_entries)
424             return 0;
425         sr = (ZebraMetaRecord *) xmalloc(sizeof(*sr) * num);
426         for (i = 0; i<num; i++)
427         {
428             sr[i].sysno = 0;
429             sr[i].score = -1;
430             sr[i].term = 0;
431             sr[i].db = 0;
432
433             if (positions[i] <= sset->term_entries_max)
434             {
435                 sr[i].term = sset->term_entries[positions[i]-1].term;
436                 sr[i].db = sset->term_entries[positions[i]-1].db;
437             }
438         }
439     }
440     else
441     {
442         sr = (ZebraMetaRecord *) xmalloc(sizeof(*sr) * num);
443         for (i = 0; i<num; i++)
444         {
445             sr[i].sysno = 0;
446             sr[i].score = -1;
447             sr[i].term = 0;
448             sr[i].db = 0;
449         }
450         sort_info = sset->sort_info;
451         if (sort_info)
452         {
453             zint position;
454             
455             for (i = 0; i<num; i++)
456             {
457                 position = positions[i];
458                 if (position > 0 && position <= sort_info->num_entries)
459                 {
460                     yaz_log(log_level_sort, "got pos=" ZINT_FORMAT
461                             " (sorted)", position);
462                     sr[i].sysno = sort_info->entries[position-1]->sysno;
463                     sr[i].score = sort_info->entries[position-1]->score;
464                 }
465             }
466         }
467         /* did we really get all entries using sort ? */
468         for (i = 0; i<num; i++)
469         {
470             if (!sr[i].sysno)
471                 break;
472         }
473         if (i < num) /* nope, get the rest, unsorted - sorry */
474         {
475             zint position = 0;
476             int num_i = 0;
477             zint psysno = 0;
478             RSFD rfd;
479             struct it_key key;
480             
481             if (sort_info)
482                 position = sort_info->num_entries;
483             while (num_i < num && positions[num_i] <= position)
484                 num_i++;
485             
486             if (sset->cache_rfd &&
487                 num_i < num && positions[num_i] > sset->cache_position)
488             {
489                 position = sset->cache_position;
490                 rfd = sset->cache_rfd;
491                 psysno = sset->cache_psysno;
492             }
493             else
494             {
495                 if (sset->cache_rfd)
496                     rset_close(sset->cache_rfd);
497                 rfd = rset_open(rset, RSETF_READ);
498             }
499             while (num_i < num && rset_read(rfd, &key, 0))
500             {
501                 zint this_sys = key.mem[sysno_mem_index];
502                 if (this_sys != psysno)
503                 {
504                     psysno = this_sys;
505                     if (sort_info)
506                     {
507                         /* determine we alreay have this in our set */
508                         for (i = sort_info->num_entries; --i >= 0; )
509                             if (psysno == sort_info->entries[i]->sysno)
510                                 break;
511                         if (i >= 0)
512                             continue;
513                     }
514                     position++;
515                     assert(num_i < num);
516                     if (position == positions[num_i])
517                     {
518                         sr[num_i].sysno = psysno;
519                         yaz_log(log_level_sort, "got pos=" ZINT_FORMAT " (unsorted)", position);
520                         sr[num_i].score = -1;
521                         num_i++;
522                     }
523                 }
524             }
525             sset->cache_position = position;
526             sset->cache_psysno = psysno;
527             sset->cache_rfd = rfd;
528         }
529     }
530     return sr;
531 }
532
533 void zebra_meta_records_destroy(ZebraHandle zh, ZebraMetaRecord *records,
534                                  int num)
535 {
536     assert(zh); /* compiler shut up about unused arg */
537     xfree(records);
538 }
539
/* One resolved sort criterion as used by resultSetInsertSort(). */
struct sortKeyInfo {
    int relation;               /* 'A' for ascending, 'D' for descending */
    int ord;                    /* sort index ordinal, -1 when unresolved */
    int numerical;              /* non-zero: compare keys as numbers */
    const char *index_type;     /* index type handed to zebra_term_untrans */
};
546
547 void resultSetInsertSort(ZebraHandle zh, ZebraSet sset,
548                          struct sortKeyInfo *criteria, int num_criteria,
549                          zint sysno,
550                          char *cmp_buf[], char *tmp_cmp_buf[])
551 {
552     struct zset_sort_entry *new_entry = NULL;
553     struct zset_sort_info *sort_info = sset->sort_info;
554     int i, j;
555
556     zebra_sort_sysno(zh->reg->sort_index, sysno);
557     for (i = 0; i<num_criteria; i++)
558     {
559         char *this_entry_buf = tmp_cmp_buf[i];
560         memset(this_entry_buf, '\0', SORT_IDX_ENTRYSIZE);
561         if (criteria[i].ord != -1)
562         {
563             zebra_sort_type(zh->reg->sort_index, criteria[i].ord);
564             zebra_sort_read(zh->reg->sort_index, this_entry_buf);
565         }
566     }
567     i = sort_info->num_entries;
568     while (--i >= 0)
569     {
570         int rel = 0;
571         for (j = 0; j<num_criteria; j++)
572         {
573             char *this_entry_buf = tmp_cmp_buf[j];
574             char *other_entry_buf = 
575                 cmp_buf[j] + i * SORT_IDX_ENTRYSIZE;
576             if (criteria[j].numerical)
577             {
578                 char this_entry_org[1024];
579                 char other_entry_org[1024];
580                 double diff;
581                 const char *index_type = criteria[j].index_type;
582                 zebra_term_untrans(zh, index_type, this_entry_org,
583                                    this_entry_buf);
584                 zebra_term_untrans(zh, index_type, other_entry_org,
585                                    other_entry_buf);
586                 diff = atof(this_entry_org) - atof(other_entry_org);
587                 
588                 if (diff > 0.0)
589                     rel = 1;
590                 else if (diff < 0.0)
591                     rel = -1;
592                 else
593                     rel = 0;
594             }
595             else
596             {
597                 rel = memcmp(this_entry_buf, other_entry_buf,
598                              SORT_IDX_ENTRYSIZE);
599             }
600             if (rel)
601                 break;
602         }       
603         if (!rel)
604             break;
605         if (criteria[j].relation == 'A')
606         {
607             if (rel > 0)
608                 break;
609         }
610         else if (criteria[j].relation == 'D')
611         {
612             if (rel < 0)
613                 break;
614         }
615     }
616     ++i;
617     j = sort_info->max_entries;
618     if (i == j)
619         return;
620
621     if (sort_info->num_entries == j)
622         --j;
623     else
624         j = (sort_info->num_entries)++;
625     new_entry = sort_info->entries[j];
626     while (j != i)
627     {
628         int k;
629         for (k = 0; k<num_criteria; k++)
630         {
631             char *j_buf = cmp_buf[k] + j * SORT_IDX_ENTRYSIZE;
632             char *j_1_buf = cmp_buf[k] + (j-1) * SORT_IDX_ENTRYSIZE;
633             memcpy(j_buf, j_1_buf, SORT_IDX_ENTRYSIZE);
634         }
635         sort_info->entries[j] = sort_info->entries[j-1];
636         --j;
637     }
638     sort_info->entries[i] = new_entry;
639     assert(new_entry);
640     for (i = 0; i<num_criteria; i++)
641     {
642         char *new_entry_buf = cmp_buf[i] + j * SORT_IDX_ENTRYSIZE;
643         char *this_entry_buf = tmp_cmp_buf[i];
644         memcpy(new_entry_buf, this_entry_buf, SORT_IDX_ENTRYSIZE);
645     }
646     new_entry->sysno = sysno;
647     new_entry->score = -1;
648 }
649
650 void resultSetInsertRank(ZebraHandle zh, struct zset_sort_info *sort_info,
651                          zint sysno, int score, int relation)
652 {
653     struct zset_sort_entry *new_entry = NULL;
654     int i, j;
655     assert(zh); /* compiler shut up about unused arg */
656
657     i = sort_info->num_entries;
658     while (--i >= 0)
659     {
660         int rel = 0;
661
662         rel = score - sort_info->entries[i]->score;
663
664         if (relation == 'D')
665         {
666             if (rel >= 0)
667                 break;
668         }
669         else if (relation == 'A')
670         {
671             if (rel <= 0)
672                 break;
673         }
674     }
675     ++i;
676     j = sort_info->max_entries;
677     if (i == j)
678         return;
679
680     if (sort_info->num_entries == j)
681         --j;
682     else
683         j = (sort_info->num_entries)++;
684     
685     new_entry = sort_info->entries[j];
686     while (j != i)
687     {
688         sort_info->entries[j] = sort_info->entries[j-1];
689         --j;
690     }
691     sort_info->entries[i] = new_entry;
692     assert(new_entry);
693     new_entry->sysno = sysno;
694     new_entry->score = score;
695 }
696
697 static Z_RPNQuery *copy_RPNQuery(Z_RPNQuery *src, NMEM nmem)
698 {
699     Z_RPNQuery *dst = 0;
700     ODR encode = odr_createmem(ODR_ENCODE);
701     ODR decode = odr_createmem(ODR_DECODE);
702
703     if (z_RPNQuery(encode, &src, 0, 0))
704     {
705         int len;
706         char *buf = odr_getbuf(encode, &len, 0);
707
708         if (buf)
709         {
710             odr_setbuf(decode, buf, len, 0);
711             z_RPNQuery(decode, &dst, 0, 0);
712         }
713     }
714     nmem_transfer(nmem, decode->mem);
715     odr_destroy(encode);
716     odr_destroy(decode);
717     return dst;
718 }
719
720 static Z_SortKeySpecList *copy_SortKeySpecList(Z_SortKeySpecList *src, NMEM nmem)
721 {
722     Z_SortKeySpecList *dst = 0;
723     ODR encode = odr_createmem(ODR_ENCODE);
724     ODR decode = odr_createmem(ODR_DECODE);
725
726     if (z_SortKeySpecList(encode, &src, 0, 0))
727     {
728         int len;
729         char *buf = odr_getbuf(encode, &len, 0);
730
731         if (buf)
732         {
733             odr_setbuf(decode, buf, len, 0);
734             z_SortKeySpecList(decode, &dst, 0, 0);
735         }
736     }
737     nmem_transfer(nmem, decode->mem);
738     odr_destroy(encode);
739     odr_destroy(decode);
740     return dst;
741 }
742
743 ZebraSet resultSetClone(ZebraHandle zh, const char *setname,
744                         ZebraSet rset)
745 {
746     ZebraSet nset;
747     int i;
748
749     nset = resultSetAdd(zh, setname, 1);
750     if (!nset)
751         return 0;
752
753     nset->nmem = nmem_create();
754
755     nset->num_bases = rset->num_bases;
756     nset->basenames = 
757         nmem_malloc(nset->nmem, nset->num_bases * sizeof(*rset->basenames));
758     for (i = 0; i<rset->num_bases; i++)
759         nset->basenames[i] = nmem_strdup(nset->nmem, rset->basenames[i]);
760
761     if (rset->rset)
762         nset->rset = rset_dup(rset->rset);
763     if (rset->rpn)
764         nset->rpn = copy_RPNQuery(rset->rpn, nset->nmem);
765     return nset;
766 }
767
768 ZEBRA_RES resultSetSort(ZebraHandle zh, NMEM nmem,
769                         int num_input_setnames, const char **input_setnames,
770                         const char *output_setname,
771                         Z_SortKeySpecList *sort_sequence, int *sort_status)
772 {
773     ZebraSet sset;
774     RSET rset;
775
776     if (num_input_setnames == 0)
777     {
778         zebra_setError(zh, YAZ_BIB1_NO_RESULT_SET_NAME_SUPPLIED_ON_SORT, 0);
779         return ZEBRA_FAIL;
780     }
781     if (num_input_setnames > 1)
782     {
783         zebra_setError(zh, YAZ_BIB1_SORT_TOO_MANY_INPUT_RESULTS, 0);
784         return ZEBRA_FAIL;
785     }
786     if (!log_level_set)
787         loglevels();
788     yaz_log(log_level_sort, "result set sort input=%s output=%s",
789           *input_setnames, output_setname);
790     sset = resultSetGet(zh, input_setnames[0]);
791     if (!sset)
792     {
793         zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST,
794                        input_setnames[0]);
795         return ZEBRA_FAIL;
796     }
797     if (!(rset = sset->rset))
798     {
799         zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST,
800                        input_setnames[0]);
801         return ZEBRA_FAIL;
802     }
803     if (strcmp(output_setname, input_setnames[0]))
804         sset = resultSetClone(zh, output_setname, sset);
805     sset->sortSpec = copy_SortKeySpecList(sort_sequence, sset->nmem);
806     return resultSetSortSingle (zh, nmem, sset, rset, sort_sequence,
807                                 sort_status);
808 }
809
810 ZEBRA_RES resultSetSortSingle(ZebraHandle zh, NMEM nmem,
811                               ZebraSet sset, RSET rset,
812                               Z_SortKeySpecList *sort_sequence,
813                               int *sort_status)
814 {
815     int i;
816     int n = 0;
817     zint kno = 0;
818     zint psysno = 0;
819     struct it_key key;
820     struct sortKeyInfo sort_criteria[ZSET_SORT_MAX_LEVEL];
821     char *cmp_buf[ZSET_SORT_MAX_LEVEL];
822     char *tmp_cmp_buf[ZSET_SORT_MAX_LEVEL];
823     int num_criteria;
824     RSFD rfd;
825     TERMID termid;
826     TERMID *terms;
827     int numTerms = 0;
828     size_t sysno_mem_index = 0;
829
830     if (zh->m_staticrank)
831         sysno_mem_index = 1;
832
833     assert(nmem); /* compiler shut up about unused param */
834     sset->sort_info->num_entries = 0;
835
836     rset_getterms(rset, 0, 0, &n);
837     terms = (TERMID *) nmem_malloc(nmem, sizeof(*terms)*n);
838     rset_getterms(rset, terms, n, &numTerms);
839
840     sset->hits = 0;
841     num_criteria = sort_sequence->num_specs;
842     if (num_criteria > ZSET_SORT_MAX_LEVEL)
843         num_criteria = ZSET_SORT_MAX_LEVEL;
844     for (i = 0; i < num_criteria; i++)
845     {
846         Z_SortKeySpec *sks = sort_sequence->specs[i];
847         Z_SortKey *sk;
848         ZEBRA_RES res;
849
850         sort_criteria[i].ord = -1;
851         sort_criteria[i].numerical = 0;
852
853         if (sks->which == Z_SortKeySpec_missingValueData)
854         {
855             zebra_setError(zh, YAZ_BIB1_UNSUPP_MISSING_DATA_ACTION, 0);
856             return ZEBRA_FAIL;
857         }
858         if (*sks->sortRelation == Z_SortKeySpec_ascending)
859             sort_criteria[i].relation = 'A';
860         else if (*sks->sortRelation == Z_SortKeySpec_descending)
861             sort_criteria[i].relation = 'D';
862         else
863         {
864             zebra_setError(zh, YAZ_BIB1_ILLEGAL_SORT_RELATION, 0);
865             return ZEBRA_FAIL;
866         }
867         if (sks->sortElement->which == Z_SortElement_databaseSpecific)
868         {
869             zebra_setError(zh, YAZ_BIB1_DATABASE_SPECIFIC_SORT_UNSUPP, 0);
870             return ZEBRA_FAIL;
871         }
872         else if (sks->sortElement->which != Z_SortElement_generic)
873         {
874             zebra_setError(zh, YAZ_BIB1_SORT_ILLEGAL_SORT, 0);
875             return ZEBRA_FAIL;
876         }       
877         sk = sks->sortElement->u.generic;
878         switch (sk->which)
879         {
880         case Z_SortKey_sortField:
881             yaz_log(log_level_sort, "key %d is of type sortField",
882                     i+1);
883             sort_criteria[i].numerical = 0;
884             sort_criteria[i].ord = 
885                 zebraExplain_lookup_attr_str(zh->reg->zei,
886                                              zinfo_index_category_sort,
887                                              0, sk->u.sortField);
888             if (sks->which != Z_SortKeySpec_null
889                 && sort_criteria[i].ord == -1)
890             {
891                 zebra_setError(zh,
892                                YAZ_BIB1_CANNOT_SORT_ACCORDING_TO_SEQUENCE, 0);
893                 return ZEBRA_FAIL;
894             }
895             break;
896         case Z_SortKey_elementSpec:
897             yaz_log(log_level_sort, "key %d is of type elementSpec",
898                     i+1);
899             zebra_setError(zh, YAZ_BIB1_CANNOT_SORT_ACCORDING_TO_SEQUENCE, 0);
900             return ZEBRA_FAIL;
901         case Z_SortKey_sortAttributes:
902             yaz_log(log_level_sort, "key %d is of type sortAttributes", i+1);
903             res = zebra_sort_get_ord(zh, sk->u.sortAttributes,
904
905                                      &sort_criteria[i].ord,
906                                      &sort_criteria[i].numerical);
907             if (sks->which != Z_SortKeySpec_null && res != ZEBRA_OK)
908                 return ZEBRA_FAIL;
909             break;
910         }
911         if (zebraExplain_lookup_ord(zh->reg->zei, sort_criteria[i].ord,
912                                     &sort_criteria[i].index_type,
913                                     0, 0))
914         {
915             zebra_setError(zh, YAZ_BIB1_CANNOT_SORT_ACCORDING_TO_SEQUENCE, 0);
916             return ZEBRA_FAIL;
917         }
918     }
919     /* allocate space for each cmpare buf + one extra for tmp comparison */
920     for (i = 0; i<num_criteria; i++)
921     {
922         cmp_buf[i] = xmalloc(sset->sort_info->max_entries
923                              * SORT_IDX_ENTRYSIZE);
924         tmp_cmp_buf[i] = xmalloc(SORT_IDX_ENTRYSIZE);
925     }
926     rfd = rset_open(rset, RSETF_READ);
927     while (rset_read(rfd, &key, &termid))
928     {
929         zint this_sys = key.mem[sysno_mem_index];
930         if (log_level_searchhits)
931             key_logdump_txt(log_level_searchhits, &key, termid->name);
932         kno++;
933         if (this_sys != psysno)
934         {
935             if ((sset->hits & 255) == 0 && zh->break_handler_func)
936             {
937                 if (zh->break_handler_func(zh->break_handler_data))
938                 {
939                     rset_set_hits_limit(rset, 0);
940                     break;
941                 }
942             }
943             (sset->hits)++;
944             psysno = this_sys;
945             resultSetInsertSort(zh, sset,
946                                 sort_criteria, num_criteria, psysno, cmp_buf,
947                                 tmp_cmp_buf);
948         }
949     }
950     rset_close(rfd);
951
952     for (i = 0; i<num_criteria; i++)
953     {
954         xfree(cmp_buf[i]);
955         xfree(tmp_cmp_buf[i]);
956     }
957
958     yaz_log(log_level_sort, ZINT_FORMAT " keys, " ZINT_FORMAT " sysnos, sort",
959             kno, sset->hits);   
960     for (i = 0; i < numTerms; i++)
961         yaz_log(log_level_sort, "term=\"%s\" type=%s count=" ZINT_FORMAT,
962                  terms[i]->name, terms[i]->flags, terms[i]->rset->hits_count);
963     *sort_status = Z_SortResponse_success;
964     return ZEBRA_OK;
965 }
966
967 RSET resultSetRef(ZebraHandle zh, const char *resultSetId)
968 {
969     ZebraSet s;
970
971     if ((s = resultSetGet(zh, resultSetId)))
972         return s->rset;
973     return NULL;
974 }
975
976 ZEBRA_RES resultSetRank(ZebraHandle zh, ZebraSet zebraSet,
977                         RSET rset, NMEM nmem)
978 {
979     struct it_key key;
980     TERMID termid;
981     TERMID *terms;
982     zint kno = 0;
983     int numTerms = 0;
984     int n = 0;
985     int i;
986     ZebraRankClass rank_class;
987     struct zset_sort_info *sort_info;
988     const char *rank_handler_name = res_get_def(zh->res, "rank", "rank-1");
989     size_t sysno_mem_index = 0;
990
991     if (zh->m_staticrank)
992         sysno_mem_index = 1;
993
994     if (!log_level_set)
995         loglevels();
996     sort_info = zebraSet->sort_info;
997     sort_info->num_entries = 0;
998     zebraSet->hits = 0;
999     zebraSet->estimated_hit_count = 0;
1000     rset_getterms(rset, 0, 0, &n);
1001     terms = (TERMID *) nmem_malloc(nmem, sizeof(*terms)*n);
1002     rset_getterms(rset, terms, n, &numTerms);
1003
1004     rank_class = zebraRankLookup(zh, rank_handler_name);
1005     if (!rank_class)
1006     {
1007         yaz_log(YLOG_WARN, "No such rank handler: %s", rank_handler_name);
1008         zebra_setError(zh, YAZ_BIB1_UNSUPP_SEARCH, "Cannot find rank handler");
1009         return ZEBRA_FAIL;
1010     }
1011     else
1012     {
1013         RSFD rfd = rset_open(rset, RSETF_READ);
1014         struct rank_control *rc = rank_class->control;
1015         int score;
1016         zint count = 0;
1017         void *handle = (*rc->begin) (zh->reg, rank_class->class_handle, rset,
1018                                      nmem, terms, numTerms);
1019         zint psysno = 0;  /* previous doc id / sys no */
1020         zint pstaticrank = 0; /* previous static rank */
1021         int stop_flag = 0;
1022         while (rset_read(rfd, &key, &termid))
1023         {
1024             zint this_sys = key.mem[sysno_mem_index];
1025
1026             zint seqno = key.mem[key.len-1];
1027             kno++;
1028             if (log_level_searchhits)
1029                 key_logdump_txt(log_level_searchhits, &key, termid->name);
1030             if (this_sys != psysno) 
1031             {   /* new record .. */
1032                 if (!(rfd->counted_items & 255) && zh->break_handler_func)
1033                 {
1034                     if (zh->break_handler_func(zh->break_handler_data))
1035                     {
1036                         yaz_log(YLOG_LOG, "Aborted search");
1037                         stop_flag = 1;
1038                     }
1039                 }
1040                 if (rfd->counted_items > rset->hits_limit)
1041                     stop_flag = 1;
1042                 if (psysno)
1043                 {   /* only if we did have a previous record */
1044                     score = (*rc->calc)(handle, psysno, pstaticrank,
1045                                          &stop_flag);
1046                     /* insert the hit. A=Ascending */
1047                     resultSetInsertRank(zh, sort_info, psysno, score, 'A');
1048                     count++;
1049                 }
1050                 if (stop_flag)
1051                 {
1052                     zebraSet->estimated_hit_count = 1;
1053                     rset_set_hits_limit(rset, 0);
1054                     break;
1055                 }
1056                 psysno = this_sys;
1057                 if (zh->m_staticrank)
1058                     pstaticrank = key.mem[0];
1059             }
1060             (*rc->add)(handle, CAST_ZINT_TO_INT(seqno), termid);
1061         }
1062         /* no more items */
1063         if (psysno)
1064         {   /* we had - at least - one record */
1065             score = (*rc->calc)(handle, psysno, pstaticrank, &stop_flag);
1066             /* insert the hit. A=Ascending */
1067             resultSetInsertRank(zh, sort_info, psysno, score, 'A');
1068             count++;
1069         }
1070         (*rc->end)(zh->reg, handle);
1071         rset_close(rfd);
1072     }
1073     zebraSet->hits = rset->hits_count;
1074
1075     yaz_log(log_level_searchterms, ZINT_FORMAT " keys, "
1076             ZINT_FORMAT " sysnos, rank",  kno, zebraSet->hits);
1077     for (i = 0; i < numTerms; i++)
1078     {
1079         yaz_log(log_level_searchterms, "term=\"%s\" type=%s count="
1080                 ZINT_FORMAT,
1081                 terms[i]->name, terms[i]->flags, terms[i]->rset->hits_count);
1082     }
1083     return ZEBRA_OK;
1084 }
1085
1086 ZebraRankClass zebraRankLookup(ZebraHandle zh, const char *name)
1087 {
1088     ZebraRankClass p = zh->reg->rank_classes;
1089     while (p && strcmp(p->control->name, name))
1090         p = p->next;
1091     if (p && !p->init_flag)
1092     {
1093         if (p->control->create)
1094             p->class_handle = (*p->control->create)(zh);
1095         p->init_flag = 1;
1096     }
1097     return p;
1098 }
1099
1100 void zebraRankInstall(struct zebra_register *reg, struct rank_control *ctrl)
1101 {
1102     ZebraRankClass p = (ZebraRankClass) xmalloc(sizeof(*p));
1103     p->control = (struct rank_control *) xmalloc(sizeof(*p->control));
1104     memcpy(p->control, ctrl, sizeof(*p->control));
1105     p->control->name = xstrdup(ctrl->name);
1106     p->init_flag = 0;
1107     p->next = reg->rank_classes;
1108     reg->rank_classes = p;
1109 }
1110
1111 void zebraRankDestroy(struct zebra_register *reg)
1112 {
1113     ZebraRankClass p = reg->rank_classes;
1114     while (p)
1115     {
1116         ZebraRankClass p_next = p->next;
1117         if (p->init_flag && p->control->destroy)
1118             (*p->control->destroy)(reg, p->class_handle);
1119         xfree(p->control->name);
1120         xfree(p->control);
1121         xfree(p);
1122         p = p_next;
1123     }
1124     reg->rank_classes = NULL;
1125 }
1126
1127 static int trav_rset_for_termids(RSET rset, TERMID *termid_array,
1128                                  zint *hits_array, int *approx_array)
1129 {
1130     int no = 0;
1131     int i;
1132     for (i = 0; i<rset->no_children; i++)
1133         no += trav_rset_for_termids(rset->children[i],
1134                                     (termid_array ? termid_array + no : 0),
1135                                     (hits_array ? hits_array + no : 0),
1136                                     (approx_array ? approx_array + no : 0));
1137     if (rset->term)
1138     {
1139         if (termid_array)
1140             termid_array[no] = rset->term;
1141         if (hits_array)
1142             hits_array[no] = rset->hits_count;
1143         if (approx_array)
1144             approx_array[no] = rset->hits_approx;
1145 #if 0
1146         yaz_log(YLOG_LOG, "rset=%p term=%s limit=" ZINT_FORMAT
1147                 " count=" ZINT_FORMAT,
1148                 rset, rset->term->name, rset->hits_limit, rset->hits_count);
1149 #endif
1150         no++;
1151     }
1152     return no;
1153 }
1154
1155 ZEBRA_RES zebra_result_set_term_no(ZebraHandle zh, const char *setname,
1156                                    int *num_terms)
1157 {
1158     ZebraSet sset = resultSetGet(zh, setname);
1159     *num_terms = 0;
1160     if (sset)
1161     {
1162         *num_terms = trav_rset_for_termids(sset->rset, 0, 0, 0);
1163         return ZEBRA_OK;
1164     }
1165     return ZEBRA_FAIL;
1166 }
1167
1168 ZEBRA_RES zebra_result_set_term_info(ZebraHandle zh, const char *setname,
1169                                      int no, zint *count, int *approx,
1170                                      char *termbuf, size_t *termlen,
1171                                      const char **term_ref_id)
1172 {
1173     ZebraSet sset = resultSetGet(zh, setname);
1174     if (sset)
1175     {
1176         int num_terms = trav_rset_for_termids(sset->rset, 0, 0, 0);
1177         if (no >= 0 && no < num_terms)
1178         {
1179             TERMID *term_array = xmalloc(num_terms * sizeof(*term_array));
1180             zint *hits_array = xmalloc(num_terms * sizeof(*hits_array));
1181             int *approx_array = xmalloc(num_terms * sizeof(*approx_array));
1182             
1183             trav_rset_for_termids(sset->rset, term_array,
1184                                   hits_array, approx_array);
1185
1186             if (count)
1187                 *count = hits_array[no];
1188             if (approx)
1189                 *approx = approx_array[no];
1190             if (termbuf)
1191             {
1192                 char *inbuf = term_array[no]->name;
1193                 size_t inleft = strlen(inbuf);
1194                 size_t outleft = *termlen - 1;
1195
1196                 if (zh->iconv_from_utf8 != 0)
1197                 {
1198                     char *outbuf = termbuf;
1199                     size_t ret;
1200                     
1201                     ret = yaz_iconv(zh->iconv_from_utf8, &inbuf, &inleft,
1202                                     &outbuf, &outleft);
1203                     if (ret == (size_t)(-1))
1204                         *termlen = 0;
1205                     else
1206                     {
1207                         yaz_iconv(zh->iconv_from_utf8, 0, 0, 
1208                                   &outbuf, &outleft);
1209                         *termlen = outbuf - termbuf;
1210                     }
1211                 }
1212                 else
1213                 {
1214                     if (inleft > outleft)
1215                         inleft = outleft;
1216                     *termlen = inleft;
1217                     memcpy(termbuf, inbuf, *termlen);
1218                 }
1219                 termbuf[*termlen] = '\0';
1220             }
1221             if (term_ref_id)
1222                 *term_ref_id = term_array[no]->ref_id;
1223
1224             xfree(term_array);
1225             xfree(hits_array);
1226             xfree(approx_array);
1227             return ZEBRA_OK;
1228         }
1229     }
1230     return ZEBRA_FAIL;
1231 }
1232
1233 ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname,
1234                                     zint sysno, zebra_snippets *snippets)
1235 {
1236     ZebraSet sset = resultSetGet(zh, setname);
1237     yaz_log(YLOG_DEBUG, "zebra_get_hit_vector setname=%s zysno=" ZINT_FORMAT,
1238             setname, sysno);
1239     if (!sset)
1240         return ZEBRA_FAIL;
1241     else
1242     {
1243         struct rset_key_control *kc = zebra_key_control_create(zh);
1244         NMEM nmem = nmem_create();
1245         struct it_key key;
1246         RSET rsets[2], rset_comb;
1247         RSET rset_temp = rset_create_temp(nmem, kc, kc->scope, 
1248                                           res_get(zh->res, "setTmpDir"),0 );
1249         
1250         TERMID termid;
1251         RSFD rsfd = rset_open(rset_temp, RSETF_WRITE);
1252         
1253         key.mem[0] = sysno;
1254         key.mem[1] = 0;
1255         key.mem[2] = 0;
1256         key.mem[3] = 0;
1257         key.len = 2;
1258         rset_write(rsfd, &key);
1259         rset_close(rsfd);
1260
1261         rsets[0] = rset_temp;
1262         rsets[1] = rset_dup(sset->rset);
1263         
1264         rset_comb = rset_create_and(nmem, kc, kc->scope, 2, rsets);
1265
1266         rsfd = rset_open(rset_comb, RSETF_READ);
1267
1268         while (rset_read(rsfd, &key, &termid))
1269         {
1270             if (termid)
1271             {
1272                 struct ord_list *ol;
1273                 for (ol = termid->ol; ol; ol = ol->next)
1274                 {
1275                     zebra_snippets_append(snippets, key.mem[key.len-1], 0,
1276                                           ol->ord, termid->name);
1277                 }
1278             }
1279         }
1280         rset_close(rsfd);
1281         
1282         rset_delete(rset_comb);
1283         nmem_destroy(nmem);
1284         kc->dec(kc);
1285     }
1286     return ZEBRA_OK;
1287 }
1288
1289 /*
1290  * Local variables:
1291  * c-basic-offset: 4
1292  * indent-tabs-mode: nil
1293  * End:
1294  * vim: shiftwidth=4 tabstop=8 expandtab
1295  */
1296