Generic snippet support. Unlike previous versions of snippet
[idzebra-moved-to-github.git] / index / zsets.c
1 /* $Id: zsets.c,v 1.122 2007-08-21 11:06:47 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23
24 #include <stdio.h>
25 #include <assert.h>
26 #ifdef WIN32
27 #include <io.h>
28 #else
29 #include <unistd.h>
30 #endif
31
32 #include "index.h"
33 #include "rank.h"
34 #include <yaz/diagbib1.h>
35 #include <rset.h>
36
37 #define ZSET_SORT_MAX_LEVEL 10
38
/** One slot of a term-list result set, filled in by resultSetAddTerm(). */
struct zebra_set_term_entry
{
    int reg_type;      /* register type given to resultSetAddTerm() */
    char *db;          /* database name (copy owned by the set's NMEM) */
    char *index_name;  /* index name (copy owned by the set's NMEM) */
    char *term;        /* term text (copy owned by the set's NMEM) */
};
45
46 struct zebra_set {
47     char *name;
48     RSET rset;
49     NMEM nmem;
50     NMEM rset_nmem; /* for creating the rsets in */
51     zint hits;
52     int num_bases;
53     char **basenames;
54     Z_RPNQuery *rpn;
55     Z_SortKeySpecList *sortSpec;
56     struct zset_sort_info *sort_info;
57     struct zebra_set_term_entry *term_entries;
58     int term_entries_max;
59     struct zebra_set *next;
60     int locked;
61     int estimated_hit_count;
62
63     zint cache_position;  /* last position */
64     RSFD cache_rfd;       /* rfd (NULL if not existing) */
65     zint cache_psysno;    /* sysno for last position */
66     zint approx_limit;    /* limit before we do approx */
67 };
68
69 struct zset_sort_entry {
70     zint sysno;
71     int score;
72 };
73
/** Bounded, ordered list of the best records of a result set. */
struct zset_sort_info
{
    int max_entries;                      /* capacity of both arrays below */
    int num_entries;                      /* entries currently in use */
    struct zset_sort_entry *all_entries;  /* backing storage for the entries */
    struct zset_sort_entry **entries;     /* pointers into all_entries, in order */
};
80
/* Module log levels; all zero until loglevels() has resolved them. */
static int log_level_set = 0;          /* non-zero once the levels are resolved */
static int log_level_sort = 0;         /* "sorting" */
static int log_level_searchhits = 0;   /* "searchhits" */
static int log_level_searchterms = 0;  /* "searchterms" */
static int log_level_resultsets = 0;   /* "resultsets" */
86
87 static void loglevels(void)
88 {
89     if (log_level_set)
90         return;
91     log_level_sort = yaz_log_module_level("sorting");
92     log_level_searchhits = yaz_log_module_level("searchhits");
93     log_level_searchterms = yaz_log_module_level("searchterms");
94     log_level_resultsets = yaz_log_module_level("resultsets");
95     log_level_set = 1;
96 }
97
98
99 static ZEBRA_RES resultSetSearch(ZebraHandle zh, NMEM nmem, NMEM rset_nmem,
100                                  Z_RPNQuery *rpn, ZebraSet sset)
101 {
102     RSET rset = 0;
103     Z_SortKeySpecList *sort_sequence;
104     int sort_status, i;
105     ZEBRA_RES res = ZEBRA_OK;
106
107     sort_sequence = (Z_SortKeySpecList *)
108         nmem_malloc(nmem, sizeof(*sort_sequence));
109     sort_sequence->num_specs = 10; /* FIXME - Hard-coded number */
110     sort_sequence->specs = (Z_SortKeySpec **)
111         nmem_malloc(nmem, sort_sequence->num_specs *
112                      sizeof(*sort_sequence->specs));
113     for (i = 0; i<sort_sequence->num_specs; i++)
114         sort_sequence->specs[i] = 0;
115     
116     rpn_get_top_approx_limit(zh, rpn->RPNStructure, &sset->approx_limit);
117
118     res = rpn_search_top(zh, rpn->RPNStructure, rpn->attributeSetId,
119                          nmem, rset_nmem,
120                          sort_sequence,
121                          sset->num_bases, sset->basenames,
122                          &rset);
123     if (res != ZEBRA_OK)
124     {
125         sset->rset = 0;
126         return res;
127     }
128     for (i = 0; sort_sequence->specs[i]; i++)
129         ;
130     sort_sequence->num_specs = i;
131     rset->hits_limit = sset->approx_limit;
132     if (!i)
133     {
134         res = resultSetRank (zh, sset, rset, rset_nmem);
135     }
136     else
137     {
138         res = resultSetSortSingle (zh, nmem, sset, rset,
139                                    sort_sequence, &sort_status);
140     }
141     sset->rset = rset;
142     return res;
143 }
144
145
146 ZEBRA_RES resultSetAddRPN(ZebraHandle zh, NMEM m, Z_RPNQuery *rpn,
147                           int num_bases, char **basenames,
148                           const char *setname,
149                           zint *hits, int *estimated_hit_count)
150 {
151     ZebraSet zebraSet;
152     int i;
153     ZEBRA_RES res;
154
155     *hits = 0;
156     *estimated_hit_count = 0;
157
158     zebraSet = resultSetAdd(zh, setname, 1);
159     if (!zebraSet)
160         return ZEBRA_FAIL;
161     zebraSet->locked = 1;
162     zebraSet->rpn = 0;
163     zebraSet->nmem = m;
164     zebraSet->rset_nmem = nmem_create(); 
165
166     zebraSet->num_bases = num_bases;
167     zebraSet->basenames = 
168         nmem_malloc (zebraSet->nmem, num_bases * sizeof(*zebraSet->basenames));
169     for (i = 0; i<num_bases; i++)
170         zebraSet->basenames[i] = nmem_strdup(zebraSet->nmem, basenames[i]);
171
172     res = resultSetSearch(zh, zebraSet->nmem, zebraSet->rset_nmem,
173                           rpn, zebraSet);
174     *hits = zebraSet->hits;
175     if (zebraSet->estimated_hit_count)
176         *estimated_hit_count = 1;
177
178     if (zebraSet->rset)
179         zebraSet->rpn = rpn;
180     zebraSet->locked = 0;
181     if (!zebraSet->rset)
182         return ZEBRA_FAIL;
183     return res;
184 }
185
186 void resultSetAddTerm(ZebraHandle zh, ZebraSet s, int reg_type,
187                       const char *db, const char *index_name, 
188                       const char *term)
189 {
190     assert(zh); /* compiler shut up */
191     if (!s->nmem)
192         s->nmem = nmem_create ();
193     if (!s->term_entries)
194     {
195         int i;
196         s->term_entries_max = 1000;
197         s->term_entries =
198             nmem_malloc (s->nmem, s->term_entries_max * 
199                          sizeof(*s->term_entries));
200         for (i = 0; i < s->term_entries_max; i++)
201             s->term_entries[i].term = 0;
202     }
203     if (s->hits < s->term_entries_max)
204     {
205         s->term_entries[s->hits].reg_type = reg_type;
206         s->term_entries[s->hits].db = nmem_strdup (s->nmem, db);
207         s->term_entries[s->hits].index_name = nmem_strdup(s->nmem, index_name);
208         s->term_entries[s->hits].term = nmem_strdup(s->nmem, term);
209     }
210     (s->hits)++;
211 }
212
213 ZebraSet resultSetAdd(ZebraHandle zh, const char *name, int ov)
214 {
215     ZebraSet s;
216     int i;
217
218     for (s = zh->sets; s; s = s->next)
219         if (!strcmp (s->name, name))
220             break;
221     
222     if (!log_level_set)
223         loglevels();
224     if (s)
225     {
226         yaz_log(log_level_resultsets, "updating result set %s", name);
227         if (!ov || s->locked)
228             return NULL;
229         if (s->rset)
230         {
231             if (s->cache_rfd)
232                 rset_close(s->cache_rfd);
233             rset_delete (s->rset);
234         }
235         if (s->rset_nmem)
236             nmem_destroy (s->rset_nmem);
237         if (s->nmem)
238             nmem_destroy (s->nmem);
239     }
240     else
241     {
242         const char *sort_max_str = zebra_get_resource(zh, "sortmax", "1000");
243
244         yaz_log(log_level_resultsets, "adding result set %s", name);
245         s = (ZebraSet) xmalloc (sizeof(*s));
246         s->next = zh->sets;
247         zh->sets = s;
248         s->name = (char *) xmalloc (strlen(name)+1);
249         strcpy (s->name, name);
250
251         s->sort_info = (struct zset_sort_info *)
252             xmalloc (sizeof(*s->sort_info));
253         s->sort_info->max_entries = atoi(sort_max_str);
254         if (s->sort_info->max_entries < 2)
255             s->sort_info->max_entries = 2;
256
257         s->sort_info->entries = (struct zset_sort_entry **)
258             xmalloc (sizeof(*s->sort_info->entries) *
259                      s->sort_info->max_entries);
260         s->sort_info->all_entries = (struct zset_sort_entry *)
261             xmalloc (sizeof(*s->sort_info->all_entries) *
262                      s->sort_info->max_entries);
263         for (i = 0; i < s->sort_info->max_entries; i++)
264             s->sort_info->entries[i] = s->sort_info->all_entries + i;
265     }
266     s->locked = 0;
267     s->term_entries = 0;
268     s->hits = 0;
269     s->rset = 0;
270     s->rset_nmem = 0;
271     s->nmem = 0;
272     s->rpn = 0;
273     s->sortSpec = 0;
274     s->cache_position = 0;
275     s->cache_rfd = 0;
276     s->approx_limit = zh->approx_limit;
277     s->estimated_hit_count = 0;
278     return s;
279 }
280
281 ZebraSet resultSetGet(ZebraHandle zh, const char *name)
282 {
283     ZebraSet s;
284
285     for (s = zh->sets; s; s = s->next)
286         if (!strcmp (s->name, name))
287         {
288             if (!s->term_entries && !s->rset && s->rpn)
289             {
290                 NMEM nmem = nmem_create ();
291                 yaz_log(log_level_resultsets, "research %s", name);
292                 if (!s->rset_nmem)
293                     s->rset_nmem=nmem_create();
294                 resultSetSearch(zh, nmem, s->rset_nmem, s->rpn, s);
295                 if (s->rset && s->sortSpec)
296                 {
297                     int sort_status;
298                     yaz_log(log_level_resultsets, "resort %s", name);
299                     resultSetSortSingle (zh, nmem, s, s->rset, s->sortSpec,
300                                          &sort_status);
301                 }
302                 nmem_destroy (nmem);
303             }
304             return s;
305         }
306     return NULL;
307 }
308
309 void resultSetInvalidate (ZebraHandle zh)
310 {
311     ZebraSet s = zh->sets;
312     
313     yaz_log(log_level_resultsets, "invalidating result sets");
314     for (; s; s = s->next)
315     {
316         if (s->rset)
317         {
318             if (s->cache_rfd)
319                 rset_close(s->cache_rfd);
320             rset_delete (s->rset);
321         }
322         s->rset = 0;
323         s->cache_rfd = 0;
324         s->cache_position = 0;
325         if (s->rset_nmem)
326             nmem_destroy(s->rset_nmem);
327         s->rset_nmem=0;
328     }
329 }
330
331 void resultSetDestroy(ZebraHandle zh, int num, char **names,int *statuses)
332 {
333     ZebraSet * ss = &zh->sets;
334     int i;
335     
336     if (statuses)
337         for (i = 0; i<num; i++)
338             statuses[i] = Z_DeleteStatus_resultSetDidNotExist;
339     while (*ss)
340     {
341         int i = -1;
342         ZebraSet s = *ss;
343         if (num >= 0)
344         {
345             for (i = 0; i<num; i++)
346                 if (!strcmp (s->name, names[i]))
347                 {
348                     if (statuses)
349                         statuses[i] = Z_DeleteStatus_success;
350                     i = -1;
351                     break;
352                 }
353         }
354         if (i < 0)
355         {
356             *ss = s->next;
357             
358             xfree (s->sort_info->all_entries);
359             xfree (s->sort_info->entries);
360             xfree (s->sort_info);
361             
362             if (s->nmem)
363                 nmem_destroy (s->nmem);
364             if (s->rset)
365             {
366                 if (s->cache_rfd)
367                     rset_close(s->cache_rfd);
368                 rset_delete (s->rset);
369             }
370             if (s->rset_nmem)
371                 nmem_destroy(s->rset_nmem);
372             xfree (s->name);
373             xfree (s);
374         }
375         else
376             ss = &s->next;
377     }
378 }
379
380 ZebraMetaRecord *zebra_meta_records_create_range(ZebraHandle zh,
381                                                  const char *name, 
382                                                  zint start, int num)
383 {
384     zint pos_small[10];
385     zint *pos = pos_small;
386     ZebraMetaRecord *mr;
387     int i;
388
389     if (num > 10000 || num <= 0)
390         return 0;
391
392     if (num > 10)
393         pos = xmalloc(sizeof(*pos) * num);
394     
395     for (i = 0; i<num; i++)
396         pos[i] = start+i;
397
398     mr = zebra_meta_records_create(zh, name, num, pos);
399     
400     if (num > 10)
401         xfree(pos);
402     return mr;
403 }
404
405 ZebraMetaRecord *zebra_meta_records_create(ZebraHandle zh, const char *name, 
406                                            int num, zint *positions)
407 {
408     ZebraSet sset;
409     ZebraMetaRecord *sr = 0;
410     RSET rset;
411     int i;
412     struct zset_sort_info *sort_info;
413     size_t sysno_mem_index = 0;
414
415     if (zh->m_staticrank)
416         sysno_mem_index = 1;
417
418     if (!log_level_set)
419         loglevels();
420     if (!(sset = resultSetGet (zh, name)))
421         return NULL;
422     if (!(rset = sset->rset))
423     {
424         if (!sset->term_entries)
425             return 0;
426         sr = (ZebraMetaRecord *) xmalloc (sizeof(*sr) * num);
427         for (i = 0; i<num; i++)
428         {
429             sr[i].sysno = 0;
430             sr[i].score = -1;
431             sr[i].term = 0;
432             sr[i].db = 0;
433
434             if (positions[i] <= sset->term_entries_max)
435             {
436                 sr[i].term = sset->term_entries[positions[i]-1].term;
437                 sr[i].db = sset->term_entries[positions[i]-1].db;
438             }
439         }
440     }
441     else
442     {
443         sr = (ZebraMetaRecord *) xmalloc (sizeof(*sr) * num);
444         for (i = 0; i<num; i++)
445         {
446             sr[i].sysno = 0;
447             sr[i].score = -1;
448             sr[i].term = 0;
449             sr[i].db = 0;
450         }
451         sort_info = sset->sort_info;
452         if (sort_info)
453         {
454             zint position;
455             
456             for (i = 0; i<num; i++)
457             {
458                 position = positions[i];
459                 if (position > 0 && position <= sort_info->num_entries)
460                 {
461                     yaz_log(log_level_sort, "got pos=" ZINT_FORMAT
462                             " (sorted)", position);
463                     sr[i].sysno = sort_info->entries[position-1]->sysno;
464                     sr[i].score = sort_info->entries[position-1]->score;
465                 }
466             }
467         }
468         /* did we really get all entries using sort ? */
469         for (i = 0; i<num; i++)
470         {
471             if (!sr[i].sysno)
472                 break;
473         }
474         if (i < num) /* nope, get the rest, unsorted - sorry */
475         {
476             zint position = 0;
477             int num_i = 0;
478             zint psysno = 0;
479             RSFD rfd;
480             struct it_key key;
481             
482             if (sort_info)
483                 position = sort_info->num_entries;
484             while (num_i < num && positions[num_i] <= position)
485                 num_i++;
486             
487             if (sset->cache_rfd &&
488                 num_i < num && positions[num_i] > sset->cache_position)
489             {
490                 position = sset->cache_position;
491                 rfd = sset->cache_rfd;
492                 psysno = sset->cache_psysno;
493             }
494             else
495             {
496                 if (sset->cache_rfd)
497                     rset_close(sset->cache_rfd);
498                 rfd = rset_open (rset, RSETF_READ);
499             }
500             while (num_i < num && rset_read (rfd, &key, 0))
501             {
502                 zint this_sys = key.mem[sysno_mem_index];
503                 if (this_sys != psysno)
504                 {
505                     psysno = this_sys;
506                     if (sort_info)
507                     {
508                         /* determine we alreay have this in our set */
509                         for (i = sort_info->num_entries; --i >= 0; )
510                             if (psysno == sort_info->entries[i]->sysno)
511                                 break;
512                         if (i >= 0)
513                             continue;
514                     }
515                     position++;
516                     assert (num_i < num);
517                     if (position == positions[num_i])
518                     {
519                         sr[num_i].sysno = psysno;
520                         yaz_log(log_level_sort, "got pos=" ZINT_FORMAT " (unsorted)", position);
521                         sr[num_i].score = -1;
522                         num_i++;
523                     }
524                 }
525             }
526             sset->cache_position = position;
527             sset->cache_psysno = psysno;
528             sset->cache_rfd = rfd;
529         }
530     }
531     return sr;
532 }
533
534 void zebra_meta_records_destroy (ZebraHandle zh, ZebraMetaRecord *records,
535                                  int num)
536 {
537     assert(zh); /* compiler shut up about unused arg */
538     xfree (records);
539 }
540
/** One resolved sort criterion, as used by resultSetSortSingle(). */
struct sortKeyInfo
{
    int relation;    /* 'A' for ascending, 'D' for descending */
    int ord;         /* ordinal of the sort index, -1 if none */
    int numerical;   /* non-zero: compare keys as numbers */
    int index_type;  /* index type passed to zebra_term_untrans() */
};
547
548 void resultSetInsertSort(ZebraHandle zh, ZebraSet sset,
549                          struct sortKeyInfo *criteria, int num_criteria,
550                          zint sysno,
551                          char *cmp_buf[], char *tmp_cmp_buf[])
552 {
553     struct zset_sort_entry *new_entry = NULL;
554     struct zset_sort_info *sort_info = sset->sort_info;
555     int i, j;
556
557     zebra_sort_sysno(zh->reg->sort_index, sysno);
558     for (i = 0; i<num_criteria; i++)
559     {
560         char *this_entry_buf = tmp_cmp_buf[i];
561         memset(this_entry_buf, '\0', SORT_IDX_ENTRYSIZE);
562         if (criteria[i].ord != -1)
563         {
564             zebra_sort_type(zh->reg->sort_index, criteria[i].ord);
565             zebra_sort_read(zh->reg->sort_index, this_entry_buf);
566         }
567     }
568     i = sort_info->num_entries;
569     while (--i >= 0)
570     {
571         int rel = 0;
572         for (j = 0; j<num_criteria; j++)
573         {
574             char *this_entry_buf = tmp_cmp_buf[j];
575             char *other_entry_buf = 
576                 cmp_buf[j] + i * SORT_IDX_ENTRYSIZE;
577             if (criteria[j].numerical)
578             {
579                 char this_entry_org[1024];
580                 char other_entry_org[1024];
581                 double diff;
582                 int index_type = criteria[j].index_type;
583                 zebra_term_untrans(zh, index_type, this_entry_org,
584                                    this_entry_buf);
585                 zebra_term_untrans(zh, index_type, other_entry_org,
586                                    other_entry_buf);
587                 diff = atof(this_entry_org) - atof(other_entry_org);
588                 
589                 if (diff > 0.0)
590                     rel = 1;
591                 else if (diff < 0.0)
592                     rel = -1;
593                 else
594                     rel = 0;
595             }
596             else
597             {
598                 rel = memcmp(this_entry_buf, other_entry_buf,
599                              SORT_IDX_ENTRYSIZE);
600             }
601             if (rel)
602                 break;
603         }       
604         if (!rel)
605             break;
606         if (criteria[j].relation == 'A')
607         {
608             if (rel > 0)
609                 break;
610         }
611         else if (criteria[j].relation == 'D')
612         {
613             if (rel < 0)
614                 break;
615         }
616     }
617     ++i;
618     j = sort_info->max_entries;
619     if (i == j)
620         return;
621
622     if (sort_info->num_entries == j)
623         --j;
624     else
625         j = (sort_info->num_entries)++;
626     new_entry = sort_info->entries[j];
627     while (j != i)
628     {
629         int k;
630         for (k = 0; k<num_criteria; k++)
631         {
632             char *j_buf = cmp_buf[k] + j * SORT_IDX_ENTRYSIZE;
633             char *j_1_buf = cmp_buf[k] + (j-1) * SORT_IDX_ENTRYSIZE;
634             memcpy(j_buf, j_1_buf, SORT_IDX_ENTRYSIZE);
635         }
636         sort_info->entries[j] = sort_info->entries[j-1];
637         --j;
638     }
639     sort_info->entries[i] = new_entry;
640     assert (new_entry);
641     for (i = 0; i<num_criteria; i++)
642     {
643         char *new_entry_buf = cmp_buf[i] + j * SORT_IDX_ENTRYSIZE;
644         char *this_entry_buf = tmp_cmp_buf[i];
645         memcpy(new_entry_buf, this_entry_buf, SORT_IDX_ENTRYSIZE);
646     }
647     new_entry->sysno = sysno;
648     new_entry->score = -1;
649 }
650
651 void resultSetInsertRank(ZebraHandle zh, struct zset_sort_info *sort_info,
652                          zint sysno, int score, int relation)
653 {
654     struct zset_sort_entry *new_entry = NULL;
655     int i, j;
656     assert(zh); /* compiler shut up about unused arg */
657
658     i = sort_info->num_entries;
659     while (--i >= 0)
660     {
661         int rel = 0;
662
663         rel = score - sort_info->entries[i]->score;
664
665         if (relation == 'D')
666         {
667             if (rel >= 0)
668                 break;
669         }
670         else if (relation == 'A')
671         {
672             if (rel <= 0)
673                 break;
674         }
675     }
676     ++i;
677     j = sort_info->max_entries;
678     if (i == j)
679         return;
680
681     if (sort_info->num_entries == j)
682         --j;
683     else
684         j = (sort_info->num_entries)++;
685     
686     new_entry = sort_info->entries[j];
687     while (j != i)
688     {
689         sort_info->entries[j] = sort_info->entries[j-1];
690         --j;
691     }
692     sort_info->entries[i] = new_entry;
693     assert (new_entry);
694     new_entry->sysno = sysno;
695     new_entry->score = score;
696 }
697
698 static Z_RPNQuery *copy_RPNQuery(Z_RPNQuery *src, NMEM nmem)
699 {
700     Z_RPNQuery *dst = 0;
701     ODR encode = odr_createmem(ODR_ENCODE);
702     ODR decode = odr_createmem(ODR_DECODE);
703
704     if (z_RPNQuery(encode, &src, 0, 0))
705     {
706         int len;
707         char *buf = odr_getbuf(encode, &len, 0);
708
709         if (buf)
710         {
711             odr_setbuf(decode, buf, len, 0);
712             z_RPNQuery(decode, &dst, 0, 0);
713         }
714     }
715     nmem_transfer(nmem, decode->mem);
716     odr_destroy(encode);
717     odr_destroy(decode);
718     return dst;
719 }
720
721 static Z_SortKeySpecList *copy_SortKeySpecList(Z_SortKeySpecList *src, NMEM nmem)
722 {
723     Z_SortKeySpecList *dst = 0;
724     ODR encode = odr_createmem(ODR_ENCODE);
725     ODR decode = odr_createmem(ODR_DECODE);
726
727     if (z_SortKeySpecList(encode, &src, 0, 0))
728     {
729         int len;
730         char *buf = odr_getbuf(encode, &len, 0);
731
732         if (buf)
733         {
734             odr_setbuf(decode, buf, len, 0);
735             z_SortKeySpecList(decode, &dst, 0, 0);
736         }
737     }
738     nmem_transfer(nmem, decode->mem);
739     odr_destroy(encode);
740     odr_destroy(decode);
741     return dst;
742 }
743
744 ZebraSet resultSetClone(ZebraHandle zh, const char *setname,
745                         ZebraSet rset)
746 {
747     ZebraSet nset;
748     int i;
749
750     nset = resultSetAdd(zh, setname, 1);
751     if (!nset)
752         return 0;
753
754     nset->nmem = nmem_create();
755
756     nset->num_bases = rset->num_bases;
757     nset->basenames = 
758         nmem_malloc (nset->nmem, nset->num_bases * sizeof(*rset->basenames));
759     for (i = 0; i<rset->num_bases; i++)
760         nset->basenames[i] = nmem_strdup(nset->nmem, rset->basenames[i]);
761
762     if (rset->rset)
763         nset->rset = rset_dup(rset->rset);
764     if (rset->rpn)
765         nset->rpn = copy_RPNQuery(rset->rpn, nset->nmem);
766     return nset;
767 }
768
769 ZEBRA_RES resultSetSort(ZebraHandle zh, NMEM nmem,
770                         int num_input_setnames, const char **input_setnames,
771                         const char *output_setname,
772                         Z_SortKeySpecList *sort_sequence, int *sort_status)
773 {
774     ZebraSet sset;
775     RSET rset;
776
777     if (num_input_setnames == 0)
778     {
779         zebra_setError(zh, YAZ_BIB1_NO_RESULT_SET_NAME_SUPPLIED_ON_SORT, 0);
780         return ZEBRA_FAIL;
781     }
782     if (num_input_setnames > 1)
783     {
784         zebra_setError(zh, YAZ_BIB1_SORT_TOO_MANY_INPUT_RESULTS, 0);
785         return ZEBRA_FAIL;
786     }
787     if (!log_level_set)
788         loglevels();
789     yaz_log(log_level_sort, "result set sort input=%s output=%s",
790           *input_setnames, output_setname);
791     sset = resultSetGet (zh, input_setnames[0]);
792     if (!sset)
793     {
794         zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST,
795                        input_setnames[0]);
796         return ZEBRA_FAIL;
797     }
798     if (!(rset = sset->rset))
799     {
800         zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST,
801                        input_setnames[0]);
802         return ZEBRA_FAIL;
803     }
804     if (strcmp (output_setname, input_setnames[0]))
805         sset = resultSetClone(zh, output_setname, sset);
806     sset->sortSpec = copy_SortKeySpecList(sort_sequence, sset->nmem);
807     return resultSetSortSingle (zh, nmem, sset, rset, sort_sequence,
808                                 sort_status);
809 }
810
811 ZEBRA_RES resultSetSortSingle(ZebraHandle zh, NMEM nmem,
812                               ZebraSet sset, RSET rset,
813                               Z_SortKeySpecList *sort_sequence,
814                               int *sort_status)
815 {
816     int i;
817     int n = 0;
818     zint kno = 0;
819     zint psysno = 0;
820     struct it_key key;
821     struct sortKeyInfo sort_criteria[ZSET_SORT_MAX_LEVEL];
822     char *cmp_buf[ZSET_SORT_MAX_LEVEL];
823     char *tmp_cmp_buf[ZSET_SORT_MAX_LEVEL];
824     int num_criteria;
825     RSFD rfd;
826     TERMID termid;
827     TERMID *terms;
828     int numTerms = 0;
829     size_t sysno_mem_index = 0;
830
831     if (zh->m_staticrank)
832         sysno_mem_index = 1;
833
834     assert(nmem); /* compiler shut up about unused param */
835     sset->sort_info->num_entries = 0;
836
837     rset_getterms(rset, 0, 0, &n);
838     terms = (TERMID *) nmem_malloc(nmem, sizeof(*terms)*n);
839     rset_getterms(rset, terms, n, &numTerms);
840
841     sset->hits = 0;
842     num_criteria = sort_sequence->num_specs;
843     if (num_criteria > ZSET_SORT_MAX_LEVEL)
844         num_criteria = ZSET_SORT_MAX_LEVEL;
845     for (i = 0; i < num_criteria; i++)
846     {
847         Z_SortKeySpec *sks = sort_sequence->specs[i];
848         Z_SortKey *sk;
849         ZEBRA_RES res;
850
851         sort_criteria[i].ord = -1;
852         sort_criteria[i].numerical = 0;
853
854         if (sks->which == Z_SortKeySpec_missingValueData)
855         {
856             zebra_setError(zh, YAZ_BIB1_UNSUPP_MISSING_DATA_ACTION, 0);
857             return ZEBRA_FAIL;
858         }
859         if (*sks->sortRelation == Z_SortKeySpec_ascending)
860             sort_criteria[i].relation = 'A';
861         else if (*sks->sortRelation == Z_SortKeySpec_descending)
862             sort_criteria[i].relation = 'D';
863         else
864         {
865             zebra_setError(zh, YAZ_BIB1_ILLEGAL_SORT_RELATION, 0);
866             return ZEBRA_FAIL;
867         }
868         if (sks->sortElement->which == Z_SortElement_databaseSpecific)
869         {
870             zebra_setError(zh, YAZ_BIB1_DATABASE_SPECIFIC_SORT_UNSUPP, 0);
871             return ZEBRA_FAIL;
872         }
873         else if (sks->sortElement->which != Z_SortElement_generic)
874         {
875             zebra_setError(zh, YAZ_BIB1_SORT_ILLEGAL_SORT, 0);
876             return ZEBRA_FAIL;
877         }       
878         sk = sks->sortElement->u.generic;
879         switch (sk->which)
880         {
881         case Z_SortKey_sortField:
882             yaz_log(log_level_sort, "key %d is of type sortField",
883                     i+1);
884             sort_criteria[i].numerical = 0;
885             sort_criteria[i].ord = 
886                 zebraExplain_lookup_attr_str(zh->reg->zei,
887                                              zinfo_index_category_sort,
888                                              -1, sk->u.sortField);
889             if (sks->which != Z_SortKeySpec_null
890                 && sort_criteria[i].ord == -1)
891             {
892                 zebra_setError(zh,
893                                YAZ_BIB1_CANNOT_SORT_ACCORDING_TO_SEQUENCE, 0);
894                 return ZEBRA_FAIL;
895             }
896             break;
897         case Z_SortKey_elementSpec:
898             yaz_log(log_level_sort, "key %d is of type elementSpec",
899                     i+1);
900             zebra_setError(zh, YAZ_BIB1_CANNOT_SORT_ACCORDING_TO_SEQUENCE, 0);
901             return ZEBRA_FAIL;
902         case Z_SortKey_sortAttributes:
903             yaz_log(log_level_sort, "key %d is of type sortAttributes", i+1);
904             res = zebra_sort_get_ord(zh, sk->u.sortAttributes,
905
906                                      &sort_criteria[i].ord,
907                                      &sort_criteria[i].numerical);
908             if (sks->which != Z_SortKeySpec_null && res != ZEBRA_OK)
909                 return ZEBRA_FAIL;
910             break;
911         }
912         if (zebraExplain_lookup_ord(zh->reg->zei, sort_criteria[i].ord,
913                                     &sort_criteria[i].index_type,
914                                     0, 0))
915         {
916             zebra_setError(zh, YAZ_BIB1_CANNOT_SORT_ACCORDING_TO_SEQUENCE, 0);
917             return ZEBRA_FAIL;
918         }
919     }
920     /* allocate space for each cmpare buf + one extra for tmp comparison */
921     for (i = 0; i<num_criteria; i++)
922     {
923         cmp_buf[i] = xmalloc(sset->sort_info->max_entries
924                              * SORT_IDX_ENTRYSIZE);
925         tmp_cmp_buf[i] = xmalloc(SORT_IDX_ENTRYSIZE);
926     }
927     rfd = rset_open (rset, RSETF_READ);
928     while (rset_read (rfd, &key, &termid))
929     {
930         zint this_sys = key.mem[sysno_mem_index];
931         if (log_level_searchhits)
932             key_logdump_txt(log_level_searchhits, &key, termid->name);
933         kno++;
934         if (this_sys != psysno)
935         {
936             if ((sset->hits & 255) == 0 && zh->break_handler_func)
937             {
938                 if (zh->break_handler_func(zh->break_handler_data))
939                 {
940                     rset_set_hits_limit(rset, 0);
941                     break;
942                 }
943             }
944             (sset->hits)++;
945             psysno = this_sys;
946             resultSetInsertSort(zh, sset,
947                                 sort_criteria, num_criteria, psysno, cmp_buf,
948                                 tmp_cmp_buf);
949         }
950     }
951     rset_close (rfd);
952
953     for (i = 0; i<num_criteria; i++)
954     {
955         xfree(cmp_buf[i]);
956         xfree(tmp_cmp_buf[i]);
957     }
958
959     yaz_log(log_level_sort, ZINT_FORMAT " keys, " ZINT_FORMAT " sysnos, sort",
960             kno, sset->hits);   
961     for (i = 0; i < numTerms; i++)
962         yaz_log(log_level_sort, "term=\"%s\" type=%s count=" ZINT_FORMAT,
963                  terms[i]->name, terms[i]->flags, terms[i]->rset->hits_count);
964     *sort_status = Z_SortResponse_success;
965     return ZEBRA_OK;
966 }
967
968 RSET resultSetRef(ZebraHandle zh, const char *resultSetId)
969 {
970     ZebraSet s;
971
972     if ((s = resultSetGet (zh, resultSetId)))
973         return s->rset;
974     return NULL;
975 }
976
977 ZEBRA_RES resultSetRank(ZebraHandle zh, ZebraSet zebraSet,
978                         RSET rset, NMEM nmem)
979 {
980     struct it_key key;
981     TERMID termid;
982     TERMID *terms;
983     zint kno = 0;
984     int numTerms = 0;
985     int n = 0;
986     int i;
987     ZebraRankClass rank_class;
988     struct zset_sort_info *sort_info;
989     const char *rank_handler_name = res_get_def(zh->res, "rank", "rank-1");
990     size_t sysno_mem_index = 0;
991
992     if (zh->m_staticrank)
993         sysno_mem_index = 1;
994
995     if (!log_level_set)
996         loglevels();
997     sort_info = zebraSet->sort_info;
998     sort_info->num_entries = 0;
999     zebraSet->hits = 0;
1000     zebraSet->estimated_hit_count = 0;
1001     rset_getterms(rset, 0, 0, &n);
1002     terms = (TERMID *) nmem_malloc(nmem, sizeof(*terms)*n);
1003     rset_getterms(rset, terms, n, &numTerms);
1004
1005     rank_class = zebraRankLookup(zh, rank_handler_name);
1006     if (!rank_class)
1007     {
1008         yaz_log(YLOG_WARN, "No such rank handler: %s", rank_handler_name);
1009         zebra_setError(zh, YAZ_BIB1_UNSUPP_SEARCH, "Cannot find rank handler");
1010         return ZEBRA_FAIL;
1011     }
1012     else
1013     {
1014         RSFD rfd = rset_open(rset, RSETF_READ);
1015         struct rank_control *rc = rank_class->control;
1016         int score;
1017         zint count = 0;
1018         void *handle = (*rc->begin) (zh->reg, rank_class->class_handle, rset,
1019                                      nmem, terms, numTerms);
1020         zint psysno = 0;  /* previous doc id / sys no */
1021         zint pstaticrank = 0; /* previous static rank */
1022         int stop_flag = 0;
1023         while (rset_read(rfd, &key, &termid))
1024         {
1025             zint this_sys = key.mem[sysno_mem_index];
1026
1027             zint seqno = key.mem[key.len-1];
1028             kno++;
1029             if (log_level_searchhits)
1030                 key_logdump_txt(log_level_searchhits, &key, termid->name);
1031             if (this_sys != psysno) 
1032             {   /* new record .. */
1033                 if (!(rfd->counted_items & 255) && zh->break_handler_func)
1034                 {
1035                     if (zh->break_handler_func(zh->break_handler_data))
1036                     {
1037                         yaz_log(YLOG_LOG, "Aborted search");
1038                         stop_flag = 1;
1039                     }
1040                 }
1041                 if (rfd->counted_items > rset->hits_limit)
1042                     stop_flag = 1;
1043                 if (psysno)
1044                 {   /* only if we did have a previous record */
1045                     score = (*rc->calc) (handle, psysno, pstaticrank,
1046                                          &stop_flag);
1047                     /* insert the hit. A=Ascending */
1048                     resultSetInsertRank (zh, sort_info, psysno, score, 'A');
1049                     count++;
1050                 }
1051                 if (stop_flag)
1052                 {
1053                     zebraSet->estimated_hit_count = 1;
1054                     rset_set_hits_limit(rset, 0);
1055                     break;
1056                 }
1057                 psysno = this_sys;
1058                 if (zh->m_staticrank)
1059                     pstaticrank = key.mem[0];
1060             }
1061             (*rc->add) (handle, CAST_ZINT_TO_INT(seqno), termid);
1062         }
1063         /* no more items */
1064         if (psysno)
1065         {   /* we had - at least - one record */
1066             score = (*rc->calc)(handle, psysno, pstaticrank, &stop_flag);
1067             /* insert the hit. A=Ascending */
1068             resultSetInsertRank(zh, sort_info, psysno, score, 'A');
1069             count++;
1070         }
1071         (*rc->end) (zh->reg, handle);
1072         rset_close (rfd);
1073     }
1074     zebraSet->hits = rset->hits_count;
1075
1076     yaz_log(log_level_searchterms, ZINT_FORMAT " keys, "
1077             ZINT_FORMAT " sysnos, rank",  kno, zebraSet->hits);
1078     for (i = 0; i < numTerms; i++)
1079     {
1080         yaz_log(log_level_searchterms, "term=\"%s\" type=%s count="
1081                 ZINT_FORMAT,
1082                 terms[i]->name, terms[i]->flags, terms[i]->rset->hits_count);
1083     }
1084     return ZEBRA_OK;
1085 }
1086
1087 ZebraRankClass zebraRankLookup(ZebraHandle zh, const char *name)
1088 {
1089     ZebraRankClass p = zh->reg->rank_classes;
1090     while (p && strcmp (p->control->name, name))
1091         p = p->next;
1092     if (p && !p->init_flag)
1093     {
1094         if (p->control->create)
1095             p->class_handle = (*p->control->create)(zh);
1096         p->init_flag = 1;
1097     }
1098     return p;
1099 }
1100
1101 void zebraRankInstall(struct zebra_register *reg, struct rank_control *ctrl)
1102 {
1103     ZebraRankClass p = (ZebraRankClass) xmalloc (sizeof(*p));
1104     p->control = (struct rank_control *) xmalloc (sizeof(*p->control));
1105     memcpy (p->control, ctrl, sizeof(*p->control));
1106     p->control->name = xstrdup (ctrl->name);
1107     p->init_flag = 0;
1108     p->next = reg->rank_classes;
1109     reg->rank_classes = p;
1110 }
1111
1112 void zebraRankDestroy(struct zebra_register *reg)
1113 {
1114     ZebraRankClass p = reg->rank_classes;
1115     while (p)
1116     {
1117         ZebraRankClass p_next = p->next;
1118         if (p->init_flag && p->control->destroy)
1119             (*p->control->destroy)(reg, p->class_handle);
1120         xfree(p->control->name);
1121         xfree(p->control);
1122         xfree(p);
1123         p = p_next;
1124     }
1125     reg->rank_classes = NULL;
1126 }
1127
1128 static int trav_rset_for_termids(RSET rset, TERMID *termid_array,
1129                                  zint *hits_array, int *approx_array)
1130 {
1131     int no = 0;
1132     int i;
1133     for (i = 0; i<rset->no_children; i++)
1134         no += trav_rset_for_termids(rset->children[i],
1135                                     (termid_array ? termid_array + no : 0),
1136                                     (hits_array ? hits_array + no : 0),
1137                                     (approx_array ? approx_array + no : 0));
1138     if (rset->term)
1139     {
1140         if (termid_array)
1141             termid_array[no] = rset->term;
1142         if (hits_array)
1143             hits_array[no] = rset->hits_count;
1144         if (approx_array)
1145             approx_array[no] = rset->hits_approx;
1146 #if 0
1147         yaz_log(YLOG_LOG, "rset=%p term=%s limit=" ZINT_FORMAT
1148                 " count=" ZINT_FORMAT,
1149                 rset, rset->term->name, rset->hits_limit, rset->hits_count);
1150 #endif
1151         no++;
1152     }
1153     return no;
1154 }
1155
1156 ZEBRA_RES zebra_result_set_term_no(ZebraHandle zh, const char *setname,
1157                                    int *num_terms)
1158 {
1159     ZebraSet sset = resultSetGet(zh, setname);
1160     *num_terms = 0;
1161     if (sset)
1162     {
1163         *num_terms = trav_rset_for_termids(sset->rset, 0, 0, 0);
1164         return ZEBRA_OK;
1165     }
1166     return ZEBRA_FAIL;
1167 }
1168
1169 ZEBRA_RES zebra_result_set_term_info(ZebraHandle zh, const char *setname,
1170                                      int no, zint *count, int *approx,
1171                                      char *termbuf, size_t *termlen,
1172                                      const char **term_ref_id)
1173 {
1174     ZebraSet sset = resultSetGet(zh, setname);
1175     if (sset)
1176     {
1177         int num_terms = trav_rset_for_termids(sset->rset, 0, 0, 0);
1178         if (no >= 0 && no < num_terms)
1179         {
1180             TERMID *term_array = xmalloc(num_terms * sizeof(*term_array));
1181             zint *hits_array = xmalloc(num_terms * sizeof(*hits_array));
1182             int *approx_array = xmalloc(num_terms * sizeof(*approx_array));
1183             
1184             trav_rset_for_termids(sset->rset, term_array,
1185                                   hits_array, approx_array);
1186
1187             if (count)
1188                 *count = hits_array[no];
1189             if (approx)
1190                 *approx = approx_array[no];
1191             if (termbuf)
1192             {
1193                 char *inbuf = term_array[no]->name;
1194                 size_t inleft = strlen(inbuf);
1195                 size_t outleft = *termlen - 1;
1196
1197                 if (zh->iconv_from_utf8 != 0)
1198                 {
1199                     char *outbuf = termbuf;
1200                     size_t ret;
1201                     
1202                     ret = yaz_iconv(zh->iconv_from_utf8, &inbuf, &inleft,
1203                                     &outbuf, &outleft);
1204                     if (ret == (size_t)(-1))
1205                         *termlen = 0;
1206                     else
1207                     {
1208                         yaz_iconv(zh->iconv_from_utf8, 0, 0, 
1209                                   &outbuf, &outleft);
1210                         *termlen = outbuf - termbuf;
1211                     }
1212                 }
1213                 else
1214                 {
1215                     if (inleft > outleft)
1216                         inleft = outleft;
1217                     *termlen = inleft;
1218                     memcpy(termbuf, inbuf, *termlen);
1219                 }
1220                 termbuf[*termlen] = '\0';
1221             }
1222             if (term_ref_id)
1223                 *term_ref_id = term_array[no]->ref_id;
1224
1225             xfree(term_array);
1226             xfree(hits_array);
1227             xfree(approx_array);
1228             return ZEBRA_OK;
1229         }
1230     }
1231     return ZEBRA_FAIL;
1232 }
1233
1234 ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname,
1235                                     zint sysno, zebra_snippets *snippets)
1236 {
1237     ZebraSet sset = resultSetGet(zh, setname);
1238     yaz_log(YLOG_DEBUG, "zebra_get_hit_vector setname=%s zysno=" ZINT_FORMAT,
1239             setname, sysno);
1240     if (!sset)
1241         return ZEBRA_FAIL;
1242     else
1243     {
1244         struct rset_key_control *kc = zebra_key_control_create(zh);
1245         NMEM nmem = nmem_create();
1246         struct it_key key;
1247         RSET rsets[2], rset_comb;
1248         RSET rset_temp = rset_create_temp(nmem, kc, kc->scope, 
1249                                           res_get (zh->res, "setTmpDir"),0 );
1250         
1251         TERMID termid;
1252         RSFD rsfd = rset_open(rset_temp, RSETF_WRITE);
1253         
1254         key.mem[0] = sysno;
1255         key.mem[1] = 0;
1256         key.mem[2] = 0;
1257         key.mem[3] = 0;
1258         key.len = 2;
1259         rset_write (rsfd, &key);
1260         rset_close (rsfd);
1261
1262         rsets[0] = rset_temp;
1263         rsets[1] = rset_dup(sset->rset);
1264         
1265         rset_comb = rset_create_and(nmem, kc, kc->scope, 2, rsets);
1266
1267         rsfd = rset_open(rset_comb, RSETF_READ);
1268
1269         while (rset_read(rsfd, &key, &termid))
1270         {
1271             if (termid)
1272             {
1273                 struct ord_list *ol;
1274                 for (ol = termid->ol; ol; ol = ol->next)
1275                 {
1276                     zebra_snippets_append(snippets, key.mem[key.len-1], 0,
1277                                           ol->ord, termid->name);
1278                 }
1279             }
1280         }
1281         rset_close(rsfd);
1282         
1283         rset_delete(rset_comb);
1284         nmem_destroy(nmem);
1285         kc->dec(kc);
1286     }
1287     return ZEBRA_OK;
1288 }
1289
1290 /*
1291  * Local variables:
1292  * c-basic-offset: 4
1293  * indent-tabs-mode: nil
1294  * End:
1295  * vim: shiftwidth=4 tabstop=8 expandtab
1296  */
1297