Handle untrans failure for element fetch zebra::index:.
[idzebra-moved-to-github.git] / index / retrieve.c
1 /* $Id: retrieve.c,v 1.84 2008-01-24 16:17:29 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 #include <stdio.h>
24 #include <assert.h>
25
26 #include <fcntl.h>
27 #ifdef WIN32
28 #include <io.h>
29 #include <process.h>
30 #endif
31 #if HAVE_UNISTD_H
32 #include <unistd.h>
33 #endif
34
35 #include "index.h"
36 #include <yaz/diagbib1.h>
37 #include <yaz/snprintf.h>
38 #include <direntz.h>
39 #include <yaz/oid_db.h>
40 #include <zebra_strmap.h>
41
42 #define MAX_SYSNOS_PER_RECORD 40
43
44 #define ZEBRA_XML_HEADER_STR "<record xmlns=\"http://www.indexdata.com/zebra/\""
45
46 static int zebra_create_record_stream(ZebraHandle zh, 
47                                       Record *rec,
48                                       struct ZebraRecStream *stream)
49 {
50     RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, *rec);
51
52     if ((*rec)->size[recInfo_storeData] > 0)
53         zebra_create_stream_mem(stream, (*rec)->info[recInfo_storeData],
54                                 (*rec)->size[recInfo_storeData]);
55     else
56     {
57         char full_rep[1024];
58         int fd;
59             
60         if (zh->path_reg && !yaz_is_abspath((*rec)->info[recInfo_filename])){
61             strcpy(full_rep, zh->path_reg);
62             strcat(full_rep, "/");
63             strcat(full_rep, (*rec)->info[recInfo_filename]);
64         }
65         else
66             strcpy(full_rep, (*rec)->info[recInfo_filename]);
67             
68         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1){
69             yaz_log(YLOG_WARN|YLOG_ERRNO, "Retrieve fail; missing file: %s",
70                      full_rep);
71             rec_free(rec);
72             return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
73         }
74         zebra_create_stream_fd(stream, fd, recordAttr->recordOffset);
75     }
76     return 0;
77 }
78     
79
/* One entry of an index specification list as built by parse_index_spec() */
struct index_spec {
    /* index name: the text up to the first ':' or ',' of the entry */
    const char *index_name;
    /* optional type following the name; 0 when none was given */
    const char *index_type;
    /* optional extra argument following the type; 0 when none was given */
    const char *extra;
    /* next entry of the list; 0 terminates the list */
    struct index_spec *next;
};
86
87
88 struct index_spec *parse_index_spec(const char *elem, NMEM nmem,
89                                     int *error)
90 {
91     struct index_spec *first = 0;
92     struct index_spec **last = &first;
93     const char *cp = elem;
94
95     *error = 0;
96     if (cp[0] == ':' && cp[1] == ':')
97     {
98
99         cp++; /* skip first ':' */
100
101         for (;;)
102         {
103             const char *cp0;
104             struct index_spec *spec = nmem_malloc(nmem, sizeof(*spec));
105             spec->index_type = 0;
106             spec->next = 0;
107             spec->extra = 0;
108
109             if (!first)
110                 first = spec;
111             *last = spec;
112             last = &spec->next;
113
114             cp++; /* skip ',' or second ':' */
115             cp0 = cp;
116             while (*cp != ':' && *cp != '\0' && *cp != ',')
117                 cp++;
118             spec->index_name = nmem_strdupn(nmem, cp0, cp - cp0);
119             if (*cp == ':') /* type as well */
120             {
121                 cp++;
122                 cp0 = cp;
123                 
124                 while (*cp != '\0' && *cp != ',' && *cp != ':')
125                     cp++;
126                 spec->index_type = nmem_strdupn(nmem, cp0, cp - cp0);
127             }
128             if (*cp == ':') /* extra arguments */
129             {
130                 cp++;
131                 cp0 = cp;
132                 
133                 while (*cp != '\0' && *cp != ',' && *cp != ':')
134                     cp++;
135                 spec->extra = nmem_strdupn(nmem, cp0, cp - cp0);
136             }
137             if (*cp != ',')
138                 break;
139         }
140     }
141     if (*cp != '\0')
142         *error = 1;
143     return first;
144 }
145                             
/*
 * Split an element set name of the form "::index[:type]" into its parts.
 *
 * The output pointers refer into elem (nothing is copied); both pairs are
 * reset to 0 / 0 on entry.
 *
 * Returns 1 when elem is NULL or empty (outputs stay zero) or when it was
 * parsed successfully.  Returns 0 when elem does not start with "::",
 * when nothing follows "::", or when a ':' separator is the last character.
 */
static int parse_zebra_elem(const char *elem,
                            const char **index, size_t *index_len,
                            const char **type, size_t *type_len)
{
    const char *body;
    const char *sep;

    *index = 0;
    *index_len = 0;
    *type = 0;
    *type_len = 0;

    /* nothing to parse: accepted, outputs remain zero */
    if (!elem || *elem == '\0')
        return 1;

    /* must begin with "::" and have at least one more character */
    if (elem[0] != ':' || elem[1] != ':' || elem[2] == '\0')
        return 0;

    body = elem + 2;
    sep = strchr(body, ':');

    if (!sep)
    {
        /* index only, no type */
        *index = body;
        *index_len = strlen(body);
        return 1;
    }
    if (sep[1] == '\0')
        return 0;   /* separator present but no type after it */

    /* index, separator and type */
    *index = body;
    *index_len = sep - body;
    *type = sep + 1;
    *type_len = strlen(sep + 1);
    return 1;
}
189
190
191 int zebra_special_sort_fetch(ZebraHandle zh, zint sysno, ODR odr,
192                              const char *elemsetname,
193                              const Odr_oid *input_format,
194                              const Odr_oid **output_format,
195                              char **rec_bufp, int *rec_lenp)
196 {
197     const char *retrieval_index;
198     size_t retrieval_index_len; 
199     const char *retrieval_type;
200     size_t retrieval_type_len;
201     char retrieval_index_cstr[256];
202     char retrieval_type_cstr[256];
203     int ord;
204
205     /* only accept XML and SUTRS requests */
206     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) 
207         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
208     {
209         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
210                 elemsetname);
211         *output_format = 0;
212         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
213     }
214     
215     if (!parse_zebra_elem(elemsetname,
216                           &retrieval_index, &retrieval_index_len,
217                           &retrieval_type,  &retrieval_type_len))
218     {
219         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
220     }
221     
222     if (retrieval_type_len == 0)
223         return -1;   /* must have a register type specified */
224     if (!retrieval_index_len ||
225         retrieval_index_len >= sizeof(retrieval_index_cstr)-1)
226     {
227         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
228     }
229         
230     memcpy(retrieval_index_cstr, retrieval_index, retrieval_index_len);
231     retrieval_index_cstr[retrieval_index_len] = '\0';
232
233     memcpy(retrieval_type_cstr, retrieval_type, retrieval_type_len);
234     retrieval_type_cstr[retrieval_type_len] = '\0';
235
236     ord = zebraExplain_lookup_attr_str(zh->reg->zei,
237                                        zinfo_index_category_sort,
238                                        retrieval_type_cstr,
239                                        retrieval_index_cstr);
240     if (ord == -1)
241         return -1;  /* is not a sort index */
242     else
243     {
244         char dst_buf[IT_MAX_WORD];
245         char str[IT_MAX_WORD];
246         const char *index_type;
247         const char *db = 0;
248         const char *string_index = 0;
249         WRBUF wrbuf = wrbuf_alloc();
250         
251         zebra_sort_sysno(zh->reg->sort_index, sysno);
252         zebra_sort_type(zh->reg->sort_index, ord);
253         zebra_sort_read(zh->reg->sort_index, str);
254
255         zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, &string_index);
256         
257         zebra_term_untrans(zh, index_type, dst_buf, str);
258
259         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
260         {
261             *output_format = yaz_oid_recsyn_xml;
262             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
263                          " sysno=\"" ZINT_FORMAT "\""
264                          " set=\"zebra::index%s/\">\n",
265                          sysno, elemsetname);
266
267             wrbuf_printf(wrbuf, "  <index name=\"%s\"", 
268                          string_index);
269             wrbuf_printf(wrbuf, " type=\"%s\">", index_type);
270             wrbuf_xmlputs(wrbuf, dst_buf);
271             wrbuf_printf(wrbuf, "</index>\n");
272             wrbuf_printf(wrbuf, "</record>\n");
273         }
274         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
275         {
276             *output_format = yaz_oid_recsyn_sutrs;
277             
278             wrbuf_printf(wrbuf, "%s %s %s\n", string_index, index_type,
279                          dst_buf);
280         }
281         *rec_lenp = wrbuf_len(wrbuf);
282         *rec_bufp = odr_malloc(odr, *rec_lenp);
283         memcpy(*rec_bufp, wrbuf_buf(wrbuf), *rec_lenp);
284         wrbuf_destroy(wrbuf);
285         return 0;
286     }
287 }
288                             
289 int zebra_special_index_fetch(ZebraHandle zh, zint sysno, ODR odr,
290                               Record rec,
291                               const char *elemsetname,
292                               const Odr_oid *input_format,
293                               const Odr_oid **output_format,
294                               char **rec_bufp, int *rec_lenp)
295 {
296     const char *retrieval_index;
297     size_t retrieval_index_len; 
298     const char *retrieval_type;
299     size_t retrieval_type_len;
300     zebra_rec_keys_t keys;
301     int ret_code = 0;
302     char retrieval_type_cstr[256];
303     
304     /* set output variables before processing possible error states */
305     /* *rec_lenp = 0; */
306
307     /* only accept XML and SUTRS requests */
308     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml)
309         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
310     {
311         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
312                 elemsetname);
313         *output_format = 0;
314         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
315     }
316
317     if (!parse_zebra_elem(elemsetname,
318                      &retrieval_index, &retrieval_index_len,
319                      &retrieval_type,  &retrieval_type_len))
320         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
321
322     if (retrieval_type_len)
323     {
324         memcpy(retrieval_type_cstr, retrieval_type, retrieval_type_len);
325         retrieval_type_cstr[retrieval_type_len] = '\0';
326     }
327     
328     if (retrieval_index_len)
329     {
330         char retrieval_index_cstr[256];
331
332         if (retrieval_index_len < sizeof(retrieval_index_cstr) -1)
333         {
334             memcpy(retrieval_index_cstr, retrieval_index, retrieval_index_len);
335             retrieval_index_cstr[retrieval_index_len] = '\0';
336             
337             if (zebraExplain_lookup_attr_str(zh->reg->zei,
338                                              zinfo_index_category_index,
339                                              (retrieval_type_len == 0 ? 0 : 
340                                               retrieval_type_cstr),
341                                              retrieval_index_cstr) == -1)
342                 return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
343         }
344     }
345
346     keys = zebra_rec_keys_open();
347     zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys],
348                            rec->size[recInfo_delKeys], 0);
349
350     if (!zebra_rec_keys_rewind(keys))
351     {
352         ret_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
353     }
354     else
355     {
356         size_t slen;
357         const char *str;
358         struct it_key key_in;
359         WRBUF wrbuf = wrbuf_alloc();
360     
361         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
362         {
363             *output_format = input_format;
364             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
365                          " sysno=\"" ZINT_FORMAT "\""
366                          " set=\"zebra::index%s/\">\n",
367                          sysno, elemsetname);
368         }
369         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
370             *output_format = input_format;
371
372         while (zebra_rec_keys_read(keys, &str, &slen, &key_in))
373         {
374             int i;
375             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
376             const char *index_type;
377             const char *db = 0;
378             const char *string_index = 0;
379             size_t string_index_len;
380             char dst_buf[IT_MAX_WORD];
381             
382             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db,
383                                     &string_index);
384             string_index_len = strlen(string_index);
385
386             /* process only if index is not defined, 
387                or if defined and matching */
388             if (retrieval_index == 0 
389                 || (string_index_len == retrieval_index_len 
390                     && !memcmp(string_index, retrieval_index,
391                                string_index_len)))
392             {
393                 /* process only if type is not defined, or is matching */
394                 if (retrieval_type == 0 
395                     || !strcmp(retrieval_type_cstr, index_type))
396                 {
397                     if (zebra_term_untrans(zh, index_type, dst_buf, str))
398                         *dst_buf = '\0'; /* untrans failed */
399
400                     if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
401                     {
402                         wrbuf_printf(wrbuf, "  <index name=\"%s\"", 
403                                      string_index);
404                         
405                         wrbuf_printf(wrbuf, " type=\"%s\"", index_type);
406                         
407                         wrbuf_printf(wrbuf, " seq=\"" ZINT_FORMAT "\">", 
408                                      key_in.mem[key_in.len -1]);
409                         wrbuf_xmlputs(wrbuf, dst_buf);
410                         wrbuf_printf(wrbuf, "</index>\n");
411                     }
412                     else 
413                     {
414                         wrbuf_printf(wrbuf, "%s ", string_index);
415                         
416                         wrbuf_printf(wrbuf, "%s", index_type);
417                         
418                         for (i = 1; i < key_in.len; i++)
419                             wrbuf_printf(wrbuf, " " ZINT_FORMAT, 
420                                              key_in.mem[i]);
421                         
422                         wrbuf_printf(wrbuf, " %s", dst_buf);
423                         
424                         wrbuf_printf(wrbuf, "\n");
425
426                     }
427                     
428                 }
429             }
430         }
431         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
432             wrbuf_printf(wrbuf, "</record>\n");
433         *rec_lenp = wrbuf_len(wrbuf);
434         *rec_bufp = odr_malloc(odr, *rec_lenp);
435         memcpy(*rec_bufp, wrbuf_buf(wrbuf), *rec_lenp);
436         wrbuf_destroy(wrbuf);
437     }
438     zebra_rec_keys_close(keys);
439     return ret_code;
440 }
441
442
443 static void retrieve_puts_attr(WRBUF wrbuf, const char *name,
444                                const char *value)
445 {
446     if (value)
447     {
448         wrbuf_printf(wrbuf, " %s=\"", name);
449         wrbuf_xmlputs(wrbuf, value);
450         wrbuf_printf(wrbuf, "\"");
451     }
452 }
453
454 static void retrieve_puts_attr_int(WRBUF wrbuf, const char *name,
455                                const int value)
456 {
457     wrbuf_printf(wrbuf, " %s=\"%i\"", name, value);
458 }
459
460 static void retrieve_puts_str(WRBUF wrbuf, const char *name,
461                                const char *value)
462 {
463     if (value)
464         wrbuf_printf(wrbuf, "%s %s\n", name, value);
465 }
466
467 static void retrieve_puts_int(WRBUF wrbuf, const char *name,
468                                const int value)
469 {
470     wrbuf_printf(wrbuf, "%s %i\n", name, value);
471 }
472
473
474 static void snippet_xml_record(ZebraHandle zh, WRBUF wrbuf, zebra_snippets *doc)
475 {
476     const zebra_snippet_word *doc_w;
477     int mark_state = 0;
478
479     wrbuf_printf(wrbuf, "%s>\n", ZEBRA_XML_HEADER_STR);
480     for (doc_w = zebra_snippets_constlist(doc); doc_w; doc_w = doc_w->next)
481     {
482         if (doc_w->mark)
483         {
484             const char *index_type;
485             const char *db = 0;
486             const char *string_index = 0;
487
488             zebraExplain_lookup_ord(zh->reg->zei, doc_w->ord, 
489                                     &index_type, &db, &string_index);
490
491             if (mark_state == 0)
492             {
493                 wrbuf_printf(wrbuf, "  <snippet name=\"%s\"",  string_index);
494                 wrbuf_printf(wrbuf, " type=\"%s\">", index_type);
495             }
496             if (doc_w->match)
497                 wrbuf_puts(wrbuf, "<s>");
498             /* not printing leading ws */
499             if (mark_state || !doc_w->ws || doc_w->match) 
500                 wrbuf_xmlputs(wrbuf, doc_w->term);
501             if (doc_w->match)
502                 wrbuf_puts(wrbuf, "</s>");
503         }
504         else if (mark_state == 1)
505         {
506             wrbuf_puts(wrbuf, "</snippet>\n");
507         }
508         mark_state = doc_w->mark;
509     }
510     if (mark_state == 1)
511     {
512         wrbuf_puts(wrbuf, "</snippet>\n");
513     }
514     wrbuf_printf(wrbuf, "</record>");
515 }
516
517 int zebra_get_rec_snippets(ZebraHandle zh, zint sysno,
518                            zebra_snippets *snippets)
519 {
520     int return_code = 0;
521     Record rec = rec_get(zh->reg->records, sysno);
522     if (!rec)
523     {
524         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
525         return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
526     }
527     else
528     {
529         const char *file_type = rec->info[recInfo_fileType];
530         void *recTypeClientData;
531         RecType rt = recType_byName(zh->reg->recTypes, zh->res,
532                                     file_type, &recTypeClientData);
533
534         if (!rt)
535             return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
536         else
537         {
538             struct ZebraRecStream stream;
539             return_code = zebra_create_record_stream(zh, &rec, &stream);
540             if (return_code == 0)
541             {
542                 extract_snippet(zh, snippets, &stream,
543                                 rt, recTypeClientData);
544
545                 stream.destroy(&stream);
546             }
547         }
548         rec_free(&rec);
549     }
550     return return_code;
551 }
552
553 static int snippet_fetch(ZebraHandle zh, const char *setname,
554                          zint sysno, ODR odr,
555                          const char *elemsetname,
556                          const Odr_oid *input_format,
557                          const Odr_oid **output_format,
558                          char **rec_bufp, int *rec_lenp)
559 {
560     zebra_snippets *rec_snippets = zebra_snippets_create();
561     int return_code = zebra_get_rec_snippets(zh, sysno, rec_snippets);
562
563     if (!return_code)
564     {
565         WRBUF wrbuf = wrbuf_alloc();
566         zebra_snippets *hit_snippet = zebra_snippets_create();
567
568         zebra_snippets_hit_vector(zh, setname, sysno, hit_snippet);
569
570 #if 0
571         /* for debugging purposes */
572         yaz_log(YLOG_LOG, "---------------------------");
573         yaz_log(YLOG_LOG, "REC SNIPPET:");
574         zebra_snippets_log(rec_snippet, YLOG_LOG, 1);
575         yaz_log(YLOG_LOG, "---------------------------");
576         yaz_log(YLOG_LOG, "HIT SNIPPET:");
577         zebra_snippets_log(hit_snippet, YLOG_LOG, 1);
578 #endif
579         
580         zebra_snippets_ring(rec_snippets, hit_snippet, 5, 5);
581         
582 #if 0
583         yaz_log(YLOG_LOG, "---------------------------");
584         yaz_log(YLOG_LOG, "RING SNIPPET:");
585         zebra_snippets_log(rec_snippets, YLOG_LOG, 1);
586 #endif
587         snippet_xml_record(zh, wrbuf, rec_snippets);
588         
589         *output_format = yaz_oid_recsyn_xml;
590         
591         if (return_code == 0)
592         {
593             *rec_lenp = wrbuf_len(wrbuf);
594             *rec_bufp = odr_strdup(odr, wrbuf_cstr(wrbuf));
595         }
596         wrbuf_destroy(wrbuf);
597         zebra_snippets_destroy(hit_snippet);
598     }
599     zebra_snippets_destroy(rec_snippets);
600     return return_code;
601 }
602
603 struct term_collect {
604     const char *term;
605     int oc;
606     zint set_occur;
607 };
608
609 zint freq_term(ZebraHandle zh, int ord, const char *term, RSET rset_set)
610 {
611     struct rset_key_control *kc = zebra_key_control_create(zh);
612     char ord_buf[IT_MAX_WORD];
613     int ord_len = key_SU_encode(ord, ord_buf);
614     char *info;
615     zint hits = 0;
616     NMEM nmem = nmem_create();
617     
618     strcpy(ord_buf + ord_len, term);
619     
620     info = dict_lookup(zh->reg->dict, ord_buf);
621     if (info)
622     {
623         ISAM_P isam_p;
624         RSET rsets[2], rset;
625         memcpy(&isam_p, info+1, sizeof(ISAM_P));
626         
627         rsets[0] = zebra_create_rset_isam(zh, nmem, kc, kc->scope, isam_p, 0);
628         rsets[1] = rset_dup(rset_set);
629         
630         rset = rset_create_and(nmem, kc, kc->scope, 2, rsets);
631
632         zebra_count_set(zh, rset, &hits, zh->approx_limit);
633
634         rset_delete(rsets[0]);
635         rset_delete(rset);
636     }
637     (*kc->dec)(kc);
638     nmem_destroy(nmem);
639     return hits;
640 }
641
642 int term_qsort_handle(const void *a, const void *b)
643 {
644     const struct term_collect *l = a;
645     const struct term_collect *r = b;
646     if (l->set_occur < r->set_occur)
647         return 1;
648     else if (l->set_occur > r->set_occur)
649         return -1;
650     else
651     {
652         const char *lterm = l->term ? l->term : "";
653         const char *rterm = r->term ? r->term : "";
654         return strcmp(lterm, rterm);
655     }
656 }
657
658 void term_collect_freq(ZebraHandle zh,
659                        struct term_collect *col, int no_terms_collect,
660                        int ord, RSET rset)
661 {
662     int i;
663     for (i = 0; i < no_terms_collect; i++)
664     {
665         if (col[i].term)
666             col[i].set_occur = freq_term(zh, ord, col[i].term, rset);
667     }
668     qsort(col, no_terms_collect, sizeof(*col), term_qsort_handle);
669 }
670
671 struct term_collect *term_collect_create(zebra_strmap_t sm, 
672                                          int no_terms_collect,
673                                          NMEM nmem)
674 {
675     const char *term;
676     void *data_buf;
677     size_t data_len;
678     zebra_strmap_it it;
679     struct term_collect *col = nmem_malloc(nmem, 
680                                            sizeof *col *no_terms_collect);
681     int i;
682     for (i = 0; i < no_terms_collect; i++)
683     {
684         col[i].term = 0;
685         col[i].oc = 0;
686         col[i].set_occur = 0;
687     }
688     /* iterate over terms and collect the most frequent ones */
689     it = zebra_strmap_it_create(sm);
690     while ((term = zebra_strmap_it_next(it, &data_buf, &data_len)))
691     {
692         /* invariant:
693            col[0] has lowest oc .  col[no_terms_collect-1] has highest oc */
694         int oc = *(int*) data_buf;
695         int j = 0;
696         /* insertion may be slow but terms terms will be "infrequent" and
697            thus number of iterations should be small below 
698         */
699         while (j < no_terms_collect && oc > col[j].oc)
700             j++;
701         if (j) 
702         {   /* oc <= col[j] and oc > col[j-1] */
703             --j;
704             memmove(col, col+1, sizeof(*col) * j);
705             col[j].term = term;
706             col[j].oc = oc;
707         }
708     }
709     zebra_strmap_it_destroy(it);
710     return col;
711 }
712
713 static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname,
714                              ODR odr,
715                              const char *elemsetname,
716                              const Odr_oid *input_format,
717                              const Odr_oid **output_format,
718                              char **rec_bufp, int *rec_lenp)
719 {
720     zint *pos_array;
721     int i;
722     int num_recs = 10; /* number of records to analyze */
723     ZebraMetaRecord *poset;
724     ZEBRA_RES ret = ZEBRA_OK;
725     int *ord_array;
726     WRBUF wr = wrbuf_alloc();
727     int use_xml = 0;
728     
729     int no_ord = 0;
730     struct index_spec *spec, *spec_list;
731     int error;
732
733     /* see if XML is required for response */
734     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) == 0)
735         use_xml = 1;
736
737     spec_list = parse_index_spec(elemsetname, odr_getmem(odr), &error);
738               
739     if (!spec_list || error)
740     {
741         zebra_setError(
742             zh, 
743             YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_,
744             0);
745         return ZEBRA_FAIL;
746     }          
747   
748     for (spec = spec_list; spec; spec = spec->next)
749     {
750         if (!spec->index_type)
751         {
752             zebra_setError(
753                 zh, 
754                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_,
755                 0);
756             return ZEBRA_FAIL;
757         }
758         no_ord++;
759     }
760
761     ord_array = odr_malloc(odr, sizeof(*ord_array) * no_ord);
762
763     for (spec = spec_list, i = 0; spec; spec = spec->next, i++)
764     {
765         int ord = zebraExplain_lookup_attr_str(zh->reg->zei,
766                                                zinfo_index_category_index,
767                                                spec->index_type,
768                                                spec->index_name);
769         if (ord == -1)
770         {
771             zebra_setError(
772                 zh, 
773                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_,
774                 0);
775             return ZEBRA_FAIL;
776         }
777         ord_array[i] = ord;
778     }
779     pos_array = (zint *) odr_malloc(odr, num_recs * sizeof(*pos_array));
780     for (i = 0; i < num_recs; i++)
781         pos_array[i] = i+1;
782     poset = zebra_meta_records_create(zh, setname, num_recs, pos_array);
783     if (!poset)
784     {
785         zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST,
786                        setname);
787         ret = ZEBRA_FAIL;
788     }
789     else
790     {
791         yaz_timing_t timing = yaz_timing_create();
792         zebra_strmap_t *map_array
793             = odr_malloc(odr, sizeof *map_array * no_ord);
794         for (i = 0; i < no_ord; i++)
795             map_array[i] = zebra_strmap_create();
796
797         for (i = 0; i < num_recs; i++)
798         {
799             int j;
800             zint sysnos[MAX_SYSNOS_PER_RECORD];
801             int no_sysnos = MAX_SYSNOS_PER_RECORD;
802             if (!poset[i].sysno)
803                 continue;
804             ret = zebra_result_recid_to_sysno(zh,  setname,
805                                               poset[i].sysno,
806                                               sysnos, &no_sysnos);
807             assert(no_sysnos > 0);
808             for (j = 0; j < no_sysnos; j++)
809             {
810                 size_t slen;
811                 const char *str;
812                 struct it_key key_in;
813                 Record rec = rec_get(zh->reg->records, sysnos[j]);
814                 zebra_rec_keys_t keys = zebra_rec_keys_open();
815                 zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys],
816                                        rec->size[recInfo_delKeys], 0);
817                 
818                 if (zebra_rec_keys_rewind(keys))
819                 {
820                     while (zebra_rec_keys_read(keys, &str, &slen, &key_in))
821                     {
822                         int i;
823                         struct index_spec *spec;
824                         for (spec = spec_list, i = 0; i < no_ord; 
825                              i++, spec = spec->next)
826                         {
827                             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
828                             if (ord == ord_array[i] && 
829                                 str[0] != FIRST_IN_FIELD_CHAR)
830                             {
831                                 int *freq;
832                                 zebra_strmap_t sm = map_array[i];
833                                 
834                                 freq = zebra_strmap_lookup(sm, str, 0, 0);
835                                 if (freq)
836                                     (*freq)++;
837                                 else
838                                 {
839                                     int v = 1;
840                                     zebra_strmap_add(sm, str, &v, sizeof v);
841                                 }
842                             }
843                         }
844                     }
845                 }
846                 zebra_rec_keys_close(keys);
847                 rec_free(&rec);
848             }
849         }
850         yaz_timing_stop(timing);
851         yaz_log(YLOG_LOG, "facet first phase real=%4.2f",
852                 yaz_timing_get_real(timing));
853         yaz_timing_start(timing);
854         if (use_xml)
855             wrbuf_puts(wr, "<facets>\n");
856         for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next)
857         {
858             int j;
859             NMEM nmem = nmem_create();
860             struct term_collect *col;
861             int no_collect_terms = 20;
862
863             if (spec->extra)
864                 no_collect_terms = atoi(spec->extra);
865             if (no_collect_terms < 1)
866                 no_collect_terms = 1;
867             col = term_collect_create(map_array[i], no_collect_terms, nmem);
868             term_collect_freq(zh, col, no_collect_terms, ord_array[i],
869                               resultSetRef(zh, setname));
870             
871             if (use_xml)
872                 wrbuf_printf(wr, "  <facet type=\"%s\" index=\"%s\">\n",
873                              spec->index_type, spec->index_name);
874             else
875                 wrbuf_printf(wr, "facet %s %s\n",
876                              spec->index_type, spec->index_name);
877             for (j = 0; j < no_collect_terms; j++)
878             {
879                 if (col[j].term)
880                 {
881                     char dst_buf[IT_MAX_WORD];
882                     zebra_term_untrans(zh, spec->index_type, dst_buf, col[j].term);
883                     if (use_xml)
884                     {
885                         wrbuf_printf(wr, "    <term coccur=\"%d\"", col[j].oc);
886                         if (col[j].set_occur)
887                             wrbuf_printf(wr, " occur=\"" ZINT_FORMAT "\"", 
888                                          col[j].set_occur);
889                         wrbuf_printf(wr, ">");
890                         wrbuf_xmlputs(wr, dst_buf);
891                         wrbuf_printf(wr, "</term>\n");
892                     }
893                     else
894                     {
895                         wrbuf_printf(wr, "term %d", col[j].oc);
896                         if (col[j].set_occur)
897                             wrbuf_printf(wr, " " ZINT_FORMAT, 
898                                          col[j].set_occur);
899                         wrbuf_printf(wr, ": %s\n", dst_buf);
900                     }
901                 }
902             }
903             if (use_xml)
904                 wrbuf_puts(wr, "  </facet>\n");
905             nmem_destroy(nmem);
906         }
907         if (use_xml)
908             wrbuf_puts(wr, "</facets>\n");
909         for (i = 0; i < no_ord; i++)
910             zebra_strmap_destroy(map_array[i]);
911         yaz_timing_stop(timing);
912         yaz_log(YLOG_LOG, "facet second phase real=%4.2f",
913                 yaz_timing_get_real(timing));
914         yaz_timing_destroy(&timing);
915     }
916     *rec_bufp = odr_strdup(odr, wrbuf_cstr(wr));
917     wrbuf_destroy(wr);
918     *rec_lenp = strlen(*rec_bufp);
919     *output_format = yaz_oid_recsyn_xml;
920
921     zebra_meta_records_destroy(zh, poset, num_recs);
922     return ret;
923 }
924
925 int zebra_special_fetch(ZebraHandle zh, const char *setname,
926                         zint sysno, int score, ODR odr,
927                         const char *elemsetname,
928                         const Odr_oid *input_format,
929                         const Odr_oid **output_format,
930                         char **rec_bufp, int *rec_lenp)
931 {
932     Record rec;
933     
934     /* set output variables before processing possible error states */
935     /* *rec_lenp = 0; */
936
937     if (elemsetname && 0 == strncmp(elemsetname, "facet", 5))
938     {
939         return facet_fetch(zh, setname, odr,
940                            elemsetname + 5,
941                            input_format, output_format,
942                            rec_bufp, rec_lenp);
943     }
944
945     if (elemsetname && 0 == strcmp(elemsetname, "snippet"))
946     {
947         return snippet_fetch(zh, setname, sysno, odr,
948                              elemsetname + 7,
949                              input_format, output_format,
950                              rec_bufp, rec_lenp);
951     }
952
953     /* processing zebra::meta::sysno elemset without fetching binary data */
954     if (elemsetname && 0 == strcmp(elemsetname, "meta::sysno"))
955     {
956         int ret = 0;
957         WRBUF wrbuf = wrbuf_alloc();
958         if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
959         {
960             wrbuf_printf(wrbuf, ZINT_FORMAT, sysno);
961             *output_format = input_format;
962         } 
963         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
964         {
965             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
966                          " sysno=\"" ZINT_FORMAT "\"/>\n",
967                          sysno);
968             *output_format = input_format;
969         }
970         *rec_lenp = wrbuf_len(wrbuf);
971         if (*rec_lenp)
972             *rec_bufp = odr_strdup(odr, wrbuf_cstr(wrbuf));
973         else
974             ret = YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
975         wrbuf_destroy(wrbuf);
976         return ret;
977     }
978
979     /* processing special elementsetname zebra::index:: for sort elements */
980     if (elemsetname && 0 == strncmp(elemsetname, "index", 5))
981     {
982         int ret = zebra_special_sort_fetch(zh, sysno, odr,
983                                            elemsetname + 5,
984                                            input_format, output_format,
985                                            rec_bufp, rec_lenp);
986         if (ret != -1)
987             return ret;
988         /* not a sort index so we continue to get the full record */
989     }
990
991
992     /* fetching binary record up for all other display elementsets */
993     rec = rec_get(zh->reg->records, sysno);
994     if (!rec)
995     {
996         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
997         return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
998     }
999
1000     /* processing special elementsetnames zebra::data */    
1001     if (elemsetname && 0 == strcmp(elemsetname, "data"))
1002     {
1003         struct ZebraRecStream stream;
1004         RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, rec); 
1005         zebra_create_record_stream(zh, &rec, &stream);
1006         *output_format = input_format;
1007         *rec_lenp = recordAttr->recordSize;
1008         *rec_bufp = (char *) odr_malloc(odr, *rec_lenp);
1009         stream.readf(&stream, *rec_bufp, *rec_lenp);
1010         stream.destroy(&stream);
1011         rec_free(&rec);
1012         return 0;
1013     }
1014
1015     /* only accept XML and SUTRS requests from now */
1016     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml)
1017         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
1018     {
1019         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
1020                 elemsetname);
1021         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
1022     }
1023     
1024
1025     /* processing special elementsetnames zebra::meta:: */
1026     if (elemsetname && 0 == strcmp(elemsetname, "meta"))
1027     {
1028         int ret = 0;
1029         WRBUF wrbuf = wrbuf_alloc();
1030         RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, rec); 
1031
1032         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
1033         {
1034             *output_format = input_format;
1035             
1036             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
1037                          " sysno=\"" ZINT_FORMAT "\"", sysno);
1038             retrieve_puts_attr(wrbuf, "base", rec->info[recInfo_databaseName]);
1039             retrieve_puts_attr(wrbuf, "file", rec->info[recInfo_filename]);
1040             retrieve_puts_attr(wrbuf, "type", rec->info[recInfo_fileType]);
1041             if (score >= 0)
1042                 retrieve_puts_attr_int(wrbuf, "score", score);
1043            
1044             wrbuf_printf(wrbuf,
1045                          " rank=\"" ZINT_FORMAT "\""
1046                          " size=\"%i\""
1047                          " set=\"zebra::%s\"/>\n",
1048                          recordAttr->staticrank,
1049                          recordAttr->recordSize,
1050                          elemsetname);
1051         }
1052         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
1053         {
1054             *output_format = input_format;
1055             wrbuf_printf(wrbuf, "sysno " ZINT_FORMAT "\n", sysno);
1056             retrieve_puts_str(wrbuf, "base", rec->info[recInfo_databaseName]);
1057             retrieve_puts_str(wrbuf, "file", rec->info[recInfo_filename]);
1058             retrieve_puts_str(wrbuf, "type", rec->info[recInfo_fileType]);
1059             if (score >= 0)
1060                 retrieve_puts_int(wrbuf, "score", score);
1061
1062             wrbuf_printf(wrbuf,
1063                          "rank " ZINT_FORMAT "\n"
1064                          "size %i\n"
1065                          "set zebra::%s\n",
1066                          recordAttr->staticrank,
1067                          recordAttr->recordSize,
1068                          elemsetname);
1069         }
1070         *rec_lenp = wrbuf_len(wrbuf);
1071         if (*rec_lenp)
1072             *rec_bufp = odr_strdup(odr, wrbuf_cstr(wrbuf));
1073         else
1074             ret = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1075
1076         wrbuf_destroy(wrbuf);
1077         rec_free(&rec);
1078         return ret;
1079     }
1080
1081     /* processing special elementsetnames zebra::index:: */
1082     if (elemsetname && 0 == strncmp(elemsetname, "index", 5))
1083     {
1084         int ret = zebra_special_index_fetch(zh, sysno, odr, rec,
1085                                             elemsetname + 5,
1086                                             input_format, output_format,
1087                                             rec_bufp, rec_lenp);
1088         
1089         rec_free(&rec);
1090         return ret;
1091     }
1092
1093     if (rec)
1094         rec_free(&rec);
1095     return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1096 }
1097
1098                           
1099 int zebra_record_fetch(ZebraHandle zh, const char *setname,
1100                        zint sysno, int score,
1101                        ODR odr,
1102                        const Odr_oid *input_format, Z_RecordComposition *comp,
1103                        const Odr_oid **output_format,
1104                        char **rec_bufp, int *rec_lenp, char **basenamep,
1105                        char **addinfo)
1106 {
1107     Record rec;
1108     char *fname, *file_type, *basename;
1109     const char *elemsetname;
1110     struct ZebraRecStream stream;
1111     RecordAttr *recordAttr;
1112     void *clientData;
1113     int return_code = 0;
1114     zint sysnos[MAX_SYSNOS_PER_RECORD];
1115     int no_sysnos = MAX_SYSNOS_PER_RECORD;
1116     ZEBRA_RES res;
1117
1118     res = zebra_result_recid_to_sysno(zh, setname, sysno, sysnos, &no_sysnos);
1119     if (res != ZEBRA_OK)
1120         return ZEBRA_FAIL;
1121
1122     sysno = sysnos[0];
1123     *basenamep = 0;
1124     *addinfo = 0;
1125     elemsetname = yaz_get_esn(comp);
1126
1127     /* processing zebra special elementset names of form 'zebra:: */
1128     if (elemsetname && 0 == strncmp(elemsetname, "zebra::", 7))
1129         return  zebra_special_fetch(zh, setname, sysno, score, odr,
1130                                     elemsetname + 7,
1131                                     input_format, output_format,
1132                                     rec_bufp, rec_lenp);
1133
1134
1135     /* processing all other element set names */
1136     rec = rec_get(zh->reg->records, sysno);
1137     if (!rec)
1138     {
1139         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
1140         *basenamep = 0;
1141         return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1142     }
1143
1144
1145     recordAttr = rec_init_attr(zh->reg->zei, rec);
1146
1147     file_type = rec->info[recInfo_fileType];
1148     fname = rec->info[recInfo_filename];
1149     basename = rec->info[recInfo_databaseName];
1150     *basenamep = (char *) odr_malloc(odr, strlen(basename)+1);
1151     strcpy(*basenamep, basename);
1152
1153     yaz_log(YLOG_DEBUG, "retrieve localno=" ZINT_FORMAT " score=%d",
1154             sysno, score);
1155
1156     return_code = zebra_create_record_stream(zh, &rec, &stream);
1157
1158     if (rec)
1159     {
1160         RecType rt;
1161         struct recRetrieveCtrl retrieveCtrl;
1162
1163         retrieveCtrl.stream = &stream;
1164         retrieveCtrl.fname = fname;
1165         retrieveCtrl.localno = sysno;
1166         retrieveCtrl.staticrank = recordAttr->staticrank;
1167         retrieveCtrl.score = score;
1168         retrieveCtrl.recordSize = recordAttr->recordSize;
1169         retrieveCtrl.odr = odr;
1170         retrieveCtrl.input_format = retrieveCtrl.output_format = input_format;
1171         retrieveCtrl.comp = comp;
1172         retrieveCtrl.encoding = zh->record_encoding;
1173         retrieveCtrl.diagnostic = 0;
1174         retrieveCtrl.addinfo = 0;
1175         retrieveCtrl.dh = zh->reg->dh;
1176         retrieveCtrl.res = zh->res;
1177         retrieveCtrl.rec_buf = 0;
1178         retrieveCtrl.rec_len = -1;
1179
1180         if (!(rt = recType_byName(zh->reg->recTypes, zh->res,
1181                                   file_type, &clientData)))
1182         {
1183             char addinfo_str[100];
1184
1185             sprintf(addinfo_str, "Could not handle record type %.40s",
1186                     file_type);
1187                     
1188             *addinfo = odr_strdup(odr, addinfo_str);
1189             return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1190         }
1191         else
1192         {
1193             (*rt->retrieve)(clientData, &retrieveCtrl);
1194             return_code = retrieveCtrl.diagnostic;
1195
1196             *output_format = retrieveCtrl.output_format;
1197             *rec_bufp = (char *) retrieveCtrl.rec_buf;
1198             *rec_lenp = retrieveCtrl.rec_len;
1199             *addinfo = retrieveCtrl.addinfo;
1200         }
1201
1202         stream.destroy(&stream);
1203         rec_free(&rec);
1204     }
1205
1206     return return_code;
1207 }
1208
1209 /*
1210  * Local variables:
1211  * c-basic-offset: 4
1212  * indent-tabs-mode: nil
1213  * End:
1214  * vim: shiftwidth=4 tabstop=8 expandtab
1215  */
1216