zebra_record_check checks isamb too.
[idzebra-moved-to-github.git] / index / retrieve.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2010 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #include <stdio.h>
21 #include <assert.h>
22
23 #include <fcntl.h>
24 #ifdef WIN32
25 #include <io.h>
26 #include <process.h>
27 #endif
28 #if HAVE_UNISTD_H
29 #include <unistd.h>
30 #endif
31
32 #include "index.h"
33 #include <yaz/diagbib1.h>
34 #include <yaz/snprintf.h>
35 #include <direntz.h>
36 #include <yaz/oid_db.h>
37 #include <zebra_strmap.h>
38
39 #define MAX_SYSNOS_PER_RECORD 40
40
41 #define ZEBRA_XML_HEADER_STR "<record xmlns=\"http://www.indexdata.com/zebra/\""
42
/* Per-request state handed to the special ("zebra::") element set
   fetch handlers in this file. */
struct special_fetch_s {
    ZebraHandle zh;      /* Zebra session the fetch runs against */
    const char *setname; /* result set name; used for hit vectors and
                            result set lookups */
    zint sysno;          /* system number of the record being fetched */
    int score;           /* not read by the handlers visible in this part
                            of the file */
    NMEM nmem;           /* pool for parsed index specs and other
                            per-fetch allocations */
};
50
51 static int zebra_create_record_stream(ZebraHandle zh, 
52                                       Record *rec,
53                                       struct ZebraRecStream *stream)
54 {
55     RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, *rec);
56
57     if ((*rec)->size[recInfo_storeData] > 0 
58         || (*rec)->info[recInfo_filename] == 0)
59         zebra_create_stream_mem(stream, (*rec)->info[recInfo_storeData],
60                                 (*rec)->size[recInfo_storeData]);
61     else
62     {
63         char full_rep[1024];
64         int fd;
65             
66         if (zh->path_reg && !yaz_is_abspath((*rec)->info[recInfo_filename])){
67             strcpy(full_rep, zh->path_reg);
68             strcat(full_rep, "/");
69             strcat(full_rep, (*rec)->info[recInfo_filename]);
70         }
71         else
72             strcpy(full_rep, (*rec)->info[recInfo_filename]);
73             
74         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1){
75             yaz_log(YLOG_WARN|YLOG_ERRNO, "Retrieve fail; missing file: %s",
76                      full_rep);
77             rec_free(rec);
78             return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
79         }
80         zebra_create_stream_fd(stream, fd, recordAttr->recordOffset);
81     }
82     return 0;
83 }
84    
85
/* One parsed "name[:type[:extra]]" item of an element set specification;
   items are chained in the order they appear (see parse_index_spec). */
struct index_spec {
    const char *index_name;  /* index name; always set by the parser
                                (may be an empty string) */
    const char *index_type;  /* index type, or 0 when not given */
    const char *extra;       /* third, free-form field, or 0 when not given */
    struct index_spec *next; /* next item, or 0 at end of list */
};
92
93
94 struct index_spec *parse_index_spec(const char *elem, NMEM nmem,
95                                     int *error)
96 {
97     struct index_spec *first = 0;
98     struct index_spec **last = &first;
99     const char *cp = elem;
100
101     *error = 0;
102     if (cp[0] == ':' && cp[1] == ':')
103     {
104
105         cp++; /* skip first ':' */
106
107         for (;;)
108         {
109             const char *cp0;
110             struct index_spec *spec = nmem_malloc(nmem, sizeof(*spec));
111             spec->index_type = 0;
112             spec->next = 0;
113             spec->extra = 0;
114
115             if (!first)
116                 first = spec;
117             *last = spec;
118             last = &spec->next;
119
120             cp++; /* skip ',' or second ':' */
121             cp0 = cp;
122             while (*cp != ':' && *cp != '\0' && *cp != ',')
123                 cp++;
124             spec->index_name = nmem_strdupn(nmem, cp0, cp - cp0);
125             if (*cp == ':') /* type as well */
126             {
127                 cp++;
128                 cp0 = cp;
129                 
130                 while (*cp != '\0' && *cp != ',' && *cp != ':')
131                     cp++;
132                 spec->index_type = nmem_strdupn(nmem, cp0, cp - cp0);
133             }
134             if (*cp == ':') /* extra arguments */
135             {
136                 cp++;
137                 cp0 = cp;
138                 
139                 while (*cp != '\0' && *cp != ',' && *cp != ':')
140                     cp++;
141                 spec->extra = nmem_strdupn(nmem, cp0, cp - cp0);
142             }
143             if (*cp != ',')
144                 break;
145         }
146     }
147     if (*cp != '\0')
148         *error = 1;
149     return first;
150 }
151
152 static int sort_fetch(
153     struct special_fetch_s *fi, const char *elemsetname,
154     const Odr_oid *input_format,
155     const Odr_oid **output_format,
156     WRBUF result, WRBUF addinfo)
157 {
158     int ord;
159     ZebraHandle zh = fi->zh;
160     int error;
161     struct index_spec *spec;
162
163     spec = parse_index_spec(elemsetname, fi->nmem, &error);
164     if (error)
165         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
166
167     /* for sort fetches.. We allow only one index and type must be given */
168     if (!spec || spec->next || !spec->index_type)
169         return -1;
170     ord = zebraExplain_lookup_attr_str(zh->reg->zei,
171                                        zinfo_index_category_sort,
172                                        spec->index_type,
173                                        spec->index_name);
174     if (ord == -1)
175         return -1;  /* is not a sort index */
176     else
177     {
178         WRBUF wrbuf_str = wrbuf_alloc();
179         const char *index_type;
180         const char *db = 0;
181         const char *string_index = 0;
182         WRBUF wrbuf_result = result;
183         int off = 0;
184
185         zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 
186                                 &string_index);
187         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
188         {
189             *output_format = yaz_oid_recsyn_xml;
190             wrbuf_printf(wrbuf_result, ZEBRA_XML_HEADER_STR
191                          " sysno=\"" ZINT_FORMAT "\""
192                          " set=\"zebra::index%s\">\n",
193                          fi->sysno, elemsetname);
194         }
195         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
196         {
197             *output_format = yaz_oid_recsyn_sutrs;
198         }
199         else
200         {
201             yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
202                     elemsetname);
203             *output_format = 0;
204             wrbuf_destroy(wrbuf_str);
205             return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
206         }
207         zebra_sort_type(zh->reg->sort_index, ord);
208         zebra_sort_sysno(zh->reg->sort_index, fi->sysno);
209         zebra_sort_read(zh->reg->sort_index, 0, wrbuf_str);
210
211         while (off != wrbuf_len(wrbuf_str))
212         {
213             char dst_buf[IT_MAX_WORD];
214             assert(off < wrbuf_len(wrbuf_str));
215             zebra_term_untrans(zh, index_type, dst_buf,
216                                wrbuf_buf(wrbuf_str)+off);
217             
218             if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
219             {
220                 wrbuf_printf(wrbuf_result, "  <index name=\"%s\"", 
221                              string_index);
222                 wrbuf_printf(wrbuf_result, " type=\"%s\">", index_type);
223                 wrbuf_xmlputs(wrbuf_result, dst_buf);
224                 wrbuf_printf(wrbuf_result, "</index>\n");
225             }
226             else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
227             {
228                 wrbuf_printf(wrbuf_result, "%s %s %s\n", string_index, index_type,
229                              dst_buf);
230             }
231             off += strlen(wrbuf_buf(wrbuf_str)+off) + 1;
232         }
233         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
234         {
235             wrbuf_printf(wrbuf_result, "</record>\n");
236         }
237         wrbuf_destroy(wrbuf_str);
238         return 0;
239     }
240 }
241                             
242 static int special_index_fetch(
243     struct special_fetch_s *fi, const char *elemsetname,
244     const Odr_oid *input_format,
245     const Odr_oid **output_format,
246     WRBUF result, WRBUF addinfo,
247     Record rec)
248 {
249     zebra_rec_keys_t keys;
250     int ret_code = 0;
251     ZebraHandle zh = fi->zh;
252     struct index_spec *spec, *spec_list;
253     int error;
254     
255     /* set output variables before processing possible error states */
256     /* *rec_lenp = 0; */
257
258     /* only accept XML and SUTRS requests */
259     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml)
260         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
261     {
262         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
263                 elemsetname);
264         *output_format = 0;
265         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
266     }
267
268     spec_list = parse_index_spec(elemsetname, fi->nmem, &error);
269     if (error)
270     {
271         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
272     }
273
274     for (spec = spec_list; spec; spec = spec->next)
275     {
276         if (zebraExplain_lookup_attr_str(zh->reg->zei,
277                                          zinfo_index_category_index,
278                                          spec->index_type,
279                                          spec->index_name) == -1)
280             return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
281     }
282
283     keys = zebra_rec_keys_open();
284     zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys],
285                            rec->size[recInfo_delKeys], 0);
286
287     if (!zebra_rec_keys_rewind(keys))
288     {
289         ret_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
290     }
291     else
292     {
293         size_t slen;
294         const char *str;
295         struct it_key key_in;
296         WRBUF wrbuf = result;
297
298         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
299         {
300             *output_format = input_format;
301             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
302                          " sysno=\"" ZINT_FORMAT "\""
303                          " set=\"zebra::index%s\">\n",
304                          fi->sysno, elemsetname);
305         }
306         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
307             *output_format = input_format;
308
309         while (zebra_rec_keys_read(keys, &str, &slen, &key_in))
310         {
311             int i;
312             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
313             const char *index_type;
314             const char *db = 0;
315             const char *string_index = 0;
316             char dst_buf[IT_MAX_WORD];
317             int match = 0;
318             
319             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db,
320                                     &string_index);
321             if (!spec_list)
322                 match = 1; /* match all if no specs were given */
323             else
324             {
325                 for (spec = spec_list; spec; spec = spec->next)
326                 {
327                     if ((!spec->index_type ||
328                          !yaz_matchstr(spec->index_type, index_type))
329                         &&
330                         !yaz_matchstr(spec->index_name, string_index))
331                         match = 1;
332                 }
333             }
334             if (match)
335             {
336                 if (zebra_term_untrans(zh, index_type, dst_buf, str))
337                     *dst_buf = '\0'; /* untrans failed */
338                 
339                 if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
340                 {
341                     wrbuf_printf(wrbuf, "  <index name=\"%s\"", 
342                                  string_index);
343                     
344                     wrbuf_printf(wrbuf, " type=\"%s\"", index_type);
345                     
346                     wrbuf_printf(wrbuf, " seq=\"" ZINT_FORMAT "\">", 
347                                  key_in.mem[key_in.len -1]);
348                     wrbuf_xmlputs(wrbuf, dst_buf);
349                     wrbuf_printf(wrbuf, "</index>\n");
350                 }
351                 else 
352                 {
353                     wrbuf_printf(wrbuf, "%s ", string_index);
354                     
355                     wrbuf_printf(wrbuf, "%s", index_type);
356                     
357                     for (i = 1; i < key_in.len; i++)
358                         wrbuf_printf(wrbuf, " " ZINT_FORMAT, 
359                                      key_in.mem[i]);
360                     
361                     wrbuf_printf(wrbuf, " %s", dst_buf);
362                     
363                     wrbuf_printf(wrbuf, "\n");
364                 }
365             }
366         }
367         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
368             wrbuf_printf(wrbuf, "</record>\n");
369     }
370     zebra_rec_keys_close(keys);
371     return ret_code;
372 }
373
374
375 static void retrieve_puts_attr(WRBUF wrbuf, const char *name,
376                                const char *value)
377 {
378     if (value)
379     {
380         wrbuf_printf(wrbuf, " %s=\"", name);
381         wrbuf_xmlputs(wrbuf, value);
382         wrbuf_printf(wrbuf, "\"");
383     }
384 }
385
386 static void retrieve_puts_attr_int(WRBUF wrbuf, const char *name,
387                                const int value)
388 {
389     wrbuf_printf(wrbuf, " %s=\"%i\"", name, value);
390 }
391
392 static void retrieve_puts_str(WRBUF wrbuf, const char *name,
393                                const char *value)
394 {
395     if (value)
396         wrbuf_printf(wrbuf, "%s %s\n", name, value);
397 }
398
399 static void retrieve_puts_int(WRBUF wrbuf, const char *name,
400                                const int value)
401 {
402     wrbuf_printf(wrbuf, "%s %i\n", name, value);
403 }
404
405
406 static void snippet_check_fields(ZebraHandle zh, WRBUF wrbuf,
407                                  zebra_snippets *doc,
408                                  const zebra_snippet_word *doc_w,
409                                  const char *w_index_type)
410 {
411     /* beginning of snippet. See which fields the snippet also
412        occur */
413     const zebra_snippet_word *w;
414     int no = 0;
415     for (w = zebra_snippets_constlist(doc); w; w = w->next)
416     {
417         /* same sequence but other field? */
418         if (w->seqno == doc_w->seqno && w->ord != doc_w->ord)
419         {
420             const char *index_type;
421             const char *db = 0;
422             const char *string_index = 0;
423             
424             zebraExplain_lookup_ord(zh->reg->zei, w->ord, 
425                                     &index_type, &db, &string_index);
426             /* only report for same index type */
427             if (!strcmp(w_index_type, index_type))
428             {
429                 if (no == 0)
430                     wrbuf_printf(wrbuf, " fields=\"%s", string_index);
431                 else
432                     wrbuf_printf(wrbuf, " %s", string_index);
433                 no++;
434             }
435         }
436     }
437     if (no)
438         wrbuf_printf(wrbuf, "\"");
439 }
440
441 static void snippet_xml_record(ZebraHandle zh, WRBUF wrbuf, zebra_snippets *doc)
442 {
443     const zebra_snippet_word *doc_w;
444     int mark_state = 0;
445
446     wrbuf_printf(wrbuf, "%s>\n", ZEBRA_XML_HEADER_STR);
447     for (doc_w = zebra_snippets_constlist(doc); doc_w; doc_w = doc_w->next)
448     {
449         if (doc_w->mark)
450         {
451             const char *index_type;
452             const char *db = 0;
453             const char *string_index = 0;
454
455             zebraExplain_lookup_ord(zh->reg->zei, doc_w->ord, 
456                                     &index_type, &db, &string_index);
457
458             if (mark_state == 0)
459             {
460                 
461                 wrbuf_printf(wrbuf, "  <snippet name=\"%s\"",  string_index);
462                 wrbuf_printf(wrbuf, " type=\"%s\"", index_type);
463                 snippet_check_fields(zh, wrbuf, doc, doc_w, index_type);
464                 wrbuf_printf(wrbuf, ">");
465             }
466             if (doc_w->match)
467                 wrbuf_puts(wrbuf, "<s>");
468             /* not printing leading ws */
469             if (mark_state || !doc_w->ws || doc_w->match) 
470                 wrbuf_xmlputs(wrbuf, doc_w->term);
471             if (doc_w->match)
472                 wrbuf_puts(wrbuf, "</s>");
473         }
474         else if (mark_state == 1)
475         {
476             wrbuf_puts(wrbuf, "</snippet>\n");
477         }
478         mark_state = doc_w->mark;
479     }
480     if (mark_state == 1)
481     {
482         wrbuf_puts(wrbuf, "</snippet>\n");
483     }
484     wrbuf_printf(wrbuf, "</record>");
485 }
486
487 int zebra_get_rec_snippets(ZebraHandle zh, zint sysno,
488                            zebra_snippets *snippets)
489 {
490     int return_code = 0;
491     Record rec = rec_get(zh->reg->records, sysno);
492     if (!rec)
493     {
494         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
495         return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
496     }
497     else
498     {
499         const char *file_type = rec->info[recInfo_fileType];
500         void *recTypeClientData;
501         RecType rt = recType_byName(zh->reg->recTypes, zh->res,
502                                     file_type, &recTypeClientData);
503
504         if (!rt)
505             return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
506         else
507         {
508             struct ZebraRecStream stream;
509             return_code = zebra_create_record_stream(zh, &rec, &stream);
510             if (return_code == 0)
511             {
512                 extract_snippet(zh, snippets, &stream,
513                                 rt, recTypeClientData);
514
515                 stream.destroy(&stream);
516             }
517         }
518         rec_free(&rec);
519     }
520     return return_code;
521 }
522
523 static int snippet_fetch(
524     struct special_fetch_s *fi, const char *elemsetname,
525     const Odr_oid *input_format,
526     const Odr_oid **output_format,
527     WRBUF result, WRBUF addinfo)
528 {
529     ZebraHandle zh = fi->zh;
530     zebra_snippets *rec_snippets = zebra_snippets_create();
531     int return_code = zebra_get_rec_snippets(zh, fi->sysno, rec_snippets);
532
533     if (!return_code)
534     {
535         WRBUF wrbuf = result;
536         zebra_snippets *hit_snippet = zebra_snippets_create();
537
538         zebra_snippets_hit_vector(zh, fi->setname, fi->sysno, hit_snippet);
539
540 #if 0
541         /* for debugging purposes */
542         yaz_log(YLOG_LOG, "---------------------------");
543         yaz_log(YLOG_LOG, "REC SNIPPET:");
544         zebra_snippets_log(rec_snippets, YLOG_LOG, 1);
545         yaz_log(YLOG_LOG, "---------------------------");
546         yaz_log(YLOG_LOG, "HIT SNIPPET:");
547         zebra_snippets_log(hit_snippet, YLOG_LOG, 1);
548 #endif
549         
550         zebra_snippets_ring(rec_snippets, hit_snippet, 5, 5);
551         
552 #if 0
553         yaz_log(YLOG_LOG, "---------------------------");
554         yaz_log(YLOG_LOG, "RING SNIPPET:");
555         zebra_snippets_log(rec_snippets, YLOG_LOG, 1);
556 #endif
557         snippet_xml_record(zh, wrbuf, rec_snippets);
558         
559         *output_format = yaz_oid_recsyn_xml;
560         
561         zebra_snippets_destroy(hit_snippet);
562     }
563     zebra_snippets_destroy(rec_snippets);
564     return return_code;
565 }
566
/* One candidate facet term; arrays of these are built by
   term_collect_create and ranked by term_collect_freq. */
struct term_collect {
    const char *term; /* term string, or 0 for an unused slot */
    int oc;           /* occurrence count taken from the term map */
    zint set_occur;   /* count assigned by term_collect_freq (looked up
                         via freq_term or scaled from oc); sort key */
};
572
573 static zint freq_term(ZebraHandle zh, int ord, const char *term, RSET rset_set)
574 {
575     struct rset_key_control *kc = zebra_key_control_create(zh);
576     char ord_buf[IT_MAX_WORD];
577     int ord_len = key_SU_encode(ord, ord_buf);
578     char *info;
579     zint hits = 0;
580     NMEM nmem = nmem_create();
581     
582     strcpy(ord_buf + ord_len, term);
583     
584     info = dict_lookup(zh->reg->dict, ord_buf);
585     if (info)
586     {
587         ISAM_P isam_p;
588         RSET rsets[2], rset;
589         memcpy(&isam_p, info+1, sizeof(ISAM_P));
590         
591         rsets[0] = zebra_create_rset_isam(zh, nmem, kc, kc->scope, isam_p, 0);
592         rsets[1] = rset_dup(rset_set);
593         
594         rset = rset_create_and(nmem, kc, kc->scope, 2, rsets);
595
596         zebra_count_set(zh, rset, &hits, zh->approx_limit);
597
598         rset_delete(rsets[0]);
599         rset_delete(rset);
600     }
601     (*kc->dec)(kc);
602     nmem_destroy(nmem);
603     return hits;
604 }
605
606 static int term_qsort_handle(const void *a, const void *b)
607 {
608     const struct term_collect *l = a;
609     const struct term_collect *r = b;
610     if (l->set_occur < r->set_occur)
611         return 1;
612     else if (l->set_occur > r->set_occur)
613         return -1;
614     else
615     {
616         const char *lterm = l->term ? l->term : "";
617         const char *rterm = r->term ? r->term : "";
618         return strcmp(lterm, rterm);
619     }
620 }
621
622 static void term_collect_freq(ZebraHandle zh,
623                               struct term_collect *col, int no_terms_collect,
624                               int ord, RSET rset, double scale_factor)
625 {
626     int i;
627     for (i = 0; i < no_terms_collect; i++)
628     {
629         if (col[i].term)
630         {
631             if (scale_factor < 0.0)
632             {
633                 col[i].set_occur = freq_term(zh, ord, col[i].term, rset);
634             }
635             else
636                 col[i].set_occur = scale_factor * col[i].oc;
637         }
638     }
639     qsort(col, no_terms_collect, sizeof(*col), term_qsort_handle);
640 }
641
642 static struct term_collect *term_collect_create(zebra_strmap_t sm, 
643                                                 int no_terms_collect,
644                                                 NMEM nmem)
645 {
646     const char *term;
647     void *data_buf;
648     size_t data_len;
649     zebra_strmap_it it;
650     struct term_collect *col = nmem_malloc(nmem, 
651                                            sizeof *col *no_terms_collect);
652     int i;
653     for (i = 0; i < no_terms_collect; i++)
654     {
655         col[i].term = 0;
656         col[i].oc = 0;
657         col[i].set_occur = 0;
658     }
659     /* iterate over terms and collect the most frequent ones */
660     it = zebra_strmap_it_create(sm);
661     while ((term = zebra_strmap_it_next(it, &data_buf, &data_len)))
662     {
663         /* invariant:
664            col[0] has lowest oc .  col[no_terms_collect-1] has highest oc */
665         int oc = *(int*) data_buf;
666         int j = 0;
667         /* insertion may be slow but terms terms will be "infrequent" and
668            thus number of iterations should be small below 
669         */
670         while (j < no_terms_collect && oc > col[j].oc)
671             j++;
672         if (j) 
673         {   /* oc <= col[j] and oc > col[j-1] */
674             --j;
675             memmove(col, col+1, sizeof(*col) * j);
676             col[j].term = term;
677             col[j].oc = oc;
678         }
679     }
680     zebra_strmap_it_destroy(it);
681     return col;
682 }
683
684 static int perform_facet_sort(ZebraHandle zh, int no_ord, int *ord_array,
685                               zebra_strmap_t *map_array,
686                               int num_recs, ZebraMetaRecord *poset)
687 {
688     int rec_i;
689     WRBUF w = wrbuf_alloc();
690     int ord_i;
691
692     for (ord_i = 0; ord_i < no_ord; ord_i++)
693     {
694         for (rec_i = 0; rec_i < num_recs; rec_i++)
695         {
696             if (!poset[rec_i].sysno)
697                 continue;
698             
699             zebra_sort_sysno(zh->reg->sort_index, poset[rec_i].sysno);
700             zebra_sort_type(zh->reg->sort_index, ord_array[ord_i]);
701             
702             wrbuf_rewind(w);
703             if (zebra_sort_read(zh->reg->sort_index, 0, w))
704             {
705                 zebra_strmap_t sm = map_array[ord_i];
706                 int off = 0;
707                 while (off != wrbuf_len(w))
708                 {
709                     const char *str = wrbuf_buf(w) + off;
710                     int *freq = zebra_strmap_lookup(sm, str, 0, 0);
711                     if (freq)
712                         (*freq)++;
713                     else
714                     {
715                         int v = 1;
716                         zebra_strmap_add(sm, str, &v, sizeof v);
717                     }
718                     off += strlen(str)+1;
719                 }
720             }
721         }
722     }
723     wrbuf_destroy(w);
724     return 0;
725 }
726
727
728 static int perform_facet_index(ZebraHandle zh,
729                                struct special_fetch_s *fi,
730                                int no_ord, int *ord_array,
731                                zebra_strmap_t *map_array,
732                                int num_recs, ZebraMetaRecord *poset,
733                                struct index_spec *spec_list)
734 {
735     int max_chunks = 2;
736     int rec_i;
737     res_get_int(zh->res, "facetMaxChunks", &max_chunks);
738
739     for (rec_i = 0; rec_i < num_recs; rec_i++)
740     {
741         int ret;
742         int j;
743         zint sysnos[MAX_SYSNOS_PER_RECORD];
744         int no_sysnos = MAX_SYSNOS_PER_RECORD;
745         if (!poset[rec_i].sysno)
746             continue;
747         ret = zebra_result_recid_to_sysno(zh, fi->setname,
748                                           poset[rec_i].sysno,
749                                           sysnos, &no_sysnos);
750         assert(no_sysnos > 0);
751         yaz_log(YLOG_DEBUG, "Analyzing rec=%d ISAM sysno=" ZINT_FORMAT " chunks=%d",
752                 rec_i, poset[rec_i].sysno, no_sysnos);
753         for (j = 0; j < no_sysnos && j < max_chunks; j++)
754         {
755             size_t slen;
756             const char *str;
757             struct it_key key_in;
758             Record rec = rec_get(zh->reg->records, sysnos[j]);
759             zebra_rec_keys_t keys = zebra_rec_keys_open();
760             zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys],
761                                    rec->size[recInfo_delKeys], 0);
762             
763             yaz_log(YLOG_DEBUG, "rec %d " ZINT_FORMAT " %s", 
764                     j, sysnos[j], zebra_rec_keys_empty(keys) ? "empty" : "non-empty");
765             if (zebra_rec_keys_rewind(keys))
766             {
767                 while (zebra_rec_keys_read(keys, &str, &slen, &key_in))
768                 {
769                     int ord_i;
770                     struct index_spec *spec;
771                     for (spec = spec_list, ord_i = 0; ord_i < no_ord; 
772                          ord_i++, spec = spec->next)
773                     {
774                         int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
775                         if (ord == ord_array[ord_i] && 
776                             str[0] != FIRST_IN_FIELD_CHAR)
777                         {
778                             int *freq;
779                             zebra_strmap_t sm = map_array[ord_i];
780                             
781                             freq = zebra_strmap_lookup(sm, str, 0, 0);
782                             if (freq)
783                                 (*freq)++;
784                             else
785                             {
786                                 int v = 1;
787                                 zebra_strmap_add(sm, str, &v, sizeof v);
788                             }
789                         }
790                     }
791                 }
792             }
793             zebra_rec_keys_close(keys);
794             rec_free(&rec);
795         }
796     }
797     return 0;
798 }
799
800 static int perform_facet(ZebraHandle zh,  
801                          struct special_fetch_s *fi,
802                          WRBUF result,
803                          int num_recs, ZebraMetaRecord *poset,
804                          struct index_spec *spec_list,
805                          int no_ord, int *ord_array,
806                          int use_xml,
807                          zinfo_index_category_t cat)
808 {
809     int i;
810     int ret = 0;
811     WRBUF wr = result;
812     struct index_spec *spec;
813     yaz_timing_t timing = yaz_timing_create();
814     zebra_strmap_t *map_array
815         = nmem_malloc(fi->nmem, sizeof *map_array * no_ord);
816     for (i = 0; i < no_ord; i++)
817         map_array[i] = zebra_strmap_create();
818
819     if (cat == zinfo_index_category_sort)
820         perform_facet_sort(zh, no_ord, ord_array, map_array,
821                            num_recs, poset);
822     else
823         perform_facet_index(zh, fi, no_ord, ord_array, map_array,
824                             num_recs, poset, spec_list);
825     yaz_timing_stop(timing);
826     yaz_log(YLOG_LOG, "facet first phase real=%4.2f cat=%s",
827             yaz_timing_get_real(timing),
828             (cat == zinfo_index_category_sort) ? "sort" : "index");
829     yaz_timing_start(timing);
830     for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next)
831     {
832         int j;
833         NMEM nmem = nmem_create();
834         struct term_collect *col;
835         int no_collect_terms = 20;
836         
837         if (spec->extra)
838             no_collect_terms = atoi(spec->extra);
839         if (no_collect_terms < 1)
840             no_collect_terms = 1;
841         col = term_collect_create(map_array[i], no_collect_terms, nmem);
842         term_collect_freq(zh, col, no_collect_terms, ord_array[i],
843                           resultSetRef(zh, fi->setname), 
844                           cat == zinfo_index_category_sort ? 1.0 : -1.0);
845         
846         if (use_xml)
847             wrbuf_printf(wr, "  <facet type=\"%s\" index=\"%s\">\n",
848                          spec->index_type, spec->index_name);
849         else
850             wrbuf_printf(wr, "facet %s %s\n",
851                          spec->index_type, spec->index_name);
852         for (j = 0; j < no_collect_terms; j++)
853         {
854             if (col[j].term)
855             {
856                 char dst_buf[IT_MAX_WORD];
857                 zebra_term_untrans(zh, spec->index_type, dst_buf, col[j].term);
858                 if (use_xml)
859                 {
860                     wrbuf_printf(wr, "    <term coccur=\"%d\"", col[j].oc);
861                     if (col[j].set_occur)
862                         wrbuf_printf(wr, " occur=\"" ZINT_FORMAT "\"", 
863                                      col[j].set_occur);
864                     wrbuf_printf(wr, ">");
865                     wrbuf_xmlputs(wr, dst_buf);
866                     wrbuf_printf(wr, "</term>\n");
867                 }
868                 else
869                 {
870                     wrbuf_printf(wr, "term %d", col[j].oc);
871                     if (col[j].set_occur)
872                         wrbuf_printf(wr, " " ZINT_FORMAT, 
873                                      col[j].set_occur);
874                     wrbuf_printf(wr, ": %s\n", dst_buf);
875                 }
876             }
877         }
878         if (use_xml)
879             wrbuf_puts(wr, "  </facet>\n");
880         nmem_destroy(nmem);
881     }
882     for (i = 0; i < no_ord; i++)
883         zebra_strmap_destroy(map_array[i]);
884     yaz_timing_stop(timing);
885     yaz_log(YLOG_LOG, "facet second phase real=%4.2f",
886             yaz_timing_get_real(timing));
887     yaz_timing_destroy(&timing);
888     return ret;
889 }
890
891 static int facet_fetch(
892     struct special_fetch_s *fi, const char *elemsetname,
893     const Odr_oid *input_format,
894     const Odr_oid **output_format,
895     WRBUF result, WRBUF addinfo)
896 {
897     zint *pos_array;
898     int i;
899     int num_recs = 10; /* number of records to analyze */
900     ZebraMetaRecord *poset;
901     ZEBRA_RES ret = ZEBRA_OK;
902     int *ord_array;
903     int use_xml = 0;
904     int no_ord = 0;
905     struct index_spec *spec, *spec_list;
906     int error;
907     ZebraHandle zh = fi->zh;
908     /* whether sort or index based */
909     zinfo_index_category_t cat = zinfo_index_category_sort;
910
911     /* see if XML is required for response */
912     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) == 0)
913         use_xml = 1;
914
915     spec_list = parse_index_spec(elemsetname, fi->nmem, &error);
916               
917     if (!spec_list || error)
918     {
919         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
920     }          
921   
922     for (spec = spec_list; spec; spec = spec->next)
923     {
924         if (!spec->index_type)
925             return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
926         no_ord++;
927     }
928
929     /* try to see if all specs are sort based.. If not, try the
930        index based ones */
931     ord_array = nmem_malloc(fi->nmem, sizeof(*ord_array) * no_ord);
932
933     for (spec = spec_list, i = 0; spec; spec = spec->next, i++)
934     {
935         int ord = zebraExplain_lookup_attr_str(zh->reg->zei,
936                                                zinfo_index_category_sort,
937                                                spec->index_type,
938                                                spec->index_name);
939         if (ord == -1)
940             break;
941         ord_array[i] = ord;
942         num_recs = 10000;
943     }
944     if (spec)
945     {
946         cat = zinfo_index_category_index;
947         for (spec = spec_list, i = 0; spec; spec = spec->next, i++)
948         {
949             int ord = zebraExplain_lookup_attr_str(zh->reg->zei,
950                                                    zinfo_index_category_index,
951                                                    spec->index_type,
952                                                    spec->index_name);
953             if (ord == -1)
954                 break;
955             ord_array[i] = ord;
956             
957         }
958     }
959     if (spec)
960         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
961
962     res_get_int(zh->res, "facetNumRecs", &num_recs);
963
964     pos_array = (zint *) nmem_malloc(fi->nmem, num_recs * sizeof(*pos_array));
965     for (i = 0; i < num_recs; i++)
966         pos_array[i] = i+1;
967     poset = zebra_meta_records_create(zh, fi->setname, num_recs, pos_array);
968     if (!poset)
969     {
970         wrbuf_puts(addinfo, fi->setname);
971         return YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST;
972     }
973     else
974     {
975         if (use_xml)
976         {
977             wrbuf_printf(result, ZEBRA_XML_HEADER_STR ">\n");
978         }
979         ret = perform_facet(zh, fi, result, num_recs, poset,
980                             spec_list, no_ord, ord_array, use_xml,
981                             cat);
982         if (use_xml)
983             wrbuf_puts(result, "</record>\n");
984     }
985     *output_format = yaz_oid_recsyn_xml;
986     zebra_meta_records_destroy(zh, poset, num_recs);
987     return ret;
988 }
989
990
991 static int zebra_special_fetch(
992     void *handle, const char *elemsetname,
993     const Odr_oid *input_format,
994     const Odr_oid **output_format,
995     WRBUF result, WRBUF addinfo)
996 {
997     Record rec = 0;
998     struct special_fetch_s *fi = (struct special_fetch_s *) handle;
999     ZebraHandle zh = fi->zh;
1000     zint sysno = fi->sysno;
1001     
1002     /* processing zebra::facet */
1003     if (elemsetname && 0 == strncmp(elemsetname, "facet", 5))
1004     {
1005         return facet_fetch(fi, elemsetname + 5, 
1006                            input_format, output_format,
1007                            result, addinfo);
1008     }
1009
1010     if (elemsetname && 0 == strcmp(elemsetname, "snippet"))
1011     {
1012         return snippet_fetch(fi, elemsetname + 7,
1013                              input_format, output_format,
1014                              result, addinfo);
1015     }
1016
1017     /* processing zebra::meta::sysno  */
1018     if (elemsetname && 0 == strcmp(elemsetname, "meta::sysno"))
1019     {
1020         int ret = 0;
1021         if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
1022         {
1023             wrbuf_printf(result, ZINT_FORMAT, fi->sysno);
1024             *output_format = input_format;
1025         } 
1026         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
1027         {
1028             wrbuf_printf(result, ZEBRA_XML_HEADER_STR
1029                          " sysno=\"" ZINT_FORMAT "\"/>\n",
1030                          fi->sysno);
1031             *output_format = input_format;
1032         }
1033         else
1034             ret = YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
1035         return ret;
1036     }
1037
1038     /* processing special elementsetname zebra::index:: for sort elements */
1039     if (elemsetname && 0 == strncmp(elemsetname, "index", 5))
1040     {
1041         int ret = sort_fetch(
1042             fi, elemsetname + 5,
1043             input_format, output_format,
1044             result, addinfo);
1045         if (ret != -1)
1046             return ret;
1047         /* not a sort index so we continue to get the full record */
1048     }
1049
1050
1051     /* fetching binary record up for all other display elementsets */
1052     rec = rec_get(zh->reg->records, sysno);
1053     if (!rec)
1054     {
1055         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
1056         return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1057     }
1058
1059     /* processing special elementsetnames zebra::data */    
1060     if (elemsetname && 0 == strcmp(elemsetname, "data"))
1061     {
1062         struct ZebraRecStream stream;
1063         RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, rec); 
1064         char *b;
1065
1066         zebra_create_record_stream(zh, &rec, &stream);
1067         *output_format = input_format;
1068
1069         b = nmem_malloc(fi->nmem, recordAttr->recordSize);
1070         stream.readf(&stream, b, recordAttr->recordSize);
1071         wrbuf_write(result, b, recordAttr->recordSize);
1072
1073         stream.destroy(&stream);
1074         rec_free(&rec);
1075         return 0;
1076     }
1077
1078     /* processing special elementsetnames zebra::meta:: */
1079     if (elemsetname && 0 == strcmp(elemsetname, "meta"))
1080     {
1081         int ret = 0;
1082         RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, rec); 
1083
1084         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
1085         {
1086             *output_format = input_format;
1087             
1088             wrbuf_printf(result, ZEBRA_XML_HEADER_STR
1089                          " sysno=\"" ZINT_FORMAT "\"", sysno);
1090             retrieve_puts_attr(result, "base", rec->info[recInfo_databaseName]);
1091             retrieve_puts_attr(result, "file", rec->info[recInfo_filename]);
1092             retrieve_puts_attr(result, "type", rec->info[recInfo_fileType]);
1093             if (fi->score >= 0)
1094                 retrieve_puts_attr_int(result, "score", fi->score);
1095            
1096             wrbuf_printf(result,
1097                          " rank=\"" ZINT_FORMAT "\""
1098                          " size=\"%i\""
1099                          " set=\"zebra::%s\"/>\n",
1100                          recordAttr->staticrank,
1101                          recordAttr->recordSize,
1102                          elemsetname);
1103         }
1104         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
1105         {
1106             *output_format = input_format;
1107             wrbuf_printf(result, "sysno " ZINT_FORMAT "\n", sysno);
1108             retrieve_puts_str(result, "base", rec->info[recInfo_databaseName]);
1109             retrieve_puts_str(result, "file", rec->info[recInfo_filename]);
1110             retrieve_puts_str(result, "type", rec->info[recInfo_fileType]);
1111             if (fi->score >= 0)
1112                 retrieve_puts_int(result, "score", fi->score);
1113
1114             wrbuf_printf(result,
1115                          "rank " ZINT_FORMAT "\n"
1116                          "size %i\n"
1117                          "set zebra::%s\n",
1118                          recordAttr->staticrank,
1119                          recordAttr->recordSize,
1120                          elemsetname);
1121         }
1122         else
1123             ret = YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
1124
1125         rec_free(&rec);
1126         return ret;
1127     }
1128
1129     /* processing special elementsetnames zebra::index:: */
1130     if (elemsetname && 0 == strncmp(elemsetname, "index", 5))
1131     {
1132         int ret = special_index_fetch(
1133             fi, elemsetname + 5,
1134             input_format, output_format,
1135             result, addinfo, rec);
1136         rec_free(&rec);
1137         return ret;
1138     }
1139
1140     if (rec)
1141         rec_free(&rec);
1142     return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1143 }
1144
1145 int zebra_record_fetch(ZebraHandle zh, const char *setname,
1146                        zint sysno, int score,
1147                        ODR odr,
1148                        const Odr_oid *input_format, Z_RecordComposition *comp,
1149                        const Odr_oid **output_format,
1150                        char **rec_bufp, int *rec_lenp, char **basenamep,
1151                        WRBUF addinfo_w)
1152 {
1153     Record rec;
1154     char *fname, *file_type, *basename;
1155     const char *elemsetname;
1156     struct ZebraRecStream stream;
1157     RecordAttr *recordAttr;
1158     void *clientData;
1159     int return_code = 0;
1160     zint sysnos[MAX_SYSNOS_PER_RECORD];
1161     int no_sysnos = MAX_SYSNOS_PER_RECORD;
1162     ZEBRA_RES res;
1163     struct special_fetch_s fetch_info;
1164
1165     res = zebra_result_recid_to_sysno(zh, setname, sysno, sysnos, &no_sysnos);
1166     if (res != ZEBRA_OK)
1167         return ZEBRA_FAIL;
1168
1169     sysno = sysnos[0];
1170     *basenamep = 0;
1171     elemsetname = yaz_get_esn(comp);
1172
1173     fetch_info.zh = zh;
1174     fetch_info.setname = setname;
1175     fetch_info.sysno = sysno;
1176     fetch_info.score = score;
1177     fetch_info.nmem = odr->mem;
1178
1179     /* processing zebra special elementset names of form 'zebra:: */
1180     if (elemsetname && 0 == strncmp(elemsetname, "zebra::", 7))
1181     {
1182         WRBUF result = wrbuf_alloc();
1183         int r = zebra_special_fetch(&fetch_info, elemsetname + 7,
1184                                     input_format, output_format,
1185                                     result, addinfo_w);
1186         if (r == 0)
1187         {
1188             *rec_bufp = odr_strdup(odr, wrbuf_cstr(result));
1189             *rec_lenp = wrbuf_len(result);
1190         }
1191         wrbuf_destroy(result);
1192         return r;
1193     }
1194
1195     /* processing all other element set names */
1196     rec = rec_get(zh->reg->records, sysno);
1197     if (!rec)
1198     {
1199         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
1200         *basenamep = 0;
1201         return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1202     }
1203
1204
1205     recordAttr = rec_init_attr(zh->reg->zei, rec);
1206
1207     file_type = rec->info[recInfo_fileType];
1208     fname = rec->info[recInfo_filename];
1209     basename = rec->info[recInfo_databaseName];
1210     *basenamep = (char *) odr_malloc(odr, strlen(basename)+1);
1211     strcpy(*basenamep, basename);
1212
1213     yaz_log(YLOG_DEBUG, "retrieve localno=" ZINT_FORMAT " score=%d",
1214             sysno, score);
1215
1216     return_code = zebra_create_record_stream(zh, &rec, &stream);
1217
1218     if (rec)
1219     {
1220         RecType rt;
1221         struct recRetrieveCtrl retrieveCtrl;
1222
1223         retrieveCtrl.stream = &stream;
1224         retrieveCtrl.fname = fname;
1225         retrieveCtrl.localno = sysno;
1226         retrieveCtrl.staticrank = recordAttr->staticrank;
1227         retrieveCtrl.score = score;
1228         retrieveCtrl.recordSize = recordAttr->recordSize;
1229         retrieveCtrl.odr = odr;
1230         retrieveCtrl.input_format = retrieveCtrl.output_format = input_format;
1231         retrieveCtrl.comp = comp;
1232         retrieveCtrl.encoding = zh->record_encoding;
1233         retrieveCtrl.diagnostic = 0;
1234         retrieveCtrl.addinfo = 0;
1235         retrieveCtrl.dh = zh->reg->dh;
1236         retrieveCtrl.res = zh->res;
1237         retrieveCtrl.rec_buf = 0;
1238         retrieveCtrl.rec_len = -1;
1239         retrieveCtrl.handle = &fetch_info;
1240         retrieveCtrl.special_fetch = zebra_special_fetch;
1241
1242         if (!(rt = recType_byName(zh->reg->recTypes, zh->res,
1243                                   file_type, &clientData)))
1244         {
1245             wrbuf_printf(addinfo_w, "Could not handle record type %.40s",
1246                          file_type);
1247             return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1248         }
1249         else
1250         {
1251             (*rt->retrieve)(clientData, &retrieveCtrl);
1252             return_code = retrieveCtrl.diagnostic;
1253
1254             *output_format = retrieveCtrl.output_format;
1255             *rec_bufp = (char *) retrieveCtrl.rec_buf;
1256             *rec_lenp = retrieveCtrl.rec_len;
1257             if (retrieveCtrl.addinfo)
1258                 wrbuf_puts(addinfo_w, retrieveCtrl.addinfo);
1259         }
1260
1261         stream.destroy(&stream);
1262         rec_free(&rec);
1263     }
1264
1265     return return_code;
1266 }
1267
1268 /*
1269  * Local variables:
1270  * c-basic-offset: 4
1271  * c-file-style: "Stroustrup"
1272  * indent-tabs-mode: nil
1273  * End:
1274  * vim: shiftwidth=4 tabstop=8 expandtab
1275  */
1276