f5e6bb6834a8f6bf0de5445cbdb1ffd495112833
[idzebra-moved-to-github.git] / index / retrieve.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1995-2008 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #include <stdio.h>
21 #include <assert.h>
22
23 #include <fcntl.h>
24 #ifdef WIN32
25 #include <io.h>
26 #include <process.h>
27 #endif
28 #if HAVE_UNISTD_H
29 #include <unistd.h>
30 #endif
31
32 #include "index.h"
33 #include <yaz/diagbib1.h>
34 #include <yaz/snprintf.h>
35 #include <direntz.h>
36 #include <yaz/oid_db.h>
37 #include <zebra_strmap.h>
38
/* Capacity of the sysno array filled per result-set record in facet_fetch. */
#define MAX_SYSNOS_PER_RECORD 40

/* Start of the <record> element including its namespace attribute; the tag
   is left open so that callers can append more attributes before ">". */
#define ZEBRA_XML_HEADER_STR "<record xmlns=\"http://www.indexdata.com/zebra/\""
42
43 struct special_fetch_s {
44     ZebraHandle zh;
45     const char *setname;
46     zint sysno;
47     int score;
48     NMEM nmem;
49 };
50
51 static int zebra_create_record_stream(ZebraHandle zh, 
52                                       Record *rec,
53                                       struct ZebraRecStream *stream)
54 {
55     RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, *rec);
56
57     if ((*rec)->size[recInfo_storeData] > 0 
58         || (*rec)->info[recInfo_filename] == 0)
59         zebra_create_stream_mem(stream, (*rec)->info[recInfo_storeData],
60                                 (*rec)->size[recInfo_storeData]);
61     else
62     {
63         char full_rep[1024];
64         int fd;
65             
66         if (zh->path_reg && !yaz_is_abspath((*rec)->info[recInfo_filename])){
67             strcpy(full_rep, zh->path_reg);
68             strcat(full_rep, "/");
69             strcat(full_rep, (*rec)->info[recInfo_filename]);
70         }
71         else
72             strcpy(full_rep, (*rec)->info[recInfo_filename]);
73             
74         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1){
75             yaz_log(YLOG_WARN|YLOG_ERRNO, "Retrieve fail; missing file: %s",
76                      full_rep);
77             rec_free(rec);
78             return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
79         }
80         zebra_create_stream_fd(stream, fd, recordAttr->recordOffset);
81     }
82     return 0;
83 }
84    
85
/* One entry of a parsed "::name[:type[:extra]],..." index specification. */
struct index_spec
{
    const char *index_name;  /* index name (always set by the parser) */
    const char *index_type;  /* index type, or 0 when none was given */
    const char *extra;       /* extra argument text, or 0 when none was given */
    struct index_spec *next; /* next entry of the list, 0 at the end */
};
92
93
94 struct index_spec *parse_index_spec(const char *elem, NMEM nmem,
95                                     int *error)
96 {
97     struct index_spec *first = 0;
98     struct index_spec **last = &first;
99     const char *cp = elem;
100
101     *error = 0;
102     if (cp[0] == ':' && cp[1] == ':')
103     {
104
105         cp++; /* skip first ':' */
106
107         for (;;)
108         {
109             const char *cp0;
110             struct index_spec *spec = nmem_malloc(nmem, sizeof(*spec));
111             spec->index_type = 0;
112             spec->next = 0;
113             spec->extra = 0;
114
115             if (!first)
116                 first = spec;
117             *last = spec;
118             last = &spec->next;
119
120             cp++; /* skip ',' or second ':' */
121             cp0 = cp;
122             while (*cp != ':' && *cp != '\0' && *cp != ',')
123                 cp++;
124             spec->index_name = nmem_strdupn(nmem, cp0, cp - cp0);
125             if (*cp == ':') /* type as well */
126             {
127                 cp++;
128                 cp0 = cp;
129                 
130                 while (*cp != '\0' && *cp != ',' && *cp != ':')
131                     cp++;
132                 spec->index_type = nmem_strdupn(nmem, cp0, cp - cp0);
133             }
134             if (*cp == ':') /* extra arguments */
135             {
136                 cp++;
137                 cp0 = cp;
138                 
139                 while (*cp != '\0' && *cp != ',' && *cp != ':')
140                     cp++;
141                 spec->extra = nmem_strdupn(nmem, cp0, cp - cp0);
142             }
143             if (*cp != ',')
144                 break;
145         }
146     }
147     if (*cp != '\0')
148         *error = 1;
149     return first;
150 }
151                             
/*
 * Split an element set suffix of the form "::index[:type]".
 *
 * On return *index / *type point into `elem` (they are not NUL-terminated
 * copies) and *index_len / *type_len hold their lengths; parts that are
 * absent are reported as pointer 0 and length 0.
 *
 * Returns 1 when `elem` is NULL, empty, or well-formed. Returns 0 when it
 * does not start with "::", when nothing follows the "::", or when a ':'
 * after the index is not followed by a type.
 */
static int parse_zebra_elem(const char *elem,
                            const char **index, size_t *index_len,
                            const char **type, size_t *type_len)
{
    const char *cp;

    *index = 0;
    *index_len = 0;

    *type = 0;
    *type_len = 0;

    if (!elem || !*elem)
        return 1;

    /* "::" must lead and something more must follow; the checks are
       ordered so that no character beyond the terminating NUL is read */
    if (elem[0] != ':' || elem[1] != ':' || elem[2] == '\0')
        return 0;

    /* pick out info from string after "::" */
    elem += 2;
    cp = strchr(elem, ':');

    if (!cp) /* index, no colon, no type */
    {
        *index = elem;
        *index_len = strlen(elem);
    }
    else if (cp[1] == '\0') /* colon, but no following type */
    {
        return 0;
    }
    else /* index, colon and type */
    {
        *index = elem;
        *index_len = cp - elem;
        *type = cp + 1;
        *type_len = strlen(cp + 1);
    }
    return 1;
}
195
196
197 int zebra_special_sort_fetch(
198     struct special_fetch_s *fi, const char *elemsetname,
199     const Odr_oid *input_format,
200     const Odr_oid **output_format,
201     WRBUF result, WRBUF addinfo)
202 {
203     const char *retrieval_index;
204     size_t retrieval_index_len; 
205     const char *retrieval_type;
206     size_t retrieval_type_len;
207     char retrieval_index_cstr[256];
208     char retrieval_type_cstr[256];
209     int ord;
210     ZebraHandle zh = fi->zh;
211
212     /* only accept XML and SUTRS requests */
213     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) 
214         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
215     {
216         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
217                 elemsetname);
218         *output_format = 0;
219         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
220     }
221     
222     if (!parse_zebra_elem(elemsetname,
223                           &retrieval_index, &retrieval_index_len,
224                           &retrieval_type,  &retrieval_type_len))
225     {
226         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
227     }
228     
229     if (retrieval_type_len == 0)
230         return -1;   /* must have a register type specified */
231     if (!retrieval_index_len ||
232         retrieval_index_len >= sizeof(retrieval_index_cstr)-1)
233     {
234         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
235     }
236         
237     memcpy(retrieval_index_cstr, retrieval_index, retrieval_index_len);
238     retrieval_index_cstr[retrieval_index_len] = '\0';
239
240     memcpy(retrieval_type_cstr, retrieval_type, retrieval_type_len);
241     retrieval_type_cstr[retrieval_type_len] = '\0';
242
243     ord = zebraExplain_lookup_attr_str(zh->reg->zei,
244                                        zinfo_index_category_sort,
245                                        retrieval_type_cstr,
246                                        retrieval_index_cstr);
247     if (ord == -1)
248         return -1;  /* is not a sort index */
249     else
250     {
251         char dst_buf[IT_MAX_WORD];
252         char str[IT_MAX_WORD];
253         const char *index_type;
254         const char *db = 0;
255         const char *string_index = 0;
256         WRBUF wrbuf = result;
257         
258         zebra_sort_sysno(zh->reg->sort_index, fi->sysno);
259         zebra_sort_type(zh->reg->sort_index, ord);
260         zebra_sort_read(zh->reg->sort_index, str);
261
262         zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, &string_index);
263         
264         zebra_term_untrans(zh, index_type, dst_buf, str);
265
266         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
267         {
268             *output_format = yaz_oid_recsyn_xml;
269             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
270                          " sysno=\"" ZINT_FORMAT "\""
271                          " set=\"zebra::index%s/\">\n",
272                          fi->sysno, elemsetname);
273
274             wrbuf_printf(wrbuf, "  <index name=\"%s\"", 
275                          string_index);
276             wrbuf_printf(wrbuf, " type=\"%s\">", index_type);
277             wrbuf_xmlputs(wrbuf, dst_buf);
278             wrbuf_printf(wrbuf, "</index>\n");
279             wrbuf_printf(wrbuf, "</record>\n");
280         }
281         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
282         {
283             *output_format = yaz_oid_recsyn_sutrs;
284             
285             wrbuf_printf(wrbuf, "%s %s %s\n", string_index, index_type,
286                          dst_buf);
287         }
288         return 0;
289     }
290 }
291                             
292 int zebra_special_index_fetch(
293     struct special_fetch_s *fi, const char *elemsetname,
294     const Odr_oid *input_format,
295     const Odr_oid **output_format,
296     WRBUF result, WRBUF addinfo,
297     Record rec)
298 {
299     const char *retrieval_index;
300     size_t retrieval_index_len; 
301     const char *retrieval_type;
302     size_t retrieval_type_len;
303     zebra_rec_keys_t keys;
304     int ret_code = 0;
305     char retrieval_type_cstr[256];
306     ZebraHandle zh = fi->zh;
307     
308     /* set output variables before processing possible error states */
309     /* *rec_lenp = 0; */
310
311     /* only accept XML and SUTRS requests */
312     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml)
313         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
314     {
315         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
316                 elemsetname);
317         *output_format = 0;
318         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
319     }
320
321     if (!parse_zebra_elem(elemsetname,
322                      &retrieval_index, &retrieval_index_len,
323                      &retrieval_type,  &retrieval_type_len))
324         return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
325
326     if (retrieval_type_len)
327     {
328         memcpy(retrieval_type_cstr, retrieval_type, retrieval_type_len);
329         retrieval_type_cstr[retrieval_type_len] = '\0';
330     }
331     
332     if (retrieval_index_len)
333     {
334         char retrieval_index_cstr[256];
335
336         if (retrieval_index_len < sizeof(retrieval_index_cstr) -1)
337         {
338             memcpy(retrieval_index_cstr, retrieval_index, retrieval_index_len);
339             retrieval_index_cstr[retrieval_index_len] = '\0';
340             
341             if (zebraExplain_lookup_attr_str(zh->reg->zei,
342                                              zinfo_index_category_index,
343                                              (retrieval_type_len == 0 ? 0 : 
344                                               retrieval_type_cstr),
345                                              retrieval_index_cstr) == -1)
346                 return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
347         }
348     }
349
350     keys = zebra_rec_keys_open();
351     zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys],
352                            rec->size[recInfo_delKeys], 0);
353
354     if (!zebra_rec_keys_rewind(keys))
355     {
356         ret_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
357     }
358     else
359     {
360         size_t slen;
361         const char *str;
362         struct it_key key_in;
363         WRBUF wrbuf = result;
364     
365         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
366         {
367             *output_format = input_format;
368             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
369                          " sysno=\"" ZINT_FORMAT "\""
370                          " set=\"zebra::index%s/\">\n",
371                          fi->sysno, elemsetname);
372         }
373         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
374             *output_format = input_format;
375
376         while (zebra_rec_keys_read(keys, &str, &slen, &key_in))
377         {
378             int i;
379             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
380             const char *index_type;
381             const char *db = 0;
382             const char *string_index = 0;
383             size_t string_index_len;
384             char dst_buf[IT_MAX_WORD];
385             
386             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db,
387                                     &string_index);
388             string_index_len = strlen(string_index);
389
390             /* process only if index is not defined, 
391                or if defined and matching */
392             if (retrieval_index == 0 
393                 || (string_index_len == retrieval_index_len 
394                     && !memcmp(string_index, retrieval_index,
395                                string_index_len)))
396             {
397                 /* process only if type is not defined, or is matching */
398                 if (retrieval_type == 0 
399                     || !strcmp(retrieval_type_cstr, index_type))
400                 {
401                     if (zebra_term_untrans(zh, index_type, dst_buf, str))
402                         *dst_buf = '\0'; /* untrans failed */
403
404                     if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
405                     {
406                         wrbuf_printf(wrbuf, "  <index name=\"%s\"", 
407                                      string_index);
408                         
409                         wrbuf_printf(wrbuf, " type=\"%s\"", index_type);
410                         
411                         wrbuf_printf(wrbuf, " seq=\"" ZINT_FORMAT "\">", 
412                                      key_in.mem[key_in.len -1]);
413                         wrbuf_xmlputs(wrbuf, dst_buf);
414                         wrbuf_printf(wrbuf, "</index>\n");
415                     }
416                     else 
417                     {
418                         wrbuf_printf(wrbuf, "%s ", string_index);
419                         
420                         wrbuf_printf(wrbuf, "%s", index_type);
421                         
422                         for (i = 1; i < key_in.len; i++)
423                             wrbuf_printf(wrbuf, " " ZINT_FORMAT, 
424                                              key_in.mem[i]);
425                         
426                         wrbuf_printf(wrbuf, " %s", dst_buf);
427                         
428                         wrbuf_printf(wrbuf, "\n");
429
430                     }
431                     
432                 }
433             }
434         }
435         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
436             wrbuf_printf(wrbuf, "</record>\n");
437     }
438     zebra_rec_keys_close(keys);
439     return ret_code;
440 }
441
442
443 static void retrieve_puts_attr(WRBUF wrbuf, const char *name,
444                                const char *value)
445 {
446     if (value)
447     {
448         wrbuf_printf(wrbuf, " %s=\"", name);
449         wrbuf_xmlputs(wrbuf, value);
450         wrbuf_printf(wrbuf, "\"");
451     }
452 }
453
454 static void retrieve_puts_attr_int(WRBUF wrbuf, const char *name,
455                                const int value)
456 {
457     wrbuf_printf(wrbuf, " %s=\"%i\"", name, value);
458 }
459
460 static void retrieve_puts_str(WRBUF wrbuf, const char *name,
461                                const char *value)
462 {
463     if (value)
464         wrbuf_printf(wrbuf, "%s %s\n", name, value);
465 }
466
467 static void retrieve_puts_int(WRBUF wrbuf, const char *name,
468                                const int value)
469 {
470     wrbuf_printf(wrbuf, "%s %i\n", name, value);
471 }
472
473
474 static void snippet_xml_record(ZebraHandle zh, WRBUF wrbuf, zebra_snippets *doc)
475 {
476     const zebra_snippet_word *doc_w;
477     int mark_state = 0;
478
479     wrbuf_printf(wrbuf, "%s>\n", ZEBRA_XML_HEADER_STR);
480     for (doc_w = zebra_snippets_constlist(doc); doc_w; doc_w = doc_w->next)
481     {
482         if (doc_w->mark)
483         {
484             const char *index_type;
485             const char *db = 0;
486             const char *string_index = 0;
487
488             zebraExplain_lookup_ord(zh->reg->zei, doc_w->ord, 
489                                     &index_type, &db, &string_index);
490
491             if (mark_state == 0)
492             {
493                 wrbuf_printf(wrbuf, "  <snippet name=\"%s\"",  string_index);
494                 wrbuf_printf(wrbuf, " type=\"%s\">", index_type);
495             }
496             if (doc_w->match)
497                 wrbuf_puts(wrbuf, "<s>");
498             /* not printing leading ws */
499             if (mark_state || !doc_w->ws || doc_w->match) 
500                 wrbuf_xmlputs(wrbuf, doc_w->term);
501             if (doc_w->match)
502                 wrbuf_puts(wrbuf, "</s>");
503         }
504         else if (mark_state == 1)
505         {
506             wrbuf_puts(wrbuf, "</snippet>\n");
507         }
508         mark_state = doc_w->mark;
509     }
510     if (mark_state == 1)
511     {
512         wrbuf_puts(wrbuf, "</snippet>\n");
513     }
514     wrbuf_printf(wrbuf, "</record>");
515 }
516
517 int zebra_get_rec_snippets(ZebraHandle zh, zint sysno,
518                            zebra_snippets *snippets)
519 {
520     int return_code = 0;
521     Record rec = rec_get(zh->reg->records, sysno);
522     if (!rec)
523     {
524         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
525         return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
526     }
527     else
528     {
529         const char *file_type = rec->info[recInfo_fileType];
530         void *recTypeClientData;
531         RecType rt = recType_byName(zh->reg->recTypes, zh->res,
532                                     file_type, &recTypeClientData);
533
534         if (!rt)
535             return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
536         else
537         {
538             struct ZebraRecStream stream;
539             return_code = zebra_create_record_stream(zh, &rec, &stream);
540             if (return_code == 0)
541             {
542                 extract_snippet(zh, snippets, &stream,
543                                 rt, recTypeClientData);
544
545                 stream.destroy(&stream);
546             }
547         }
548         rec_free(&rec);
549     }
550     return return_code;
551 }
552
553 static int snippet_fetch(
554     struct special_fetch_s *fi, const char *elemsetname,
555     const Odr_oid *input_format,
556     const Odr_oid **output_format,
557     WRBUF result, WRBUF addinfo)
558 {
559     ZebraHandle zh = fi->zh;
560     zebra_snippets *rec_snippets = zebra_snippets_create();
561     int return_code = zebra_get_rec_snippets(zh, fi->sysno, rec_snippets);
562
563     if (!return_code)
564     {
565         WRBUF wrbuf = result;
566         zebra_snippets *hit_snippet = zebra_snippets_create();
567
568         zebra_snippets_hit_vector(zh, fi->setname, fi->sysno, hit_snippet);
569
570 #if 0
571         /* for debugging purposes */
572         yaz_log(YLOG_LOG, "---------------------------");
573         yaz_log(YLOG_LOG, "REC SNIPPET:");
574         zebra_snippets_log(rec_snippets, YLOG_LOG, 1);
575         yaz_log(YLOG_LOG, "---------------------------");
576         yaz_log(YLOG_LOG, "HIT SNIPPET:");
577         zebra_snippets_log(hit_snippet, YLOG_LOG, 1);
578 #endif
579         
580         zebra_snippets_ring(rec_snippets, hit_snippet, 5, 5);
581         
582 #if 0
583         yaz_log(YLOG_LOG, "---------------------------");
584         yaz_log(YLOG_LOG, "RING SNIPPET:");
585         zebra_snippets_log(rec_snippets, YLOG_LOG, 1);
586 #endif
587         snippet_xml_record(zh, wrbuf, rec_snippets);
588         
589         *output_format = yaz_oid_recsyn_xml;
590         
591         zebra_snippets_destroy(hit_snippet);
592     }
593     zebra_snippets_destroy(rec_snippets);
594     return return_code;
595 }
596
597 struct term_collect {
598     const char *term;
599     int oc;
600     zint set_occur;
601 };
602
603 zint freq_term(ZebraHandle zh, int ord, const char *term, RSET rset_set)
604 {
605     struct rset_key_control *kc = zebra_key_control_create(zh);
606     char ord_buf[IT_MAX_WORD];
607     int ord_len = key_SU_encode(ord, ord_buf);
608     char *info;
609     zint hits = 0;
610     NMEM nmem = nmem_create();
611     
612     strcpy(ord_buf + ord_len, term);
613     
614     info = dict_lookup(zh->reg->dict, ord_buf);
615     if (info)
616     {
617         ISAM_P isam_p;
618         RSET rsets[2], rset;
619         memcpy(&isam_p, info+1, sizeof(ISAM_P));
620         
621         rsets[0] = zebra_create_rset_isam(zh, nmem, kc, kc->scope, isam_p, 0);
622         rsets[1] = rset_dup(rset_set);
623         
624         rset = rset_create_and(nmem, kc, kc->scope, 2, rsets);
625
626         zebra_count_set(zh, rset, &hits, zh->approx_limit);
627
628         rset_delete(rsets[0]);
629         rset_delete(rset);
630     }
631     (*kc->dec)(kc);
632     nmem_destroy(nmem);
633     return hits;
634 }
635
636 int term_qsort_handle(const void *a, const void *b)
637 {
638     const struct term_collect *l = a;
639     const struct term_collect *r = b;
640     if (l->set_occur < r->set_occur)
641         return 1;
642     else if (l->set_occur > r->set_occur)
643         return -1;
644     else
645     {
646         const char *lterm = l->term ? l->term : "";
647         const char *rterm = r->term ? r->term : "";
648         return strcmp(lterm, rterm);
649     }
650 }
651
652 void term_collect_freq(ZebraHandle zh,
653                        struct term_collect *col, int no_terms_collect,
654                        int ord, RSET rset)
655 {
656     int i;
657     for (i = 0; i < no_terms_collect; i++)
658     {
659         if (col[i].term)
660             col[i].set_occur = freq_term(zh, ord, col[i].term, rset);
661     }
662     qsort(col, no_terms_collect, sizeof(*col), term_qsort_handle);
663 }
664
665 struct term_collect *term_collect_create(zebra_strmap_t sm, 
666                                          int no_terms_collect,
667                                          NMEM nmem)
668 {
669     const char *term;
670     void *data_buf;
671     size_t data_len;
672     zebra_strmap_it it;
673     struct term_collect *col = nmem_malloc(nmem, 
674                                            sizeof *col *no_terms_collect);
675     int i;
676     for (i = 0; i < no_terms_collect; i++)
677     {
678         col[i].term = 0;
679         col[i].oc = 0;
680         col[i].set_occur = 0;
681     }
682     /* iterate over terms and collect the most frequent ones */
683     it = zebra_strmap_it_create(sm);
684     while ((term = zebra_strmap_it_next(it, &data_buf, &data_len)))
685     {
686         /* invariant:
687            col[0] has lowest oc .  col[no_terms_collect-1] has highest oc */
688         int oc = *(int*) data_buf;
689         int j = 0;
690         /* insertion may be slow but terms terms will be "infrequent" and
691            thus number of iterations should be small below 
692         */
693         while (j < no_terms_collect && oc > col[j].oc)
694             j++;
695         if (j) 
696         {   /* oc <= col[j] and oc > col[j-1] */
697             --j;
698             memmove(col, col+1, sizeof(*col) * j);
699             col[j].term = term;
700             col[j].oc = oc;
701         }
702     }
703     zebra_strmap_it_destroy(it);
704     return col;
705 }
706
707 static ZEBRA_RES facet_fetch(
708     struct special_fetch_s *fi, const char *elemsetname,
709     const Odr_oid *input_format,
710     const Odr_oid **output_format,
711     WRBUF result, WRBUF addinfo)
712 {
713     zint *pos_array;
714     int i;
715     int num_recs = 10; /* number of records to analyze */
716     int max_chunks = 2;
717     ZebraMetaRecord *poset;
718     ZEBRA_RES ret = ZEBRA_OK;
719     int *ord_array;
720     WRBUF wr = result;
721     int use_xml = 0;
722     int no_ord = 0;
723     struct index_spec *spec, *spec_list;
724     int error;
725     ZebraHandle zh = fi->zh;
726
727     res_get_int(zh->res, "facetNumRecs", &num_recs);
728     res_get_int(zh->res, "facetMaxChunks", &max_chunks);
729
730     /* see if XML is required for response */
731     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) == 0)
732         use_xml = 1;
733
734     spec_list = parse_index_spec(elemsetname, fi->nmem, &error);
735               
736     if (!spec_list || error)
737     {
738         zebra_setError(
739             zh, 
740             YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_,
741             0);
742         return ZEBRA_FAIL;
743     }          
744   
745     for (spec = spec_list; spec; spec = spec->next)
746     {
747         if (!spec->index_type)
748         {
749             zebra_setError(
750                 zh, 
751                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_,
752                 0);
753             return ZEBRA_FAIL;
754         }
755         no_ord++;
756     }
757
758     ord_array = nmem_malloc(fi->nmem, sizeof(*ord_array) * no_ord);
759
760     for (spec = spec_list, i = 0; spec; spec = spec->next, i++)
761     {
762         int ord = zebraExplain_lookup_attr_str(zh->reg->zei,
763                                                zinfo_index_category_index,
764                                                spec->index_type,
765                                                spec->index_name);
766         if (ord == -1)
767         {
768             zebra_setError(
769                 zh, 
770                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_,
771                 0);
772             return ZEBRA_FAIL;
773         }
774         ord_array[i] = ord;
775     }
776     pos_array = (zint *) nmem_malloc(fi->nmem, num_recs * sizeof(*pos_array));
777     for (i = 0; i < num_recs; i++)
778         pos_array[i] = i+1;
779     poset = zebra_meta_records_create(zh, fi->setname, num_recs, pos_array);
780     if (!poset)
781     {
782         zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST,
783                        fi->setname);
784         ret = ZEBRA_FAIL;
785     }
786     else
787     {
788         yaz_timing_t timing = yaz_timing_create();
789         zebra_strmap_t *map_array
790             = nmem_malloc(fi->nmem, sizeof *map_array * no_ord);
791         for (i = 0; i < no_ord; i++)
792             map_array[i] = zebra_strmap_create();
793
794         for (i = 0; i < num_recs; i++)
795         {
796             int j;
797             zint sysnos[MAX_SYSNOS_PER_RECORD];
798             int no_sysnos = MAX_SYSNOS_PER_RECORD;
799             if (!poset[i].sysno)
800                 continue;
801             ret = zebra_result_recid_to_sysno(zh, fi->setname,
802                                               poset[i].sysno,
803                                               sysnos, &no_sysnos);
804             assert(no_sysnos > 0);
805             yaz_log(YLOG_DEBUG, "Analyzing rec=%d ISAM sysno=" ZINT_FORMAT " chunks=%d",
806                     i, poset[i].sysno, no_sysnos);
807             for (j = 0; j < no_sysnos && j < max_chunks; j++)
808             {
809                 size_t slen;
810                 const char *str;
811                 struct it_key key_in;
812                 Record rec = rec_get(zh->reg->records, sysnos[j]);
813                 zebra_rec_keys_t keys = zebra_rec_keys_open();
814                 zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys],
815                                        rec->size[recInfo_delKeys], 0);
816
817                 yaz_log(YLOG_DEBUG, "rec %d " ZINT_FORMAT " %s", 
818                         j, sysnos[j], zebra_rec_keys_empty(keys) ? "empty" : "non-empty");
819                 if (zebra_rec_keys_rewind(keys))
820                 {
821                     while (zebra_rec_keys_read(keys, &str, &slen, &key_in))
822                     {
823                         int i;
824                         struct index_spec *spec;
825                         for (spec = spec_list, i = 0; i < no_ord; 
826                              i++, spec = spec->next)
827                         {
828                             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
829                             if (ord == ord_array[i] && 
830                                 str[0] != FIRST_IN_FIELD_CHAR)
831                             {
832                                 int *freq;
833                                 zebra_strmap_t sm = map_array[i];
834                                 
835                                 freq = zebra_strmap_lookup(sm, str, 0, 0);
836                                 if (freq)
837                                     (*freq)++;
838                                 else
839                                 {
840                                     int v = 1;
841                                     zebra_strmap_add(sm, str, &v, sizeof v);
842                                 }
843                             }
844                         }
845                     }
846                 }
847                 zebra_rec_keys_close(keys);
848                 rec_free(&rec);
849             }
850         }
851         yaz_timing_stop(timing);
852         yaz_log(YLOG_LOG, "facet first phase real=%4.2f",
853                 yaz_timing_get_real(timing));
854         yaz_timing_start(timing);
855         if (use_xml)
856             wrbuf_puts(wr, "<facets>\n");
857         for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next)
858         {
859             int j;
860             NMEM nmem = nmem_create();
861             struct term_collect *col;
862             int no_collect_terms = 20;
863
864             if (spec->extra)
865                 no_collect_terms = atoi(spec->extra);
866             if (no_collect_terms < 1)
867                 no_collect_terms = 1;
868             col = term_collect_create(map_array[i], no_collect_terms, nmem);
869             term_collect_freq(zh, col, no_collect_terms, ord_array[i],
870                               resultSetRef(zh, fi->setname));
871             
872             if (use_xml)
873                 wrbuf_printf(wr, "  <facet type=\"%s\" index=\"%s\">\n",
874                              spec->index_type, spec->index_name);
875             else
876                 wrbuf_printf(wr, "facet %s %s\n",
877                              spec->index_type, spec->index_name);
878             for (j = 0; j < no_collect_terms; j++)
879             {
880                 if (col[j].term)
881                 {
882                     char dst_buf[IT_MAX_WORD];
883                     zebra_term_untrans(zh, spec->index_type, dst_buf, col[j].term);
884                     if (use_xml)
885                     {
886                         wrbuf_printf(wr, "    <term coccur=\"%d\"", col[j].oc);
887                         if (col[j].set_occur)
888                             wrbuf_printf(wr, " occur=\"" ZINT_FORMAT "\"", 
889                                          col[j].set_occur);
890                         wrbuf_printf(wr, ">");
891                         wrbuf_xmlputs(wr, dst_buf);
892                         wrbuf_printf(wr, "</term>\n");
893                     }
894                     else
895                     {
896                         wrbuf_printf(wr, "term %d", col[j].oc);
897                         if (col[j].set_occur)
898                             wrbuf_printf(wr, " " ZINT_FORMAT, 
899                                          col[j].set_occur);
900                         wrbuf_printf(wr, ": %s\n", dst_buf);
901                     }
902                 }
903             }
904             if (use_xml)
905                 wrbuf_puts(wr, "  </facet>\n");
906             nmem_destroy(nmem);
907         }
908         if (use_xml)
909             wrbuf_puts(wr, "</facets>\n");
910         for (i = 0; i < no_ord; i++)
911             zebra_strmap_destroy(map_array[i]);
912         yaz_timing_stop(timing);
913         yaz_log(YLOG_LOG, "facet second phase real=%4.2f",
914                 yaz_timing_get_real(timing));
915         yaz_timing_destroy(&timing);
916     }
917     *output_format = yaz_oid_recsyn_xml;
918     zebra_meta_records_destroy(zh, poset, num_recs);
919     return ret;
920 }
921
922
923 int zebra_special_fetch(
924     void *handle, const char *elemsetname,
925     const Odr_oid *input_format,
926     const Odr_oid **output_format,
927     WRBUF result, WRBUF addinfo
928     )
929 {
930     Record rec = 0;
931     struct special_fetch_s *fi = (struct special_fetch_s *) handle;
932     ZebraHandle zh = fi->zh;
933     zint sysno = fi->sysno;
934     
935     /* set output variables before processing possible error states */
936     /* *rec_lenp = 0; */
937
938     if (elemsetname && 0 == strncmp(elemsetname, "facet", 5))
939     {
940         return facet_fetch(fi, elemsetname + 5, 
941                            input_format, output_format,
942                            result, addinfo);
943     }
944
945     if (elemsetname && 0 == strcmp(elemsetname, "snippet"))
946     {
947         return snippet_fetch(fi, elemsetname + 7,
948                              input_format, output_format,
949                              result, addinfo);
950     }
951
952     /* processing zebra::meta::sysno elemset without fetching binary data */
953     if (elemsetname && 0 == strcmp(elemsetname, "meta::sysno"))
954     {
955         int ret = 0;
956         WRBUF wrbuf = result;
957         if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
958         {
959             wrbuf_printf(wrbuf, ZINT_FORMAT, fi->sysno);
960             *output_format = input_format;
961         } 
962         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
963         {
964             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
965                          " sysno=\"" ZINT_FORMAT "\"/>\n",
966                          fi->sysno);
967             *output_format = input_format;
968         }
969         if (wrbuf_len(wrbuf) == 0)
970             ret = YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
971         return ret;
972     }
973
974     /* processing special elementsetname zebra::index:: for sort elements */
975     if (elemsetname && 0 == strncmp(elemsetname, "index", 5))
976     {
977         int ret = zebra_special_sort_fetch(
978             fi, elemsetname + 5,
979             input_format, output_format,
980             result, addinfo);
981         if (ret != -1)
982             return ret;
983         /* not a sort index so we continue to get the full record */
984     }
985
986
987     /* fetching binary record up for all other display elementsets */
988     rec = rec_get(zh->reg->records, sysno);
989     if (!rec)
990     {
991         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
992         return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
993     }
994
995     /* processing special elementsetnames zebra::data */    
996     if (elemsetname && 0 == strcmp(elemsetname, "data"))
997     {
998         struct ZebraRecStream stream;
999         RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, rec); 
1000         char *b;
1001
1002         zebra_create_record_stream(zh, &rec, &stream);
1003         *output_format = input_format;
1004
1005         b = nmem_malloc(fi->nmem, recordAttr->recordSize);
1006         stream.readf(&stream, b, recordAttr->recordSize);
1007         wrbuf_write(result, b, recordAttr->recordSize);
1008
1009         stream.destroy(&stream);
1010         rec_free(&rec);
1011         return 0;
1012     }
1013
1014     /* only accept XML and SUTRS requests from now */
1015     if (oid_oidcmp(input_format, yaz_oid_recsyn_xml)
1016         && oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
1017     {
1018         yaz_log(YLOG_WARN, "unsupported format for element set zebra::%s", 
1019                 elemsetname);
1020         return YAZ_BIB1_NO_SYNTAXES_AVAILABLE_FOR_THIS_REQUEST;
1021     }
1022     
1023     /* processing special elementsetnames zebra::meta:: */
1024     if (elemsetname && 0 == strcmp(elemsetname, "meta"))
1025     {
1026         int ret = 0;
1027         WRBUF wrbuf = result;
1028         RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, rec); 
1029
1030         if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml))
1031         {
1032             *output_format = input_format;
1033             
1034             wrbuf_printf(wrbuf, ZEBRA_XML_HEADER_STR
1035                          " sysno=\"" ZINT_FORMAT "\"", sysno);
1036             retrieve_puts_attr(wrbuf, "base", rec->info[recInfo_databaseName]);
1037             retrieve_puts_attr(wrbuf, "file", rec->info[recInfo_filename]);
1038             retrieve_puts_attr(wrbuf, "type", rec->info[recInfo_fileType]);
1039             if (fi->score >= 0)
1040                 retrieve_puts_attr_int(wrbuf, "score", fi->score);
1041            
1042             wrbuf_printf(wrbuf,
1043                          " rank=\"" ZINT_FORMAT "\""
1044                          " size=\"%i\""
1045                          " set=\"zebra::%s\"/>\n",
1046                          recordAttr->staticrank,
1047                          recordAttr->recordSize,
1048                          elemsetname);
1049         }
1050         else if (!oid_oidcmp(input_format, yaz_oid_recsyn_sutrs))
1051         {
1052             *output_format = input_format;
1053             wrbuf_printf(wrbuf, "sysno " ZINT_FORMAT "\n", sysno);
1054             retrieve_puts_str(wrbuf, "base", rec->info[recInfo_databaseName]);
1055             retrieve_puts_str(wrbuf, "file", rec->info[recInfo_filename]);
1056             retrieve_puts_str(wrbuf, "type", rec->info[recInfo_fileType]);
1057             if (fi->score >= 0)
1058                 retrieve_puts_int(wrbuf, "score", fi->score);
1059
1060             wrbuf_printf(wrbuf,
1061                          "rank " ZINT_FORMAT "\n"
1062                          "size %i\n"
1063                          "set zebra::%s\n",
1064                          recordAttr->staticrank,
1065                          recordAttr->recordSize,
1066                          elemsetname);
1067         }
1068         if (wrbuf_len(wrbuf) == 0)
1069             ret = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1070
1071         rec_free(&rec);
1072         return ret;
1073     }
1074
1075     /* processing special elementsetnames zebra::index:: */
1076     if (elemsetname && 0 == strncmp(elemsetname, "index", 5))
1077     {
1078         int ret = zebra_special_index_fetch(
1079             fi, elemsetname + 5,
1080             input_format, output_format,
1081             result, addinfo, rec);
1082         rec_free(&rec);
1083         return ret;
1084     }
1085
1086     if (rec)
1087         rec_free(&rec);
1088     return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1089 }
1090
1091 int zebra_record_fetch(ZebraHandle zh, const char *setname,
1092                        zint sysno, int score,
1093                        ODR odr,
1094                        const Odr_oid *input_format, Z_RecordComposition *comp,
1095                        const Odr_oid **output_format,
1096                        char **rec_bufp, int *rec_lenp, char **basenamep,
1097                        char **addinfo)
1098 {
1099     Record rec;
1100     char *fname, *file_type, *basename;
1101     const char *elemsetname;
1102     struct ZebraRecStream stream;
1103     RecordAttr *recordAttr;
1104     void *clientData;
1105     int return_code = 0;
1106     zint sysnos[MAX_SYSNOS_PER_RECORD];
1107     int no_sysnos = MAX_SYSNOS_PER_RECORD;
1108     ZEBRA_RES res;
1109     struct special_fetch_s fetch_info;
1110
1111     res = zebra_result_recid_to_sysno(zh, setname, sysno, sysnos, &no_sysnos);
1112     if (res != ZEBRA_OK)
1113         return ZEBRA_FAIL;
1114
1115     sysno = sysnos[0];
1116     *basenamep = 0;
1117     *addinfo = 0;
1118     elemsetname = yaz_get_esn(comp);
1119
1120     fetch_info.zh = zh;
1121     fetch_info.setname = setname;
1122     fetch_info.sysno = sysno;
1123     fetch_info.score = score;
1124     fetch_info.nmem = odr->mem;
1125
1126     /* processing zebra special elementset names of form 'zebra:: */
1127     if (elemsetname && 0 == strncmp(elemsetname, "zebra::", 7))
1128     {
1129         WRBUF result = wrbuf_alloc();
1130         WRBUF addinfo_w = wrbuf_alloc();
1131         int r = zebra_special_fetch(&fetch_info, elemsetname + 7,
1132                                     input_format, output_format,
1133                                     result, addinfo_w);
1134         if (r == 0)
1135         {
1136             *rec_bufp = odr_strdup(odr, wrbuf_cstr(result));
1137             *rec_lenp = wrbuf_len(result);
1138         }
1139         else 
1140         {
1141             if (wrbuf_len(addinfo_w))
1142                 *addinfo = odr_strdup(odr, wrbuf_cstr(addinfo_w));
1143         }
1144         wrbuf_destroy(result);
1145         wrbuf_destroy(addinfo_w);
1146         return r;
1147     }
1148
1149     /* processing all other element set names */
1150     rec = rec_get(zh->reg->records, sysno);
1151     if (!rec)
1152     {
1153         yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno);
1154         *basenamep = 0;
1155         return YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1156     }
1157
1158
1159     recordAttr = rec_init_attr(zh->reg->zei, rec);
1160
1161     file_type = rec->info[recInfo_fileType];
1162     fname = rec->info[recInfo_filename];
1163     basename = rec->info[recInfo_databaseName];
1164     *basenamep = (char *) odr_malloc(odr, strlen(basename)+1);
1165     strcpy(*basenamep, basename);
1166
1167     yaz_log(YLOG_DEBUG, "retrieve localno=" ZINT_FORMAT " score=%d",
1168             sysno, score);
1169
1170     return_code = zebra_create_record_stream(zh, &rec, &stream);
1171
1172     if (rec)
1173     {
1174         RecType rt;
1175         struct recRetrieveCtrl retrieveCtrl;
1176
1177         retrieveCtrl.stream = &stream;
1178         retrieveCtrl.fname = fname;
1179         retrieveCtrl.localno = sysno;
1180         retrieveCtrl.staticrank = recordAttr->staticrank;
1181         retrieveCtrl.score = score;
1182         retrieveCtrl.recordSize = recordAttr->recordSize;
1183         retrieveCtrl.odr = odr;
1184         retrieveCtrl.input_format = retrieveCtrl.output_format = input_format;
1185         retrieveCtrl.comp = comp;
1186         retrieveCtrl.encoding = zh->record_encoding;
1187         retrieveCtrl.diagnostic = 0;
1188         retrieveCtrl.addinfo = 0;
1189         retrieveCtrl.dh = zh->reg->dh;
1190         retrieveCtrl.res = zh->res;
1191         retrieveCtrl.rec_buf = 0;
1192         retrieveCtrl.rec_len = -1;
1193         retrieveCtrl.handle = &fetch_info;
1194         retrieveCtrl.special_fetch = zebra_special_fetch;
1195
1196         if (!(rt = recType_byName(zh->reg->recTypes, zh->res,
1197                                   file_type, &clientData)))
1198         {
1199             char addinfo_str[100];
1200
1201             sprintf(addinfo_str, "Could not handle record type %.40s",
1202                     file_type);
1203                     
1204             *addinfo = odr_strdup(odr, addinfo_str);
1205             return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1206         }
1207         else
1208         {
1209             (*rt->retrieve)(clientData, &retrieveCtrl);
1210             return_code = retrieveCtrl.diagnostic;
1211
1212             *output_format = retrieveCtrl.output_format;
1213             *rec_bufp = (char *) retrieveCtrl.rec_buf;
1214             *rec_lenp = retrieveCtrl.rec_len;
1215             *addinfo = retrieveCtrl.addinfo;
1216         }
1217
1218         stream.destroy(&stream);
1219         rec_free(&rec);
1220     }
1221
1222     return return_code;
1223 }
1224
1225 /*
1226  * Local variables:
1227  * c-basic-offset: 4
1228  * indent-tabs-mode: nil
1229  * End:
1230  * vim: shiftwidth=4 tabstop=8 expandtab
1231  */
1232