Increment seqno for complete fields (non-ICU)
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
45 static int log_level_extract = 0;
46 static int log_level_details = 0;
47 static int log_level_initialized = 0;
48
/* 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52
53 #if FLUSH2
54 static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
55                                        zebra_rec_keys_t ins_keys,
56                                        zint ins_rank,
57                                        zebra_rec_keys_t del_keys,
58                                        zint del_rank);
59 #else
60 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
61                                       int cmd,
62                                       zebra_rec_keys_t reckeys,
63                                       zint staticrank);
64 #endif
65
66 static void zebra_init_log_level(void)
67 {
68     if (!log_level_initialized)
69     {
70         log_level_initialized = 1;
71
72         log_level_extract = yaz_log_module_level("extract");
73         log_level_details = yaz_log_module_level("indexdetails");
74     }
75 }
76
77 static WRBUF wrbuf_hex_str(const char *cstr)
78 {
79     size_t i;
80     WRBUF w = wrbuf_alloc();
81     for (i = 0; cstr[i]; i++)
82     {
83         if (cstr[i] < ' ' || cstr[i] > 126)
84             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
85         else
86             wrbuf_putc(w, cstr[i]);
87     }
88     return w;
89 }
90
91
92 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
93                                     int cmd, zebra_rec_keys_t skp);
94 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
95 static void extract_token_add(RecWord *p);
96
97 static void check_log_limit(ZebraHandle zh)
98 {
99     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
100     {
101         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
102                 zh->m_file_verbose_limit);
103     }
104 }
105
106 static void logRecord(ZebraHandle zh)
107 {
108     check_log_limit(zh);
109     ++zh->records_processed;
110     if (!(zh->records_processed % 1000))
111     {
112         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
113                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
114                 zh->records_processed, zh->records_inserted, 
115                 zh->records_updated, zh->records_deleted);
116     }
117 }
118
119 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
120 {
121     ctrl->flagShowRecords = !zh->m_flag_rw;
122 }
123
124
125 static void extract_add_index_string(RecWord *p, 
126                                       zinfo_index_category_t cat,
127                                       const char *str, int length);
128
129 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
130
131 static void extract_init(struct recExtractCtrl *p, RecWord *w)
132 {
133     w->seqno = 1;
134     w->index_name = "any";
135     w->index_type = "w";
136     w->extractCtrl = p;
137     w->record_id = 0;
138     w->section_id = 0;
139     w->segment = 0;
140 }
141
142 struct snip_rec_info {
143     ZebraHandle zh;
144     zebra_snippets *snippets;
145 };
146
147
148 static void snippet_add_complete_field(RecWord *p, int ord,
149                                        zebra_map_t zm)
150 {
151     struct snip_rec_info *h = p->extractCtrl->handle;
152
153     const char *b = p->term_buf;
154     char buf[IT_MAX_WORD+1];
155     const char **map = 0;
156     int i = 0, remain = p->term_len;
157     const char *start = b;
158     const char *last = 0;
159
160     if (remain > 0)
161         map = zebra_maps_input(zm, &b, remain, 1);
162
163     while (remain > 0 && i < IT_MAX_WORD)
164     {
165         while (map && *map && **map == *CHR_SPACE)
166         {
167             remain = p->term_len - (b - p->term_buf);
168
169             if (i == 0)
170                 start = b;  /* set to first non-ws area */
171             if (remain > 0)
172             {
173                 int first = i ? 0 : 1;  /* first position */
174
175                 map = zebra_maps_input(zm, &b, remain, first);
176             }
177             else
178                 map = 0;
179         }
180         if (!map)
181             break;
182
183         if (i && i < IT_MAX_WORD)
184             buf[i++] = *CHR_SPACE;
185         while (map && *map && **map != *CHR_SPACE)
186         {
187             const char *cp = *map;
188
189             if (**map == *CHR_CUT)
190             {
191                 i = 0;
192             }
193             else
194             {
195                 if (i >= IT_MAX_WORD)
196                     break;
197                 while (i < IT_MAX_WORD && *cp)
198                     buf[i++] = *(cp++);
199             }
200             last = b;
201             remain = p->term_len  - (b - p->term_buf);
202             if (remain > 0)
203             {
204                 map = zebra_maps_input(zm, &b, remain, 0);
205             }
206             else
207                 map = 0;
208         }
209     }
210     if (!i)
211         return;
212     if (last && start != last && zebra_maps_is_index(zm))
213         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
214                                start, last - start);
215     p->seqno++;
216 }
217
218 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
219 {
220     struct snip_rec_info *h = p->extractCtrl->handle;
221     const char *b = p->term_buf;
222     int remain = p->term_len;
223     int first = 1;
224     const char **map = 0;
225     const char *start = b;
226     const char *last = b;
227
228     if (remain > 0)
229         map = zebra_maps_input(zm, &b, remain, 0);
230
231     while (map)
232     {
233         char buf[IT_MAX_WORD+1];
234         int i, remain;
235
236         /* Skip spaces */
237         while (map && *map && **map == *CHR_SPACE)
238         {
239             remain = p->term_len - (b - p->term_buf);
240             last = b;
241             if (remain > 0)
242                 map = zebra_maps_input(zm, &b, remain, 0);
243             else
244                 map = 0;
245         }
246         if (!map)
247             break;
248         if (start != last && zebra_maps_is_index(zm))
249         {
250             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
251                                    start, last - start);
252
253         }
254         start = last;
255
256         i = 0;
257         while (map && *map && **map != *CHR_SPACE)
258         {
259             const char *cp = *map;
260
261             while (i < IT_MAX_WORD && *cp)
262                 buf[i++] = *(cp++);
263             remain = p->term_len - (b - p->term_buf);
264             last = b;
265             if (remain > 0)
266                 map = zebra_maps_input(zm, &b, remain, 0);
267             else
268                 map = 0;
269         }
270         if (!i)
271             return;
272
273         if (first)
274         {   
275             first = 0;
276             if (zebra_maps_is_first_in_field(zm))
277             {
278                 /* first in field marker */
279                 p->seqno++;
280             }
281         }
282         if (start != last && zebra_maps_is_index(zm))
283             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
284                                    start, last - start);
285         start = last;
286         p->seqno++;
287     }
288
289 }
290
291 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
292 {
293     struct snip_rec_info *h = p->extractCtrl->handle;
294
295     const char *res_buf = 0;
296     size_t res_len = 0;
297
298     const char *display_buf = 0;
299     size_t display_len = 0;
300
301     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
302     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
303                                    &display_buf, &display_len))
304     {
305         if (zebra_maps_is_index(zm))
306             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
307                                    display_buf, display_len);
308         p->seqno++;
309     }
310 }
311
312 static void snippet_token_add(RecWord *p)
313 {
314     struct snip_rec_info *h = p->extractCtrl->handle;
315     ZebraHandle zh = h->zh;
316     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
317
318     if (zm)
319     {
320         ZebraExplainInfo zei = zh->reg->zei;
321         int ch = zebraExplain_lookup_attr_str(
322             zei, zinfo_index_category_index, p->index_type, p->index_name);
323
324         if (zebra_maps_is_icu(zm))
325             snippet_add_icu(p, ch, zm);
326         else
327         {
328             if (zebra_maps_is_complete(zm))
329                 snippet_add_complete_field(p, ch, zm);
330             else
331                 snippet_add_incomplete_field(p, ch, zm);
332         }
333     }
334 }
335
336 static void snippet_schema_add(
337     struct recExtractCtrl *p, Odr_oid *oid)
338 {
339
340 }
341
342 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
343                      struct ZebraRecStream *stream,
344                      RecType rt, void *recTypeClientData)
345 {
346     struct recExtractCtrl extractCtrl;
347     struct snip_rec_info info;
348     int r;
349
350     extractCtrl.stream = stream;
351     extractCtrl.first_record = 1;
352     extractCtrl.init = extract_init;
353     extractCtrl.tokenAdd = snippet_token_add;
354     extractCtrl.schemaAdd = snippet_schema_add;
355     assert(zh->reg);
356     assert(zh->reg->dh);
357
358     extractCtrl.dh = zh->reg->dh;
359     
360     info.zh = zh;
361     info.snippets = sn;
362     extractCtrl.handle = &info;
363     extractCtrl.match_criteria[0] = '\0';
364     extractCtrl.staticrank = 0;
365     extractCtrl.action = action_insert;
366     
367     init_extractCtrl(zh, &extractCtrl);
368
369     extractCtrl.setStoreData = 0;
370
371     r = (*rt->extract)(recTypeClientData, &extractCtrl);
372
373 }
374
375 static void searchRecordKey(ZebraHandle zh,
376                             zebra_rec_keys_t reckeys,
377                             const char *index_name,
378                             const char **ws, int ws_length)
379 {
380     int i;
381     int ch = -1;
382     zinfo_index_category_t cat = zinfo_index_category_index;
383
384     for (i = 0; i<ws_length; i++)
385         ws[i] = NULL;
386
387     if (ch < 0)
388         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
389     if (ch < 0)
390         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
391     if (ch < 0)
392         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
393
394     if (ch < 0)
395         return ;
396
397     if (zebra_rec_keys_rewind(reckeys))
398     {
399         zint startSeq = -1;
400         const char *str;
401         size_t slen;
402         struct it_key key;
403         zint seqno;
404         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
405         {
406             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
407
408             seqno = key.mem[key.len-1];
409             
410             if (key.mem[0] == ch)
411             {
412                 zint woff;
413                 
414                 if (startSeq == -1)
415                     startSeq = seqno;
416                 woff = seqno - startSeq;
417                 if (woff >= 0 && woff < ws_length)
418                     ws[woff] = str;
419             }
420         }
421     }
422 }
423
424 #define FILE_MATCH_BLANK "\t "
425
426 static char *get_match_from_spec(ZebraHandle zh,
427                           zebra_rec_keys_t reckeys,
428                           const char *fname, const char *spec)
429 {
430     static char dstBuf[2048];      /* static here ??? */
431     char *dst = dstBuf;
432     const char *s = spec;
433
434     while (1)
435     {
436         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
437             ;
438         if (!*s)
439             break;
440         if (*s == '(')
441         {
442             const char *ws[32];
443             char attset_str[64], attname_str[64];
444             int i;
445             int first = 1;
446             
447             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
448                 ;
449             for (i = 0; *s && *s != ',' && *s != ')' && 
450                      !strchr(FILE_MATCH_BLANK, *s); s++)
451                 if (i+1 < sizeof(attset_str))
452                     attset_str[i++] = *s;
453             attset_str[i] = '\0';
454             
455             for (; strchr(FILE_MATCH_BLANK, *s); s++)
456                 ;
457             if (*s != ',')
458                 strcpy(attname_str, attset_str);
459             else
460             {
461                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
462                     ;
463                 for (i = 0; *s && *s != ')' && 
464                          !strchr(FILE_MATCH_BLANK, *s); s++)
465                     if (i+1 < sizeof(attname_str))
466                         attname_str[i++] = *s;
467                 attname_str[i] = '\0';
468             }
469             if (*s != ')')
470             {
471                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
472                       spec, zh->m_group ? zh->m_group : "none");
473                 return NULL;
474             }
475             s++;
476
477             searchRecordKey(zh, reckeys, attname_str, ws, 32);
478             if (0) /* for debugging */
479             {   
480                 for (i = 0; i<32; i++)
481                 {
482                     if (ws[i])
483                     {
484                         WRBUF w = wrbuf_hex_str(ws[i]);
485                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
486                         wrbuf_destroy(w);
487                     }
488                 }
489             }
490
491             for (i = 0; i<32; i++)
492                 if (ws[i])
493                 {
494                     if (first)
495                     {
496                         *dst++ = ' ';
497                         first = 0;
498                     }
499                     strcpy(dst, ws[i]);
500                     dst += strlen(ws[i]);
501                 }
502             if (first)
503             {
504                 yaz_log(YLOG_WARN, "Record didn't contain match"
505                       " fields in (%s,%s)", attset_str, attname_str);
506                 return NULL;
507             }
508         }
509         else if (*s == '$')
510         {
511             int spec_len;
512             char special[64];
513             const char *spec_src = NULL;
514             const char *s1 = ++s;
515             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
516                 s1++;
517
518             spec_len = s1 - s;
519             if (spec_len > sizeof(special)-1)
520                 spec_len = sizeof(special)-1;
521             memcpy(special, s, spec_len);
522             special[spec_len] = '\0';
523             s = s1;
524
525             if (!strcmp(special, "group"))
526                 spec_src = zh->m_group;
527             else if (!strcmp(special, "database"))
528                 spec_src = zh->basenames[0];
529             else if (!strcmp(special, "filename")) {
530                 spec_src = fname;
531             }
532             else if (!strcmp(special, "type"))
533                 spec_src = zh->m_record_type;
534             else 
535                 spec_src = NULL;
536             if (spec_src)
537             {
538                 strcpy(dst, spec_src);
539                 dst += strlen(spec_src);
540             }
541         }
542         else if (*s == '\"' || *s == '\'')
543         {
544             int stopMarker = *s++;
545             char tmpString[64];
546             int i = 0;
547
548             while (*s && *s != stopMarker)
549             {
550                 if (i+1 < sizeof(tmpString))
551                     tmpString[i++] = *s++;
552             }
553             if (*s)
554                 s++;
555             tmpString[i] = '\0';
556             strcpy(dst, tmpString);
557             dst += strlen(tmpString);
558         }
559         else
560         {
561             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
562                   spec, zh->m_group ? zh->m_group : "none");
563             return NULL;
564         }
565         *dst++ = 1;
566     }
567     if (dst == dstBuf)
568     {
569         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
570               fname, zh->m_group ? zh->m_group : "none");
571         return NULL;
572     }
573     *dst = '\0';
574
575     if (0) /* for debugging */
576     {
577         WRBUF w = wrbuf_hex_str(dstBuf);
578         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
579         wrbuf_destroy(w);
580     }
581
582     return dstBuf;
583 }
584
/* Record logging context.
   NOTE(review): not referenced by the code visible in this part of the
   file; field meanings below are taken from the names only - confirm. */
struct recordLogInfo {
    const char *fname;           /* presumably the file being processed */
    int recordOffset;            /* presumably the record's offset in it */
    struct recordGroup *rGroup;  /* presumably the record group in effect */
};
590
591 /** \brief add the always-matches index entry and map to real record ID
592     \param ctrl record control
593     \param record_id custom record ID
594     \param sysno system record ID
595     
596     This function serves two purposes.. It adds the always matches
597     entry and makes a pointer from the custom record ID (if defined)
598     back to the system record ID (sysno)
599     See zebra_recid_to_sysno .
600   */
601 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
602                             zint sysno)
603 {
604     RecWord word;
605     extract_init(ctrl, &word);
606     word.record_id = record_id;
607     /* we use the seqno as placeholder for a way to get back to
608        record database from _ALLRECORDS.. This is used if a custom
609        RECORD was defined */
610     word.seqno = sysno;
611     word.index_name = "_ALLRECORDS";
612     word.index_type = "w";
613
614     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
615                               "", 0);
616 }
617
618 /* forward declaration */
619 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
620                                        struct ZebraRecStream *stream,
621                                        enum zebra_recctrl_action_t action,
622                                        const char *recordType,
623                                        zint *sysno,
624                                        const char *match_criteria,
625                                        const char *fname,
626                                        RecType recType,
627                                        void *recTypeClientData);
628
629
630 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
631                              enum zebra_recctrl_action_t action)
632 {
633     ZEBRA_RES r = ZEBRA_OK;
634     int i, fd;
635     char gprefix[128];
636     char ext[128];
637     char ext_res[128];
638     struct file_read_info *fi = 0;
639     const char *original_record_type = 0;
640     RecType recType;
641     void *recTypeClientData;
642     struct ZebraRecStream stream, *streamp;
643
644     zebra_init_log_level();
645
646     if (!zh->m_group || !*zh->m_group)
647         *gprefix = '\0';
648     else
649         sprintf(gprefix, "%s.", zh->m_group);
650     
651     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
652
653     /* determine file extension */
654     *ext = '\0';
655     for (i = strlen(fname); --i >= 0; )
656         if (fname[i] == '/')
657             break;
658         else if (fname[i] == '.')
659         {
660             strcpy(ext, fname+i+1);
661             break;
662         }
663     /* determine file type - depending on extension */
664     original_record_type = zh->m_record_type;
665     if (!zh->m_record_type)
666     {
667         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
668         zh->m_record_type = res_get(zh->res, ext_res);
669     }
670     if (!zh->m_record_type)
671     {
672         check_log_limit(zh);
673         if (zh->records_processed + zh->records_skipped
674             < zh->m_file_verbose_limit)
675             yaz_log(YLOG_LOG, "? %s", fname);
676         zh->records_skipped++;
677         return 0;
678     }
679     /* determine match criteria */
680     if (!zh->m_record_id)
681     {
682         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
683         zh->m_record_id = res_get(zh->res, ext_res);
684     }
685
686     if (!(recType =
687           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
688                           &recTypeClientData)))
689     {
690         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
691         return ZEBRA_FAIL;
692     }
693
694     switch(recType->version)
695     {
696     case 0:
697         break;
698     default:
699         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
700     }
701     if (sysno && (action == action_delete || action == action_a_delete))
702     {
703         streamp = 0;
704         fi = 0;
705     }
706     else
707     {
708         char full_rep[1024];
709
710         if (zh->path_reg && !yaz_is_abspath(fname))
711         {
712             strcpy(full_rep, zh->path_reg);
713             strcat(full_rep, "/");
714             strcat(full_rep, fname);
715         }
716         else
717             strcpy(full_rep, fname);
718         
719         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
720         {
721             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
722             zh->m_record_type = original_record_type;
723             return ZEBRA_FAIL;
724         }
725         streamp = &stream;
726         zebra_create_stream_fd(streamp, fd, 0);
727     }
728     r = zebra_extract_records_stream(zh, streamp,
729                                      action,
730                                      zh->m_record_type,
731                                      sysno,
732                                      0, /*match_criteria */
733                                      fname,
734                                      recType, recTypeClientData);
735     if (streamp)
736         stream.destroy(streamp);
737     zh->m_record_type = original_record_type;
738     return r;
739 }
740
/*
  If sysno is provided, then it is used to identify the record.
  If not, and match_criteria is provided, then sysno is guessed.
  If not, and a record is provided, then sysno is taken from there.
 */
747
748 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
749                                       const char *buf, size_t buf_size,
750                                       enum zebra_recctrl_action_t action,
751                                       const char *recordType,
752                                       zint *sysno,
753                                       const char *match_criteria,
754                                       const char *fname)
755 {
756     struct ZebraRecStream stream;
757     ZEBRA_RES res;
758     void *clientData;
759     RecType recType = 0;
760
761     if (recordType && *recordType)
762     {
763         yaz_log(log_level_extract,
764                 "Record type explicitly specified: %s", recordType);
765         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
766                                   &clientData);
767     } 
768     else
769     {
770         if (!(zh->m_record_type))
771         {
772             yaz_log(YLOG_WARN, "No such record type defined");
773             return ZEBRA_FAIL;
774         }
775         yaz_log(log_level_extract, "Get record type from rgroup: %s",
776                 zh->m_record_type);
777         recType = recType_byName(zh->reg->recTypes, zh->res,
778                                   zh->m_record_type, &clientData);
779         recordType = zh->m_record_type;
780     }
781     
782     if (!recType)
783     {
784         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
785         return ZEBRA_FAIL;
786     }
787
788     zebra_create_stream_mem(&stream, buf, buf_size);
789
790     res = zebra_extract_records_stream(zh, &stream,
791                                        action,
792                                        recordType,
793                                        sysno,
794                                        match_criteria,
795                                        fname,
796                                        recType, clientData);
797     stream.destroy(&stream);
798     return res;
799 }
800
801 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
802                                              struct ZebraRecStream *stream,
803                                              enum zebra_recctrl_action_t action,
804                                              const char *recordType,
805                                              zint *sysno,
806                                              const char *match_criteria,
807                                              const char *fname,
808                                              RecType recType,
809                                              void *recTypeClientData,
810                                              int *more)
811     
812 {
813     zint sysno0 = 0;
814     RecordAttr *recordAttr;
815     struct recExtractCtrl extractCtrl;
816     int r;
817     const char *matchStr = 0;
818     Record rec;
819     off_t start_offset = 0, end_offset = 0;
820     const char *pr_fname = fname;  /* filename to print .. */
821     int show_progress = zh->records_processed + zh->records_skipped 
822         < zh->m_file_verbose_limit ? 1:0;
823
824     zebra_init_log_level();
825
826     if (!pr_fname)
827         pr_fname = "<no file>";  /* make it printable if file is omitted */
828
829     zebra_rec_keys_reset(zh->reg->keys);
830     zebra_rec_keys_reset(zh->reg->sortKeys);
831
832     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
833     {
834         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
835                                       zh->m_explain_database))
836             return ZEBRA_FAIL;
837     }
838
839     if (stream)
840     {
841         off_t null_offset = 0;
842         extractCtrl.stream = stream;
843
844         start_offset = stream->tellf(stream);
845
846         extractCtrl.first_record = start_offset ? 0 : 1;
847         
848         stream->endf(stream, &null_offset);;
849
850         extractCtrl.init = extract_init;
851         extractCtrl.tokenAdd = extract_token_add;
852         extractCtrl.schemaAdd = extract_schema_add;
853         extractCtrl.dh = zh->reg->dh;
854         extractCtrl.handle = zh;
855         extractCtrl.match_criteria[0] = '\0';
856         extractCtrl.staticrank = 0;
857         extractCtrl.action = action;
858
859         init_extractCtrl(zh, &extractCtrl);
860
861         extract_set_store_data_prepare(&extractCtrl);
862         
863         r = (*recType->extract)(recTypeClientData, &extractCtrl);
864
865         if (action == action_update)
866         {
867             action = extractCtrl.action;
868         }
869         
870         switch (r)
871         {
872         case RECCTRL_EXTRACT_EOF:
873             return ZEBRA_FAIL;
874         case RECCTRL_EXTRACT_ERROR_GENERIC:
875             /* error occured during extraction ... */
876             yaz_log(YLOG_WARN, "extract error: generic");
877             return ZEBRA_FAIL;
878         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
879             /* error occured during extraction ... */
880             yaz_log(YLOG_WARN, "extract error: no such filter");
881             return ZEBRA_FAIL;
882         case RECCTRL_EXTRACT_SKIP:
883             if (show_progress)
884                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
885                          recordType, pr_fname, (zint) start_offset);
886             *more = 1;
887             
888             end_offset = stream->endf(stream, 0);
889             if (end_offset)
890                 stream->seekf(stream, end_offset);
891
892             return ZEBRA_OK;
893         case RECCTRL_EXTRACT_OK:
894             break;
895         default:
896             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
897             return ZEBRA_FAIL;
898         }
899         end_offset = stream->endf(stream, 0);
900         if (end_offset)
901             stream->seekf(stream, end_offset);
902         else
903             end_offset = stream->tellf(stream);
904
905         if (extractCtrl.match_criteria[0])
906             match_criteria = extractCtrl.match_criteria;
907     }
908
909     *more = 1;
910
911     if (zh->m_flag_rw == 0)
912     {
913         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
914                 pr_fname, (zint) start_offset);
915         /* test mode .. Do not perform match */
916         return ZEBRA_OK;
917     }
918         
919     if (!sysno)
920     {
921         sysno = &sysno0;
922         
923         if (match_criteria && *match_criteria)
924             matchStr = match_criteria;
925         else
926         {
927             if (zh->m_record_id && *zh->m_record_id)
928             {
929                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
930                                                zh->m_record_id);
931                 if (!matchStr)
932                 {
933                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
934                              pr_fname, (zint) start_offset);
935                     return ZEBRA_FAIL;
936                 }
937                 if (0 && matchStr)
938                 {
939                     WRBUF w = wrbuf_alloc();
940                     size_t i;
941                     for (i = 0; i < strlen(matchStr); i++)
942                     {
943                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
944                     }
945                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
946                     wrbuf_destroy(w);
947                 }
948             }
949         }
950         if (matchStr) 
951         {
952             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
953             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
954                                           matchStr);
955
956             
957             if (log_level_extract)
958             {
959                 WRBUF w = wrbuf_hex_str(matchStr);
960                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
961                 wrbuf_destroy(w);
962             }
963             if (rinfo)
964             {
965                 assert(*rinfo == sizeof(*sysno));
966                 memcpy(sysno, rinfo+1, sizeof(*sysno));
967             }
968        }
969     }
970
971     if (! *sysno)
972     {
973         /* new record AKA does not exist already */
974         if (action == action_delete)
975         {
976             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
977                     pr_fname, (zint) start_offset);
978             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
979             return ZEBRA_FAIL;
980         }
981         else if (action == action_a_delete)
982         {
983             if (show_progress)
984                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
985                         pr_fname, (zint) start_offset);
986             return ZEBRA_OK;
987         }
988         else if (action == action_replace)
989         {
990             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
991                          pr_fname, (zint) start_offset);
992             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
993             return ZEBRA_FAIL;
994         }
995         if (show_progress)
996             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
997                      (zint) start_offset);
998         rec = rec_new(zh->reg->records);
999
1000         *sysno = rec->sysno;
1001
1002
1003         if (stream)
1004         {
1005             all_matches_add(&extractCtrl,
1006                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1007                             *sysno);
1008         }
1009
1010
1011         recordAttr = rec_init_attr(zh->reg->zei, rec);
1012         if (extractCtrl.staticrank < 0)
1013         {
1014             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1015             extractCtrl.staticrank = 0;
1016         }
1017
1018         if (matchStr)
1019         {
1020             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1021             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1022                             sizeof(*sysno), sysno);
1023         }
1024
1025         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1026 #if FLUSH2
1027         extract_flush_record_keys2(zh, *sysno,
1028                                    zh->reg->keys, extractCtrl.staticrank,
1029                                    0, recordAttr->staticrank);
1030 #else
1031         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1032                                   extractCtrl.staticrank);
1033 #endif
1034         recordAttr->staticrank = extractCtrl.staticrank;
1035         zh->records_inserted++;
1036     } 
1037     else
1038     {
1039         /* record already exists */
1040         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1041         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1042         if (action == action_insert)
1043         {
1044             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1045                          recordType, pr_fname, (zint) start_offset);
1046             logRecord(zh);
1047             return ZEBRA_FAIL;
1048         }
1049
1050         rec = rec_get(zh->reg->records, *sysno);
1051         assert(rec);
1052
1053         if (stream)
1054         {
1055             all_matches_add(&extractCtrl,
1056                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1057                             *sysno);
1058         }
1059         
1060         recordAttr = rec_init_attr(zh->reg->zei, rec);
1061
1062         /* decrease total size */
1063         zebraExplain_recordBytesIncrement(zh->reg->zei,
1064                                            - recordAttr->recordSize);
1065
1066         zebra_rec_keys_set_buf(delkeys,
1067                                rec->info[recInfo_delKeys],
1068                                rec->size[recInfo_delKeys],
1069                                0);
1070         zebra_rec_keys_set_buf(sortKeys,
1071                                rec->info[recInfo_sortKeys],
1072                                rec->size[recInfo_sortKeys],
1073                                0);
1074
1075         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1076 #if !FLUSH2
1077         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1078                                   recordAttr->staticrank);
1079 #endif
1080         if (action == action_delete || action == action_a_delete)
1081         {
1082             /* record going to be deleted */
1083 #if FLUSH2
1084             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1085                                        delkeys, recordAttr->staticrank);
1086 #endif       
1087             if (zebra_rec_keys_empty(delkeys))
1088             {
1089                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1090                         pr_fname, (zint) start_offset);
1091                 yaz_log(YLOG_WARN, "cannot delete file above, "
1092                         "storeKeys false (3)");
1093             }
1094             else
1095             {
1096                 if (show_progress)
1097                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1098                             pr_fname, (zint) start_offset);
1099                 zh->records_deleted++;
1100                 if (matchStr)
1101                 {
1102                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1103                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1104                 }
1105                 rec_del(zh->reg->records, &rec);
1106             }
1107             zebra_rec_keys_close(delkeys);
1108             zebra_rec_keys_close(sortKeys);
1109             rec_free(&rec);
1110             logRecord(zh);
1111             return ZEBRA_OK;
1112         }
1113         else
1114         {   /* update or special_update */
1115             if (show_progress)
1116                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1117                         pr_fname, (zint) start_offset);
1118             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1119
1120 #if FLUSH2
1121             extract_flush_record_keys2(zh, *sysno,
1122                                        zh->reg->keys, extractCtrl.staticrank,
1123                                        delkeys, recordAttr->staticrank);
1124 #else
1125             extract_flush_record_keys(zh, *sysno, 1, 
1126                                       zh->reg->keys, extractCtrl.staticrank);
1127 #endif
1128             recordAttr->staticrank = extractCtrl.staticrank;
1129             zh->records_updated++;
1130         }
1131         zebra_rec_keys_close(delkeys);
1132         zebra_rec_keys_close(sortKeys);
1133     }
1134     /* update file type */
1135     xfree(rec->info[recInfo_fileType]);
1136     rec->info[recInfo_fileType] =
1137         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1138
1139     /* update filename */
1140     xfree(rec->info[recInfo_filename]);
1141     rec->info[recInfo_filename] =
1142         rec_strdup(fname, &rec->size[recInfo_filename]);
1143
1144     /* update delete keys */
1145     xfree(rec->info[recInfo_delKeys]);
1146     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1147     {
1148         zebra_rec_keys_get_buf(zh->reg->keys,
1149                                &rec->info[recInfo_delKeys],
1150                                &rec->size[recInfo_delKeys]);
1151     }
1152     else
1153     {
1154         rec->info[recInfo_delKeys] = NULL;
1155         rec->size[recInfo_delKeys] = 0;
1156     }
1157     /* update sort keys */
1158     xfree(rec->info[recInfo_sortKeys]);
1159
1160     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1161                            &rec->info[recInfo_sortKeys],
1162                            &rec->size[recInfo_sortKeys]);
1163
1164     if (stream)
1165     {
1166         recordAttr->recordSize = end_offset - start_offset;
1167         zebraExplain_recordBytesIncrement(zh->reg->zei,
1168                                           recordAttr->recordSize);
1169     }
1170
1171     /* set run-number for this record */
1172     recordAttr->runNumber =
1173         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1174
1175     /* update store data */
1176     xfree(rec->info[recInfo_storeData]);
1177
1178     /* update store data */
1179     if (zh->store_data_buf)
1180     {
1181         rec->size[recInfo_storeData] = zh->store_data_size;
1182         rec->info[recInfo_storeData] = zh->store_data_buf;
1183         zh->store_data_buf = 0;
1184         recordAttr->recordSize = zh->store_data_size;
1185     }
1186     else if (zh->m_store_data)
1187     {
1188         off_t cur_offset = stream->tellf(stream);
1189
1190         rec->size[recInfo_storeData] = recordAttr->recordSize;
1191         rec->info[recInfo_storeData] = (char *)
1192             xmalloc(recordAttr->recordSize);
1193         stream->seekf(stream, start_offset);
1194         stream->readf(stream, rec->info[recInfo_storeData],
1195                       recordAttr->recordSize);
1196         stream->seekf(stream, cur_offset);
1197     }
1198     else
1199     {
1200         rec->info[recInfo_storeData] = NULL;
1201         rec->size[recInfo_storeData] = 0;
1202     }
1203     /* update database name */
1204     xfree(rec->info[recInfo_databaseName]);
1205     rec->info[recInfo_databaseName] =
1206         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1207
1208     /* update offset */
1209     recordAttr->recordOffset = start_offset;
1210     
1211     /* commit this record */
1212     rec_put(zh->reg->records, &rec);
1213     logRecord(zh);
1214     return ZEBRA_OK;
1215 }
1216
1217 /** \brief extracts records from stream
1218     \param zh Zebra Handle
1219     \param stream stream that we read from
1220     \param action (action_insert, action_replace, action_delete, ..)
1221     \param recordType Record filter type "grs.xml", etc.
1222     \param sysno pointer to sysno if already known; NULL otherwise
1223     \param match_criteria (NULL if not already given)
1224     \param fname filename that we read from (for logging purposes only)
1225     \param recType record type
1226     \param recTypeClientData client data for record type
1227     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1228 */
1229 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1230                                        struct ZebraRecStream *stream,
1231                                        enum zebra_recctrl_action_t action,
1232                                        const char *recordType,
1233                                        zint *sysno,
1234                                        const char *match_criteria,
1235                                        const char *fname,
1236                                        RecType recType,
1237                                        void *recTypeClientData)
1238 {
1239     ZEBRA_RES res = ZEBRA_OK;
1240     while (1)
1241     {
1242         int more = 0;
1243         res = zebra_extract_record_stream(zh, stream,
1244                                           action,
1245                                           recordType,
1246                                           sysno,
1247                                           match_criteria,
1248                                           fname,
1249                                           recType, recTypeClientData, &more);
1250         if (!more)
1251         {
1252             res = ZEBRA_OK;
1253             break;
1254         }
1255         if (res != ZEBRA_OK)
1256             break;
1257         if (sysno)
1258             break;
1259     }
1260     return res;
1261 }
1262
1263 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1264 {
1265     ZebraHandle zh = (ZebraHandle) handle;
1266     struct recExtractCtrl extractCtrl;
1267
1268     if (zebraExplain_curDatabase(zh->reg->zei,
1269                                   rec->info[recInfo_databaseName]))
1270     {
1271         abort();
1272         if (zebraExplain_newDatabase(zh->reg->zei,
1273                                       rec->info[recInfo_databaseName], 0))
1274             abort();
1275     }
1276
1277     zebra_rec_keys_reset(zh->reg->keys);
1278     zebra_rec_keys_reset(zh->reg->sortKeys);
1279
1280     extractCtrl.init = extract_init;
1281     extractCtrl.tokenAdd = extract_token_add;
1282     extractCtrl.schemaAdd = extract_schema_add;
1283     extractCtrl.dh = zh->reg->dh;
1284
1285     init_extractCtrl(zh, &extractCtrl);
1286
1287     extractCtrl.flagShowRecords = 0;
1288     extractCtrl.match_criteria[0] = '\0';
1289     extractCtrl.staticrank = 0;
1290     extractCtrl.action = action_update;
1291
1292     extractCtrl.handle = handle;
1293     extractCtrl.first_record = 1;
1294     
1295     extract_set_store_data_prepare(&extractCtrl);
1296
1297     if (n)
1298         grs_extract_tree(&extractCtrl, n);
1299
1300     if (rec->size[recInfo_delKeys])
1301     {
1302         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1303         
1304         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1305
1306         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1307                                rec->size[recInfo_delKeys],
1308                                0);
1309 #if FLUSH2
1310         extract_flush_record_keys2(zh, rec->sysno, 
1311                                    zh->reg->keys, 0, delkeys, 0);
1312 #else
1313         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1314         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1315 #endif
1316         zebra_rec_keys_close(delkeys);
1317
1318         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1319                                rec->size[recInfo_sortKeys],
1320                                0);
1321
1322         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1323         zebra_rec_keys_close(sortkeys);
1324     }
1325     else
1326     {
1327 #if FLUSH2
1328         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1329 #else
1330         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1331 #endif
1332     }
1333     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1334     
1335     xfree(rec->info[recInfo_delKeys]);
1336     zebra_rec_keys_get_buf(zh->reg->keys,
1337                            &rec->info[recInfo_delKeys], 
1338                            &rec->size[recInfo_delKeys]);
1339
1340     xfree(rec->info[recInfo_sortKeys]);
1341     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1342                            &rec->info[recInfo_sortKeys],
1343                            &rec->size[recInfo_sortKeys]);
1344     return ZEBRA_OK;
1345 }
1346
1347 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1348                            const char *str, size_t slen, NMEM nmem, int level)
1349 {
1350     char keystr[200]; /* room for zints to print */
1351     char *dst_term = 0;
1352     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1353     const char *index_type;
1354     int i;
1355     const char *string_index;
1356     
1357     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1358                             0/* db */, &string_index);
1359     assert(index_type);
1360     zebra_term_untrans_iconv(zh, nmem, index_type,
1361                              &dst_term, str);
1362     *keystr = '\0';
1363     for (i = 0; i < key->len; i++)
1364     {
1365         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1366     }
1367     
1368     if (*str < CHR_BASE_CHAR)
1369     {
1370         int i;
1371         char dst_buf[200]; /* room for special chars */
1372         
1373         strcpy(dst_buf , "?");
1374         
1375         if (!strcmp(str, ""))
1376             strcpy(dst_buf, "alwaysmatches");
1377         if (!strcmp(str, FIRST_IN_FIELD_STR))
1378             strcpy(dst_buf, "firstinfield");
1379         else if (!strcmp(str, CHR_UNKNOWN))
1380             strcpy(dst_buf, "unknown");
1381         else if (!strcmp(str, CHR_SPACE))
1382             strcpy(dst_buf, "space");
1383         
1384         for (i = 0; i<slen; i++)
1385         {
1386             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1387         }
1388         yaz_log(level, "%s%s %s %s", keystr, index_type,
1389                 string_index, dst_buf);
1390         
1391     }
1392     else
1393         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1394                 string_index, dst_term);
1395 }
1396
1397 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1398                           zebra_rec_keys_t reckeys,
1399                           int level)
1400 {
1401     if (zebra_rec_keys_rewind(reckeys))
1402     {
1403         size_t slen;
1404         const char *str;
1405         struct it_key key;
1406         NMEM nmem = nmem_create();
1407
1408         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1409         {
1410             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1411             nmem_reset(nmem);
1412         }
1413         nmem_destroy(nmem);
1414     }
1415 }
1416
1417 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1418                              zebra_rec_keys_t reckeys)
1419 {
1420     ZebraExplainInfo zei = zh->reg->zei;
1421     struct ord_stat {
1422         int no;
1423         int ord;
1424         struct ord_stat *next;
1425     };
1426
1427     if (zebra_rec_keys_rewind(reckeys))
1428     {
1429         struct ord_stat *ord_list = 0;
1430         struct ord_stat *p;
1431         size_t slen;
1432         const char *str;
1433         struct it_key key_in;
1434         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1435         {
1436             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1437
1438             for (p = ord_list; p ; p = p->next)
1439                 if (p->ord == ord)
1440                 {
1441                     p->no++;
1442                     break;
1443                 }
1444             if (!p)
1445             {
1446                 p = xmalloc(sizeof(*p));
1447                 p->no = 1;
1448                 p->ord = ord;
1449                 p->next = ord_list;
1450                 ord_list = p;
1451             }
1452         }
1453
1454         p = ord_list;
1455         while (p)
1456         {
1457             struct ord_stat *p1 = p;
1458
1459             if (is_insert)
1460                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1461             else
1462                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1463             p = p->next;
1464             xfree(p1);
1465         }
1466     }
1467 }
1468
1469 #if FLUSH2
1470 static void extract_flush_record_keys2(
1471     ZebraHandle zh, zint sysno,
1472     zebra_rec_keys_t ins_keys, zint ins_rank,
1473     zebra_rec_keys_t del_keys, zint del_rank)
1474 {
1475     ZebraExplainInfo zei = zh->reg->zei;
1476     int normal = 0;
1477     int optimized = 0;
1478
1479     if (!zh->reg->key_block)
1480     {
1481         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1482         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1483         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1484         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1485     }
1486
1487     if (ins_keys)
1488     {
1489         extract_rec_keys_adjust(zh, 1, ins_keys);
1490         if (!del_keys)
1491             zebraExplain_recordCountIncrement(zei, 1);
1492         zebra_rec_keys_rewind(ins_keys);
1493     }
1494     if (del_keys)
1495     {
1496         extract_rec_keys_adjust(zh, 0, del_keys);
1497         if (!ins_keys)
1498             zebraExplain_recordCountIncrement(zei, -1);
1499         zebra_rec_keys_rewind(del_keys);
1500     }
1501
1502     while (1)
1503     {
1504         size_t del_slen;
1505         const char *del_str;
1506         struct it_key del_key_in;
1507         int del = 0;
1508
1509         size_t ins_slen;
1510         const char *ins_str;
1511         struct it_key ins_key_in;
1512         int ins = 0;
1513
1514         if (del_keys)
1515             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1516                                       &del_key_in);
1517         if (ins_keys)
1518             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1519                                       &ins_key_in);
1520
1521         if (del && ins && ins_rank == del_rank
1522             && !key_compare(&del_key_in, &ins_key_in) 
1523             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1524         {
1525             optimized++;
1526             continue;
1527         }
1528         if (!del && !ins)
1529             break;
1530         
1531         normal++;
1532         if (del)
1533             key_block_write(zh->reg->key_block, sysno, 
1534                             &del_key_in, 0, del_str, del_slen,
1535                             del_rank, zh->m_staticrank);
1536         if (ins)
1537             key_block_write(zh->reg->key_block, sysno, 
1538                             &ins_key_in, 1, ins_str, ins_slen,
1539                             ins_rank, zh->m_staticrank);
1540     }
1541     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1542 }
1543 #else
1544 static void extract_flush_record_keys(
1545     ZebraHandle zh, zint sysno, int cmd,
1546     zebra_rec_keys_t reckeys,
1547     zint staticrank)
1548 {
1549     ZebraExplainInfo zei = zh->reg->zei;
1550
1551     extract_rec_keys_adjust(zh, cmd, reckeys);
1552
1553     if (log_level_details)
1554     {
1555         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1556                 sysno, cmd ? "insert" : "delete");
1557         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1558     }
1559
1560     if (!zh->reg->key_block)
1561     {
1562         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1563         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1564         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1565         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1566     }
1567     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1568
1569 #if 0
1570     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1571     print_rec_keys(zh, reckeys);
1572 #endif
1573     if (zebra_rec_keys_rewind(reckeys))
1574     {
1575         size_t slen;
1576         const char *str;
1577         struct it_key key_in;
1578         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1579         {
1580             key_block_write(zh->reg->key_block, sysno, 
1581                             &key_in, cmd, str, slen,
1582                             staticrank, zh->m_staticrank);
1583         }
1584     }
1585 }
1586 #endif
1587
1588 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1589                                      zebra_rec_keys_t reckeys,
1590                                      zebra_snippets *snippets)
1591 {
1592     NMEM nmem = nmem_create();
1593     if (zebra_rec_keys_rewind(reckeys)) 
1594     {
1595         const char *str;
1596         size_t slen;
1597         struct it_key key;
1598         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1599         {
1600             char *dst_term = 0;
1601             int ord;
1602             zint seqno;
1603             const char *index_type;
1604
1605             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1606             seqno = key.mem[key.len-1];
1607             ord = CAST_ZINT_TO_INT(key.mem[0]);
1608             
1609             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1610                                     0/* db */, 0 /* string_index */);
1611             assert(index_type);
1612             zebra_term_untrans_iconv(zh, nmem, index_type,
1613                                      &dst_term, str);
1614             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1615             nmem_reset(nmem);
1616         }
1617     }
1618     nmem_destroy(nmem);
1619     return ZEBRA_OK;
1620 }
1621
1622 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1623 {
1624     yaz_log(YLOG_LOG, "print_rec_keys");
1625     if (zebra_rec_keys_rewind(reckeys))
1626     {
1627         const char *str;
1628         size_t slen;
1629         struct it_key key;
1630         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1631         {
1632             char dst_buf[IT_MAX_WORD];
1633             zint seqno;
1634             const char *index_type;
1635             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1636             const char *db = 0;
1637             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1638
1639             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1640             
1641             seqno = key.mem[key.len-1];
1642             
1643             zebra_term_untrans(zh, index_type, dst_buf, str);
1644             
1645             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1646                     " term=%s", ord, seqno, dst_buf); 
1647         }
1648     }
1649 }
1650
1651 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1652                                      const char *str, int length)
1653 {
1654     struct it_key key;
1655     ZebraHandle zh = p->extractCtrl->handle;
1656     ZebraExplainInfo zei = zh->reg->zei;
1657     int ch, i;
1658
1659     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1660     if (ch < 0)
1661         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1662
1663     i = 0;
1664     key.mem[i++] = ch;
1665     key.mem[i++] = p->record_id;
1666     key.mem[i++] = p->section_id;
1667
1668     if (zh->m_segment_indexing)
1669         key.mem[i++] = p->segment;
1670     key.mem[i++] = p->seqno;
1671     key.len = i;
1672
1673     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1674 }
1675
1676 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1677 {
1678     struct it_key key;
1679     ZebraHandle zh = p->extractCtrl->handle;
1680     ZebraExplainInfo zei = zh->reg->zei;
1681     int ch;
1682     zinfo_index_category_t cat = zinfo_index_category_sort;
1683
1684     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1685     if (ch < 0)
1686         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1687     key.len = 3;
1688     key.mem[0] = ch;
1689     key.mem[1] = p->record_id;
1690     key.mem[2] = p->section_id;
1691
1692     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1693 }
1694
1695 static void extract_add_staticrank_string(RecWord *p,
1696                                           const char *str, int length)
1697 {
1698     char valz[40];
1699     struct recExtractCtrl *ctrl = p->extractCtrl;
1700
1701     if (length > sizeof(valz)-1)
1702         length = sizeof(valz)-1;
1703
1704     memcpy(valz, str, length);
1705     valz[length] = '\0';
1706     ctrl->staticrank = atozint(valz);
1707 }
1708
1709 static void extract_add_string(RecWord *p, zebra_map_t zm,
1710                                const char *string, int length)
1711 {
1712     assert(length > 0);
1713
1714     if (!p->index_name)
1715         return;
1716     if (log_level_details)
1717     {
1718
1719         WRBUF w = wrbuf_alloc();
1720         
1721         wrbuf_write_escaped(w, string, length);
1722         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1723         wrbuf_destroy(w);
1724     }
1725     if (zebra_maps_is_index(zm))
1726     {
1727         extract_add_index_string(p, zinfo_index_category_index,
1728                                  string, length);
1729         if (zebra_maps_is_alwaysmatches(zm))
1730         {
1731             RecWord word;
1732             memcpy(&word, p, sizeof(word));
1733
1734             word.seqno = 1;
1735             extract_add_index_string(
1736                 &word, zinfo_index_category_alwaysmatches, "", 0);
1737         }
1738     }
1739     else if (zebra_maps_is_sort(zm))
1740     {
1741         extract_add_sort_string(p, string, length);
1742     }
1743     else if (zebra_maps_is_staticrank(zm))
1744     {
1745         extract_add_staticrank_string(p, string, length);
1746     }
1747 }
1748
1749 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1750 {
1751     const char *b = p->term_buf;
1752     int remain = p->term_len;
1753     int first = 1;
1754     const char **map = 0;
1755     
1756     if (remain > 0)
1757         map = zebra_maps_input(zm, &b, remain, 0);
1758
1759     while (map)
1760     {
1761         char buf[IT_MAX_WORD+1];
1762         int i, remain;
1763
1764         /* Skip spaces */
1765         while (map && *map && **map == *CHR_SPACE)
1766         {
1767             remain = p->term_len - (b - p->term_buf);
1768             if (remain > 0)
1769                 map = zebra_maps_input(zm, &b, remain, 0);
1770             else
1771                 map = 0;
1772         }
1773         if (!map)
1774             break;
1775         i = 0;
1776         while (map && *map && **map != *CHR_SPACE)
1777         {
1778             const char *cp = *map;
1779
1780             while (i < IT_MAX_WORD && *cp)
1781                 buf[i++] = *(cp++);
1782             remain = p->term_len - (b - p->term_buf);
1783             if (remain > 0)
1784                 map = zebra_maps_input(zm, &b, remain, 0);
1785             else
1786                 map = 0;
1787         }
1788         if (!i)
1789             return;
1790
1791         if (first)
1792         {   
1793             first = 0;
1794             if (zebra_maps_is_first_in_field(zm))
1795             {
1796                 /* first in field marker */
1797                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1798                 p->seqno++;
1799             }
1800         }
1801         extract_add_string(p, zm, buf, i);
1802         p->seqno++;
1803     }
1804 }
1805
1806 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1807 {
1808     const char *b = p->term_buf;
1809     char buf[IT_MAX_WORD+1];
1810     const char **map = 0;
1811     int i = 0, remain = p->term_len;
1812
1813     if (remain > 0)
1814         map = zebra_maps_input(zm, &b, remain, 1);
1815
1816     while (remain > 0 && i < IT_MAX_WORD)
1817     {
1818         while (map && *map && **map == *CHR_SPACE)
1819         {
1820             remain = p->term_len - (b - p->term_buf);
1821
1822             if (remain > 0)
1823             {
1824                 int first = i ? 0 : 1;  /* first position */
1825                 map = zebra_maps_input(zm, &b, remain, first);
1826             }
1827             else
1828                 map = 0;
1829         }
1830         if (!map)
1831             break;
1832
1833         if (i && i < IT_MAX_WORD)
1834             buf[i++] = *CHR_SPACE;
1835         while (map && *map && **map != *CHR_SPACE)
1836         {
1837             const char *cp = *map;
1838
1839             if (**map == *CHR_CUT)
1840             {
1841                 i = 0;
1842             }
1843             else
1844             {
1845                 if (i >= IT_MAX_WORD)
1846                     break;
1847                 while (i < IT_MAX_WORD && *cp)
1848                     buf[i++] = *(cp++);
1849             }
1850             remain = p->term_len  - (b - p->term_buf);
1851             if (remain > 0)
1852             {
1853                 map = zebra_maps_input(zm, &b, remain, 0);
1854             }
1855             else
1856                 map = 0;
1857         }
1858     }
1859     if (!i)
1860         return;
1861     extract_add_string(p, zm, buf, i);
1862     p->seqno++;
1863 }
1864
1865 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1866 {
1867     const char *res_buf = 0;
1868     size_t res_len = 0;
1869
1870     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1871     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1872     {
1873         extract_add_string(p, zm, res_buf, res_len);
1874         p->seqno++;
1875     }
1876 }
1877
1878
1879 /** \brief top-level indexing handler for recctrl system
1880     \param p token data to be indexed
1881
1882     Call sequence:
1883     extract_token_add
1884     extract_add_{in}_complete / extract_add_icu
1885     extract_add_string
1886     
1887     extract_add_index_string
1888     or
1889     extract_add_sort_string
1890     or
1891     extract_add_staticrank_string
1892     
1893 */
1894 static void extract_token_add(RecWord *p)
1895 {
1896     ZebraHandle zh = p->extractCtrl->handle;
1897     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1898     WRBUF wrbuf;
1899
1900     if (log_level_details)
1901     {
1902         yaz_log(log_level_details, "extract_token_add "
1903                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1904                 p->index_type, p->index_name, 
1905                 p->seqno, p->term_len, p->term_buf);
1906     }
1907     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1908     {
1909         p->term_buf = wrbuf_buf(wrbuf);
1910         p->term_len = wrbuf_len(wrbuf);
1911     }
1912     if (zebra_maps_is_icu(zm))
1913     {
1914         extract_add_icu(p, zm);
1915     }
1916     else
1917     {
1918         if (zebra_maps_is_complete(zm))
1919             extract_add_complete_field(p, zm);
1920         else
1921             extract_add_incomplete_field(p, zm);
1922     }
1923 }
1924
1925 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1926                                       void *buf, size_t sz)
1927 {
1928     ZebraHandle zh = (ZebraHandle) p->handle;
1929
1930     xfree(zh->store_data_buf);
1931     zh->store_data_buf = 0;
1932     zh->store_data_size = 0;
1933     if (buf && sz)
1934     {
1935         zh->store_data_buf = xmalloc(sz);
1936         zh->store_data_size = sz;
1937         memcpy(zh->store_data_buf, buf, sz);
1938     }
1939 }
1940
1941 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1942 {
1943     ZebraHandle zh = (ZebraHandle) p->handle;
1944     xfree(zh->store_data_buf);
1945     zh->store_data_buf = 0;
1946     zh->store_data_size = 0;
1947     p->setStoreData = extract_set_store_data_cb;
1948 }
1949
1950 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1951 {
1952     ZebraHandle zh = (ZebraHandle) p->handle;
1953     zebraExplain_addSchema(zh->reg->zei, oid);
1954 }
1955
1956 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1957                              int cmd, zebra_rec_keys_t reckeys)
1958 {
1959 #if 0
1960     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1961             cmd, sysno);
1962     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1963 #endif
1964
1965     if (zebra_rec_keys_rewind(reckeys))
1966     {
1967         zebra_sort_index_t si = zh->reg->sort_index;
1968         size_t slen;
1969         const char *str;
1970         struct it_key key_in;
1971
1972         NMEM nmem = nmem_create();
1973         struct sort_add_ent {
1974             int ord;
1975             int cmd;
1976             struct sort_add_ent *next;
1977             WRBUF wrbuf;
1978             zint sysno;
1979             zint section_id;
1980         };
1981         struct sort_add_ent *sort_ent_list = 0;
1982
1983         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1984         {
1985             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1986             zint filter_sysno = key_in.mem[1];
1987             zint section_id = key_in.mem[2];
1988
1989             struct sort_add_ent **e = &sort_ent_list;
1990             for (; *e; e = &(*e)->next)
1991                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1992                     break;
1993             if (!*e)
1994             {
1995                 *e = nmem_malloc(nmem, sizeof(**e));
1996                 (*e)->next = 0;
1997                 (*e)->wrbuf = wrbuf_alloc();
1998                 (*e)->ord = ord;
1999                 (*e)->cmd = cmd;
2000                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
2001                 (*e)->section_id = section_id;
2002             }
2003             
2004             wrbuf_write((*e)->wrbuf, str, slen);
2005             wrbuf_putc((*e)->wrbuf, '\0');
2006         }
2007         if (sort_ent_list)
2008         {
2009             zint last_sysno = 0;
2010             struct sort_add_ent *e = sort_ent_list;
2011             for (; e; e = e->next)
2012             {
2013                 if (last_sysno != e->sysno)
2014                 {
2015                     zebra_sort_sysno(si, e->sysno);
2016                     last_sysno = e->sysno;
2017                 }
2018                 zebra_sort_type(si, e->ord);
2019                 if (e->cmd == 1)
2020                     zebra_sort_add(si, e->section_id, e->wrbuf);
2021                 else
2022                     zebra_sort_delete(si, e->section_id);
2023                 wrbuf_destroy(e->wrbuf);
2024             }
2025         }
2026         nmem_destroy(nmem);
2027     }
2028 }
2029
2030 /*
2031  * Local variables:
2032  * c-basic-offset: 4
2033  * c-file-style: "Stroustrup"
2034  * indent-tabs-mode: nil
2035  * End:
2036  * vim: shiftwidth=4 tabstop=8 expandtab
2037  */
2038