dom: accept type="adelete" (accept bad deletes)
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 2004-2013 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
/* Module log levels; both are resolved by zebra_init_log_level()
   ("extract" and "indexdetails" respectively) and stay 0 until then. */
static int log_level_extract = 0;
static int log_level_details = 0;
/* Set to 1 once zebra_init_log_level() has resolved the levels above. */
static int log_level_initialized = 0;
48
/* FLUSH2 is 1 when identical delete/insert keys are eliminated while
   flushing record keys. */
/* Eventually the 0-case code will be removed. */
#define FLUSH2 1

/* Forward declaration of the key flusher selected by FLUSH2. */
#if FLUSH2
static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
                                       zebra_rec_keys_t ins_keys,
                                       zint ins_rank,
                                       zebra_rec_keys_t del_keys,
                                       zint del_rank);
#else
static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
                                      int cmd,
                                      zebra_rec_keys_t reckeys,
                                      zint staticrank);
#endif
65
66 static void zebra_init_log_level(void)
67 {
68     if (!log_level_initialized)
69     {
70         log_level_initialized = 1;
71
72         log_level_extract = yaz_log_module_level("extract");
73         log_level_details = yaz_log_module_level("indexdetails");
74     }
75 }
76
77 static WRBUF wrbuf_hex_str(const char *cstr)
78 {
79     size_t i;
80     WRBUF w = wrbuf_alloc();
81     for (i = 0; cstr[i]; i++)
82     {
83         if (cstr[i] < ' ' || cstr[i] > 126)
84             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
85         else
86             wrbuf_putc(w, cstr[i]);
87     }
88     return w;
89 }
90
91
/* Forward declarations. extract_schema_add and extract_token_add are
   installed as the schemaAdd / tokenAdd callbacks of the extract control
   by zebra_extract_record_stream. */
static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                                    int cmd, zebra_rec_keys_t skp);
static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
static void extract_token_add(RecWord *p);
96
97 static void check_log_limit(ZebraHandle zh)
98 {
99     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
100     {
101         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
102                 zh->m_file_verbose_limit);
103     }
104 }
105
106 static void logRecord(ZebraHandle zh)
107 {
108     check_log_limit(zh);
109     ++zh->records_processed;
110     if (!(zh->records_processed % 1000))
111     {
112         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
113                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT,
114                 zh->records_processed, zh->records_inserted,
115                 zh->records_updated, zh->records_deleted);
116     }
117 }
118
119 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
120 {
121     ctrl->flagShowRecords = !zh->m_flag_rw;
122 }
123
124
/* Forward declarations of helpers used further up in this file
   (all_matches_add and zebra_extract_record_stream call them). */
static void extract_add_index_string(RecWord *p,
                                      zinfo_index_category_t cat,
                                      const char *str, int length);

static void extract_set_store_data_prepare(struct recExtractCtrl *p);
130
131 static void extract_init(struct recExtractCtrl *p, RecWord *w)
132 {
133     w->seqno = 1;
134     w->index_name = "any";
135     w->index_type = "w";
136     w->extractCtrl = p;
137     w->record_id = 0;
138     w->section_id = 0;
139     w->segment = 0;
140 }
141
/* Context passed through recExtractCtrl.handle while extracting snippets
   (set up in extract_snippet, read by the snippet_* callbacks). */
struct snip_rec_info {
    ZebraHandle zh;            /* handle used to look up maps and ordinals */
    zebra_snippets *snippets;  /* destination the snippet terms are appended to */
};
146
/** \brief normalizes a complete field through the character map
    \param p word holding the raw term (term_buf / term_len)
    \param zm character map used for the mapping
    \param buf output buffer; at most IT_MAX_WORD bytes are written
    \returns number of bytes stored in buf

    Runs of characters mapping to CHR_SPACE are collapsed to a single
    CHR_SPACE between words (none before the first word). No NUL
    terminator is written to buf; callers must use the returned length.
*/
static int parse_complete_field(RecWord *p, zebra_map_t zm,
                                char *buf)
{
    const char *b = p->term_buf;
    const char **map = 0;
    int i = 0, remain = p->term_len;

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 1);
    while (remain > 0 && i < IT_MAX_WORD)
    {
        /* skip characters that map to space */
        while (map && *map && **map == *CHR_SPACE)
        {
            /* b was advanced by zebra_maps_input; recompute what is left */
            remain = p->term_len - (b - p->term_buf);

            if (remain > 0)
            {
                int first = i ? 0 : 1;  /* first position */
                map = zebra_maps_input(zm, &b, remain, first);
            }
            else
                map = 0;
        }
        if (!map)
            break;

        /* separate this word from the previous output with one space */
        if (i && i < IT_MAX_WORD)
            buf[i++] = *CHR_SPACE;
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (**map == *CHR_CUT)
            {
                /* cut marker: discard everything collected so far */
                i = 0;
            }
            else
            {
                if (i >= IT_MAX_WORD)
                    break;
                /* copy the mapped sequence, truncating at IT_MAX_WORD */
                while (i < IT_MAX_WORD && *cp)
                    buf[i++] = *(cp++);
            }
            remain = p->term_len  - (b - p->term_buf);
            if (remain > 0)
            {
                map = zebra_maps_input(zm, &b, remain, 0);
            }
            else
                map = 0;
        }
    }
    return i;
}
201
202 static void snippet_add_complete_field(RecWord *p, int ord,
203                                        zebra_map_t zm)
204 {
205     struct snip_rec_info *h = p->extractCtrl->handle;
206     char buf[IT_MAX_WORD+1];
207     int i = parse_complete_field(p, zm, buf);
208
209     if (!i)
210         return;
211
212     if (p->term_len && p->term_buf && zebra_maps_is_index(zm))
213         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
214                                p->term_buf, p->term_len);
215     p->seqno++;
216 }
217
/** \brief splits a field into words and adds each as a snippet entry
    \param p word with the raw term (term_buf / term_len)
    \param ord index ordinal for the snippets
    \param zm character map of the index type

    Spans of the raw term between words are appended with the third
    argument of zebra_snippets_appendn set to 1, the words themselves with
    0. The sequence number advances once per word, plus once before the
    first word when the map has the first-in-field marker.
*/
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *b = p->term_buf;
    int remain = p->term_len;
    int first = 1;
    const char **map = 0;
    const char *start = b;  /* start of the span not yet emitted */
    const char *last = b;   /* input position before the most recent read */

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 0);

    while (map)
    {
        int remain; /* NOTE: shadows the outer remain declared above */

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        if (!map)
            break;
        /* emit the skipped span [start, last) with the flag set to 1 */
        if (start != last && zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   start, last - start);
        }
        start = last;
        /* consume one word: everything up to the next space */
        while (map && *map && **map != *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        /* no progress was made: stop to avoid looping forever */
        if (start == last)
            return ;

        if (first)
        {
            first = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }
        /* emit the word span [start, last) with the flag set to 0 */
        if (start != last && zebra_maps_is_index(zm))
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   start, last - start);
        start = last;
        p->seqno++;
    }

}
282
283 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
284 {
285     struct snip_rec_info *h = p->extractCtrl->handle;
286
287     const char *res_buf = 0;
288     size_t res_len = 0;
289
290     const char *display_buf = 0;
291     size_t display_len = 0;
292
293     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
294     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
295                                    &display_buf, &display_len))
296     {
297         if (zebra_maps_is_index(zm))
298             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
299                                    display_buf, display_len);
300         p->seqno++;
301     }
302 }
303
304 static void snippet_token_add(RecWord *p)
305 {
306     struct snip_rec_info *h = p->extractCtrl->handle;
307     ZebraHandle zh = h->zh;
308     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
309
310     if (zm)
311     {
312         ZebraExplainInfo zei = zh->reg->zei;
313         int ch = zebraExplain_lookup_attr_str(
314             zei, zinfo_index_category_index, p->index_type, p->index_name);
315
316         if (zebra_maps_is_icu(zm))
317             snippet_add_icu(p, ch, zm);
318         else
319         {
320             if (zebra_maps_is_complete(zm))
321                 snippet_add_complete_field(p, ch, zm);
322             else
323                 snippet_add_incomplete_field(p, ch, zm);
324         }
325     }
326 }
327
328 static void snippet_schema_add(
329     struct recExtractCtrl *p, Odr_oid *oid)
330 {
331
332 }
333
334 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
335                      struct ZebraRecStream *stream,
336                      RecType rt, void *recTypeClientData)
337 {
338     struct recExtractCtrl extractCtrl;
339     struct snip_rec_info info;
340
341     extractCtrl.stream = stream;
342     extractCtrl.first_record = 1;
343     extractCtrl.init = extract_init;
344     extractCtrl.tokenAdd = snippet_token_add;
345     extractCtrl.schemaAdd = snippet_schema_add;
346     assert(zh->reg);
347     assert(zh->reg->dh);
348
349     extractCtrl.dh = zh->reg->dh;
350
351     info.zh = zh;
352     info.snippets = sn;
353     extractCtrl.handle = &info;
354     extractCtrl.match_criteria[0] = '\0';
355     extractCtrl.staticrank = 0;
356     extractCtrl.action = action_insert;
357
358     init_extractCtrl(zh, &extractCtrl);
359
360     extractCtrl.setStoreData = 0;
361
362     (*rt->extract)(recTypeClientData, &extractCtrl);
363 }
364
365 static void searchRecordKey(ZebraHandle zh,
366                             zebra_rec_keys_t reckeys,
367                             const char *index_name,
368                             const char **ws, int ws_length)
369 {
370     int i;
371     int ch = -1;
372     zinfo_index_category_t cat = zinfo_index_category_index;
373
374     for (i = 0; i<ws_length; i++)
375         ws[i] = NULL;
376
377     if (ch < 0)
378         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
379     if (ch < 0)
380         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
381     if (ch < 0)
382         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
383
384     if (ch < 0)
385         return ;
386
387     if (zebra_rec_keys_rewind(reckeys))
388     {
389         zint startSeq = -1;
390         const char *str;
391         size_t slen;
392         struct it_key key;
393         zint seqno;
394         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
395         {
396             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
397
398             seqno = key.mem[key.len-1];
399
400             if (key.mem[0] == ch)
401             {
402                 zint woff;
403
404                 if (startSeq == -1)
405                     startSeq = seqno;
406                 woff = seqno - startSeq;
407                 if (woff >= 0 && woff < ws_length)
408                     ws[woff] = str;
409             }
410         }
411     }
412 }
413
414 #define FILE_MATCH_BLANK "\t "
415
416 static char *get_match_from_spec(ZebraHandle zh,
417                           zebra_rec_keys_t reckeys,
418                           const char *fname, const char *spec)
419 {
420     static char dstBuf[2048];      /* static here ??? */
421     char *dst = dstBuf;
422     const char *s = spec;
423
424     while (1)
425     {
426         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
427             ;
428         if (!*s)
429             break;
430         if (*s == '(')
431         {
432             const char *ws[32];
433             char attset_str[64], attname_str[64];
434             int i;
435             int first = 1;
436
437             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
438                 ;
439             for (i = 0; *s && *s != ',' && *s != ')' &&
440                      !strchr(FILE_MATCH_BLANK, *s); s++)
441                 if (i+1 < sizeof(attset_str))
442                     attset_str[i++] = *s;
443             attset_str[i] = '\0';
444
445             for (; strchr(FILE_MATCH_BLANK, *s); s++)
446                 ;
447             if (*s != ',')
448                 strcpy(attname_str, attset_str);
449             else
450             {
451                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
452                     ;
453                 for (i = 0; *s && *s != ')' &&
454                          !strchr(FILE_MATCH_BLANK, *s); s++)
455                     if (i+1 < sizeof(attname_str))
456                         attname_str[i++] = *s;
457                 attname_str[i] = '\0';
458             }
459             if (*s != ')')
460             {
461                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
462                       spec, zh->m_group ? zh->m_group : "none");
463                 return NULL;
464             }
465             s++;
466
467             searchRecordKey(zh, reckeys, attname_str, ws, 32);
468             if (0) /* for debugging */
469             {
470                 for (i = 0; i<32; i++)
471                 {
472                     if (ws[i])
473                     {
474                         WRBUF w = wrbuf_hex_str(ws[i]);
475                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
476                         wrbuf_destroy(w);
477                     }
478                 }
479             }
480
481             for (i = 0; i<32; i++)
482                 if (ws[i])
483                 {
484                     if (first)
485                     {
486                         *dst++ = ' ';
487                         first = 0;
488                     }
489                     strcpy(dst, ws[i]);
490                     dst += strlen(ws[i]);
491                 }
492             if (first)
493             {
494                 yaz_log(YLOG_WARN, "Record didn't contain match"
495                       " fields in (%s,%s)", attset_str, attname_str);
496                 return NULL;
497             }
498         }
499         else if (*s == '$')
500         {
501             int spec_len;
502             char special[64];
503             const char *spec_src = NULL;
504             const char *s1 = ++s;
505             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
506                 s1++;
507
508             spec_len = s1 - s;
509             if (spec_len > sizeof(special)-1)
510                 spec_len = sizeof(special)-1;
511             memcpy(special, s, spec_len);
512             special[spec_len] = '\0';
513             s = s1;
514
515             if (!strcmp(special, "group"))
516                 spec_src = zh->m_group;
517             else if (!strcmp(special, "database"))
518                 spec_src = zh->basenames[0];
519             else if (!strcmp(special, "filename")) {
520                 spec_src = fname;
521             }
522             else if (!strcmp(special, "type"))
523                 spec_src = zh->m_record_type;
524             else
525                 spec_src = NULL;
526             if (spec_src)
527             {
528                 strcpy(dst, spec_src);
529                 dst += strlen(spec_src);
530             }
531         }
532         else if (*s == '\"' || *s == '\'')
533         {
534             int stopMarker = *s++;
535             char tmpString[64];
536             int i = 0;
537
538             while (*s && *s != stopMarker)
539             {
540                 if (i+1 < sizeof(tmpString))
541                     tmpString[i++] = *s++;
542             }
543             if (*s)
544                 s++;
545             tmpString[i] = '\0';
546             strcpy(dst, tmpString);
547             dst += strlen(tmpString);
548         }
549         else
550         {
551             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
552                   spec, zh->m_group ? zh->m_group : "none");
553             return NULL;
554         }
555         *dst++ = 1;
556     }
557     if (dst == dstBuf)
558     {
559         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
560               fname, zh->m_group ? zh->m_group : "none");
561         return NULL;
562     }
563     *dst = '\0';
564
565     if (0) /* for debugging */
566     {
567         WRBUF w = wrbuf_hex_str(dstBuf);
568         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
569         wrbuf_destroy(w);
570     }
571
572     return dstBuf;
573 }
574
/* Record logging context: a file name, an offset within it and the
   record group. NOTE(review): no use of this struct is visible in this
   part of the file - confirm it is still needed. */
struct recordLogInfo {
    const char *fname;
    int recordOffset;
    struct recordGroup *rGroup;
};
580
581 /** \brief add the always-matches index entry and map to real record ID
582     \param ctrl record control
583     \param record_id custom record ID
584     \param sysno system record ID
585
586     This function serves two purposes.. It adds the always matches
587     entry and makes a pointer from the custom record ID (if defined)
588     back to the system record ID (sysno)
589     See zebra_recid_to_sysno .
590   */
591 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
592                             zint sysno)
593 {
594     RecWord word;
595     extract_init(ctrl, &word);
596     word.record_id = record_id;
597     /* we use the seqno as placeholder for a way to get back to
598        record database from _ALLRECORDS.. This is used if a custom
599        RECORD was defined */
600     word.seqno = sysno;
601     word.index_name = "_ALLRECORDS";
602     word.index_type = "w";
603
604     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
605                               "", 0);
606 }
607
/* Forward declaration: zebra_extract_file and zebra_buffer_extract_record
   below both hand their stream to this function. */
ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
                                       struct ZebraRecStream *stream,
                                       enum zebra_recctrl_action_t action,
                                       const char *recordType,
                                       zint *sysno,
                                       const char *match_criteria,
                                       const char *fname,
                                       RecType recType,
                                       void *recTypeClientData);
618
619
620 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
621                              enum zebra_recctrl_action_t action)
622 {
623     ZEBRA_RES r = ZEBRA_OK;
624     int i, fd;
625     char gprefix[128];
626     char ext[128];
627     char ext_res[128];
628     const char *original_record_type = 0;
629     RecType recType;
630     void *recTypeClientData;
631     struct ZebraRecStream stream, *streamp;
632
633     zebra_init_log_level();
634
635     if (!zh->m_group || !*zh->m_group)
636         *gprefix = '\0';
637     else
638         sprintf(gprefix, "%s.", zh->m_group);
639
640     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
641
642     /* determine file extension */
643     *ext = '\0';
644     for (i = strlen(fname); --i >= 0; )
645         if (fname[i] == '/')
646             break;
647         else if (fname[i] == '.')
648         {
649             strcpy(ext, fname+i+1);
650             break;
651         }
652     /* determine file type - depending on extension */
653     original_record_type = zh->m_record_type;
654     if (!zh->m_record_type)
655     {
656         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
657         zh->m_record_type = res_get(zh->res, ext_res);
658     }
659     if (!zh->m_record_type)
660     {
661         check_log_limit(zh);
662         if (zh->records_processed + zh->records_skipped
663             < zh->m_file_verbose_limit)
664             yaz_log(YLOG_LOG, "? %s", fname);
665         zh->records_skipped++;
666         return 0;
667     }
668     /* determine match criteria */
669     if (!zh->m_record_id)
670     {
671         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
672         zh->m_record_id = res_get(zh->res, ext_res);
673     }
674
675     if (!(recType =
676           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
677                           &recTypeClientData)))
678     {
679         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
680         return ZEBRA_FAIL;
681     }
682
683     switch(recType->version)
684     {
685     case 0:
686         break;
687     default:
688         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
689     }
690     if (sysno && (action == action_delete || action == action_a_delete))
691     {
692         streamp = 0;
693     }
694     else
695     {
696         char full_rep[1024];
697
698         if (zh->path_reg && !yaz_is_abspath(fname))
699         {
700             strcpy(full_rep, zh->path_reg);
701             strcat(full_rep, "/");
702             strcat(full_rep, fname);
703         }
704         else
705             strcpy(full_rep, fname);
706
707         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
708         {
709             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
710             zh->m_record_type = original_record_type;
711             return ZEBRA_FAIL;
712         }
713         streamp = &stream;
714         zebra_create_stream_fd(streamp, fd, 0);
715     }
716     r = zebra_extract_records_stream(zh, streamp,
717                                      action,
718                                      zh->m_record_type,
719                                      sysno,
720                                      0, /*match_criteria */
721                                      fname,
722                                      recType, recTypeClientData);
723     if (streamp)
724         stream.destroy(streamp);
725     zh->m_record_type = original_record_type;
726     return r;
727 }
728
729 /*
730   If sysno is provided, then it's used to identify the reocord.
731   If not, and match_criteria is provided, then sysno is guessed
732   If not, and a record is provided, then sysno is got from there
733
734  */
735
736 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
737                                       const char *buf, size_t buf_size,
738                                       enum zebra_recctrl_action_t action,
739                                       const char *recordType,
740                                       zint *sysno,
741                                       const char *match_criteria,
742                                       const char *fname)
743 {
744     struct ZebraRecStream stream;
745     ZEBRA_RES res;
746     void *clientData;
747     RecType recType = 0;
748
749     if (recordType && *recordType)
750     {
751         yaz_log(log_level_extract,
752                 "Record type explicitly specified: %s", recordType);
753         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
754                                   &clientData);
755     }
756     else
757     {
758         if (!(zh->m_record_type))
759         {
760             yaz_log(YLOG_WARN, "No such record type defined");
761             return ZEBRA_FAIL;
762         }
763         yaz_log(log_level_extract, "Get record type from rgroup: %s",
764                 zh->m_record_type);
765         recType = recType_byName(zh->reg->recTypes, zh->res,
766                                   zh->m_record_type, &clientData);
767         recordType = zh->m_record_type;
768     }
769
770     if (!recType)
771     {
772         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
773         return ZEBRA_FAIL;
774     }
775
776     zebra_create_stream_mem(&stream, buf, buf_size);
777
778     res = zebra_extract_records_stream(zh, &stream,
779                                        action,
780                                        recordType,
781                                        sysno,
782                                        match_criteria,
783                                        fname,
784                                        recType, clientData);
785     stream.destroy(&stream);
786     return res;
787 }
788
789 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
790                                              struct ZebraRecStream *stream,
791                                              enum zebra_recctrl_action_t action,
792                                              const char *recordType,
793                                              zint *sysno,
794                                              const char *match_criteria,
795                                              const char *fname,
796                                              RecType recType,
797                                              void *recTypeClientData,
798                                              int *more)
799
800 {
801     zint sysno0 = 0;
802     RecordAttr *recordAttr;
803     struct recExtractCtrl extractCtrl;
804     int r;
805     const char *matchStr = 0;
806     Record rec;
807     off_t start_offset = 0, end_offset = 0;
808     const char *pr_fname = fname;  /* filename to print .. */
809     int show_progress = zh->records_processed + zh->records_skipped
810         < zh->m_file_verbose_limit ? 1:0;
811
812     zebra_init_log_level();
813
814     if (!pr_fname)
815         pr_fname = "<no file>";  /* make it printable if file is omitted */
816
817     zebra_rec_keys_reset(zh->reg->keys);
818     zebra_rec_keys_reset(zh->reg->sortKeys);
819
820     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
821     {
822         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0],
823                                       zh->m_explain_database))
824             return ZEBRA_FAIL;
825     }
826
827     if (stream)
828     {
829         off_t null_offset = 0;
830         extractCtrl.stream = stream;
831
832         start_offset = stream->tellf(stream);
833
834         extractCtrl.first_record = start_offset ? 0 : 1;
835
836         stream->endf(stream, &null_offset);;
837
838         extractCtrl.init = extract_init;
839         extractCtrl.tokenAdd = extract_token_add;
840         extractCtrl.schemaAdd = extract_schema_add;
841         extractCtrl.dh = zh->reg->dh;
842         extractCtrl.handle = zh;
843         extractCtrl.match_criteria[0] = '\0';
844         extractCtrl.staticrank = 0;
845         extractCtrl.action = action;
846
847         init_extractCtrl(zh, &extractCtrl);
848
849         extract_set_store_data_prepare(&extractCtrl);
850
851         r = (*recType->extract)(recTypeClientData, &extractCtrl);
852
853         if (action == action_update)
854         {
855             action = extractCtrl.action;
856         }
857
858         switch (r)
859         {
860         case RECCTRL_EXTRACT_EOF:
861             return ZEBRA_FAIL;
862         case RECCTRL_EXTRACT_ERROR_GENERIC:
863             /* error occured during extraction ... */
864             yaz_log(YLOG_WARN, "extract error: generic");
865             return ZEBRA_FAIL;
866         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
867             /* error occured during extraction ... */
868             yaz_log(YLOG_WARN, "extract error: no such filter");
869             return ZEBRA_FAIL;
870         case RECCTRL_EXTRACT_SKIP:
871             if (show_progress)
872                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
873                          recordType, pr_fname, (zint) start_offset);
874             *more = 1;
875
876             end_offset = stream->endf(stream, 0);
877             if (end_offset)
878                 stream->seekf(stream, end_offset);
879
880             return ZEBRA_OK;
881         case RECCTRL_EXTRACT_OK:
882             break;
883         default:
884             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
885             return ZEBRA_FAIL;
886         }
887         end_offset = stream->endf(stream, 0);
888         if (end_offset)
889             stream->seekf(stream, end_offset);
890         else
891             end_offset = stream->tellf(stream);
892
893         if (extractCtrl.match_criteria[0])
894             match_criteria = extractCtrl.match_criteria;
895     }
896
897     *more = 1;
898
899     if (zh->m_flag_rw == 0)
900     {
901         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
902                 pr_fname, (zint) start_offset);
903         /* test mode .. Do not perform match */
904         return ZEBRA_OK;
905     }
906
907     if (!sysno)
908     {
909         sysno = &sysno0;
910
911         if (match_criteria && *match_criteria)
912             matchStr = match_criteria;
913         else
914         {
915             if (zh->m_record_id && *zh->m_record_id)
916             {
917                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname,
918                                                zh->m_record_id);
919                 if (!matchStr)
920                 {
921                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
922                              pr_fname, (zint) start_offset);
923                     return ZEBRA_FAIL;
924                 }
925                 if (0 && matchStr)
926                 {
927                     WRBUF w = wrbuf_alloc();
928                     size_t i;
929                     for (i = 0; i < strlen(matchStr); i++)
930                     {
931                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
932                     }
933                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
934                     wrbuf_destroy(w);
935                 }
936             }
937         }
938         if (matchStr)
939         {
940             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
941             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
942                                           matchStr);
943
944
945             if (log_level_extract)
946             {
947                 WRBUF w = wrbuf_hex_str(matchStr);
948                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
949                 wrbuf_destroy(w);
950             }
951             if (rinfo)
952             {
953                 assert(*rinfo == sizeof(*sysno));
954                 memcpy(sysno, rinfo+1, sizeof(*sysno));
955             }
956        }
957     }
958
959     if (! *sysno)
960     {
961         /* new record AKA does not exist already */
962         if (action == action_delete)
963         {
964             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
965                     pr_fname, (zint) start_offset);
966             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
967             return ZEBRA_FAIL;
968         }
969         else if (action == action_a_delete)
970         {
971             if (show_progress)
972                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
973                         pr_fname, (zint) start_offset);
974             return ZEBRA_OK;
975         }
976         else if (action == action_replace)
977         {
978             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
979                          pr_fname, (zint) start_offset);
980             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
981             return ZEBRA_FAIL;
982         }
983         if (show_progress)
984             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
985                      (zint) start_offset);
986         rec = rec_new(zh->reg->records);
987
988         *sysno = rec->sysno;
989
990
991         if (stream)
992         {
993             all_matches_add(&extractCtrl,
994                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
995                             *sysno);
996         }
997
998
999         recordAttr = rec_init_attr(zh->reg->zei, rec);
1000         if (extractCtrl.staticrank < 0)
1001         {
1002             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1003             extractCtrl.staticrank = 0;
1004         }
1005
1006         if (matchStr)
1007         {
1008             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1009             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1010                             sizeof(*sysno), sysno);
1011         }
1012
1013         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1014 #if FLUSH2
1015         extract_flush_record_keys2(zh, *sysno,
1016                                    zh->reg->keys, extractCtrl.staticrank,
1017                                    0, recordAttr->staticrank);
1018 #else
1019         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1020                                   extractCtrl.staticrank);
1021 #endif
1022         recordAttr->staticrank = extractCtrl.staticrank;
1023         zh->records_inserted++;
1024     }
1025     else
1026     {
1027         /* record already exists */
1028         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1029         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1030         if (action == action_insert)
1031         {
1032             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT,
1033                          recordType, pr_fname, (zint) start_offset);
1034             logRecord(zh);
1035             return ZEBRA_FAIL;
1036         }
1037
1038         rec = rec_get(zh->reg->records, *sysno);
1039         assert(rec);
1040
1041         if (stream)
1042         {
1043             all_matches_add(&extractCtrl,
1044                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1045                             *sysno);
1046         }
1047
1048         recordAttr = rec_init_attr(zh->reg->zei, rec);
1049
1050         /* decrease total size */
1051         zebraExplain_recordBytesIncrement(zh->reg->zei,
1052                                            - recordAttr->recordSize);
1053
1054         zebra_rec_keys_set_buf(delkeys,
1055                                rec->info[recInfo_delKeys],
1056                                rec->size[recInfo_delKeys],
1057                                0);
1058         zebra_rec_keys_set_buf(sortKeys,
1059                                rec->info[recInfo_sortKeys],
1060                                rec->size[recInfo_sortKeys],
1061                                0);
1062
1063         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1064 #if !FLUSH2
1065         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1066                                   recordAttr->staticrank);
1067 #endif
1068         if (action == action_delete || action == action_a_delete)
1069         {
1070             /* record going to be deleted */
1071 #if FLUSH2
1072             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1073                                        delkeys, recordAttr->staticrank);
1074 #endif
1075             if (zebra_rec_keys_empty(delkeys))
1076             {
1077                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1078                         pr_fname, (zint) start_offset);
1079                 yaz_log(YLOG_WARN, "cannot delete file above, "
1080                         "storeKeys false (3)");
1081             }
1082             else
1083             {
1084                 if (show_progress)
1085                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1086                             pr_fname, (zint) start_offset);
1087                 zh->records_deleted++;
1088                 if (matchStr)
1089                 {
1090                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1091                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1092                 }
1093                 rec_del(zh->reg->records, &rec);
1094             }
1095             zebra_rec_keys_close(delkeys);
1096             zebra_rec_keys_close(sortKeys);
1097             rec_free(&rec);
1098             logRecord(zh);
1099             return ZEBRA_OK;
1100         }
1101         else
1102         {   /* update or special_update */
1103             if (show_progress)
1104                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1105                         pr_fname, (zint) start_offset);
1106             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1107
1108 #if FLUSH2
1109             extract_flush_record_keys2(zh, *sysno,
1110                                        zh->reg->keys, extractCtrl.staticrank,
1111                                        delkeys, recordAttr->staticrank);
1112 #else
1113             extract_flush_record_keys(zh, *sysno, 1,
1114                                       zh->reg->keys, extractCtrl.staticrank);
1115 #endif
1116             recordAttr->staticrank = extractCtrl.staticrank;
1117             zh->records_updated++;
1118         }
1119         zebra_rec_keys_close(delkeys);
1120         zebra_rec_keys_close(sortKeys);
1121     }
1122     /* update file type */
1123     xfree(rec->info[recInfo_fileType]);
1124     rec->info[recInfo_fileType] =
1125         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1126
1127     /* update filename */
1128     xfree(rec->info[recInfo_filename]);
1129     rec->info[recInfo_filename] =
1130         rec_strdup(fname, &rec->size[recInfo_filename]);
1131
1132     /* update delete keys */
1133     xfree(rec->info[recInfo_delKeys]);
1134     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1135     {
1136         zebra_rec_keys_get_buf(zh->reg->keys,
1137                                &rec->info[recInfo_delKeys],
1138                                &rec->size[recInfo_delKeys]);
1139     }
1140     else
1141     {
1142         rec->info[recInfo_delKeys] = NULL;
1143         rec->size[recInfo_delKeys] = 0;
1144     }
1145     /* update sort keys */
1146     xfree(rec->info[recInfo_sortKeys]);
1147
1148     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1149                            &rec->info[recInfo_sortKeys],
1150                            &rec->size[recInfo_sortKeys]);
1151
1152     if (stream)
1153     {
1154         recordAttr->recordSize = end_offset - start_offset;
1155         zebraExplain_recordBytesIncrement(zh->reg->zei,
1156                                           recordAttr->recordSize);
1157     }
1158
1159     /* set run-number for this record */
1160     recordAttr->runNumber =
1161         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1162
1163     /* update store data */
1164     xfree(rec->info[recInfo_storeData]);
1165
1166     /* update store data */
1167     if (zh->store_data_buf)
1168     {
1169         rec->size[recInfo_storeData] = zh->store_data_size;
1170         rec->info[recInfo_storeData] = zh->store_data_buf;
1171         zh->store_data_buf = 0;
1172         recordAttr->recordSize = zh->store_data_size;
1173     }
1174     else if (zh->m_store_data)
1175     {
1176         off_t cur_offset = stream->tellf(stream);
1177
1178         rec->size[recInfo_storeData] = recordAttr->recordSize;
1179         rec->info[recInfo_storeData] = (char *)
1180             xmalloc(recordAttr->recordSize);
1181         stream->seekf(stream, start_offset);
1182         stream->readf(stream, rec->info[recInfo_storeData],
1183                       recordAttr->recordSize);
1184         stream->seekf(stream, cur_offset);
1185     }
1186     else
1187     {
1188         rec->info[recInfo_storeData] = NULL;
1189         rec->size[recInfo_storeData] = 0;
1190     }
1191     /* update database name */
1192     xfree(rec->info[recInfo_databaseName]);
1193     rec->info[recInfo_databaseName] =
1194         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]);
1195
1196     /* update offset */
1197     recordAttr->recordOffset = start_offset;
1198
1199     /* commit this record */
1200     rec_put(zh->reg->records, &rec);
1201     logRecord(zh);
1202     return ZEBRA_OK;
1203 }
1204
1205 /** \brief extracts records from stream
1206     \param zh Zebra Handle
1207     \param stream stream that we read from
1208     \param action (action_insert, action_replace, action_delete, ..)
1209     \param recordType Record filter type "grs.xml", etc.
1210     \param sysno pointer to sysno if already known; NULL otherwise
1211     \param match_criteria (NULL if not already given)
1212     \param fname filename that we read from (for logging purposes only)
1213     \param recType record type
1214     \param recTypeClientData client data for record type
1215     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1216 */
1217 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
1218                                        struct ZebraRecStream *stream,
1219                                        enum zebra_recctrl_action_t action,
1220                                        const char *recordType,
1221                                        zint *sysno,
1222                                        const char *match_criteria,
1223                                        const char *fname,
1224                                        RecType recType,
1225                                        void *recTypeClientData)
1226 {
1227     ZEBRA_RES res = ZEBRA_OK;
1228     while (1)
1229     {
1230         int more = 0;
1231         res = zebra_extract_record_stream(zh, stream,
1232                                           action,
1233                                           recordType,
1234                                           sysno,
1235                                           match_criteria,
1236                                           fname,
1237                                           recType, recTypeClientData, &more);
1238         if (!more)
1239         {
1240             res = ZEBRA_OK;
1241             break;
1242         }
1243         if (res != ZEBRA_OK)
1244             break;
1245         if (sysno)
1246             break;
1247     }
1248     return res;
1249 }
1250
1251 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1252 {
1253     ZebraHandle zh = (ZebraHandle) handle;
1254     struct recExtractCtrl extractCtrl;
1255
1256     if (zebraExplain_curDatabase(zh->reg->zei,
1257                                   rec->info[recInfo_databaseName]))
1258     {
1259         abort();
1260         if (zebraExplain_newDatabase(zh->reg->zei,
1261                                       rec->info[recInfo_databaseName], 0))
1262             abort();
1263     }
1264
1265     zebra_rec_keys_reset(zh->reg->keys);
1266     zebra_rec_keys_reset(zh->reg->sortKeys);
1267
1268     extractCtrl.init = extract_init;
1269     extractCtrl.tokenAdd = extract_token_add;
1270     extractCtrl.schemaAdd = extract_schema_add;
1271     extractCtrl.dh = zh->reg->dh;
1272
1273     init_extractCtrl(zh, &extractCtrl);
1274
1275     extractCtrl.flagShowRecords = 0;
1276     extractCtrl.match_criteria[0] = '\0';
1277     extractCtrl.staticrank = 0;
1278     extractCtrl.action = action_update;
1279
1280     extractCtrl.handle = handle;
1281     extractCtrl.first_record = 1;
1282
1283     extract_set_store_data_prepare(&extractCtrl);
1284
1285     if (n)
1286         grs_extract_tree(&extractCtrl, n);
1287
1288     if (rec->size[recInfo_delKeys])
1289     {
1290         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1291
1292         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1293
1294         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1295                                rec->size[recInfo_delKeys],
1296                                0);
1297 #if FLUSH2
1298         extract_flush_record_keys2(zh, rec->sysno,
1299                                    zh->reg->keys, 0, delkeys, 0);
1300 #else
1301         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1302         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1303 #endif
1304         zebra_rec_keys_close(delkeys);
1305
1306         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1307                                rec->size[recInfo_sortKeys],
1308                                0);
1309
1310         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1311         zebra_rec_keys_close(sortkeys);
1312     }
1313     else
1314     {
1315 #if FLUSH2
1316         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1317 #else
1318         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1319 #endif
1320     }
1321     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1322
1323     xfree(rec->info[recInfo_delKeys]);
1324     zebra_rec_keys_get_buf(zh->reg->keys,
1325                            &rec->info[recInfo_delKeys],
1326                            &rec->size[recInfo_delKeys]);
1327
1328     xfree(rec->info[recInfo_sortKeys]);
1329     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1330                            &rec->info[recInfo_sortKeys],
1331                            &rec->size[recInfo_sortKeys]);
1332     return ZEBRA_OK;
1333 }
1334
1335 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1336                            const char *str, size_t slen, NMEM nmem, int level)
1337 {
1338     char keystr[200]; /* room for zints to print */
1339     char *dst_term = 0;
1340     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1341     const char *index_type;
1342     int i;
1343     const char *string_index;
1344
1345     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1346                             0/* db */, &string_index);
1347     assert(index_type);
1348     zebra_term_untrans_iconv(zh, nmem, index_type,
1349                              &dst_term, str);
1350     *keystr = '\0';
1351     for (i = 0; i < key->len; i++)
1352     {
1353         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1354     }
1355
1356     if (*str < CHR_BASE_CHAR)
1357     {
1358         int i;
1359         char dst_buf[200]; /* room for special chars */
1360
1361         strcpy(dst_buf , "?");
1362
1363         if (!strcmp(str, ""))
1364             strcpy(dst_buf, "alwaysmatches");
1365         if (!strcmp(str, FIRST_IN_FIELD_STR))
1366             strcpy(dst_buf, "firstinfield");
1367         else if (!strcmp(str, CHR_UNKNOWN))
1368             strcpy(dst_buf, "unknown");
1369         else if (!strcmp(str, CHR_SPACE))
1370             strcpy(dst_buf, "space");
1371
1372         for (i = 0; i<slen; i++)
1373         {
1374             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1375         }
1376         yaz_log(level, "%s%s %s %s", keystr, index_type,
1377                 string_index, dst_buf);
1378
1379     }
1380     else
1381         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1382                 string_index, dst_term);
1383 }
1384
1385 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1386                           zebra_rec_keys_t reckeys,
1387                           int level)
1388 {
1389     if (zebra_rec_keys_rewind(reckeys))
1390     {
1391         size_t slen;
1392         const char *str;
1393         struct it_key key;
1394         NMEM nmem = nmem_create();
1395
1396         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1397         {
1398             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1399             nmem_reset(nmem);
1400         }
1401         nmem_destroy(nmem);
1402     }
1403 }
1404
1405 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1406                              zebra_rec_keys_t reckeys)
1407 {
1408     ZebraExplainInfo zei = zh->reg->zei;
1409     struct ord_stat {
1410         int no;
1411         int ord;
1412         struct ord_stat *next;
1413     };
1414
1415     if (zebra_rec_keys_rewind(reckeys))
1416     {
1417         struct ord_stat *ord_list = 0;
1418         struct ord_stat *p;
1419         size_t slen;
1420         const char *str;
1421         struct it_key key_in;
1422         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1423         {
1424             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1425
1426             for (p = ord_list; p ; p = p->next)
1427                 if (p->ord == ord)
1428                 {
1429                     p->no++;
1430                     break;
1431                 }
1432             if (!p)
1433             {
1434                 p = xmalloc(sizeof(*p));
1435                 p->no = 1;
1436                 p->ord = ord;
1437                 p->next = ord_list;
1438                 ord_list = p;
1439             }
1440         }
1441
1442         p = ord_list;
1443         while (p)
1444         {
1445             struct ord_stat *p1 = p;
1446
1447             if (is_insert)
1448                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1449             else
1450                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1451             p = p->next;
1452             xfree(p1);
1453         }
1454     }
1455 }
1456
1457 #if FLUSH2
1458 static void extract_flush_record_keys2(
1459     ZebraHandle zh, zint sysno,
1460     zebra_rec_keys_t ins_keys, zint ins_rank,
1461     zebra_rec_keys_t del_keys, zint del_rank)
1462 {
1463     ZebraExplainInfo zei = zh->reg->zei;
1464     int normal = 0;
1465     int optimized = 0;
1466
1467     if (!zh->reg->key_block)
1468     {
1469         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1470         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1471         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1472         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1473     }
1474
1475     if (ins_keys)
1476     {
1477         extract_rec_keys_adjust(zh, 1, ins_keys);
1478         if (!del_keys)
1479             zebraExplain_recordCountIncrement(zei, 1);
1480         zebra_rec_keys_rewind(ins_keys);
1481     }
1482     if (del_keys)
1483     {
1484         extract_rec_keys_adjust(zh, 0, del_keys);
1485         if (!ins_keys)
1486             zebraExplain_recordCountIncrement(zei, -1);
1487         zebra_rec_keys_rewind(del_keys);
1488     }
1489
1490     while (1)
1491     {
1492         size_t del_slen;
1493         const char *del_str;
1494         struct it_key del_key_in;
1495         int del = 0;
1496
1497         size_t ins_slen;
1498         const char *ins_str;
1499         struct it_key ins_key_in;
1500         int ins = 0;
1501
1502         if (del_keys)
1503             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1504                                       &del_key_in);
1505         if (ins_keys)
1506             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1507                                       &ins_key_in);
1508
1509         if (del && ins && ins_rank == del_rank
1510             && !key_compare(&del_key_in, &ins_key_in)
1511             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1512         {
1513             optimized++;
1514             continue;
1515         }
1516         if (!del && !ins)
1517             break;
1518
1519         normal++;
1520         if (del)
1521             key_block_write(zh->reg->key_block, sysno,
1522                             &del_key_in, 0, del_str, del_slen,
1523                             del_rank, zh->m_staticrank);
1524         if (ins)
1525             key_block_write(zh->reg->key_block, sysno,
1526                             &ins_key_in, 1, ins_str, ins_slen,
1527                             ins_rank, zh->m_staticrank);
1528     }
1529     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1530 }
1531 #else
1532 static void extract_flush_record_keys(
1533     ZebraHandle zh, zint sysno, int cmd,
1534     zebra_rec_keys_t reckeys,
1535     zint staticrank)
1536 {
1537     ZebraExplainInfo zei = zh->reg->zei;
1538
1539     extract_rec_keys_adjust(zh, cmd, reckeys);
1540
1541     if (log_level_details)
1542     {
1543         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1544                 sysno, cmd ? "insert" : "delete");
1545         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1546     }
1547
1548     if (!zh->reg->key_block)
1549     {
1550         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1551         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1552         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1553         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1554     }
1555     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1556
1557 #if 0
1558     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1559     print_rec_keys(zh, reckeys);
1560 #endif
1561     if (zebra_rec_keys_rewind(reckeys))
1562     {
1563         size_t slen;
1564         const char *str;
1565         struct it_key key_in;
1566         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1567         {
1568             key_block_write(zh->reg->key_block, sysno,
1569                             &key_in, cmd, str, slen,
1570                             staticrank, zh->m_staticrank);
1571         }
1572     }
1573 }
1574 #endif
1575
1576 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1577                                      zebra_rec_keys_t reckeys,
1578                                      zebra_snippets *snippets)
1579 {
1580     NMEM nmem = nmem_create();
1581     if (zebra_rec_keys_rewind(reckeys))
1582     {
1583         const char *str;
1584         size_t slen;
1585         struct it_key key;
1586         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1587         {
1588             char *dst_term = 0;
1589             int ord;
1590             zint seqno;
1591             const char *index_type;
1592
1593             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1594             seqno = key.mem[key.len-1];
1595             ord = CAST_ZINT_TO_INT(key.mem[0]);
1596
1597             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1598                                     0/* db */, 0 /* string_index */);
1599             assert(index_type);
1600             zebra_term_untrans_iconv(zh, nmem, index_type,
1601                                      &dst_term, str);
1602             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1603             nmem_reset(nmem);
1604         }
1605     }
1606     nmem_destroy(nmem);
1607     return ZEBRA_OK;
1608 }
1609
1610 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1611 {
1612     yaz_log(YLOG_LOG, "print_rec_keys");
1613     if (zebra_rec_keys_rewind(reckeys))
1614     {
1615         const char *str;
1616         size_t slen;
1617         struct it_key key;
1618         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1619         {
1620             char dst_buf[IT_MAX_WORD];
1621             zint seqno;
1622             const char *index_type;
1623             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1624             const char *db = 0;
1625             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1626
1627             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1628
1629             seqno = key.mem[key.len-1];
1630
1631             zebra_term_untrans(zh, index_type, dst_buf, str);
1632
1633             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT
1634                     " term=%s", ord, seqno, dst_buf);
1635         }
1636     }
1637 }
1638
1639 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1640                                      const char *str, int length)
1641 {
1642     struct it_key key;
1643     ZebraHandle zh = p->extractCtrl->handle;
1644     ZebraExplainInfo zei = zh->reg->zei;
1645     int ch, i;
1646
1647     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1648     if (ch < 0)
1649         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1650
1651     i = 0;
1652     key.mem[i++] = ch;
1653     key.mem[i++] = p->record_id;
1654     key.mem[i++] = p->section_id;
1655
1656     if (zh->m_segment_indexing)
1657         key.mem[i++] = p->segment;
1658     key.mem[i++] = p->seqno;
1659     key.len = i;
1660
1661     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1662 }
1663
1664 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1665 {
1666     struct it_key key;
1667     ZebraHandle zh = p->extractCtrl->handle;
1668     ZebraExplainInfo zei = zh->reg->zei;
1669     int ch;
1670     zinfo_index_category_t cat = zinfo_index_category_sort;
1671
1672     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1673     if (ch < 0)
1674         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1675     key.len = 3;
1676     key.mem[0] = ch;
1677     key.mem[1] = p->record_id;
1678     key.mem[2] = p->section_id;
1679
1680     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1681 }
1682
1683 static void extract_add_staticrank_string(RecWord *p,
1684                                           const char *str, int length)
1685 {
1686     char valz[40];
1687     struct recExtractCtrl *ctrl = p->extractCtrl;
1688
1689     if (length > sizeof(valz)-1)
1690         length = sizeof(valz)-1;
1691
1692     memcpy(valz, str, length);
1693     valz[length] = '\0';
1694     ctrl->staticrank = atozint(valz);
1695 }
1696
1697 static void extract_add_string(RecWord *p, zebra_map_t zm,
1698                                const char *string, int length)
1699 {
1700     assert(length > 0);
1701
1702     if (!p->index_name)
1703         return;
1704     if (log_level_details)
1705     {
1706
1707         WRBUF w = wrbuf_alloc();
1708
1709         wrbuf_write_escaped(w, string, length);
1710         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1711         wrbuf_destroy(w);
1712     }
1713     if (zebra_maps_is_index(zm))
1714     {
1715         extract_add_index_string(p, zinfo_index_category_index,
1716                                  string, length);
1717         if (zebra_maps_is_alwaysmatches(zm))
1718         {
1719             RecWord word;
1720             memcpy(&word, p, sizeof(word));
1721
1722             word.seqno = 1;
1723             extract_add_index_string(
1724                 &word, zinfo_index_category_alwaysmatches, "", 0);
1725         }
1726     }
1727     else if (zebra_maps_is_sort(zm))
1728     {
1729         extract_add_sort_string(p, string, length);
1730     }
1731     else if (zebra_maps_is_staticrank(zm))
1732     {
1733         extract_add_staticrank_string(p, string, length);
1734     }
1735 }
1736
1737 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1738 {
1739     const char *b = p->term_buf;
1740     int remain = p->term_len;
1741     int first = 1;
1742     const char **map = 0;
1743
1744     if (remain > 0)
1745         map = zebra_maps_input(zm, &b, remain, 0);
1746
1747     while (map)
1748     {
1749         char buf[IT_MAX_WORD+1];
1750         int i, remain;
1751
1752         /* Skip spaces */
1753         while (map && *map && **map == *CHR_SPACE)
1754         {
1755             remain = p->term_len - (b - p->term_buf);
1756             if (remain > 0)
1757                 map = zebra_maps_input(zm, &b, remain, 0);
1758             else
1759                 map = 0;
1760         }
1761         if (!map)
1762             break;
1763         i = 0;
1764         while (map && *map && **map != *CHR_SPACE)
1765         {
1766             const char *cp = *map;
1767
1768             while (i < IT_MAX_WORD && *cp)
1769                 buf[i++] = *(cp++);
1770             remain = p->term_len - (b - p->term_buf);
1771             if (remain > 0)
1772                 map = zebra_maps_input(zm, &b, remain, 0);
1773             else
1774                 map = 0;
1775         }
1776         if (!i)
1777             return;
1778
1779         if (first)
1780         {
1781             first = 0;
1782             if (zebra_maps_is_first_in_field(zm))
1783             {
1784                 /* first in field marker */
1785                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1786                 p->seqno++;
1787             }
1788         }
1789         extract_add_string(p, zm, buf, i);
1790         p->seqno++;
1791     }
1792 }
1793
1794 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1795 {
1796     char buf[IT_MAX_WORD+1];
1797     int i = parse_complete_field(p, zm, buf);
1798     if (!i)
1799         return;
1800     extract_add_string(p, zm, buf, i);
1801     p->seqno++;
1802 }
1803
1804 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1805 {
1806     const char *res_buf = 0;
1807     size_t res_len = 0;
1808
1809     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1810     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1811     {
1812         if (res_len > IT_MAX_WORD)
1813         {
1814             yaz_log(YLOG_LOG, "Truncating long term %ld", (long) res_len);
1815             res_len = IT_MAX_WORD;
1816         }
1817         extract_add_string(p, zm, res_buf, res_len);
1818         p->seqno++;
1819     }
1820 }
1821
1822
1823 /** \brief top-level indexing handler for recctrl system
1824     \param p token data to be indexed
1825
1826     Call sequence:
1827     extract_token_add
1828     extract_add_{in}_complete / extract_add_icu
1829     extract_add_string
1830
1831     extract_add_index_string
1832     or
1833     extract_add_sort_string
1834     or
1835     extract_add_staticrank_string
1836
1837 */
1838 static void extract_token_add(RecWord *p)
1839 {
1840     ZebraHandle zh = p->extractCtrl->handle;
1841     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1842
1843     if (log_level_details)
1844     {
1845         yaz_log(log_level_details, "extract_token_add "
1846                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1847                 p->index_type, p->index_name,
1848                 p->seqno, p->term_len, p->term_buf);
1849     }
1850     if (zebra_maps_is_icu(zm))
1851     {
1852         extract_add_icu(p, zm);
1853     }
1854     else
1855     {
1856         if (zebra_maps_is_complete(zm))
1857             extract_add_complete_field(p, zm);
1858         else
1859             extract_add_incomplete_field(p, zm);
1860     }
1861 }
1862
1863 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1864                                       void *buf, size_t sz)
1865 {
1866     ZebraHandle zh = (ZebraHandle) p->handle;
1867
1868     xfree(zh->store_data_buf);
1869     zh->store_data_buf = 0;
1870     zh->store_data_size = 0;
1871     if (buf && sz)
1872     {
1873         zh->store_data_buf = xmalloc(sz);
1874         zh->store_data_size = sz;
1875         memcpy(zh->store_data_buf, buf, sz);
1876     }
1877 }
1878
1879 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1880 {
1881     ZebraHandle zh = (ZebraHandle) p->handle;
1882     xfree(zh->store_data_buf);
1883     zh->store_data_buf = 0;
1884     zh->store_data_size = 0;
1885     p->setStoreData = extract_set_store_data_cb;
1886 }
1887
1888 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1889 {
1890     ZebraHandle zh = (ZebraHandle) p->handle;
1891     zebraExplain_addSchema(zh->reg->zei, oid);
1892 }
1893
1894 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1895                              int cmd, zebra_rec_keys_t reckeys)
1896 {
1897 #if 0
1898     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1899             cmd, sysno);
1900     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1901 #endif
1902
1903     if (zebra_rec_keys_rewind(reckeys))
1904     {
1905         zebra_sort_index_t si = zh->reg->sort_index;
1906         size_t slen;
1907         const char *str;
1908         struct it_key key_in;
1909
1910         NMEM nmem = nmem_create();
1911         struct sort_add_ent {
1912             int ord;
1913             int cmd;
1914             struct sort_add_ent *next;
1915             WRBUF wrbuf;
1916             zint sysno;
1917             zint section_id;
1918         };
1919         struct sort_add_ent *sort_ent_list = 0;
1920
1921         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1922         {
1923             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1924             zint filter_sysno = key_in.mem[1];
1925             zint section_id = key_in.mem[2];
1926
1927             struct sort_add_ent **e = &sort_ent_list;
1928             for (; *e; e = &(*e)->next)
1929                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1930                     break;
1931             if (!*e)
1932             {
1933                 *e = nmem_malloc(nmem, sizeof(**e));
1934                 (*e)->next = 0;
1935                 (*e)->wrbuf = wrbuf_alloc();
1936                 (*e)->ord = ord;
1937                 (*e)->cmd = cmd;
1938                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1939                 (*e)->section_id = section_id;
1940             }
1941
1942             wrbuf_write((*e)->wrbuf, str, slen);
1943             wrbuf_putc((*e)->wrbuf, '\0');
1944         }
1945         if (sort_ent_list)
1946         {
1947             zint last_sysno = 0;
1948             struct sort_add_ent *e = sort_ent_list;
1949             for (; e; e = e->next)
1950             {
1951                 if (last_sysno != e->sysno)
1952                 {
1953                     zebra_sort_sysno(si, e->sysno);
1954                     last_sysno = e->sysno;
1955                 }
1956                 zebra_sort_type(si, e->ord);
1957                 if (e->cmd == 1)
1958                     zebra_sort_add(si, e->section_id, e->wrbuf);
1959                 else
1960                     zebra_sort_delete(si, e->section_id);
1961                 wrbuf_destroy(e->wrbuf);
1962             }
1963         }
1964         nmem_destroy(nmem);
1965     }
1966 }
1967
1968 /*
1969  * Local variables:
1970  * c-basic-offset: 4
1971  * c-file-style: "Stroustrup"
1972  * indent-tabs-mode: nil
1973  * End:
1974  * vim: shiftwidth=4 tabstop=8 expandtab
1975  */
1976