a93373ce8f259656e1c6d37299c33ae9bb483729
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
    \brief indexes records and extracts tokens for indexing and sorting
22 */
23
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
/* Log levels used in this file; filled in once by zebra_init_log_level() */
static int log_level_extract = 0;      /* level of the "extract" log module */
static int log_level_details = 0;      /* level of the "indexdetails" log module */
static int log_level_initialized = 0;  /* set to 1 once the levels are resolved */
48
/* 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52
53 #if FLUSH2
54 static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
55                                        zebra_rec_keys_t ins_keys,
56                                        zint ins_rank,
57                                        zebra_rec_keys_t del_keys,
58                                        zint del_rank);
59 #else
60 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
61                                       int cmd,
62                                       zebra_rec_keys_t reckeys,
63                                       zint staticrank);
64 #endif
65
66 static void zebra_init_log_level(void)
67 {
68     if (!log_level_initialized)
69     {
70         log_level_initialized = 1;
71
72         log_level_extract = yaz_log_module_level("extract");
73         log_level_details = yaz_log_module_level("indexdetails");
74     }
75 }
76
77 static WRBUF wrbuf_hex_str(const char *cstr)
78 {
79     size_t i;
80     WRBUF w = wrbuf_alloc();
81     for (i = 0; cstr[i]; i++)
82     {
83         if (cstr[i] < ' ' || cstr[i] > 126)
84             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
85         else
86             wrbuf_putc(w, cstr[i]);
87     }
88     return w;
89 }
90
91
92 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
93                                     int cmd, zebra_rec_keys_t skp);
94 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
95 static void extract_token_add(RecWord *p);
96
97 static void check_log_limit(ZebraHandle zh)
98 {
99     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
100     {
101         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
102                 zh->m_file_verbose_limit);
103     }
104 }
105
106 static void logRecord(ZebraHandle zh)
107 {
108     check_log_limit(zh);
109     ++zh->records_processed;
110     if (!(zh->records_processed % 1000))
111     {
112         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
113                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
114                 zh->records_processed, zh->records_inserted, 
115                 zh->records_updated, zh->records_deleted);
116     }
117 }
118
119 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
120 {
121     ctrl->flagShowRecords = !zh->m_flag_rw;
122 }
123
124
125 static void extract_add_index_string(RecWord *p, 
126                                       zinfo_index_category_t cat,
127                                       const char *str, int length);
128
129 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
130
131 static void extract_init(struct recExtractCtrl *p, RecWord *w)
132 {
133     w->seqno = 1;
134     w->index_name = "any";
135     w->index_type = "w";
136     w->extractCtrl = p;
137     w->record_id = 0;
138     w->section_id = 0;
139     w->segment = 0;
140 }
141
/* Handle passed through recExtractCtrl.handle while collecting snippets
   (set up in extract_snippet, read by the snippet_* callbacks) */
struct snip_rec_info {
    ZebraHandle zh;           /* session the snippets are extracted for */
    zebra_snippets *snippets; /* destination the callbacks append to */
};
146
147
148 static void snippet_add_complete_field(RecWord *p, int ord,
149                                        zebra_map_t zm)
150 {
151     struct snip_rec_info *h = p->extractCtrl->handle;
152     if (p->term_len && p->term_buf && zebra_maps_is_index(zm))
153         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
154                                p->term_buf, p->term_len);
155     p->seqno++;
156 }
157
158 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
159 {
160     struct snip_rec_info *h = p->extractCtrl->handle;
161     const char *b = p->term_buf;
162     int remain = p->term_len;
163     int first = 1;
164     const char **map = 0;
165     const char *start = b;
166     const char *last = b;
167
168     if (remain > 0)
169         map = zebra_maps_input(zm, &b, remain, 0);
170
171     while (map)
172     {
173         int remain;
174
175         /* Skip spaces */
176         while (map && *map && **map == *CHR_SPACE)
177         {
178             remain = p->term_len - (b - p->term_buf);
179             last = b;
180             if (remain > 0)
181                 map = zebra_maps_input(zm, &b, remain, 0);
182             else
183                 map = 0;
184         }
185         if (!map)
186             break;
187         if (start != last && zebra_maps_is_index(zm))
188         {
189             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
190                                    start, last - start);
191         }
192         start = last;
193         while (map && *map && **map != *CHR_SPACE)
194         {
195             remain = p->term_len - (b - p->term_buf);
196             last = b;
197             if (remain > 0)
198                 map = zebra_maps_input(zm, &b, remain, 0);
199             else
200                 map = 0;
201         }
202         if (start == last)
203             return ;
204
205         if (first)
206         {   
207             first = 0;
208             if (zebra_maps_is_first_in_field(zm))
209             {
210                 /* first in field marker */
211                 p->seqno++;
212             }
213         }
214         if (start != last && zebra_maps_is_index(zm))
215             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
216                                    start, last - start);
217         start = last;
218         p->seqno++;
219     }
220
221 }
222
223 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
224 {
225     struct snip_rec_info *h = p->extractCtrl->handle;
226
227     const char *res_buf = 0;
228     size_t res_len = 0;
229
230     const char *display_buf = 0;
231     size_t display_len = 0;
232
233     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
234     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
235                                    &display_buf, &display_len))
236     {
237         if (zebra_maps_is_index(zm))
238             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
239                                    display_buf, display_len);
240         p->seqno++;
241     }
242 }
243
244 static void snippet_token_add(RecWord *p)
245 {
246     struct snip_rec_info *h = p->extractCtrl->handle;
247     ZebraHandle zh = h->zh;
248     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
249
250     if (zm)
251     {
252         ZebraExplainInfo zei = zh->reg->zei;
253         int ch = zebraExplain_lookup_attr_str(
254             zei, zinfo_index_category_index, p->index_type, p->index_name);
255
256         if (zebra_maps_is_icu(zm))
257             snippet_add_icu(p, ch, zm);
258         else
259         {
260             if (zebra_maps_is_complete(zm))
261                 snippet_add_complete_field(p, ch, zm);
262             else
263                 snippet_add_incomplete_field(p, ch, zm);
264         }
265     }
266 }
267
268 static void snippet_schema_add(
269     struct recExtractCtrl *p, Odr_oid *oid)
270 {
271
272 }
273
274 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
275                      struct ZebraRecStream *stream,
276                      RecType rt, void *recTypeClientData)
277 {
278     struct recExtractCtrl extractCtrl;
279     struct snip_rec_info info;
280
281     extractCtrl.stream = stream;
282     extractCtrl.first_record = 1;
283     extractCtrl.init = extract_init;
284     extractCtrl.tokenAdd = snippet_token_add;
285     extractCtrl.schemaAdd = snippet_schema_add;
286     assert(zh->reg);
287     assert(zh->reg->dh);
288
289     extractCtrl.dh = zh->reg->dh;
290     
291     info.zh = zh;
292     info.snippets = sn;
293     extractCtrl.handle = &info;
294     extractCtrl.match_criteria[0] = '\0';
295     extractCtrl.staticrank = 0;
296     extractCtrl.action = action_insert;
297     
298     init_extractCtrl(zh, &extractCtrl);
299
300     extractCtrl.setStoreData = 0;
301
302     (*rt->extract)(recTypeClientData, &extractCtrl);
303 }
304
305 static void searchRecordKey(ZebraHandle zh,
306                             zebra_rec_keys_t reckeys,
307                             const char *index_name,
308                             const char **ws, int ws_length)
309 {
310     int i;
311     int ch = -1;
312     zinfo_index_category_t cat = zinfo_index_category_index;
313
314     for (i = 0; i<ws_length; i++)
315         ws[i] = NULL;
316
317     if (ch < 0)
318         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
319     if (ch < 0)
320         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
321     if (ch < 0)
322         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
323
324     if (ch < 0)
325         return ;
326
327     if (zebra_rec_keys_rewind(reckeys))
328     {
329         zint startSeq = -1;
330         const char *str;
331         size_t slen;
332         struct it_key key;
333         zint seqno;
334         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
335         {
336             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
337
338             seqno = key.mem[key.len-1];
339             
340             if (key.mem[0] == ch)
341             {
342                 zint woff;
343                 
344                 if (startSeq == -1)
345                     startSeq = seqno;
346                 woff = seqno - startSeq;
347                 if (woff >= 0 && woff < ws_length)
348                     ws[woff] = str;
349             }
350         }
351     }
352 }
353
354 #define FILE_MATCH_BLANK "\t "
355
356 static char *get_match_from_spec(ZebraHandle zh,
357                           zebra_rec_keys_t reckeys,
358                           const char *fname, const char *spec)
359 {
360     static char dstBuf[2048];      /* static here ??? */
361     char *dst = dstBuf;
362     const char *s = spec;
363
364     while (1)
365     {
366         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
367             ;
368         if (!*s)
369             break;
370         if (*s == '(')
371         {
372             const char *ws[32];
373             char attset_str[64], attname_str[64];
374             int i;
375             int first = 1;
376             
377             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
378                 ;
379             for (i = 0; *s && *s != ',' && *s != ')' && 
380                      !strchr(FILE_MATCH_BLANK, *s); s++)
381                 if (i+1 < sizeof(attset_str))
382                     attset_str[i++] = *s;
383             attset_str[i] = '\0';
384             
385             for (; strchr(FILE_MATCH_BLANK, *s); s++)
386                 ;
387             if (*s != ',')
388                 strcpy(attname_str, attset_str);
389             else
390             {
391                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
392                     ;
393                 for (i = 0; *s && *s != ')' && 
394                          !strchr(FILE_MATCH_BLANK, *s); s++)
395                     if (i+1 < sizeof(attname_str))
396                         attname_str[i++] = *s;
397                 attname_str[i] = '\0';
398             }
399             if (*s != ')')
400             {
401                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
402                       spec, zh->m_group ? zh->m_group : "none");
403                 return NULL;
404             }
405             s++;
406
407             searchRecordKey(zh, reckeys, attname_str, ws, 32);
408             if (0) /* for debugging */
409             {   
410                 for (i = 0; i<32; i++)
411                 {
412                     if (ws[i])
413                     {
414                         WRBUF w = wrbuf_hex_str(ws[i]);
415                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
416                         wrbuf_destroy(w);
417                     }
418                 }
419             }
420
421             for (i = 0; i<32; i++)
422                 if (ws[i])
423                 {
424                     if (first)
425                     {
426                         *dst++ = ' ';
427                         first = 0;
428                     }
429                     strcpy(dst, ws[i]);
430                     dst += strlen(ws[i]);
431                 }
432             if (first)
433             {
434                 yaz_log(YLOG_WARN, "Record didn't contain match"
435                       " fields in (%s,%s)", attset_str, attname_str);
436                 return NULL;
437             }
438         }
439         else if (*s == '$')
440         {
441             int spec_len;
442             char special[64];
443             const char *spec_src = NULL;
444             const char *s1 = ++s;
445             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
446                 s1++;
447
448             spec_len = s1 - s;
449             if (spec_len > sizeof(special)-1)
450                 spec_len = sizeof(special)-1;
451             memcpy(special, s, spec_len);
452             special[spec_len] = '\0';
453             s = s1;
454
455             if (!strcmp(special, "group"))
456                 spec_src = zh->m_group;
457             else if (!strcmp(special, "database"))
458                 spec_src = zh->basenames[0];
459             else if (!strcmp(special, "filename")) {
460                 spec_src = fname;
461             }
462             else if (!strcmp(special, "type"))
463                 spec_src = zh->m_record_type;
464             else 
465                 spec_src = NULL;
466             if (spec_src)
467             {
468                 strcpy(dst, spec_src);
469                 dst += strlen(spec_src);
470             }
471         }
472         else if (*s == '\"' || *s == '\'')
473         {
474             int stopMarker = *s++;
475             char tmpString[64];
476             int i = 0;
477
478             while (*s && *s != stopMarker)
479             {
480                 if (i+1 < sizeof(tmpString))
481                     tmpString[i++] = *s++;
482             }
483             if (*s)
484                 s++;
485             tmpString[i] = '\0';
486             strcpy(dst, tmpString);
487             dst += strlen(tmpString);
488         }
489         else
490         {
491             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
492                   spec, zh->m_group ? zh->m_group : "none");
493             return NULL;
494         }
495         *dst++ = 1;
496     }
497     if (dst == dstBuf)
498     {
499         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
500               fname, zh->m_group ? zh->m_group : "none");
501         return NULL;
502     }
503     *dst = '\0';
504
505     if (0) /* for debugging */
506     {
507         WRBUF w = wrbuf_hex_str(dstBuf);
508         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
509         wrbuf_destroy(w);
510     }
511
512     return dstBuf;
513 }
514
/* Location information for a record; not referenced in the part of the
   file reviewed here.  NOTE(review): confirm whether it is still used. */
struct recordLogInfo {
    const char *fname;           /* file name */
    int recordOffset;            /* offset of the record */
    struct recordGroup *rGroup;  /* record group */
};
520
521 /** \brief add the always-matches index entry and map to real record ID
522     \param ctrl record control
523     \param record_id custom record ID
524     \param sysno system record ID
525     
526     This function serves two purposes.. It adds the always matches
527     entry and makes a pointer from the custom record ID (if defined)
528     back to the system record ID (sysno)
529     See zebra_recid_to_sysno .
530   */
531 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
532                             zint sysno)
533 {
534     RecWord word;
535     extract_init(ctrl, &word);
536     word.record_id = record_id;
537     /* we use the seqno as placeholder for a way to get back to
538        record database from _ALLRECORDS.. This is used if a custom
539        RECORD was defined */
540     word.seqno = sysno;
541     word.index_name = "_ALLRECORDS";
542     word.index_type = "w";
543
544     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
545                               "", 0);
546 }
547
548 /* forward declaration */
549 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
550                                        struct ZebraRecStream *stream,
551                                        enum zebra_recctrl_action_t action,
552                                        const char *recordType,
553                                        zint *sysno,
554                                        const char *match_criteria,
555                                        const char *fname,
556                                        RecType recType,
557                                        void *recTypeClientData);
558
559
560 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
561                              enum zebra_recctrl_action_t action)
562 {
563     ZEBRA_RES r = ZEBRA_OK;
564     int i, fd;
565     char gprefix[128];
566     char ext[128];
567     char ext_res[128];
568     const char *original_record_type = 0;
569     RecType recType;
570     void *recTypeClientData;
571     struct ZebraRecStream stream, *streamp;
572
573     zebra_init_log_level();
574
575     if (!zh->m_group || !*zh->m_group)
576         *gprefix = '\0';
577     else
578         sprintf(gprefix, "%s.", zh->m_group);
579     
580     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
581
582     /* determine file extension */
583     *ext = '\0';
584     for (i = strlen(fname); --i >= 0; )
585         if (fname[i] == '/')
586             break;
587         else if (fname[i] == '.')
588         {
589             strcpy(ext, fname+i+1);
590             break;
591         }
592     /* determine file type - depending on extension */
593     original_record_type = zh->m_record_type;
594     if (!zh->m_record_type)
595     {
596         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
597         zh->m_record_type = res_get(zh->res, ext_res);
598     }
599     if (!zh->m_record_type)
600     {
601         check_log_limit(zh);
602         if (zh->records_processed + zh->records_skipped
603             < zh->m_file_verbose_limit)
604             yaz_log(YLOG_LOG, "? %s", fname);
605         zh->records_skipped++;
606         return 0;
607     }
608     /* determine match criteria */
609     if (!zh->m_record_id)
610     {
611         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
612         zh->m_record_id = res_get(zh->res, ext_res);
613     }
614
615     if (!(recType =
616           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
617                           &recTypeClientData)))
618     {
619         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
620         return ZEBRA_FAIL;
621     }
622
623     switch(recType->version)
624     {
625     case 0:
626         break;
627     default:
628         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
629     }
630     if (sysno && (action == action_delete || action == action_a_delete))
631     {
632         streamp = 0;
633     }
634     else
635     {
636         char full_rep[1024];
637
638         if (zh->path_reg && !yaz_is_abspath(fname))
639         {
640             strcpy(full_rep, zh->path_reg);
641             strcat(full_rep, "/");
642             strcat(full_rep, fname);
643         }
644         else
645             strcpy(full_rep, fname);
646         
647         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
648         {
649             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
650             zh->m_record_type = original_record_type;
651             return ZEBRA_FAIL;
652         }
653         streamp = &stream;
654         zebra_create_stream_fd(streamp, fd, 0);
655     }
656     r = zebra_extract_records_stream(zh, streamp,
657                                      action,
658                                      zh->m_record_type,
659                                      sysno,
660                                      0, /*match_criteria */
661                                      fname,
662                                      recType, recTypeClientData);
663     if (streamp)
664         stream.destroy(streamp);
665     zh->m_record_type = original_record_type;
666     return r;
667 }
668
669 /*
670   If sysno is provided, then it's used to identify the reocord.
671   If not, and match_criteria is provided, then sysno is guessed
672   If not, and a record is provided, then sysno is got from there
673   
674  */
675
676 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
677                                       const char *buf, size_t buf_size,
678                                       enum zebra_recctrl_action_t action,
679                                       const char *recordType,
680                                       zint *sysno,
681                                       const char *match_criteria,
682                                       const char *fname)
683 {
684     struct ZebraRecStream stream;
685     ZEBRA_RES res;
686     void *clientData;
687     RecType recType = 0;
688
689     if (recordType && *recordType)
690     {
691         yaz_log(log_level_extract,
692                 "Record type explicitly specified: %s", recordType);
693         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
694                                   &clientData);
695     } 
696     else
697     {
698         if (!(zh->m_record_type))
699         {
700             yaz_log(YLOG_WARN, "No such record type defined");
701             return ZEBRA_FAIL;
702         }
703         yaz_log(log_level_extract, "Get record type from rgroup: %s",
704                 zh->m_record_type);
705         recType = recType_byName(zh->reg->recTypes, zh->res,
706                                   zh->m_record_type, &clientData);
707         recordType = zh->m_record_type;
708     }
709     
710     if (!recType)
711     {
712         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
713         return ZEBRA_FAIL;
714     }
715
716     zebra_create_stream_mem(&stream, buf, buf_size);
717
718     res = zebra_extract_records_stream(zh, &stream,
719                                        action,
720                                        recordType,
721                                        sysno,
722                                        match_criteria,
723                                        fname,
724                                        recType, clientData);
725     stream.destroy(&stream);
726     return res;
727 }
728
729 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
730                                              struct ZebraRecStream *stream,
731                                              enum zebra_recctrl_action_t action,
732                                              const char *recordType,
733                                              zint *sysno,
734                                              const char *match_criteria,
735                                              const char *fname,
736                                              RecType recType,
737                                              void *recTypeClientData,
738                                              int *more)
739     
740 {
741     zint sysno0 = 0;
742     RecordAttr *recordAttr;
743     struct recExtractCtrl extractCtrl;
744     int r;
745     const char *matchStr = 0;
746     Record rec;
747     off_t start_offset = 0, end_offset = 0;
748     const char *pr_fname = fname;  /* filename to print .. */
749     int show_progress = zh->records_processed + zh->records_skipped 
750         < zh->m_file_verbose_limit ? 1:0;
751
752     zebra_init_log_level();
753
754     if (!pr_fname)
755         pr_fname = "<no file>";  /* make it printable if file is omitted */
756
757     zebra_rec_keys_reset(zh->reg->keys);
758     zebra_rec_keys_reset(zh->reg->sortKeys);
759
760     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
761     {
762         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
763                                       zh->m_explain_database))
764             return ZEBRA_FAIL;
765     }
766
767     if (stream)
768     {
769         off_t null_offset = 0;
770         extractCtrl.stream = stream;
771
772         start_offset = stream->tellf(stream);
773
774         extractCtrl.first_record = start_offset ? 0 : 1;
775         
776         stream->endf(stream, &null_offset);;
777
778         extractCtrl.init = extract_init;
779         extractCtrl.tokenAdd = extract_token_add;
780         extractCtrl.schemaAdd = extract_schema_add;
781         extractCtrl.dh = zh->reg->dh;
782         extractCtrl.handle = zh;
783         extractCtrl.match_criteria[0] = '\0';
784         extractCtrl.staticrank = 0;
785         extractCtrl.action = action;
786
787         init_extractCtrl(zh, &extractCtrl);
788
789         extract_set_store_data_prepare(&extractCtrl);
790         
791         r = (*recType->extract)(recTypeClientData, &extractCtrl);
792
793         if (action == action_update)
794         {
795             action = extractCtrl.action;
796         }
797         
798         switch (r)
799         {
800         case RECCTRL_EXTRACT_EOF:
801             return ZEBRA_FAIL;
802         case RECCTRL_EXTRACT_ERROR_GENERIC:
803             /* error occured during extraction ... */
804             yaz_log(YLOG_WARN, "extract error: generic");
805             return ZEBRA_FAIL;
806         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
807             /* error occured during extraction ... */
808             yaz_log(YLOG_WARN, "extract error: no such filter");
809             return ZEBRA_FAIL;
810         case RECCTRL_EXTRACT_SKIP:
811             if (show_progress)
812                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
813                          recordType, pr_fname, (zint) start_offset);
814             *more = 1;
815             
816             end_offset = stream->endf(stream, 0);
817             if (end_offset)
818                 stream->seekf(stream, end_offset);
819
820             return ZEBRA_OK;
821         case RECCTRL_EXTRACT_OK:
822             break;
823         default:
824             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
825             return ZEBRA_FAIL;
826         }
827         end_offset = stream->endf(stream, 0);
828         if (end_offset)
829             stream->seekf(stream, end_offset);
830         else
831             end_offset = stream->tellf(stream);
832
833         if (extractCtrl.match_criteria[0])
834             match_criteria = extractCtrl.match_criteria;
835     }
836
837     *more = 1;
838
839     if (zh->m_flag_rw == 0)
840     {
841         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
842                 pr_fname, (zint) start_offset);
843         /* test mode .. Do not perform match */
844         return ZEBRA_OK;
845     }
846         
847     if (!sysno)
848     {
849         sysno = &sysno0;
850         
851         if (match_criteria && *match_criteria)
852             matchStr = match_criteria;
853         else
854         {
855             if (zh->m_record_id && *zh->m_record_id)
856             {
857                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
858                                                zh->m_record_id);
859                 if (!matchStr)
860                 {
861                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
862                              pr_fname, (zint) start_offset);
863                     return ZEBRA_FAIL;
864                 }
865                 if (0 && matchStr)
866                 {
867                     WRBUF w = wrbuf_alloc();
868                     size_t i;
869                     for (i = 0; i < strlen(matchStr); i++)
870                     {
871                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
872                     }
873                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
874                     wrbuf_destroy(w);
875                 }
876             }
877         }
878         if (matchStr) 
879         {
880             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
881             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
882                                           matchStr);
883
884             
885             if (log_level_extract)
886             {
887                 WRBUF w = wrbuf_hex_str(matchStr);
888                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
889                 wrbuf_destroy(w);
890             }
891             if (rinfo)
892             {
893                 assert(*rinfo == sizeof(*sysno));
894                 memcpy(sysno, rinfo+1, sizeof(*sysno));
895             }
896        }
897     }
898
899     if (! *sysno)
900     {
901         /* new record AKA does not exist already */
902         if (action == action_delete)
903         {
904             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
905                     pr_fname, (zint) start_offset);
906             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
907             return ZEBRA_FAIL;
908         }
909         else if (action == action_a_delete)
910         {
911             if (show_progress)
912                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
913                         pr_fname, (zint) start_offset);
914             return ZEBRA_OK;
915         }
916         else if (action == action_replace)
917         {
918             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
919                          pr_fname, (zint) start_offset);
920             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
921             return ZEBRA_FAIL;
922         }
923         if (show_progress)
924             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
925                      (zint) start_offset);
926         rec = rec_new(zh->reg->records);
927
928         *sysno = rec->sysno;
929
930
931         if (stream)
932         {
933             all_matches_add(&extractCtrl,
934                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
935                             *sysno);
936         }
937
938
939         recordAttr = rec_init_attr(zh->reg->zei, rec);
940         if (extractCtrl.staticrank < 0)
941         {
942             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
943             extractCtrl.staticrank = 0;
944         }
945
946         if (matchStr)
947         {
948             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
949             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
950                             sizeof(*sysno), sysno);
951         }
952
953         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
954 #if FLUSH2
955         extract_flush_record_keys2(zh, *sysno,
956                                    zh->reg->keys, extractCtrl.staticrank,
957                                    0, recordAttr->staticrank);
958 #else
959         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
960                                   extractCtrl.staticrank);
961 #endif
962         recordAttr->staticrank = extractCtrl.staticrank;
963         zh->records_inserted++;
964     } 
965     else
966     {
967         /* record already exists */
968         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
969         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
970         if (action == action_insert)
971         {
972             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
973                          recordType, pr_fname, (zint) start_offset);
974             logRecord(zh);
975             return ZEBRA_FAIL;
976         }
977
978         rec = rec_get(zh->reg->records, *sysno);
979         assert(rec);
980
981         if (stream)
982         {
983             all_matches_add(&extractCtrl,
984                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
985                             *sysno);
986         }
987         
988         recordAttr = rec_init_attr(zh->reg->zei, rec);
989
990         /* decrease total size */
991         zebraExplain_recordBytesIncrement(zh->reg->zei,
992                                            - recordAttr->recordSize);
993
994         zebra_rec_keys_set_buf(delkeys,
995                                rec->info[recInfo_delKeys],
996                                rec->size[recInfo_delKeys],
997                                0);
998         zebra_rec_keys_set_buf(sortKeys,
999                                rec->info[recInfo_sortKeys],
1000                                rec->size[recInfo_sortKeys],
1001                                0);
1002
1003         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1004 #if !FLUSH2
1005         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1006                                   recordAttr->staticrank);
1007 #endif
1008         if (action == action_delete || action == action_a_delete)
1009         {
1010             /* record going to be deleted */
1011 #if FLUSH2
1012             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1013                                        delkeys, recordAttr->staticrank);
1014 #endif       
1015             if (zebra_rec_keys_empty(delkeys))
1016             {
1017                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1018                         pr_fname, (zint) start_offset);
1019                 yaz_log(YLOG_WARN, "cannot delete file above, "
1020                         "storeKeys false (3)");
1021             }
1022             else
1023             {
1024                 if (show_progress)
1025                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1026                             pr_fname, (zint) start_offset);
1027                 zh->records_deleted++;
1028                 if (matchStr)
1029                 {
1030                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1031                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1032                 }
1033                 rec_del(zh->reg->records, &rec);
1034             }
1035             zebra_rec_keys_close(delkeys);
1036             zebra_rec_keys_close(sortKeys);
1037             rec_free(&rec);
1038             logRecord(zh);
1039             return ZEBRA_OK;
1040         }
1041         else
1042         {   /* update or special_update */
1043             if (show_progress)
1044                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1045                         pr_fname, (zint) start_offset);
1046             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1047
1048 #if FLUSH2
1049             extract_flush_record_keys2(zh, *sysno,
1050                                        zh->reg->keys, extractCtrl.staticrank,
1051                                        delkeys, recordAttr->staticrank);
1052 #else
1053             extract_flush_record_keys(zh, *sysno, 1, 
1054                                       zh->reg->keys, extractCtrl.staticrank);
1055 #endif
1056             recordAttr->staticrank = extractCtrl.staticrank;
1057             zh->records_updated++;
1058         }
1059         zebra_rec_keys_close(delkeys);
1060         zebra_rec_keys_close(sortKeys);
1061     }
1062     /* update file type */
1063     xfree(rec->info[recInfo_fileType]);
1064     rec->info[recInfo_fileType] =
1065         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1066
1067     /* update filename */
1068     xfree(rec->info[recInfo_filename]);
1069     rec->info[recInfo_filename] =
1070         rec_strdup(fname, &rec->size[recInfo_filename]);
1071
1072     /* update delete keys */
1073     xfree(rec->info[recInfo_delKeys]);
1074     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1075     {
1076         zebra_rec_keys_get_buf(zh->reg->keys,
1077                                &rec->info[recInfo_delKeys],
1078                                &rec->size[recInfo_delKeys]);
1079     }
1080     else
1081     {
1082         rec->info[recInfo_delKeys] = NULL;
1083         rec->size[recInfo_delKeys] = 0;
1084     }
1085     /* update sort keys */
1086     xfree(rec->info[recInfo_sortKeys]);
1087
1088     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1089                            &rec->info[recInfo_sortKeys],
1090                            &rec->size[recInfo_sortKeys]);
1091
1092     if (stream)
1093     {
1094         recordAttr->recordSize = end_offset - start_offset;
1095         zebraExplain_recordBytesIncrement(zh->reg->zei,
1096                                           recordAttr->recordSize);
1097     }
1098
1099     /* set run-number for this record */
1100     recordAttr->runNumber =
1101         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1102
1103     /* update store data */
1104     xfree(rec->info[recInfo_storeData]);
1105
1106     /* update store data */
1107     if (zh->store_data_buf)
1108     {
1109         rec->size[recInfo_storeData] = zh->store_data_size;
1110         rec->info[recInfo_storeData] = zh->store_data_buf;
1111         zh->store_data_buf = 0;
1112         recordAttr->recordSize = zh->store_data_size;
1113     }
1114     else if (zh->m_store_data)
1115     {
1116         off_t cur_offset = stream->tellf(stream);
1117
1118         rec->size[recInfo_storeData] = recordAttr->recordSize;
1119         rec->info[recInfo_storeData] = (char *)
1120             xmalloc(recordAttr->recordSize);
1121         stream->seekf(stream, start_offset);
1122         stream->readf(stream, rec->info[recInfo_storeData],
1123                       recordAttr->recordSize);
1124         stream->seekf(stream, cur_offset);
1125     }
1126     else
1127     {
1128         rec->info[recInfo_storeData] = NULL;
1129         rec->size[recInfo_storeData] = 0;
1130     }
1131     /* update database name */
1132     xfree(rec->info[recInfo_databaseName]);
1133     rec->info[recInfo_databaseName] =
1134         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1135
1136     /* update offset */
1137     recordAttr->recordOffset = start_offset;
1138     
1139     /* commit this record */
1140     rec_put(zh->reg->records, &rec);
1141     logRecord(zh);
1142     return ZEBRA_OK;
1143 }
1144
1145 /** \brief extracts records from stream
1146     \param zh Zebra Handle
1147     \param stream stream that we read from
1148     \param action (action_insert, action_replace, action_delete, ..)
1149     \param recordType Record filter type "grs.xml", etc.
1150     \param sysno pointer to sysno if already known; NULL otherwise
1151     \param match_criteria (NULL if not already given)
1152     \param fname filename that we read from (for logging purposes only)
1153     \param recType record type
1154     \param recTypeClientData client data for record type
1155     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1156 */
1157 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1158                                        struct ZebraRecStream *stream,
1159                                        enum zebra_recctrl_action_t action,
1160                                        const char *recordType,
1161                                        zint *sysno,
1162                                        const char *match_criteria,
1163                                        const char *fname,
1164                                        RecType recType,
1165                                        void *recTypeClientData)
1166 {
1167     ZEBRA_RES res = ZEBRA_OK;
1168     while (1)
1169     {
1170         int more = 0;
1171         res = zebra_extract_record_stream(zh, stream,
1172                                           action,
1173                                           recordType,
1174                                           sysno,
1175                                           match_criteria,
1176                                           fname,
1177                                           recType, recTypeClientData, &more);
1178         if (!more)
1179         {
1180             res = ZEBRA_OK;
1181             break;
1182         }
1183         if (res != ZEBRA_OK)
1184             break;
1185         if (sysno)
1186             break;
1187     }
1188     return res;
1189 }
1190
1191 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1192 {
1193     ZebraHandle zh = (ZebraHandle) handle;
1194     struct recExtractCtrl extractCtrl;
1195
1196     if (zebraExplain_curDatabase(zh->reg->zei,
1197                                   rec->info[recInfo_databaseName]))
1198     {
1199         abort();
1200         if (zebraExplain_newDatabase(zh->reg->zei,
1201                                       rec->info[recInfo_databaseName], 0))
1202             abort();
1203     }
1204
1205     zebra_rec_keys_reset(zh->reg->keys);
1206     zebra_rec_keys_reset(zh->reg->sortKeys);
1207
1208     extractCtrl.init = extract_init;
1209     extractCtrl.tokenAdd = extract_token_add;
1210     extractCtrl.schemaAdd = extract_schema_add;
1211     extractCtrl.dh = zh->reg->dh;
1212
1213     init_extractCtrl(zh, &extractCtrl);
1214
1215     extractCtrl.flagShowRecords = 0;
1216     extractCtrl.match_criteria[0] = '\0';
1217     extractCtrl.staticrank = 0;
1218     extractCtrl.action = action_update;
1219
1220     extractCtrl.handle = handle;
1221     extractCtrl.first_record = 1;
1222     
1223     extract_set_store_data_prepare(&extractCtrl);
1224
1225     if (n)
1226         grs_extract_tree(&extractCtrl, n);
1227
1228     if (rec->size[recInfo_delKeys])
1229     {
1230         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1231         
1232         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1233
1234         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1235                                rec->size[recInfo_delKeys],
1236                                0);
1237 #if FLUSH2
1238         extract_flush_record_keys2(zh, rec->sysno, 
1239                                    zh->reg->keys, 0, delkeys, 0);
1240 #else
1241         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1242         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1243 #endif
1244         zebra_rec_keys_close(delkeys);
1245
1246         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1247                                rec->size[recInfo_sortKeys],
1248                                0);
1249
1250         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1251         zebra_rec_keys_close(sortkeys);
1252     }
1253     else
1254     {
1255 #if FLUSH2
1256         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1257 #else
1258         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1259 #endif
1260     }
1261     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1262     
1263     xfree(rec->info[recInfo_delKeys]);
1264     zebra_rec_keys_get_buf(zh->reg->keys,
1265                            &rec->info[recInfo_delKeys], 
1266                            &rec->size[recInfo_delKeys]);
1267
1268     xfree(rec->info[recInfo_sortKeys]);
1269     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1270                            &rec->info[recInfo_sortKeys],
1271                            &rec->size[recInfo_sortKeys]);
1272     return ZEBRA_OK;
1273 }
1274
1275 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1276                            const char *str, size_t slen, NMEM nmem, int level)
1277 {
1278     char keystr[200]; /* room for zints to print */
1279     char *dst_term = 0;
1280     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1281     const char *index_type;
1282     int i;
1283     const char *string_index;
1284     
1285     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1286                             0/* db */, &string_index);
1287     assert(index_type);
1288     zebra_term_untrans_iconv(zh, nmem, index_type,
1289                              &dst_term, str);
1290     *keystr = '\0';
1291     for (i = 0; i < key->len; i++)
1292     {
1293         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1294     }
1295     
1296     if (*str < CHR_BASE_CHAR)
1297     {
1298         int i;
1299         char dst_buf[200]; /* room for special chars */
1300         
1301         strcpy(dst_buf , "?");
1302         
1303         if (!strcmp(str, ""))
1304             strcpy(dst_buf, "alwaysmatches");
1305         if (!strcmp(str, FIRST_IN_FIELD_STR))
1306             strcpy(dst_buf, "firstinfield");
1307         else if (!strcmp(str, CHR_UNKNOWN))
1308             strcpy(dst_buf, "unknown");
1309         else if (!strcmp(str, CHR_SPACE))
1310             strcpy(dst_buf, "space");
1311         
1312         for (i = 0; i<slen; i++)
1313         {
1314             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1315         }
1316         yaz_log(level, "%s%s %s %s", keystr, index_type,
1317                 string_index, dst_buf);
1318         
1319     }
1320     else
1321         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1322                 string_index, dst_term);
1323 }
1324
1325 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1326                           zebra_rec_keys_t reckeys,
1327                           int level)
1328 {
1329     if (zebra_rec_keys_rewind(reckeys))
1330     {
1331         size_t slen;
1332         const char *str;
1333         struct it_key key;
1334         NMEM nmem = nmem_create();
1335
1336         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1337         {
1338             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1339             nmem_reset(nmem);
1340         }
1341         nmem_destroy(nmem);
1342     }
1343 }
1344
1345 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1346                              zebra_rec_keys_t reckeys)
1347 {
1348     ZebraExplainInfo zei = zh->reg->zei;
1349     struct ord_stat {
1350         int no;
1351         int ord;
1352         struct ord_stat *next;
1353     };
1354
1355     if (zebra_rec_keys_rewind(reckeys))
1356     {
1357         struct ord_stat *ord_list = 0;
1358         struct ord_stat *p;
1359         size_t slen;
1360         const char *str;
1361         struct it_key key_in;
1362         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1363         {
1364             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1365
1366             for (p = ord_list; p ; p = p->next)
1367                 if (p->ord == ord)
1368                 {
1369                     p->no++;
1370                     break;
1371                 }
1372             if (!p)
1373             {
1374                 p = xmalloc(sizeof(*p));
1375                 p->no = 1;
1376                 p->ord = ord;
1377                 p->next = ord_list;
1378                 ord_list = p;
1379             }
1380         }
1381
1382         p = ord_list;
1383         while (p)
1384         {
1385             struct ord_stat *p1 = p;
1386
1387             if (is_insert)
1388                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1389             else
1390                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1391             p = p->next;
1392             xfree(p1);
1393         }
1394     }
1395 }
1396
1397 #if FLUSH2
1398 static void extract_flush_record_keys2(
1399     ZebraHandle zh, zint sysno,
1400     zebra_rec_keys_t ins_keys, zint ins_rank,
1401     zebra_rec_keys_t del_keys, zint del_rank)
1402 {
1403     ZebraExplainInfo zei = zh->reg->zei;
1404     int normal = 0;
1405     int optimized = 0;
1406
1407     if (!zh->reg->key_block)
1408     {
1409         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1410         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1411         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1412         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1413     }
1414
1415     if (ins_keys)
1416     {
1417         extract_rec_keys_adjust(zh, 1, ins_keys);
1418         if (!del_keys)
1419             zebraExplain_recordCountIncrement(zei, 1);
1420         zebra_rec_keys_rewind(ins_keys);
1421     }
1422     if (del_keys)
1423     {
1424         extract_rec_keys_adjust(zh, 0, del_keys);
1425         if (!ins_keys)
1426             zebraExplain_recordCountIncrement(zei, -1);
1427         zebra_rec_keys_rewind(del_keys);
1428     }
1429
1430     while (1)
1431     {
1432         size_t del_slen;
1433         const char *del_str;
1434         struct it_key del_key_in;
1435         int del = 0;
1436
1437         size_t ins_slen;
1438         const char *ins_str;
1439         struct it_key ins_key_in;
1440         int ins = 0;
1441
1442         if (del_keys)
1443             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1444                                       &del_key_in);
1445         if (ins_keys)
1446             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1447                                       &ins_key_in);
1448
1449         if (del && ins && ins_rank == del_rank
1450             && !key_compare(&del_key_in, &ins_key_in) 
1451             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1452         {
1453             optimized++;
1454             continue;
1455         }
1456         if (!del && !ins)
1457             break;
1458         
1459         normal++;
1460         if (del)
1461             key_block_write(zh->reg->key_block, sysno, 
1462                             &del_key_in, 0, del_str, del_slen,
1463                             del_rank, zh->m_staticrank);
1464         if (ins)
1465             key_block_write(zh->reg->key_block, sysno, 
1466                             &ins_key_in, 1, ins_str, ins_slen,
1467                             ins_rank, zh->m_staticrank);
1468     }
1469     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1470 }
1471 #else
1472 static void extract_flush_record_keys(
1473     ZebraHandle zh, zint sysno, int cmd,
1474     zebra_rec_keys_t reckeys,
1475     zint staticrank)
1476 {
1477     ZebraExplainInfo zei = zh->reg->zei;
1478
1479     extract_rec_keys_adjust(zh, cmd, reckeys);
1480
1481     if (log_level_details)
1482     {
1483         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1484                 sysno, cmd ? "insert" : "delete");
1485         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1486     }
1487
1488     if (!zh->reg->key_block)
1489     {
1490         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1491         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1492         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1493         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1494     }
1495     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1496
1497 #if 0
1498     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1499     print_rec_keys(zh, reckeys);
1500 #endif
1501     if (zebra_rec_keys_rewind(reckeys))
1502     {
1503         size_t slen;
1504         const char *str;
1505         struct it_key key_in;
1506         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1507         {
1508             key_block_write(zh->reg->key_block, sysno, 
1509                             &key_in, cmd, str, slen,
1510                             staticrank, zh->m_staticrank);
1511         }
1512     }
1513 }
1514 #endif
1515
1516 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1517                                      zebra_rec_keys_t reckeys,
1518                                      zebra_snippets *snippets)
1519 {
1520     NMEM nmem = nmem_create();
1521     if (zebra_rec_keys_rewind(reckeys)) 
1522     {
1523         const char *str;
1524         size_t slen;
1525         struct it_key key;
1526         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1527         {
1528             char *dst_term = 0;
1529             int ord;
1530             zint seqno;
1531             const char *index_type;
1532
1533             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1534             seqno = key.mem[key.len-1];
1535             ord = CAST_ZINT_TO_INT(key.mem[0]);
1536             
1537             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1538                                     0/* db */, 0 /* string_index */);
1539             assert(index_type);
1540             zebra_term_untrans_iconv(zh, nmem, index_type,
1541                                      &dst_term, str);
1542             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1543             nmem_reset(nmem);
1544         }
1545     }
1546     nmem_destroy(nmem);
1547     return ZEBRA_OK;
1548 }
1549
1550 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1551 {
1552     yaz_log(YLOG_LOG, "print_rec_keys");
1553     if (zebra_rec_keys_rewind(reckeys))
1554     {
1555         const char *str;
1556         size_t slen;
1557         struct it_key key;
1558         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1559         {
1560             char dst_buf[IT_MAX_WORD];
1561             zint seqno;
1562             const char *index_type;
1563             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1564             const char *db = 0;
1565             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1566
1567             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1568             
1569             seqno = key.mem[key.len-1];
1570             
1571             zebra_term_untrans(zh, index_type, dst_buf, str);
1572             
1573             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1574                     " term=%s", ord, seqno, dst_buf); 
1575         }
1576     }
1577 }
1578
1579 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1580                                      const char *str, int length)
1581 {
1582     struct it_key key;
1583     ZebraHandle zh = p->extractCtrl->handle;
1584     ZebraExplainInfo zei = zh->reg->zei;
1585     int ch, i;
1586
1587     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1588     if (ch < 0)
1589         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1590
1591     i = 0;
1592     key.mem[i++] = ch;
1593     key.mem[i++] = p->record_id;
1594     key.mem[i++] = p->section_id;
1595
1596     if (zh->m_segment_indexing)
1597         key.mem[i++] = p->segment;
1598     key.mem[i++] = p->seqno;
1599     key.len = i;
1600
1601     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1602 }
1603
1604 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1605 {
1606     struct it_key key;
1607     ZebraHandle zh = p->extractCtrl->handle;
1608     ZebraExplainInfo zei = zh->reg->zei;
1609     int ch;
1610     zinfo_index_category_t cat = zinfo_index_category_sort;
1611
1612     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1613     if (ch < 0)
1614         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1615     key.len = 3;
1616     key.mem[0] = ch;
1617     key.mem[1] = p->record_id;
1618     key.mem[2] = p->section_id;
1619
1620     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1621 }
1622
1623 static void extract_add_staticrank_string(RecWord *p,
1624                                           const char *str, int length)
1625 {
1626     char valz[40];
1627     struct recExtractCtrl *ctrl = p->extractCtrl;
1628
1629     if (length > sizeof(valz)-1)
1630         length = sizeof(valz)-1;
1631
1632     memcpy(valz, str, length);
1633     valz[length] = '\0';
1634     ctrl->staticrank = atozint(valz);
1635 }
1636
1637 static void extract_add_string(RecWord *p, zebra_map_t zm,
1638                                const char *string, int length)
1639 {
1640     assert(length > 0);
1641
1642     if (!p->index_name)
1643         return;
1644     if (log_level_details)
1645     {
1646
1647         WRBUF w = wrbuf_alloc();
1648         
1649         wrbuf_write_escaped(w, string, length);
1650         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1651         wrbuf_destroy(w);
1652     }
1653     if (zebra_maps_is_index(zm))
1654     {
1655         extract_add_index_string(p, zinfo_index_category_index,
1656                                  string, length);
1657         if (zebra_maps_is_alwaysmatches(zm))
1658         {
1659             RecWord word;
1660             memcpy(&word, p, sizeof(word));
1661
1662             word.seqno = 1;
1663             extract_add_index_string(
1664                 &word, zinfo_index_category_alwaysmatches, "", 0);
1665         }
1666     }
1667     else if (zebra_maps_is_sort(zm))
1668     {
1669         extract_add_sort_string(p, string, length);
1670     }
1671     else if (zebra_maps_is_staticrank(zm))
1672     {
1673         extract_add_staticrank_string(p, string, length);
1674     }
1675 }
1676
1677 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1678 {
1679     const char *b = p->term_buf;
1680     int remain = p->term_len;
1681     int first = 1;
1682     const char **map = 0;
1683     
1684     if (remain > 0)
1685         map = zebra_maps_input(zm, &b, remain, 0);
1686
1687     while (map)
1688     {
1689         char buf[IT_MAX_WORD+1];
1690         int i, remain;
1691
1692         /* Skip spaces */
1693         while (map && *map && **map == *CHR_SPACE)
1694         {
1695             remain = p->term_len - (b - p->term_buf);
1696             if (remain > 0)
1697                 map = zebra_maps_input(zm, &b, remain, 0);
1698             else
1699                 map = 0;
1700         }
1701         if (!map)
1702             break;
1703         i = 0;
1704         while (map && *map && **map != *CHR_SPACE)
1705         {
1706             const char *cp = *map;
1707
1708             while (i < IT_MAX_WORD && *cp)
1709                 buf[i++] = *(cp++);
1710             remain = p->term_len - (b - p->term_buf);
1711             if (remain > 0)
1712                 map = zebra_maps_input(zm, &b, remain, 0);
1713             else
1714                 map = 0;
1715         }
1716         if (!i)
1717             return;
1718
1719         if (first)
1720         {   
1721             first = 0;
1722             if (zebra_maps_is_first_in_field(zm))
1723             {
1724                 /* first in field marker */
1725                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1726                 p->seqno++;
1727             }
1728         }
1729         extract_add_string(p, zm, buf, i);
1730         p->seqno++;
1731     }
1732 }
1733
1734 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1735 {
1736     const char *b = p->term_buf;
1737     char buf[IT_MAX_WORD+1];
1738     const char **map = 0;
1739     int i = 0, remain = p->term_len;
1740
1741     if (remain > 0)
1742         map = zebra_maps_input(zm, &b, remain, 1);
1743
1744     while (remain > 0 && i < IT_MAX_WORD)
1745     {
1746         while (map && *map && **map == *CHR_SPACE)
1747         {
1748             remain = p->term_len - (b - p->term_buf);
1749
1750             if (remain > 0)
1751             {
1752                 int first = i ? 0 : 1;  /* first position */
1753                 map = zebra_maps_input(zm, &b, remain, first);
1754             }
1755             else
1756                 map = 0;
1757         }
1758         if (!map)
1759             break;
1760
1761         if (i && i < IT_MAX_WORD)
1762             buf[i++] = *CHR_SPACE;
1763         while (map && *map && **map != *CHR_SPACE)
1764         {
1765             const char *cp = *map;
1766
1767             if (**map == *CHR_CUT)
1768             {
1769                 i = 0;
1770             }
1771             else
1772             {
1773                 if (i >= IT_MAX_WORD)
1774                     break;
1775                 while (i < IT_MAX_WORD && *cp)
1776                     buf[i++] = *(cp++);
1777             }
1778             remain = p->term_len  - (b - p->term_buf);
1779             if (remain > 0)
1780             {
1781                 map = zebra_maps_input(zm, &b, remain, 0);
1782             }
1783             else
1784                 map = 0;
1785         }
1786     }
1787     if (!i)
1788         return;
1789     extract_add_string(p, zm, buf, i);
1790     p->seqno++;
1791 }
1792
1793 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1794 {
1795     const char *res_buf = 0;
1796     size_t res_len = 0;
1797
1798     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1799     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1800     {
1801         if (res_len > IT_MAX_WORD)
1802         {
1803             yaz_log(YLOG_LOG, "Truncating long term %ld", (long) res_len);
1804             res_len = IT_MAX_WORD;
1805         }
1806         extract_add_string(p, zm, res_buf, res_len);
1807         p->seqno++;
1808     }
1809 }
1810
1811
1812 /** \brief top-level indexing handler for recctrl system
1813     \param p token data to be indexed
1814
1815     Call sequence:
1816     extract_token_add
1817     extract_add_{in}_complete / extract_add_icu
1818     extract_add_string
1819     
1820     extract_add_index_string
1821     or
1822     extract_add_sort_string
1823     or
1824     extract_add_staticrank_string
1825     
1826 */
1827 static void extract_token_add(RecWord *p)
1828 {
1829     ZebraHandle zh = p->extractCtrl->handle;
1830     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1831
1832     if (log_level_details)
1833     {
1834         yaz_log(log_level_details, "extract_token_add "
1835                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1836                 p->index_type, p->index_name, 
1837                 p->seqno, p->term_len, p->term_buf);
1838     }
1839     if (zebra_maps_is_icu(zm))
1840     {
1841         extract_add_icu(p, zm);
1842     }
1843     else
1844     {
1845         if (zebra_maps_is_complete(zm))
1846             extract_add_complete_field(p, zm);
1847         else
1848             extract_add_incomplete_field(p, zm);
1849     }
1850 }
1851
1852 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1853                                       void *buf, size_t sz)
1854 {
1855     ZebraHandle zh = (ZebraHandle) p->handle;
1856
1857     xfree(zh->store_data_buf);
1858     zh->store_data_buf = 0;
1859     zh->store_data_size = 0;
1860     if (buf && sz)
1861     {
1862         zh->store_data_buf = xmalloc(sz);
1863         zh->store_data_size = sz;
1864         memcpy(zh->store_data_buf, buf, sz);
1865     }
1866 }
1867
1868 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1869 {
1870     ZebraHandle zh = (ZebraHandle) p->handle;
1871     xfree(zh->store_data_buf);
1872     zh->store_data_buf = 0;
1873     zh->store_data_size = 0;
1874     p->setStoreData = extract_set_store_data_cb;
1875 }
1876
1877 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1878 {
1879     ZebraHandle zh = (ZebraHandle) p->handle;
1880     zebraExplain_addSchema(zh->reg->zei, oid);
1881 }
1882
1883 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1884                              int cmd, zebra_rec_keys_t reckeys)
1885 {
1886 #if 0
1887     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1888             cmd, sysno);
1889     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1890 #endif
1891
1892     if (zebra_rec_keys_rewind(reckeys))
1893     {
1894         zebra_sort_index_t si = zh->reg->sort_index;
1895         size_t slen;
1896         const char *str;
1897         struct it_key key_in;
1898
1899         NMEM nmem = nmem_create();
1900         struct sort_add_ent {
1901             int ord;
1902             int cmd;
1903             struct sort_add_ent *next;
1904             WRBUF wrbuf;
1905             zint sysno;
1906             zint section_id;
1907         };
1908         struct sort_add_ent *sort_ent_list = 0;
1909
1910         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1911         {
1912             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1913             zint filter_sysno = key_in.mem[1];
1914             zint section_id = key_in.mem[2];
1915
1916             struct sort_add_ent **e = &sort_ent_list;
1917             for (; *e; e = &(*e)->next)
1918                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1919                     break;
1920             if (!*e)
1921             {
1922                 *e = nmem_malloc(nmem, sizeof(**e));
1923                 (*e)->next = 0;
1924                 (*e)->wrbuf = wrbuf_alloc();
1925                 (*e)->ord = ord;
1926                 (*e)->cmd = cmd;
1927                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1928                 (*e)->section_id = section_id;
1929             }
1930             
1931             wrbuf_write((*e)->wrbuf, str, slen);
1932             wrbuf_putc((*e)->wrbuf, '\0');
1933         }
1934         if (sort_ent_list)
1935         {
1936             zint last_sysno = 0;
1937             struct sort_add_ent *e = sort_ent_list;
1938             for (; e; e = e->next)
1939             {
1940                 if (last_sysno != e->sysno)
1941                 {
1942                     zebra_sort_sysno(si, e->sysno);
1943                     last_sysno = e->sysno;
1944                 }
1945                 zebra_sort_type(si, e->ord);
1946                 if (e->cmd == 1)
1947                     zebra_sort_add(si, e->section_id, e->wrbuf);
1948                 else
1949                     zebra_sort_delete(si, e->section_id);
1950                 wrbuf_destroy(e->wrbuf);
1951             }
1952         }
1953         nmem_destroy(nmem);
1954     }
1955 }
1956
1957 /*
1958  * Local variables:
1959  * c-basic-offset: 4
1960  * c-file-style: "Stroustrup"
1961  * indent-tabs-mode: nil
1962  * End:
1963  * vim: shiftwidth=4 tabstop=8 expandtab
1964  */
1965