First indexing using index_types system (ICU).
[idzebra-moved-to-github.git] / index / extract.c
1 /* $Id: extract.c,v 1.264 2007-10-29 13:43:57 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 /** \file
24     \brief indexes records and extract tokens for indexing and sorting
25 */
26
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
45 static int log_level_extract = 0;
46 static int log_level_details = 0;
47 static int log_level_initialized = 0;
48
49 /* 1 if we eliminate identical delete/insert keys */
50 /* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52
53 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
54                                 zebra_rec_keys_t ins_keys,
55                                 zint ins_rank,
56                                 zebra_rec_keys_t del_keys,
57                                 zint del_rank);
58
59 static void zebra_init_log_level(void)
60 {
61     if (!log_level_initialized)
62     {
63         log_level_initialized = 1;
64
65         log_level_extract = yaz_log_module_level("extract");
66         log_level_details = yaz_log_module_level("indexdetails");
67     }
68 }
69
70 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
71                                       int cmd, zebra_rec_keys_t reckeys,
72                                       zint staticrank);
73 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
74                                     int cmd, zebra_rec_keys_t skp);
75 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
76 static void extract_token_add(RecWord *p);
77 static void extract_token_add2(RecWord *p);
78
79 static void check_log_limit(ZebraHandle zh)
80 {
81     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
82     {
83         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
84                 zh->m_file_verbose_limit);
85     }
86 }
87
88 static void logRecord(ZebraHandle zh)
89 {
90     check_log_limit(zh);
91     ++zh->records_processed;
92     if (!(zh->records_processed % 1000))
93     {
94         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
95                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
96                 zh->records_processed, zh->records_inserted, 
97                 zh->records_updated, zh->records_deleted);
98     }
99 }
100
101 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
102 {
103     int i;
104     for (i = 0; i<256; i++)
105     {
106         if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))
107             ctrl->seqno[i] = 1;
108         else
109             ctrl->seqno[i] = 0;
110     }
111     ctrl->flagShowRecords = !zh->m_flag_rw;
112 }
113
114
115 static void extract_add_index_string(RecWord *p, 
116                                       zinfo_index_category_t cat,
117                                       const char *str, int length);
118
119 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
120
121 static void extract_init(struct recExtractCtrl *p, RecWord *w)
122 {
123     w->seqno = 1;
124     w->index_name = "any";
125     w->index_type = 'w';
126     w->extractCtrl = p;
127     w->record_id = 0;
128     w->section_id = 0;
129     w->segment = 0;
130 }
131
132 struct snip_rec_info {
133     ZebraHandle zh;
134     zebra_snippets *snippets;
135 };
136
137
138 static void snippet_add_complete_field(RecWord *p, int ord)
139 {
140     struct snip_rec_info *h = p->extractCtrl->handle;
141     ZebraHandle zh = h->zh;
142
143     const char *b = p->term_buf;
144     char buf[IT_MAX_WORD+1];
145     const char **map = 0;
146     int i = 0, remain = p->term_len;
147     const char *start = b;
148     const char *last = 0;
149
150     if (remain > 0)
151         map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 1);
152
153     while (remain > 0 && i < IT_MAX_WORD)
154     {
155         while (map && *map && **map == *CHR_SPACE)
156         {
157             remain = p->term_len - (b - p->term_buf);
158
159             if (i == 0)
160                 start = b;  /* set to first non-ws area */
161             if (remain > 0)
162             {
163                 int first = i ? 0 : 1;  /* first position */
164
165                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, 
166                                        &b, remain, first);
167             }
168             else
169                 map = 0;
170         }
171         if (!map)
172             break;
173
174         if (i && i < IT_MAX_WORD)
175             buf[i++] = *CHR_SPACE;
176         while (map && *map && **map != *CHR_SPACE)
177         {
178             const char *cp = *map;
179
180             if (**map == *CHR_CUT)
181             {
182                 i = 0;
183             }
184             else
185             {
186                 if (i >= IT_MAX_WORD)
187                     break;
188                 while (i < IT_MAX_WORD && *cp)
189                     buf[i++] = *(cp++);
190             }
191             last = b;
192             remain = p->term_len  - (b - p->term_buf);
193             if (remain > 0)
194             {
195                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b,
196                                         remain, 0);
197             }
198             else
199                 map = 0;
200         }
201     }
202     if (!i)
203         return;
204     if (last && start != last)
205         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
206                                start, last - start);
207 }
208
209 static void snippet_add_incomplete_field(RecWord *p, int ord)
210 {
211     struct snip_rec_info *h = p->extractCtrl->handle;
212     ZebraHandle zh = h->zh;
213     const char *b = p->term_buf;
214     int remain = p->term_len;
215     int first = 1;
216     const char **map = 0;
217     const char *start = b;
218     const char *last = b;
219
220     if (remain > 0)
221         map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
222
223     while (map)
224     {
225         char buf[IT_MAX_WORD+1];
226         int i, remain;
227
228         /* Skip spaces */
229         while (map && *map && **map == *CHR_SPACE)
230         {
231             remain = p->term_len - (b - p->term_buf);
232             last = b;
233             if (remain > 0)
234                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b,
235                                        remain, 0);
236             else
237                 map = 0;
238         }
239         if (!map)
240             break;
241         if (start != last)
242         {
243             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
244                                    start, last - start);
245
246         }
247         start = last;
248
249         i = 0;
250         while (map && *map && **map != *CHR_SPACE)
251         {
252             const char *cp = *map;
253
254             while (i < IT_MAX_WORD && *cp)
255                 buf[i++] = *(cp++);
256             remain = p->term_len - (b - p->term_buf);
257             last = b;
258             if (remain > 0)
259                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
260             else
261                 map = 0;
262         }
263         if (!i)
264             return;
265
266         if (first)
267         {   
268             first = 0;
269             if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type))
270             {
271                 /* first in field marker */
272                 p->seqno++;
273             }
274         }
275         if (start != last)
276             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
277                                    start, last - start);
278         start = last;
279         p->seqno++;
280     }
281
282 }
283
284 static void snippet_token_add(RecWord *p)
285 {
286     struct snip_rec_info *h = p->extractCtrl->handle;
287     ZebraHandle zh = h->zh;
288
289     if (zebra_maps_is_index(zh->reg->zebra_maps, p->index_type))
290     {
291         ZebraExplainInfo zei = zh->reg->zei;
292         int ch = zebraExplain_lookup_attr_str(
293             zei, zinfo_index_category_index, p->index_type, p->index_name);
294
295         if(zebra_maps_is_complete (h->zh->reg->zebra_maps, p->index_type))
296             snippet_add_complete_field(p, ch);
297         else
298             snippet_add_incomplete_field(p, ch);
299     }
300 }
301
302 static void snippet_schema_add(
303     struct recExtractCtrl *p, Odr_oid *oid)
304 {
305
306 }
307
308 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
309                      struct ZebraRecStream *stream,
310                      RecType rt, void *recTypeClientData)
311 {
312     struct recExtractCtrl extractCtrl;
313     struct snip_rec_info info;
314     int r;
315
316     extractCtrl.stream = stream;
317     extractCtrl.first_record = 1;
318     extractCtrl.init = extract_init;
319     extractCtrl.tokenAdd = snippet_token_add;
320     extractCtrl.schemaAdd = snippet_schema_add;
321     assert(zh->reg);
322     assert(zh->reg->dh);
323
324     extractCtrl.dh = zh->reg->dh;
325     
326     info.zh = zh;
327     info.snippets = sn;
328     extractCtrl.handle = &info;
329     extractCtrl.match_criteria[0] = '\0';
330     extractCtrl.staticrank = 0;
331     extractCtrl.action = action_insert;
332     
333     init_extractCtrl(zh, &extractCtrl);
334
335     extractCtrl.setStoreData = 0;
336
337     r = (*rt->extract)(recTypeClientData, &extractCtrl);
338
339 }
340
341 static void searchRecordKey(ZebraHandle zh,
342                             zebra_rec_keys_t reckeys,
343                             const char *index_name,
344                             const char **ws, int ws_length)
345 {
346     int i;
347     int ch = -1;
348     zinfo_index_category_t cat = zinfo_index_category_index;
349
350     for (i = 0; i<ws_length; i++)
351         ws[i] = NULL;
352
353     if (ch < 0)
354         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, '0', index_name);
355     if (ch < 0)
356         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 'p', index_name);
357     if (ch < 0)
358         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 'w', index_name);
359
360     if (ch < 0)
361         return ;
362
363     if (zebra_rec_keys_rewind(reckeys))
364     {
365         zint startSeq = -1;
366         const char *str;
367         size_t slen;
368         struct it_key key;
369         zint seqno;
370         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
371         {
372             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
373
374             seqno = key.mem[key.len-1];
375             
376             if (key.mem[0] == ch)
377             {
378                 zint woff;
379                 
380                 if (startSeq == -1)
381                     startSeq = seqno;
382                 woff = seqno - startSeq;
383                 if (woff >= 0 && woff < ws_length)
384                     ws[woff] = str;
385             }
386         }
387     }
388 }
389
390 #define FILE_MATCH_BLANK "\t "
391
392 static char *get_match_from_spec(ZebraHandle zh,
393                           zebra_rec_keys_t reckeys,
394                           const char *fname, const char *spec)
395 {
396     static char dstBuf[2048];      /* static here ??? */
397     char *dst = dstBuf;
398     const char *s = spec;
399
400     while (1)
401     {
402         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
403             ;
404         if (!*s)
405             break;
406         if (*s == '(')
407         {
408             const char *ws[32];
409             char attset_str[64], attname_str[64];
410             int i;
411             int first = 1;
412             
413             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
414                 ;
415             for (i = 0; *s && *s != ',' && *s != ')' && 
416                      !strchr(FILE_MATCH_BLANK, *s); s++)
417                 if (i+1 < sizeof(attset_str))
418                     attset_str[i++] = *s;
419             attset_str[i] = '\0';
420             
421             for (; strchr(FILE_MATCH_BLANK, *s); s++)
422                 ;
423             if (*s != ',')
424                 strcpy(attname_str, attset_str);
425             else
426             {
427                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
428                     ;
429                 for (i = 0; *s && *s != ')' && 
430                          !strchr(FILE_MATCH_BLANK, *s); s++)
431                     if (i+1 < sizeof(attname_str))
432                         attname_str[i++] = *s;
433                 attname_str[i] = '\0';
434             }
435
436             searchRecordKey(zh, reckeys, attname_str, ws, 32);
437
438             if (*s != ')')
439             {
440                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
441                       spec, zh->m_group ? zh->m_group : "none");
442                 return NULL;
443             }
444             s++;
445
446             for (i = 0; i<32; i++)
447                 if (ws[i])
448                 {
449                     if (first)
450                     {
451                         *dst++ = ' ';
452                         first = 0;
453                     }
454                     strcpy(dst, ws[i]);
455                     dst += strlen(ws[i]);
456                 }
457             if (first)
458             {
459                 yaz_log(YLOG_WARN, "Record didn't contain match"
460                       " fields in (%s,%s)", attset_str, attname_str);
461                 return NULL;
462             }
463         }
464         else if (*s == '$')
465         {
466             int spec_len;
467             char special[64];
468             const char *spec_src = NULL;
469             const char *s1 = ++s;
470             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
471                 s1++;
472
473             spec_len = s1 - s;
474             if (spec_len > sizeof(special)-1)
475                 spec_len = sizeof(special)-1;
476             memcpy(special, s, spec_len);
477             special[spec_len] = '\0';
478             s = s1;
479
480             if (!strcmp(special, "group"))
481                 spec_src = zh->m_group;
482             else if (!strcmp(special, "database"))
483                 spec_src = zh->basenames[0];
484             else if (!strcmp(special, "filename")) {
485                 spec_src = fname;
486             }
487             else if (!strcmp(special, "type"))
488                 spec_src = zh->m_record_type;
489             else 
490                 spec_src = NULL;
491             if (spec_src)
492             {
493                 strcpy(dst, spec_src);
494                 dst += strlen(spec_src);
495             }
496         }
497         else if (*s == '\"' || *s == '\'')
498         {
499             int stopMarker = *s++;
500             char tmpString[64];
501             int i = 0;
502
503             while (*s && *s != stopMarker)
504             {
505                 if (i+1 < sizeof(tmpString))
506                     tmpString[i++] = *s++;
507             }
508             if (*s)
509                 s++;
510             tmpString[i] = '\0';
511             strcpy(dst, tmpString);
512             dst += strlen(tmpString);
513         }
514         else
515         {
516             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
517                   spec, zh->m_group ? zh->m_group : "none");
518             return NULL;
519         }
520         *dst++ = 1;
521     }
522     if (dst == dstBuf)
523     {
524         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
525               fname, zh->m_group ? zh->m_group : "none");
526         return NULL;
527     }
528     *dst = '\0';
529     return dstBuf;
530 }
531
/* Bundle of a file name, a record offset and a record group pointer.
   NOTE(review): not referenced in the part of the file reviewed here;
   presumably used when logging per-record messages - confirm. */
struct recordLogInfo
{
    const char *fname;
    int recordOffset;
    struct recordGroup *rGroup;
};
537
538 static void all_matches_add(struct recExtractCtrl *ctrl)
539 {
540     RecWord word;
541     extract_init(ctrl, &word);
542     word.index_name = "_ALLRECORDS";
543     word.index_type = 'w';
544     word.seqno = 1;
545     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
546                               "", 0);
547 }
548
549 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
550                                        struct ZebraRecStream *stream,
551                                        enum zebra_recctrl_action_t action,
552                                        int test_mode, 
553                                        const char *recordType,
554                                        zint *sysno,
555                                        const char *match_criteria,
556                                        const char *fname,
557                                        RecType recType,
558                                        void *recTypeClientData);
559
560
561 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
562                              int deleteFlag)
563 {
564     ZEBRA_RES r = ZEBRA_OK;
565     int i, fd;
566     char gprefix[128];
567     char ext[128];
568     char ext_res[128];
569     struct file_read_info *fi = 0;
570     const char *original_record_type = 0;
571     RecType recType;
572     void *recTypeClientData;
573     struct ZebraRecStream stream, *streamp;
574
575     zebra_init_log_level();
576
577     if (!zh->m_group || !*zh->m_group)
578         *gprefix = '\0';
579     else
580         sprintf(gprefix, "%s.", zh->m_group);
581     
582     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
583
584     /* determine file extension */
585     *ext = '\0';
586     for (i = strlen(fname); --i >= 0; )
587         if (fname[i] == '/')
588             break;
589         else if (fname[i] == '.')
590         {
591             strcpy(ext, fname+i+1);
592             break;
593         }
594     /* determine file type - depending on extension */
595     original_record_type = zh->m_record_type;
596     if (!zh->m_record_type)
597     {
598         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
599         zh->m_record_type = res_get(zh->res, ext_res);
600     }
601     if (!zh->m_record_type)
602     {
603         check_log_limit(zh);
604         if (zh->records_processed + zh->records_skipped
605             < zh->m_file_verbose_limit)
606             yaz_log(YLOG_LOG, "? %s", fname);
607         zh->records_skipped++;
608         return 0;
609     }
610     /* determine match criteria */
611     if (!zh->m_record_id)
612     {
613         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
614         zh->m_record_id = res_get(zh->res, ext_res);
615     }
616
617     if (!(recType =
618           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
619                           &recTypeClientData)))
620     {
621         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
622         return ZEBRA_FAIL;
623     }
624
625     switch(recType->version)
626     {
627     case 0:
628         break;
629     default:
630         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
631     }
632     if (sysno && deleteFlag)
633     {
634         streamp = 0;
635         fi = 0;
636     }
637     else
638     {
639         char full_rep[1024];
640
641         if (zh->path_reg && !yaz_is_abspath(fname))
642         {
643             strcpy(full_rep, zh->path_reg);
644             strcat(full_rep, "/");
645             strcat(full_rep, fname);
646         }
647         else
648             strcpy(full_rep, fname);
649         
650         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
651         {
652             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
653             zh->m_record_type = original_record_type;
654             return ZEBRA_FAIL;
655         }
656         streamp = &stream;
657         zebra_create_stream_fd(streamp, fd, 0);
658     }
659     r = zebra_extract_records_stream(zh, streamp,
660                                      deleteFlag ? 
661                                      action_delete : action_update,
662                                      0, /* tst_mode */
663                                      zh->m_record_type,
664                                      sysno,
665                                      0, /*match_criteria */
666                                      fname,
667                                      recType, recTypeClientData);
668     if (streamp)
669         stream.destroy(streamp);
670     zh->m_record_type = original_record_type;
671     return r;
672 }
673
674 /*
675   If sysno is provided, then it's used to identify the record.
676   If not, and match_criteria is provided, then sysno is guessed
677   If not, and a record is provided, then sysno is got from there
678   
679  */
680
681 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
682                                       const char *buf, size_t buf_size,
683                                       enum zebra_recctrl_action_t action,
684                                       int test_mode, 
685                                       const char *recordType,
686                                       zint *sysno,
687                                       const char *match_criteria,
688                                       const char *fname)
689 {
690     struct ZebraRecStream stream;
691     ZEBRA_RES res;
692     void *clientData;
693     RecType recType = 0;
694
695     if (recordType && *recordType)
696     {
697         yaz_log(log_level_extract,
698                 "Record type explicitly specified: %s", recordType);
699         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
700                                   &clientData);
701     } 
702     else
703     {
704         if (!(zh->m_record_type))
705         {
706             yaz_log(YLOG_WARN, "No such record type defined");
707             return ZEBRA_FAIL;
708         }
709         yaz_log(log_level_extract, "Get record type from rgroup: %s",
710                 zh->m_record_type);
711         recType = recType_byName(zh->reg->recTypes, zh->res,
712                                   zh->m_record_type, &clientData);
713         recordType = zh->m_record_type;
714     }
715     
716     if (!recType)
717     {
718         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
719         return ZEBRA_FAIL;
720     }
721
722     zebra_create_stream_mem(&stream, buf, buf_size);
723
724     res = zebra_extract_records_stream(zh, &stream,
725                                        action,
726                                        test_mode, 
727                                        recordType,
728                                        sysno,
729                                        match_criteria,
730                                        fname,
731                                        recType, clientData);
732     stream.destroy(&stream);
733     return res;
734 }
735
736 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
737                                        struct ZebraRecStream *stream,
738                                        enum zebra_recctrl_action_t action,
739                                        int test_mode, 
740                                        const char *recordType,
741                                        zint *sysno,
742                                        const char *match_criteria,
743                                        const char *fname,
744                                        RecType recType,
745                                        void *recTypeClientData)
746 {
747     ZEBRA_RES res = ZEBRA_OK;
748     while (1)
749     {
750         int more = 0;
751         res = zebra_extract_record_stream(zh, stream,
752                                           action,
753                                           test_mode, 
754                                           recordType,
755                                           sysno,
756                                           match_criteria,
757                                           fname,
758                                           recType, recTypeClientData, &more);
759         if (!more)
760         {
761             res = ZEBRA_OK;
762             break;
763         }
764         if (res != ZEBRA_OK)
765             break;
766         if (sysno)
767             break;
768     }
769     return res;
770 }
771
772
773 static WRBUF wrbuf_hex_str(const char *cstr)
774 {
775     size_t i;
776     WRBUF w = wrbuf_alloc();
777     for (i = 0; cstr[i]; i++)
778     {
779         if (cstr[i] < ' ' || cstr[i] > 126)
780             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
781         else
782             wrbuf_putc(w, cstr[i]);
783     }
784     return w;
785 }
786
787 ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
788                                       struct ZebraRecStream *stream,
789                                       enum zebra_recctrl_action_t action,
790                                       int test_mode, 
791                                       const char *recordType,
792                                       zint *sysno,
793                                       const char *match_criteria,
794                                       const char *fname,
795                                       RecType recType,
796                                       void *recTypeClientData,
797                                       int *more)
798
799 {
800     zint sysno0 = 0;
801     RecordAttr *recordAttr;
802     struct recExtractCtrl extractCtrl;
803     int r;
804     const char *matchStr = 0;
805     Record rec;
806     off_t start_offset = 0, end_offset = 0;
807     const char *pr_fname = fname;  /* filename to print .. */
808     int show_progress = zh->records_processed + zh->records_skipped 
809         < zh->m_file_verbose_limit ? 1:0;
810
811     zebra_init_log_level();
812
813     if (!pr_fname)
814         pr_fname = "<no file>";  /* make it printable if file is omitted */
815
816     zebra_rec_keys_reset(zh->reg->keys);
817     zebra_rec_keys_reset(zh->reg->sortKeys);
818
819     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
820     {
821         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
822                                       zh->m_explain_database))
823             return ZEBRA_FAIL;
824     }
825
826     if (stream)
827     {
828         off_t null_offset = 0;
829         extractCtrl.stream = stream;
830
831         start_offset = stream->tellf(stream);
832
833         extractCtrl.first_record = start_offset ? 0 : 1;
834         
835         stream->endf(stream, &null_offset);;
836
837         extractCtrl.init = extract_init;
838         if (zh->reg->index_types)
839         {
840             extractCtrl.tokenAdd = extract_token_add2;
841         }
842         else
843         {
844             extractCtrl.tokenAdd = extract_token_add;
845         }
846         extractCtrl.schemaAdd = extract_schema_add;
847         extractCtrl.dh = zh->reg->dh;
848         extractCtrl.handle = zh;
849         extractCtrl.match_criteria[0] = '\0';
850         extractCtrl.staticrank = 0;
851         extractCtrl.action = action;
852
853         init_extractCtrl(zh, &extractCtrl);
854
855         extract_set_store_data_prepare(&extractCtrl);
856         
857         r = (*recType->extract)(recTypeClientData, &extractCtrl);
858
859         if (action == action_update)
860         {
861             action = extractCtrl.action;
862         }
863         
864         switch (r)
865         {
866         case RECCTRL_EXTRACT_EOF:
867             return ZEBRA_FAIL;
868         case RECCTRL_EXTRACT_ERROR_GENERIC:
869             /* error occured during extraction ... */
870             yaz_log(YLOG_WARN, "extract error: generic");
871             return ZEBRA_FAIL;
872         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
873             /* error occured during extraction ... */
874             yaz_log(YLOG_WARN, "extract error: no such filter");
875             return ZEBRA_FAIL;
876         case RECCTRL_EXTRACT_SKIP:
877             if (show_progress)
878                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
879                          recordType, pr_fname, (zint) start_offset);
880             *more = 1;
881             
882             end_offset = stream->endf(stream, 0);
883             if (end_offset)
884                 stream->seekf(stream, end_offset);
885
886             return ZEBRA_OK;
887         case RECCTRL_EXTRACT_OK:
888             break;
889         default:
890             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
891             return ZEBRA_FAIL;
892         }
893         end_offset = stream->endf(stream, 0);
894         if (end_offset)
895             stream->seekf(stream, end_offset);
896         else
897             end_offset = stream->tellf(stream);
898
899         all_matches_add(&extractCtrl);
900         
901         if (extractCtrl.match_criteria[0])
902             match_criteria = extractCtrl.match_criteria;
903     }
904
905     *more = 1;
906     if (!sysno)
907     {
908         sysno = &sysno0;
909
910         if (match_criteria && *match_criteria) {
911             matchStr = match_criteria;
912         } else {
913             if (zh->m_record_id && *zh->m_record_id) {
914                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
915                                                zh->m_record_id);
916                 if (!matchStr)
917                 {
918                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
919                              pr_fname, (zint) start_offset);
920                     return ZEBRA_FAIL;
921                 }
922             }
923         }
924         if (matchStr) 
925         {
926             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
927             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
928                                           matchStr);
929
930             
931             if (log_level_extract)
932             {
933                 WRBUF w = wrbuf_hex_str(matchStr);
934                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
935                 wrbuf_destroy(w);
936             }
937             if (rinfo)
938             {
939                 assert(*rinfo == sizeof(*sysno));
940                 memcpy(sysno, rinfo+1, sizeof(*sysno));
941             }
942        }
943     }
944     if (zebra_rec_keys_empty(zh->reg->keys))
945     {
946         /* the extraction process returned no information - the record
947            is probably empty - unless flagShowRecords is in use */
948         if (test_mode)
949             return ZEBRA_OK;
950     }
951
952     if (! *sysno)
953     {
954         /* new record */
955         if (action == action_delete)
956         {
957             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
958                          pr_fname, (zint) start_offset);
959             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
960             return ZEBRA_FAIL;
961         }
962         else if (action == action_replace)
963         {
964             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
965                          pr_fname, (zint) start_offset);
966             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
967             return ZEBRA_FAIL;
968         }
969         if (show_progress)
970             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
971                      (zint) start_offset);
972         rec = rec_new(zh->reg->records);
973
974         *sysno = rec->sysno;
975
976         recordAttr = rec_init_attr(zh->reg->zei, rec);
977         if (extractCtrl.staticrank < 0)
978         {
979             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
980             extractCtrl.staticrank = 0;
981         }
982
983         if (matchStr)
984         {
985             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
986             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
987                             sizeof(*sysno), sysno);
988         }
989
990         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
991 #if FLUSH2
992         extract_flush_record_keys2(zh, *sysno,
993                                    zh->reg->keys, extractCtrl.staticrank,
994                                    0, recordAttr->staticrank);
995 #else
996         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
997                                   extractCtrl.staticrank);
998 #endif
999         recordAttr->staticrank = extractCtrl.staticrank;
1000         zh->records_inserted++;
1001     } 
1002     else
1003     {
1004         /* record already exists */
1005         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1006         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1007         if (action == action_insert)
1008         {
1009             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1010                          recordType, pr_fname, (zint) start_offset);
1011             logRecord(zh);
1012             return ZEBRA_FAIL;
1013         }
1014
1015         rec = rec_get(zh->reg->records, *sysno);
1016         assert(rec);
1017         
1018         recordAttr = rec_init_attr(zh->reg->zei, rec);
1019
1020         /* decrease total size */
1021         zebraExplain_recordBytesIncrement(zh->reg->zei,
1022                                            - recordAttr->recordSize);
1023
1024         zebra_rec_keys_set_buf(delkeys,
1025                                rec->info[recInfo_delKeys],
1026                                rec->size[recInfo_delKeys],
1027                                0);
1028         zebra_rec_keys_set_buf(sortKeys,
1029                                rec->info[recInfo_sortKeys],
1030                                rec->size[recInfo_sortKeys],
1031                                0);
1032
1033         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1034 #if !FLUSH2
1035         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1036                                   recordAttr->staticrank);
1037 #endif
1038         if (action == action_delete)
1039         {
1040             /* record going to be deleted */
1041 #if FLUSH2
1042             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1043                                        delkeys, recordAttr->staticrank);
1044 #endif       
1045             if (zebra_rec_keys_empty(delkeys))
1046             {
1047                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1048                         pr_fname, (zint) start_offset);
1049                 yaz_log(YLOG_WARN, "cannot delete file above, "
1050                         "storeKeys false (3)");
1051             }
1052             else
1053             {
1054                 if (show_progress)
1055                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1056                             pr_fname, (zint) start_offset);
1057                 zh->records_deleted++;
1058                 if (matchStr)
1059                 {
1060                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1061                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1062                 }
1063                 rec_del(zh->reg->records, &rec);
1064             }
1065             zebra_rec_keys_close(delkeys);
1066             zebra_rec_keys_close(sortKeys);
1067             rec_free(&rec);
1068             logRecord(zh);
1069             return ZEBRA_OK;
1070         }
1071         else
1072         {   /* update or special_update */
1073             if (show_progress)
1074                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1075                         pr_fname, (zint) start_offset);
1076             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1077
1078 #if FLUSH2
1079             extract_flush_record_keys2(zh, *sysno,
1080                                        zh->reg->keys, extractCtrl.staticrank,
1081                                        delkeys, recordAttr->staticrank);
1082 #else
1083             extract_flush_record_keys(zh, *sysno, 1, 
1084                                       zh->reg->keys, extractCtrl.staticrank);
1085 #endif
1086             recordAttr->staticrank = extractCtrl.staticrank;
1087             zh->records_updated++;
1088         }
1089         zebra_rec_keys_close(delkeys);
1090         zebra_rec_keys_close(sortKeys);
1091     }
1092     /* update file type */
1093     xfree(rec->info[recInfo_fileType]);
1094     rec->info[recInfo_fileType] =
1095         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1096
1097     /* update filename */
1098     xfree(rec->info[recInfo_filename]);
1099     rec->info[recInfo_filename] =
1100         rec_strdup(fname, &rec->size[recInfo_filename]);
1101
1102     /* update delete keys */
1103     xfree(rec->info[recInfo_delKeys]);
1104     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1105     {
1106         zebra_rec_keys_get_buf(zh->reg->keys,
1107                                &rec->info[recInfo_delKeys],
1108                                &rec->size[recInfo_delKeys]);
1109     }
1110     else
1111     {
1112         rec->info[recInfo_delKeys] = NULL;
1113         rec->size[recInfo_delKeys] = 0;
1114     }
1115     /* update sort keys */
1116     xfree(rec->info[recInfo_sortKeys]);
1117
1118     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1119                            &rec->info[recInfo_sortKeys],
1120                            &rec->size[recInfo_sortKeys]);
1121
1122     if (stream)
1123     {
1124         recordAttr->recordSize = end_offset - start_offset;
1125         zebraExplain_recordBytesIncrement(zh->reg->zei,
1126                                           recordAttr->recordSize);
1127     }
1128
1129     /* set run-number for this record */
1130     recordAttr->runNumber =
1131         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1132
1133     /* update store data */
1134     xfree(rec->info[recInfo_storeData]);
1135
1136     /* update store data */
1137     if (zh->store_data_buf)
1138     {
1139         rec->size[recInfo_storeData] = zh->store_data_size;
1140         rec->info[recInfo_storeData] = zh->store_data_buf;
1141         zh->store_data_buf = 0;
1142         recordAttr->recordSize = zh->store_data_size;
1143     }
1144     else if (zh->m_store_data)
1145     {
1146         off_t cur_offset = stream->tellf(stream);
1147
1148         rec->size[recInfo_storeData] = recordAttr->recordSize;
1149         rec->info[recInfo_storeData] = (char *)
1150             xmalloc(recordAttr->recordSize);
1151         stream->seekf(stream, start_offset);
1152         stream->readf(stream, rec->info[recInfo_storeData],
1153                       recordAttr->recordSize);
1154         stream->seekf(stream, cur_offset);
1155     }
1156     else
1157     {
1158         rec->info[recInfo_storeData] = NULL;
1159         rec->size[recInfo_storeData] = 0;
1160     }
1161     /* update database name */
1162     xfree(rec->info[recInfo_databaseName]);
1163     rec->info[recInfo_databaseName] =
1164         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1165
1166     /* update offset */
1167     recordAttr->recordOffset = start_offset;
1168     
1169     /* commit this record */
1170     rec_put(zh->reg->records, &rec);
1171     logRecord(zh);
1172     return ZEBRA_OK;
1173 }
1174
1175 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1176 {
1177     ZebraHandle zh = (ZebraHandle) handle;
1178     struct recExtractCtrl extractCtrl;
1179
1180     if (zebraExplain_curDatabase(zh->reg->zei,
1181                                   rec->info[recInfo_databaseName]))
1182     {
1183         abort();
1184         if (zebraExplain_newDatabase(zh->reg->zei,
1185                                       rec->info[recInfo_databaseName], 0))
1186             abort();
1187     }
1188
1189     zebra_rec_keys_reset(zh->reg->keys);
1190     zebra_rec_keys_reset(zh->reg->sortKeys);
1191
1192     extractCtrl.init = extract_init;
1193     extractCtrl.tokenAdd = extract_token_add;
1194     extractCtrl.schemaAdd = extract_schema_add;
1195     extractCtrl.dh = zh->reg->dh;
1196
1197     init_extractCtrl(zh, &extractCtrl);
1198
1199     extractCtrl.flagShowRecords = 0;
1200     extractCtrl.match_criteria[0] = '\0';
1201     extractCtrl.staticrank = 0;
1202     extractCtrl.action = action_update;
1203
1204     extractCtrl.handle = handle;
1205     extractCtrl.first_record = 1;
1206     
1207     extract_set_store_data_prepare(&extractCtrl);
1208
1209     if (n)
1210         grs_extract_tree(&extractCtrl, n);
1211
1212     if (rec->size[recInfo_delKeys])
1213     {
1214         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1215         
1216         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1217
1218         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1219                                rec->size[recInfo_delKeys],
1220                                0);
1221 #if FLUSH2
1222         extract_flush_record_keys2(zh, rec->sysno, 
1223                                    zh->reg->keys, 0, delkeys, 0);
1224 #else
1225         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1226         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1227 #endif
1228         zebra_rec_keys_close(delkeys);
1229
1230         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1231                                rec->size[recInfo_sortKeys],
1232                                0);
1233
1234         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1235         zebra_rec_keys_close(sortkeys);
1236     }
1237     else
1238     {
1239 #if FLUSH2
1240         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1241 #else
1242         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1243 #endif
1244     }
1245     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1246     
1247     xfree(rec->info[recInfo_delKeys]);
1248     zebra_rec_keys_get_buf(zh->reg->keys,
1249                            &rec->info[recInfo_delKeys], 
1250                            &rec->size[recInfo_delKeys]);
1251
1252     xfree(rec->info[recInfo_sortKeys]);
1253     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1254                            &rec->info[recInfo_sortKeys],
1255                            &rec->size[recInfo_sortKeys]);
1256     return ZEBRA_OK;
1257 }
1258
1259 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1260                           zebra_rec_keys_t reckeys,
1261                           int level)
1262 {
1263     if (zebra_rec_keys_rewind(reckeys))
1264     {
1265         size_t slen;
1266         const char *str;
1267         struct it_key key;
1268         NMEM nmem = nmem_create();
1269
1270         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1271         {
1272             char keystr[200]; /* room for zints to print */
1273             char *dst_term = 0;
1274             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1275             int index_type, i;
1276             const char *string_index;
1277             
1278             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1279                                     0/* db */, &string_index);
1280             assert(index_type);
1281             zebra_term_untrans_iconv(zh, nmem, index_type,
1282                                      &dst_term, str);
1283             *keystr = '\0';
1284             for (i = 0; i<key.len; i++)
1285             {
1286                 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
1287             }
1288
1289             if (*str < CHR_BASE_CHAR)
1290             {
1291                 int i;
1292                 char dst_buf[200]; /* room for special chars */
1293
1294                 strcpy(dst_buf , "?");
1295
1296                 if (!strcmp(str, ""))
1297                     strcpy(dst_buf, "alwaysmatches");
1298                 if (!strcmp(str, FIRST_IN_FIELD_STR))
1299                     strcpy(dst_buf, "firstinfield");
1300                 else if (!strcmp(str, CHR_UNKNOWN))
1301                     strcpy(dst_buf, "unknown");
1302                 else if (!strcmp(str, CHR_SPACE))
1303                     strcpy(dst_buf, "space");
1304                 
1305                 for (i = 0; i<slen; i++)
1306                 {
1307                     sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1308                 }
1309                 yaz_log(level, "%s%c %s %s", keystr, index_type,
1310                         string_index, dst_buf);
1311                 
1312             }
1313             else
1314                 yaz_log(level, "%s%c %s \"%s\"", keystr, index_type,
1315                         string_index, dst_term);
1316
1317             nmem_reset(nmem);
1318         }
1319         nmem_destroy(nmem);
1320     }
1321 }
1322
1323 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1324                              zebra_rec_keys_t reckeys)
1325 {
1326     ZebraExplainInfo zei = zh->reg->zei;
1327     struct ord_stat {
1328         int no;
1329         int ord;
1330         struct ord_stat *next;
1331     };
1332
1333     if (zebra_rec_keys_rewind(reckeys))
1334     {
1335         struct ord_stat *ord_list = 0;
1336         struct ord_stat *p;
1337         size_t slen;
1338         const char *str;
1339         struct it_key key_in;
1340         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1341         {
1342             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1343
1344             for (p = ord_list; p ; p = p->next)
1345                 if (p->ord == ord)
1346                 {
1347                     p->no++;
1348                     break;
1349                 }
1350             if (!p)
1351             {
1352                 p = xmalloc(sizeof(*p));
1353                 p->no = 1;
1354                 p->ord = ord;
1355                 p->next = ord_list;
1356                 ord_list = p;
1357             }
1358         }
1359
1360         p = ord_list;
1361         while (p)
1362         {
1363             struct ord_stat *p1 = p;
1364
1365             if (is_insert)
1366                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1367             else
1368                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1369             p = p->next;
1370             xfree(p1);
1371         }
1372     }
1373 }
1374
1375 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1376                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1377                                 zebra_rec_keys_t del_keys, zint del_rank)
1378 {
1379     ZebraExplainInfo zei = zh->reg->zei;
1380     int normal = 0;
1381     int optimized = 0;
1382
1383     if (!zh->reg->key_block)
1384     {
1385         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1386         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1387         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1388         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1389     }
1390
1391     if (ins_keys)
1392     {
1393         extract_rec_keys_adjust(zh, 1, ins_keys);
1394         if (!del_keys)
1395             zebraExplain_recordCountIncrement(zei, 1);
1396         zebra_rec_keys_rewind(ins_keys);
1397     }
1398     if (del_keys)
1399     {
1400         extract_rec_keys_adjust(zh, 0, del_keys);
1401         if (!ins_keys)
1402             zebraExplain_recordCountIncrement(zei, -1);
1403         zebra_rec_keys_rewind(del_keys);
1404     }
1405
1406     while (1)
1407     {
1408         size_t del_slen;
1409         const char *del_str;
1410         struct it_key del_key_in;
1411         int del = 0;
1412
1413         size_t ins_slen;
1414         const char *ins_str;
1415         struct it_key ins_key_in;
1416         int ins = 0;
1417
1418         if (del_keys)
1419             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1420                                       &del_key_in);
1421         if (ins_keys)
1422             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1423                                       &ins_key_in);
1424
1425         if (del && ins && ins_rank == del_rank
1426             && !key_compare(&del_key_in, &ins_key_in) 
1427             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1428         {
1429             optimized++;
1430             continue;
1431         }
1432         if (!del && !ins)
1433             break;
1434         
1435         normal++;
1436         if (del)
1437             key_block_write(zh->reg->key_block, sysno, 
1438                             &del_key_in, 0, del_str, del_slen,
1439                             del_rank, zh->m_staticrank);
1440         if (ins)
1441             key_block_write(zh->reg->key_block, sysno, 
1442                             &ins_key_in, 1, ins_str, ins_slen,
1443                             ins_rank, zh->m_staticrank);
1444     }
1445     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1446 }
1447
1448 void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd,
1449                                zebra_rec_keys_t reckeys,
1450                                zint staticrank)
1451 {
1452     ZebraExplainInfo zei = zh->reg->zei;
1453
1454     extract_rec_keys_adjust(zh, cmd, reckeys);
1455
1456     if (log_level_details)
1457     {
1458         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1459                 sysno, cmd ? "insert" : "delete");
1460         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1461     }
1462
1463     if (!zh->reg->key_block)
1464     {
1465         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1466         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1467         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1468         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1469     }
1470     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1471
1472 #if 0
1473     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1474     print_rec_keys(zh, reckeys);
1475 #endif
1476     if (zebra_rec_keys_rewind(reckeys))
1477     {
1478         size_t slen;
1479         const char *str;
1480         struct it_key key_in;
1481         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1482         {
1483             key_block_write(zh->reg->key_block, sysno, 
1484                             &key_in, cmd, str, slen,
1485                             staticrank, zh->m_staticrank);
1486         }
1487     }
1488 }
1489
1490 ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
1491                                      zebra_rec_keys_t reckeys,
1492                                      zebra_snippets *snippets)
1493 {
1494     NMEM nmem = nmem_create();
1495     if (zebra_rec_keys_rewind(reckeys)) 
1496     {
1497         const char *str;
1498         size_t slen;
1499         struct it_key key;
1500         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1501         {
1502             char *dst_term = 0;
1503             int ord;
1504             zint seqno;
1505             int index_type;
1506
1507             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1508             seqno = key.mem[key.len-1];
1509             ord = CAST_ZINT_TO_INT(key.mem[0]);
1510             
1511             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1512                                     0/* db */, 0 /* string_index */);
1513             assert(index_type);
1514             zebra_term_untrans_iconv(zh, nmem, index_type,
1515                                      &dst_term, str);
1516             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1517             nmem_reset(nmem);
1518         }
1519     }
1520     nmem_destroy(nmem);
1521     return ZEBRA_OK;
1522 }
1523
1524 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1525 {
1526     yaz_log(YLOG_LOG, "print_rec_keys");
1527     if (zebra_rec_keys_rewind(reckeys))
1528     {
1529         const char *str;
1530         size_t slen;
1531         struct it_key key;
1532         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1533         {
1534             char dst_buf[IT_MAX_WORD];
1535             zint seqno;
1536             int index_type;
1537             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1538             const char *db = 0;
1539             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1540
1541             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1542             
1543             seqno = key.mem[key.len-1];
1544             
1545             zebra_term_untrans(zh, index_type, dst_buf, str);
1546             
1547             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1548                     " term=%s", ord, seqno, dst_buf); 
1549         }
1550     }
1551 }
1552
1553 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1554                                      const char *str, int length)
1555 {
1556     struct it_key key;
1557     ZebraHandle zh = p->extractCtrl->handle;
1558     ZebraExplainInfo zei = zh->reg->zei;
1559     int ch, i;
1560
1561     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1562     if (ch < 0)
1563         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1564
1565     i = 0;
1566     key.mem[i++] = ch;
1567     key.mem[i++] = p->record_id;
1568     key.mem[i++] = p->section_id;
1569
1570     if (zh->m_segment_indexing)
1571         key.mem[i++] = p->segment;
1572     key.mem[i++] = p->seqno;
1573     key.len = i;
1574
1575     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1576 }
1577
1578 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1579 {
1580     struct it_key key;
1581     ZebraHandle zh = p->extractCtrl->handle;
1582     ZebraExplainInfo zei = zh->reg->zei;
1583     int ch;
1584     zinfo_index_category_t cat = zinfo_index_category_sort;
1585
1586     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1587     if (ch < 0)
1588         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1589     key.len = 2;
1590     key.mem[0] = ch;
1591     key.mem[1] = p->record_id;
1592
1593     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1594 }
1595
1596 static void extract_add_staticrank_string(RecWord *p,
1597                                           const char *str, int length)
1598 {
1599     char valz[40];
1600     struct recExtractCtrl *ctrl = p->extractCtrl;
1601
1602     if (length > sizeof(valz)-1)
1603         length = sizeof(valz)-1;
1604
1605     memcpy(valz, str, length);
1606     valz[length] = '\0';
1607     ctrl->staticrank = atozint(valz);
1608 }
1609
1610 static void extract_add_string(RecWord *p, const char *string, int length)
1611 {
1612     ZebraHandle zh = p->extractCtrl->handle;
1613     assert(length > 0);
1614
1615     if (!p->index_name)
1616         return;
1617
1618     if (zebra_maps_is_index(zh->reg->zebra_maps, p->index_type))
1619     {
1620         extract_add_index_string(p, zinfo_index_category_index,
1621                                  string, length);
1622         if (zebra_maps_is_alwaysmatches(zh->reg->zebra_maps, p->index_type))
1623         {
1624             RecWord word;
1625             memcpy(&word, p, sizeof(word));
1626
1627             word.seqno = 1;
1628             extract_add_index_string(
1629                 &word, zinfo_index_category_alwaysmatches, "", 0);
1630         }
1631     }
1632     else if (zebra_maps_is_sort(zh->reg->zebra_maps, p->index_type))
1633     {
1634         extract_add_sort_string(p, string, length);
1635     }
1636     else if (zebra_maps_is_staticrank(zh->reg->zebra_maps, p->index_type))
1637     {
1638         extract_add_staticrank_string(p, string, length);
1639     }
1640 }
1641
1642 static void extract_add_incomplete_field(RecWord *p)
1643 {
1644     ZebraHandle zh = p->extractCtrl->handle;
1645     const char *b = p->term_buf;
1646     int remain = p->term_len;
1647     int first = 1;
1648     const char **map = 0;
1649     
1650     if (remain > 0)
1651         map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
1652
1653     while (map)
1654     {
1655         char buf[IT_MAX_WORD+1];
1656         int i, remain;
1657
1658         /* Skip spaces */
1659         while (map && *map && **map == *CHR_SPACE)
1660         {
1661             remain = p->term_len - (b - p->term_buf);
1662             if (remain > 0)
1663                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b,
1664                                        remain, 0);
1665             else
1666                 map = 0;
1667         }
1668         if (!map)
1669             break;
1670         i = 0;
1671         while (map && *map && **map != *CHR_SPACE)
1672         {
1673             const char *cp = *map;
1674
1675             while (i < IT_MAX_WORD && *cp)
1676                 buf[i++] = *(cp++);
1677             remain = p->term_len - (b - p->term_buf);
1678             if (remain > 0)
1679                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
1680             else
1681                 map = 0;
1682         }
1683         if (!i)
1684             return;
1685
1686         if (first)
1687         {   
1688             first = 0;
1689             if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type))
1690             {
1691                 /* first in field marker */
1692                 extract_add_string(p, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1693                 p->seqno++;
1694             }
1695         }
1696         extract_add_string(p, buf, i);
1697         p->seqno++;
1698     }
1699 }
1700
1701 static void extract_add_complete_field(RecWord *p)
1702 {
1703     ZebraHandle zh = p->extractCtrl->handle;
1704     const char *b = p->term_buf;
1705     char buf[IT_MAX_WORD+1];
1706     const char **map = 0;
1707     int i = 0, remain = p->term_len;
1708
1709     if (remain > 0)
1710         map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 1);
1711
1712     while (remain > 0 && i < IT_MAX_WORD)
1713     {
1714         while (map && *map && **map == *CHR_SPACE)
1715         {
1716             remain = p->term_len - (b - p->term_buf);
1717
1718             if (remain > 0)
1719             {
1720                 int first = i ? 0 : 1;  /* first position */
1721                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, first);
1722             }
1723             else
1724                 map = 0;
1725         }
1726         if (!map)
1727             break;
1728
1729         if (i && i < IT_MAX_WORD)
1730             buf[i++] = *CHR_SPACE;
1731         while (map && *map && **map != *CHR_SPACE)
1732         {
1733             const char *cp = *map;
1734
1735             if (**map == *CHR_CUT)
1736             {
1737                 i = 0;
1738             }
1739             else
1740             {
1741                 if (i >= IT_MAX_WORD)
1742                     break;
1743                 while (i < IT_MAX_WORD && *cp)
1744                     buf[i++] = *(cp++);
1745             }
1746             remain = p->term_len  - (b - p->term_buf);
1747             if (remain > 0)
1748             {
1749                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b,
1750                                         remain, 0);
1751             }
1752             else
1753                 map = 0;
1754         }
1755     }
1756     if (!i)
1757         return;
1758     extract_add_string(p, buf, i);
1759 }
1760
1761 static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type,
1762                                      RecWord *p)
1763 {
1764     struct it_key key;
1765     const char *res_buf = 0;
1766     size_t res_len = 0;
1767     int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len,
1768                                       &res_buf, &res_len);
1769     int cat = zinfo_index_category_index;
1770     int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1771     if (ch < 0)
1772         ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1773     while (r)
1774     {
1775         int i = 0;
1776         key.mem[i++] = ch;
1777         key.mem[i++] = p->record_id;
1778         key.mem[i++] = p->section_id;
1779         
1780         if (zh->m_segment_indexing)
1781             key.mem[i++] = p->segment;
1782         key.mem[i++] = p->seqno;
1783         key.len = i;
1784
1785         yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf);
1786         zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
1787         
1788         p->seqno++;
1789         r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len);
1790     }
1791 }
1792
1793 static void extract_token_add2(RecWord *p)
1794 {
1795     zebra_index_type_t type;
1796     ZebraHandle zh = p->extractCtrl->handle;
1797     char type_tmp[2];
1798     type_tmp[0] = p->index_type;
1799     type_tmp[1] = '\0';
1800     type = zebra_index_type_get(zh->reg->index_types, type_tmp);
1801     if (type)
1802     {
1803         if (zebra_index_type_is_index(type))
1804         {
1805             extract_token_add2_index(zh, type, p);
1806         }
1807         else if (zebra_index_type_is_sort(type))
1808         {
1809             ;
1810             
1811         }
1812     }
1813 }
1814
1815 /** \brief top-level indexing handler for recctrl system
1816     \param p token data to be indexed
1817
1818     Call sequence:
1819     extract_token
1820     zebra_add_{in}_complete
1821     extract_add_string
1822     
1823     extract_add_index_string
1824     or
1825     extract_add_sort_string
1826     or
1827     extract_add_staticrank_string
1828     
1829 */
1830 static void extract_token_add(RecWord *p)
1831 {
1832     ZebraHandle zh = p->extractCtrl->handle;
1833     WRBUF wrbuf;
1834
1835     if (log_level_details)
1836     {
1837         yaz_log(log_level_details, "extract_token_add "
1838                 "type=%c index=%s seqno=" ZINT_FORMAT " s=%.*s",
1839                 p->index_type, p->index_name, 
1840                 p->seqno, p->term_len, p->term_buf);
1841     }
1842     if ((wrbuf = zebra_replace(zh->reg->zebra_maps, p->index_type, 0,
1843                                p->term_buf, p->term_len)))
1844     {
1845         p->term_buf = wrbuf_buf(wrbuf);
1846         p->term_len = wrbuf_len(wrbuf);
1847     }
1848     if (zebra_maps_is_complete(zh->reg->zebra_maps, p->index_type))
1849         extract_add_complete_field(p);
1850     else
1851         extract_add_incomplete_field(p);
1852 }
1853
1854 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1855                                       void *buf, size_t sz)
1856 {
1857     ZebraHandle zh = (ZebraHandle) p->handle;
1858
1859     xfree(zh->store_data_buf);
1860     zh->store_data_buf = 0;
1861     zh->store_data_size = 0;
1862     if (buf && sz)
1863     {
1864         zh->store_data_buf = xmalloc(sz);
1865         zh->store_data_size = sz;
1866         memcpy(zh->store_data_buf, buf, sz);
1867     }
1868 }
1869
1870 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1871 {
1872     ZebraHandle zh = (ZebraHandle) p->handle;
1873     xfree(zh->store_data_buf);
1874     zh->store_data_buf = 0;
1875     zh->store_data_size = 0;
1876     p->setStoreData = extract_set_store_data_cb;
1877 }
1878
1879 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1880 {
1881     ZebraHandle zh = (ZebraHandle) p->handle;
1882     zebraExplain_addSchema(zh->reg->zei, oid);
1883 }
1884
1885 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1886                              int cmd, zebra_rec_keys_t reckeys)
1887 {
1888 #if 0
1889     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1890             cmd, sysno);
1891     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1892 #endif
1893
1894     if (zebra_rec_keys_rewind(reckeys))
1895     {
1896         zebra_sort_index_t si = zh->reg->sort_index;
1897         size_t slen;
1898         const char *str;
1899         struct it_key key_in;
1900
1901         zebra_sort_sysno(si, sysno);
1902
1903         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1904         {
1905             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1906             
1907             zebra_sort_type(si, ord);
1908             if (cmd == 1)
1909                 zebra_sort_add(si, str, slen);
1910             else
1911                 zebra_sort_delete(si);
1912         }
1913     }
1914 }
1915
1916 /*
1917  * Local variables:
1918  * c-basic-offset: 4
1919  * indent-tabs-mode: nil
1920  * End:
1921  * vim: shiftwidth=4 tabstop=8 expandtab
1922  */
1923