Fixes for snippet/scan where we do "reextract". The setStoreData
[idzebra-moved-to-github.git] / index / extract.c
1 /* $Id: extract.c,v 1.262 2007-08-31 07:02:24 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 #include <stdio.h>
24 #include <assert.h>
25 #include <ctype.h>
26 #ifdef WIN32
27 #include <io.h>
28 #endif
29 #if HAVE_UNISTD_H
30 #include <unistd.h>
31 #endif
32 #include <fcntl.h>
33
34 #include "index.h"
35 #include "orddict.h"
36 #include <direntz.h>
37 #include <charmap.h>
38
/* Module log levels.  All three start at 0 and are filled in by
   zebra_init_log_level() the first time it runs:
   log_level_extract comes from the "extract" log module,
   log_level_details from "indexdetails", and log_level_initialized
   records that the lookup has been done. */
39 static int log_level_extract = 0;
40 static int log_level_details = 0;
41 static int log_level_initialized = 0;
42
43 /* 1 if we eliminate identical delete/insert keys */
44 /* eventually the 0-case code will be removed */
45 #define FLUSH2 1
46
/* Forward declaration.  Used by zebra_extract_record_stream() in the
   FLUSH2 code path; it receives one key set plus rank for insertion and
   one key set plus rank for deletion (going by the parameter names).
   The definition is not visible in this section. */
47 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
48                                 zebra_rec_keys_t ins_keys,
49                                 zint ins_rank,
50                                 zebra_rec_keys_t del_keys,
51                                 zint del_rank);
52
53 static void zebra_init_log_level(void)
54 {
55     if (!log_level_initialized)
56     {
57         log_level_initialized = 1;
58
59         log_level_extract = yaz_log_module_level("extract");
60         log_level_details = yaz_log_module_level("indexdetails");
61     }
62 }
63
/* Forward declarations of static helpers that are referenced further
   down (e.g. extract_token_add and extract_schema_add are installed as
   callbacks in zebra_extract_record_stream()).  Their definitions are
   not visible in this section. */
64 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
65                                       int cmd, zebra_rec_keys_t reckeys,
66                                       zint staticrank);
67 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
68                                     int cmd, zebra_rec_keys_t skp);
69 static void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid);
70 static void extract_token_add (RecWord *p);
71
72 static void check_log_limit(ZebraHandle zh)
73 {
74     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
75     {
76         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
77                 zh->m_file_verbose_limit);
78     }
79 }
80
81 static void logRecord (ZebraHandle zh)
82 {
83     check_log_limit(zh);
84     ++zh->records_processed;
85     if (!(zh->records_processed % 1000))
86     {
87         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
88                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
89                 zh->records_processed, zh->records_inserted, 
90                 zh->records_updated, zh->records_deleted);
91     }
92 }
93
94 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
95 {
96     int i;
97     for (i = 0; i<256; i++)
98     {
99         if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))
100             ctrl->seqno[i] = 1;
101         else
102             ctrl->seqno[i] = 0;
103     }
104     ctrl->flagShowRecords = !zh->m_flag_rw;
105 }
106
107
/* Forward declarations.  extract_add_index_string() is used by
   all_matches_add() and extract_set_store_data_prepare() by
   zebra_extract_record_stream(); both are defined outside this section. */
108 static void extract_add_index_string (RecWord *p, 
109                                       zinfo_index_category_t cat,
110                                       const char *str, int length);
111
112 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
113
114 static void extract_init(struct recExtractCtrl *p, RecWord *w)
115 {
116     w->seqno = 1;
117     w->index_name = "any";
118     w->index_type = 'w';
119     w->extractCtrl = p;
120     w->record_id = 0;
121     w->section_id = 0;
122     w->segment = 0;
123 }
124
/* Context handed to the snippet callbacks through
   recExtractCtrl.handle (see extract_snippet()). */
125 struct snip_rec_info {
    /* handle whose register (zebra_maps, zei) the callbacks consult */
126     ZebraHandle zh;
    /* snippet collection that the callbacks append to */
127     zebra_snippets *snippets;
128 };
129
130
/* Snippet callback helper for a field whose index type is "complete"
   (see snippet_token_add()).  The term buffer is fed through
   zebra_maps_input(); 'start' is moved past leading input that maps to
   CHR_SPACE and 'last' tracks the end of the most recently consumed
   non-space input.  If any mapped output was produced, a single snippet
   with the raw bytes [start, last) is appended for index ordinal 'ord'.
   Note: buf[] is filled with the mapped characters but is never read;
   only its fill level 'i' decides whether a snippet is emitted. */
131 static void snippet_add_complete_field(RecWord *p, int ord)
132 {
133     struct snip_rec_info *h = p->extractCtrl->handle;
134     ZebraHandle zh = h->zh;
135
136     const char *b = p->term_buf;
137     char buf[IT_MAX_WORD+1];
138     const char **map = 0;
139     int i = 0, remain = p->term_len;
140     const char *start = b;
141     const char *last = 0;
142
143     if (remain > 0)
144         map = zebra_maps_input (zh->reg->zebra_maps, p->index_type, &b, remain, 1);
145
146     while (remain > 0 && i < IT_MAX_WORD)
147     {
        /* skip a run of input that maps to CHR_SPACE */
148         while (map && *map && **map == *CHR_SPACE)
149         {
150             remain = p->term_len - (b - p->term_buf);
151
152             if (i == 0)
153                 start = b;  /* set to first non-ws area */
154             if (remain > 0)
155             {
156                 int first = i ? 0 : 1;  /* first position */
157
158                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, 
159                                        &b, remain, first);
160             }
161             else
162                 map = 0;
163         }
164         if (!map)
165             break;
166
        /* separate this run from the previous one with a single space */
167         if (i && i < IT_MAX_WORD)
168             buf[i++] = *CHR_SPACE;
169         while (map && *map && **map != *CHR_SPACE)
170         {
171             const char *cp = *map;
172
173             if (**map == *CHR_CUT)
174             {
                /* CHR_CUT discards everything collected so far */
175                 i = 0;
176             }
177             else
178             {
179                 if (i >= IT_MAX_WORD)
180                     break;
181                 while (i < IT_MAX_WORD && *cp)
182                     buf[i++] = *(cp++);
183             }
            /* remember where the consumed non-space input ends */
184             last = b;
185             remain = p->term_len  - (b - p->term_buf);
186             if (remain > 0)
187             {
188                 map = zebra_maps_input (zh->reg->zebra_maps, p->index_type, &b,
189                                         remain, 0);
190             }
191             else
192                 map = 0;
193         }
194     }
    /* nothing was collected: emit no snippet */
195     if (!i)
196         return;
197     if (last && start != last)
198         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
199                                start, last - start);
200 }
201
/* Snippet callback helper for a field whose index type is not
   "complete" (see snippet_token_add()).  The term buffer is fed through
   zebra_maps_input() and split on input that maps to CHR_SPACE.
   For every word found:
     - the raw bytes between the previous word and this one, if any, are
       appended with third argument 1;
     - the raw bytes of the word itself are appended with third
       argument 0;
     - p->seqno is incremented afterwards (and once extra before the
       first word when zebra_maps_is_first_in_field() holds).
   buf[] collects the mapped word but is never read; only 'i' is tested. */
202 static void snippet_add_incomplete_field(RecWord *p, int ord)
203 {
204     struct snip_rec_info *h = p->extractCtrl->handle;
205     ZebraHandle zh = h->zh;
206     const char *b = p->term_buf;
207     int remain = p->term_len;
208     int first = 1;
209     const char **map = 0;
210     const char *start = b;
211     const char *last = b;
212
213     if (remain > 0)
214         map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
215
216     while (map)
217     {
218         char buf[IT_MAX_WORD+1];
        /* NOTE: this 'remain' shadows the function-level variable */
219         int i, remain;
220
221         /* Skip spaces */
222         while (map && *map && **map == *CHR_SPACE)
223         {
224             remain = p->term_len - (b - p->term_buf);
225             last = b;
226             if (remain > 0)
227                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b,
228                                        remain, 0);
229             else
230                 map = 0;
231         }
232         if (!map)
233             break;
        /* emit the skipped stretch [start, last) */
234         if (start != last)
235         {
236             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
237                                    start, last - start);
238
239         }
240         start = last;
241
        /* collect one word: everything up to the next CHR_SPACE mapping */
242         i = 0;
243         while (map && *map && **map != *CHR_SPACE)
244         {
245             const char *cp = *map;
246
247             while (i < IT_MAX_WORD && *cp)
248                 buf[i++] = *(cp++);
249             remain = p->term_len - (b - p->term_buf);
250             last = b;
251             if (remain > 0)
252                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
253             else
254                 map = 0;
255         }
        /* an empty word ends processing of the whole buffer */
256         if (!i)
257             return;
258
259         if (first)
260         {   
261             first = 0;
262             if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type))
263             {
264                 /* first in field marker */
265                 p->seqno++;
266             }
267         }
        /* emit the word itself, then advance to the next position */
268         if (start != last)
269             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
270                                    start, last - start);
271         start = last;
272         p->seqno++;
273     }
274
275 }
276
277 static void snippet_token_add(RecWord *p)
278 {
279     struct snip_rec_info *h = p->extractCtrl->handle;
280     ZebraHandle zh = h->zh;
281
282     if (zebra_maps_is_index(zh->reg->zebra_maps, p->index_type))
283     {
284         ZebraExplainInfo zei = zh->reg->zei;
285         int ch = zebraExplain_lookup_attr_str(
286             zei, zinfo_index_category_index, p->index_type, p->index_name);
287
288         if (zebra_maps_is_complete (h->zh->reg->zebra_maps, p->index_type))
289             snippet_add_complete_field (p, ch);
290         else
291             snippet_add_incomplete_field(p, ch);
292     }
293 }
294
295 static void snippet_schema_add(
296     struct recExtractCtrl *p, Odr_oid *oid)
297 {
298
299 }
300
301 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
302                      struct ZebraRecStream *stream,
303                      RecType rt, void *recTypeClientData)
304 {
305     struct recExtractCtrl extractCtrl;
306     struct snip_rec_info info;
307     int r;
308
309     extractCtrl.stream = stream;
310     extractCtrl.first_record = 1;
311     extractCtrl.init = extract_init;
312     extractCtrl.tokenAdd = snippet_token_add;
313     extractCtrl.schemaAdd = snippet_schema_add;
314     assert(zh->reg);
315     assert(zh->reg->dh);
316
317     extractCtrl.dh = zh->reg->dh;
318     
319     info.zh = zh;
320     info.snippets = sn;
321     extractCtrl.handle = &info;
322     extractCtrl.match_criteria[0] = '\0';
323     extractCtrl.staticrank = 0;
324     extractCtrl.action = action_insert;
325     
326     init_extractCtrl(zh, &extractCtrl);
327
328     extractCtrl.setStoreData = 0;
329
330     r = (*rt->extract)(recTypeClientData, &extractCtrl);
331
332 }
333
334 static void searchRecordKey(ZebraHandle zh,
335                             zebra_rec_keys_t reckeys,
336                             const char *index_name,
337                             const char **ws, int ws_length)
338 {
339     int i;
340     int ch = -1;
341     zinfo_index_category_t cat = zinfo_index_category_index;
342
343     for (i = 0; i<ws_length; i++)
344         ws[i] = NULL;
345
346     if (ch < 0)
347         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, '0', index_name);
348     if (ch < 0)
349         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 'p', index_name);
350     if (ch < 0)
351         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 'w', index_name);
352
353     if (ch < 0)
354         return ;
355
356     if (zebra_rec_keys_rewind(reckeys))
357     {
358         zint startSeq = -1;
359         const char *str;
360         size_t slen;
361         struct it_key key;
362         zint seqno;
363         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
364         {
365             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
366
367             seqno = key.mem[key.len-1];
368             
369             if (key.mem[0] == ch)
370             {
371                 zint woff;
372                 
373                 if (startSeq == -1)
374                     startSeq = seqno;
375                 woff = seqno - startSeq;
376                 if (woff >= 0 && woff < ws_length)
377                     ws[woff] = str;
378             }
379         }
380     }
381 }
382
383 #define FILE_MATCH_BLANK "\t "
384
385 static char *get_match_from_spec(ZebraHandle zh,
386                           zebra_rec_keys_t reckeys,
387                           const char *fname, const char *spec)
388 {
389     static char dstBuf[2048];      /* static here ??? */
390     char *dst = dstBuf;
391     const char *s = spec;
392
393     while (1)
394     {
395         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
396             ;
397         if (!*s)
398             break;
399         if (*s == '(')
400         {
401             const char *ws[32];
402             char attset_str[64], attname_str[64];
403             int i;
404             int first = 1;
405             
406             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
407                 ;
408             for (i = 0; *s && *s != ',' && *s != ')' && 
409                      !strchr(FILE_MATCH_BLANK, *s); s++)
410                 if (i+1 < sizeof(attset_str))
411                     attset_str[i++] = *s;
412             attset_str[i] = '\0';
413             
414             for (; strchr(FILE_MATCH_BLANK, *s); s++)
415                 ;
416             if (*s != ',')
417                 strcpy(attname_str, attset_str);
418             else
419             {
420                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
421                     ;
422                 for (i = 0; *s && *s != ')' && 
423                          !strchr(FILE_MATCH_BLANK, *s); s++)
424                     if (i+1 < sizeof(attname_str))
425                         attname_str[i++] = *s;
426                 attname_str[i] = '\0';
427             }
428
429             searchRecordKey (zh, reckeys, attname_str, ws, 32);
430
431             if (*s != ')')
432             {
433                 yaz_log (YLOG_WARN, "Missing ) in match criteria %s in group %s",
434                       spec, zh->m_group ? zh->m_group : "none");
435                 return NULL;
436             }
437             s++;
438
439             for (i = 0; i<32; i++)
440                 if (ws[i])
441                 {
442                     if (first)
443                     {
444                         *dst++ = ' ';
445                         first = 0;
446                     }
447                     strcpy (dst, ws[i]);
448                     dst += strlen(ws[i]);
449                 }
450             if (first)
451             {
452                 yaz_log (YLOG_WARN, "Record didn't contain match"
453                       " fields in (%s,%s)", attset_str, attname_str);
454                 return NULL;
455             }
456         }
457         else if (*s == '$')
458         {
459             int spec_len;
460             char special[64];
461             const char *spec_src = NULL;
462             const char *s1 = ++s;
463             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
464                 s1++;
465
466             spec_len = s1 - s;
467             if (spec_len > sizeof(special)-1)
468                 spec_len = sizeof(special)-1;
469             memcpy (special, s, spec_len);
470             special[spec_len] = '\0';
471             s = s1;
472
473             if (!strcmp (special, "group"))
474                 spec_src = zh->m_group;
475             else if (!strcmp (special, "database"))
476                 spec_src = zh->basenames[0];
477             else if (!strcmp (special, "filename")) {
478                 spec_src = fname;
479             }
480             else if (!strcmp (special, "type"))
481                 spec_src = zh->m_record_type;
482             else 
483                 spec_src = NULL;
484             if (spec_src)
485             {
486                 strcpy (dst, spec_src);
487                 dst += strlen (spec_src);
488             }
489         }
490         else if (*s == '\"' || *s == '\'')
491         {
492             int stopMarker = *s++;
493             char tmpString[64];
494             int i = 0;
495
496             while (*s && *s != stopMarker)
497             {
498                 if (i+1 < sizeof(tmpString))
499                     tmpString[i++] = *s++;
500             }
501             if (*s)
502                 s++;
503             tmpString[i] = '\0';
504             strcpy (dst, tmpString);
505             dst += strlen (tmpString);
506         }
507         else
508         {
509             yaz_log (YLOG_WARN, "Syntax error in match criteria %s in group %s",
510                   spec, zh->m_group ? zh->m_group : "none");
511             return NULL;
512         }
513         *dst++ = 1;
514     }
515     if (dst == dstBuf)
516     {
517         yaz_log (YLOG_WARN, "No match criteria for record %s in group %s",
518               fname, zh->m_group ? zh->m_group : "none");
519         return NULL;
520     }
521     *dst = '\0';
522     return dstBuf;
523 }
524
/* Location information about a record for logging purposes.
   NOTE(review): not referenced anywhere in the visible part of this
   file; confirm whether it is still in use. */
525 struct recordLogInfo {
526     const char *fname;
527     int recordOffset;
528     struct recordGroup *rGroup;
529 };
530
531 static void all_matches_add(struct recExtractCtrl *ctrl)
532 {
533     RecWord word;
534     extract_init(ctrl, &word);
535     word.index_name = "_ALLRECORDS";
536     word.index_type = 'w';
537     word.seqno = 1;
538     extract_add_index_string (&word, zinfo_index_category_alwaysmatches,
539                               "", 0);
540 }
541
/* Forward declaration; needed because zebra_extract_file() and
   zebra_buffer_extract_record() call this function before its
   definition further down in this file. */
542 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
543                                        struct ZebraRecStream *stream,
544                                        enum zebra_recctrl_action_t action,
545                                        int test_mode, 
546                                        const char *recordType,
547                                        zint *sysno,
548                                        const char *match_criteria,
549                                        const char *fname,
550                                        RecType recType,
551                                        void *recTypeClientData);
552
553
554 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
555                              int deleteFlag)
556 {
557     ZEBRA_RES r = ZEBRA_OK;
558     int i, fd;
559     char gprefix[128];
560     char ext[128];
561     char ext_res[128];
562     struct file_read_info *fi = 0;
563     const char *original_record_type = 0;
564     RecType recType;
565     void *recTypeClientData;
566     struct ZebraRecStream stream, *streamp;
567
568     zebra_init_log_level();
569
570     if (!zh->m_group || !*zh->m_group)
571         *gprefix = '\0';
572     else
573         sprintf (gprefix, "%s.", zh->m_group);
574     
575     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
576
577     /* determine file extension */
578     *ext = '\0';
579     for (i = strlen(fname); --i >= 0; )
580         if (fname[i] == '/')
581             break;
582         else if (fname[i] == '.')
583         {
584             strcpy (ext, fname+i+1);
585             break;
586         }
587     /* determine file type - depending on extension */
588     original_record_type = zh->m_record_type;
589     if (!zh->m_record_type)
590     {
591         sprintf (ext_res, "%srecordType.%s", gprefix, ext);
592         zh->m_record_type = res_get (zh->res, ext_res);
593     }
594     if (!zh->m_record_type)
595     {
596         check_log_limit(zh);
597         if (zh->records_processed + zh->records_skipped
598             < zh->m_file_verbose_limit)
599             yaz_log (YLOG_LOG, "? %s", fname);
600         zh->records_skipped++;
601         return 0;
602     }
603     /* determine match criteria */
604     if (!zh->m_record_id)
605     {
606         sprintf (ext_res, "%srecordId.%s", gprefix, ext);
607         zh->m_record_id = res_get (zh->res, ext_res);
608     }
609
610     if (!(recType =
611           recType_byName (zh->reg->recTypes, zh->res, zh->m_record_type,
612                           &recTypeClientData)))
613     {
614         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
615         return ZEBRA_FAIL;
616     }
617
618     switch(recType->version)
619     {
620     case 0:
621         break;
622     default:
623         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
624     }
625     if (sysno && deleteFlag)
626     {
627         streamp = 0;
628         fi = 0;
629     }
630     else
631     {
632         char full_rep[1024];
633
634         if (zh->path_reg && !yaz_is_abspath (fname))
635         {
636             strcpy (full_rep, zh->path_reg);
637             strcat (full_rep, "/");
638             strcat (full_rep, fname);
639         }
640         else
641             strcpy (full_rep, fname);
642         
643         if ((fd = open (full_rep, O_BINARY|O_RDONLY)) == -1)
644         {
645             yaz_log (YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
646             zh->m_record_type = original_record_type;
647             return ZEBRA_FAIL;
648         }
649         streamp = &stream;
650         zebra_create_stream_fd(streamp, fd, 0);
651     }
652     r = zebra_extract_records_stream(zh, streamp,
653                                      deleteFlag ? 
654                                      action_delete : action_update,
655                                      0, /* tst_mode */
656                                      zh->m_record_type,
657                                      sysno,
658                                      0, /*match_criteria */
659                                      fname,
660                                      recType, recTypeClientData);
661     if (streamp)
662         stream.destroy(streamp);
663     zh->m_record_type = original_record_type;
664     return r;
665 }
666
667 /*
668   If sysno is provided, then it's used to identify the reocord.
669   If not, and match_criteria is provided, then sysno is guessed
670   If not, and a record is provided, then sysno is got from there
671   
672  */
673
674 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
675                                       const char *buf, size_t buf_size,
676                                       enum zebra_recctrl_action_t action,
677                                       int test_mode, 
678                                       const char *recordType,
679                                       zint *sysno,
680                                       const char *match_criteria,
681                                       const char *fname)
682 {
683     struct ZebraRecStream stream;
684     ZEBRA_RES res;
685     void *clientData;
686     RecType recType = 0;
687
688     if (recordType && *recordType)
689     {
690         yaz_log(log_level_extract,
691                 "Record type explicitly specified: %s", recordType);
692         recType = recType_byName (zh->reg->recTypes, zh->res, recordType,
693                                   &clientData);
694     } 
695     else
696     {
697         if (!(zh->m_record_type))
698         {
699             yaz_log (YLOG_WARN, "No such record type defined");
700             return ZEBRA_FAIL;
701         }
702         yaz_log(log_level_extract, "Get record type from rgroup: %s",
703                 zh->m_record_type);
704         recType = recType_byName (zh->reg->recTypes, zh->res,
705                                   zh->m_record_type, &clientData);
706         recordType = zh->m_record_type;
707     }
708     
709     if (!recType)
710     {
711         yaz_log (YLOG_WARN, "No such record type: %s", recordType);
712         return ZEBRA_FAIL;
713     }
714
715     zebra_create_stream_mem(&stream, buf, buf_size);
716
717     res = zebra_extract_records_stream(zh, &stream,
718                                        action,
719                                        test_mode, 
720                                        recordType,
721                                        sysno,
722                                        match_criteria,
723                                        fname,
724                                        recType, clientData);
725     stream.destroy(&stream);
726     return res;
727 }
728
729 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
730                                        struct ZebraRecStream *stream,
731                                        enum zebra_recctrl_action_t action,
732                                        int test_mode, 
733                                        const char *recordType,
734                                        zint *sysno,
735                                        const char *match_criteria,
736                                        const char *fname,
737                                        RecType recType,
738                                        void *recTypeClientData)
739 {
740     ZEBRA_RES res = ZEBRA_OK;
741     while (1)
742     {
743         int more = 0;
744         res = zebra_extract_record_stream(zh, stream,
745                                           action,
746                                           test_mode, 
747                                           recordType,
748                                           sysno,
749                                           match_criteria,
750                                           fname,
751                                           recType, recTypeClientData, &more);
752         if (!more)
753         {
754             res = ZEBRA_OK;
755             break;
756         }
757         if (res != ZEBRA_OK)
758             break;
759         if (sysno)
760             break;
761     }
762     return res;
763 }
764
765
766 static WRBUF wrbuf_hex_str(const char *cstr)
767 {
768     size_t i;
769     WRBUF w = wrbuf_alloc();
770     for (i = 0; cstr[i]; i++)
771     {
772         if (cstr[i] < ' ' || cstr[i] > 126)
773             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
774         else
775             wrbuf_putc(w, cstr[i]);
776     }
777     return w;
778 }
779
780 ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
781                                       struct ZebraRecStream *stream,
782                                       enum zebra_recctrl_action_t action,
783                                       int test_mode, 
784                                       const char *recordType,
785                                       zint *sysno,
786                                       const char *match_criteria,
787                                       const char *fname,
788                                       RecType recType,
789                                       void *recTypeClientData,
790                                       int *more)
791
792 {
793     zint sysno0 = 0;
794     RecordAttr *recordAttr;
795     struct recExtractCtrl extractCtrl;
796     int r;
797     const char *matchStr = 0;
798     Record rec;
799     off_t start_offset = 0, end_offset = 0;
800     const char *pr_fname = fname;  /* filename to print .. */
801     int show_progress = zh->records_processed + zh->records_skipped 
802         < zh->m_file_verbose_limit ? 1:0;
803
804     zebra_init_log_level();
805
806     if (!pr_fname)
807         pr_fname = "<no file>";  /* make it printable if file is omitted */
808
809     zebra_rec_keys_reset(zh->reg->keys);
810     zebra_rec_keys_reset(zh->reg->sortKeys);
811
812     if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
813     {
814         if (zebraExplain_newDatabase (zh->reg->zei, zh->basenames[0], 
815                                       zh->m_explain_database))
816             return ZEBRA_FAIL;
817     }
818
819     if (stream)
820     {
821         off_t null_offset = 0;
822         extractCtrl.stream = stream;
823
824         start_offset = stream->tellf(stream);
825
826         extractCtrl.first_record = start_offset ? 0 : 1;
827         
828         stream->endf(stream, &null_offset);;
829
830         extractCtrl.init = extract_init;
831         extractCtrl.tokenAdd = extract_token_add;
832         extractCtrl.schemaAdd = extract_schema_add;
833         extractCtrl.dh = zh->reg->dh;
834         extractCtrl.handle = zh;
835         extractCtrl.match_criteria[0] = '\0';
836         extractCtrl.staticrank = 0;
837         extractCtrl.action = action;
838
839         init_extractCtrl(zh, &extractCtrl);
840
841         extract_set_store_data_prepare(&extractCtrl);
842         
843         r = (*recType->extract)(recTypeClientData, &extractCtrl);
844
845         if (action == action_update)
846         {
847             action = extractCtrl.action;
848         }
849         
850         switch (r)
851         {
852         case RECCTRL_EXTRACT_EOF:
853             return ZEBRA_FAIL;
854         case RECCTRL_EXTRACT_ERROR_GENERIC:
855             /* error occured during extraction ... */
856             yaz_log (YLOG_WARN, "extract error: generic");
857             return ZEBRA_FAIL;
858         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
859             /* error occured during extraction ... */
860             yaz_log (YLOG_WARN, "extract error: no such filter");
861             return ZEBRA_FAIL;
862         case RECCTRL_EXTRACT_SKIP:
863             if (show_progress)
864                 yaz_log (YLOG_LOG, "skip %s %s " ZINT_FORMAT,
865                          recordType, pr_fname, (zint) start_offset);
866             *more = 1;
867             
868             end_offset = stream->endf(stream, 0);
869             if (end_offset)
870                 stream->seekf(stream, end_offset);
871
872             return ZEBRA_OK;
873         case RECCTRL_EXTRACT_OK:
874             break;
875         default:
876             yaz_log (YLOG_WARN, "extract error: unknown error: %d", r);
877             return ZEBRA_FAIL;
878         }
879         end_offset = stream->endf(stream, 0);
880         if (end_offset)
881             stream->seekf(stream, end_offset);
882         else
883             end_offset = stream->tellf(stream);
884
885         all_matches_add(&extractCtrl);
886         
887         if (extractCtrl.match_criteria[0])
888             match_criteria = extractCtrl.match_criteria;
889     }
890
891     *more = 1;
892     if (!sysno)
893     {
894         sysno = &sysno0;
895
896         if (match_criteria && *match_criteria) {
897             matchStr = match_criteria;
898         } else {
899             if (zh->m_record_id && *zh->m_record_id) {
900                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
901                                                zh->m_record_id);
902                 if (!matchStr)
903                 {
904                     yaz_log (YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
905                              pr_fname, (zint) start_offset);
906                     return ZEBRA_FAIL;
907                 }
908             }
909         }
910         if (matchStr) 
911         {
912             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
913             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
914                                           matchStr);
915
916             
917             if (log_level_extract)
918             {
919                 WRBUF w = wrbuf_hex_str(matchStr);
920                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
921                 wrbuf_destroy(w);
922             }
923             if (rinfo)
924             {
925                 assert(*rinfo == sizeof(*sysno));
926                 memcpy (sysno, rinfo+1, sizeof(*sysno));
927             }
928        }
929     }
930     if (zebra_rec_keys_empty(zh->reg->keys))
931     {
932         /* the extraction process returned no information - the record
933            is probably empty - unless flagShowRecords is in use */
934         if (test_mode)
935             return ZEBRA_OK;
936     }
937
938     if (! *sysno)
939     {
940         /* new record */
941         if (action == action_delete)
942         {
943             yaz_log (YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
944                          pr_fname, (zint) start_offset);
945             yaz_log (YLOG_WARN, "cannot delete record above (seems new)");
946             return ZEBRA_FAIL;
947         }
948         else if (action == action_replace)
949         {
950             yaz_log (YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
951                          pr_fname, (zint) start_offset);
952             yaz_log (YLOG_WARN, "cannot update record above (seems new)");
953             return ZEBRA_FAIL;
954         }
955         if (show_progress)
956             yaz_log (YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
957                      (zint) start_offset);
958         rec = rec_new (zh->reg->records);
959
960         *sysno = rec->sysno;
961
962         recordAttr = rec_init_attr (zh->reg->zei, rec);
963         if (extractCtrl.staticrank < 0)
964         {
965             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
966             extractCtrl.staticrank = 0;
967         }
968
969         if (matchStr)
970         {
971             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
972             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
973                             sizeof(*sysno), sysno);
974         }
975
976         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
977 #if FLUSH2
978         extract_flush_record_keys2(zh, *sysno,
979                                    zh->reg->keys, extractCtrl.staticrank,
980                                    0, recordAttr->staticrank);
981 #else
982         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
983                                   extractCtrl.staticrank);
984 #endif
985         recordAttr->staticrank = extractCtrl.staticrank;
986         zh->records_inserted++;
987     } 
988     else
989     {
990         /* record already exists */
991         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
992         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
993         if (action == action_insert)
994         {
995             yaz_log (YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
996                          recordType, pr_fname, (zint) start_offset);
997             logRecord(zh);
998             return ZEBRA_FAIL;
999         }
1000
1001         rec = rec_get (zh->reg->records, *sysno);
1002         assert (rec);
1003         
1004         recordAttr = rec_init_attr (zh->reg->zei, rec);
1005
1006         /* decrease total size */
1007         zebraExplain_recordBytesIncrement (zh->reg->zei,
1008                                            - recordAttr->recordSize);
1009
1010         zebra_rec_keys_set_buf(delkeys,
1011                                rec->info[recInfo_delKeys],
1012                                rec->size[recInfo_delKeys],
1013                                0);
1014         zebra_rec_keys_set_buf(sortKeys,
1015                                rec->info[recInfo_sortKeys],
1016                                rec->size[recInfo_sortKeys],
1017                                0);
1018
1019         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1020 #if !FLUSH2
1021         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1022                                   recordAttr->staticrank);
1023 #endif
1024         if (action == action_delete)
1025         {
1026             /* record going to be deleted */
1027 #if FLUSH2
1028             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1029                                        delkeys, recordAttr->staticrank);
1030 #endif       
1031             if (zebra_rec_keys_empty(delkeys))
1032             {
1033                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1034                         pr_fname, (zint) start_offset);
1035                 yaz_log(YLOG_WARN, "cannot delete file above, "
1036                         "storeKeys false (3)");
1037             }
1038             else
1039             {
1040                 if (show_progress)
1041                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1042                             pr_fname, (zint) start_offset);
1043                 zh->records_deleted++;
1044                 if (matchStr)
1045                 {
1046                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1047                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1048                 }
1049                 rec_del (zh->reg->records, &rec);
1050             }
1051             zebra_rec_keys_close(delkeys);
1052             zebra_rec_keys_close(sortKeys);
1053             rec_free(&rec);
1054             logRecord(zh);
1055             return ZEBRA_OK;
1056         }
1057         else
1058         {   /* update or special_update */
1059             if (show_progress)
1060                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1061                         pr_fname, (zint) start_offset);
1062             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1063
1064 #if FLUSH2
1065             extract_flush_record_keys2(zh, *sysno,
1066                                        zh->reg->keys, extractCtrl.staticrank,
1067                                        delkeys, recordAttr->staticrank);
1068 #else
1069             extract_flush_record_keys(zh, *sysno, 1, 
1070                                       zh->reg->keys, extractCtrl.staticrank);
1071 #endif
1072             recordAttr->staticrank = extractCtrl.staticrank;
1073             zh->records_updated++;
1074         }
1075         zebra_rec_keys_close(delkeys);
1076         zebra_rec_keys_close(sortKeys);
1077     }
1078     /* update file type */
1079     xfree (rec->info[recInfo_fileType]);
1080     rec->info[recInfo_fileType] =
1081         rec_strdup (recordType, &rec->size[recInfo_fileType]);
1082
1083     /* update filename */
1084     xfree (rec->info[recInfo_filename]);
1085     rec->info[recInfo_filename] =
1086         rec_strdup (fname, &rec->size[recInfo_filename]);
1087
1088     /* update delete keys */
1089     xfree (rec->info[recInfo_delKeys]);
1090     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1091     {
1092         zebra_rec_keys_get_buf(zh->reg->keys,
1093                                &rec->info[recInfo_delKeys],
1094                                &rec->size[recInfo_delKeys]);
1095     }
1096     else
1097     {
1098         rec->info[recInfo_delKeys] = NULL;
1099         rec->size[recInfo_delKeys] = 0;
1100     }
1101     /* update sort keys */
1102     xfree (rec->info[recInfo_sortKeys]);
1103
1104     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1105                            &rec->info[recInfo_sortKeys],
1106                            &rec->size[recInfo_sortKeys]);
1107
1108     if (stream)
1109     {
1110         recordAttr->recordSize = end_offset - start_offset;
1111         zebraExplain_recordBytesIncrement(zh->reg->zei,
1112                                           recordAttr->recordSize);
1113     }
1114
1115     /* set run-number for this record */
1116     recordAttr->runNumber =
1117         zebraExplain_runNumberIncrement (zh->reg->zei, 0);
1118
1119     /* update store data */
1120     xfree (rec->info[recInfo_storeData]);
1121
1122     /* update store data */
1123     if (zh->store_data_buf)
1124     {
1125         rec->size[recInfo_storeData] = zh->store_data_size;
1126         rec->info[recInfo_storeData] = zh->store_data_buf;
1127         zh->store_data_buf = 0;
1128         recordAttr->recordSize = zh->store_data_size;
1129     }
1130     else if (zh->m_store_data)
1131     {
1132         off_t cur_offset = stream->tellf(stream);
1133
1134         rec->size[recInfo_storeData] = recordAttr->recordSize;
1135         rec->info[recInfo_storeData] = (char *)
1136             xmalloc (recordAttr->recordSize);
1137         stream->seekf(stream, start_offset);
1138         stream->readf(stream, rec->info[recInfo_storeData],
1139                       recordAttr->recordSize);
1140         stream->seekf(stream, cur_offset);
1141     }
1142     else
1143     {
1144         rec->info[recInfo_storeData] = NULL;
1145         rec->size[recInfo_storeData] = 0;
1146     }
1147     /* update database name */
1148     xfree (rec->info[recInfo_databaseName]);
1149     rec->info[recInfo_databaseName] =
1150         rec_strdup (zh->basenames[0], &rec->size[recInfo_databaseName]); 
1151
1152     /* update offset */
1153     recordAttr->recordOffset = start_offset;
1154     
1155     /* commit this record */
1156     rec_put (zh->reg->records, &rec);
1157     logRecord(zh);
1158     return ZEBRA_OK;
1159 }
1160
1161 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1162 {
1163     ZebraHandle zh = (ZebraHandle) handle;
1164     struct recExtractCtrl extractCtrl;
1165
1166     if (zebraExplain_curDatabase (zh->reg->zei,
1167                                   rec->info[recInfo_databaseName]))
1168     {
1169         abort();
1170         if (zebraExplain_newDatabase (zh->reg->zei,
1171                                       rec->info[recInfo_databaseName], 0))
1172             abort ();
1173     }
1174
1175     zebra_rec_keys_reset(zh->reg->keys);
1176     zebra_rec_keys_reset(zh->reg->sortKeys);
1177
1178     extractCtrl.init = extract_init;
1179     extractCtrl.tokenAdd = extract_token_add;
1180     extractCtrl.schemaAdd = extract_schema_add;
1181     extractCtrl.dh = zh->reg->dh;
1182
1183     init_extractCtrl(zh, &extractCtrl);
1184
1185     extractCtrl.flagShowRecords = 0;
1186     extractCtrl.match_criteria[0] = '\0';
1187     extractCtrl.staticrank = 0;
1188     extractCtrl.action = action_update;
1189
1190     extractCtrl.handle = handle;
1191     extractCtrl.first_record = 1;
1192     
1193     extract_set_store_data_prepare(&extractCtrl);
1194
1195     if (n)
1196         grs_extract_tree(&extractCtrl, n);
1197
1198     if (rec->size[recInfo_delKeys])
1199     {
1200         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1201         
1202         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1203
1204         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1205                                rec->size[recInfo_delKeys],
1206                                0);
1207 #if FLUSH2
1208         extract_flush_record_keys2(zh, rec->sysno, 
1209                                    zh->reg->keys, 0, delkeys, 0);
1210 #else
1211         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1212         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1213 #endif
1214         zebra_rec_keys_close(delkeys);
1215
1216         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1217                                rec->size[recInfo_sortKeys],
1218                                0);
1219
1220         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1221         zebra_rec_keys_close(sortkeys);
1222     }
1223     else
1224     {
1225 #if FLUSH2
1226         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1227 #else
1228         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1229 #endif
1230     }
1231     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1232     
1233     xfree (rec->info[recInfo_delKeys]);
1234     zebra_rec_keys_get_buf(zh->reg->keys,
1235                            &rec->info[recInfo_delKeys], 
1236                            &rec->size[recInfo_delKeys]);
1237
1238     xfree (rec->info[recInfo_sortKeys]);
1239     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1240                            &rec->info[recInfo_sortKeys],
1241                            &rec->size[recInfo_sortKeys]);
1242     return ZEBRA_OK;
1243 }
1244
1245 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1246                           zebra_rec_keys_t reckeys,
1247                           int level)
1248 {
1249     if (zebra_rec_keys_rewind(reckeys))
1250     {
1251         size_t slen;
1252         const char *str;
1253         struct it_key key;
1254         NMEM nmem = nmem_create();
1255
1256         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1257         {
1258             char keystr[200]; /* room for zints to print */
1259             char *dst_term = 0;
1260             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1261             int index_type, i;
1262             const char *string_index;
1263             
1264             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1265                                     0/* db */, &string_index);
1266             assert(index_type);
1267             zebra_term_untrans_iconv(zh, nmem, index_type,
1268                                      &dst_term, str);
1269             *keystr = '\0';
1270             for (i = 0; i<key.len; i++)
1271             {
1272                 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
1273             }
1274
1275             if (*str < CHR_BASE_CHAR)
1276             {
1277                 int i;
1278                 char dst_buf[200]; /* room for special chars */
1279
1280                 strcpy(dst_buf , "?");
1281
1282                 if (!strcmp(str, ""))
1283                     strcpy(dst_buf, "alwaysmatches");
1284                 if (!strcmp(str, FIRST_IN_FIELD_STR))
1285                     strcpy(dst_buf, "firstinfield");
1286                 else if (!strcmp(str, CHR_UNKNOWN))
1287                     strcpy(dst_buf, "unknown");
1288                 else if (!strcmp(str, CHR_SPACE))
1289                     strcpy(dst_buf, "space");
1290                 
1291                 for (i = 0; i<slen; i++)
1292                 {
1293                     sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1294                 }
1295                 yaz_log(level, "%s%c %s %s", keystr, index_type,
1296                         string_index, dst_buf);
1297                 
1298             }
1299             else
1300                 yaz_log(level, "%s%c %s \"%s\"", keystr, index_type,
1301                         string_index, dst_term);
1302
1303             nmem_reset(nmem);
1304         }
1305         nmem_destroy(nmem);
1306     }
1307 }
1308
1309 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1310                              zebra_rec_keys_t reckeys)
1311 {
1312     ZebraExplainInfo zei = zh->reg->zei;
1313     struct ord_stat {
1314         int no;
1315         int ord;
1316         struct ord_stat *next;
1317     };
1318
1319     if (zebra_rec_keys_rewind(reckeys))
1320     {
1321         struct ord_stat *ord_list = 0;
1322         struct ord_stat *p;
1323         size_t slen;
1324         const char *str;
1325         struct it_key key_in;
1326         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1327         {
1328             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1329
1330             for (p = ord_list; p ; p = p->next)
1331                 if (p->ord == ord)
1332                 {
1333                     p->no++;
1334                     break;
1335                 }
1336             if (!p)
1337             {
1338                 p = xmalloc(sizeof(*p));
1339                 p->no = 1;
1340                 p->ord = ord;
1341                 p->next = ord_list;
1342                 ord_list = p;
1343             }
1344         }
1345
1346         p = ord_list;
1347         while (p)
1348         {
1349             struct ord_stat *p1 = p;
1350
1351             if (is_insert)
1352                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1353             else
1354                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1355             p = p->next;
1356             xfree(p1);
1357         }
1358     }
1359 }
1360
1361 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1362                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1363                                 zebra_rec_keys_t del_keys, zint del_rank)
1364 {
1365     ZebraExplainInfo zei = zh->reg->zei;
1366     int normal = 0;
1367     int optimized = 0;
1368
1369     if (!zh->reg->key_block)
1370     {
1371         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1372         const char *key_tmp_dir = res_get_def (zh->res, "keyTmpDir", ".");
1373         int use_threads = atoi(res_get_def (zh->res, "threads", "1"));
1374         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1375     }
1376
1377     if (ins_keys)
1378     {
1379         extract_rec_keys_adjust(zh, 1, ins_keys);
1380         if (!del_keys)
1381             zebraExplain_recordCountIncrement (zei, 1);
1382         zebra_rec_keys_rewind(ins_keys);
1383     }
1384     if (del_keys)
1385     {
1386         extract_rec_keys_adjust(zh, 0, del_keys);
1387         if (!ins_keys)
1388             zebraExplain_recordCountIncrement (zei, -1);
1389         zebra_rec_keys_rewind(del_keys);
1390     }
1391
1392     while (1)
1393     {
1394         size_t del_slen;
1395         const char *del_str;
1396         struct it_key del_key_in;
1397         int del = 0;
1398
1399         size_t ins_slen;
1400         const char *ins_str;
1401         struct it_key ins_key_in;
1402         int ins = 0;
1403
1404         if (del_keys)
1405             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1406                                       &del_key_in);
1407         if (ins_keys)
1408             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1409                                       &ins_key_in);
1410
1411         if (del && ins && ins_rank == del_rank
1412             && !key_compare(&del_key_in, &ins_key_in) 
1413             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1414         {
1415             optimized++;
1416             continue;
1417         }
1418         if (!del && !ins)
1419             break;
1420         
1421         normal++;
1422         if (del)
1423             key_block_write(zh->reg->key_block, sysno, 
1424                             &del_key_in, 0, del_str, del_slen,
1425                             del_rank, zh->m_staticrank);
1426         if (ins)
1427             key_block_write(zh->reg->key_block, sysno, 
1428                             &ins_key_in, 1, ins_str, ins_slen,
1429                             ins_rank, zh->m_staticrank);
1430     }
1431     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1432 }
1433
1434 void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd,
1435                                zebra_rec_keys_t reckeys,
1436                                zint staticrank)
1437 {
1438     ZebraExplainInfo zei = zh->reg->zei;
1439
1440     extract_rec_keys_adjust(zh, cmd, reckeys);
1441
1442     if (log_level_details)
1443     {
1444         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1445                 sysno, cmd ? "insert" : "delete");
1446         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1447     }
1448
1449     if (!zh->reg->key_block)
1450     {
1451         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1452         const char *key_tmp_dir = res_get_def (zh->res, "keyTmpDir", ".");
1453         int use_threads = atoi(res_get_def (zh->res, "threads", "1"));
1454         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1455     }
1456     zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1);
1457
1458 #if 0
1459     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1460     print_rec_keys(zh, reckeys);
1461 #endif
1462     if (zebra_rec_keys_rewind(reckeys))
1463     {
1464         size_t slen;
1465         const char *str;
1466         struct it_key key_in;
1467         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1468         {
1469             key_block_write(zh->reg->key_block, sysno, 
1470                             &key_in, cmd, str, slen,
1471                             staticrank, zh->m_staticrank);
1472         }
1473     }
1474 }
1475
1476 ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
1477                                      zebra_rec_keys_t reckeys,
1478                                      zebra_snippets *snippets)
1479 {
1480     NMEM nmem = nmem_create();
1481     if (zebra_rec_keys_rewind(reckeys)) 
1482     {
1483         const char *str;
1484         size_t slen;
1485         struct it_key key;
1486         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1487         {
1488             char *dst_term = 0;
1489             int ord;
1490             zint seqno;
1491             int index_type;
1492
1493             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1494             seqno = key.mem[key.len-1];
1495             ord = CAST_ZINT_TO_INT(key.mem[0]);
1496             
1497             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1498                                     0/* db */, 0 /* string_index */);
1499             assert(index_type);
1500             zebra_term_untrans_iconv(zh, nmem, index_type,
1501                                      &dst_term, str);
1502             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1503             nmem_reset(nmem);
1504         }
1505     }
1506     nmem_destroy(nmem);
1507     return ZEBRA_OK;
1508 }
1509
1510 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1511 {
1512     yaz_log(YLOG_LOG, "print_rec_keys");
1513     if (zebra_rec_keys_rewind(reckeys))
1514     {
1515         const char *str;
1516         size_t slen;
1517         struct it_key key;
1518         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1519         {
1520             char dst_buf[IT_MAX_WORD];
1521             zint seqno;
1522             int index_type;
1523             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1524             const char *db = 0;
1525             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1526
1527             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1528             
1529             seqno = key.mem[key.len-1];
1530             
1531             zebra_term_untrans(zh, index_type, dst_buf, str);
1532             
1533             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1534                     " term=%s", ord, seqno, dst_buf); 
1535         }
1536     }
1537 }
1538
1539 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1540                                      const char *str, int length)
1541 {
1542     struct it_key key;
1543     ZebraHandle zh = p->extractCtrl->handle;
1544     ZebraExplainInfo zei = zh->reg->zei;
1545     int ch, i;
1546
1547     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1548     if (ch < 0)
1549         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1550
1551     i = 0;
1552     key.mem[i++] = ch;
1553     key.mem[i++] = p->record_id;
1554     key.mem[i++] = p->section_id;
1555
1556     if (zh->m_segment_indexing)
1557         key.mem[i++] = p->segment;
1558     key.mem[i++] = p->seqno;
1559     key.len = i;
1560
1561     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1562 }
1563
1564 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1565 {
1566     struct it_key key;
1567     ZebraHandle zh = p->extractCtrl->handle;
1568     ZebraExplainInfo zei = zh->reg->zei;
1569     int ch;
1570     zinfo_index_category_t cat = zinfo_index_category_sort;
1571
1572     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1573     if (ch < 0)
1574         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1575     key.len = 2;
1576     key.mem[0] = ch;
1577     key.mem[1] = p->record_id;
1578
1579     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1580 }
1581
1582 static void extract_add_staticrank_string(RecWord *p,
1583                                           const char *str, int length)
1584 {
1585     char valz[40];
1586     struct recExtractCtrl *ctrl = p->extractCtrl;
1587
1588     if (length > sizeof(valz)-1)
1589         length = sizeof(valz)-1;
1590
1591     memcpy(valz, str, length);
1592     valz[length] = '\0';
1593     ctrl->staticrank = atozint(valz);
1594 }
1595
1596 static void extract_add_string(RecWord *p, const char *string, int length)
1597 {
1598     ZebraHandle zh = p->extractCtrl->handle;
1599     assert (length > 0);
1600
1601     if (!p->index_name)
1602         return;
1603
1604     if (zebra_maps_is_index(zh->reg->zebra_maps, p->index_type))
1605     {
1606         extract_add_index_string(p, zinfo_index_category_index,
1607                                  string, length);
1608         if (zebra_maps_is_alwaysmatches(zh->reg->zebra_maps, p->index_type))
1609         {
1610             RecWord word;
1611             memcpy(&word, p, sizeof(word));
1612
1613             word.seqno = 1;
1614             extract_add_index_string(
1615                 &word, zinfo_index_category_alwaysmatches, "", 0);
1616         }
1617     }
1618     else if (zebra_maps_is_sort(zh->reg->zebra_maps, p->index_type))
1619     {
1620         extract_add_sort_string(p, string, length);
1621     }
1622     else if (zebra_maps_is_staticrank(zh->reg->zebra_maps, p->index_type))
1623     {
1624         extract_add_staticrank_string(p, string, length);
1625     }
1626 }
1627
1628 static void extract_add_incomplete_field(RecWord *p)
1629 {
1630     ZebraHandle zh = p->extractCtrl->handle;
1631     const char *b = p->term_buf;
1632     int remain = p->term_len;
1633     int first = 1;
1634     const char **map = 0;
1635     
1636     if (remain > 0)
1637         map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
1638
1639     while (map)
1640     {
1641         char buf[IT_MAX_WORD+1];
1642         int i, remain;
1643
1644         /* Skip spaces */
1645         while (map && *map && **map == *CHR_SPACE)
1646         {
1647             remain = p->term_len - (b - p->term_buf);
1648             if (remain > 0)
1649                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b,
1650                                        remain, 0);
1651             else
1652                 map = 0;
1653         }
1654         if (!map)
1655             break;
1656         i = 0;
1657         while (map && *map && **map != *CHR_SPACE)
1658         {
1659             const char *cp = *map;
1660
1661             while (i < IT_MAX_WORD && *cp)
1662                 buf[i++] = *(cp++);
1663             remain = p->term_len - (b - p->term_buf);
1664             if (remain > 0)
1665                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0);
1666             else
1667                 map = 0;
1668         }
1669         if (!i)
1670             return;
1671
1672         if (first)
1673         {   
1674             first = 0;
1675             if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type))
1676             {
1677                 /* first in field marker */
1678                 extract_add_string(p, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1679                 p->seqno++;
1680             }
1681         }
1682         extract_add_string (p, buf, i);
1683         p->seqno++;
1684     }
1685 }
1686
1687 static void extract_add_complete_field (RecWord *p)
1688 {
1689     ZebraHandle zh = p->extractCtrl->handle;
1690     const char *b = p->term_buf;
1691     char buf[IT_MAX_WORD+1];
1692     const char **map = 0;
1693     int i = 0, remain = p->term_len;
1694
1695     if (remain > 0)
1696         map = zebra_maps_input (zh->reg->zebra_maps, p->index_type, &b, remain, 1);
1697
1698     while (remain > 0 && i < IT_MAX_WORD)
1699     {
1700         while (map && *map && **map == *CHR_SPACE)
1701         {
1702             remain = p->term_len - (b - p->term_buf);
1703
1704             if (remain > 0)
1705             {
1706                 int first = i ? 0 : 1;  /* first position */
1707                 map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, first);
1708             }
1709             else
1710                 map = 0;
1711         }
1712         if (!map)
1713             break;
1714
1715         if (i && i < IT_MAX_WORD)
1716             buf[i++] = *CHR_SPACE;
1717         while (map && *map && **map != *CHR_SPACE)
1718         {
1719             const char *cp = *map;
1720
1721             if (**map == *CHR_CUT)
1722             {
1723                 i = 0;
1724             }
1725             else
1726             {
1727                 if (i >= IT_MAX_WORD)
1728                     break;
1729                 while (i < IT_MAX_WORD && *cp)
1730                     buf[i++] = *(cp++);
1731             }
1732             remain = p->term_len  - (b - p->term_buf);
1733             if (remain > 0)
1734             {
1735                 map = zebra_maps_input (zh->reg->zebra_maps, p->index_type, &b,
1736                                         remain, 0);
1737             }
1738             else
1739                 map = 0;
1740         }
1741     }
1742     if (!i)
1743         return;
1744     extract_add_string (p, buf, i);
1745 }
1746
1747 static void extract_token_add(RecWord *p)
1748 {
1749     ZebraHandle zh = p->extractCtrl->handle;
1750     WRBUF wrbuf;
1751
1752     if (log_level_details)
1753     {
1754         yaz_log(log_level_details, "extract_token_add "
1755                 "type=%c index=%s seqno=" ZINT_FORMAT " s=%.*s",
1756                 p->index_type, p->index_name, 
1757                 p->seqno, p->term_len, p->term_buf);
1758     }
1759     if ((wrbuf = zebra_replace(zh->reg->zebra_maps, p->index_type, 0,
1760                                p->term_buf, p->term_len)))
1761     {
1762         p->term_buf = wrbuf_buf(wrbuf);
1763         p->term_len = wrbuf_len(wrbuf);
1764     }
1765     if (zebra_maps_is_complete (zh->reg->zebra_maps, p->index_type))
1766         extract_add_complete_field (p);
1767     else
1768         extract_add_incomplete_field(p);
1769 }
1770
1771 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1772                                       void *buf, size_t sz)
1773 {
1774     ZebraHandle zh = (ZebraHandle) p->handle;
1775
1776     xfree(zh->store_data_buf);
1777     zh->store_data_buf = 0;
1778     zh->store_data_size = 0;
1779     if (buf && sz)
1780     {
1781         zh->store_data_buf = xmalloc(sz);
1782         zh->store_data_size = sz;
1783         memcpy(zh->store_data_buf, buf, sz);
1784     }
1785 }
1786
1787 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1788 {
1789     ZebraHandle zh = (ZebraHandle) p->handle;
1790     xfree(zh->store_data_buf);
1791     zh->store_data_buf = 0;
1792     zh->store_data_size = 0;
1793     p->setStoreData = extract_set_store_data_cb;
1794 }
1795
1796 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1797 {
1798     ZebraHandle zh = (ZebraHandle) p->handle;
1799     zebraExplain_addSchema (zh->reg->zei, oid);
1800 }
1801
1802 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1803                              int cmd, zebra_rec_keys_t reckeys)
1804 {
1805 #if 0
1806     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1807             cmd, sysno);
1808     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1809 #endif
1810
1811     if (zebra_rec_keys_rewind(reckeys))
1812     {
1813         zebra_sort_index_t si = zh->reg->sort_index;
1814         size_t slen;
1815         const char *str;
1816         struct it_key key_in;
1817
1818         zebra_sort_sysno(si, sysno);
1819
1820         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1821         {
1822             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1823             
1824             zebra_sort_type(si, ord);
1825             if (cmd == 1)
1826                 zebra_sort_add(si, str, slen);
1827             else
1828                 zebra_sort_delete(si);
1829         }
1830     }
1831 }
1832
1833 /*
1834  * Local variables:
1835  * c-basic-offset: 4
1836  * indent-tabs-mode: nil
1837  * End:
1838  * vim: shiftwidth=4 tabstop=8 expandtab
1839  */
1840