c70e6a1ee26b25b37476e21f23e0f53442ca11ba
[idzebra-moved-to-github.git] / index / extract.c
1 /* $Id: extract.c,v 1.267 2007-10-31 16:56:14 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
/** \file
    \brief indexes records and extracts tokens for indexing and sorting
*/
26
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
45 static int log_level_extract = 0;
46 static int log_level_details = 0;
47 static int log_level_initialized = 0;
48
/* 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52
53 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
54                                 zebra_rec_keys_t ins_keys,
55                                 zint ins_rank,
56                                 zebra_rec_keys_t del_keys,
57                                 zint del_rank);
58
59 static void zebra_init_log_level(void)
60 {
61     if (!log_level_initialized)
62     {
63         log_level_initialized = 1;
64
65         log_level_extract = yaz_log_module_level("extract");
66         log_level_details = yaz_log_module_level("indexdetails");
67     }
68 }
69
70 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
71                                     int cmd, zebra_rec_keys_t skp);
72 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
73 static void extract_token_add(RecWord *p);
74 static void extract_token_add2(RecWord *p);
75
76 static void check_log_limit(ZebraHandle zh)
77 {
78     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
79     {
80         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
81                 zh->m_file_verbose_limit);
82     }
83 }
84
85 static void logRecord(ZebraHandle zh)
86 {
87     check_log_limit(zh);
88     ++zh->records_processed;
89     if (!(zh->records_processed % 1000))
90     {
91         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
92                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
93                 zh->records_processed, zh->records_inserted, 
94                 zh->records_updated, zh->records_deleted);
95     }
96 }
97
98 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
99 {
100     ctrl->flagShowRecords = !zh->m_flag_rw;
101 }
102
103
104 static void extract_add_index_string(RecWord *p, 
105                                       zinfo_index_category_t cat,
106                                       const char *str, int length);
107
108 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
109
110 static void extract_init(struct recExtractCtrl *p, RecWord *w)
111 {
112     w->seqno = 1;
113     w->index_name = "any";
114     w->index_type = "w";
115     w->extractCtrl = p;
116     w->record_id = 0;
117     w->section_id = 0;
118     w->segment = 0;
119 }
120
121 struct snip_rec_info {
122     ZebraHandle zh;
123     zebra_snippets *snippets;
124 };
125
126
127 static void snippet_add_complete_field(RecWord *p, int ord,
128                                        zebra_map_t zm)
129 {
130     struct snip_rec_info *h = p->extractCtrl->handle;
131
132     const char *b = p->term_buf;
133     char buf[IT_MAX_WORD+1];
134     const char **map = 0;
135     int i = 0, remain = p->term_len;
136     const char *start = b;
137     const char *last = 0;
138
139     if (remain > 0)
140         map = zebra_maps_input(zm, &b, remain, 1);
141
142     while (remain > 0 && i < IT_MAX_WORD)
143     {
144         while (map && *map && **map == *CHR_SPACE)
145         {
146             remain = p->term_len - (b - p->term_buf);
147
148             if (i == 0)
149                 start = b;  /* set to first non-ws area */
150             if (remain > 0)
151             {
152                 int first = i ? 0 : 1;  /* first position */
153
154                 map = zebra_maps_input(zm, &b, remain, first);
155             }
156             else
157                 map = 0;
158         }
159         if (!map)
160             break;
161
162         if (i && i < IT_MAX_WORD)
163             buf[i++] = *CHR_SPACE;
164         while (map && *map && **map != *CHR_SPACE)
165         {
166             const char *cp = *map;
167
168             if (**map == *CHR_CUT)
169             {
170                 i = 0;
171             }
172             else
173             {
174                 if (i >= IT_MAX_WORD)
175                     break;
176                 while (i < IT_MAX_WORD && *cp)
177                     buf[i++] = *(cp++);
178             }
179             last = b;
180             remain = p->term_len  - (b - p->term_buf);
181             if (remain > 0)
182             {
183                 map = zebra_maps_input(zm, &b, remain, 0);
184             }
185             else
186                 map = 0;
187         }
188     }
189     if (!i)
190         return;
191     if (last && start != last)
192         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
193                                start, last - start);
194 }
195
196 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
197 {
198     struct snip_rec_info *h = p->extractCtrl->handle;
199     const char *b = p->term_buf;
200     int remain = p->term_len;
201     int first = 1;
202     const char **map = 0;
203     const char *start = b;
204     const char *last = b;
205
206     if (remain > 0)
207         map = zebra_maps_input(zm, &b, remain, 0);
208
209     while (map)
210     {
211         char buf[IT_MAX_WORD+1];
212         int i, remain;
213
214         /* Skip spaces */
215         while (map && *map && **map == *CHR_SPACE)
216         {
217             remain = p->term_len - (b - p->term_buf);
218             last = b;
219             if (remain > 0)
220                 map = zebra_maps_input(zm, &b, remain, 0);
221             else
222                 map = 0;
223         }
224         if (!map)
225             break;
226         if (start != last)
227         {
228             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
229                                    start, last - start);
230
231         }
232         start = last;
233
234         i = 0;
235         while (map && *map && **map != *CHR_SPACE)
236         {
237             const char *cp = *map;
238
239             while (i < IT_MAX_WORD && *cp)
240                 buf[i++] = *(cp++);
241             remain = p->term_len - (b - p->term_buf);
242             last = b;
243             if (remain > 0)
244                 map = zebra_maps_input(zm, &b, remain, 0);
245             else
246                 map = 0;
247         }
248         if (!i)
249             return;
250
251         if (first)
252         {   
253             first = 0;
254             if (zebra_maps_is_first_in_field(zm))
255             {
256                 /* first in field marker */
257                 p->seqno++;
258             }
259         }
260         if (start != last)
261             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
262                                    start, last - start);
263         start = last;
264         p->seqno++;
265     }
266
267 }
268
269 static void snippet_token_add(RecWord *p)
270 {
271     struct snip_rec_info *h = p->extractCtrl->handle;
272     ZebraHandle zh = h->zh;
273     zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, p->index_type);
274
275     if (zm && zebra_maps_is_index(zm))
276     {
277         ZebraExplainInfo zei = zh->reg->zei;
278         int ch = zebraExplain_lookup_attr_str(
279             zei, zinfo_index_category_index, p->index_type, p->index_name);
280
281         if (zebra_maps_is_complete(zm))
282             snippet_add_complete_field(p, ch, zm);
283         else
284             snippet_add_incomplete_field(p, ch, zm);
285     }
286 }
287
288 static void snippet_schema_add(
289     struct recExtractCtrl *p, Odr_oid *oid)
290 {
291
292 }
293
294 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
295                      struct ZebraRecStream *stream,
296                      RecType rt, void *recTypeClientData)
297 {
298     struct recExtractCtrl extractCtrl;
299     struct snip_rec_info info;
300     int r;
301
302     extractCtrl.stream = stream;
303     extractCtrl.first_record = 1;
304     extractCtrl.init = extract_init;
305     extractCtrl.tokenAdd = snippet_token_add;
306     extractCtrl.schemaAdd = snippet_schema_add;
307     assert(zh->reg);
308     assert(zh->reg->dh);
309
310     extractCtrl.dh = zh->reg->dh;
311     
312     info.zh = zh;
313     info.snippets = sn;
314     extractCtrl.handle = &info;
315     extractCtrl.match_criteria[0] = '\0';
316     extractCtrl.staticrank = 0;
317     extractCtrl.action = action_insert;
318     
319     init_extractCtrl(zh, &extractCtrl);
320
321     extractCtrl.setStoreData = 0;
322
323     r = (*rt->extract)(recTypeClientData, &extractCtrl);
324
325 }
326
327 static void searchRecordKey(ZebraHandle zh,
328                             zebra_rec_keys_t reckeys,
329                             const char *index_name,
330                             const char **ws, int ws_length)
331 {
332     int i;
333     int ch = -1;
334     zinfo_index_category_t cat = zinfo_index_category_index;
335
336     for (i = 0; i<ws_length; i++)
337         ws[i] = NULL;
338
339     if (ch < 0)
340         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
341     if (ch < 0)
342         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
343     if (ch < 0)
344         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
345
346     if (ch < 0)
347         return ;
348
349     if (zebra_rec_keys_rewind(reckeys))
350     {
351         zint startSeq = -1;
352         const char *str;
353         size_t slen;
354         struct it_key key;
355         zint seqno;
356         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
357         {
358             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
359
360             seqno = key.mem[key.len-1];
361             
362             if (key.mem[0] == ch)
363             {
364                 zint woff;
365                 
366                 if (startSeq == -1)
367                     startSeq = seqno;
368                 woff = seqno - startSeq;
369                 if (woff >= 0 && woff < ws_length)
370                     ws[woff] = str;
371             }
372         }
373     }
374 }
375
376 #define FILE_MATCH_BLANK "\t "
377
378 static char *get_match_from_spec(ZebraHandle zh,
379                           zebra_rec_keys_t reckeys,
380                           const char *fname, const char *spec)
381 {
382     static char dstBuf[2048];      /* static here ??? */
383     char *dst = dstBuf;
384     const char *s = spec;
385
386     while (1)
387     {
388         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
389             ;
390         if (!*s)
391             break;
392         if (*s == '(')
393         {
394             const char *ws[32];
395             char attset_str[64], attname_str[64];
396             int i;
397             int first = 1;
398             
399             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
400                 ;
401             for (i = 0; *s && *s != ',' && *s != ')' && 
402                      !strchr(FILE_MATCH_BLANK, *s); s++)
403                 if (i+1 < sizeof(attset_str))
404                     attset_str[i++] = *s;
405             attset_str[i] = '\0';
406             
407             for (; strchr(FILE_MATCH_BLANK, *s); s++)
408                 ;
409             if (*s != ',')
410                 strcpy(attname_str, attset_str);
411             else
412             {
413                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
414                     ;
415                 for (i = 0; *s && *s != ')' && 
416                          !strchr(FILE_MATCH_BLANK, *s); s++)
417                     if (i+1 < sizeof(attname_str))
418                         attname_str[i++] = *s;
419                 attname_str[i] = '\0';
420             }
421
422             searchRecordKey(zh, reckeys, attname_str, ws, 32);
423
424             if (*s != ')')
425             {
426                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
427                       spec, zh->m_group ? zh->m_group : "none");
428                 return NULL;
429             }
430             s++;
431
432             for (i = 0; i<32; i++)
433                 if (ws[i])
434                 {
435                     if (first)
436                     {
437                         *dst++ = ' ';
438                         first = 0;
439                     }
440                     strcpy(dst, ws[i]);
441                     dst += strlen(ws[i]);
442                 }
443             if (first)
444             {
445                 yaz_log(YLOG_WARN, "Record didn't contain match"
446                       " fields in (%s,%s)", attset_str, attname_str);
447                 return NULL;
448             }
449         }
450         else if (*s == '$')
451         {
452             int spec_len;
453             char special[64];
454             const char *spec_src = NULL;
455             const char *s1 = ++s;
456             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
457                 s1++;
458
459             spec_len = s1 - s;
460             if (spec_len > sizeof(special)-1)
461                 spec_len = sizeof(special)-1;
462             memcpy(special, s, spec_len);
463             special[spec_len] = '\0';
464             s = s1;
465
466             if (!strcmp(special, "group"))
467                 spec_src = zh->m_group;
468             else if (!strcmp(special, "database"))
469                 spec_src = zh->basenames[0];
470             else if (!strcmp(special, "filename")) {
471                 spec_src = fname;
472             }
473             else if (!strcmp(special, "type"))
474                 spec_src = zh->m_record_type;
475             else 
476                 spec_src = NULL;
477             if (spec_src)
478             {
479                 strcpy(dst, spec_src);
480                 dst += strlen(spec_src);
481             }
482         }
483         else if (*s == '\"' || *s == '\'')
484         {
485             int stopMarker = *s++;
486             char tmpString[64];
487             int i = 0;
488
489             while (*s && *s != stopMarker)
490             {
491                 if (i+1 < sizeof(tmpString))
492                     tmpString[i++] = *s++;
493             }
494             if (*s)
495                 s++;
496             tmpString[i] = '\0';
497             strcpy(dst, tmpString);
498             dst += strlen(tmpString);
499         }
500         else
501         {
502             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
503                   spec, zh->m_group ? zh->m_group : "none");
504             return NULL;
505         }
506         *dst++ = 1;
507     }
508     if (dst == dstBuf)
509     {
510         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
511               fname, zh->m_group ? zh->m_group : "none");
512         return NULL;
513     }
514     *dst = '\0';
515     return dstBuf;
516 }
517
/* Identifies a record for logging purposes. */
struct recordLogInfo {
    const char *fname;            /* name of the file holding the record */
    int recordOffset;             /* offset of the record */
    struct recordGroup *rGroup;   /* record group of the record */
};
523
524 static void all_matches_add(struct recExtractCtrl *ctrl)
525 {
526     RecWord word;
527     extract_init(ctrl, &word);
528     word.index_name = "_ALLRECORDS";
529     word.index_type = "w";
530     word.seqno = 1;
531     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
532                               "", 0);
533 }
534
535 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
536                                        struct ZebraRecStream *stream,
537                                        enum zebra_recctrl_action_t action,
538                                        int test_mode, 
539                                        const char *recordType,
540                                        zint *sysno,
541                                        const char *match_criteria,
542                                        const char *fname,
543                                        RecType recType,
544                                        void *recTypeClientData);
545
546
547 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
548                              int deleteFlag)
549 {
550     ZEBRA_RES r = ZEBRA_OK;
551     int i, fd;
552     char gprefix[128];
553     char ext[128];
554     char ext_res[128];
555     struct file_read_info *fi = 0;
556     const char *original_record_type = 0;
557     RecType recType;
558     void *recTypeClientData;
559     struct ZebraRecStream stream, *streamp;
560
561     zebra_init_log_level();
562
563     if (!zh->m_group || !*zh->m_group)
564         *gprefix = '\0';
565     else
566         sprintf(gprefix, "%s.", zh->m_group);
567     
568     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
569
570     /* determine file extension */
571     *ext = '\0';
572     for (i = strlen(fname); --i >= 0; )
573         if (fname[i] == '/')
574             break;
575         else if (fname[i] == '.')
576         {
577             strcpy(ext, fname+i+1);
578             break;
579         }
580     /* determine file type - depending on extension */
581     original_record_type = zh->m_record_type;
582     if (!zh->m_record_type)
583     {
584         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
585         zh->m_record_type = res_get(zh->res, ext_res);
586     }
587     if (!zh->m_record_type)
588     {
589         check_log_limit(zh);
590         if (zh->records_processed + zh->records_skipped
591             < zh->m_file_verbose_limit)
592             yaz_log(YLOG_LOG, "? %s", fname);
593         zh->records_skipped++;
594         return 0;
595     }
596     /* determine match criteria */
597     if (!zh->m_record_id)
598     {
599         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
600         zh->m_record_id = res_get(zh->res, ext_res);
601     }
602
603     if (!(recType =
604           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
605                           &recTypeClientData)))
606     {
607         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
608         return ZEBRA_FAIL;
609     }
610
611     switch(recType->version)
612     {
613     case 0:
614         break;
615     default:
616         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
617     }
618     if (sysno && deleteFlag)
619     {
620         streamp = 0;
621         fi = 0;
622     }
623     else
624     {
625         char full_rep[1024];
626
627         if (zh->path_reg && !yaz_is_abspath(fname))
628         {
629             strcpy(full_rep, zh->path_reg);
630             strcat(full_rep, "/");
631             strcat(full_rep, fname);
632         }
633         else
634             strcpy(full_rep, fname);
635         
636         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
637         {
638             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
639             zh->m_record_type = original_record_type;
640             return ZEBRA_FAIL;
641         }
642         streamp = &stream;
643         zebra_create_stream_fd(streamp, fd, 0);
644     }
645     r = zebra_extract_records_stream(zh, streamp,
646                                      deleteFlag ? 
647                                      action_delete : action_update,
648                                      0, /* tst_mode */
649                                      zh->m_record_type,
650                                      sysno,
651                                      0, /*match_criteria */
652                                      fname,
653                                      recType, recTypeClientData);
654     if (streamp)
655         stream.destroy(streamp);
656     zh->m_record_type = original_record_type;
657     return r;
658 }
659
/*
  If sysno is provided, then it is used to identify the record.
  If not, and match_criteria is provided, then sysno is guessed.
  If not, and a record is provided, then sysno is taken from there.

 */
666
667 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
668                                       const char *buf, size_t buf_size,
669                                       enum zebra_recctrl_action_t action,
670                                       int test_mode, 
671                                       const char *recordType,
672                                       zint *sysno,
673                                       const char *match_criteria,
674                                       const char *fname)
675 {
676     struct ZebraRecStream stream;
677     ZEBRA_RES res;
678     void *clientData;
679     RecType recType = 0;
680
681     if (recordType && *recordType)
682     {
683         yaz_log(log_level_extract,
684                 "Record type explicitly specified: %s", recordType);
685         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
686                                   &clientData);
687     } 
688     else
689     {
690         if (!(zh->m_record_type))
691         {
692             yaz_log(YLOG_WARN, "No such record type defined");
693             return ZEBRA_FAIL;
694         }
695         yaz_log(log_level_extract, "Get record type from rgroup: %s",
696                 zh->m_record_type);
697         recType = recType_byName(zh->reg->recTypes, zh->res,
698                                   zh->m_record_type, &clientData);
699         recordType = zh->m_record_type;
700     }
701     
702     if (!recType)
703     {
704         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
705         return ZEBRA_FAIL;
706     }
707
708     zebra_create_stream_mem(&stream, buf, buf_size);
709
710     res = zebra_extract_records_stream(zh, &stream,
711                                        action,
712                                        test_mode, 
713                                        recordType,
714                                        sysno,
715                                        match_criteria,
716                                        fname,
717                                        recType, clientData);
718     stream.destroy(&stream);
719     return res;
720 }
721
722 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
723                                        struct ZebraRecStream *stream,
724                                        enum zebra_recctrl_action_t action,
725                                        int test_mode, 
726                                        const char *recordType,
727                                        zint *sysno,
728                                        const char *match_criteria,
729                                        const char *fname,
730                                        RecType recType,
731                                        void *recTypeClientData)
732 {
733     ZEBRA_RES res = ZEBRA_OK;
734     while (1)
735     {
736         int more = 0;
737         res = zebra_extract_record_stream(zh, stream,
738                                           action,
739                                           test_mode, 
740                                           recordType,
741                                           sysno,
742                                           match_criteria,
743                                           fname,
744                                           recType, recTypeClientData, &more);
745         if (!more)
746         {
747             res = ZEBRA_OK;
748             break;
749         }
750         if (res != ZEBRA_OK)
751             break;
752         if (sysno)
753             break;
754     }
755     return res;
756 }
757
758
759 static WRBUF wrbuf_hex_str(const char *cstr)
760 {
761     size_t i;
762     WRBUF w = wrbuf_alloc();
763     for (i = 0; cstr[i]; i++)
764     {
765         if (cstr[i] < ' ' || cstr[i] > 126)
766             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
767         else
768             wrbuf_putc(w, cstr[i]);
769     }
770     return w;
771 }
772
773 ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
774                                       struct ZebraRecStream *stream,
775                                       enum zebra_recctrl_action_t action,
776                                       int test_mode, 
777                                       const char *recordType,
778                                       zint *sysno,
779                                       const char *match_criteria,
780                                       const char *fname,
781                                       RecType recType,
782                                       void *recTypeClientData,
783                                       int *more)
784
785 {
786     zint sysno0 = 0;
787     RecordAttr *recordAttr;
788     struct recExtractCtrl extractCtrl;
789     int r;
790     const char *matchStr = 0;
791     Record rec;
792     off_t start_offset = 0, end_offset = 0;
793     const char *pr_fname = fname;  /* filename to print .. */
794     int show_progress = zh->records_processed + zh->records_skipped 
795         < zh->m_file_verbose_limit ? 1:0;
796
797     zebra_init_log_level();
798
799     if (!pr_fname)
800         pr_fname = "<no file>";  /* make it printable if file is omitted */
801
802     zebra_rec_keys_reset(zh->reg->keys);
803     zebra_rec_keys_reset(zh->reg->sortKeys);
804
805     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
806     {
807         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
808                                       zh->m_explain_database))
809             return ZEBRA_FAIL;
810     }
811
812     if (stream)
813     {
814         off_t null_offset = 0;
815         extractCtrl.stream = stream;
816
817         start_offset = stream->tellf(stream);
818
819         extractCtrl.first_record = start_offset ? 0 : 1;
820         
821         stream->endf(stream, &null_offset);;
822
823         extractCtrl.init = extract_init;
824         if (zh->reg->index_types)
825         {
826             extractCtrl.tokenAdd = extract_token_add2;
827         }
828         else
829         {
830             extractCtrl.tokenAdd = extract_token_add;
831         }
832         extractCtrl.schemaAdd = extract_schema_add;
833         extractCtrl.dh = zh->reg->dh;
834         extractCtrl.handle = zh;
835         extractCtrl.match_criteria[0] = '\0';
836         extractCtrl.staticrank = 0;
837         extractCtrl.action = action;
838
839         init_extractCtrl(zh, &extractCtrl);
840
841         extract_set_store_data_prepare(&extractCtrl);
842         
843         r = (*recType->extract)(recTypeClientData, &extractCtrl);
844
845         if (action == action_update)
846         {
847             action = extractCtrl.action;
848         }
849         
850         switch (r)
851         {
852         case RECCTRL_EXTRACT_EOF:
853             return ZEBRA_FAIL;
854         case RECCTRL_EXTRACT_ERROR_GENERIC:
855             /* error occured during extraction ... */
856             yaz_log(YLOG_WARN, "extract error: generic");
857             return ZEBRA_FAIL;
858         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
859             /* error occured during extraction ... */
860             yaz_log(YLOG_WARN, "extract error: no such filter");
861             return ZEBRA_FAIL;
862         case RECCTRL_EXTRACT_SKIP:
863             if (show_progress)
864                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
865                          recordType, pr_fname, (zint) start_offset);
866             *more = 1;
867             
868             end_offset = stream->endf(stream, 0);
869             if (end_offset)
870                 stream->seekf(stream, end_offset);
871
872             return ZEBRA_OK;
873         case RECCTRL_EXTRACT_OK:
874             break;
875         default:
876             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
877             return ZEBRA_FAIL;
878         }
879         end_offset = stream->endf(stream, 0);
880         if (end_offset)
881             stream->seekf(stream, end_offset);
882         else
883             end_offset = stream->tellf(stream);
884
885         all_matches_add(&extractCtrl);
886         
887         if (extractCtrl.match_criteria[0])
888             match_criteria = extractCtrl.match_criteria;
889     }
890
891     *more = 1;
892     if (!sysno)
893     {
894         sysno = &sysno0;
895
896         if (match_criteria && *match_criteria) {
897             matchStr = match_criteria;
898         } else {
899             if (zh->m_record_id && *zh->m_record_id) {
900                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
901                                                zh->m_record_id);
902                 if (!matchStr)
903                 {
904                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
905                              pr_fname, (zint) start_offset);
906                     return ZEBRA_FAIL;
907                 }
908             }
909         }
910         if (matchStr) 
911         {
912             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
913             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
914                                           matchStr);
915
916             
917             if (log_level_extract)
918             {
919                 WRBUF w = wrbuf_hex_str(matchStr);
920                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
921                 wrbuf_destroy(w);
922             }
923             if (rinfo)
924             {
925                 assert(*rinfo == sizeof(*sysno));
926                 memcpy(sysno, rinfo+1, sizeof(*sysno));
927             }
928        }
929     }
930     if (zebra_rec_keys_empty(zh->reg->keys))
931     {
932         /* the extraction process returned no information - the record
933            is probably empty - unless flagShowRecords is in use */
934         if (test_mode)
935             return ZEBRA_OK;
936     }
937
938     if (! *sysno)
939     {
940         /* new record */
941         if (action == action_delete)
942         {
943             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
944                          pr_fname, (zint) start_offset);
945             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
946             return ZEBRA_FAIL;
947         }
948         else if (action == action_replace)
949         {
950             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
951                          pr_fname, (zint) start_offset);
952             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
953             return ZEBRA_FAIL;
954         }
955         if (show_progress)
956             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
957                      (zint) start_offset);
958         rec = rec_new(zh->reg->records);
959
960         *sysno = rec->sysno;
961
962         recordAttr = rec_init_attr(zh->reg->zei, rec);
963         if (extractCtrl.staticrank < 0)
964         {
965             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
966             extractCtrl.staticrank = 0;
967         }
968
969         if (matchStr)
970         {
971             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
972             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
973                             sizeof(*sysno), sysno);
974         }
975
976         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
977 #if FLUSH2
978         extract_flush_record_keys2(zh, *sysno,
979                                    zh->reg->keys, extractCtrl.staticrank,
980                                    0, recordAttr->staticrank);
981 #else
982         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
983                                   extractCtrl.staticrank);
984 #endif
985         recordAttr->staticrank = extractCtrl.staticrank;
986         zh->records_inserted++;
987     } 
988     else
989     {
990         /* record already exists */
991         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
992         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
993         if (action == action_insert)
994         {
995             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
996                          recordType, pr_fname, (zint) start_offset);
997             logRecord(zh);
998             return ZEBRA_FAIL;
999         }
1000
1001         rec = rec_get(zh->reg->records, *sysno);
1002         assert(rec);
1003         
1004         recordAttr = rec_init_attr(zh->reg->zei, rec);
1005
1006         /* decrease total size */
1007         zebraExplain_recordBytesIncrement(zh->reg->zei,
1008                                            - recordAttr->recordSize);
1009
1010         zebra_rec_keys_set_buf(delkeys,
1011                                rec->info[recInfo_delKeys],
1012                                rec->size[recInfo_delKeys],
1013                                0);
1014         zebra_rec_keys_set_buf(sortKeys,
1015                                rec->info[recInfo_sortKeys],
1016                                rec->size[recInfo_sortKeys],
1017                                0);
1018
1019         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1020 #if !FLUSH2
1021         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1022                                   recordAttr->staticrank);
1023 #endif
1024         if (action == action_delete)
1025         {
1026             /* record going to be deleted */
1027 #if FLUSH2
1028             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1029                                        delkeys, recordAttr->staticrank);
1030 #endif       
1031             if (zebra_rec_keys_empty(delkeys))
1032             {
1033                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1034                         pr_fname, (zint) start_offset);
1035                 yaz_log(YLOG_WARN, "cannot delete file above, "
1036                         "storeKeys false (3)");
1037             }
1038             else
1039             {
1040                 if (show_progress)
1041                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1042                             pr_fname, (zint) start_offset);
1043                 zh->records_deleted++;
1044                 if (matchStr)
1045                 {
1046                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1047                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1048                 }
1049                 rec_del(zh->reg->records, &rec);
1050             }
1051             zebra_rec_keys_close(delkeys);
1052             zebra_rec_keys_close(sortKeys);
1053             rec_free(&rec);
1054             logRecord(zh);
1055             return ZEBRA_OK;
1056         }
1057         else
1058         {   /* update or special_update */
1059             if (show_progress)
1060                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1061                         pr_fname, (zint) start_offset);
1062             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1063
1064 #if FLUSH2
1065             extract_flush_record_keys2(zh, *sysno,
1066                                        zh->reg->keys, extractCtrl.staticrank,
1067                                        delkeys, recordAttr->staticrank);
1068 #else
1069             extract_flush_record_keys(zh, *sysno, 1, 
1070                                       zh->reg->keys, extractCtrl.staticrank);
1071 #endif
1072             recordAttr->staticrank = extractCtrl.staticrank;
1073             zh->records_updated++;
1074         }
1075         zebra_rec_keys_close(delkeys);
1076         zebra_rec_keys_close(sortKeys);
1077     }
1078     /* update file type */
1079     xfree(rec->info[recInfo_fileType]);
1080     rec->info[recInfo_fileType] =
1081         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1082
1083     /* update filename */
1084     xfree(rec->info[recInfo_filename]);
1085     rec->info[recInfo_filename] =
1086         rec_strdup(fname, &rec->size[recInfo_filename]);
1087
1088     /* update delete keys */
1089     xfree(rec->info[recInfo_delKeys]);
1090     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1091     {
1092         zebra_rec_keys_get_buf(zh->reg->keys,
1093                                &rec->info[recInfo_delKeys],
1094                                &rec->size[recInfo_delKeys]);
1095     }
1096     else
1097     {
1098         rec->info[recInfo_delKeys] = NULL;
1099         rec->size[recInfo_delKeys] = 0;
1100     }
1101     /* update sort keys */
1102     xfree(rec->info[recInfo_sortKeys]);
1103
1104     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1105                            &rec->info[recInfo_sortKeys],
1106                            &rec->size[recInfo_sortKeys]);
1107
1108     if (stream)
1109     {
1110         recordAttr->recordSize = end_offset - start_offset;
1111         zebraExplain_recordBytesIncrement(zh->reg->zei,
1112                                           recordAttr->recordSize);
1113     }
1114
1115     /* set run-number for this record */
1116     recordAttr->runNumber =
1117         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1118
1119     /* update store data */
1120     xfree(rec->info[recInfo_storeData]);
1121
1122     /* update store data */
1123     if (zh->store_data_buf)
1124     {
1125         rec->size[recInfo_storeData] = zh->store_data_size;
1126         rec->info[recInfo_storeData] = zh->store_data_buf;
1127         zh->store_data_buf = 0;
1128         recordAttr->recordSize = zh->store_data_size;
1129     }
1130     else if (zh->m_store_data)
1131     {
1132         off_t cur_offset = stream->tellf(stream);
1133
1134         rec->size[recInfo_storeData] = recordAttr->recordSize;
1135         rec->info[recInfo_storeData] = (char *)
1136             xmalloc(recordAttr->recordSize);
1137         stream->seekf(stream, start_offset);
1138         stream->readf(stream, rec->info[recInfo_storeData],
1139                       recordAttr->recordSize);
1140         stream->seekf(stream, cur_offset);
1141     }
1142     else
1143     {
1144         rec->info[recInfo_storeData] = NULL;
1145         rec->size[recInfo_storeData] = 0;
1146     }
1147     /* update database name */
1148     xfree(rec->info[recInfo_databaseName]);
1149     rec->info[recInfo_databaseName] =
1150         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1151
1152     /* update offset */
1153     recordAttr->recordOffset = start_offset;
1154     
1155     /* commit this record */
1156     rec_put(zh->reg->records, &rec);
1157     logRecord(zh);
1158     return ZEBRA_OK;
1159 }
1160
1161 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1162 {
1163     ZebraHandle zh = (ZebraHandle) handle;
1164     struct recExtractCtrl extractCtrl;
1165
1166     if (zebraExplain_curDatabase(zh->reg->zei,
1167                                   rec->info[recInfo_databaseName]))
1168     {
1169         abort();
1170         if (zebraExplain_newDatabase(zh->reg->zei,
1171                                       rec->info[recInfo_databaseName], 0))
1172             abort();
1173     }
1174
1175     zebra_rec_keys_reset(zh->reg->keys);
1176     zebra_rec_keys_reset(zh->reg->sortKeys);
1177
1178     extractCtrl.init = extract_init;
1179     extractCtrl.tokenAdd = extract_token_add;
1180     extractCtrl.schemaAdd = extract_schema_add;
1181     extractCtrl.dh = zh->reg->dh;
1182
1183     init_extractCtrl(zh, &extractCtrl);
1184
1185     extractCtrl.flagShowRecords = 0;
1186     extractCtrl.match_criteria[0] = '\0';
1187     extractCtrl.staticrank = 0;
1188     extractCtrl.action = action_update;
1189
1190     extractCtrl.handle = handle;
1191     extractCtrl.first_record = 1;
1192     
1193     extract_set_store_data_prepare(&extractCtrl);
1194
1195     if (n)
1196         grs_extract_tree(&extractCtrl, n);
1197
1198     if (rec->size[recInfo_delKeys])
1199     {
1200         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1201         
1202         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1203
1204         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1205                                rec->size[recInfo_delKeys],
1206                                0);
1207 #if FLUSH2
1208         extract_flush_record_keys2(zh, rec->sysno, 
1209                                    zh->reg->keys, 0, delkeys, 0);
1210 #else
1211         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1212         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1213 #endif
1214         zebra_rec_keys_close(delkeys);
1215
1216         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1217                                rec->size[recInfo_sortKeys],
1218                                0);
1219
1220         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1221         zebra_rec_keys_close(sortkeys);
1222     }
1223     else
1224     {
1225 #if FLUSH2
1226         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1227 #else
1228         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1229 #endif
1230     }
1231     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1232     
1233     xfree(rec->info[recInfo_delKeys]);
1234     zebra_rec_keys_get_buf(zh->reg->keys,
1235                            &rec->info[recInfo_delKeys], 
1236                            &rec->size[recInfo_delKeys]);
1237
1238     xfree(rec->info[recInfo_sortKeys]);
1239     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1240                            &rec->info[recInfo_sortKeys],
1241                            &rec->size[recInfo_sortKeys]);
1242     return ZEBRA_OK;
1243 }
1244
1245 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1246                           zebra_rec_keys_t reckeys,
1247                           int level)
1248 {
1249     if (zebra_rec_keys_rewind(reckeys))
1250     {
1251         size_t slen;
1252         const char *str;
1253         struct it_key key;
1254         NMEM nmem = nmem_create();
1255
1256         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1257         {
1258             char keystr[200]; /* room for zints to print */
1259             char *dst_term = 0;
1260             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1261             const char *index_type;
1262             int i;
1263             const char *string_index;
1264             
1265             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1266                                     0/* db */, &string_index);
1267             assert(index_type);
1268             zebra_term_untrans_iconv(zh, nmem, index_type,
1269                                      &dst_term, str);
1270             *keystr = '\0';
1271             for (i = 0; i<key.len; i++)
1272             {
1273                 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
1274             }
1275
1276             if (*str < CHR_BASE_CHAR)
1277             {
1278                 int i;
1279                 char dst_buf[200]; /* room for special chars */
1280
1281                 strcpy(dst_buf , "?");
1282
1283                 if (!strcmp(str, ""))
1284                     strcpy(dst_buf, "alwaysmatches");
1285                 if (!strcmp(str, FIRST_IN_FIELD_STR))
1286                     strcpy(dst_buf, "firstinfield");
1287                 else if (!strcmp(str, CHR_UNKNOWN))
1288                     strcpy(dst_buf, "unknown");
1289                 else if (!strcmp(str, CHR_SPACE))
1290                     strcpy(dst_buf, "space");
1291                 
1292                 for (i = 0; i<slen; i++)
1293                 {
1294                     sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1295                 }
1296                 yaz_log(level, "%s%s %s %s", keystr, index_type,
1297                         string_index, dst_buf);
1298                 
1299             }
1300             else
1301                 yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1302                         string_index, dst_term);
1303
1304             nmem_reset(nmem);
1305         }
1306         nmem_destroy(nmem);
1307     }
1308 }
1309
1310 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1311                              zebra_rec_keys_t reckeys)
1312 {
1313     ZebraExplainInfo zei = zh->reg->zei;
1314     struct ord_stat {
1315         int no;
1316         int ord;
1317         struct ord_stat *next;
1318     };
1319
1320     if (zebra_rec_keys_rewind(reckeys))
1321     {
1322         struct ord_stat *ord_list = 0;
1323         struct ord_stat *p;
1324         size_t slen;
1325         const char *str;
1326         struct it_key key_in;
1327         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1328         {
1329             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1330
1331             for (p = ord_list; p ; p = p->next)
1332                 if (p->ord == ord)
1333                 {
1334                     p->no++;
1335                     break;
1336                 }
1337             if (!p)
1338             {
1339                 p = xmalloc(sizeof(*p));
1340                 p->no = 1;
1341                 p->ord = ord;
1342                 p->next = ord_list;
1343                 ord_list = p;
1344             }
1345         }
1346
1347         p = ord_list;
1348         while (p)
1349         {
1350             struct ord_stat *p1 = p;
1351
1352             if (is_insert)
1353                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1354             else
1355                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1356             p = p->next;
1357             xfree(p1);
1358         }
1359     }
1360 }
1361
1362 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1363                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1364                                 zebra_rec_keys_t del_keys, zint del_rank)
1365 {
1366     ZebraExplainInfo zei = zh->reg->zei;
1367     int normal = 0;
1368     int optimized = 0;
1369
1370     if (!zh->reg->key_block)
1371     {
1372         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1373         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1374         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1375         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1376     }
1377
1378     if (ins_keys)
1379     {
1380         extract_rec_keys_adjust(zh, 1, ins_keys);
1381         if (!del_keys)
1382             zebraExplain_recordCountIncrement(zei, 1);
1383         zebra_rec_keys_rewind(ins_keys);
1384     }
1385     if (del_keys)
1386     {
1387         extract_rec_keys_adjust(zh, 0, del_keys);
1388         if (!ins_keys)
1389             zebraExplain_recordCountIncrement(zei, -1);
1390         zebra_rec_keys_rewind(del_keys);
1391     }
1392
1393     while (1)
1394     {
1395         size_t del_slen;
1396         const char *del_str;
1397         struct it_key del_key_in;
1398         int del = 0;
1399
1400         size_t ins_slen;
1401         const char *ins_str;
1402         struct it_key ins_key_in;
1403         int ins = 0;
1404
1405         if (del_keys)
1406             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1407                                       &del_key_in);
1408         if (ins_keys)
1409             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1410                                       &ins_key_in);
1411
1412         if (del && ins && ins_rank == del_rank
1413             && !key_compare(&del_key_in, &ins_key_in) 
1414             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1415         {
1416             optimized++;
1417             continue;
1418         }
1419         if (!del && !ins)
1420             break;
1421         
1422         normal++;
1423         if (del)
1424             key_block_write(zh->reg->key_block, sysno, 
1425                             &del_key_in, 0, del_str, del_slen,
1426                             del_rank, zh->m_staticrank);
1427         if (ins)
1428             key_block_write(zh->reg->key_block, sysno, 
1429                             &ins_key_in, 1, ins_str, ins_slen,
1430                             ins_rank, zh->m_staticrank);
1431     }
1432     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1433 }
1434
1435
1436 ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
1437                                      zebra_rec_keys_t reckeys,
1438                                      zebra_snippets *snippets)
1439 {
1440     NMEM nmem = nmem_create();
1441     if (zebra_rec_keys_rewind(reckeys)) 
1442     {
1443         const char *str;
1444         size_t slen;
1445         struct it_key key;
1446         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1447         {
1448             char *dst_term = 0;
1449             int ord;
1450             zint seqno;
1451             const char *index_type;
1452
1453             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1454             seqno = key.mem[key.len-1];
1455             ord = CAST_ZINT_TO_INT(key.mem[0]);
1456             
1457             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1458                                     0/* db */, 0 /* string_index */);
1459             assert(index_type);
1460             zebra_term_untrans_iconv(zh, nmem, index_type,
1461                                      &dst_term, str);
1462             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1463             nmem_reset(nmem);
1464         }
1465     }
1466     nmem_destroy(nmem);
1467     return ZEBRA_OK;
1468 }
1469
1470 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1471 {
1472     yaz_log(YLOG_LOG, "print_rec_keys");
1473     if (zebra_rec_keys_rewind(reckeys))
1474     {
1475         const char *str;
1476         size_t slen;
1477         struct it_key key;
1478         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1479         {
1480             char dst_buf[IT_MAX_WORD];
1481             zint seqno;
1482             const char *index_type;
1483             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1484             const char *db = 0;
1485             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1486
1487             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1488             
1489             seqno = key.mem[key.len-1];
1490             
1491             zebra_term_untrans(zh, index_type, dst_buf, str);
1492             
1493             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1494                     " term=%s", ord, seqno, dst_buf); 
1495         }
1496     }
1497 }
1498
1499 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1500                                      const char *str, int length)
1501 {
1502     struct it_key key;
1503     ZebraHandle zh = p->extractCtrl->handle;
1504     ZebraExplainInfo zei = zh->reg->zei;
1505     int ch, i;
1506
1507     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1508     if (ch < 0)
1509         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1510
1511     i = 0;
1512     key.mem[i++] = ch;
1513     key.mem[i++] = p->record_id;
1514     key.mem[i++] = p->section_id;
1515
1516     if (zh->m_segment_indexing)
1517         key.mem[i++] = p->segment;
1518     key.mem[i++] = p->seqno;
1519     key.len = i;
1520
1521     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1522 }
1523
1524 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1525 {
1526     struct it_key key;
1527     ZebraHandle zh = p->extractCtrl->handle;
1528     ZebraExplainInfo zei = zh->reg->zei;
1529     int ch;
1530     zinfo_index_category_t cat = zinfo_index_category_sort;
1531
1532     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1533     if (ch < 0)
1534         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1535     key.len = 2;
1536     key.mem[0] = ch;
1537     key.mem[1] = p->record_id;
1538
1539     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1540 }
1541
1542 static void extract_add_staticrank_string(RecWord *p,
1543                                           const char *str, int length)
1544 {
1545     char valz[40];
1546     struct recExtractCtrl *ctrl = p->extractCtrl;
1547
1548     if (length > sizeof(valz)-1)
1549         length = sizeof(valz)-1;
1550
1551     memcpy(valz, str, length);
1552     valz[length] = '\0';
1553     ctrl->staticrank = atozint(valz);
1554 }
1555
1556 static void extract_add_string(RecWord *p, zebra_map_t zm,
1557                                const char *string, int length)
1558 {
1559     assert(length > 0);
1560
1561     if (!p->index_name)
1562         return;
1563
1564     if (zebra_maps_is_index(zm))
1565     {
1566         extract_add_index_string(p, zinfo_index_category_index,
1567                                  string, length);
1568         if (zebra_maps_is_alwaysmatches(zm))
1569         {
1570             RecWord word;
1571             memcpy(&word, p, sizeof(word));
1572
1573             word.seqno = 1;
1574             extract_add_index_string(
1575                 &word, zinfo_index_category_alwaysmatches, "", 0);
1576         }
1577     }
1578     else if (zebra_maps_is_sort(zm))
1579     {
1580         extract_add_sort_string(p, string, length);
1581     }
1582     else if (zebra_maps_is_staticrank(zm))
1583     {
1584         extract_add_staticrank_string(p, string, length);
1585     }
1586 }
1587
1588 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1589 {
1590     const char *b = p->term_buf;
1591     int remain = p->term_len;
1592     int first = 1;
1593     const char **map = 0;
1594     
1595     if (remain > 0)
1596         map = zebra_maps_input(zm, &b, remain, 0);
1597
1598     while (map)
1599     {
1600         char buf[IT_MAX_WORD+1];
1601         int i, remain;
1602
1603         /* Skip spaces */
1604         while (map && *map && **map == *CHR_SPACE)
1605         {
1606             remain = p->term_len - (b - p->term_buf);
1607             if (remain > 0)
1608                 map = zebra_maps_input(zm, &b, remain, 0);
1609             else
1610                 map = 0;
1611         }
1612         if (!map)
1613             break;
1614         i = 0;
1615         while (map && *map && **map != *CHR_SPACE)
1616         {
1617             const char *cp = *map;
1618
1619             while (i < IT_MAX_WORD && *cp)
1620                 buf[i++] = *(cp++);
1621             remain = p->term_len - (b - p->term_buf);
1622             if (remain > 0)
1623                 map = zebra_maps_input(zm, &b, remain, 0);
1624             else
1625                 map = 0;
1626         }
1627         if (!i)
1628             return;
1629
1630         if (first)
1631         {   
1632             first = 0;
1633             if (zebra_maps_is_first_in_field(zm))
1634             {
1635                 /* first in field marker */
1636                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1637                 p->seqno++;
1638             }
1639         }
1640         extract_add_string(p, zm, buf, i);
1641         p->seqno++;
1642     }
1643 }
1644
1645 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1646 {
1647     const char *b = p->term_buf;
1648     char buf[IT_MAX_WORD+1];
1649     const char **map = 0;
1650     int i = 0, remain = p->term_len;
1651
1652     if (remain > 0)
1653         map = zebra_maps_input(zm, &b, remain, 1);
1654
1655     while (remain > 0 && i < IT_MAX_WORD)
1656     {
1657         while (map && *map && **map == *CHR_SPACE)
1658         {
1659             remain = p->term_len - (b - p->term_buf);
1660
1661             if (remain > 0)
1662             {
1663                 int first = i ? 0 : 1;  /* first position */
1664                 map = zebra_maps_input(zm, &b, remain, first);
1665             }
1666             else
1667                 map = 0;
1668         }
1669         if (!map)
1670             break;
1671
1672         if (i && i < IT_MAX_WORD)
1673             buf[i++] = *CHR_SPACE;
1674         while (map && *map && **map != *CHR_SPACE)
1675         {
1676             const char *cp = *map;
1677
1678             if (**map == *CHR_CUT)
1679             {
1680                 i = 0;
1681             }
1682             else
1683             {
1684                 if (i >= IT_MAX_WORD)
1685                     break;
1686                 while (i < IT_MAX_WORD && *cp)
1687                     buf[i++] = *(cp++);
1688             }
1689             remain = p->term_len  - (b - p->term_buf);
1690             if (remain > 0)
1691             {
1692                 map = zebra_maps_input(zm, &b, remain, 0);
1693             }
1694             else
1695                 map = 0;
1696         }
1697     }
1698     if (!i)
1699         return;
1700     extract_add_string(p, zm, buf, i);
1701 }
1702
1703 static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type,
1704                                      RecWord *p)
1705 {
1706     struct it_key key;
1707     const char *res_buf = 0;
1708     size_t res_len = 0;
1709     int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len,
1710                                       &res_buf, &res_len);
1711     int cat = zinfo_index_category_index;
1712     int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1713     if (ch < 0)
1714         ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1715     while (r)
1716     {
1717         int i = 0;
1718         key.mem[i++] = ch;
1719         key.mem[i++] = p->record_id;
1720         key.mem[i++] = p->section_id;
1721         
1722         if (zh->m_segment_indexing)
1723             key.mem[i++] = p->segment;
1724         key.mem[i++] = p->seqno;
1725         key.len = i;
1726
1727         yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf);
1728         zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
1729         
1730         p->seqno++;
1731         r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len);
1732     }
1733 }
1734
1735 static void extract_token_add2(RecWord *p)
1736 {
1737     ZebraHandle zh = p->extractCtrl->handle;
1738     zebra_index_type_t type = zebra_index_type_get(zh->reg->index_types, p->index_type);
1739     if (type)
1740     {
1741         if (zebra_index_type_is_index(type))
1742         {
1743             extract_token_add2_index(zh, type, p);
1744         }
1745         else if (zebra_index_type_is_sort(type))
1746         {
1747             ;
1748             
1749         }
1750     }
1751 }
1752
1753 /** \brief top-level indexing handler for recctrl system
1754     \param p token data to be indexed
1755
1756     Call sequence:
1757     extract_token
1758     zebra_add_{in}_complete
1759     extract_add_string
1760     
1761     extract_add_index_string
1762     or
1763     extract_add_sort_string
1764     or
1765     extract_add_staticrank_string
1766     
1767 */
1768 static void extract_token_add(RecWord *p)
1769 {
1770     ZebraHandle zh = p->extractCtrl->handle;
1771     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1772     WRBUF wrbuf;
1773
1774     if (log_level_details)
1775     {
1776         yaz_log(log_level_details, "extract_token_add "
1777                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1778                 p->index_type, p->index_name, 
1779                 p->seqno, p->term_len, p->term_buf);
1780     }
1781     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1782     {
1783         p->term_buf = wrbuf_buf(wrbuf);
1784         p->term_len = wrbuf_len(wrbuf);
1785     }
1786     if (zebra_maps_is_complete(zm))
1787         extract_add_complete_field(p, zm);
1788     else
1789         extract_add_incomplete_field(p, zm);
1790 }
1791
1792 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1793                                       void *buf, size_t sz)
1794 {
1795     ZebraHandle zh = (ZebraHandle) p->handle;
1796
1797     xfree(zh->store_data_buf);
1798     zh->store_data_buf = 0;
1799     zh->store_data_size = 0;
1800     if (buf && sz)
1801     {
1802         zh->store_data_buf = xmalloc(sz);
1803         zh->store_data_size = sz;
1804         memcpy(zh->store_data_buf, buf, sz);
1805     }
1806 }
1807
1808 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1809 {
1810     ZebraHandle zh = (ZebraHandle) p->handle;
1811     xfree(zh->store_data_buf);
1812     zh->store_data_buf = 0;
1813     zh->store_data_size = 0;
1814     p->setStoreData = extract_set_store_data_cb;
1815 }
1816
1817 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1818 {
1819     ZebraHandle zh = (ZebraHandle) p->handle;
1820     zebraExplain_addSchema(zh->reg->zei, oid);
1821 }
1822
1823 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1824                              int cmd, zebra_rec_keys_t reckeys)
1825 {
1826 #if 0
1827     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1828             cmd, sysno);
1829     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1830 #endif
1831
1832     if (zebra_rec_keys_rewind(reckeys))
1833     {
1834         zebra_sort_index_t si = zh->reg->sort_index;
1835         size_t slen;
1836         const char *str;
1837         struct it_key key_in;
1838
1839         zebra_sort_sysno(si, sysno);
1840
1841         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1842         {
1843             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1844             
1845             zebra_sort_type(si, ord);
1846             if (cmd == 1)
1847                 zebra_sort_add(si, str, slen);
1848             else
1849                 zebra_sort_delete(si);
1850         }
1851     }
1852 }
1853
1854 /*
1855  * Local variables:
1856  * c-basic-offset: 4
1857  * indent-tabs-mode: nil
1858  * End:
1859  * vim: shiftwidth=4 tabstop=8 expandtab
1860  */
1861