First successful test with ICU sortkeys in dictionary.
[idzebra-moved-to-github.git] / index / extract.c
1 /* $Id: extract.c,v 1.269 2007-11-08 21:21:58 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 /** \file
24     \brief indexes records and extract tokens for indexing and sorting
25 */
26
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
/* Log levels for the "extract" and "indexdetails" YAZ log modules.
   All three stay 0 until zebra_init_log_level() has run once. */
static int log_level_extract = 0;
static int log_level_details = 0;
static int log_level_initialized = 0;

/* FLUSH2 is 1 when identical delete/insert keys are eliminated, i.e.
   when record keys are flushed through extract_flush_record_keys2(). */
/* Eventually the 0-case code will be removed. */
#define FLUSH2 1

/* Forward declaration; the definition is outside this part of the file.
   Takes, for record sysno, a key set to insert and a key set to delete,
   each with its own rank. */
void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
                                zebra_rec_keys_t ins_keys,
                                zint ins_rank,
                                zebra_rec_keys_t del_keys,
                                zint del_rank);
58
59 static void zebra_init_log_level(void)
60 {
61     if (!log_level_initialized)
62     {
63         log_level_initialized = 1;
64
65         log_level_extract = yaz_log_module_level("extract");
66         log_level_details = yaz_log_module_level("indexdetails");
67     }
68 }
69
/* Forward declarations of helpers that are used before their definitions
   (the definitions are outside this part of the file). */
static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                                    int cmd, zebra_rec_keys_t skp);
/* Installed as the schemaAdd callback of struct recExtractCtrl. */
static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
/* Installed as the tokenAdd callback of struct recExtractCtrl. */
static void extract_token_add(RecWord *p);
74
75 static void check_log_limit(ZebraHandle zh)
76 {
77     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
78     {
79         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
80                 zh->m_file_verbose_limit);
81     }
82 }
83
84 static void logRecord(ZebraHandle zh)
85 {
86     check_log_limit(zh);
87     ++zh->records_processed;
88     if (!(zh->records_processed % 1000))
89     {
90         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
91                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
92                 zh->records_processed, zh->records_inserted, 
93                 zh->records_updated, zh->records_deleted);
94     }
95 }
96
97 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
98 {
99     ctrl->flagShowRecords = !zh->m_flag_rw;
100 }
101
102
/* Forward declaration; the definition is outside this part of the file.
   all_matches_add() calls it with an empty string of length 0. */
static void extract_add_index_string(RecWord *p, 
                                      zinfo_index_category_t cat,
                                      const char *str, int length);

/* Forward declaration; zebra_extract_record_stream() calls it on the
   control block right before invoking the record type's extract handler. */
static void extract_set_store_data_prepare(struct recExtractCtrl *p);
108
109 static void extract_init(struct recExtractCtrl *p, RecWord *w)
110 {
111     w->seqno = 1;
112     w->index_name = "any";
113     w->index_type = "w";
114     w->extractCtrl = p;
115     w->record_id = 0;
116     w->section_id = 0;
117     w->segment = 0;
118 }
119
/* Context handed to the snippet callbacks through
   recExtractCtrl.handle (set up in extract_snippet()). */
struct snip_rec_info {
    ZebraHandle zh;            /* handle the extraction runs for */
    zebra_snippets *snippets;  /* collection the snippets are appended to */
};
124
125
/* Append at most one snippet for a term whose index type is "complete"
   (the whole field is one term).

   p   - word being added; term_buf/term_len hold the raw field text
   ord - index ordinal passed on to zebra_snippets_appendn()
   zm  - map used to tokenize the text via zebra_maps_input()

   The text is scanned token by token.  `start' ends up just after the
   leading space tokens and `last' just after the final non-space token;
   the snippet is the raw byte range [start, last).  buf is only ever
   written: its fill level `i' is what decides whether any normalized
   content was seen at all. */
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;

    const char *b = p->term_buf;
    char buf[IT_MAX_WORD+1];
    const char **map = 0;
    int i = 0, remain = p->term_len;
    const char *start = b;
    const char *last = 0;

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 1);

    while (remain > 0 && i < IT_MAX_WORD)
    {
        /* consume a run of space tokens */
        while (map && *map && **map == *CHR_SPACE)
        {
            /* b has moved past the token just returned */
            remain = p->term_len - (b - p->term_buf);

            if (i == 0)
                start = b;  /* set to first non-ws area */
            if (remain > 0)
            {
                int first = i ? 0 : 1;  /* first position */

                map = zebra_maps_input(zm, &b, remain, first);
            }
            else
                map = 0;
        }
        if (!map)
            break;

        /* a run of spaces between words collapses to one space */
        if (i && i < IT_MAX_WORD)
            buf[i++] = *CHR_SPACE;
        /* consume a run of non-space tokens */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (**map == *CHR_CUT)
            {
                /* a cut token discards everything collected so far */
                i = 0;
            }
            else
            {
                if (i >= IT_MAX_WORD)
                    break;
                while (i < IT_MAX_WORD && *cp)
                    buf[i++] = *(cp++);
            }
            last = b;
            remain = p->term_len  - (b - p->term_buf);
            if (remain > 0)
            {
                map = zebra_maps_input(zm, &b, remain, 0);
            }
            else
                map = 0;
        }
    }
    /* nothing but spaces (or everything was cut): no snippet */
    if (!i)
        return;
    if (last && start != last)
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               start, last - start);
}
194
195 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
196 {
197     struct snip_rec_info *h = p->extractCtrl->handle;
198     const char *b = p->term_buf;
199     int remain = p->term_len;
200     int first = 1;
201     const char **map = 0;
202     const char *start = b;
203     const char *last = b;
204
205     if (remain > 0)
206         map = zebra_maps_input(zm, &b, remain, 0);
207
208     while (map)
209     {
210         char buf[IT_MAX_WORD+1];
211         int i, remain;
212
213         /* Skip spaces */
214         while (map && *map && **map == *CHR_SPACE)
215         {
216             remain = p->term_len - (b - p->term_buf);
217             last = b;
218             if (remain > 0)
219                 map = zebra_maps_input(zm, &b, remain, 0);
220             else
221                 map = 0;
222         }
223         if (!map)
224             break;
225         if (start != last)
226         {
227             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
228                                    start, last - start);
229
230         }
231         start = last;
232
233         i = 0;
234         while (map && *map && **map != *CHR_SPACE)
235         {
236             const char *cp = *map;
237
238             while (i < IT_MAX_WORD && *cp)
239                 buf[i++] = *(cp++);
240             remain = p->term_len - (b - p->term_buf);
241             last = b;
242             if (remain > 0)
243                 map = zebra_maps_input(zm, &b, remain, 0);
244             else
245                 map = 0;
246         }
247         if (!i)
248             return;
249
250         if (first)
251         {   
252             first = 0;
253             if (zebra_maps_is_first_in_field(zm))
254             {
255                 /* first in field marker */
256                 p->seqno++;
257             }
258         }
259         if (start != last)
260             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
261                                    start, last - start);
262         start = last;
263         p->seqno++;
264     }
265
266 }
267
268 static void snippet_token_add(RecWord *p)
269 {
270     struct snip_rec_info *h = p->extractCtrl->handle;
271     ZebraHandle zh = h->zh;
272     zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, p->index_type);
273
274     if (zm && zebra_maps_is_index(zm))
275     {
276         ZebraExplainInfo zei = zh->reg->zei;
277         int ch = zebraExplain_lookup_attr_str(
278             zei, zinfo_index_category_index, p->index_type, p->index_name);
279
280         if (zebra_maps_is_complete(zm))
281             snippet_add_complete_field(p, ch, zm);
282         else
283             snippet_add_incomplete_field(p, ch, zm);
284     }
285 }
286
287 static void snippet_schema_add(
288     struct recExtractCtrl *p, Odr_oid *oid)
289 {
290
291 }
292
293 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
294                      struct ZebraRecStream *stream,
295                      RecType rt, void *recTypeClientData)
296 {
297     struct recExtractCtrl extractCtrl;
298     struct snip_rec_info info;
299     int r;
300
301     extractCtrl.stream = stream;
302     extractCtrl.first_record = 1;
303     extractCtrl.init = extract_init;
304     extractCtrl.tokenAdd = snippet_token_add;
305     extractCtrl.schemaAdd = snippet_schema_add;
306     assert(zh->reg);
307     assert(zh->reg->dh);
308
309     extractCtrl.dh = zh->reg->dh;
310     
311     info.zh = zh;
312     info.snippets = sn;
313     extractCtrl.handle = &info;
314     extractCtrl.match_criteria[0] = '\0';
315     extractCtrl.staticrank = 0;
316     extractCtrl.action = action_insert;
317     
318     init_extractCtrl(zh, &extractCtrl);
319
320     extractCtrl.setStoreData = 0;
321
322     r = (*rt->extract)(recTypeClientData, &extractCtrl);
323
324 }
325
326 static void searchRecordKey(ZebraHandle zh,
327                             zebra_rec_keys_t reckeys,
328                             const char *index_name,
329                             const char **ws, int ws_length)
330 {
331     int i;
332     int ch = -1;
333     zinfo_index_category_t cat = zinfo_index_category_index;
334
335     for (i = 0; i<ws_length; i++)
336         ws[i] = NULL;
337
338     if (ch < 0)
339         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
340     if (ch < 0)
341         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
342     if (ch < 0)
343         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
344
345     if (ch < 0)
346         return ;
347
348     if (zebra_rec_keys_rewind(reckeys))
349     {
350         zint startSeq = -1;
351         const char *str;
352         size_t slen;
353         struct it_key key;
354         zint seqno;
355         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
356         {
357             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
358
359             seqno = key.mem[key.len-1];
360             
361             if (key.mem[0] == ch)
362             {
363                 zint woff;
364                 
365                 if (startSeq == -1)
366                     startSeq = seqno;
367                 woff = seqno - startSeq;
368                 if (woff >= 0 && woff < ws_length)
369                     ws[woff] = str;
370             }
371         }
372     }
373 }
374
375 #define FILE_MATCH_BLANK "\t "
376
377 static char *get_match_from_spec(ZebraHandle zh,
378                           zebra_rec_keys_t reckeys,
379                           const char *fname, const char *spec)
380 {
381     static char dstBuf[2048];      /* static here ??? */
382     char *dst = dstBuf;
383     const char *s = spec;
384
385     while (1)
386     {
387         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
388             ;
389         if (!*s)
390             break;
391         if (*s == '(')
392         {
393             const char *ws[32];
394             char attset_str[64], attname_str[64];
395             int i;
396             int first = 1;
397             
398             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
399                 ;
400             for (i = 0; *s && *s != ',' && *s != ')' && 
401                      !strchr(FILE_MATCH_BLANK, *s); s++)
402                 if (i+1 < sizeof(attset_str))
403                     attset_str[i++] = *s;
404             attset_str[i] = '\0';
405             
406             for (; strchr(FILE_MATCH_BLANK, *s); s++)
407                 ;
408             if (*s != ',')
409                 strcpy(attname_str, attset_str);
410             else
411             {
412                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
413                     ;
414                 for (i = 0; *s && *s != ')' && 
415                          !strchr(FILE_MATCH_BLANK, *s); s++)
416                     if (i+1 < sizeof(attname_str))
417                         attname_str[i++] = *s;
418                 attname_str[i] = '\0';
419             }
420
421             searchRecordKey(zh, reckeys, attname_str, ws, 32);
422
423             if (*s != ')')
424             {
425                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
426                       spec, zh->m_group ? zh->m_group : "none");
427                 return NULL;
428             }
429             s++;
430
431             for (i = 0; i<32; i++)
432                 if (ws[i])
433                 {
434                     if (first)
435                     {
436                         *dst++ = ' ';
437                         first = 0;
438                     }
439                     strcpy(dst, ws[i]);
440                     dst += strlen(ws[i]);
441                 }
442             if (first)
443             {
444                 yaz_log(YLOG_WARN, "Record didn't contain match"
445                       " fields in (%s,%s)", attset_str, attname_str);
446                 return NULL;
447             }
448         }
449         else if (*s == '$')
450         {
451             int spec_len;
452             char special[64];
453             const char *spec_src = NULL;
454             const char *s1 = ++s;
455             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
456                 s1++;
457
458             spec_len = s1 - s;
459             if (spec_len > sizeof(special)-1)
460                 spec_len = sizeof(special)-1;
461             memcpy(special, s, spec_len);
462             special[spec_len] = '\0';
463             s = s1;
464
465             if (!strcmp(special, "group"))
466                 spec_src = zh->m_group;
467             else if (!strcmp(special, "database"))
468                 spec_src = zh->basenames[0];
469             else if (!strcmp(special, "filename")) {
470                 spec_src = fname;
471             }
472             else if (!strcmp(special, "type"))
473                 spec_src = zh->m_record_type;
474             else 
475                 spec_src = NULL;
476             if (spec_src)
477             {
478                 strcpy(dst, spec_src);
479                 dst += strlen(spec_src);
480             }
481         }
482         else if (*s == '\"' || *s == '\'')
483         {
484             int stopMarker = *s++;
485             char tmpString[64];
486             int i = 0;
487
488             while (*s && *s != stopMarker)
489             {
490                 if (i+1 < sizeof(tmpString))
491                     tmpString[i++] = *s++;
492             }
493             if (*s)
494                 s++;
495             tmpString[i] = '\0';
496             strcpy(dst, tmpString);
497             dst += strlen(tmpString);
498         }
499         else
500         {
501             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
502                   spec, zh->m_group ? zh->m_group : "none");
503             return NULL;
504         }
505         *dst++ = 1;
506     }
507     if (dst == dstBuf)
508     {
509         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
510               fname, zh->m_group ? zh->m_group : "none");
511         return NULL;
512     }
513     *dst = '\0';
514     return dstBuf;
515 }
516
/* Describes where a record came from, for logging purposes.
   NOTE(review): nothing in the visible part of this file refers to this
   struct - confirm it is still in use. */
struct recordLogInfo {
    const char *fname;          /* name of the file holding the record */
    int recordOffset;           /* presumably the record's offset within that file - confirm */
    struct recordGroup *rGroup; /* record group; struct recordGroup is declared elsewhere */
};
522
523 static void all_matches_add(struct recExtractCtrl *ctrl)
524 {
525     RecWord word;
526     extract_init(ctrl, &word);
527     word.index_name = "_ALLRECORDS";
528     word.index_type = "w";
529     word.seqno = 1;
530     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
531                               "", 0);
532 }
533
/* Forward declaration: zebra_extract_file() and
   zebra_buffer_extract_record() call this function before its
   definition further down in the file. */
ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
                                       struct ZebraRecStream *stream,
                                       enum zebra_recctrl_action_t action,
                                       int test_mode, 
                                       const char *recordType,
                                       zint *sysno,
                                       const char *match_criteria,
                                       const char *fname,
                                       RecType recType,
                                       void *recTypeClientData);
544
545
546 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
547                              int deleteFlag)
548 {
549     ZEBRA_RES r = ZEBRA_OK;
550     int i, fd;
551     char gprefix[128];
552     char ext[128];
553     char ext_res[128];
554     struct file_read_info *fi = 0;
555     const char *original_record_type = 0;
556     RecType recType;
557     void *recTypeClientData;
558     struct ZebraRecStream stream, *streamp;
559
560     zebra_init_log_level();
561
562     if (!zh->m_group || !*zh->m_group)
563         *gprefix = '\0';
564     else
565         sprintf(gprefix, "%s.", zh->m_group);
566     
567     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
568
569     /* determine file extension */
570     *ext = '\0';
571     for (i = strlen(fname); --i >= 0; )
572         if (fname[i] == '/')
573             break;
574         else if (fname[i] == '.')
575         {
576             strcpy(ext, fname+i+1);
577             break;
578         }
579     /* determine file type - depending on extension */
580     original_record_type = zh->m_record_type;
581     if (!zh->m_record_type)
582     {
583         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
584         zh->m_record_type = res_get(zh->res, ext_res);
585     }
586     if (!zh->m_record_type)
587     {
588         check_log_limit(zh);
589         if (zh->records_processed + zh->records_skipped
590             < zh->m_file_verbose_limit)
591             yaz_log(YLOG_LOG, "? %s", fname);
592         zh->records_skipped++;
593         return 0;
594     }
595     /* determine match criteria */
596     if (!zh->m_record_id)
597     {
598         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
599         zh->m_record_id = res_get(zh->res, ext_res);
600     }
601
602     if (!(recType =
603           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
604                           &recTypeClientData)))
605     {
606         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
607         return ZEBRA_FAIL;
608     }
609
610     switch(recType->version)
611     {
612     case 0:
613         break;
614     default:
615         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
616     }
617     if (sysno && deleteFlag)
618     {
619         streamp = 0;
620         fi = 0;
621     }
622     else
623     {
624         char full_rep[1024];
625
626         if (zh->path_reg && !yaz_is_abspath(fname))
627         {
628             strcpy(full_rep, zh->path_reg);
629             strcat(full_rep, "/");
630             strcat(full_rep, fname);
631         }
632         else
633             strcpy(full_rep, fname);
634         
635         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
636         {
637             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
638             zh->m_record_type = original_record_type;
639             return ZEBRA_FAIL;
640         }
641         streamp = &stream;
642         zebra_create_stream_fd(streamp, fd, 0);
643     }
644     r = zebra_extract_records_stream(zh, streamp,
645                                      deleteFlag ? 
646                                      action_delete : action_update,
647                                      0, /* tst_mode */
648                                      zh->m_record_type,
649                                      sysno,
650                                      0, /*match_criteria */
651                                      fname,
652                                      recType, recTypeClientData);
653     if (streamp)
654         stream.destroy(streamp);
655     zh->m_record_type = original_record_type;
656     return r;
657 }
658
659 /*
660   If sysno is provided, then it's used to identify the reocord.
661   If not, and match_criteria is provided, then sysno is guessed
662   If not, and a record is provided, then sysno is got from there
663   
664  */
665
666 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
667                                       const char *buf, size_t buf_size,
668                                       enum zebra_recctrl_action_t action,
669                                       int test_mode, 
670                                       const char *recordType,
671                                       zint *sysno,
672                                       const char *match_criteria,
673                                       const char *fname)
674 {
675     struct ZebraRecStream stream;
676     ZEBRA_RES res;
677     void *clientData;
678     RecType recType = 0;
679
680     if (recordType && *recordType)
681     {
682         yaz_log(log_level_extract,
683                 "Record type explicitly specified: %s", recordType);
684         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
685                                   &clientData);
686     } 
687     else
688     {
689         if (!(zh->m_record_type))
690         {
691             yaz_log(YLOG_WARN, "No such record type defined");
692             return ZEBRA_FAIL;
693         }
694         yaz_log(log_level_extract, "Get record type from rgroup: %s",
695                 zh->m_record_type);
696         recType = recType_byName(zh->reg->recTypes, zh->res,
697                                   zh->m_record_type, &clientData);
698         recordType = zh->m_record_type;
699     }
700     
701     if (!recType)
702     {
703         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
704         return ZEBRA_FAIL;
705     }
706
707     zebra_create_stream_mem(&stream, buf, buf_size);
708
709     res = zebra_extract_records_stream(zh, &stream,
710                                        action,
711                                        test_mode, 
712                                        recordType,
713                                        sysno,
714                                        match_criteria,
715                                        fname,
716                                        recType, clientData);
717     stream.destroy(&stream);
718     return res;
719 }
720
721 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
722                                        struct ZebraRecStream *stream,
723                                        enum zebra_recctrl_action_t action,
724                                        int test_mode, 
725                                        const char *recordType,
726                                        zint *sysno,
727                                        const char *match_criteria,
728                                        const char *fname,
729                                        RecType recType,
730                                        void *recTypeClientData)
731 {
732     ZEBRA_RES res = ZEBRA_OK;
733     while (1)
734     {
735         int more = 0;
736         res = zebra_extract_record_stream(zh, stream,
737                                           action,
738                                           test_mode, 
739                                           recordType,
740                                           sysno,
741                                           match_criteria,
742                                           fname,
743                                           recType, recTypeClientData, &more);
744         if (!more)
745         {
746             res = ZEBRA_OK;
747             break;
748         }
749         if (res != ZEBRA_OK)
750             break;
751         if (sysno)
752             break;
753     }
754     return res;
755 }
756
757
758 static WRBUF wrbuf_hex_str(const char *cstr)
759 {
760     size_t i;
761     WRBUF w = wrbuf_alloc();
762     for (i = 0; cstr[i]; i++)
763     {
764         if (cstr[i] < ' ' || cstr[i] > 126)
765             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
766         else
767             wrbuf_putc(w, cstr[i]);
768     }
769     return w;
770 }
771
772 ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
773                                       struct ZebraRecStream *stream,
774                                       enum zebra_recctrl_action_t action,
775                                       int test_mode, 
776                                       const char *recordType,
777                                       zint *sysno,
778                                       const char *match_criteria,
779                                       const char *fname,
780                                       RecType recType,
781                                       void *recTypeClientData,
782                                       int *more)
783
784 {
785     zint sysno0 = 0;
786     RecordAttr *recordAttr;
787     struct recExtractCtrl extractCtrl;
788     int r;
789     const char *matchStr = 0;
790     Record rec;
791     off_t start_offset = 0, end_offset = 0;
792     const char *pr_fname = fname;  /* filename to print .. */
793     int show_progress = zh->records_processed + zh->records_skipped 
794         < zh->m_file_verbose_limit ? 1:0;
795
796     zebra_init_log_level();
797
798     if (!pr_fname)
799         pr_fname = "<no file>";  /* make it printable if file is omitted */
800
801     zebra_rec_keys_reset(zh->reg->keys);
802     zebra_rec_keys_reset(zh->reg->sortKeys);
803
804     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
805     {
806         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
807                                       zh->m_explain_database))
808             return ZEBRA_FAIL;
809     }
810
811     if (stream)
812     {
813         off_t null_offset = 0;
814         extractCtrl.stream = stream;
815
816         start_offset = stream->tellf(stream);
817
818         extractCtrl.first_record = start_offset ? 0 : 1;
819         
820         stream->endf(stream, &null_offset);;
821
822         extractCtrl.init = extract_init;
823         extractCtrl.tokenAdd = extract_token_add;
824         extractCtrl.schemaAdd = extract_schema_add;
825         extractCtrl.dh = zh->reg->dh;
826         extractCtrl.handle = zh;
827         extractCtrl.match_criteria[0] = '\0';
828         extractCtrl.staticrank = 0;
829         extractCtrl.action = action;
830
831         init_extractCtrl(zh, &extractCtrl);
832
833         extract_set_store_data_prepare(&extractCtrl);
834         
835         r = (*recType->extract)(recTypeClientData, &extractCtrl);
836
837         if (action == action_update)
838         {
839             action = extractCtrl.action;
840         }
841         
842         switch (r)
843         {
844         case RECCTRL_EXTRACT_EOF:
845             return ZEBRA_FAIL;
846         case RECCTRL_EXTRACT_ERROR_GENERIC:
847             /* error occured during extraction ... */
848             yaz_log(YLOG_WARN, "extract error: generic");
849             return ZEBRA_FAIL;
850         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
851             /* error occured during extraction ... */
852             yaz_log(YLOG_WARN, "extract error: no such filter");
853             return ZEBRA_FAIL;
854         case RECCTRL_EXTRACT_SKIP:
855             if (show_progress)
856                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
857                          recordType, pr_fname, (zint) start_offset);
858             *more = 1;
859             
860             end_offset = stream->endf(stream, 0);
861             if (end_offset)
862                 stream->seekf(stream, end_offset);
863
864             return ZEBRA_OK;
865         case RECCTRL_EXTRACT_OK:
866             break;
867         default:
868             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
869             return ZEBRA_FAIL;
870         }
871         end_offset = stream->endf(stream, 0);
872         if (end_offset)
873             stream->seekf(stream, end_offset);
874         else
875             end_offset = stream->tellf(stream);
876
877         all_matches_add(&extractCtrl);
878         
879         if (extractCtrl.match_criteria[0])
880             match_criteria = extractCtrl.match_criteria;
881     }
882
883     *more = 1;
884     if (!sysno)
885     {
886         sysno = &sysno0;
887
888         if (match_criteria && *match_criteria) {
889             matchStr = match_criteria;
890         } else {
891             if (zh->m_record_id && *zh->m_record_id) {
892                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
893                                                zh->m_record_id);
894                 if (!matchStr)
895                 {
896                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
897                              pr_fname, (zint) start_offset);
898                     return ZEBRA_FAIL;
899                 }
900             }
901         }
902         if (matchStr) 
903         {
904             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
905             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
906                                           matchStr);
907
908             
909             if (log_level_extract)
910             {
911                 WRBUF w = wrbuf_hex_str(matchStr);
912                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
913                 wrbuf_destroy(w);
914             }
915             if (rinfo)
916             {
917                 assert(*rinfo == sizeof(*sysno));
918                 memcpy(sysno, rinfo+1, sizeof(*sysno));
919             }
920        }
921     }
922     if (zebra_rec_keys_empty(zh->reg->keys))
923     {
924         /* the extraction process returned no information - the record
925            is probably empty - unless flagShowRecords is in use */
926         if (test_mode)
927             return ZEBRA_OK;
928     }
929
930     if (! *sysno)
931     {
932         /* new record */
933         if (action == action_delete)
934         {
935             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
936                          pr_fname, (zint) start_offset);
937             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
938             return ZEBRA_FAIL;
939         }
940         else if (action == action_replace)
941         {
942             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
943                          pr_fname, (zint) start_offset);
944             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
945             return ZEBRA_FAIL;
946         }
947         if (show_progress)
948             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
949                      (zint) start_offset);
950         rec = rec_new(zh->reg->records);
951
952         *sysno = rec->sysno;
953
954         recordAttr = rec_init_attr(zh->reg->zei, rec);
955         if (extractCtrl.staticrank < 0)
956         {
957             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
958             extractCtrl.staticrank = 0;
959         }
960
961         if (matchStr)
962         {
963             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
964             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
965                             sizeof(*sysno), sysno);
966         }
967
968         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
969 #if FLUSH2
970         extract_flush_record_keys2(zh, *sysno,
971                                    zh->reg->keys, extractCtrl.staticrank,
972                                    0, recordAttr->staticrank);
973 #else
974         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
975                                   extractCtrl.staticrank);
976 #endif
977         recordAttr->staticrank = extractCtrl.staticrank;
978         zh->records_inserted++;
979     } 
980     else
981     {
982         /* record already exists */
983         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
984         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
985         if (action == action_insert)
986         {
987             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
988                          recordType, pr_fname, (zint) start_offset);
989             logRecord(zh);
990             return ZEBRA_FAIL;
991         }
992
993         rec = rec_get(zh->reg->records, *sysno);
994         assert(rec);
995         
996         recordAttr = rec_init_attr(zh->reg->zei, rec);
997
998         /* decrease total size */
999         zebraExplain_recordBytesIncrement(zh->reg->zei,
1000                                            - recordAttr->recordSize);
1001
1002         zebra_rec_keys_set_buf(delkeys,
1003                                rec->info[recInfo_delKeys],
1004                                rec->size[recInfo_delKeys],
1005                                0);
1006         zebra_rec_keys_set_buf(sortKeys,
1007                                rec->info[recInfo_sortKeys],
1008                                rec->size[recInfo_sortKeys],
1009                                0);
1010
1011         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1012 #if !FLUSH2
1013         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1014                                   recordAttr->staticrank);
1015 #endif
1016         if (action == action_delete)
1017         {
1018             /* record going to be deleted */
1019 #if FLUSH2
1020             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1021                                        delkeys, recordAttr->staticrank);
1022 #endif       
1023             if (zebra_rec_keys_empty(delkeys))
1024             {
1025                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1026                         pr_fname, (zint) start_offset);
1027                 yaz_log(YLOG_WARN, "cannot delete file above, "
1028                         "storeKeys false (3)");
1029             }
1030             else
1031             {
1032                 if (show_progress)
1033                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1034                             pr_fname, (zint) start_offset);
1035                 zh->records_deleted++;
1036                 if (matchStr)
1037                 {
1038                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1039                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1040                 }
1041                 rec_del(zh->reg->records, &rec);
1042             }
1043             zebra_rec_keys_close(delkeys);
1044             zebra_rec_keys_close(sortKeys);
1045             rec_free(&rec);
1046             logRecord(zh);
1047             return ZEBRA_OK;
1048         }
1049         else
1050         {   /* update or special_update */
1051             if (show_progress)
1052                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1053                         pr_fname, (zint) start_offset);
1054             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1055
1056 #if FLUSH2
1057             extract_flush_record_keys2(zh, *sysno,
1058                                        zh->reg->keys, extractCtrl.staticrank,
1059                                        delkeys, recordAttr->staticrank);
1060 #else
1061             extract_flush_record_keys(zh, *sysno, 1, 
1062                                       zh->reg->keys, extractCtrl.staticrank);
1063 #endif
1064             recordAttr->staticrank = extractCtrl.staticrank;
1065             zh->records_updated++;
1066         }
1067         zebra_rec_keys_close(delkeys);
1068         zebra_rec_keys_close(sortKeys);
1069     }
1070     /* update file type */
1071     xfree(rec->info[recInfo_fileType]);
1072     rec->info[recInfo_fileType] =
1073         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1074
1075     /* update filename */
1076     xfree(rec->info[recInfo_filename]);
1077     rec->info[recInfo_filename] =
1078         rec_strdup(fname, &rec->size[recInfo_filename]);
1079
1080     /* update delete keys */
1081     xfree(rec->info[recInfo_delKeys]);
1082     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1083     {
1084         zebra_rec_keys_get_buf(zh->reg->keys,
1085                                &rec->info[recInfo_delKeys],
1086                                &rec->size[recInfo_delKeys]);
1087     }
1088     else
1089     {
1090         rec->info[recInfo_delKeys] = NULL;
1091         rec->size[recInfo_delKeys] = 0;
1092     }
1093     /* update sort keys */
1094     xfree(rec->info[recInfo_sortKeys]);
1095
1096     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1097                            &rec->info[recInfo_sortKeys],
1098                            &rec->size[recInfo_sortKeys]);
1099
1100     if (stream)
1101     {
1102         recordAttr->recordSize = end_offset - start_offset;
1103         zebraExplain_recordBytesIncrement(zh->reg->zei,
1104                                           recordAttr->recordSize);
1105     }
1106
1107     /* set run-number for this record */
1108     recordAttr->runNumber =
1109         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1110
1111     /* update store data */
1112     xfree(rec->info[recInfo_storeData]);
1113
1114     /* update store data */
1115     if (zh->store_data_buf)
1116     {
1117         rec->size[recInfo_storeData] = zh->store_data_size;
1118         rec->info[recInfo_storeData] = zh->store_data_buf;
1119         zh->store_data_buf = 0;
1120         recordAttr->recordSize = zh->store_data_size;
1121     }
1122     else if (zh->m_store_data)
1123     {
1124         off_t cur_offset = stream->tellf(stream);
1125
1126         rec->size[recInfo_storeData] = recordAttr->recordSize;
1127         rec->info[recInfo_storeData] = (char *)
1128             xmalloc(recordAttr->recordSize);
1129         stream->seekf(stream, start_offset);
1130         stream->readf(stream, rec->info[recInfo_storeData],
1131                       recordAttr->recordSize);
1132         stream->seekf(stream, cur_offset);
1133     }
1134     else
1135     {
1136         rec->info[recInfo_storeData] = NULL;
1137         rec->size[recInfo_storeData] = 0;
1138     }
1139     /* update database name */
1140     xfree(rec->info[recInfo_databaseName]);
1141     rec->info[recInfo_databaseName] =
1142         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1143
1144     /* update offset */
1145     recordAttr->recordOffset = start_offset;
1146     
1147     /* commit this record */
1148     rec_put(zh->reg->records, &rec);
1149     logRecord(zh);
1150     return ZEBRA_OK;
1151 }
1152
1153 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1154 {
1155     ZebraHandle zh = (ZebraHandle) handle;
1156     struct recExtractCtrl extractCtrl;
1157
1158     if (zebraExplain_curDatabase(zh->reg->zei,
1159                                   rec->info[recInfo_databaseName]))
1160     {
1161         abort();
1162         if (zebraExplain_newDatabase(zh->reg->zei,
1163                                       rec->info[recInfo_databaseName], 0))
1164             abort();
1165     }
1166
1167     zebra_rec_keys_reset(zh->reg->keys);
1168     zebra_rec_keys_reset(zh->reg->sortKeys);
1169
1170     extractCtrl.init = extract_init;
1171     extractCtrl.tokenAdd = extract_token_add;
1172     extractCtrl.schemaAdd = extract_schema_add;
1173     extractCtrl.dh = zh->reg->dh;
1174
1175     init_extractCtrl(zh, &extractCtrl);
1176
1177     extractCtrl.flagShowRecords = 0;
1178     extractCtrl.match_criteria[0] = '\0';
1179     extractCtrl.staticrank = 0;
1180     extractCtrl.action = action_update;
1181
1182     extractCtrl.handle = handle;
1183     extractCtrl.first_record = 1;
1184     
1185     extract_set_store_data_prepare(&extractCtrl);
1186
1187     if (n)
1188         grs_extract_tree(&extractCtrl, n);
1189
1190     if (rec->size[recInfo_delKeys])
1191     {
1192         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1193         
1194         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1195
1196         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1197                                rec->size[recInfo_delKeys],
1198                                0);
1199 #if FLUSH2
1200         extract_flush_record_keys2(zh, rec->sysno, 
1201                                    zh->reg->keys, 0, delkeys, 0);
1202 #else
1203         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1204         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1205 #endif
1206         zebra_rec_keys_close(delkeys);
1207
1208         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1209                                rec->size[recInfo_sortKeys],
1210                                0);
1211
1212         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1213         zebra_rec_keys_close(sortkeys);
1214     }
1215     else
1216     {
1217 #if FLUSH2
1218         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1219 #else
1220         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1221 #endif
1222     }
1223     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1224     
1225     xfree(rec->info[recInfo_delKeys]);
1226     zebra_rec_keys_get_buf(zh->reg->keys,
1227                            &rec->info[recInfo_delKeys], 
1228                            &rec->size[recInfo_delKeys]);
1229
1230     xfree(rec->info[recInfo_sortKeys]);
1231     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1232                            &rec->info[recInfo_sortKeys],
1233                            &rec->size[recInfo_sortKeys]);
1234     return ZEBRA_OK;
1235 }
1236
1237 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1238                           zebra_rec_keys_t reckeys,
1239                           int level)
1240 {
1241     if (zebra_rec_keys_rewind(reckeys))
1242     {
1243         size_t slen;
1244         const char *str;
1245         struct it_key key;
1246         NMEM nmem = nmem_create();
1247
1248         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1249         {
1250             char keystr[200]; /* room for zints to print */
1251             char *dst_term = 0;
1252             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1253             const char *index_type;
1254             int i;
1255             const char *string_index;
1256             
1257             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1258                                     0/* db */, &string_index);
1259             assert(index_type);
1260             zebra_term_untrans_iconv(zh, nmem, index_type,
1261                                      &dst_term, str);
1262             *keystr = '\0';
1263             for (i = 0; i<key.len; i++)
1264             {
1265                 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
1266             }
1267
1268             if (*str < CHR_BASE_CHAR)
1269             {
1270                 int i;
1271                 char dst_buf[200]; /* room for special chars */
1272
1273                 strcpy(dst_buf , "?");
1274
1275                 if (!strcmp(str, ""))
1276                     strcpy(dst_buf, "alwaysmatches");
1277                 if (!strcmp(str, FIRST_IN_FIELD_STR))
1278                     strcpy(dst_buf, "firstinfield");
1279                 else if (!strcmp(str, CHR_UNKNOWN))
1280                     strcpy(dst_buf, "unknown");
1281                 else if (!strcmp(str, CHR_SPACE))
1282                     strcpy(dst_buf, "space");
1283                 
1284                 for (i = 0; i<slen; i++)
1285                 {
1286                     sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1287                 }
1288                 yaz_log(level, "%s%s %s %s", keystr, index_type,
1289                         string_index, dst_buf);
1290                 
1291             }
1292             else
1293                 yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1294                         string_index, dst_term);
1295
1296             nmem_reset(nmem);
1297         }
1298         nmem_destroy(nmem);
1299     }
1300 }
1301
1302 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1303                              zebra_rec_keys_t reckeys)
1304 {
1305     ZebraExplainInfo zei = zh->reg->zei;
1306     struct ord_stat {
1307         int no;
1308         int ord;
1309         struct ord_stat *next;
1310     };
1311
1312     if (zebra_rec_keys_rewind(reckeys))
1313     {
1314         struct ord_stat *ord_list = 0;
1315         struct ord_stat *p;
1316         size_t slen;
1317         const char *str;
1318         struct it_key key_in;
1319         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1320         {
1321             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1322
1323             for (p = ord_list; p ; p = p->next)
1324                 if (p->ord == ord)
1325                 {
1326                     p->no++;
1327                     break;
1328                 }
1329             if (!p)
1330             {
1331                 p = xmalloc(sizeof(*p));
1332                 p->no = 1;
1333                 p->ord = ord;
1334                 p->next = ord_list;
1335                 ord_list = p;
1336             }
1337         }
1338
1339         p = ord_list;
1340         while (p)
1341         {
1342             struct ord_stat *p1 = p;
1343
1344             if (is_insert)
1345                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1346             else
1347                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1348             p = p->next;
1349             xfree(p1);
1350         }
1351     }
1352 }
1353
1354 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1355                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1356                                 zebra_rec_keys_t del_keys, zint del_rank)
1357 {
1358     ZebraExplainInfo zei = zh->reg->zei;
1359     int normal = 0;
1360     int optimized = 0;
1361
1362     if (!zh->reg->key_block)
1363     {
1364         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1365         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1366         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1367         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1368     }
1369
1370     if (ins_keys)
1371     {
1372         extract_rec_keys_adjust(zh, 1, ins_keys);
1373         if (!del_keys)
1374             zebraExplain_recordCountIncrement(zei, 1);
1375         zebra_rec_keys_rewind(ins_keys);
1376     }
1377     if (del_keys)
1378     {
1379         extract_rec_keys_adjust(zh, 0, del_keys);
1380         if (!ins_keys)
1381             zebraExplain_recordCountIncrement(zei, -1);
1382         zebra_rec_keys_rewind(del_keys);
1383     }
1384
1385     while (1)
1386     {
1387         size_t del_slen;
1388         const char *del_str;
1389         struct it_key del_key_in;
1390         int del = 0;
1391
1392         size_t ins_slen;
1393         const char *ins_str;
1394         struct it_key ins_key_in;
1395         int ins = 0;
1396
1397         if (del_keys)
1398             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1399                                       &del_key_in);
1400         if (ins_keys)
1401             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1402                                       &ins_key_in);
1403
1404         if (del && ins && ins_rank == del_rank
1405             && !key_compare(&del_key_in, &ins_key_in) 
1406             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1407         {
1408             optimized++;
1409             continue;
1410         }
1411         if (!del && !ins)
1412             break;
1413         
1414         normal++;
1415         if (del)
1416             key_block_write(zh->reg->key_block, sysno, 
1417                             &del_key_in, 0, del_str, del_slen,
1418                             del_rank, zh->m_staticrank);
1419         if (ins)
1420             key_block_write(zh->reg->key_block, sysno, 
1421                             &ins_key_in, 1, ins_str, ins_slen,
1422                             ins_rank, zh->m_staticrank);
1423     }
1424     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1425 }
1426
1427
1428 ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
1429                                      zebra_rec_keys_t reckeys,
1430                                      zebra_snippets *snippets)
1431 {
1432     NMEM nmem = nmem_create();
1433     if (zebra_rec_keys_rewind(reckeys)) 
1434     {
1435         const char *str;
1436         size_t slen;
1437         struct it_key key;
1438         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1439         {
1440             char *dst_term = 0;
1441             int ord;
1442             zint seqno;
1443             const char *index_type;
1444
1445             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1446             seqno = key.mem[key.len-1];
1447             ord = CAST_ZINT_TO_INT(key.mem[0]);
1448             
1449             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1450                                     0/* db */, 0 /* string_index */);
1451             assert(index_type);
1452             zebra_term_untrans_iconv(zh, nmem, index_type,
1453                                      &dst_term, str);
1454             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1455             nmem_reset(nmem);
1456         }
1457     }
1458     nmem_destroy(nmem);
1459     return ZEBRA_OK;
1460 }
1461
1462 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1463 {
1464     yaz_log(YLOG_LOG, "print_rec_keys");
1465     if (zebra_rec_keys_rewind(reckeys))
1466     {
1467         const char *str;
1468         size_t slen;
1469         struct it_key key;
1470         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1471         {
1472             char dst_buf[IT_MAX_WORD];
1473             zint seqno;
1474             const char *index_type;
1475             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1476             const char *db = 0;
1477             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1478
1479             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1480             
1481             seqno = key.mem[key.len-1];
1482             
1483             zebra_term_untrans(zh, index_type, dst_buf, str);
1484             
1485             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1486                     " term=%s", ord, seqno, dst_buf); 
1487         }
1488     }
1489 }
1490
1491 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1492                                      const char *str, int length)
1493 {
1494     struct it_key key;
1495     ZebraHandle zh = p->extractCtrl->handle;
1496     ZebraExplainInfo zei = zh->reg->zei;
1497     int ch, i;
1498
1499     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1500     if (ch < 0)
1501         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1502
1503     i = 0;
1504     key.mem[i++] = ch;
1505     key.mem[i++] = p->record_id;
1506     key.mem[i++] = p->section_id;
1507
1508     if (zh->m_segment_indexing)
1509         key.mem[i++] = p->segment;
1510     key.mem[i++] = p->seqno;
1511     key.len = i;
1512
1513     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1514 }
1515
1516 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1517 {
1518     struct it_key key;
1519     ZebraHandle zh = p->extractCtrl->handle;
1520     ZebraExplainInfo zei = zh->reg->zei;
1521     int ch;
1522     zinfo_index_category_t cat = zinfo_index_category_sort;
1523
1524     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1525     if (ch < 0)
1526         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1527     key.len = 2;
1528     key.mem[0] = ch;
1529     key.mem[1] = p->record_id;
1530
1531     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1532 }
1533
1534 static void extract_add_staticrank_string(RecWord *p,
1535                                           const char *str, int length)
1536 {
1537     char valz[40];
1538     struct recExtractCtrl *ctrl = p->extractCtrl;
1539
1540     if (length > sizeof(valz)-1)
1541         length = sizeof(valz)-1;
1542
1543     memcpy(valz, str, length);
1544     valz[length] = '\0';
1545     ctrl->staticrank = atozint(valz);
1546 }
1547
1548 static void extract_add_string(RecWord *p, zebra_map_t zm,
1549                                const char *string, int length)
1550 {
1551     assert(length > 0);
1552
1553     if (!p->index_name)
1554         return;
1555
1556     if (zebra_maps_is_index(zm))
1557     {
1558         extract_add_index_string(p, zinfo_index_category_index,
1559                                  string, length);
1560         if (zebra_maps_is_alwaysmatches(zm))
1561         {
1562             RecWord word;
1563             memcpy(&word, p, sizeof(word));
1564
1565             word.seqno = 1;
1566             extract_add_index_string(
1567                 &word, zinfo_index_category_alwaysmatches, "", 0);
1568         }
1569     }
1570     else if (zebra_maps_is_sort(zm))
1571     {
1572         extract_add_sort_string(p, string, length);
1573     }
1574     else if (zebra_maps_is_staticrank(zm))
1575     {
1576         extract_add_staticrank_string(p, string, length);
1577     }
1578 }
1579
1580 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1581 {
1582     const char *b = p->term_buf;
1583     int remain = p->term_len;
1584     int first = 1;
1585     const char **map = 0;
1586     
1587     if (remain > 0)
1588         map = zebra_maps_input(zm, &b, remain, 0);
1589
1590     while (map)
1591     {
1592         char buf[IT_MAX_WORD+1];
1593         int i, remain;
1594
1595         /* Skip spaces */
1596         while (map && *map && **map == *CHR_SPACE)
1597         {
1598             remain = p->term_len - (b - p->term_buf);
1599             if (remain > 0)
1600                 map = zebra_maps_input(zm, &b, remain, 0);
1601             else
1602                 map = 0;
1603         }
1604         if (!map)
1605             break;
1606         i = 0;
1607         while (map && *map && **map != *CHR_SPACE)
1608         {
1609             const char *cp = *map;
1610
1611             while (i < IT_MAX_WORD && *cp)
1612                 buf[i++] = *(cp++);
1613             remain = p->term_len - (b - p->term_buf);
1614             if (remain > 0)
1615                 map = zebra_maps_input(zm, &b, remain, 0);
1616             else
1617                 map = 0;
1618         }
1619         if (!i)
1620             return;
1621
1622         if (first)
1623         {   
1624             first = 0;
1625             if (zebra_maps_is_first_in_field(zm))
1626             {
1627                 /* first in field marker */
1628                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1629                 p->seqno++;
1630             }
1631         }
1632         extract_add_string(p, zm, buf, i);
1633         p->seqno++;
1634     }
1635 }
1636
1637 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1638 {
1639     const char *b = p->term_buf;
1640     char buf[IT_MAX_WORD+1];
1641     const char **map = 0;
1642     int i = 0, remain = p->term_len;
1643
1644     if (remain > 0)
1645         map = zebra_maps_input(zm, &b, remain, 1);
1646
1647     while (remain > 0 && i < IT_MAX_WORD)
1648     {
1649         while (map && *map && **map == *CHR_SPACE)
1650         {
1651             remain = p->term_len - (b - p->term_buf);
1652
1653             if (remain > 0)
1654             {
1655                 int first = i ? 0 : 1;  /* first position */
1656                 map = zebra_maps_input(zm, &b, remain, first);
1657             }
1658             else
1659                 map = 0;
1660         }
1661         if (!map)
1662             break;
1663
1664         if (i && i < IT_MAX_WORD)
1665             buf[i++] = *CHR_SPACE;
1666         while (map && *map && **map != *CHR_SPACE)
1667         {
1668             const char *cp = *map;
1669
1670             if (**map == *CHR_CUT)
1671             {
1672                 i = 0;
1673             }
1674             else
1675             {
1676                 if (i >= IT_MAX_WORD)
1677                     break;
1678                 while (i < IT_MAX_WORD && *cp)
1679                     buf[i++] = *(cp++);
1680             }
1681             remain = p->term_len  - (b - p->term_buf);
1682             if (remain > 0)
1683             {
1684                 map = zebra_maps_input(zm, &b, remain, 0);
1685             }
1686             else
1687                 map = 0;
1688         }
1689     }
1690     if (!i)
1691         return;
1692     extract_add_string(p, zm, buf, i);
1693 }
1694
1695 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1696 {
1697     struct it_key key;
1698     const char *res_buf = 0;
1699     size_t res_len = 0;
1700     ZebraHandle zh = p->extractCtrl->handle;
1701     int r = zebra_map_tokenize(zm, p->term_buf, p->term_len,
1702                                &res_buf, &res_len);
1703     int cat = zinfo_index_category_index;
1704     int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1705     if (ch < 0)
1706         ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1707     while (r)
1708     {
1709         int i = 0;
1710         key.mem[i++] = ch;
1711         key.mem[i++] = p->record_id;
1712         key.mem[i++] = p->section_id;
1713         
1714         if (zh->m_segment_indexing)
1715             key.mem[i++] = p->segment;
1716         key.mem[i++] = p->seqno;
1717         key.len = i;
1718
1719         zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
1720         
1721         p->seqno++;
1722         r = zebra_map_tokenize(zm, 0, 0, &res_buf, &res_len);
1723     }
1724 }
1725
1726
1727 /** \brief top-level indexing handler for recctrl system
1728     \param p token data to be indexed
1729
1730     Call sequence:
1731     extract_token
1732     zebra_add_{in}_complete
1733     extract_add_string
1734     
1735     extract_add_index_string
1736     or
1737     extract_add_sort_string
1738     or
1739     extract_add_staticrank_string
1740     
1741 */
1742 static void extract_token_add(RecWord *p)
1743 {
1744     ZebraHandle zh = p->extractCtrl->handle;
1745     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1746     WRBUF wrbuf;
1747
1748     if (log_level_details)
1749     {
1750         yaz_log(log_level_details, "extract_token_add "
1751                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1752                 p->index_type, p->index_name, 
1753                 p->seqno, p->term_len, p->term_buf);
1754     }
1755     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1756     {
1757         p->term_buf = wrbuf_buf(wrbuf);
1758         p->term_len = wrbuf_len(wrbuf);
1759     }
1760     if (zebra_maps_is_icu(zm))
1761     {
1762         extract_add_icu(p, zm);
1763     }
1764     else
1765     {
1766         if (zebra_maps_is_complete(zm))
1767             extract_add_complete_field(p, zm);
1768         else
1769             extract_add_incomplete_field(p, zm);
1770     }
1771 }
1772
1773 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1774                                       void *buf, size_t sz)
1775 {
1776     ZebraHandle zh = (ZebraHandle) p->handle;
1777
1778     xfree(zh->store_data_buf);
1779     zh->store_data_buf = 0;
1780     zh->store_data_size = 0;
1781     if (buf && sz)
1782     {
1783         zh->store_data_buf = xmalloc(sz);
1784         zh->store_data_size = sz;
1785         memcpy(zh->store_data_buf, buf, sz);
1786     }
1787 }
1788
1789 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1790 {
1791     ZebraHandle zh = (ZebraHandle) p->handle;
1792     xfree(zh->store_data_buf);
1793     zh->store_data_buf = 0;
1794     zh->store_data_size = 0;
1795     p->setStoreData = extract_set_store_data_cb;
1796 }
1797
1798 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1799 {
1800     ZebraHandle zh = (ZebraHandle) p->handle;
1801     zebraExplain_addSchema(zh->reg->zei, oid);
1802 }
1803
1804 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1805                              int cmd, zebra_rec_keys_t reckeys)
1806 {
1807 #if 0
1808     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1809             cmd, sysno);
1810     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1811 #endif
1812
1813     if (zebra_rec_keys_rewind(reckeys))
1814     {
1815         zebra_sort_index_t si = zh->reg->sort_index;
1816         size_t slen;
1817         const char *str;
1818         struct it_key key_in;
1819
1820         zebra_sort_sysno(si, sysno);
1821
1822         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1823         {
1824             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1825             
1826             zebra_sort_type(si, ord);
1827             if (cmd == 1)
1828                 zebra_sort_add(si, str, slen);
1829             else
1830                 zebra_sort_delete(si);
1831         }
1832     }
1833 }
1834
1835 /*
1836  * Local variables:
1837  * c-basic-offset: 4
1838  * indent-tabs-mode: nil
1839  * End:
1840  * vim: shiftwidth=4 tabstop=8 expandtab
1841  */
1842