Bump copyright year
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2010 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #include <stdio.h>
25 #include <assert.h>
26 #include <ctype.h>
27 #ifdef WIN32
28 #include <io.h>
29 #endif
30 #if HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <fcntl.h>
34
35
36 #include "index.h"
37 #include "orddict.h"
38 #include <direntz.h>
39 #include <charmap.h>
40 #include <yaz/snprintf.h>
41
/* Log levels used by this file. They stay 0 until zebra_init_log_level()
   has looked them up. */
static int log_level_extract = 0;
static int log_level_details = 0;
/* Set to 1 by zebra_init_log_level() once the levels have been looked up */
static int log_level_initialized = 0;

/* 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
#define FLUSH2 1

/* Forward declaration; the definition is outside this part of the file. */
void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
                                zebra_rec_keys_t ins_keys,
                                zint ins_rank,
                                zebra_rec_keys_t del_keys,
                                zint del_rank);
55
56 static void zebra_init_log_level(void)
57 {
58     if (!log_level_initialized)
59     {
60         log_level_initialized = 1;
61
62         log_level_extract = yaz_log_module_level("extract");
63         log_level_details = yaz_log_module_level("indexdetails");
64     }
65 }
66
67 static WRBUF wrbuf_hex_str(const char *cstr)
68 {
69     size_t i;
70     WRBUF w = wrbuf_alloc();
71     for (i = 0; cstr[i]; i++)
72     {
73         if (cstr[i] < ' ' || cstr[i] > 126)
74             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
75         else
76             wrbuf_putc(w, cstr[i]);
77     }
78     return w;
79 }
80
81
/* Forward declarations of static helpers; their definitions are outside
   this part of the file. extract_schema_add and extract_token_add are
   installed as extraction callbacks further down. */
static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
                                    int cmd, zebra_rec_keys_t skp);
static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
static void extract_token_add(RecWord *p);
86
87 static void check_log_limit(ZebraHandle zh)
88 {
89     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
90     {
91         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
92                 zh->m_file_verbose_limit);
93     }
94 }
95
96 static void logRecord(ZebraHandle zh)
97 {
98     check_log_limit(zh);
99     ++zh->records_processed;
100     if (!(zh->records_processed % 1000))
101     {
102         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
103                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
104                 zh->records_processed, zh->records_inserted, 
105                 zh->records_updated, zh->records_deleted);
106     }
107 }
108
109 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
110 {
111     ctrl->flagShowRecords = !zh->m_flag_rw;
112 }
113
114
/* Forward declarations of static helpers; their definitions are outside
   this part of the file. */
static void extract_add_index_string(RecWord *p,
                                      zinfo_index_category_t cat,
                                      const char *str, int length);

static void extract_set_store_data_prepare(struct recExtractCtrl *p);
120
121 static void extract_init(struct recExtractCtrl *p, RecWord *w)
122 {
123     w->seqno = 1;
124     w->index_name = "any";
125     w->index_type = "w";
126     w->extractCtrl = p;
127     w->record_id = 0;
128     w->section_id = 0;
129     w->segment = 0;
130 }
131
/* Context passed through recExtractCtrl.handle while extracting snippets
   (see extract_snippet and the snippet_* callbacks). */
struct snip_rec_info {
    ZebraHandle zh;            /* handle the extraction runs for */
    zebra_snippets *snippets;  /* list the callbacks append to */
};
136
137
/* Snippet callback for complete-field indexes (selected by
   snippet_token_add when zebra_maps_is_complete(zm) is true).

   Walks p->term_buf (p->term_len bytes) through zebra_maps_input and, if
   anything was collected, appends ONE snippet covering the input from
   `start' to `last'.

   p    word being processed; p->extractCtrl->handle must be a
        struct snip_rec_info
   ord  index ordinal passed on to zebra_snippets_appendn
   zm   character map used for the input mapping

   Nothing is appended when no characters were collected (i == 0), when
   no non-space mapping was seen (last == 0), when start == last, or when
   zebra_maps_is_index(zm) is false. */
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;

    const char *b = p->term_buf;
    /* buf receives the mapped characters but is never read in this
       function; only the count i is used afterwards. */
    char buf[IT_MAX_WORD+1];
    const char **map = 0;
    int i = 0, remain = p->term_len;
    const char *start = b;  /* start of the snippet in the input */
    const char *last = 0;   /* input position after last non-space mapping */

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 1);

    while (remain > 0 && i < IT_MAX_WORD)
    {
        /* skip mappings that are spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);

            if (i == 0)
                start = b;  /* set to first non-ws area */
            if (remain > 0)
            {
                int first = i ? 0 : 1;  /* first position */

                map = zebra_maps_input(zm, &b, remain, first);
            }
            else
                map = 0;
        }
        if (!map)
            break;

        /* separate this run from the previously collected one */
        if (i && i < IT_MAX_WORD)
            buf[i++] = *CHR_SPACE;
        /* collect non-space mappings */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (**map == *CHR_CUT)
            {
                /* cut marker: discard what has been collected so far */
                i = 0;
            }
            else
            {
                if (i >= IT_MAX_WORD)
                    break;
                while (i < IT_MAX_WORD && *cp)
                    buf[i++] = *(cp++);
            }
            last = b;
            remain = p->term_len  - (b - p->term_buf);
            if (remain > 0)
            {
                map = zebra_maps_input(zm, &b, remain, 0);
            }
            else
                map = 0;
        }
    }
    if (!i)
        return;
    if (last && start != last && zebra_maps_is_index(zm))
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               start, last - start);
}
206
/* Snippet callback for ordinary (non-complete, non-ICU) indexes.

   Walks p->term_buf (p->term_len bytes) through zebra_maps_input. Each
   run of space mappings and each run of non-space mappings becomes its
   own snippet; p->seqno is incremented after every non-space run.

   p    word being processed; p->extractCtrl->handle must be a
        struct snip_rec_info. p->seqno is updated in place.
   ord  index ordinal passed on to zebra_snippets_appendn
   zm   character map used for the input mapping

   Snippets are only appended when zebra_maps_is_index(zm) is true. The
   third argument of zebra_snippets_appendn is 1 for a space run and 0 for
   a token run (presumably a whitespace flag; confirm against the
   zebra_snippets API). */
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *b = p->term_buf;
    int remain = p->term_len;
    int first = 1;            /* 1 until the first token has been seen */
    const char **map = 0;
    const char *start = b;    /* start of the span to be appended next */
    const char *last = b;     /* end of the span to be appended next */

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 0);

    while (map)
    {
        /* buf receives the mapped characters but is never read here;
           only the count i is tested below. */
        char buf[IT_MAX_WORD+1];
        /* NOTE: this `remain' shadows the function-level one */
        int i, remain;

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        if (!map)
            break;
        /* append the skipped space span, if any */
        if (start != last && zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   start, last - start);

        }
        start = last;

        /* collect one token (run of non-space mappings) */
        i = 0;
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            while (i < IT_MAX_WORD && *cp)
                buf[i++] = *(cp++);
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        /* nothing collected: stop processing this term altogether */
        if (!i)
            return;

        if (first)
        {
            first = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }
        /* append the token span */
        if (start != last && zebra_maps_is_index(zm))
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   start, last - start);
        start = last;
        p->seqno++;
    }

}
279
280 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
281 {
282     struct snip_rec_info *h = p->extractCtrl->handle;
283
284     const char *res_buf = 0;
285     size_t res_len = 0;
286
287     const char *display_buf = 0;
288     size_t display_len = 0;
289
290     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
291     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
292                                    &display_buf, &display_len))
293     {
294         if (zebra_maps_is_index(zm))
295             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
296                                    display_buf, display_len);
297         p->seqno++;
298     }
299 }
300
301 static void snippet_token_add(RecWord *p)
302 {
303     struct snip_rec_info *h = p->extractCtrl->handle;
304     ZebraHandle zh = h->zh;
305     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
306
307     if (zm)
308     {
309         ZebraExplainInfo zei = zh->reg->zei;
310         int ch = zebraExplain_lookup_attr_str(
311             zei, zinfo_index_category_index, p->index_type, p->index_name);
312
313         if (zebra_maps_is_icu(zm))
314             snippet_add_icu(p, ch, zm);
315         else
316         {
317             if (zebra_maps_is_complete(zm))
318                 snippet_add_complete_field(p, ch, zm);
319             else
320                 snippet_add_incomplete_field(p, ch, zm);
321         }
322     }
323 }
324
325 static void snippet_schema_add(
326     struct recExtractCtrl *p, Odr_oid *oid)
327 {
328
329 }
330
331 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
332                      struct ZebraRecStream *stream,
333                      RecType rt, void *recTypeClientData)
334 {
335     struct recExtractCtrl extractCtrl;
336     struct snip_rec_info info;
337     int r;
338
339     extractCtrl.stream = stream;
340     extractCtrl.first_record = 1;
341     extractCtrl.init = extract_init;
342     extractCtrl.tokenAdd = snippet_token_add;
343     extractCtrl.schemaAdd = snippet_schema_add;
344     assert(zh->reg);
345     assert(zh->reg->dh);
346
347     extractCtrl.dh = zh->reg->dh;
348     
349     info.zh = zh;
350     info.snippets = sn;
351     extractCtrl.handle = &info;
352     extractCtrl.match_criteria[0] = '\0';
353     extractCtrl.staticrank = 0;
354     extractCtrl.action = action_insert;
355     
356     init_extractCtrl(zh, &extractCtrl);
357
358     extractCtrl.setStoreData = 0;
359
360     r = (*rt->extract)(recTypeClientData, &extractCtrl);
361
362 }
363
364 static void searchRecordKey(ZebraHandle zh,
365                             zebra_rec_keys_t reckeys,
366                             const char *index_name,
367                             const char **ws, int ws_length)
368 {
369     int i;
370     int ch = -1;
371     zinfo_index_category_t cat = zinfo_index_category_index;
372
373     for (i = 0; i<ws_length; i++)
374         ws[i] = NULL;
375
376     if (ch < 0)
377         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
378     if (ch < 0)
379         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
380     if (ch < 0)
381         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
382
383     if (ch < 0)
384         return ;
385
386     if (zebra_rec_keys_rewind(reckeys))
387     {
388         zint startSeq = -1;
389         const char *str;
390         size_t slen;
391         struct it_key key;
392         zint seqno;
393         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
394         {
395             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
396
397             seqno = key.mem[key.len-1];
398             
399             if (key.mem[0] == ch)
400             {
401                 zint woff;
402                 
403                 if (startSeq == -1)
404                     startSeq = seqno;
405                 woff = seqno - startSeq;
406                 if (woff >= 0 && woff < ws_length)
407                     ws[woff] = str;
408             }
409         }
410     }
411 }
412
413 #define FILE_MATCH_BLANK "\t "
414
415 static char *get_match_from_spec(ZebraHandle zh,
416                           zebra_rec_keys_t reckeys,
417                           const char *fname, const char *spec)
418 {
419     static char dstBuf[2048];      /* static here ??? */
420     char *dst = dstBuf;
421     const char *s = spec;
422
423     while (1)
424     {
425         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
426             ;
427         if (!*s)
428             break;
429         if (*s == '(')
430         {
431             const char *ws[32];
432             char attset_str[64], attname_str[64];
433             int i;
434             int first = 1;
435             
436             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
437                 ;
438             for (i = 0; *s && *s != ',' && *s != ')' && 
439                      !strchr(FILE_MATCH_BLANK, *s); s++)
440                 if (i+1 < sizeof(attset_str))
441                     attset_str[i++] = *s;
442             attset_str[i] = '\0';
443             
444             for (; strchr(FILE_MATCH_BLANK, *s); s++)
445                 ;
446             if (*s != ',')
447                 strcpy(attname_str, attset_str);
448             else
449             {
450                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
451                     ;
452                 for (i = 0; *s && *s != ')' && 
453                          !strchr(FILE_MATCH_BLANK, *s); s++)
454                     if (i+1 < sizeof(attname_str))
455                         attname_str[i++] = *s;
456                 attname_str[i] = '\0';
457             }
458             if (*s != ')')
459             {
460                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
461                       spec, zh->m_group ? zh->m_group : "none");
462                 return NULL;
463             }
464             s++;
465
466             searchRecordKey(zh, reckeys, attname_str, ws, 32);
467             if (0) /* for debugging */
468             {   
469                 for (i = 0; i<32; i++)
470                 {
471                     if (ws[i])
472                     {
473                         WRBUF w = wrbuf_hex_str(ws[i]);
474                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
475                         wrbuf_destroy(w);
476                     }
477                 }
478             }
479
480             for (i = 0; i<32; i++)
481                 if (ws[i])
482                 {
483                     if (first)
484                     {
485                         *dst++ = ' ';
486                         first = 0;
487                     }
488                     strcpy(dst, ws[i]);
489                     dst += strlen(ws[i]);
490                 }
491             if (first)
492             {
493                 yaz_log(YLOG_WARN, "Record didn't contain match"
494                       " fields in (%s,%s)", attset_str, attname_str);
495                 return NULL;
496             }
497         }
498         else if (*s == '$')
499         {
500             int spec_len;
501             char special[64];
502             const char *spec_src = NULL;
503             const char *s1 = ++s;
504             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
505                 s1++;
506
507             spec_len = s1 - s;
508             if (spec_len > sizeof(special)-1)
509                 spec_len = sizeof(special)-1;
510             memcpy(special, s, spec_len);
511             special[spec_len] = '\0';
512             s = s1;
513
514             if (!strcmp(special, "group"))
515                 spec_src = zh->m_group;
516             else if (!strcmp(special, "database"))
517                 spec_src = zh->basenames[0];
518             else if (!strcmp(special, "filename")) {
519                 spec_src = fname;
520             }
521             else if (!strcmp(special, "type"))
522                 spec_src = zh->m_record_type;
523             else 
524                 spec_src = NULL;
525             if (spec_src)
526             {
527                 strcpy(dst, spec_src);
528                 dst += strlen(spec_src);
529             }
530         }
531         else if (*s == '\"' || *s == '\'')
532         {
533             int stopMarker = *s++;
534             char tmpString[64];
535             int i = 0;
536
537             while (*s && *s != stopMarker)
538             {
539                 if (i+1 < sizeof(tmpString))
540                     tmpString[i++] = *s++;
541             }
542             if (*s)
543                 s++;
544             tmpString[i] = '\0';
545             strcpy(dst, tmpString);
546             dst += strlen(tmpString);
547         }
548         else
549         {
550             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
551                   spec, zh->m_group ? zh->m_group : "none");
552             return NULL;
553         }
554         *dst++ = 1;
555     }
556     if (dst == dstBuf)
557     {
558         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
559               fname, zh->m_group ? zh->m_group : "none");
560         return NULL;
561     }
562     *dst = '\0';
563
564     if (0) /* for debugging */
565     {
566         WRBUF w = wrbuf_hex_str(dstBuf);
567         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
568         wrbuf_destroy(w);
569     }
570
571     return dstBuf;
572 }
573
/* Record logging information: file name, offset and record group.
   NOTE(review): no use of this struct is visible in this part of the
   file; confirm whether it is still needed. */
struct recordLogInfo {
    const char *fname;
    int recordOffset;
    struct recordGroup *rGroup;
};
579
580 /** \brief add the always-matches index entry and map to real record ID
581     \param ctrl record control
582     \param record_id custom record ID
583     \param sysno system record ID
584     
585     This function serves two purposes.. It adds the always matches
586     entry and makes a pointer from the custom record ID (if defined)
587     back to the system record ID (sysno)
588     See zebra_recid_to_sysno .
589   */
590 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
591                             zint sysno)
592 {
593     RecWord word;
594     extract_init(ctrl, &word);
595     word.record_id = record_id;
596     /* we use the seqno as placeholder for a way to get back to
597        record database from _ALLRECORDS.. This is used if a custom
598        RECORD was defined */
599     word.seqno = sysno;
600     word.index_name = "_ALLRECORDS";
601     word.index_type = "w";
602
603     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
604                               "", 0);
605 }
606
/* forward declaration; called by zebra_extract_file and
   zebra_buffer_extract_record below, defined outside this part of
   the file */
ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
                                       struct ZebraRecStream *stream,
                                       enum zebra_recctrl_action_t action,
                                       const char *recordType,
                                       zint *sysno,
                                       const char *match_criteria,
                                       const char *fname,
                                       RecType recType,
                                       void *recTypeClientData);
617
618
619 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
620                              enum zebra_recctrl_action_t action)
621 {
622     ZEBRA_RES r = ZEBRA_OK;
623     int i, fd;
624     char gprefix[128];
625     char ext[128];
626     char ext_res[128];
627     struct file_read_info *fi = 0;
628     const char *original_record_type = 0;
629     RecType recType;
630     void *recTypeClientData;
631     struct ZebraRecStream stream, *streamp;
632
633     zebra_init_log_level();
634
635     if (!zh->m_group || !*zh->m_group)
636         *gprefix = '\0';
637     else
638         sprintf(gprefix, "%s.", zh->m_group);
639     
640     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
641
642     /* determine file extension */
643     *ext = '\0';
644     for (i = strlen(fname); --i >= 0; )
645         if (fname[i] == '/')
646             break;
647         else if (fname[i] == '.')
648         {
649             strcpy(ext, fname+i+1);
650             break;
651         }
652     /* determine file type - depending on extension */
653     original_record_type = zh->m_record_type;
654     if (!zh->m_record_type)
655     {
656         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
657         zh->m_record_type = res_get(zh->res, ext_res);
658     }
659     if (!zh->m_record_type)
660     {
661         check_log_limit(zh);
662         if (zh->records_processed + zh->records_skipped
663             < zh->m_file_verbose_limit)
664             yaz_log(YLOG_LOG, "? %s", fname);
665         zh->records_skipped++;
666         return 0;
667     }
668     /* determine match criteria */
669     if (!zh->m_record_id)
670     {
671         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
672         zh->m_record_id = res_get(zh->res, ext_res);
673     }
674
675     if (!(recType =
676           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
677                           &recTypeClientData)))
678     {
679         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
680         return ZEBRA_FAIL;
681     }
682
683     switch(recType->version)
684     {
685     case 0:
686         break;
687     default:
688         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
689     }
690     if (sysno && (action == action_delete || action == action_a_delete))
691     {
692         streamp = 0;
693         fi = 0;
694     }
695     else
696     {
697         char full_rep[1024];
698
699         if (zh->path_reg && !yaz_is_abspath(fname))
700         {
701             strcpy(full_rep, zh->path_reg);
702             strcat(full_rep, "/");
703             strcat(full_rep, fname);
704         }
705         else
706             strcpy(full_rep, fname);
707         
708         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
709         {
710             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
711             zh->m_record_type = original_record_type;
712             return ZEBRA_FAIL;
713         }
714         streamp = &stream;
715         zebra_create_stream_fd(streamp, fd, 0);
716     }
717     r = zebra_extract_records_stream(zh, streamp,
718                                      action,
719                                      zh->m_record_type,
720                                      sysno,
721                                      0, /*match_criteria */
722                                      fname,
723                                      recType, recTypeClientData);
724     if (streamp)
725         stream.destroy(streamp);
726     zh->m_record_type = original_record_type;
727     return r;
728 }
729
/*
  If sysno is provided, then it's used to identify the record.
  If not, and match_criteria is provided, then sysno is guessed.
  If not, and a record is provided, then sysno is taken from there.

 */
736
737 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
738                                       const char *buf, size_t buf_size,
739                                       enum zebra_recctrl_action_t action,
740                                       const char *recordType,
741                                       zint *sysno,
742                                       const char *match_criteria,
743                                       const char *fname)
744 {
745     struct ZebraRecStream stream;
746     ZEBRA_RES res;
747     void *clientData;
748     RecType recType = 0;
749
750     if (recordType && *recordType)
751     {
752         yaz_log(log_level_extract,
753                 "Record type explicitly specified: %s", recordType);
754         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
755                                   &clientData);
756     } 
757     else
758     {
759         if (!(zh->m_record_type))
760         {
761             yaz_log(YLOG_WARN, "No such record type defined");
762             return ZEBRA_FAIL;
763         }
764         yaz_log(log_level_extract, "Get record type from rgroup: %s",
765                 zh->m_record_type);
766         recType = recType_byName(zh->reg->recTypes, zh->res,
767                                   zh->m_record_type, &clientData);
768         recordType = zh->m_record_type;
769     }
770     
771     if (!recType)
772     {
773         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
774         return ZEBRA_FAIL;
775     }
776
777     zebra_create_stream_mem(&stream, buf, buf_size);
778
779     res = zebra_extract_records_stream(zh, &stream,
780                                        action,
781                                        recordType,
782                                        sysno,
783                                        match_criteria,
784                                        fname,
785                                        recType, clientData);
786     stream.destroy(&stream);
787     return res;
788 }
789
790 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
791                                              struct ZebraRecStream *stream,
792                                              enum zebra_recctrl_action_t action,
793                                              const char *recordType,
794                                              zint *sysno,
795                                              const char *match_criteria,
796                                              const char *fname,
797                                              RecType recType,
798                                              void *recTypeClientData,
799                                              int *more)
800     
801 {
802     zint sysno0 = 0;
803     RecordAttr *recordAttr;
804     struct recExtractCtrl extractCtrl;
805     int r;
806     const char *matchStr = 0;
807     Record rec;
808     off_t start_offset = 0, end_offset = 0;
809     const char *pr_fname = fname;  /* filename to print .. */
810     int show_progress = zh->records_processed + zh->records_skipped 
811         < zh->m_file_verbose_limit ? 1:0;
812
813     zebra_init_log_level();
814
815     if (!pr_fname)
816         pr_fname = "<no file>";  /* make it printable if file is omitted */
817
818     zebra_rec_keys_reset(zh->reg->keys);
819     zebra_rec_keys_reset(zh->reg->sortKeys);
820
821     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
822     {
823         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
824                                       zh->m_explain_database))
825             return ZEBRA_FAIL;
826     }
827
828     if (stream)
829     {
830         off_t null_offset = 0;
831         extractCtrl.stream = stream;
832
833         start_offset = stream->tellf(stream);
834
835         extractCtrl.first_record = start_offset ? 0 : 1;
836         
837         stream->endf(stream, &null_offset);;
838
839         extractCtrl.init = extract_init;
840         extractCtrl.tokenAdd = extract_token_add;
841         extractCtrl.schemaAdd = extract_schema_add;
842         extractCtrl.dh = zh->reg->dh;
843         extractCtrl.handle = zh;
844         extractCtrl.match_criteria[0] = '\0';
845         extractCtrl.staticrank = 0;
846         extractCtrl.action = action;
847
848         init_extractCtrl(zh, &extractCtrl);
849
850         extract_set_store_data_prepare(&extractCtrl);
851         
852         r = (*recType->extract)(recTypeClientData, &extractCtrl);
853
854         if (action == action_update)
855         {
856             action = extractCtrl.action;
857         }
858         
859         switch (r)
860         {
861         case RECCTRL_EXTRACT_EOF:
862             return ZEBRA_FAIL;
863         case RECCTRL_EXTRACT_ERROR_GENERIC:
864             /* error occured during extraction ... */
865             yaz_log(YLOG_WARN, "extract error: generic");
866             return ZEBRA_FAIL;
867         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
868             /* error occured during extraction ... */
869             yaz_log(YLOG_WARN, "extract error: no such filter");
870             return ZEBRA_FAIL;
871         case RECCTRL_EXTRACT_SKIP:
872             if (show_progress)
873                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
874                          recordType, pr_fname, (zint) start_offset);
875             *more = 1;
876             
877             end_offset = stream->endf(stream, 0);
878             if (end_offset)
879                 stream->seekf(stream, end_offset);
880
881             return ZEBRA_OK;
882         case RECCTRL_EXTRACT_OK:
883             break;
884         default:
885             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
886             return ZEBRA_FAIL;
887         }
888         end_offset = stream->endf(stream, 0);
889         if (end_offset)
890             stream->seekf(stream, end_offset);
891         else
892             end_offset = stream->tellf(stream);
893
894         if (extractCtrl.match_criteria[0])
895             match_criteria = extractCtrl.match_criteria;
896     }
897
898     *more = 1;
899
900     if (zh->m_flag_rw == 0)
901     {
902         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
903                 pr_fname, (zint) start_offset);
904         /* test mode .. Do not perform match */
905         return ZEBRA_OK;
906     }
907         
908     if (!sysno)
909     {
910         sysno = &sysno0;
911         
912         if (match_criteria && *match_criteria)
913             matchStr = match_criteria;
914         else
915         {
916             if (zh->m_record_id && *zh->m_record_id)
917             {
918                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
919                                                zh->m_record_id);
920                 if (!matchStr)
921                 {
922                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
923                              pr_fname, (zint) start_offset);
924                     return ZEBRA_FAIL;
925                 }
926                 if (0 && matchStr)
927                 {
928                     WRBUF w = wrbuf_alloc();
929                     size_t i;
930                     for (i = 0; i < strlen(matchStr); i++)
931                     {
932                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
933                     }
934                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
935                     wrbuf_destroy(w);
936                 }
937             }
938         }
939         if (matchStr) 
940         {
941             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
942             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
943                                           matchStr);
944
945             
946             if (log_level_extract)
947             {
948                 WRBUF w = wrbuf_hex_str(matchStr);
949                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
950                 wrbuf_destroy(w);
951             }
952             if (rinfo)
953             {
954                 assert(*rinfo == sizeof(*sysno));
955                 memcpy(sysno, rinfo+1, sizeof(*sysno));
956             }
957        }
958     }
959
960     if (! *sysno)
961     {
962         /* new record AKA does not exist already */
963         if (action == action_delete)
964         {
965             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
966                     pr_fname, (zint) start_offset);
967             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
968             return ZEBRA_FAIL;
969         }
970         else if (action == action_a_delete)
971         {
972             if (show_progress)
973                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
974                         pr_fname, (zint) start_offset);
975             return ZEBRA_OK;
976         }
977         else if (action == action_replace)
978         {
979             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
980                          pr_fname, (zint) start_offset);
981             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
982             return ZEBRA_FAIL;
983         }
984         if (show_progress)
985             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
986                      (zint) start_offset);
987         rec = rec_new(zh->reg->records);
988
989         *sysno = rec->sysno;
990
991
992         if (stream)
993         {
994             all_matches_add(&extractCtrl,
995                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
996                             *sysno);
997         }
998
999
1000         recordAttr = rec_init_attr(zh->reg->zei, rec);
1001         if (extractCtrl.staticrank < 0)
1002         {
1003             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1004             extractCtrl.staticrank = 0;
1005         }
1006
1007         if (matchStr)
1008         {
1009             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1010             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1011                             sizeof(*sysno), sysno);
1012         }
1013
1014         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1015 #if FLUSH2
1016         extract_flush_record_keys2(zh, *sysno,
1017                                    zh->reg->keys, extractCtrl.staticrank,
1018                                    0, recordAttr->staticrank);
1019 #else
1020         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1021                                   extractCtrl.staticrank);
1022 #endif
1023         recordAttr->staticrank = extractCtrl.staticrank;
1024         zh->records_inserted++;
1025     } 
1026     else
1027     {
1028         /* record already exists */
1029         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1030         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1031         if (action == action_insert)
1032         {
1033             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1034                          recordType, pr_fname, (zint) start_offset);
1035             logRecord(zh);
1036             return ZEBRA_FAIL;
1037         }
1038
1039         rec = rec_get(zh->reg->records, *sysno);
1040         assert(rec);
1041
1042         if (stream)
1043         {
1044             all_matches_add(&extractCtrl,
1045                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1046                             *sysno);
1047         }
1048         
1049         recordAttr = rec_init_attr(zh->reg->zei, rec);
1050
1051         /* decrease total size */
1052         zebraExplain_recordBytesIncrement(zh->reg->zei,
1053                                            - recordAttr->recordSize);
1054
1055         zebra_rec_keys_set_buf(delkeys,
1056                                rec->info[recInfo_delKeys],
1057                                rec->size[recInfo_delKeys],
1058                                0);
1059         zebra_rec_keys_set_buf(sortKeys,
1060                                rec->info[recInfo_sortKeys],
1061                                rec->size[recInfo_sortKeys],
1062                                0);
1063
1064         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1065 #if !FLUSH2
1066         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1067                                   recordAttr->staticrank);
1068 #endif
1069         if (action == action_delete || action == action_a_delete)
1070         {
1071             /* record going to be deleted */
1072 #if FLUSH2
1073             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1074                                        delkeys, recordAttr->staticrank);
1075 #endif       
1076             if (zebra_rec_keys_empty(delkeys))
1077             {
1078                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1079                         pr_fname, (zint) start_offset);
1080                 yaz_log(YLOG_WARN, "cannot delete file above, "
1081                         "storeKeys false (3)");
1082             }
1083             else
1084             {
1085                 if (show_progress)
1086                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1087                             pr_fname, (zint) start_offset);
1088                 zh->records_deleted++;
1089                 if (matchStr)
1090                 {
1091                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1092                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1093                 }
1094                 rec_del(zh->reg->records, &rec);
1095             }
1096             zebra_rec_keys_close(delkeys);
1097             zebra_rec_keys_close(sortKeys);
1098             rec_free(&rec);
1099             logRecord(zh);
1100             return ZEBRA_OK;
1101         }
1102         else
1103         {   /* update or special_update */
1104             if (show_progress)
1105                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1106                         pr_fname, (zint) start_offset);
1107             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1108
1109 #if FLUSH2
1110             extract_flush_record_keys2(zh, *sysno,
1111                                        zh->reg->keys, extractCtrl.staticrank,
1112                                        delkeys, recordAttr->staticrank);
1113 #else
1114             extract_flush_record_keys(zh, *sysno, 1, 
1115                                       zh->reg->keys, extractCtrl.staticrank);
1116 #endif
1117             recordAttr->staticrank = extractCtrl.staticrank;
1118             zh->records_updated++;
1119         }
1120         zebra_rec_keys_close(delkeys);
1121         zebra_rec_keys_close(sortKeys);
1122     }
1123     /* update file type */
1124     xfree(rec->info[recInfo_fileType]);
1125     rec->info[recInfo_fileType] =
1126         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1127
1128     /* update filename */
1129     xfree(rec->info[recInfo_filename]);
1130     rec->info[recInfo_filename] =
1131         rec_strdup(fname, &rec->size[recInfo_filename]);
1132
1133     /* update delete keys */
1134     xfree(rec->info[recInfo_delKeys]);
1135     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1136     {
1137         zebra_rec_keys_get_buf(zh->reg->keys,
1138                                &rec->info[recInfo_delKeys],
1139                                &rec->size[recInfo_delKeys]);
1140     }
1141     else
1142     {
1143         rec->info[recInfo_delKeys] = NULL;
1144         rec->size[recInfo_delKeys] = 0;
1145     }
1146     /* update sort keys */
1147     xfree(rec->info[recInfo_sortKeys]);
1148
1149     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1150                            &rec->info[recInfo_sortKeys],
1151                            &rec->size[recInfo_sortKeys]);
1152
1153     if (stream)
1154     {
1155         recordAttr->recordSize = end_offset - start_offset;
1156         zebraExplain_recordBytesIncrement(zh->reg->zei,
1157                                           recordAttr->recordSize);
1158     }
1159
1160     /* set run-number for this record */
1161     recordAttr->runNumber =
1162         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1163
1164     /* update store data */
1165     xfree(rec->info[recInfo_storeData]);
1166
1167     /* update store data */
1168     if (zh->store_data_buf)
1169     {
1170         rec->size[recInfo_storeData] = zh->store_data_size;
1171         rec->info[recInfo_storeData] = zh->store_data_buf;
1172         zh->store_data_buf = 0;
1173         recordAttr->recordSize = zh->store_data_size;
1174     }
1175     else if (zh->m_store_data)
1176     {
1177         off_t cur_offset = stream->tellf(stream);
1178
1179         rec->size[recInfo_storeData] = recordAttr->recordSize;
1180         rec->info[recInfo_storeData] = (char *)
1181             xmalloc(recordAttr->recordSize);
1182         stream->seekf(stream, start_offset);
1183         stream->readf(stream, rec->info[recInfo_storeData],
1184                       recordAttr->recordSize);
1185         stream->seekf(stream, cur_offset);
1186     }
1187     else
1188     {
1189         rec->info[recInfo_storeData] = NULL;
1190         rec->size[recInfo_storeData] = 0;
1191     }
1192     /* update database name */
1193     xfree(rec->info[recInfo_databaseName]);
1194     rec->info[recInfo_databaseName] =
1195         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1196
1197     /* update offset */
1198     recordAttr->recordOffset = start_offset;
1199     
1200     /* commit this record */
1201     rec_put(zh->reg->records, &rec);
1202     logRecord(zh);
1203     return ZEBRA_OK;
1204 }
1205
1206 /** \brief extracts records from stream
1207     \param zh Zebra Handle
1208     \param stream stream that we read from
1209     \param action (action_insert, action_replace, action_delete, ..)
1210     \param recordType Record filter type "grs.xml", etc.
1211     \param sysno pointer to sysno if already known; NULL otherwise
1212     \param match_criteria (NULL if not already given)
1213     \param fname filename that we read from (for logging purposes only)
1214     \param recType record type
1215     \param recTypeClientData client data for record type
1216     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1217 */
1218 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1219                                        struct ZebraRecStream *stream,
1220                                        enum zebra_recctrl_action_t action,
1221                                        const char *recordType,
1222                                        zint *sysno,
1223                                        const char *match_criteria,
1224                                        const char *fname,
1225                                        RecType recType,
1226                                        void *recTypeClientData)
1227 {
1228     ZEBRA_RES res = ZEBRA_OK;
1229     while (1)
1230     {
1231         int more = 0;
1232         res = zebra_extract_record_stream(zh, stream,
1233                                           action,
1234                                           recordType,
1235                                           sysno,
1236                                           match_criteria,
1237                                           fname,
1238                                           recType, recTypeClientData, &more);
1239         if (!more)
1240         {
1241             res = ZEBRA_OK;
1242             break;
1243         }
1244         if (res != ZEBRA_OK)
1245             break;
1246         if (sysno)
1247             break;
1248     }
1249     return res;
1250 }
1251
1252 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1253 {
1254     ZebraHandle zh = (ZebraHandle) handle;
1255     struct recExtractCtrl extractCtrl;
1256
1257     if (zebraExplain_curDatabase(zh->reg->zei,
1258                                   rec->info[recInfo_databaseName]))
1259     {
1260         abort();
1261         if (zebraExplain_newDatabase(zh->reg->zei,
1262                                       rec->info[recInfo_databaseName], 0))
1263             abort();
1264     }
1265
1266     zebra_rec_keys_reset(zh->reg->keys);
1267     zebra_rec_keys_reset(zh->reg->sortKeys);
1268
1269     extractCtrl.init = extract_init;
1270     extractCtrl.tokenAdd = extract_token_add;
1271     extractCtrl.schemaAdd = extract_schema_add;
1272     extractCtrl.dh = zh->reg->dh;
1273
1274     init_extractCtrl(zh, &extractCtrl);
1275
1276     extractCtrl.flagShowRecords = 0;
1277     extractCtrl.match_criteria[0] = '\0';
1278     extractCtrl.staticrank = 0;
1279     extractCtrl.action = action_update;
1280
1281     extractCtrl.handle = handle;
1282     extractCtrl.first_record = 1;
1283     
1284     extract_set_store_data_prepare(&extractCtrl);
1285
1286     if (n)
1287         grs_extract_tree(&extractCtrl, n);
1288
1289     if (rec->size[recInfo_delKeys])
1290     {
1291         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1292         
1293         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1294
1295         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1296                                rec->size[recInfo_delKeys],
1297                                0);
1298 #if FLUSH2
1299         extract_flush_record_keys2(zh, rec->sysno, 
1300                                    zh->reg->keys, 0, delkeys, 0);
1301 #else
1302         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1303         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1304 #endif
1305         zebra_rec_keys_close(delkeys);
1306
1307         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1308                                rec->size[recInfo_sortKeys],
1309                                0);
1310
1311         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1312         zebra_rec_keys_close(sortkeys);
1313     }
1314     else
1315     {
1316 #if FLUSH2
1317         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1318 #else
1319         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1320 #endif
1321     }
1322     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1323     
1324     xfree(rec->info[recInfo_delKeys]);
1325     zebra_rec_keys_get_buf(zh->reg->keys,
1326                            &rec->info[recInfo_delKeys], 
1327                            &rec->size[recInfo_delKeys]);
1328
1329     xfree(rec->info[recInfo_sortKeys]);
1330     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1331                            &rec->info[recInfo_sortKeys],
1332                            &rec->size[recInfo_sortKeys]);
1333     return ZEBRA_OK;
1334 }
1335
1336 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1337                           zebra_rec_keys_t reckeys,
1338                           int level)
1339 {
1340     if (zebra_rec_keys_rewind(reckeys))
1341     {
1342         size_t slen;
1343         const char *str;
1344         struct it_key key;
1345         NMEM nmem = nmem_create();
1346
1347         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1348         {
1349             char keystr[200]; /* room for zints to print */
1350             char *dst_term = 0;
1351             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1352             const char *index_type;
1353             int i;
1354             const char *string_index;
1355             
1356             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1357                                     0/* db */, &string_index);
1358             assert(index_type);
1359             zebra_term_untrans_iconv(zh, nmem, index_type,
1360                                      &dst_term, str);
1361             *keystr = '\0';
1362             for (i = 0; i<key.len; i++)
1363             {
1364                 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
1365             }
1366
1367             if (*str < CHR_BASE_CHAR)
1368             {
1369                 int i;
1370                 char dst_buf[200]; /* room for special chars */
1371
1372                 strcpy(dst_buf , "?");
1373
1374                 if (!strcmp(str, ""))
1375                     strcpy(dst_buf, "alwaysmatches");
1376                 if (!strcmp(str, FIRST_IN_FIELD_STR))
1377                     strcpy(dst_buf, "firstinfield");
1378                 else if (!strcmp(str, CHR_UNKNOWN))
1379                     strcpy(dst_buf, "unknown");
1380                 else if (!strcmp(str, CHR_SPACE))
1381                     strcpy(dst_buf, "space");
1382                 
1383                 for (i = 0; i<slen; i++)
1384                 {
1385                     sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1386                 }
1387                 yaz_log(level, "%s%s %s %s", keystr, index_type,
1388                         string_index, dst_buf);
1389                 
1390             }
1391             else
1392                 yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1393                         string_index, dst_term);
1394
1395             nmem_reset(nmem);
1396         }
1397         nmem_destroy(nmem);
1398     }
1399 }
1400
1401 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1402                              zebra_rec_keys_t reckeys)
1403 {
1404     ZebraExplainInfo zei = zh->reg->zei;
1405     struct ord_stat {
1406         int no;
1407         int ord;
1408         struct ord_stat *next;
1409     };
1410
1411     if (zebra_rec_keys_rewind(reckeys))
1412     {
1413         struct ord_stat *ord_list = 0;
1414         struct ord_stat *p;
1415         size_t slen;
1416         const char *str;
1417         struct it_key key_in;
1418         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1419         {
1420             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1421
1422             for (p = ord_list; p ; p = p->next)
1423                 if (p->ord == ord)
1424                 {
1425                     p->no++;
1426                     break;
1427                 }
1428             if (!p)
1429             {
1430                 p = xmalloc(sizeof(*p));
1431                 p->no = 1;
1432                 p->ord = ord;
1433                 p->next = ord_list;
1434                 ord_list = p;
1435             }
1436         }
1437
1438         p = ord_list;
1439         while (p)
1440         {
1441             struct ord_stat *p1 = p;
1442
1443             if (is_insert)
1444                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1445             else
1446                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1447             p = p->next;
1448             xfree(p1);
1449         }
1450     }
1451 }
1452
1453 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1454                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1455                                 zebra_rec_keys_t del_keys, zint del_rank)
1456 {
1457     ZebraExplainInfo zei = zh->reg->zei;
1458     int normal = 0;
1459     int optimized = 0;
1460
1461     if (!zh->reg->key_block)
1462     {
1463         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1464         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1465         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1466         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1467     }
1468
1469     if (ins_keys)
1470     {
1471         extract_rec_keys_adjust(zh, 1, ins_keys);
1472         if (!del_keys)
1473             zebraExplain_recordCountIncrement(zei, 1);
1474         zebra_rec_keys_rewind(ins_keys);
1475     }
1476     if (del_keys)
1477     {
1478         extract_rec_keys_adjust(zh, 0, del_keys);
1479         if (!ins_keys)
1480             zebraExplain_recordCountIncrement(zei, -1);
1481         zebra_rec_keys_rewind(del_keys);
1482     }
1483
1484     while (1)
1485     {
1486         size_t del_slen;
1487         const char *del_str;
1488         struct it_key del_key_in;
1489         int del = 0;
1490
1491         size_t ins_slen;
1492         const char *ins_str;
1493         struct it_key ins_key_in;
1494         int ins = 0;
1495
1496         if (del_keys)
1497             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1498                                       &del_key_in);
1499         if (ins_keys)
1500             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1501                                       &ins_key_in);
1502
1503         if (del && ins && ins_rank == del_rank
1504             && !key_compare(&del_key_in, &ins_key_in) 
1505             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1506         {
1507             optimized++;
1508             continue;
1509         }
1510         if (!del && !ins)
1511             break;
1512         
1513         normal++;
1514         if (del)
1515             key_block_write(zh->reg->key_block, sysno, 
1516                             &del_key_in, 0, del_str, del_slen,
1517                             del_rank, zh->m_staticrank);
1518         if (ins)
1519             key_block_write(zh->reg->key_block, sysno, 
1520                             &ins_key_in, 1, ins_str, ins_slen,
1521                             ins_rank, zh->m_staticrank);
1522     }
1523     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1524 }
1525
1526
1527 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1528                                      zebra_rec_keys_t reckeys,
1529                                      zebra_snippets *snippets)
1530 {
1531     NMEM nmem = nmem_create();
1532     if (zebra_rec_keys_rewind(reckeys)) 
1533     {
1534         const char *str;
1535         size_t slen;
1536         struct it_key key;
1537         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1538         {
1539             char *dst_term = 0;
1540             int ord;
1541             zint seqno;
1542             const char *index_type;
1543
1544             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1545             seqno = key.mem[key.len-1];
1546             ord = CAST_ZINT_TO_INT(key.mem[0]);
1547             
1548             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1549                                     0/* db */, 0 /* string_index */);
1550             assert(index_type);
1551             zebra_term_untrans_iconv(zh, nmem, index_type,
1552                                      &dst_term, str);
1553             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1554             nmem_reset(nmem);
1555         }
1556     }
1557     nmem_destroy(nmem);
1558     return ZEBRA_OK;
1559 }
1560
1561 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1562 {
1563     yaz_log(YLOG_LOG, "print_rec_keys");
1564     if (zebra_rec_keys_rewind(reckeys))
1565     {
1566         const char *str;
1567         size_t slen;
1568         struct it_key key;
1569         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1570         {
1571             char dst_buf[IT_MAX_WORD];
1572             zint seqno;
1573             const char *index_type;
1574             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1575             const char *db = 0;
1576             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1577
1578             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1579             
1580             seqno = key.mem[key.len-1];
1581             
1582             zebra_term_untrans(zh, index_type, dst_buf, str);
1583             
1584             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1585                     " term=%s", ord, seqno, dst_buf); 
1586         }
1587     }
1588 }
1589
1590 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1591                                      const char *str, int length)
1592 {
1593     struct it_key key;
1594     ZebraHandle zh = p->extractCtrl->handle;
1595     ZebraExplainInfo zei = zh->reg->zei;
1596     int ch, i;
1597
1598     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1599     if (ch < 0)
1600         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1601
1602     i = 0;
1603     key.mem[i++] = ch;
1604     key.mem[i++] = p->record_id;
1605     key.mem[i++] = p->section_id;
1606
1607     if (zh->m_segment_indexing)
1608         key.mem[i++] = p->segment;
1609     key.mem[i++] = p->seqno;
1610     key.len = i;
1611
1612     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1613 }
1614
1615 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1616 {
1617     struct it_key key;
1618     ZebraHandle zh = p->extractCtrl->handle;
1619     ZebraExplainInfo zei = zh->reg->zei;
1620     int ch;
1621     zinfo_index_category_t cat = zinfo_index_category_sort;
1622
1623     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1624     if (ch < 0)
1625         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1626     key.len = 3;
1627     key.mem[0] = ch;
1628     key.mem[1] = p->record_id;
1629     key.mem[2] = p->section_id;
1630
1631     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1632 }
1633
1634 static void extract_add_staticrank_string(RecWord *p,
1635                                           const char *str, int length)
1636 {
1637     char valz[40];
1638     struct recExtractCtrl *ctrl = p->extractCtrl;
1639
1640     if (length > sizeof(valz)-1)
1641         length = sizeof(valz)-1;
1642
1643     memcpy(valz, str, length);
1644     valz[length] = '\0';
1645     ctrl->staticrank = atozint(valz);
1646 }
1647
1648 static void extract_add_string(RecWord *p, zebra_map_t zm,
1649                                const char *string, int length)
1650 {
1651     assert(length > 0);
1652
1653     if (!p->index_name)
1654         return;
1655     if (log_level_details)
1656     {
1657
1658         WRBUF w = wrbuf_alloc();
1659         
1660         wrbuf_write_escaped(w, string, length);
1661         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1662         wrbuf_destroy(w);
1663     }
1664     if (zebra_maps_is_index(zm))
1665     {
1666         extract_add_index_string(p, zinfo_index_category_index,
1667                                  string, length);
1668         if (zebra_maps_is_alwaysmatches(zm))
1669         {
1670             RecWord word;
1671             memcpy(&word, p, sizeof(word));
1672
1673             word.seqno = 1;
1674             extract_add_index_string(
1675                 &word, zinfo_index_category_alwaysmatches, "", 0);
1676         }
1677     }
1678     else if (zebra_maps_is_sort(zm))
1679     {
1680         extract_add_sort_string(p, string, length);
1681     }
1682     else if (zebra_maps_is_staticrank(zm))
1683     {
1684         extract_add_staticrank_string(p, string, length);
1685     }
1686 }
1687
1688 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1689 {
1690     const char *b = p->term_buf;
1691     int remain = p->term_len;
1692     int first = 1;
1693     const char **map = 0;
1694     
1695     if (remain > 0)
1696         map = zebra_maps_input(zm, &b, remain, 0);
1697
1698     while (map)
1699     {
1700         char buf[IT_MAX_WORD+1];
1701         int i, remain;
1702
1703         /* Skip spaces */
1704         while (map && *map && **map == *CHR_SPACE)
1705         {
1706             remain = p->term_len - (b - p->term_buf);
1707             if (remain > 0)
1708                 map = zebra_maps_input(zm, &b, remain, 0);
1709             else
1710                 map = 0;
1711         }
1712         if (!map)
1713             break;
1714         i = 0;
1715         while (map && *map && **map != *CHR_SPACE)
1716         {
1717             const char *cp = *map;
1718
1719             while (i < IT_MAX_WORD && *cp)
1720                 buf[i++] = *(cp++);
1721             remain = p->term_len - (b - p->term_buf);
1722             if (remain > 0)
1723                 map = zebra_maps_input(zm, &b, remain, 0);
1724             else
1725                 map = 0;
1726         }
1727         if (!i)
1728             return;
1729
1730         if (first)
1731         {   
1732             first = 0;
1733             if (zebra_maps_is_first_in_field(zm))
1734             {
1735                 /* first in field marker */
1736                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1737                 p->seqno++;
1738             }
1739         }
1740         extract_add_string(p, zm, buf, i);
1741         p->seqno++;
1742     }
1743 }
1744
1745 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1746 {
1747     const char *b = p->term_buf;
1748     char buf[IT_MAX_WORD+1];
1749     const char **map = 0;
1750     int i = 0, remain = p->term_len;
1751
1752     if (remain > 0)
1753         map = zebra_maps_input(zm, &b, remain, 1);
1754
1755     while (remain > 0 && i < IT_MAX_WORD)
1756     {
1757         while (map && *map && **map == *CHR_SPACE)
1758         {
1759             remain = p->term_len - (b - p->term_buf);
1760
1761             if (remain > 0)
1762             {
1763                 int first = i ? 0 : 1;  /* first position */
1764                 map = zebra_maps_input(zm, &b, remain, first);
1765             }
1766             else
1767                 map = 0;
1768         }
1769         if (!map)
1770             break;
1771
1772         if (i && i < IT_MAX_WORD)
1773             buf[i++] = *CHR_SPACE;
1774         while (map && *map && **map != *CHR_SPACE)
1775         {
1776             const char *cp = *map;
1777
1778             if (**map == *CHR_CUT)
1779             {
1780                 i = 0;
1781             }
1782             else
1783             {
1784                 if (i >= IT_MAX_WORD)
1785                     break;
1786                 while (i < IT_MAX_WORD && *cp)
1787                     buf[i++] = *(cp++);
1788             }
1789             remain = p->term_len  - (b - p->term_buf);
1790             if (remain > 0)
1791             {
1792                 map = zebra_maps_input(zm, &b, remain, 0);
1793             }
1794             else
1795                 map = 0;
1796         }
1797     }
1798     if (!i)
1799         return;
1800     extract_add_string(p, zm, buf, i);
1801 }
1802
1803 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1804 {
1805     const char *res_buf = 0;
1806     size_t res_len = 0;
1807
1808     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1809     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1810     {
1811         extract_add_string(p, zm, res_buf, res_len);
1812         p->seqno++;
1813     }
1814 }
1815
1816
1817 /** \brief top-level indexing handler for recctrl system
1818     \param p token data to be indexed
1819
1820     Call sequence:
1821     extract_token_add
1822     extract_add_{in}_complete / extract_add_icu
1823     extract_add_string
1824     
1825     extract_add_index_string
1826     or
1827     extract_add_sort_string
1828     or
1829     extract_add_staticrank_string
1830     
1831 */
1832 static void extract_token_add(RecWord *p)
1833 {
1834     ZebraHandle zh = p->extractCtrl->handle;
1835     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1836     WRBUF wrbuf;
1837
1838     if (log_level_details)
1839     {
1840         yaz_log(log_level_details, "extract_token_add "
1841                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1842                 p->index_type, p->index_name, 
1843                 p->seqno, p->term_len, p->term_buf);
1844     }
1845     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1846     {
1847         p->term_buf = wrbuf_buf(wrbuf);
1848         p->term_len = wrbuf_len(wrbuf);
1849     }
1850     if (zebra_maps_is_icu(zm))
1851     {
1852         extract_add_icu(p, zm);
1853     }
1854     else
1855     {
1856         if (zebra_maps_is_complete(zm))
1857             extract_add_complete_field(p, zm);
1858         else
1859             extract_add_incomplete_field(p, zm);
1860     }
1861 }
1862
1863 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1864                                       void *buf, size_t sz)
1865 {
1866     ZebraHandle zh = (ZebraHandle) p->handle;
1867
1868     xfree(zh->store_data_buf);
1869     zh->store_data_buf = 0;
1870     zh->store_data_size = 0;
1871     if (buf && sz)
1872     {
1873         zh->store_data_buf = xmalloc(sz);
1874         zh->store_data_size = sz;
1875         memcpy(zh->store_data_buf, buf, sz);
1876     }
1877 }
1878
1879 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1880 {
1881     ZebraHandle zh = (ZebraHandle) p->handle;
1882     xfree(zh->store_data_buf);
1883     zh->store_data_buf = 0;
1884     zh->store_data_size = 0;
1885     p->setStoreData = extract_set_store_data_cb;
1886 }
1887
1888 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1889 {
1890     ZebraHandle zh = (ZebraHandle) p->handle;
1891     zebraExplain_addSchema(zh->reg->zei, oid);
1892 }
1893
1894 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1895                              int cmd, zebra_rec_keys_t reckeys)
1896 {
1897 #if 0
1898     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1899             cmd, sysno);
1900     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1901 #endif
1902
1903     if (zebra_rec_keys_rewind(reckeys))
1904     {
1905         zebra_sort_index_t si = zh->reg->sort_index;
1906         size_t slen;
1907         const char *str;
1908         struct it_key key_in;
1909
1910         NMEM nmem = nmem_create();
1911         struct sort_add_ent {
1912             int ord;
1913             int cmd;
1914             struct sort_add_ent *next;
1915             WRBUF wrbuf;
1916             zint sysno;
1917             zint section_id;
1918         };
1919         struct sort_add_ent *sort_ent_list = 0;
1920
1921         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1922         {
1923             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1924             zint filter_sysno = key_in.mem[1];
1925             zint section_id = key_in.mem[2];
1926
1927             struct sort_add_ent **e = &sort_ent_list;
1928             for (; *e; e = &(*e)->next)
1929                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1930                     break;
1931             if (!*e)
1932             {
1933                 *e = nmem_malloc(nmem, sizeof(**e));
1934                 (*e)->next = 0;
1935                 (*e)->wrbuf = wrbuf_alloc();
1936                 (*e)->ord = ord;
1937                 (*e)->cmd = cmd;
1938                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1939                 (*e)->section_id = section_id;
1940             }
1941             
1942             wrbuf_write((*e)->wrbuf, str, slen);
1943             wrbuf_putc((*e)->wrbuf, '\0');
1944         }
1945         if (sort_ent_list)
1946         {
1947             zint last_sysno = 0;
1948             struct sort_add_ent *e = sort_ent_list;
1949             for (; e; e = e->next)
1950             {
1951                 if (last_sysno != e->sysno)
1952                 {
1953                     zebra_sort_sysno(si, e->sysno);
1954                     last_sysno = e->sysno;
1955                 }
1956                 zebra_sort_type(si, e->ord);
1957                 if (e->cmd == 1)
1958                     zebra_sort_add(si, e->section_id, e->wrbuf);
1959                 else
1960                     zebra_sort_delete(si, e->section_id);
1961                 wrbuf_destroy(e->wrbuf);
1962             }
1963         }
1964         nmem_destroy(nmem);
1965     }
1966 }
1967
1968 /*
1969  * Local variables:
1970  * c-basic-offset: 4
1971  * c-file-style: "Stroustrup"
1972  * indent-tabs-mode: nil
1973  * End:
1974  * vim: shiftwidth=4 tabstop=8 expandtab
1975  */
1976