e0047fc87abe72e5047f8389cf88dbbf6c392bcc
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #include <stdio.h>
25 #include <assert.h>
26 #include <ctype.h>
27 #ifdef WIN32
28 #include <io.h>
29 #endif
30 #if HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <fcntl.h>
34
35
36 #include "index.h"
37 #include "orddict.h"
38 #include <direntz.h>
39 #include <charmap.h>
40 #include <yaz/snprintf.h>
41
42 static int log_level_extract = 0;
43 static int log_level_details = 0;
44 static int log_level_initialized = 0;
45
/* 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
48 #define FLUSH2 1
49
50 #if FLUSH2
51 static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
52                                        zebra_rec_keys_t ins_keys,
53                                        zint ins_rank,
54                                        zebra_rec_keys_t del_keys,
55                                        zint del_rank);
56 #else
57 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
58                                       int cmd,
59                                       zebra_rec_keys_t reckeys,
60                                       zint staticrank);
61 #endif
62
63 static void zebra_init_log_level(void)
64 {
65     if (!log_level_initialized)
66     {
67         log_level_initialized = 1;
68
69         log_level_extract = yaz_log_module_level("extract");
70         log_level_details = yaz_log_module_level("indexdetails");
71     }
72 }
73
74 static WRBUF wrbuf_hex_str(const char *cstr)
75 {
76     size_t i;
77     WRBUF w = wrbuf_alloc();
78     for (i = 0; cstr[i]; i++)
79     {
80         if (cstr[i] < ' ' || cstr[i] > 126)
81             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
82         else
83             wrbuf_putc(w, cstr[i]);
84     }
85     return w;
86 }
87
88
89 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
90                                     int cmd, zebra_rec_keys_t skp);
91 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
92 static void extract_token_add(RecWord *p);
93
94 static void check_log_limit(ZebraHandle zh)
95 {
96     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
97     {
98         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
99                 zh->m_file_verbose_limit);
100     }
101 }
102
103 static void logRecord(ZebraHandle zh)
104 {
105     check_log_limit(zh);
106     ++zh->records_processed;
107     if (!(zh->records_processed % 1000))
108     {
109         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
110                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
111                 zh->records_processed, zh->records_inserted, 
112                 zh->records_updated, zh->records_deleted);
113     }
114 }
115
116 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
117 {
118     ctrl->flagShowRecords = !zh->m_flag_rw;
119 }
120
121
122 static void extract_add_index_string(RecWord *p, 
123                                       zinfo_index_category_t cat,
124                                       const char *str, int length);
125
126 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
127
128 static void extract_init(struct recExtractCtrl *p, RecWord *w)
129 {
130     w->seqno = 1;
131     w->index_name = "any";
132     w->index_type = "w";
133     w->extractCtrl = p;
134     w->record_id = 0;
135     w->section_id = 0;
136     w->segment = 0;
137 }
138
/* Per-call context for snippet extraction.  extract_snippet() fills one
   in and stores its address in recExtractCtrl.handle; the snippet_*
   callbacks read it back from p->extractCtrl->handle. */
struct snip_rec_info {
    ZebraHandle zh;            /* handle used to reach the registers (maps, explain info) */
    zebra_snippets *snippets;  /* destination that zebra_snippets_appendn() adds to */
};
143
144
/* Adds at most one snippet for the whole term of p, for maps where the
   field is treated as complete (see snippet_token_add).

   p    word with term_buf/term_len, seqno and the extract control whose
        handle is a struct snip_rec_info
   ord  index ordinal passed on to zebra_snippets_appendn()
   zm   character map used to walk the term

   The term is walked with zebra_maps_input().  `start` is moved forward
   while only space mappings have been seen, `last` follows the input
   position after every non-space mapping.  If anything was collected and
   the map is an index map, the raw bytes start..last are appended as one
   snippet at p->seqno.  p->seqno itself is not modified here. */
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;

    const char *b = p->term_buf;
    /* buf is only written in this function; its contents are never read.
       Only the fill count i is used (to know whether anything was seen). */
    char buf[IT_MAX_WORD+1];
    const char **map = 0;
    int i = 0, remain = p->term_len;
    const char *start = b;
    const char *last = 0;

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 1);

    while (remain > 0 && i < IT_MAX_WORD)
    {
        /* skip a run of space mappings */
        while (map && *map && **map == *CHR_SPACE)
        {
            /* bytes left, derived from how far b has moved
               (presumably zebra_maps_input advances b - confirm) */
            remain = p->term_len - (b - p->term_buf);

            if (i == 0)
                start = b;  /* set to first non-ws area */
            if (remain > 0)
            {
                int first = i ? 0 : 1;  /* first position */

                map = zebra_maps_input(zm, &b, remain, first);
            }
            else
                map = 0;
        }
        if (!map)
            break;

        /* one separator between words, but never at the very beginning */
        if (i && i < IT_MAX_WORD)
            buf[i++] = *CHR_SPACE;
        /* collect a run of non-space mappings */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (**map == *CHR_CUT)
            {
                /* cut marker: discard what has been collected so far */
                i = 0;
            }
            else
            {
                if (i >= IT_MAX_WORD)
                    break;
                while (i < IT_MAX_WORD && *cp)
                    buf[i++] = *(cp++);
            }
            last = b;   /* input position after the most recent non-space mapping */
            remain = p->term_len  - (b - p->term_buf);
            if (remain > 0)
            {
                map = zebra_maps_input(zm, &b, remain, 0);
            }
            else
                map = 0;
        }
    }
    /* nothing but spaces (or nothing at all): no snippet */
    if (!i)
        return;
    if (last && start != last && zebra_maps_is_index(zm))
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               start, last - start);
}
213
/* Splits the term of p into space-separated tokens and adds snippets for
   them, for maps that are neither ICU nor complete (see
   snippet_token_add).

   p    word with term_buf/term_len, seqno and the extract control whose
        handle is a struct snip_rec_info
   ord  index ordinal passed on to zebra_snippets_appendn()
   zm   character map used to walk the term

   For every token the raw input span of the preceding space run is
   appended with the third zebra_snippets_appendn() argument set to 1 and
   the raw span of the token itself with that argument set to 0 (both only
   when the span is non-empty and the map is an index map).  p->seqno is
   incremented once per token, plus once more before the first token when
   zebra_maps_is_first_in_field() holds. */
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *b = p->term_buf;
    int remain = p->term_len;
    int first = 1;
    const char **map = 0;
    const char *start = b;
    const char *last = b;

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 0);

    while (map)
    {
        /* buf is only written, never read; only the count i matters */
        char buf[IT_MAX_WORD+1];
        /* NOTE: this remain shadows the function-level remain above */
        int i, remain;

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        if (!map)
            break;
        /* span covered by the space run just skipped */
        if (start != last && zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   start, last - start);

        }
        start = last;

        /* collect one token: a run of non-space mappings */
        i = 0;
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            while (i < IT_MAX_WORD && *cp)
                buf[i++] = *(cp++);
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        /* nothing collected for this token: stop processing the term */
        if (!i)
            return;

        if (first)
        {   
            first = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }
        /* span covered by the token itself */
        if (start != last && zebra_maps_is_index(zm))
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   start, last - start);
        start = last;
        p->seqno++;
    }

}
286
287 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
288 {
289     struct snip_rec_info *h = p->extractCtrl->handle;
290
291     const char *res_buf = 0;
292     size_t res_len = 0;
293
294     const char *display_buf = 0;
295     size_t display_len = 0;
296
297     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
298     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
299                                    &display_buf, &display_len))
300     {
301         if (zebra_maps_is_index(zm))
302             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
303                                    display_buf, display_len);
304         p->seqno++;
305     }
306 }
307
308 static void snippet_token_add(RecWord *p)
309 {
310     struct snip_rec_info *h = p->extractCtrl->handle;
311     ZebraHandle zh = h->zh;
312     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
313
314     if (zm)
315     {
316         ZebraExplainInfo zei = zh->reg->zei;
317         int ch = zebraExplain_lookup_attr_str(
318             zei, zinfo_index_category_index, p->index_type, p->index_name);
319
320         if (zebra_maps_is_icu(zm))
321             snippet_add_icu(p, ch, zm);
322         else
323         {
324             if (zebra_maps_is_complete(zm))
325                 snippet_add_complete_field(p, ch, zm);
326             else
327                 snippet_add_incomplete_field(p, ch, zm);
328         }
329     }
330 }
331
/* schemaAdd callback installed by extract_snippet(): intentionally does
   nothing; both arguments are ignored. */
static void snippet_schema_add(
    struct recExtractCtrl *p, Odr_oid *oid)
{

}
337
338 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
339                      struct ZebraRecStream *stream,
340                      RecType rt, void *recTypeClientData)
341 {
342     struct recExtractCtrl extractCtrl;
343     struct snip_rec_info info;
344     int r;
345
346     extractCtrl.stream = stream;
347     extractCtrl.first_record = 1;
348     extractCtrl.init = extract_init;
349     extractCtrl.tokenAdd = snippet_token_add;
350     extractCtrl.schemaAdd = snippet_schema_add;
351     assert(zh->reg);
352     assert(zh->reg->dh);
353
354     extractCtrl.dh = zh->reg->dh;
355     
356     info.zh = zh;
357     info.snippets = sn;
358     extractCtrl.handle = &info;
359     extractCtrl.match_criteria[0] = '\0';
360     extractCtrl.staticrank = 0;
361     extractCtrl.action = action_insert;
362     
363     init_extractCtrl(zh, &extractCtrl);
364
365     extractCtrl.setStoreData = 0;
366
367     r = (*rt->extract)(recTypeClientData, &extractCtrl);
368
369 }
370
371 static void searchRecordKey(ZebraHandle zh,
372                             zebra_rec_keys_t reckeys,
373                             const char *index_name,
374                             const char **ws, int ws_length)
375 {
376     int i;
377     int ch = -1;
378     zinfo_index_category_t cat = zinfo_index_category_index;
379
380     for (i = 0; i<ws_length; i++)
381         ws[i] = NULL;
382
383     if (ch < 0)
384         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
385     if (ch < 0)
386         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
387     if (ch < 0)
388         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
389
390     if (ch < 0)
391         return ;
392
393     if (zebra_rec_keys_rewind(reckeys))
394     {
395         zint startSeq = -1;
396         const char *str;
397         size_t slen;
398         struct it_key key;
399         zint seqno;
400         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
401         {
402             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
403
404             seqno = key.mem[key.len-1];
405             
406             if (key.mem[0] == ch)
407             {
408                 zint woff;
409                 
410                 if (startSeq == -1)
411                     startSeq = seqno;
412                 woff = seqno - startSeq;
413                 if (woff >= 0 && woff < ws_length)
414                     ws[woff] = str;
415             }
416         }
417     }
418 }
419
420 #define FILE_MATCH_BLANK "\t "
421
422 static char *get_match_from_spec(ZebraHandle zh,
423                           zebra_rec_keys_t reckeys,
424                           const char *fname, const char *spec)
425 {
426     static char dstBuf[2048];      /* static here ??? */
427     char *dst = dstBuf;
428     const char *s = spec;
429
430     while (1)
431     {
432         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
433             ;
434         if (!*s)
435             break;
436         if (*s == '(')
437         {
438             const char *ws[32];
439             char attset_str[64], attname_str[64];
440             int i;
441             int first = 1;
442             
443             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
444                 ;
445             for (i = 0; *s && *s != ',' && *s != ')' && 
446                      !strchr(FILE_MATCH_BLANK, *s); s++)
447                 if (i+1 < sizeof(attset_str))
448                     attset_str[i++] = *s;
449             attset_str[i] = '\0';
450             
451             for (; strchr(FILE_MATCH_BLANK, *s); s++)
452                 ;
453             if (*s != ',')
454                 strcpy(attname_str, attset_str);
455             else
456             {
457                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
458                     ;
459                 for (i = 0; *s && *s != ')' && 
460                          !strchr(FILE_MATCH_BLANK, *s); s++)
461                     if (i+1 < sizeof(attname_str))
462                         attname_str[i++] = *s;
463                 attname_str[i] = '\0';
464             }
465             if (*s != ')')
466             {
467                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
468                       spec, zh->m_group ? zh->m_group : "none");
469                 return NULL;
470             }
471             s++;
472
473             searchRecordKey(zh, reckeys, attname_str, ws, 32);
474             if (0) /* for debugging */
475             {   
476                 for (i = 0; i<32; i++)
477                 {
478                     if (ws[i])
479                     {
480                         WRBUF w = wrbuf_hex_str(ws[i]);
481                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
482                         wrbuf_destroy(w);
483                     }
484                 }
485             }
486
487             for (i = 0; i<32; i++)
488                 if (ws[i])
489                 {
490                     if (first)
491                     {
492                         *dst++ = ' ';
493                         first = 0;
494                     }
495                     strcpy(dst, ws[i]);
496                     dst += strlen(ws[i]);
497                 }
498             if (first)
499             {
500                 yaz_log(YLOG_WARN, "Record didn't contain match"
501                       " fields in (%s,%s)", attset_str, attname_str);
502                 return NULL;
503             }
504         }
505         else if (*s == '$')
506         {
507             int spec_len;
508             char special[64];
509             const char *spec_src = NULL;
510             const char *s1 = ++s;
511             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
512                 s1++;
513
514             spec_len = s1 - s;
515             if (spec_len > sizeof(special)-1)
516                 spec_len = sizeof(special)-1;
517             memcpy(special, s, spec_len);
518             special[spec_len] = '\0';
519             s = s1;
520
521             if (!strcmp(special, "group"))
522                 spec_src = zh->m_group;
523             else if (!strcmp(special, "database"))
524                 spec_src = zh->basenames[0];
525             else if (!strcmp(special, "filename")) {
526                 spec_src = fname;
527             }
528             else if (!strcmp(special, "type"))
529                 spec_src = zh->m_record_type;
530             else 
531                 spec_src = NULL;
532             if (spec_src)
533             {
534                 strcpy(dst, spec_src);
535                 dst += strlen(spec_src);
536             }
537         }
538         else if (*s == '\"' || *s == '\'')
539         {
540             int stopMarker = *s++;
541             char tmpString[64];
542             int i = 0;
543
544             while (*s && *s != stopMarker)
545             {
546                 if (i+1 < sizeof(tmpString))
547                     tmpString[i++] = *s++;
548             }
549             if (*s)
550                 s++;
551             tmpString[i] = '\0';
552             strcpy(dst, tmpString);
553             dst += strlen(tmpString);
554         }
555         else
556         {
557             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
558                   spec, zh->m_group ? zh->m_group : "none");
559             return NULL;
560         }
561         *dst++ = 1;
562     }
563     if (dst == dstBuf)
564     {
565         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
566               fname, zh->m_group ? zh->m_group : "none");
567         return NULL;
568     }
569     *dst = '\0';
570
571     if (0) /* for debugging */
572     {
573         WRBUF w = wrbuf_hex_str(dstBuf);
574         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
575         wrbuf_destroy(w);
576     }
577
578     return dstBuf;
579 }
580
/* Bundle of logging context for one record.
   NOTE(review): no use of this struct is visible in this part of the
   file; the field descriptions below follow the field names only. */
struct recordLogInfo {
    const char *fname;            /* presumably the file the record came from */
    int recordOffset;             /* presumably the record's offset in that file */
    struct recordGroup *rGroup;   /* presumably the record group in effect */
};
586
587 /** \brief add the always-matches index entry and map to real record ID
588     \param ctrl record control
589     \param record_id custom record ID
590     \param sysno system record ID
591     
592     This function serves two purposes.. It adds the always matches
593     entry and makes a pointer from the custom record ID (if defined)
594     back to the system record ID (sysno)
595     See zebra_recid_to_sysno .
596   */
597 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
598                             zint sysno)
599 {
600     RecWord word;
601     extract_init(ctrl, &word);
602     word.record_id = record_id;
603     /* we use the seqno as placeholder for a way to get back to
604        record database from _ALLRECORDS.. This is used if a custom
605        RECORD was defined */
606     word.seqno = sysno;
607     word.index_name = "_ALLRECORDS";
608     word.index_type = "w";
609
610     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
611                               "", 0);
612 }
613
614 /* forward declaration */
615 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
616                                        struct ZebraRecStream *stream,
617                                        enum zebra_recctrl_action_t action,
618                                        const char *recordType,
619                                        zint *sysno,
620                                        const char *match_criteria,
621                                        const char *fname,
622                                        RecType recType,
623                                        void *recTypeClientData);
624
625
626 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
627                              enum zebra_recctrl_action_t action)
628 {
629     ZEBRA_RES r = ZEBRA_OK;
630     int i, fd;
631     char gprefix[128];
632     char ext[128];
633     char ext_res[128];
634     struct file_read_info *fi = 0;
635     const char *original_record_type = 0;
636     RecType recType;
637     void *recTypeClientData;
638     struct ZebraRecStream stream, *streamp;
639
640     zebra_init_log_level();
641
642     if (!zh->m_group || !*zh->m_group)
643         *gprefix = '\0';
644     else
645         sprintf(gprefix, "%s.", zh->m_group);
646     
647     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
648
649     /* determine file extension */
650     *ext = '\0';
651     for (i = strlen(fname); --i >= 0; )
652         if (fname[i] == '/')
653             break;
654         else if (fname[i] == '.')
655         {
656             strcpy(ext, fname+i+1);
657             break;
658         }
659     /* determine file type - depending on extension */
660     original_record_type = zh->m_record_type;
661     if (!zh->m_record_type)
662     {
663         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
664         zh->m_record_type = res_get(zh->res, ext_res);
665     }
666     if (!zh->m_record_type)
667     {
668         check_log_limit(zh);
669         if (zh->records_processed + zh->records_skipped
670             < zh->m_file_verbose_limit)
671             yaz_log(YLOG_LOG, "? %s", fname);
672         zh->records_skipped++;
673         return 0;
674     }
675     /* determine match criteria */
676     if (!zh->m_record_id)
677     {
678         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
679         zh->m_record_id = res_get(zh->res, ext_res);
680     }
681
682     if (!(recType =
683           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
684                           &recTypeClientData)))
685     {
686         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
687         return ZEBRA_FAIL;
688     }
689
690     switch(recType->version)
691     {
692     case 0:
693         break;
694     default:
695         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
696     }
697     if (sysno && (action == action_delete || action == action_a_delete))
698     {
699         streamp = 0;
700         fi = 0;
701     }
702     else
703     {
704         char full_rep[1024];
705
706         if (zh->path_reg && !yaz_is_abspath(fname))
707         {
708             strcpy(full_rep, zh->path_reg);
709             strcat(full_rep, "/");
710             strcat(full_rep, fname);
711         }
712         else
713             strcpy(full_rep, fname);
714         
715         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
716         {
717             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
718             zh->m_record_type = original_record_type;
719             return ZEBRA_FAIL;
720         }
721         streamp = &stream;
722         zebra_create_stream_fd(streamp, fd, 0);
723     }
724     r = zebra_extract_records_stream(zh, streamp,
725                                      action,
726                                      zh->m_record_type,
727                                      sysno,
728                                      0, /*match_criteria */
729                                      fname,
730                                      recType, recTypeClientData);
731     if (streamp)
732         stream.destroy(streamp);
733     zh->m_record_type = original_record_type;
734     return r;
735 }
736
/*
  If sysno is provided, then it's used to identify the record.
  If not, and match_criteria is provided, then sysno is guessed.
  If not, and a record is provided, then sysno is got from there.
  
 */
743
744 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
745                                       const char *buf, size_t buf_size,
746                                       enum zebra_recctrl_action_t action,
747                                       const char *recordType,
748                                       zint *sysno,
749                                       const char *match_criteria,
750                                       const char *fname)
751 {
752     struct ZebraRecStream stream;
753     ZEBRA_RES res;
754     void *clientData;
755     RecType recType = 0;
756
757     if (recordType && *recordType)
758     {
759         yaz_log(log_level_extract,
760                 "Record type explicitly specified: %s", recordType);
761         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
762                                   &clientData);
763     } 
764     else
765     {
766         if (!(zh->m_record_type))
767         {
768             yaz_log(YLOG_WARN, "No such record type defined");
769             return ZEBRA_FAIL;
770         }
771         yaz_log(log_level_extract, "Get record type from rgroup: %s",
772                 zh->m_record_type);
773         recType = recType_byName(zh->reg->recTypes, zh->res,
774                                   zh->m_record_type, &clientData);
775         recordType = zh->m_record_type;
776     }
777     
778     if (!recType)
779     {
780         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
781         return ZEBRA_FAIL;
782     }
783
784     zebra_create_stream_mem(&stream, buf, buf_size);
785
786     res = zebra_extract_records_stream(zh, &stream,
787                                        action,
788                                        recordType,
789                                        sysno,
790                                        match_criteria,
791                                        fname,
792                                        recType, clientData);
793     stream.destroy(&stream);
794     return res;
795 }
796
797 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
798                                              struct ZebraRecStream *stream,
799                                              enum zebra_recctrl_action_t action,
800                                              const char *recordType,
801                                              zint *sysno,
802                                              const char *match_criteria,
803                                              const char *fname,
804                                              RecType recType,
805                                              void *recTypeClientData,
806                                              int *more)
807     
808 {
809     zint sysno0 = 0;
810     RecordAttr *recordAttr;
811     struct recExtractCtrl extractCtrl;
812     int r;
813     const char *matchStr = 0;
814     Record rec;
815     off_t start_offset = 0, end_offset = 0;
816     const char *pr_fname = fname;  /* filename to print .. */
817     int show_progress = zh->records_processed + zh->records_skipped 
818         < zh->m_file_verbose_limit ? 1:0;
819
820     zebra_init_log_level();
821
822     if (!pr_fname)
823         pr_fname = "<no file>";  /* make it printable if file is omitted */
824
825     zebra_rec_keys_reset(zh->reg->keys);
826     zebra_rec_keys_reset(zh->reg->sortKeys);
827
828     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
829     {
830         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
831                                       zh->m_explain_database))
832             return ZEBRA_FAIL;
833     }
834
835     if (stream)
836     {
837         off_t null_offset = 0;
838         extractCtrl.stream = stream;
839
840         start_offset = stream->tellf(stream);
841
842         extractCtrl.first_record = start_offset ? 0 : 1;
843         
844         stream->endf(stream, &null_offset);;
845
846         extractCtrl.init = extract_init;
847         extractCtrl.tokenAdd = extract_token_add;
848         extractCtrl.schemaAdd = extract_schema_add;
849         extractCtrl.dh = zh->reg->dh;
850         extractCtrl.handle = zh;
851         extractCtrl.match_criteria[0] = '\0';
852         extractCtrl.staticrank = 0;
853         extractCtrl.action = action;
854
855         init_extractCtrl(zh, &extractCtrl);
856
857         extract_set_store_data_prepare(&extractCtrl);
858         
859         r = (*recType->extract)(recTypeClientData, &extractCtrl);
860
861         if (action == action_update)
862         {
863             action = extractCtrl.action;
864         }
865         
866         switch (r)
867         {
868         case RECCTRL_EXTRACT_EOF:
869             return ZEBRA_FAIL;
870         case RECCTRL_EXTRACT_ERROR_GENERIC:
871             /* error occured during extraction ... */
872             yaz_log(YLOG_WARN, "extract error: generic");
873             return ZEBRA_FAIL;
874         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
875             /* error occured during extraction ... */
876             yaz_log(YLOG_WARN, "extract error: no such filter");
877             return ZEBRA_FAIL;
878         case RECCTRL_EXTRACT_SKIP:
879             if (show_progress)
880                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
881                          recordType, pr_fname, (zint) start_offset);
882             *more = 1;
883             
884             end_offset = stream->endf(stream, 0);
885             if (end_offset)
886                 stream->seekf(stream, end_offset);
887
888             return ZEBRA_OK;
889         case RECCTRL_EXTRACT_OK:
890             break;
891         default:
892             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
893             return ZEBRA_FAIL;
894         }
895         end_offset = stream->endf(stream, 0);
896         if (end_offset)
897             stream->seekf(stream, end_offset);
898         else
899             end_offset = stream->tellf(stream);
900
901         if (extractCtrl.match_criteria[0])
902             match_criteria = extractCtrl.match_criteria;
903     }
904
905     *more = 1;
906
907     if (zh->m_flag_rw == 0)
908     {
909         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
910                 pr_fname, (zint) start_offset);
911         /* test mode .. Do not perform match */
912         return ZEBRA_OK;
913     }
914         
915     if (!sysno)
916     {
917         sysno = &sysno0;
918         
919         if (match_criteria && *match_criteria)
920             matchStr = match_criteria;
921         else
922         {
923             if (zh->m_record_id && *zh->m_record_id)
924             {
925                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
926                                                zh->m_record_id);
927                 if (!matchStr)
928                 {
929                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
930                              pr_fname, (zint) start_offset);
931                     return ZEBRA_FAIL;
932                 }
933                 if (0 && matchStr)
934                 {
935                     WRBUF w = wrbuf_alloc();
936                     size_t i;
937                     for (i = 0; i < strlen(matchStr); i++)
938                     {
939                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
940                     }
941                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
942                     wrbuf_destroy(w);
943                 }
944             }
945         }
946         if (matchStr) 
947         {
948             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
949             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
950                                           matchStr);
951
952             
953             if (log_level_extract)
954             {
955                 WRBUF w = wrbuf_hex_str(matchStr);
956                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
957                 wrbuf_destroy(w);
958             }
959             if (rinfo)
960             {
961                 assert(*rinfo == sizeof(*sysno));
962                 memcpy(sysno, rinfo+1, sizeof(*sysno));
963             }
964        }
965     }
966
967     if (! *sysno)
968     {
969         /* new record AKA does not exist already */
970         if (action == action_delete)
971         {
972             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
973                     pr_fname, (zint) start_offset);
974             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
975             return ZEBRA_FAIL;
976         }
977         else if (action == action_a_delete)
978         {
979             if (show_progress)
980                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
981                         pr_fname, (zint) start_offset);
982             return ZEBRA_OK;
983         }
984         else if (action == action_replace)
985         {
986             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
987                          pr_fname, (zint) start_offset);
988             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
989             return ZEBRA_FAIL;
990         }
991         if (show_progress)
992             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
993                      (zint) start_offset);
994         rec = rec_new(zh->reg->records);
995
996         *sysno = rec->sysno;
997
998
999         if (stream)
1000         {
1001             all_matches_add(&extractCtrl,
1002                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1003                             *sysno);
1004         }
1005
1006
1007         recordAttr = rec_init_attr(zh->reg->zei, rec);
1008         if (extractCtrl.staticrank < 0)
1009         {
1010             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1011             extractCtrl.staticrank = 0;
1012         }
1013
1014         if (matchStr)
1015         {
1016             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1017             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1018                             sizeof(*sysno), sysno);
1019         }
1020
1021         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1022 #if FLUSH2
1023         extract_flush_record_keys2(zh, *sysno,
1024                                    zh->reg->keys, extractCtrl.staticrank,
1025                                    0, recordAttr->staticrank);
1026 #else
1027         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1028                                   extractCtrl.staticrank);
1029 #endif
1030         recordAttr->staticrank = extractCtrl.staticrank;
1031         zh->records_inserted++;
1032     } 
1033     else
1034     {
1035         /* record already exists */
1036         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1037         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1038         if (action == action_insert)
1039         {
1040             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1041                          recordType, pr_fname, (zint) start_offset);
1042             logRecord(zh);
1043             return ZEBRA_FAIL;
1044         }
1045
1046         rec = rec_get(zh->reg->records, *sysno);
1047         assert(rec);
1048
1049         if (stream)
1050         {
1051             all_matches_add(&extractCtrl,
1052                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1053                             *sysno);
1054         }
1055         
1056         recordAttr = rec_init_attr(zh->reg->zei, rec);
1057
1058         /* decrease total size */
1059         zebraExplain_recordBytesIncrement(zh->reg->zei,
1060                                            - recordAttr->recordSize);
1061
1062         zebra_rec_keys_set_buf(delkeys,
1063                                rec->info[recInfo_delKeys],
1064                                rec->size[recInfo_delKeys],
1065                                0);
1066         zebra_rec_keys_set_buf(sortKeys,
1067                                rec->info[recInfo_sortKeys],
1068                                rec->size[recInfo_sortKeys],
1069                                0);
1070
1071         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1072 #if !FLUSH2
1073         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1074                                   recordAttr->staticrank);
1075 #endif
1076         if (action == action_delete || action == action_a_delete)
1077         {
1078             /* record going to be deleted */
1079 #if FLUSH2
1080             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1081                                        delkeys, recordAttr->staticrank);
1082 #endif       
1083             if (zebra_rec_keys_empty(delkeys))
1084             {
1085                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1086                         pr_fname, (zint) start_offset);
1087                 yaz_log(YLOG_WARN, "cannot delete file above, "
1088                         "storeKeys false (3)");
1089             }
1090             else
1091             {
1092                 if (show_progress)
1093                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1094                             pr_fname, (zint) start_offset);
1095                 zh->records_deleted++;
1096                 if (matchStr)
1097                 {
1098                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1099                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1100                 }
1101                 rec_del(zh->reg->records, &rec);
1102             }
1103             zebra_rec_keys_close(delkeys);
1104             zebra_rec_keys_close(sortKeys);
1105             rec_free(&rec);
1106             logRecord(zh);
1107             return ZEBRA_OK;
1108         }
1109         else
1110         {   /* update or special_update */
1111             if (show_progress)
1112                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1113                         pr_fname, (zint) start_offset);
1114             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1115
1116 #if FLUSH2
1117             extract_flush_record_keys2(zh, *sysno,
1118                                        zh->reg->keys, extractCtrl.staticrank,
1119                                        delkeys, recordAttr->staticrank);
1120 #else
1121             extract_flush_record_keys(zh, *sysno, 1, 
1122                                       zh->reg->keys, extractCtrl.staticrank);
1123 #endif
1124             recordAttr->staticrank = extractCtrl.staticrank;
1125             zh->records_updated++;
1126         }
1127         zebra_rec_keys_close(delkeys);
1128         zebra_rec_keys_close(sortKeys);
1129     }
1130     /* update file type */
1131     xfree(rec->info[recInfo_fileType]);
1132     rec->info[recInfo_fileType] =
1133         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1134
1135     /* update filename */
1136     xfree(rec->info[recInfo_filename]);
1137     rec->info[recInfo_filename] =
1138         rec_strdup(fname, &rec->size[recInfo_filename]);
1139
1140     /* update delete keys */
1141     xfree(rec->info[recInfo_delKeys]);
1142     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1143     {
1144         zebra_rec_keys_get_buf(zh->reg->keys,
1145                                &rec->info[recInfo_delKeys],
1146                                &rec->size[recInfo_delKeys]);
1147     }
1148     else
1149     {
1150         rec->info[recInfo_delKeys] = NULL;
1151         rec->size[recInfo_delKeys] = 0;
1152     }
1153     /* update sort keys */
1154     xfree(rec->info[recInfo_sortKeys]);
1155
1156     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1157                            &rec->info[recInfo_sortKeys],
1158                            &rec->size[recInfo_sortKeys]);
1159
1160     if (stream)
1161     {
1162         recordAttr->recordSize = end_offset - start_offset;
1163         zebraExplain_recordBytesIncrement(zh->reg->zei,
1164                                           recordAttr->recordSize);
1165     }
1166
1167     /* set run-number for this record */
1168     recordAttr->runNumber =
1169         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1170
1171     /* update store data */
1172     xfree(rec->info[recInfo_storeData]);
1173
1174     /* update store data */
1175     if (zh->store_data_buf)
1176     {
1177         rec->size[recInfo_storeData] = zh->store_data_size;
1178         rec->info[recInfo_storeData] = zh->store_data_buf;
1179         zh->store_data_buf = 0;
1180         recordAttr->recordSize = zh->store_data_size;
1181     }
1182     else if (zh->m_store_data)
1183     {
1184         off_t cur_offset = stream->tellf(stream);
1185
1186         rec->size[recInfo_storeData] = recordAttr->recordSize;
1187         rec->info[recInfo_storeData] = (char *)
1188             xmalloc(recordAttr->recordSize);
1189         stream->seekf(stream, start_offset);
1190         stream->readf(stream, rec->info[recInfo_storeData],
1191                       recordAttr->recordSize);
1192         stream->seekf(stream, cur_offset);
1193     }
1194     else
1195     {
1196         rec->info[recInfo_storeData] = NULL;
1197         rec->size[recInfo_storeData] = 0;
1198     }
1199     /* update database name */
1200     xfree(rec->info[recInfo_databaseName]);
1201     rec->info[recInfo_databaseName] =
1202         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1203
1204     /* update offset */
1205     recordAttr->recordOffset = start_offset;
1206     
1207     /* commit this record */
1208     rec_put(zh->reg->records, &rec);
1209     logRecord(zh);
1210     return ZEBRA_OK;
1211 }
1212
1213 /** \brief extracts records from stream
1214     \param zh Zebra Handle
1215     \param stream stream that we read from
1216     \param action (action_insert, action_replace, action_delete, ..)
1217     \param recordType Record filter type "grs.xml", etc.
1218     \param sysno pointer to sysno if already known; NULL otherwise
1219     \param match_criteria (NULL if not already given)
1220     \param fname filename that we read from (for logging purposes only)
1221     \param recType record type
1222     \param recTypeClientData client data for record type
1223     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1224 */
1225 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1226                                        struct ZebraRecStream *stream,
1227                                        enum zebra_recctrl_action_t action,
1228                                        const char *recordType,
1229                                        zint *sysno,
1230                                        const char *match_criteria,
1231                                        const char *fname,
1232                                        RecType recType,
1233                                        void *recTypeClientData)
1234 {
1235     ZEBRA_RES res = ZEBRA_OK;
1236     while (1)
1237     {
1238         int more = 0;
1239         res = zebra_extract_record_stream(zh, stream,
1240                                           action,
1241                                           recordType,
1242                                           sysno,
1243                                           match_criteria,
1244                                           fname,
1245                                           recType, recTypeClientData, &more);
1246         if (!more)
1247         {
1248             res = ZEBRA_OK;
1249             break;
1250         }
1251         if (res != ZEBRA_OK)
1252             break;
1253         if (sysno)
1254             break;
1255     }
1256     return res;
1257 }
1258
1259 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1260 {
1261     ZebraHandle zh = (ZebraHandle) handle;
1262     struct recExtractCtrl extractCtrl;
1263
1264     if (zebraExplain_curDatabase(zh->reg->zei,
1265                                   rec->info[recInfo_databaseName]))
1266     {
1267         abort();
1268         if (zebraExplain_newDatabase(zh->reg->zei,
1269                                       rec->info[recInfo_databaseName], 0))
1270             abort();
1271     }
1272
1273     zebra_rec_keys_reset(zh->reg->keys);
1274     zebra_rec_keys_reset(zh->reg->sortKeys);
1275
1276     extractCtrl.init = extract_init;
1277     extractCtrl.tokenAdd = extract_token_add;
1278     extractCtrl.schemaAdd = extract_schema_add;
1279     extractCtrl.dh = zh->reg->dh;
1280
1281     init_extractCtrl(zh, &extractCtrl);
1282
1283     extractCtrl.flagShowRecords = 0;
1284     extractCtrl.match_criteria[0] = '\0';
1285     extractCtrl.staticrank = 0;
1286     extractCtrl.action = action_update;
1287
1288     extractCtrl.handle = handle;
1289     extractCtrl.first_record = 1;
1290     
1291     extract_set_store_data_prepare(&extractCtrl);
1292
1293     if (n)
1294         grs_extract_tree(&extractCtrl, n);
1295
1296     if (rec->size[recInfo_delKeys])
1297     {
1298         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1299         
1300         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1301
1302         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1303                                rec->size[recInfo_delKeys],
1304                                0);
1305 #if FLUSH2
1306         extract_flush_record_keys2(zh, rec->sysno, 
1307                                    zh->reg->keys, 0, delkeys, 0);
1308 #else
1309         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1310         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1311 #endif
1312         zebra_rec_keys_close(delkeys);
1313
1314         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1315                                rec->size[recInfo_sortKeys],
1316                                0);
1317
1318         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1319         zebra_rec_keys_close(sortkeys);
1320     }
1321     else
1322     {
1323 #if FLUSH2
1324         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1325 #else
1326         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1327 #endif
1328     }
1329     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1330     
1331     xfree(rec->info[recInfo_delKeys]);
1332     zebra_rec_keys_get_buf(zh->reg->keys,
1333                            &rec->info[recInfo_delKeys], 
1334                            &rec->size[recInfo_delKeys]);
1335
1336     xfree(rec->info[recInfo_sortKeys]);
1337     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1338                            &rec->info[recInfo_sortKeys],
1339                            &rec->size[recInfo_sortKeys]);
1340     return ZEBRA_OK;
1341 }
1342
1343 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1344                            const char *str, size_t slen, NMEM nmem, int level)
1345 {
1346     char keystr[200]; /* room for zints to print */
1347     char *dst_term = 0;
1348     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1349     const char *index_type;
1350     int i;
1351     const char *string_index;
1352     
1353     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1354                             0/* db */, &string_index);
1355     assert(index_type);
1356     zebra_term_untrans_iconv(zh, nmem, index_type,
1357                              &dst_term, str);
1358     *keystr = '\0';
1359     for (i = 0; i < key->len; i++)
1360     {
1361         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1362     }
1363     
1364     if (*str < CHR_BASE_CHAR)
1365     {
1366         int i;
1367         char dst_buf[200]; /* room for special chars */
1368         
1369         strcpy(dst_buf , "?");
1370         
1371         if (!strcmp(str, ""))
1372             strcpy(dst_buf, "alwaysmatches");
1373         if (!strcmp(str, FIRST_IN_FIELD_STR))
1374             strcpy(dst_buf, "firstinfield");
1375         else if (!strcmp(str, CHR_UNKNOWN))
1376             strcpy(dst_buf, "unknown");
1377         else if (!strcmp(str, CHR_SPACE))
1378             strcpy(dst_buf, "space");
1379         
1380         for (i = 0; i<slen; i++)
1381         {
1382             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1383         }
1384         yaz_log(level, "%s%s %s %s", keystr, index_type,
1385                 string_index, dst_buf);
1386         
1387     }
1388     else
1389         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1390                 string_index, dst_term);
1391 }
1392
1393 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1394                           zebra_rec_keys_t reckeys,
1395                           int level)
1396 {
1397     if (zebra_rec_keys_rewind(reckeys))
1398     {
1399         size_t slen;
1400         const char *str;
1401         struct it_key key;
1402         NMEM nmem = nmem_create();
1403
1404         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1405         {
1406             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1407             nmem_reset(nmem);
1408         }
1409         nmem_destroy(nmem);
1410     }
1411 }
1412
1413 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1414                              zebra_rec_keys_t reckeys)
1415 {
1416     ZebraExplainInfo zei = zh->reg->zei;
1417     struct ord_stat {
1418         int no;
1419         int ord;
1420         struct ord_stat *next;
1421     };
1422
1423     if (zebra_rec_keys_rewind(reckeys))
1424     {
1425         struct ord_stat *ord_list = 0;
1426         struct ord_stat *p;
1427         size_t slen;
1428         const char *str;
1429         struct it_key key_in;
1430         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1431         {
1432             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1433
1434             for (p = ord_list; p ; p = p->next)
1435                 if (p->ord == ord)
1436                 {
1437                     p->no++;
1438                     break;
1439                 }
1440             if (!p)
1441             {
1442                 p = xmalloc(sizeof(*p));
1443                 p->no = 1;
1444                 p->ord = ord;
1445                 p->next = ord_list;
1446                 ord_list = p;
1447             }
1448         }
1449
1450         p = ord_list;
1451         while (p)
1452         {
1453             struct ord_stat *p1 = p;
1454
1455             if (is_insert)
1456                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1457             else
1458                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1459             p = p->next;
1460             xfree(p1);
1461         }
1462     }
1463 }
1464
1465 static void extract_flush_record_keys(
1466     ZebraHandle zh, zint sysno, int cmd,
1467     zebra_rec_keys_t reckeys,
1468     zint staticrank)
1469 {
1470     ZebraExplainInfo zei = zh->reg->zei;
1471
1472     extract_rec_keys_adjust(zh, cmd, reckeys);
1473
1474     if (log_level_details)
1475     {
1476         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1477                 sysno, cmd ? "insert" : "delete");
1478         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1479     }
1480
1481     if (!zh->reg->key_block)
1482     {
1483         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1484         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1485         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1486         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1487     }
1488     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1489
1490 #if 0
1491     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1492     print_rec_keys(zh, reckeys);
1493 #endif
1494     if (zebra_rec_keys_rewind(reckeys))
1495     {
1496         size_t slen;
1497         const char *str;
1498         struct it_key key_in;
1499         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1500         {
1501             key_block_write(zh->reg->key_block, sysno, 
1502                             &key_in, cmd, str, slen,
1503                             staticrank, zh->m_staticrank);
1504         }
1505     }
1506 }
1507
1508 static void extract_flush_record_keys2(
1509     ZebraHandle zh, zint sysno,
1510     zebra_rec_keys_t ins_keys, zint ins_rank,
1511     zebra_rec_keys_t del_keys, zint del_rank)
1512 {
1513     ZebraExplainInfo zei = zh->reg->zei;
1514     int normal = 0;
1515     int optimized = 0;
1516
1517     if (!zh->reg->key_block)
1518     {
1519         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1520         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1521         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1522         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1523     }
1524
1525     if (ins_keys)
1526     {
1527         extract_rec_keys_adjust(zh, 1, ins_keys);
1528         if (!del_keys)
1529             zebraExplain_recordCountIncrement(zei, 1);
1530         zebra_rec_keys_rewind(ins_keys);
1531     }
1532     if (del_keys)
1533     {
1534         extract_rec_keys_adjust(zh, 0, del_keys);
1535         if (!ins_keys)
1536             zebraExplain_recordCountIncrement(zei, -1);
1537         zebra_rec_keys_rewind(del_keys);
1538     }
1539
1540     while (1)
1541     {
1542         size_t del_slen;
1543         const char *del_str;
1544         struct it_key del_key_in;
1545         int del = 0;
1546
1547         size_t ins_slen;
1548         const char *ins_str;
1549         struct it_key ins_key_in;
1550         int ins = 0;
1551
1552         if (del_keys)
1553             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1554                                       &del_key_in);
1555         if (ins_keys)
1556             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1557                                       &ins_key_in);
1558
1559         if (del && ins && ins_rank == del_rank
1560             && !key_compare(&del_key_in, &ins_key_in) 
1561             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1562         {
1563             optimized++;
1564             continue;
1565         }
1566         if (!del && !ins)
1567             break;
1568         
1569         normal++;
1570         if (del)
1571             key_block_write(zh->reg->key_block, sysno, 
1572                             &del_key_in, 0, del_str, del_slen,
1573                             del_rank, zh->m_staticrank);
1574         if (ins)
1575             key_block_write(zh->reg->key_block, sysno, 
1576                             &ins_key_in, 1, ins_str, ins_slen,
1577                             ins_rank, zh->m_staticrank);
1578     }
1579     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1580 }
1581
1582
1583 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1584                                      zebra_rec_keys_t reckeys,
1585                                      zebra_snippets *snippets)
1586 {
1587     NMEM nmem = nmem_create();
1588     if (zebra_rec_keys_rewind(reckeys)) 
1589     {
1590         const char *str;
1591         size_t slen;
1592         struct it_key key;
1593         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1594         {
1595             char *dst_term = 0;
1596             int ord;
1597             zint seqno;
1598             const char *index_type;
1599
1600             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1601             seqno = key.mem[key.len-1];
1602             ord = CAST_ZINT_TO_INT(key.mem[0]);
1603             
1604             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1605                                     0/* db */, 0 /* string_index */);
1606             assert(index_type);
1607             zebra_term_untrans_iconv(zh, nmem, index_type,
1608                                      &dst_term, str);
1609             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1610             nmem_reset(nmem);
1611         }
1612     }
1613     nmem_destroy(nmem);
1614     return ZEBRA_OK;
1615 }
1616
1617 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1618 {
1619     yaz_log(YLOG_LOG, "print_rec_keys");
1620     if (zebra_rec_keys_rewind(reckeys))
1621     {
1622         const char *str;
1623         size_t slen;
1624         struct it_key key;
1625         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1626         {
1627             char dst_buf[IT_MAX_WORD];
1628             zint seqno;
1629             const char *index_type;
1630             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1631             const char *db = 0;
1632             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1633
1634             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1635             
1636             seqno = key.mem[key.len-1];
1637             
1638             zebra_term_untrans(zh, index_type, dst_buf, str);
1639             
1640             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1641                     " term=%s", ord, seqno, dst_buf); 
1642         }
1643     }
1644 }
1645
1646 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1647                                      const char *str, int length)
1648 {
1649     struct it_key key;
1650     ZebraHandle zh = p->extractCtrl->handle;
1651     ZebraExplainInfo zei = zh->reg->zei;
1652     int ch, i;
1653
1654     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1655     if (ch < 0)
1656         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1657
1658     i = 0;
1659     key.mem[i++] = ch;
1660     key.mem[i++] = p->record_id;
1661     key.mem[i++] = p->section_id;
1662
1663     if (zh->m_segment_indexing)
1664         key.mem[i++] = p->segment;
1665     key.mem[i++] = p->seqno;
1666     key.len = i;
1667
1668     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1669 }
1670
1671 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1672 {
1673     struct it_key key;
1674     ZebraHandle zh = p->extractCtrl->handle;
1675     ZebraExplainInfo zei = zh->reg->zei;
1676     int ch;
1677     zinfo_index_category_t cat = zinfo_index_category_sort;
1678
1679     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1680     if (ch < 0)
1681         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1682     key.len = 3;
1683     key.mem[0] = ch;
1684     key.mem[1] = p->record_id;
1685     key.mem[2] = p->section_id;
1686
1687     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1688 }
1689
1690 static void extract_add_staticrank_string(RecWord *p,
1691                                           const char *str, int length)
1692 {
1693     char valz[40];
1694     struct recExtractCtrl *ctrl = p->extractCtrl;
1695
1696     if (length > sizeof(valz)-1)
1697         length = sizeof(valz)-1;
1698
1699     memcpy(valz, str, length);
1700     valz[length] = '\0';
1701     ctrl->staticrank = atozint(valz);
1702 }
1703
1704 static void extract_add_string(RecWord *p, zebra_map_t zm,
1705                                const char *string, int length)
1706 {
1707     assert(length > 0);
1708
1709     if (!p->index_name)
1710         return;
1711     if (log_level_details)
1712     {
1713
1714         WRBUF w = wrbuf_alloc();
1715         
1716         wrbuf_write_escaped(w, string, length);
1717         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1718         wrbuf_destroy(w);
1719     }
1720     if (zebra_maps_is_index(zm))
1721     {
1722         extract_add_index_string(p, zinfo_index_category_index,
1723                                  string, length);
1724         if (zebra_maps_is_alwaysmatches(zm))
1725         {
1726             RecWord word;
1727             memcpy(&word, p, sizeof(word));
1728
1729             word.seqno = 1;
1730             extract_add_index_string(
1731                 &word, zinfo_index_category_alwaysmatches, "", 0);
1732         }
1733     }
1734     else if (zebra_maps_is_sort(zm))
1735     {
1736         extract_add_sort_string(p, string, length);
1737     }
1738     else if (zebra_maps_is_staticrank(zm))
1739     {
1740         extract_add_staticrank_string(p, string, length);
1741     }
1742 }
1743
1744 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1745 {
1746     const char *b = p->term_buf;
1747     int remain = p->term_len;
1748     int first = 1;
1749     const char **map = 0;
1750     
1751     if (remain > 0)
1752         map = zebra_maps_input(zm, &b, remain, 0);
1753
1754     while (map)
1755     {
1756         char buf[IT_MAX_WORD+1];
1757         int i, remain;
1758
1759         /* Skip spaces */
1760         while (map && *map && **map == *CHR_SPACE)
1761         {
1762             remain = p->term_len - (b - p->term_buf);
1763             if (remain > 0)
1764                 map = zebra_maps_input(zm, &b, remain, 0);
1765             else
1766                 map = 0;
1767         }
1768         if (!map)
1769             break;
1770         i = 0;
1771         while (map && *map && **map != *CHR_SPACE)
1772         {
1773             const char *cp = *map;
1774
1775             while (i < IT_MAX_WORD && *cp)
1776                 buf[i++] = *(cp++);
1777             remain = p->term_len - (b - p->term_buf);
1778             if (remain > 0)
1779                 map = zebra_maps_input(zm, &b, remain, 0);
1780             else
1781                 map = 0;
1782         }
1783         if (!i)
1784             return;
1785
1786         if (first)
1787         {   
1788             first = 0;
1789             if (zebra_maps_is_first_in_field(zm))
1790             {
1791                 /* first in field marker */
1792                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1793                 p->seqno++;
1794             }
1795         }
1796         extract_add_string(p, zm, buf, i);
1797         p->seqno++;
1798     }
1799 }
1800
1801 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1802 {
1803     const char *b = p->term_buf;
1804     char buf[IT_MAX_WORD+1];
1805     const char **map = 0;
1806     int i = 0, remain = p->term_len;
1807
1808     if (remain > 0)
1809         map = zebra_maps_input(zm, &b, remain, 1);
1810
1811     while (remain > 0 && i < IT_MAX_WORD)
1812     {
1813         while (map && *map && **map == *CHR_SPACE)
1814         {
1815             remain = p->term_len - (b - p->term_buf);
1816
1817             if (remain > 0)
1818             {
1819                 int first = i ? 0 : 1;  /* first position */
1820                 map = zebra_maps_input(zm, &b, remain, first);
1821             }
1822             else
1823                 map = 0;
1824         }
1825         if (!map)
1826             break;
1827
1828         if (i && i < IT_MAX_WORD)
1829             buf[i++] = *CHR_SPACE;
1830         while (map && *map && **map != *CHR_SPACE)
1831         {
1832             const char *cp = *map;
1833
1834             if (**map == *CHR_CUT)
1835             {
1836                 i = 0;
1837             }
1838             else
1839             {
1840                 if (i >= IT_MAX_WORD)
1841                     break;
1842                 while (i < IT_MAX_WORD && *cp)
1843                     buf[i++] = *(cp++);
1844             }
1845             remain = p->term_len  - (b - p->term_buf);
1846             if (remain > 0)
1847             {
1848                 map = zebra_maps_input(zm, &b, remain, 0);
1849             }
1850             else
1851                 map = 0;
1852         }
1853     }
1854     if (!i)
1855         return;
1856     extract_add_string(p, zm, buf, i);
1857 }
1858
1859 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1860 {
1861     const char *res_buf = 0;
1862     size_t res_len = 0;
1863
1864     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1865     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1866     {
1867         extract_add_string(p, zm, res_buf, res_len);
1868         p->seqno++;
1869     }
1870 }
1871
1872
1873 /** \brief top-level indexing handler for recctrl system
1874     \param p token data to be indexed
1875
1876     Call sequence:
1877     extract_token_add
1878     extract_add_{in}_complete / extract_add_icu
1879     extract_add_string
1880     
1881     extract_add_index_string
1882     or
1883     extract_add_sort_string
1884     or
1885     extract_add_staticrank_string
1886     
1887 */
1888 static void extract_token_add(RecWord *p)
1889 {
1890     ZebraHandle zh = p->extractCtrl->handle;
1891     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1892     WRBUF wrbuf;
1893
1894     if (log_level_details)
1895     {
1896         yaz_log(log_level_details, "extract_token_add "
1897                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1898                 p->index_type, p->index_name, 
1899                 p->seqno, p->term_len, p->term_buf);
1900     }
1901     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1902     {
1903         p->term_buf = wrbuf_buf(wrbuf);
1904         p->term_len = wrbuf_len(wrbuf);
1905     }
1906     if (zebra_maps_is_icu(zm))
1907     {
1908         extract_add_icu(p, zm);
1909     }
1910     else
1911     {
1912         if (zebra_maps_is_complete(zm))
1913             extract_add_complete_field(p, zm);
1914         else
1915             extract_add_incomplete_field(p, zm);
1916     }
1917 }
1918
1919 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1920                                       void *buf, size_t sz)
1921 {
1922     ZebraHandle zh = (ZebraHandle) p->handle;
1923
1924     xfree(zh->store_data_buf);
1925     zh->store_data_buf = 0;
1926     zh->store_data_size = 0;
1927     if (buf && sz)
1928     {
1929         zh->store_data_buf = xmalloc(sz);
1930         zh->store_data_size = sz;
1931         memcpy(zh->store_data_buf, buf, sz);
1932     }
1933 }
1934
1935 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1936 {
1937     ZebraHandle zh = (ZebraHandle) p->handle;
1938     xfree(zh->store_data_buf);
1939     zh->store_data_buf = 0;
1940     zh->store_data_size = 0;
1941     p->setStoreData = extract_set_store_data_cb;
1942 }
1943
1944 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1945 {
1946     ZebraHandle zh = (ZebraHandle) p->handle;
1947     zebraExplain_addSchema(zh->reg->zei, oid);
1948 }
1949
1950 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1951                              int cmd, zebra_rec_keys_t reckeys)
1952 {
1953 #if 0
1954     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1955             cmd, sysno);
1956     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1957 #endif
1958
1959     if (zebra_rec_keys_rewind(reckeys))
1960     {
1961         zebra_sort_index_t si = zh->reg->sort_index;
1962         size_t slen;
1963         const char *str;
1964         struct it_key key_in;
1965
1966         NMEM nmem = nmem_create();
1967         struct sort_add_ent {
1968             int ord;
1969             int cmd;
1970             struct sort_add_ent *next;
1971             WRBUF wrbuf;
1972             zint sysno;
1973             zint section_id;
1974         };
1975         struct sort_add_ent *sort_ent_list = 0;
1976
1977         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1978         {
1979             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1980             zint filter_sysno = key_in.mem[1];
1981             zint section_id = key_in.mem[2];
1982
1983             struct sort_add_ent **e = &sort_ent_list;
1984             for (; *e; e = &(*e)->next)
1985                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1986                     break;
1987             if (!*e)
1988             {
1989                 *e = nmem_malloc(nmem, sizeof(**e));
1990                 (*e)->next = 0;
1991                 (*e)->wrbuf = wrbuf_alloc();
1992                 (*e)->ord = ord;
1993                 (*e)->cmd = cmd;
1994                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1995                 (*e)->section_id = section_id;
1996             }
1997             
1998             wrbuf_write((*e)->wrbuf, str, slen);
1999             wrbuf_putc((*e)->wrbuf, '\0');
2000         }
2001         if (sort_ent_list)
2002         {
2003             zint last_sysno = 0;
2004             struct sort_add_ent *e = sort_ent_list;
2005             for (; e; e = e->next)
2006             {
2007                 if (last_sysno != e->sysno)
2008                 {
2009                     zebra_sort_sysno(si, e->sysno);
2010                     last_sysno = e->sysno;
2011                 }
2012                 zebra_sort_type(si, e->ord);
2013                 if (e->cmd == 1)
2014                     zebra_sort_add(si, e->section_id, e->wrbuf);
2015                 else
2016                     zebra_sort_delete(si, e->section_id);
2017                 wrbuf_destroy(e->wrbuf);
2018             }
2019         }
2020         nmem_destroy(nmem);
2021     }
2022 }
2023
2024 /*
2025  * Local variables:
2026  * c-basic-offset: 4
2027  * c-file-style: "Stroustrup"
2028  * indent-tabs-mode: nil
2029  * End:
2030  * vim: shiftwidth=4 tabstop=8 expandtab
2031  */
2032