Fix comp for FLUSH2==0
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #include <stdio.h>
25 #include <assert.h>
26 #include <ctype.h>
27 #ifdef WIN32
28 #include <io.h>
29 #endif
30 #if HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <fcntl.h>
34
35
36 #include "index.h"
37 #include "orddict.h"
38 #include <direntz.h>
39 #include <charmap.h>
40 #include <yaz/snprintf.h>
41
/* Log levels used by this file. They start at 0 and are filled in by
   zebra_init_log_level() the first time it is called. */
static int log_level_extract = 0;
static int log_level_details = 0;
static int log_level_initialized = 0;

/* FLUSH2 is 1 if we eliminate identical delete/insert keys */
/* eventually the 0-case code will be removed */
#define FLUSH2 1

/* Exactly one of the two flush routines is declared, depending on FLUSH2 */
#if FLUSH2
static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
                                       zebra_rec_keys_t ins_keys,
                                       zint ins_rank,
                                       zebra_rec_keys_t del_keys,
                                       zint del_rank);
#else
static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
                                      int cmd,
                                      zebra_rec_keys_t reckeys,
                                      zint staticrank);
#endif
62
63 static void zebra_init_log_level(void)
64 {
65     if (!log_level_initialized)
66     {
67         log_level_initialized = 1;
68
69         log_level_extract = yaz_log_module_level("extract");
70         log_level_details = yaz_log_module_level("indexdetails");
71     }
72 }
73
74 static WRBUF wrbuf_hex_str(const char *cstr)
75 {
76     size_t i;
77     WRBUF w = wrbuf_alloc();
78     for (i = 0; cstr[i]; i++)
79     {
80         if (cstr[i] < ' ' || cstr[i] > 126)
81             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
82         else
83             wrbuf_putc(w, cstr[i]);
84     }
85     return w;
86 }
87
88
89 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
90                                     int cmd, zebra_rec_keys_t skp);
91 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
92 static void extract_token_add(RecWord *p);
93
94 static void check_log_limit(ZebraHandle zh)
95 {
96     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
97     {
98         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
99                 zh->m_file_verbose_limit);
100     }
101 }
102
103 static void logRecord(ZebraHandle zh)
104 {
105     check_log_limit(zh);
106     ++zh->records_processed;
107     if (!(zh->records_processed % 1000))
108     {
109         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
110                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
111                 zh->records_processed, zh->records_inserted, 
112                 zh->records_updated, zh->records_deleted);
113     }
114 }
115
116 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
117 {
118     ctrl->flagShowRecords = !zh->m_flag_rw;
119 }
120
121
122 static void extract_add_index_string(RecWord *p, 
123                                       zinfo_index_category_t cat,
124                                       const char *str, int length);
125
126 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
127
128 static void extract_init(struct recExtractCtrl *p, RecWord *w)
129 {
130     w->seqno = 1;
131     w->index_name = "any";
132     w->index_type = "w";
133     w->extractCtrl = p;
134     w->record_id = 0;
135     w->section_id = 0;
136     w->segment = 0;
137 }
138
/** \brief context for the snippet callbacks

    extract_snippet stores a pointer to one of these in extractCtrl.handle;
    the snippet_* callbacks read it back from p->extractCtrl->handle.
*/
struct snip_rec_info {
    ZebraHandle zh;            /* handle used to look up maps and ordinals */
    zebra_snippets *snippets;  /* destination of zebra_snippets_appendn */
};
143
144
/** \brief adds one snippet covering a whole (complete) field
    \param p word with the field text in term_buf / term_len
    \param ord index ordinal passed on to zebra_snippets_appendn
    \param zm map used to normalize the input

    Walks the field with zebra_maps_input and, when anything was
    accumulated, appends a single snippet holding the original text
    between the recorded start and the end of the last mapped non-space
    input. Nothing is appended if the map is not an index map.
*/
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;

    const char *b = p->term_buf;
    /* NOTE(review): buf is only written in this function, never read;
       i is what drives the loop limits and the final "anything found" test */
    char buf[IT_MAX_WORD+1];
    const char **map = 0;
    int i = 0, remain = p->term_len;
    const char *start = b;
    const char *last = 0;

    /* presumably zebra_maps_input advances b past the consumed input;
       remain is recomputed from b below on that assumption */
    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 1);

    while (remain > 0 && i < IT_MAX_WORD)
    {
        /* consume a run of input that maps to space */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);

            if (i == 0)
                start = b;  /* set to first non-ws area */
            if (remain > 0)
            {
                int first = i ? 0 : 1;  /* first position */

                map = zebra_maps_input(zm, &b, remain, first);
            }
            else
                map = 0;
        }
        if (!map)
            break;

        /* separate this run from the previous one by a single space */
        if (i && i < IT_MAX_WORD)
            buf[i++] = *CHR_SPACE;
        /* consume a run of input that does not map to space */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (**map == *CHR_CUT)
            {
                /* cut marker: forget what has been accumulated so far */
                i = 0;
            }
            else
            {
                if (i >= IT_MAX_WORD)
                    break;
                while (i < IT_MAX_WORD && *cp)
                    buf[i++] = *(cp++);
            }
            /* remember where the original text ends after this mapping */
            last = b;
            remain = p->term_len  - (b - p->term_buf);
            if (remain > 0)
            {
                map = zebra_maps_input(zm, &b, remain, 0);
            }
            else
                map = 0;
        }
    }
    /* nothing accumulated: no snippet */
    if (!i)
        return;
    /* the snippet is taken from the original text, not from buf */
    if (last && start != last && zebra_maps_is_index(zm))
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               start, last - start);
}
213
/** \brief adds snippets word by word for an incomplete (tokenized) field
    \param p word with the field text in term_buf / term_len; its seqno
    is advanced once per token found
    \param ord index ordinal passed on to zebra_snippets_appendn
    \param zm map used to split the input

    Alternates between runs that map to space and runs that do not. For
    index maps each non-empty run of original text is appended as a
    snippet: space runs with the fifth argument set to 1, tokens with 0
    (presumably a whitespace flag - confirm against zebra_snippets_appendn).
*/
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *b = p->term_buf;
    int remain = p->term_len;
    int first = 1;
    const char **map = 0;
    const char *start = b;
    const char *last = b;

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 0);

    while (map)
    {
        /* NOTE(review): buf is only written, never read, in this function */
        char buf[IT_MAX_WORD+1];
        /* NOTE(review): this remain shadows the function-level remain */
        int i, remain;

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        if (!map)
            break;
        /* emit the skipped space run, if any */
        if (start != last && zebra_maps_is_index(zm))
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   start, last - start);

        }
        start = last;

        /* collect one token: everything up to the next space mapping */
        i = 0;
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            while (i < IT_MAX_WORD && *cp)
                buf[i++] = *(cp++);
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        /* an empty token ends the whole field */
        if (!i)
            return;

        if (first)
        {   
            first = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }
        /* emit the token's original text */
        if (start != last && zebra_maps_is_index(zm))
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   start, last - start);
        start = last;
        p->seqno++;
    }

}
286
287 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
288 {
289     struct snip_rec_info *h = p->extractCtrl->handle;
290
291     const char *res_buf = 0;
292     size_t res_len = 0;
293
294     const char *display_buf = 0;
295     size_t display_len = 0;
296
297     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
298     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
299                                    &display_buf, &display_len))
300     {
301         if (zebra_maps_is_index(zm))
302             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
303                                    display_buf, display_len);
304         p->seqno++;
305     }
306 }
307
308 static void snippet_token_add(RecWord *p)
309 {
310     struct snip_rec_info *h = p->extractCtrl->handle;
311     ZebraHandle zh = h->zh;
312     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
313
314     if (zm)
315     {
316         ZebraExplainInfo zei = zh->reg->zei;
317         int ch = zebraExplain_lookup_attr_str(
318             zei, zinfo_index_category_index, p->index_type, p->index_name);
319
320         if (zebra_maps_is_icu(zm))
321             snippet_add_icu(p, ch, zm);
322         else
323         {
324             if (zebra_maps_is_complete(zm))
325                 snippet_add_complete_field(p, ch, zm);
326             else
327                 snippet_add_incomplete_field(p, ch, zm);
328         }
329     }
330 }
331
332 static void snippet_schema_add(
333     struct recExtractCtrl *p, Odr_oid *oid)
334 {
335
336 }
337
338 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
339                      struct ZebraRecStream *stream,
340                      RecType rt, void *recTypeClientData)
341 {
342     struct recExtractCtrl extractCtrl;
343     struct snip_rec_info info;
344     int r;
345
346     extractCtrl.stream = stream;
347     extractCtrl.first_record = 1;
348     extractCtrl.init = extract_init;
349     extractCtrl.tokenAdd = snippet_token_add;
350     extractCtrl.schemaAdd = snippet_schema_add;
351     assert(zh->reg);
352     assert(zh->reg->dh);
353
354     extractCtrl.dh = zh->reg->dh;
355     
356     info.zh = zh;
357     info.snippets = sn;
358     extractCtrl.handle = &info;
359     extractCtrl.match_criteria[0] = '\0';
360     extractCtrl.staticrank = 0;
361     extractCtrl.action = action_insert;
362     
363     init_extractCtrl(zh, &extractCtrl);
364
365     extractCtrl.setStoreData = 0;
366
367     r = (*rt->extract)(recTypeClientData, &extractCtrl);
368
369 }
370
371 static void searchRecordKey(ZebraHandle zh,
372                             zebra_rec_keys_t reckeys,
373                             const char *index_name,
374                             const char **ws, int ws_length)
375 {
376     int i;
377     int ch = -1;
378     zinfo_index_category_t cat = zinfo_index_category_index;
379
380     for (i = 0; i<ws_length; i++)
381         ws[i] = NULL;
382
383     if (ch < 0)
384         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
385     if (ch < 0)
386         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
387     if (ch < 0)
388         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
389
390     if (ch < 0)
391         return ;
392
393     if (zebra_rec_keys_rewind(reckeys))
394     {
395         zint startSeq = -1;
396         const char *str;
397         size_t slen;
398         struct it_key key;
399         zint seqno;
400         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
401         {
402             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
403
404             seqno = key.mem[key.len-1];
405             
406             if (key.mem[0] == ch)
407             {
408                 zint woff;
409                 
410                 if (startSeq == -1)
411                     startSeq = seqno;
412                 woff = seqno - startSeq;
413                 if (woff >= 0 && woff < ws_length)
414                     ws[woff] = str;
415             }
416         }
417     }
418 }
419
420 #define FILE_MATCH_BLANK "\t "
421
422 static char *get_match_from_spec(ZebraHandle zh,
423                           zebra_rec_keys_t reckeys,
424                           const char *fname, const char *spec)
425 {
426     static char dstBuf[2048];      /* static here ??? */
427     char *dst = dstBuf;
428     const char *s = spec;
429
430     while (1)
431     {
432         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
433             ;
434         if (!*s)
435             break;
436         if (*s == '(')
437         {
438             const char *ws[32];
439             char attset_str[64], attname_str[64];
440             int i;
441             int first = 1;
442             
443             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
444                 ;
445             for (i = 0; *s && *s != ',' && *s != ')' && 
446                      !strchr(FILE_MATCH_BLANK, *s); s++)
447                 if (i+1 < sizeof(attset_str))
448                     attset_str[i++] = *s;
449             attset_str[i] = '\0';
450             
451             for (; strchr(FILE_MATCH_BLANK, *s); s++)
452                 ;
453             if (*s != ',')
454                 strcpy(attname_str, attset_str);
455             else
456             {
457                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
458                     ;
459                 for (i = 0; *s && *s != ')' && 
460                          !strchr(FILE_MATCH_BLANK, *s); s++)
461                     if (i+1 < sizeof(attname_str))
462                         attname_str[i++] = *s;
463                 attname_str[i] = '\0';
464             }
465             if (*s != ')')
466             {
467                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
468                       spec, zh->m_group ? zh->m_group : "none");
469                 return NULL;
470             }
471             s++;
472
473             searchRecordKey(zh, reckeys, attname_str, ws, 32);
474             if (0) /* for debugging */
475             {   
476                 for (i = 0; i<32; i++)
477                 {
478                     if (ws[i])
479                     {
480                         WRBUF w = wrbuf_hex_str(ws[i]);
481                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
482                         wrbuf_destroy(w);
483                     }
484                 }
485             }
486
487             for (i = 0; i<32; i++)
488                 if (ws[i])
489                 {
490                     if (first)
491                     {
492                         *dst++ = ' ';
493                         first = 0;
494                     }
495                     strcpy(dst, ws[i]);
496                     dst += strlen(ws[i]);
497                 }
498             if (first)
499             {
500                 yaz_log(YLOG_WARN, "Record didn't contain match"
501                       " fields in (%s,%s)", attset_str, attname_str);
502                 return NULL;
503             }
504         }
505         else if (*s == '$')
506         {
507             int spec_len;
508             char special[64];
509             const char *spec_src = NULL;
510             const char *s1 = ++s;
511             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
512                 s1++;
513
514             spec_len = s1 - s;
515             if (spec_len > sizeof(special)-1)
516                 spec_len = sizeof(special)-1;
517             memcpy(special, s, spec_len);
518             special[spec_len] = '\0';
519             s = s1;
520
521             if (!strcmp(special, "group"))
522                 spec_src = zh->m_group;
523             else if (!strcmp(special, "database"))
524                 spec_src = zh->basenames[0];
525             else if (!strcmp(special, "filename")) {
526                 spec_src = fname;
527             }
528             else if (!strcmp(special, "type"))
529                 spec_src = zh->m_record_type;
530             else 
531                 spec_src = NULL;
532             if (spec_src)
533             {
534                 strcpy(dst, spec_src);
535                 dst += strlen(spec_src);
536             }
537         }
538         else if (*s == '\"' || *s == '\'')
539         {
540             int stopMarker = *s++;
541             char tmpString[64];
542             int i = 0;
543
544             while (*s && *s != stopMarker)
545             {
546                 if (i+1 < sizeof(tmpString))
547                     tmpString[i++] = *s++;
548             }
549             if (*s)
550                 s++;
551             tmpString[i] = '\0';
552             strcpy(dst, tmpString);
553             dst += strlen(tmpString);
554         }
555         else
556         {
557             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
558                   spec, zh->m_group ? zh->m_group : "none");
559             return NULL;
560         }
561         *dst++ = 1;
562     }
563     if (dst == dstBuf)
564     {
565         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
566               fname, zh->m_group ? zh->m_group : "none");
567         return NULL;
568     }
569     *dst = '\0';
570
571     if (0) /* for debugging */
572     {
573         WRBUF w = wrbuf_hex_str(dstBuf);
574         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
575         wrbuf_destroy(w);
576     }
577
578     return dstBuf;
579 }
580
/* Record logging context.
   NOTE(review): not referenced in the part of the file visible here;
   the field meanings below are taken from the names only - confirm. */
struct recordLogInfo {
    const char *fname;          /* presumably the file being processed */
    int recordOffset;           /* presumably the record's offset in it */
    struct recordGroup *rGroup;
};
586
587 /** \brief add the always-matches index entry and map to real record ID
588     \param ctrl record control
589     \param record_id custom record ID
590     \param sysno system record ID
591     
592     This function serves two purposes.. It adds the always matches
593     entry and makes a pointer from the custom record ID (if defined)
594     back to the system record ID (sysno)
595     See zebra_recid_to_sysno .
596   */
597 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
598                             zint sysno)
599 {
600     RecWord word;
601     extract_init(ctrl, &word);
602     word.record_id = record_id;
603     /* we use the seqno as placeholder for a way to get back to
604        record database from _ALLRECORDS.. This is used if a custom
605        RECORD was defined */
606     word.seqno = sysno;
607     word.index_name = "_ALLRECORDS";
608     word.index_type = "w";
609
610     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
611                               "", 0);
612 }
613
614 /* forward declaration */
615 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
616                                        struct ZebraRecStream *stream,
617                                        enum zebra_recctrl_action_t action,
618                                        const char *recordType,
619                                        zint *sysno,
620                                        const char *match_criteria,
621                                        const char *fname,
622                                        RecType recType,
623                                        void *recTypeClientData);
624
625
626 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
627                              enum zebra_recctrl_action_t action)
628 {
629     ZEBRA_RES r = ZEBRA_OK;
630     int i, fd;
631     char gprefix[128];
632     char ext[128];
633     char ext_res[128];
634     struct file_read_info *fi = 0;
635     const char *original_record_type = 0;
636     RecType recType;
637     void *recTypeClientData;
638     struct ZebraRecStream stream, *streamp;
639
640     zebra_init_log_level();
641
642     if (!zh->m_group || !*zh->m_group)
643         *gprefix = '\0';
644     else
645         sprintf(gprefix, "%s.", zh->m_group);
646     
647     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
648
649     /* determine file extension */
650     *ext = '\0';
651     for (i = strlen(fname); --i >= 0; )
652         if (fname[i] == '/')
653             break;
654         else if (fname[i] == '.')
655         {
656             strcpy(ext, fname+i+1);
657             break;
658         }
659     /* determine file type - depending on extension */
660     original_record_type = zh->m_record_type;
661     if (!zh->m_record_type)
662     {
663         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
664         zh->m_record_type = res_get(zh->res, ext_res);
665     }
666     if (!zh->m_record_type)
667     {
668         check_log_limit(zh);
669         if (zh->records_processed + zh->records_skipped
670             < zh->m_file_verbose_limit)
671             yaz_log(YLOG_LOG, "? %s", fname);
672         zh->records_skipped++;
673         return 0;
674     }
675     /* determine match criteria */
676     if (!zh->m_record_id)
677     {
678         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
679         zh->m_record_id = res_get(zh->res, ext_res);
680     }
681
682     if (!(recType =
683           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
684                           &recTypeClientData)))
685     {
686         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
687         return ZEBRA_FAIL;
688     }
689
690     switch(recType->version)
691     {
692     case 0:
693         break;
694     default:
695         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
696     }
697     if (sysno && (action == action_delete || action == action_a_delete))
698     {
699         streamp = 0;
700         fi = 0;
701     }
702     else
703     {
704         char full_rep[1024];
705
706         if (zh->path_reg && !yaz_is_abspath(fname))
707         {
708             strcpy(full_rep, zh->path_reg);
709             strcat(full_rep, "/");
710             strcat(full_rep, fname);
711         }
712         else
713             strcpy(full_rep, fname);
714         
715         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
716         {
717             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
718             zh->m_record_type = original_record_type;
719             return ZEBRA_FAIL;
720         }
721         streamp = &stream;
722         zebra_create_stream_fd(streamp, fd, 0);
723     }
724     r = zebra_extract_records_stream(zh, streamp,
725                                      action,
726                                      zh->m_record_type,
727                                      sysno,
728                                      0, /*match_criteria */
729                                      fname,
730                                      recType, recTypeClientData);
731     if (streamp)
732         stream.destroy(streamp);
733     zh->m_record_type = original_record_type;
734     return r;
735 }
736
737 /*
738   If sysno is provided, then it's used to identify the reocord.
739   If not, and match_criteria is provided, then sysno is guessed
740   If not, and a record is provided, then sysno is got from there
741   
742  */
743
744 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
745                                       const char *buf, size_t buf_size,
746                                       enum zebra_recctrl_action_t action,
747                                       const char *recordType,
748                                       zint *sysno,
749                                       const char *match_criteria,
750                                       const char *fname)
751 {
752     struct ZebraRecStream stream;
753     ZEBRA_RES res;
754     void *clientData;
755     RecType recType = 0;
756
757     if (recordType && *recordType)
758     {
759         yaz_log(log_level_extract,
760                 "Record type explicitly specified: %s", recordType);
761         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
762                                   &clientData);
763     } 
764     else
765     {
766         if (!(zh->m_record_type))
767         {
768             yaz_log(YLOG_WARN, "No such record type defined");
769             return ZEBRA_FAIL;
770         }
771         yaz_log(log_level_extract, "Get record type from rgroup: %s",
772                 zh->m_record_type);
773         recType = recType_byName(zh->reg->recTypes, zh->res,
774                                   zh->m_record_type, &clientData);
775         recordType = zh->m_record_type;
776     }
777     
778     if (!recType)
779     {
780         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
781         return ZEBRA_FAIL;
782     }
783
784     zebra_create_stream_mem(&stream, buf, buf_size);
785
786     res = zebra_extract_records_stream(zh, &stream,
787                                        action,
788                                        recordType,
789                                        sysno,
790                                        match_criteria,
791                                        fname,
792                                        recType, clientData);
793     stream.destroy(&stream);
794     return res;
795 }
796
797 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
798                                              struct ZebraRecStream *stream,
799                                              enum zebra_recctrl_action_t action,
800                                              const char *recordType,
801                                              zint *sysno,
802                                              const char *match_criteria,
803                                              const char *fname,
804                                              RecType recType,
805                                              void *recTypeClientData,
806                                              int *more)
807     
808 {
809     zint sysno0 = 0;
810     RecordAttr *recordAttr;
811     struct recExtractCtrl extractCtrl;
812     int r;
813     const char *matchStr = 0;
814     Record rec;
815     off_t start_offset = 0, end_offset = 0;
816     const char *pr_fname = fname;  /* filename to print .. */
817     int show_progress = zh->records_processed + zh->records_skipped 
818         < zh->m_file_verbose_limit ? 1:0;
819
820     zebra_init_log_level();
821
822     if (!pr_fname)
823         pr_fname = "<no file>";  /* make it printable if file is omitted */
824
825     zebra_rec_keys_reset(zh->reg->keys);
826     zebra_rec_keys_reset(zh->reg->sortKeys);
827
828     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
829     {
830         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
831                                       zh->m_explain_database))
832             return ZEBRA_FAIL;
833     }
834
835     if (stream)
836     {
837         off_t null_offset = 0;
838         extractCtrl.stream = stream;
839
840         start_offset = stream->tellf(stream);
841
842         extractCtrl.first_record = start_offset ? 0 : 1;
843         
844         stream->endf(stream, &null_offset);;
845
846         extractCtrl.init = extract_init;
847         extractCtrl.tokenAdd = extract_token_add;
848         extractCtrl.schemaAdd = extract_schema_add;
849         extractCtrl.dh = zh->reg->dh;
850         extractCtrl.handle = zh;
851         extractCtrl.match_criteria[0] = '\0';
852         extractCtrl.staticrank = 0;
853         extractCtrl.action = action;
854
855         init_extractCtrl(zh, &extractCtrl);
856
857         extract_set_store_data_prepare(&extractCtrl);
858         
859         r = (*recType->extract)(recTypeClientData, &extractCtrl);
860
861         if (action == action_update)
862         {
863             action = extractCtrl.action;
864         }
865         
866         switch (r)
867         {
868         case RECCTRL_EXTRACT_EOF:
869             return ZEBRA_FAIL;
870         case RECCTRL_EXTRACT_ERROR_GENERIC:
871             /* error occured during extraction ... */
872             yaz_log(YLOG_WARN, "extract error: generic");
873             return ZEBRA_FAIL;
874         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
875             /* error occured during extraction ... */
876             yaz_log(YLOG_WARN, "extract error: no such filter");
877             return ZEBRA_FAIL;
878         case RECCTRL_EXTRACT_SKIP:
879             if (show_progress)
880                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
881                          recordType, pr_fname, (zint) start_offset);
882             *more = 1;
883             
884             end_offset = stream->endf(stream, 0);
885             if (end_offset)
886                 stream->seekf(stream, end_offset);
887
888             return ZEBRA_OK;
889         case RECCTRL_EXTRACT_OK:
890             break;
891         default:
892             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
893             return ZEBRA_FAIL;
894         }
895         end_offset = stream->endf(stream, 0);
896         if (end_offset)
897             stream->seekf(stream, end_offset);
898         else
899             end_offset = stream->tellf(stream);
900
901         if (extractCtrl.match_criteria[0])
902             match_criteria = extractCtrl.match_criteria;
903     }
904
905     *more = 1;
906
907     if (zh->m_flag_rw == 0)
908     {
909         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
910                 pr_fname, (zint) start_offset);
911         /* test mode .. Do not perform match */
912         return ZEBRA_OK;
913     }
914         
915     if (!sysno)
916     {
917         sysno = &sysno0;
918         
919         if (match_criteria && *match_criteria)
920             matchStr = match_criteria;
921         else
922         {
923             if (zh->m_record_id && *zh->m_record_id)
924             {
925                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
926                                                zh->m_record_id);
927                 if (!matchStr)
928                 {
929                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
930                              pr_fname, (zint) start_offset);
931                     return ZEBRA_FAIL;
932                 }
933                 if (0 && matchStr)
934                 {
935                     WRBUF w = wrbuf_alloc();
936                     size_t i;
937                     for (i = 0; i < strlen(matchStr); i++)
938                     {
939                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
940                     }
941                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
942                     wrbuf_destroy(w);
943                 }
944             }
945         }
946         if (matchStr) 
947         {
948             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
949             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
950                                           matchStr);
951
952             
953             if (log_level_extract)
954             {
955                 WRBUF w = wrbuf_hex_str(matchStr);
956                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
957                 wrbuf_destroy(w);
958             }
959             if (rinfo)
960             {
961                 assert(*rinfo == sizeof(*sysno));
962                 memcpy(sysno, rinfo+1, sizeof(*sysno));
963             }
964        }
965     }
966
967     if (! *sysno)
968     {
969         /* new record AKA does not exist already */
970         if (action == action_delete)
971         {
972             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
973                     pr_fname, (zint) start_offset);
974             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
975             return ZEBRA_FAIL;
976         }
977         else if (action == action_a_delete)
978         {
979             if (show_progress)
980                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
981                         pr_fname, (zint) start_offset);
982             return ZEBRA_OK;
983         }
984         else if (action == action_replace)
985         {
986             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
987                          pr_fname, (zint) start_offset);
988             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
989             return ZEBRA_FAIL;
990         }
991         if (show_progress)
992             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
993                      (zint) start_offset);
994         rec = rec_new(zh->reg->records);
995
996         *sysno = rec->sysno;
997
998
999         if (stream)
1000         {
1001             all_matches_add(&extractCtrl,
1002                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1003                             *sysno);
1004         }
1005
1006
1007         recordAttr = rec_init_attr(zh->reg->zei, rec);
1008         if (extractCtrl.staticrank < 0)
1009         {
1010             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1011             extractCtrl.staticrank = 0;
1012         }
1013
1014         if (matchStr)
1015         {
1016             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1017             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1018                             sizeof(*sysno), sysno);
1019         }
1020
1021         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1022 #if FLUSH2
1023         extract_flush_record_keys2(zh, *sysno,
1024                                    zh->reg->keys, extractCtrl.staticrank,
1025                                    0, recordAttr->staticrank);
1026 #else
1027         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1028                                   extractCtrl.staticrank);
1029 #endif
1030         recordAttr->staticrank = extractCtrl.staticrank;
1031         zh->records_inserted++;
1032     } 
1033     else
1034     {
1035         /* record already exists */
1036         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1037         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1038         if (action == action_insert)
1039         {
1040             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1041                          recordType, pr_fname, (zint) start_offset);
1042             logRecord(zh);
1043             return ZEBRA_FAIL;
1044         }
1045
1046         rec = rec_get(zh->reg->records, *sysno);
1047         assert(rec);
1048
1049         if (stream)
1050         {
1051             all_matches_add(&extractCtrl,
1052                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1053                             *sysno);
1054         }
1055         
1056         recordAttr = rec_init_attr(zh->reg->zei, rec);
1057
1058         /* decrease total size */
1059         zebraExplain_recordBytesIncrement(zh->reg->zei,
1060                                            - recordAttr->recordSize);
1061
1062         zebra_rec_keys_set_buf(delkeys,
1063                                rec->info[recInfo_delKeys],
1064                                rec->size[recInfo_delKeys],
1065                                0);
1066         zebra_rec_keys_set_buf(sortKeys,
1067                                rec->info[recInfo_sortKeys],
1068                                rec->size[recInfo_sortKeys],
1069                                0);
1070
1071         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1072 #if !FLUSH2
1073         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1074                                   recordAttr->staticrank);
1075 #endif
1076         if (action == action_delete || action == action_a_delete)
1077         {
1078             /* record going to be deleted */
1079 #if FLUSH2
1080             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1081                                        delkeys, recordAttr->staticrank);
1082 #endif       
1083             if (zebra_rec_keys_empty(delkeys))
1084             {
1085                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1086                         pr_fname, (zint) start_offset);
1087                 yaz_log(YLOG_WARN, "cannot delete file above, "
1088                         "storeKeys false (3)");
1089             }
1090             else
1091             {
1092                 if (show_progress)
1093                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1094                             pr_fname, (zint) start_offset);
1095                 zh->records_deleted++;
1096                 if (matchStr)
1097                 {
1098                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1099                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1100                 }
1101                 rec_del(zh->reg->records, &rec);
1102             }
1103             zebra_rec_keys_close(delkeys);
1104             zebra_rec_keys_close(sortKeys);
1105             rec_free(&rec);
1106             logRecord(zh);
1107             return ZEBRA_OK;
1108         }
1109         else
1110         {   /* update or special_update */
1111             if (show_progress)
1112                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1113                         pr_fname, (zint) start_offset);
1114             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1115
1116 #if FLUSH2
1117             extract_flush_record_keys2(zh, *sysno,
1118                                        zh->reg->keys, extractCtrl.staticrank,
1119                                        delkeys, recordAttr->staticrank);
1120 #else
1121             extract_flush_record_keys(zh, *sysno, 1, 
1122                                       zh->reg->keys, extractCtrl.staticrank);
1123 #endif
1124             recordAttr->staticrank = extractCtrl.staticrank;
1125             zh->records_updated++;
1126         }
1127         zebra_rec_keys_close(delkeys);
1128         zebra_rec_keys_close(sortKeys);
1129     }
1130     /* update file type */
1131     xfree(rec->info[recInfo_fileType]);
1132     rec->info[recInfo_fileType] =
1133         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1134
1135     /* update filename */
1136     xfree(rec->info[recInfo_filename]);
1137     rec->info[recInfo_filename] =
1138         rec_strdup(fname, &rec->size[recInfo_filename]);
1139
1140     /* update delete keys */
1141     xfree(rec->info[recInfo_delKeys]);
1142     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1143     {
1144         zebra_rec_keys_get_buf(zh->reg->keys,
1145                                &rec->info[recInfo_delKeys],
1146                                &rec->size[recInfo_delKeys]);
1147     }
1148     else
1149     {
1150         rec->info[recInfo_delKeys] = NULL;
1151         rec->size[recInfo_delKeys] = 0;
1152     }
1153     /* update sort keys */
1154     xfree(rec->info[recInfo_sortKeys]);
1155
1156     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1157                            &rec->info[recInfo_sortKeys],
1158                            &rec->size[recInfo_sortKeys]);
1159
1160     if (stream)
1161     {
1162         recordAttr->recordSize = end_offset - start_offset;
1163         zebraExplain_recordBytesIncrement(zh->reg->zei,
1164                                           recordAttr->recordSize);
1165     }
1166
1167     /* set run-number for this record */
1168     recordAttr->runNumber =
1169         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1170
1171     /* update store data */
1172     xfree(rec->info[recInfo_storeData]);
1173
1174     /* update store data */
1175     if (zh->store_data_buf)
1176     {
1177         rec->size[recInfo_storeData] = zh->store_data_size;
1178         rec->info[recInfo_storeData] = zh->store_data_buf;
1179         zh->store_data_buf = 0;
1180         recordAttr->recordSize = zh->store_data_size;
1181     }
1182     else if (zh->m_store_data)
1183     {
1184         off_t cur_offset = stream->tellf(stream);
1185
1186         rec->size[recInfo_storeData] = recordAttr->recordSize;
1187         rec->info[recInfo_storeData] = (char *)
1188             xmalloc(recordAttr->recordSize);
1189         stream->seekf(stream, start_offset);
1190         stream->readf(stream, rec->info[recInfo_storeData],
1191                       recordAttr->recordSize);
1192         stream->seekf(stream, cur_offset);
1193     }
1194     else
1195     {
1196         rec->info[recInfo_storeData] = NULL;
1197         rec->size[recInfo_storeData] = 0;
1198     }
1199     /* update database name */
1200     xfree(rec->info[recInfo_databaseName]);
1201     rec->info[recInfo_databaseName] =
1202         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1203
1204     /* update offset */
1205     recordAttr->recordOffset = start_offset;
1206     
1207     /* commit this record */
1208     rec_put(zh->reg->records, &rec);
1209     logRecord(zh);
1210     return ZEBRA_OK;
1211 }
1212
1213 /** \brief extracts records from stream
1214     \param zh Zebra Handle
1215     \param stream stream that we read from
1216     \param action (action_insert, action_replace, action_delete, ..)
1217     \param recordType Record filter type "grs.xml", etc.
1218     \param sysno pointer to sysno if already known; NULL otherwise
1219     \param match_criteria (NULL if not already given)
1220     \param fname filename that we read from (for logging purposes only)
1221     \param recType record type
1222     \param recTypeClientData client data for record type
1223     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1224 */
1225 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1226                                        struct ZebraRecStream *stream,
1227                                        enum zebra_recctrl_action_t action,
1228                                        const char *recordType,
1229                                        zint *sysno,
1230                                        const char *match_criteria,
1231                                        const char *fname,
1232                                        RecType recType,
1233                                        void *recTypeClientData)
1234 {
1235     ZEBRA_RES res = ZEBRA_OK;
1236     while (1)
1237     {
1238         int more = 0;
1239         res = zebra_extract_record_stream(zh, stream,
1240                                           action,
1241                                           recordType,
1242                                           sysno,
1243                                           match_criteria,
1244                                           fname,
1245                                           recType, recTypeClientData, &more);
1246         if (!more)
1247         {
1248             res = ZEBRA_OK;
1249             break;
1250         }
1251         if (res != ZEBRA_OK)
1252             break;
1253         if (sysno)
1254             break;
1255     }
1256     return res;
1257 }
1258
1259 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1260 {
1261     ZebraHandle zh = (ZebraHandle) handle;
1262     struct recExtractCtrl extractCtrl;
1263
1264     if (zebraExplain_curDatabase(zh->reg->zei,
1265                                   rec->info[recInfo_databaseName]))
1266     {
1267         abort();
1268         if (zebraExplain_newDatabase(zh->reg->zei,
1269                                       rec->info[recInfo_databaseName], 0))
1270             abort();
1271     }
1272
1273     zebra_rec_keys_reset(zh->reg->keys);
1274     zebra_rec_keys_reset(zh->reg->sortKeys);
1275
1276     extractCtrl.init = extract_init;
1277     extractCtrl.tokenAdd = extract_token_add;
1278     extractCtrl.schemaAdd = extract_schema_add;
1279     extractCtrl.dh = zh->reg->dh;
1280
1281     init_extractCtrl(zh, &extractCtrl);
1282
1283     extractCtrl.flagShowRecords = 0;
1284     extractCtrl.match_criteria[0] = '\0';
1285     extractCtrl.staticrank = 0;
1286     extractCtrl.action = action_update;
1287
1288     extractCtrl.handle = handle;
1289     extractCtrl.first_record = 1;
1290     
1291     extract_set_store_data_prepare(&extractCtrl);
1292
1293     if (n)
1294         grs_extract_tree(&extractCtrl, n);
1295
1296     if (rec->size[recInfo_delKeys])
1297     {
1298         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1299         
1300         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1301
1302         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1303                                rec->size[recInfo_delKeys],
1304                                0);
1305 #if FLUSH2
1306         extract_flush_record_keys2(zh, rec->sysno, 
1307                                    zh->reg->keys, 0, delkeys, 0);
1308 #else
1309         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1310         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1311 #endif
1312         zebra_rec_keys_close(delkeys);
1313
1314         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1315                                rec->size[recInfo_sortKeys],
1316                                0);
1317
1318         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1319         zebra_rec_keys_close(sortkeys);
1320     }
1321     else
1322     {
1323 #if FLUSH2
1324         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1325 #else
1326         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1327 #endif
1328     }
1329     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1330     
1331     xfree(rec->info[recInfo_delKeys]);
1332     zebra_rec_keys_get_buf(zh->reg->keys,
1333                            &rec->info[recInfo_delKeys], 
1334                            &rec->size[recInfo_delKeys]);
1335
1336     xfree(rec->info[recInfo_sortKeys]);
1337     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1338                            &rec->info[recInfo_sortKeys],
1339                            &rec->size[recInfo_sortKeys]);
1340     return ZEBRA_OK;
1341 }
1342
1343 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1344                            const char *str, size_t slen, NMEM nmem, int level)
1345 {
1346     char keystr[200]; /* room for zints to print */
1347     char *dst_term = 0;
1348     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1349     const char *index_type;
1350     int i;
1351     const char *string_index;
1352     
1353     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1354                             0/* db */, &string_index);
1355     assert(index_type);
1356     zebra_term_untrans_iconv(zh, nmem, index_type,
1357                              &dst_term, str);
1358     *keystr = '\0';
1359     for (i = 0; i < key->len; i++)
1360     {
1361         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1362     }
1363     
1364     if (*str < CHR_BASE_CHAR)
1365     {
1366         int i;
1367         char dst_buf[200]; /* room for special chars */
1368         
1369         strcpy(dst_buf , "?");
1370         
1371         if (!strcmp(str, ""))
1372             strcpy(dst_buf, "alwaysmatches");
1373         if (!strcmp(str, FIRST_IN_FIELD_STR))
1374             strcpy(dst_buf, "firstinfield");
1375         else if (!strcmp(str, CHR_UNKNOWN))
1376             strcpy(dst_buf, "unknown");
1377         else if (!strcmp(str, CHR_SPACE))
1378             strcpy(dst_buf, "space");
1379         
1380         for (i = 0; i<slen; i++)
1381         {
1382             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1383         }
1384         yaz_log(level, "%s%s %s %s", keystr, index_type,
1385                 string_index, dst_buf);
1386         
1387     }
1388     else
1389         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1390                 string_index, dst_term);
1391 }
1392
1393 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1394                           zebra_rec_keys_t reckeys,
1395                           int level)
1396 {
1397     if (zebra_rec_keys_rewind(reckeys))
1398     {
1399         size_t slen;
1400         const char *str;
1401         struct it_key key;
1402         NMEM nmem = nmem_create();
1403
1404         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1405         {
1406             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1407             nmem_reset(nmem);
1408         }
1409         nmem_destroy(nmem);
1410     }
1411 }
1412
1413 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1414                              zebra_rec_keys_t reckeys)
1415 {
1416     ZebraExplainInfo zei = zh->reg->zei;
1417     struct ord_stat {
1418         int no;
1419         int ord;
1420         struct ord_stat *next;
1421     };
1422
1423     if (zebra_rec_keys_rewind(reckeys))
1424     {
1425         struct ord_stat *ord_list = 0;
1426         struct ord_stat *p;
1427         size_t slen;
1428         const char *str;
1429         struct it_key key_in;
1430         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1431         {
1432             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1433
1434             for (p = ord_list; p ; p = p->next)
1435                 if (p->ord == ord)
1436                 {
1437                     p->no++;
1438                     break;
1439                 }
1440             if (!p)
1441             {
1442                 p = xmalloc(sizeof(*p));
1443                 p->no = 1;
1444                 p->ord = ord;
1445                 p->next = ord_list;
1446                 ord_list = p;
1447             }
1448         }
1449
1450         p = ord_list;
1451         while (p)
1452         {
1453             struct ord_stat *p1 = p;
1454
1455             if (is_insert)
1456                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1457             else
1458                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1459             p = p->next;
1460             xfree(p1);
1461         }
1462     }
1463 }
1464
1465 #if FLUSH2
1466 static void extract_flush_record_keys2(
1467     ZebraHandle zh, zint sysno,
1468     zebra_rec_keys_t ins_keys, zint ins_rank,
1469     zebra_rec_keys_t del_keys, zint del_rank)
1470 {
1471     ZebraExplainInfo zei = zh->reg->zei;
1472     int normal = 0;
1473     int optimized = 0;
1474
1475     if (!zh->reg->key_block)
1476     {
1477         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1478         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1479         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1480         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1481     }
1482
1483     if (ins_keys)
1484     {
1485         extract_rec_keys_adjust(zh, 1, ins_keys);
1486         if (!del_keys)
1487             zebraExplain_recordCountIncrement(zei, 1);
1488         zebra_rec_keys_rewind(ins_keys);
1489     }
1490     if (del_keys)
1491     {
1492         extract_rec_keys_adjust(zh, 0, del_keys);
1493         if (!ins_keys)
1494             zebraExplain_recordCountIncrement(zei, -1);
1495         zebra_rec_keys_rewind(del_keys);
1496     }
1497
1498     while (1)
1499     {
1500         size_t del_slen;
1501         const char *del_str;
1502         struct it_key del_key_in;
1503         int del = 0;
1504
1505         size_t ins_slen;
1506         const char *ins_str;
1507         struct it_key ins_key_in;
1508         int ins = 0;
1509
1510         if (del_keys)
1511             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1512                                       &del_key_in);
1513         if (ins_keys)
1514             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1515                                       &ins_key_in);
1516
1517         if (del && ins && ins_rank == del_rank
1518             && !key_compare(&del_key_in, &ins_key_in) 
1519             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1520         {
1521             optimized++;
1522             continue;
1523         }
1524         if (!del && !ins)
1525             break;
1526         
1527         normal++;
1528         if (del)
1529             key_block_write(zh->reg->key_block, sysno, 
1530                             &del_key_in, 0, del_str, del_slen,
1531                             del_rank, zh->m_staticrank);
1532         if (ins)
1533             key_block_write(zh->reg->key_block, sysno, 
1534                             &ins_key_in, 1, ins_str, ins_slen,
1535                             ins_rank, zh->m_staticrank);
1536     }
1537     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1538 }
1539 #else
1540 static void extract_flush_record_keys(
1541     ZebraHandle zh, zint sysno, int cmd,
1542     zebra_rec_keys_t reckeys,
1543     zint staticrank)
1544 {
1545     ZebraExplainInfo zei = zh->reg->zei;
1546
1547     extract_rec_keys_adjust(zh, cmd, reckeys);
1548
1549     if (log_level_details)
1550     {
1551         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1552                 sysno, cmd ? "insert" : "delete");
1553         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1554     }
1555
1556     if (!zh->reg->key_block)
1557     {
1558         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1559         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1560         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1561         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1562     }
1563     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1564
1565 #if 0
1566     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1567     print_rec_keys(zh, reckeys);
1568 #endif
1569     if (zebra_rec_keys_rewind(reckeys))
1570     {
1571         size_t slen;
1572         const char *str;
1573         struct it_key key_in;
1574         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1575         {
1576             key_block_write(zh->reg->key_block, sysno, 
1577                             &key_in, cmd, str, slen,
1578                             staticrank, zh->m_staticrank);
1579         }
1580     }
1581 }
1582 #endif
1583
1584 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1585                                      zebra_rec_keys_t reckeys,
1586                                      zebra_snippets *snippets)
1587 {
1588     NMEM nmem = nmem_create();
1589     if (zebra_rec_keys_rewind(reckeys)) 
1590     {
1591         const char *str;
1592         size_t slen;
1593         struct it_key key;
1594         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1595         {
1596             char *dst_term = 0;
1597             int ord;
1598             zint seqno;
1599             const char *index_type;
1600
1601             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1602             seqno = key.mem[key.len-1];
1603             ord = CAST_ZINT_TO_INT(key.mem[0]);
1604             
1605             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1606                                     0/* db */, 0 /* string_index */);
1607             assert(index_type);
1608             zebra_term_untrans_iconv(zh, nmem, index_type,
1609                                      &dst_term, str);
1610             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1611             nmem_reset(nmem);
1612         }
1613     }
1614     nmem_destroy(nmem);
1615     return ZEBRA_OK;
1616 }
1617
1618 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1619 {
1620     yaz_log(YLOG_LOG, "print_rec_keys");
1621     if (zebra_rec_keys_rewind(reckeys))
1622     {
1623         const char *str;
1624         size_t slen;
1625         struct it_key key;
1626         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1627         {
1628             char dst_buf[IT_MAX_WORD];
1629             zint seqno;
1630             const char *index_type;
1631             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1632             const char *db = 0;
1633             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1634
1635             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1636             
1637             seqno = key.mem[key.len-1];
1638             
1639             zebra_term_untrans(zh, index_type, dst_buf, str);
1640             
1641             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1642                     " term=%s", ord, seqno, dst_buf); 
1643         }
1644     }
1645 }
1646
1647 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1648                                      const char *str, int length)
1649 {
1650     struct it_key key;
1651     ZebraHandle zh = p->extractCtrl->handle;
1652     ZebraExplainInfo zei = zh->reg->zei;
1653     int ch, i;
1654
1655     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1656     if (ch < 0)
1657         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1658
1659     i = 0;
1660     key.mem[i++] = ch;
1661     key.mem[i++] = p->record_id;
1662     key.mem[i++] = p->section_id;
1663
1664     if (zh->m_segment_indexing)
1665         key.mem[i++] = p->segment;
1666     key.mem[i++] = p->seqno;
1667     key.len = i;
1668
1669     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1670 }
1671
1672 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1673 {
1674     struct it_key key;
1675     ZebraHandle zh = p->extractCtrl->handle;
1676     ZebraExplainInfo zei = zh->reg->zei;
1677     int ch;
1678     zinfo_index_category_t cat = zinfo_index_category_sort;
1679
1680     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1681     if (ch < 0)
1682         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1683     key.len = 3;
1684     key.mem[0] = ch;
1685     key.mem[1] = p->record_id;
1686     key.mem[2] = p->section_id;
1687
1688     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1689 }
1690
1691 static void extract_add_staticrank_string(RecWord *p,
1692                                           const char *str, int length)
1693 {
1694     char valz[40];
1695     struct recExtractCtrl *ctrl = p->extractCtrl;
1696
1697     if (length > sizeof(valz)-1)
1698         length = sizeof(valz)-1;
1699
1700     memcpy(valz, str, length);
1701     valz[length] = '\0';
1702     ctrl->staticrank = atozint(valz);
1703 }
1704
1705 static void extract_add_string(RecWord *p, zebra_map_t zm,
1706                                const char *string, int length)
1707 {
1708     assert(length > 0);
1709
1710     if (!p->index_name)
1711         return;
1712     if (log_level_details)
1713     {
1714
1715         WRBUF w = wrbuf_alloc();
1716         
1717         wrbuf_write_escaped(w, string, length);
1718         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1719         wrbuf_destroy(w);
1720     }
1721     if (zebra_maps_is_index(zm))
1722     {
1723         extract_add_index_string(p, zinfo_index_category_index,
1724                                  string, length);
1725         if (zebra_maps_is_alwaysmatches(zm))
1726         {
1727             RecWord word;
1728             memcpy(&word, p, sizeof(word));
1729
1730             word.seqno = 1;
1731             extract_add_index_string(
1732                 &word, zinfo_index_category_alwaysmatches, "", 0);
1733         }
1734     }
1735     else if (zebra_maps_is_sort(zm))
1736     {
1737         extract_add_sort_string(p, string, length);
1738     }
1739     else if (zebra_maps_is_staticrank(zm))
1740     {
1741         extract_add_staticrank_string(p, string, length);
1742     }
1743 }
1744
1745 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1746 {
1747     const char *b = p->term_buf;
1748     int remain = p->term_len;
1749     int first = 1;
1750     const char **map = 0;
1751     
1752     if (remain > 0)
1753         map = zebra_maps_input(zm, &b, remain, 0);
1754
1755     while (map)
1756     {
1757         char buf[IT_MAX_WORD+1];
1758         int i, remain;
1759
1760         /* Skip spaces */
1761         while (map && *map && **map == *CHR_SPACE)
1762         {
1763             remain = p->term_len - (b - p->term_buf);
1764             if (remain > 0)
1765                 map = zebra_maps_input(zm, &b, remain, 0);
1766             else
1767                 map = 0;
1768         }
1769         if (!map)
1770             break;
1771         i = 0;
1772         while (map && *map && **map != *CHR_SPACE)
1773         {
1774             const char *cp = *map;
1775
1776             while (i < IT_MAX_WORD && *cp)
1777                 buf[i++] = *(cp++);
1778             remain = p->term_len - (b - p->term_buf);
1779             if (remain > 0)
1780                 map = zebra_maps_input(zm, &b, remain, 0);
1781             else
1782                 map = 0;
1783         }
1784         if (!i)
1785             return;
1786
1787         if (first)
1788         {   
1789             first = 0;
1790             if (zebra_maps_is_first_in_field(zm))
1791             {
1792                 /* first in field marker */
1793                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1794                 p->seqno++;
1795             }
1796         }
1797         extract_add_string(p, zm, buf, i);
1798         p->seqno++;
1799     }
1800 }
1801
1802 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1803 {
1804     const char *b = p->term_buf;
1805     char buf[IT_MAX_WORD+1];
1806     const char **map = 0;
1807     int i = 0, remain = p->term_len;
1808
1809     if (remain > 0)
1810         map = zebra_maps_input(zm, &b, remain, 1);
1811
1812     while (remain > 0 && i < IT_MAX_WORD)
1813     {
1814         while (map && *map && **map == *CHR_SPACE)
1815         {
1816             remain = p->term_len - (b - p->term_buf);
1817
1818             if (remain > 0)
1819             {
1820                 int first = i ? 0 : 1;  /* first position */
1821                 map = zebra_maps_input(zm, &b, remain, first);
1822             }
1823             else
1824                 map = 0;
1825         }
1826         if (!map)
1827             break;
1828
1829         if (i && i < IT_MAX_WORD)
1830             buf[i++] = *CHR_SPACE;
1831         while (map && *map && **map != *CHR_SPACE)
1832         {
1833             const char *cp = *map;
1834
1835             if (**map == *CHR_CUT)
1836             {
1837                 i = 0;
1838             }
1839             else
1840             {
1841                 if (i >= IT_MAX_WORD)
1842                     break;
1843                 while (i < IT_MAX_WORD && *cp)
1844                     buf[i++] = *(cp++);
1845             }
1846             remain = p->term_len  - (b - p->term_buf);
1847             if (remain > 0)
1848             {
1849                 map = zebra_maps_input(zm, &b, remain, 0);
1850             }
1851             else
1852                 map = 0;
1853         }
1854     }
1855     if (!i)
1856         return;
1857     extract_add_string(p, zm, buf, i);
1858 }
1859
1860 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1861 {
1862     const char *res_buf = 0;
1863     size_t res_len = 0;
1864
1865     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1866     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1867     {
1868         extract_add_string(p, zm, res_buf, res_len);
1869         p->seqno++;
1870     }
1871 }
1872
1873
1874 /** \brief top-level indexing handler for recctrl system
1875     \param p token data to be indexed
1876
1877     Call sequence:
1878     extract_token_add
1879     extract_add_{in}_complete / extract_add_icu
1880     extract_add_string
1881     
1882     extract_add_index_string
1883     or
1884     extract_add_sort_string
1885     or
1886     extract_add_staticrank_string
1887     
1888 */
1889 static void extract_token_add(RecWord *p)
1890 {
1891     ZebraHandle zh = p->extractCtrl->handle;
1892     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1893     WRBUF wrbuf;
1894
1895     if (log_level_details)
1896     {
1897         yaz_log(log_level_details, "extract_token_add "
1898                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1899                 p->index_type, p->index_name, 
1900                 p->seqno, p->term_len, p->term_buf);
1901     }
1902     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1903     {
1904         p->term_buf = wrbuf_buf(wrbuf);
1905         p->term_len = wrbuf_len(wrbuf);
1906     }
1907     if (zebra_maps_is_icu(zm))
1908     {
1909         extract_add_icu(p, zm);
1910     }
1911     else
1912     {
1913         if (zebra_maps_is_complete(zm))
1914             extract_add_complete_field(p, zm);
1915         else
1916             extract_add_incomplete_field(p, zm);
1917     }
1918 }
1919
1920 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1921                                       void *buf, size_t sz)
1922 {
1923     ZebraHandle zh = (ZebraHandle) p->handle;
1924
1925     xfree(zh->store_data_buf);
1926     zh->store_data_buf = 0;
1927     zh->store_data_size = 0;
1928     if (buf && sz)
1929     {
1930         zh->store_data_buf = xmalloc(sz);
1931         zh->store_data_size = sz;
1932         memcpy(zh->store_data_buf, buf, sz);
1933     }
1934 }
1935
1936 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1937 {
1938     ZebraHandle zh = (ZebraHandle) p->handle;
1939     xfree(zh->store_data_buf);
1940     zh->store_data_buf = 0;
1941     zh->store_data_size = 0;
1942     p->setStoreData = extract_set_store_data_cb;
1943 }
1944
1945 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1946 {
1947     ZebraHandle zh = (ZebraHandle) p->handle;
1948     zebraExplain_addSchema(zh->reg->zei, oid);
1949 }
1950
1951 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1952                              int cmd, zebra_rec_keys_t reckeys)
1953 {
1954 #if 0
1955     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1956             cmd, sysno);
1957     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1958 #endif
1959
1960     if (zebra_rec_keys_rewind(reckeys))
1961     {
1962         zebra_sort_index_t si = zh->reg->sort_index;
1963         size_t slen;
1964         const char *str;
1965         struct it_key key_in;
1966
1967         NMEM nmem = nmem_create();
1968         struct sort_add_ent {
1969             int ord;
1970             int cmd;
1971             struct sort_add_ent *next;
1972             WRBUF wrbuf;
1973             zint sysno;
1974             zint section_id;
1975         };
1976         struct sort_add_ent *sort_ent_list = 0;
1977
1978         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1979         {
1980             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1981             zint filter_sysno = key_in.mem[1];
1982             zint section_id = key_in.mem[2];
1983
1984             struct sort_add_ent **e = &sort_ent_list;
1985             for (; *e; e = &(*e)->next)
1986                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1987                     break;
1988             if (!*e)
1989             {
1990                 *e = nmem_malloc(nmem, sizeof(**e));
1991                 (*e)->next = 0;
1992                 (*e)->wrbuf = wrbuf_alloc();
1993                 (*e)->ord = ord;
1994                 (*e)->cmd = cmd;
1995                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1996                 (*e)->section_id = section_id;
1997             }
1998             
1999             wrbuf_write((*e)->wrbuf, str, slen);
2000             wrbuf_putc((*e)->wrbuf, '\0');
2001         }
2002         if (sort_ent_list)
2003         {
2004             zint last_sysno = 0;
2005             struct sort_add_ent *e = sort_ent_list;
2006             for (; e; e = e->next)
2007             {
2008                 if (last_sysno != e->sysno)
2009                 {
2010                     zebra_sort_sysno(si, e->sysno);
2011                     last_sysno = e->sysno;
2012                 }
2013                 zebra_sort_type(si, e->ord);
2014                 if (e->cmd == 1)
2015                     zebra_sort_add(si, e->section_id, e->wrbuf);
2016                 else
2017                     zebra_sort_delete(si, e->section_id);
2018                 wrbuf_destroy(e->wrbuf);
2019             }
2020         }
2021         nmem_destroy(nmem);
2022     }
2023 }
2024
2025 /*
2026  * Local variables:
2027  * c-basic-offset: 4
2028  * c-file-style: "Stroustrup"
2029  * indent-tabs-mode: nil
2030  * End:
2031  * vim: shiftwidth=4 tabstop=8 expandtab
2032  */
2033