Happy new year.
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extracts tokens for indexing and sorting
22 */
23
24 #include <stdio.h>
25 #include <assert.h>
26 #include <ctype.h>
27 #ifdef WIN32
28 #include <io.h>
29 #endif
30 #if HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <fcntl.h>
34
35
36 #include "index.h"
37 #include "orddict.h"
38 #include <direntz.h>
39 #include <charmap.h>
40 #include <yaz/snprintf.h>
41
42 static int log_level_extract = 0;
43 static int log_level_details = 0;
44 static int log_level_initialized = 0;
45
46 /* 1 if we eliminate identical delete/insert keys */
47 /* eventually the 0-case code will be removed */
48 #define FLUSH2 1
49
50 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
51                                 zebra_rec_keys_t ins_keys,
52                                 zint ins_rank,
53                                 zebra_rec_keys_t del_keys,
54                                 zint del_rank);
55
56 static void zebra_init_log_level(void)
57 {
58     if (!log_level_initialized)
59     {
60         log_level_initialized = 1;
61
62         log_level_extract = yaz_log_module_level("extract");
63         log_level_details = yaz_log_module_level("indexdetails");
64     }
65 }
66
67 static WRBUF wrbuf_hex_str(const char *cstr)
68 {
69     size_t i;
70     WRBUF w = wrbuf_alloc();
71     for (i = 0; cstr[i]; i++)
72     {
73         if (cstr[i] < ' ' || cstr[i] > 126)
74             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
75         else
76             wrbuf_putc(w, cstr[i]);
77     }
78     return w;
79 }
80
81
82 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
83                                     int cmd, zebra_rec_keys_t skp);
84 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
85 static void extract_token_add(RecWord *p);
86
87 static void check_log_limit(ZebraHandle zh)
88 {
89     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
90     {
91         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
92                 zh->m_file_verbose_limit);
93     }
94 }
95
96 static void logRecord(ZebraHandle zh)
97 {
98     check_log_limit(zh);
99     ++zh->records_processed;
100     if (!(zh->records_processed % 1000))
101     {
102         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
103                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
104                 zh->records_processed, zh->records_inserted, 
105                 zh->records_updated, zh->records_deleted);
106     }
107 }
108
109 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
110 {
111     ctrl->flagShowRecords = !zh->m_flag_rw;
112 }
113
114
115 static void extract_add_index_string(RecWord *p, 
116                                       zinfo_index_category_t cat,
117                                       const char *str, int length);
118
119 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
120
121 static void extract_init(struct recExtractCtrl *p, RecWord *w)
122 {
123     w->seqno = 1;
124     w->index_name = "any";
125     w->index_type = "w";
126     w->extractCtrl = p;
127     w->record_id = 0;
128     w->section_id = 0;
129     w->segment = 0;
130 }
131
132 struct snip_rec_info {
133     ZebraHandle zh;
134     zebra_snippets *snippets;
135 };
136
137
138 static void snippet_add_complete_field(RecWord *p, int ord,
139                                        zebra_map_t zm)
140 {
141     struct snip_rec_info *h = p->extractCtrl->handle;
142
143     const char *b = p->term_buf;
144     char buf[IT_MAX_WORD+1];
145     const char **map = 0;
146     int i = 0, remain = p->term_len;
147     const char *start = b;
148     const char *last = 0;
149
150     if (remain > 0)
151         map = zebra_maps_input(zm, &b, remain, 1);
152
153     while (remain > 0 && i < IT_MAX_WORD)
154     {
155         while (map && *map && **map == *CHR_SPACE)
156         {
157             remain = p->term_len - (b - p->term_buf);
158
159             if (i == 0)
160                 start = b;  /* set to first non-ws area */
161             if (remain > 0)
162             {
163                 int first = i ? 0 : 1;  /* first position */
164
165                 map = zebra_maps_input(zm, &b, remain, first);
166             }
167             else
168                 map = 0;
169         }
170         if (!map)
171             break;
172
173         if (i && i < IT_MAX_WORD)
174             buf[i++] = *CHR_SPACE;
175         while (map && *map && **map != *CHR_SPACE)
176         {
177             const char *cp = *map;
178
179             if (**map == *CHR_CUT)
180             {
181                 i = 0;
182             }
183             else
184             {
185                 if (i >= IT_MAX_WORD)
186                     break;
187                 while (i < IT_MAX_WORD && *cp)
188                     buf[i++] = *(cp++);
189             }
190             last = b;
191             remain = p->term_len  - (b - p->term_buf);
192             if (remain > 0)
193             {
194                 map = zebra_maps_input(zm, &b, remain, 0);
195             }
196             else
197                 map = 0;
198         }
199     }
200     if (!i)
201         return;
202     if (last && start != last && zebra_maps_is_index(zm))
203         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
204                                start, last - start);
205 }
206
207 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
208 {
209     struct snip_rec_info *h = p->extractCtrl->handle;
210     const char *b = p->term_buf;
211     int remain = p->term_len;
212     int first = 1;
213     const char **map = 0;
214     const char *start = b;
215     const char *last = b;
216
217     if (remain > 0)
218         map = zebra_maps_input(zm, &b, remain, 0);
219
220     while (map)
221     {
222         char buf[IT_MAX_WORD+1];
223         int i, remain;
224
225         /* Skip spaces */
226         while (map && *map && **map == *CHR_SPACE)
227         {
228             remain = p->term_len - (b - p->term_buf);
229             last = b;
230             if (remain > 0)
231                 map = zebra_maps_input(zm, &b, remain, 0);
232             else
233                 map = 0;
234         }
235         if (!map)
236             break;
237         if (start != last && zebra_maps_is_index(zm))
238         {
239             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
240                                    start, last - start);
241
242         }
243         start = last;
244
245         i = 0;
246         while (map && *map && **map != *CHR_SPACE)
247         {
248             const char *cp = *map;
249
250             while (i < IT_MAX_WORD && *cp)
251                 buf[i++] = *(cp++);
252             remain = p->term_len - (b - p->term_buf);
253             last = b;
254             if (remain > 0)
255                 map = zebra_maps_input(zm, &b, remain, 0);
256             else
257                 map = 0;
258         }
259         if (!i)
260             return;
261
262         if (first)
263         {   
264             first = 0;
265             if (zebra_maps_is_first_in_field(zm))
266             {
267                 /* first in field marker */
268                 p->seqno++;
269             }
270         }
271         if (start != last && zebra_maps_is_index(zm))
272             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
273                                    start, last - start);
274         start = last;
275         p->seqno++;
276     }
277
278 }
279
280 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
281 {
282     struct snip_rec_info *h = p->extractCtrl->handle;
283
284     const char *res_buf = 0;
285     size_t res_len = 0;
286
287     const char *display_buf = 0;
288     size_t display_len = 0;
289
290     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
291     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
292                                    &display_buf, &display_len))
293     {
294         if (zebra_maps_is_index(zm))
295             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
296                                    display_buf, display_len);
297         p->seqno++;
298     }
299 }
300
301 static void snippet_token_add(RecWord *p)
302 {
303     struct snip_rec_info *h = p->extractCtrl->handle;
304     ZebraHandle zh = h->zh;
305     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
306
307     if (zm)
308     {
309         ZebraExplainInfo zei = zh->reg->zei;
310         int ch = zebraExplain_lookup_attr_str(
311             zei, zinfo_index_category_index, p->index_type, p->index_name);
312
313         if (zebra_maps_is_icu(zm))
314             snippet_add_icu(p, ch, zm);
315         else
316         {
317             if (zebra_maps_is_complete(zm))
318                 snippet_add_complete_field(p, ch, zm);
319             else
320                 snippet_add_incomplete_field(p, ch, zm);
321         }
322     }
323 }
324
325 static void snippet_schema_add(
326     struct recExtractCtrl *p, Odr_oid *oid)
327 {
328
329 }
330
331 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
332                      struct ZebraRecStream *stream,
333                      RecType rt, void *recTypeClientData)
334 {
335     struct recExtractCtrl extractCtrl;
336     struct snip_rec_info info;
337     int r;
338
339     extractCtrl.stream = stream;
340     extractCtrl.first_record = 1;
341     extractCtrl.init = extract_init;
342     extractCtrl.tokenAdd = snippet_token_add;
343     extractCtrl.schemaAdd = snippet_schema_add;
344     assert(zh->reg);
345     assert(zh->reg->dh);
346
347     extractCtrl.dh = zh->reg->dh;
348     
349     info.zh = zh;
350     info.snippets = sn;
351     extractCtrl.handle = &info;
352     extractCtrl.match_criteria[0] = '\0';
353     extractCtrl.staticrank = 0;
354     extractCtrl.action = action_insert;
355     
356     init_extractCtrl(zh, &extractCtrl);
357
358     extractCtrl.setStoreData = 0;
359
360     r = (*rt->extract)(recTypeClientData, &extractCtrl);
361
362 }
363
364 static void searchRecordKey(ZebraHandle zh,
365                             zebra_rec_keys_t reckeys,
366                             const char *index_name,
367                             const char **ws, int ws_length)
368 {
369     int i;
370     int ch = -1;
371     zinfo_index_category_t cat = zinfo_index_category_index;
372
373     for (i = 0; i<ws_length; i++)
374         ws[i] = NULL;
375
376     if (ch < 0)
377         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
378     if (ch < 0)
379         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
380     if (ch < 0)
381         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
382
383     if (ch < 0)
384         return ;
385
386     if (zebra_rec_keys_rewind(reckeys))
387     {
388         zint startSeq = -1;
389         const char *str;
390         size_t slen;
391         struct it_key key;
392         zint seqno;
393         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
394         {
395             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
396
397             seqno = key.mem[key.len-1];
398             
399             if (key.mem[0] == ch)
400             {
401                 zint woff;
402                 
403                 if (startSeq == -1)
404                     startSeq = seqno;
405                 woff = seqno - startSeq;
406                 if (woff >= 0 && woff < ws_length)
407                     ws[woff] = str;
408             }
409         }
410     }
411 }
412
413 #define FILE_MATCH_BLANK "\t "
414
415 static char *get_match_from_spec(ZebraHandle zh,
416                           zebra_rec_keys_t reckeys,
417                           const char *fname, const char *spec)
418 {
419     static char dstBuf[2048];      /* static here ??? */
420     char *dst = dstBuf;
421     const char *s = spec;
422
423     while (1)
424     {
425         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
426             ;
427         if (!*s)
428             break;
429         if (*s == '(')
430         {
431             const char *ws[32];
432             char attset_str[64], attname_str[64];
433             int i;
434             int first = 1;
435             
436             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
437                 ;
438             for (i = 0; *s && *s != ',' && *s != ')' && 
439                      !strchr(FILE_MATCH_BLANK, *s); s++)
440                 if (i+1 < sizeof(attset_str))
441                     attset_str[i++] = *s;
442             attset_str[i] = '\0';
443             
444             for (; strchr(FILE_MATCH_BLANK, *s); s++)
445                 ;
446             if (*s != ',')
447                 strcpy(attname_str, attset_str);
448             else
449             {
450                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
451                     ;
452                 for (i = 0; *s && *s != ')' && 
453                          !strchr(FILE_MATCH_BLANK, *s); s++)
454                     if (i+1 < sizeof(attname_str))
455                         attname_str[i++] = *s;
456                 attname_str[i] = '\0';
457             }
458             if (*s != ')')
459             {
460                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
461                       spec, zh->m_group ? zh->m_group : "none");
462                 return NULL;
463             }
464             s++;
465
466             searchRecordKey(zh, reckeys, attname_str, ws, 32);
467             if (0) /* for debugging */
468             {   
469                 for (i = 0; i<32; i++)
470                 {
471                     if (ws[i])
472                     {
473                         WRBUF w = wrbuf_hex_str(ws[i]);
474                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
475                         wrbuf_destroy(w);
476                     }
477                 }
478             }
479
480             for (i = 0; i<32; i++)
481                 if (ws[i])
482                 {
483                     if (first)
484                     {
485                         *dst++ = ' ';
486                         first = 0;
487                     }
488                     strcpy(dst, ws[i]);
489                     dst += strlen(ws[i]);
490                 }
491             if (first)
492             {
493                 yaz_log(YLOG_WARN, "Record didn't contain match"
494                       " fields in (%s,%s)", attset_str, attname_str);
495                 return NULL;
496             }
497         }
498         else if (*s == '$')
499         {
500             int spec_len;
501             char special[64];
502             const char *spec_src = NULL;
503             const char *s1 = ++s;
504             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
505                 s1++;
506
507             spec_len = s1 - s;
508             if (spec_len > sizeof(special)-1)
509                 spec_len = sizeof(special)-1;
510             memcpy(special, s, spec_len);
511             special[spec_len] = '\0';
512             s = s1;
513
514             if (!strcmp(special, "group"))
515                 spec_src = zh->m_group;
516             else if (!strcmp(special, "database"))
517                 spec_src = zh->basenames[0];
518             else if (!strcmp(special, "filename")) {
519                 spec_src = fname;
520             }
521             else if (!strcmp(special, "type"))
522                 spec_src = zh->m_record_type;
523             else 
524                 spec_src = NULL;
525             if (spec_src)
526             {
527                 strcpy(dst, spec_src);
528                 dst += strlen(spec_src);
529             }
530         }
531         else if (*s == '\"' || *s == '\'')
532         {
533             int stopMarker = *s++;
534             char tmpString[64];
535             int i = 0;
536
537             while (*s && *s != stopMarker)
538             {
539                 if (i+1 < sizeof(tmpString))
540                     tmpString[i++] = *s++;
541             }
542             if (*s)
543                 s++;
544             tmpString[i] = '\0';
545             strcpy(dst, tmpString);
546             dst += strlen(tmpString);
547         }
548         else
549         {
550             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
551                   spec, zh->m_group ? zh->m_group : "none");
552             return NULL;
553         }
554         *dst++ = 1;
555     }
556     if (dst == dstBuf)
557     {
558         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
559               fname, zh->m_group ? zh->m_group : "none");
560         return NULL;
561     }
562     *dst = '\0';
563
564     if (0) /* for debugging */
565     {
566         WRBUF w = wrbuf_hex_str(dstBuf);
567         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
568         wrbuf_destroy(w);
569     }
570
571     return dstBuf;
572 }
573
/* Record logging context.
   NOTE(review): no use of this struct is visible in this part of the
   file - confirm it is still needed. */
struct recordLogInfo {
    const char *fname;           /* name of the file being processed */
    int recordOffset;            /* presumably the record's offset in fname */
    struct recordGroup *rGroup;  /* record group settings */
};
579
580 /** \brief add the always-matches index entry and map to real record ID
581     \param ctrl record control
582     \param record_id custom record ID
583     \param sysno system record ID
584     
585     This function serves two purposes.. It adds the always matches
586     entry and makes a pointer from the custom record ID (if defined)
587     back to the system record ID (sysno)
588     See zebra_recid_to_sysno .
589   */
590 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
591                             zint sysno)
592 {
593     RecWord word;
594     extract_init(ctrl, &word);
595     word.record_id = record_id;
596     /* we use the seqno as placeholder for a way to get back to
597        record database from _ALLRECORDS.. This is used if a custom
598        RECORD was defined */
599     word.seqno = sysno;
600     word.index_name = "_ALLRECORDS";
601     word.index_type = "w";
602
603     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
604                               "", 0);
605 }
606
607 /* forward declaration */
608 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
609                                        struct ZebraRecStream *stream,
610                                        enum zebra_recctrl_action_t action,
611                                        const char *recordType,
612                                        zint *sysno,
613                                        const char *match_criteria,
614                                        const char *fname,
615                                        RecType recType,
616                                        void *recTypeClientData);
617
618
619 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
620                              enum zebra_recctrl_action_t action)
621 {
622     ZEBRA_RES r = ZEBRA_OK;
623     int i, fd;
624     char gprefix[128];
625     char ext[128];
626     char ext_res[128];
627     struct file_read_info *fi = 0;
628     const char *original_record_type = 0;
629     RecType recType;
630     void *recTypeClientData;
631     struct ZebraRecStream stream, *streamp;
632
633     zebra_init_log_level();
634
635     if (!zh->m_group || !*zh->m_group)
636         *gprefix = '\0';
637     else
638         sprintf(gprefix, "%s.", zh->m_group);
639     
640     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
641
642     /* determine file extension */
643     *ext = '\0';
644     for (i = strlen(fname); --i >= 0; )
645         if (fname[i] == '/')
646             break;
647         else if (fname[i] == '.')
648         {
649             strcpy(ext, fname+i+1);
650             break;
651         }
652     /* determine file type - depending on extension */
653     original_record_type = zh->m_record_type;
654     if (!zh->m_record_type)
655     {
656         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
657         zh->m_record_type = res_get(zh->res, ext_res);
658     }
659     if (!zh->m_record_type)
660     {
661         check_log_limit(zh);
662         if (zh->records_processed + zh->records_skipped
663             < zh->m_file_verbose_limit)
664             yaz_log(YLOG_LOG, "? %s", fname);
665         zh->records_skipped++;
666         return 0;
667     }
668     /* determine match criteria */
669     if (!zh->m_record_id)
670     {
671         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
672         zh->m_record_id = res_get(zh->res, ext_res);
673     }
674
675     if (!(recType =
676           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
677                           &recTypeClientData)))
678     {
679         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
680         return ZEBRA_FAIL;
681     }
682
683     switch(recType->version)
684     {
685     case 0:
686         break;
687     default:
688         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
689     }
690     if (sysno && (action == action_delete || action == action_a_delete))
691     {
692         streamp = 0;
693         fi = 0;
694     }
695     else
696     {
697         char full_rep[1024];
698
699         if (zh->path_reg && !yaz_is_abspath(fname))
700         {
701             strcpy(full_rep, zh->path_reg);
702             strcat(full_rep, "/");
703             strcat(full_rep, fname);
704         }
705         else
706             strcpy(full_rep, fname);
707         
708         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
709         {
710             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
711             zh->m_record_type = original_record_type;
712             return ZEBRA_FAIL;
713         }
714         streamp = &stream;
715         zebra_create_stream_fd(streamp, fd, 0);
716     }
717     r = zebra_extract_records_stream(zh, streamp,
718                                      action,
719                                      zh->m_record_type,
720                                      sysno,
721                                      0, /*match_criteria */
722                                      fname,
723                                      recType, recTypeClientData);
724     if (streamp)
725         stream.destroy(streamp);
726     zh->m_record_type = original_record_type;
727     return r;
728 }
729
730 /*
731   If sysno is provided, then it's used to identify the record.
732   If not, and match_criteria is provided, then sysno is guessed
733   If not, and a record is provided, then sysno is got from there
734   
735  */
736
737 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
738                                       const char *buf, size_t buf_size,
739                                       enum zebra_recctrl_action_t action,
740                                       const char *recordType,
741                                       zint *sysno,
742                                       const char *match_criteria,
743                                       const char *fname)
744 {
745     struct ZebraRecStream stream;
746     ZEBRA_RES res;
747     void *clientData;
748     RecType recType = 0;
749
750     if (recordType && *recordType)
751     {
752         yaz_log(log_level_extract,
753                 "Record type explicitly specified: %s", recordType);
754         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
755                                   &clientData);
756     } 
757     else
758     {
759         if (!(zh->m_record_type))
760         {
761             yaz_log(YLOG_WARN, "No such record type defined");
762             return ZEBRA_FAIL;
763         }
764         yaz_log(log_level_extract, "Get record type from rgroup: %s",
765                 zh->m_record_type);
766         recType = recType_byName(zh->reg->recTypes, zh->res,
767                                   zh->m_record_type, &clientData);
768         recordType = zh->m_record_type;
769     }
770     
771     if (!recType)
772     {
773         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
774         return ZEBRA_FAIL;
775     }
776
777     zebra_create_stream_mem(&stream, buf, buf_size);
778
779     res = zebra_extract_records_stream(zh, &stream,
780                                        action,
781                                        recordType,
782                                        sysno,
783                                        match_criteria,
784                                        fname,
785                                        recType, clientData);
786     stream.destroy(&stream);
787     return res;
788 }
789
790 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
791                                              struct ZebraRecStream *stream,
792                                              enum zebra_recctrl_action_t action,
793                                              const char *recordType,
794                                              zint *sysno,
795                                              const char *match_criteria,
796                                              const char *fname,
797                                              RecType recType,
798                                              void *recTypeClientData,
799                                              int *more)
800     
801 {
802     zint sysno0 = 0;
803     RecordAttr *recordAttr;
804     struct recExtractCtrl extractCtrl;
805     int r;
806     const char *matchStr = 0;
807     Record rec;
808     off_t start_offset = 0, end_offset = 0;
809     const char *pr_fname = fname;  /* filename to print .. */
810     int show_progress = zh->records_processed + zh->records_skipped 
811         < zh->m_file_verbose_limit ? 1:0;
812
813     zebra_init_log_level();
814
815     if (!pr_fname)
816         pr_fname = "<no file>";  /* make it printable if file is omitted */
817
818     zebra_rec_keys_reset(zh->reg->keys);
819     zebra_rec_keys_reset(zh->reg->sortKeys);
820
821     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
822     {
823         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
824                                       zh->m_explain_database))
825             return ZEBRA_FAIL;
826     }
827
828     if (stream)
829     {
830         off_t null_offset = 0;
831         extractCtrl.stream = stream;
832
833         start_offset = stream->tellf(stream);
834
835         extractCtrl.first_record = start_offset ? 0 : 1;
836         
837         stream->endf(stream, &null_offset);;
838
839         extractCtrl.init = extract_init;
840         extractCtrl.tokenAdd = extract_token_add;
841         extractCtrl.schemaAdd = extract_schema_add;
842         extractCtrl.dh = zh->reg->dh;
843         extractCtrl.handle = zh;
844         extractCtrl.match_criteria[0] = '\0';
845         extractCtrl.staticrank = 0;
846         extractCtrl.action = action;
847
848         init_extractCtrl(zh, &extractCtrl);
849
850         extract_set_store_data_prepare(&extractCtrl);
851         
852         r = (*recType->extract)(recTypeClientData, &extractCtrl);
853
854         if (action == action_update)
855         {
856             action = extractCtrl.action;
857         }
858         
859         switch (r)
860         {
861         case RECCTRL_EXTRACT_EOF:
862             return ZEBRA_FAIL;
863         case RECCTRL_EXTRACT_ERROR_GENERIC:
864             /* error occured during extraction ... */
865             yaz_log(YLOG_WARN, "extract error: generic");
866             return ZEBRA_FAIL;
867         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
868             /* error occured during extraction ... */
869             yaz_log(YLOG_WARN, "extract error: no such filter");
870             return ZEBRA_FAIL;
871         case RECCTRL_EXTRACT_SKIP:
872             if (show_progress)
873                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
874                          recordType, pr_fname, (zint) start_offset);
875             *more = 1;
876             
877             end_offset = stream->endf(stream, 0);
878             if (end_offset)
879                 stream->seekf(stream, end_offset);
880
881             return ZEBRA_OK;
882         case RECCTRL_EXTRACT_OK:
883             break;
884         default:
885             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
886             return ZEBRA_FAIL;
887         }
888         end_offset = stream->endf(stream, 0);
889         if (end_offset)
890             stream->seekf(stream, end_offset);
891         else
892             end_offset = stream->tellf(stream);
893
894         if (extractCtrl.match_criteria[0])
895             match_criteria = extractCtrl.match_criteria;
896     }
897
898     *more = 1;
899
900     if (zh->m_flag_rw == 0)
901     {
902         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
903                 pr_fname, (zint) start_offset);
904         /* test mode .. Do not perform match */
905         return ZEBRA_OK;
906     }
907         
908     if (!sysno)
909     {
910         sysno = &sysno0;
911         
912         if (match_criteria && *match_criteria)
913             matchStr = match_criteria;
914         else
915         {
916             if (zh->m_record_id && *zh->m_record_id)
917             {
918                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
919                                                zh->m_record_id);
920                 if (!matchStr)
921                 {
922                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
923                              pr_fname, (zint) start_offset);
924                     return ZEBRA_FAIL;
925                 }
926                 if (0 && matchStr)
927                 {
928                     WRBUF w = wrbuf_alloc();
929                     size_t i;
930                     for (i = 0; i < strlen(matchStr); i++)
931                     {
932                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
933                     }
934                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
935                     wrbuf_destroy(w);
936                 }
937             }
938         }
939         if (matchStr) 
940         {
941             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
942             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
943                                           matchStr);
944
945             
946             if (log_level_extract)
947             {
948                 WRBUF w = wrbuf_hex_str(matchStr);
949                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
950                 wrbuf_destroy(w);
951             }
952             if (rinfo)
953             {
954                 assert(*rinfo == sizeof(*sysno));
955                 memcpy(sysno, rinfo+1, sizeof(*sysno));
956             }
957        }
958     }
959
960     if (! *sysno)
961     {
962         /* new record AKA does not exist already */
963         if (action == action_delete)
964         {
965             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
966                     pr_fname, (zint) start_offset);
967             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
968             return ZEBRA_FAIL;
969         }
970         else if (action == action_a_delete)
971         {
972             if (show_progress)
973                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
974                         pr_fname, (zint) start_offset);
975             return ZEBRA_OK;
976         }
977         else if (action == action_replace)
978         {
979             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
980                          pr_fname, (zint) start_offset);
981             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
982             return ZEBRA_FAIL;
983         }
984         if (show_progress)
985             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
986                      (zint) start_offset);
987         rec = rec_new(zh->reg->records);
988
989         *sysno = rec->sysno;
990
991
992         if (stream)
993         {
994             all_matches_add(&extractCtrl,
995                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
996                             *sysno);
997         }
998
999
1000         recordAttr = rec_init_attr(zh->reg->zei, rec);
1001         if (extractCtrl.staticrank < 0)
1002         {
1003             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1004             extractCtrl.staticrank = 0;
1005         }
1006
1007         if (matchStr)
1008         {
1009             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1010             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1011                             sizeof(*sysno), sysno);
1012         }
1013
1014         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1015 #if FLUSH2
1016         extract_flush_record_keys2(zh, *sysno,
1017                                    zh->reg->keys, extractCtrl.staticrank,
1018                                    0, recordAttr->staticrank);
1019 #else
1020         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1021                                   extractCtrl.staticrank);
1022 #endif
1023         recordAttr->staticrank = extractCtrl.staticrank;
1024         zh->records_inserted++;
1025     } 
1026     else
1027     {
1028         /* record already exists */
1029         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1030         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1031         if (action == action_insert)
1032         {
1033             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1034                          recordType, pr_fname, (zint) start_offset);
1035             logRecord(zh);
1036             return ZEBRA_FAIL;
1037         }
1038
1039         rec = rec_get(zh->reg->records, *sysno);
1040         assert(rec);
1041
1042         if (stream)
1043         {
1044             all_matches_add(&extractCtrl,
1045                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1046                             *sysno);
1047         }
1048         
1049         recordAttr = rec_init_attr(zh->reg->zei, rec);
1050
1051         /* decrease total size */
1052         zebraExplain_recordBytesIncrement(zh->reg->zei,
1053                                            - recordAttr->recordSize);
1054
1055         zebra_rec_keys_set_buf(delkeys,
1056                                rec->info[recInfo_delKeys],
1057                                rec->size[recInfo_delKeys],
1058                                0);
1059         zebra_rec_keys_set_buf(sortKeys,
1060                                rec->info[recInfo_sortKeys],
1061                                rec->size[recInfo_sortKeys],
1062                                0);
1063
1064         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1065 #if !FLUSH2
1066         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1067                                   recordAttr->staticrank);
1068 #endif
1069         if (action == action_delete || action == action_a_delete)
1070         {
1071             /* record going to be deleted */
1072 #if FLUSH2
1073             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1074                                        delkeys, recordAttr->staticrank);
1075 #endif       
1076             if (zebra_rec_keys_empty(delkeys))
1077             {
1078                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1079                         pr_fname, (zint) start_offset);
1080                 yaz_log(YLOG_WARN, "cannot delete file above, "
1081                         "storeKeys false (3)");
1082             }
1083             else
1084             {
1085                 if (show_progress)
1086                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1087                             pr_fname, (zint) start_offset);
1088                 zh->records_deleted++;
1089                 if (matchStr)
1090                 {
1091                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1092                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1093                 }
1094                 rec_del(zh->reg->records, &rec);
1095             }
1096             zebra_rec_keys_close(delkeys);
1097             zebra_rec_keys_close(sortKeys);
1098             rec_free(&rec);
1099             logRecord(zh);
1100             return ZEBRA_OK;
1101         }
1102         else
1103         {   /* update or special_update */
1104             if (show_progress)
1105                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1106                         pr_fname, (zint) start_offset);
1107             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1108
1109 #if FLUSH2
1110             extract_flush_record_keys2(zh, *sysno,
1111                                        zh->reg->keys, extractCtrl.staticrank,
1112                                        delkeys, recordAttr->staticrank);
1113 #else
1114             extract_flush_record_keys(zh, *sysno, 1, 
1115                                       zh->reg->keys, extractCtrl.staticrank);
1116 #endif
1117             recordAttr->staticrank = extractCtrl.staticrank;
1118             zh->records_updated++;
1119         }
1120         zebra_rec_keys_close(delkeys);
1121         zebra_rec_keys_close(sortKeys);
1122     }
1123     /* update file type */
1124     xfree(rec->info[recInfo_fileType]);
1125     rec->info[recInfo_fileType] =
1126         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1127
1128     /* update filename */
1129     xfree(rec->info[recInfo_filename]);
1130     rec->info[recInfo_filename] =
1131         rec_strdup(fname, &rec->size[recInfo_filename]);
1132
1133     /* update delete keys */
1134     xfree(rec->info[recInfo_delKeys]);
1135     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1136     {
1137         zebra_rec_keys_get_buf(zh->reg->keys,
1138                                &rec->info[recInfo_delKeys],
1139                                &rec->size[recInfo_delKeys]);
1140     }
1141     else
1142     {
1143         rec->info[recInfo_delKeys] = NULL;
1144         rec->size[recInfo_delKeys] = 0;
1145     }
1146     /* update sort keys */
1147     xfree(rec->info[recInfo_sortKeys]);
1148
1149     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1150                            &rec->info[recInfo_sortKeys],
1151                            &rec->size[recInfo_sortKeys]);
1152
1153     if (stream)
1154     {
1155         recordAttr->recordSize = end_offset - start_offset;
1156         zebraExplain_recordBytesIncrement(zh->reg->zei,
1157                                           recordAttr->recordSize);
1158     }
1159
1160     /* set run-number for this record */
1161     recordAttr->runNumber =
1162         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1163
1164     /* update store data */
1165     xfree(rec->info[recInfo_storeData]);
1166
1167     /* update store data */
1168     if (zh->store_data_buf)
1169     {
1170         rec->size[recInfo_storeData] = zh->store_data_size;
1171         rec->info[recInfo_storeData] = zh->store_data_buf;
1172         zh->store_data_buf = 0;
1173         recordAttr->recordSize = zh->store_data_size;
1174     }
1175     else if (zh->m_store_data)
1176     {
1177         off_t cur_offset = stream->tellf(stream);
1178
1179         rec->size[recInfo_storeData] = recordAttr->recordSize;
1180         rec->info[recInfo_storeData] = (char *)
1181             xmalloc(recordAttr->recordSize);
1182         stream->seekf(stream, start_offset);
1183         stream->readf(stream, rec->info[recInfo_storeData],
1184                       recordAttr->recordSize);
1185         stream->seekf(stream, cur_offset);
1186     }
1187     else
1188     {
1189         rec->info[recInfo_storeData] = NULL;
1190         rec->size[recInfo_storeData] = 0;
1191     }
1192     /* update database name */
1193     xfree(rec->info[recInfo_databaseName]);
1194     rec->info[recInfo_databaseName] =
1195         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1196
1197     /* update offset */
1198     recordAttr->recordOffset = start_offset;
1199     
1200     /* commit this record */
1201     rec_put(zh->reg->records, &rec);
1202     logRecord(zh);
1203     return ZEBRA_OK;
1204 }
1205
1206 /** \brief extracts records from stream
1207     \param zh Zebra Handle
1208     \param stream stream that we read from
1209     \param action (action_insert, action_replace, action_delete, ..)
1210     \param recordType Record filter type "grs.xml", etc.
1211     \param sysno pointer to sysno if already known; NULL otherwise
1212     \param match_criteria (NULL if not already given)
1213     \param fname filename that we read from (for logging purposes only)
1214     \param recType record type
1215     \param recTypeClientData client data for record type
1216     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1217 */
1218 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1219                                        struct ZebraRecStream *stream,
1220                                        enum zebra_recctrl_action_t action,
1221                                        const char *recordType,
1222                                        zint *sysno,
1223                                        const char *match_criteria,
1224                                        const char *fname,
1225                                        RecType recType,
1226                                        void *recTypeClientData)
1227 {
1228     ZEBRA_RES res = ZEBRA_OK;
1229     while (1)
1230     {
1231         int more = 0;
1232         res = zebra_extract_record_stream(zh, stream,
1233                                           action,
1234                                           recordType,
1235                                           sysno,
1236                                           match_criteria,
1237                                           fname,
1238                                           recType, recTypeClientData, &more);
1239         if (!more)
1240         {
1241             res = ZEBRA_OK;
1242             break;
1243         }
1244         if (res != ZEBRA_OK)
1245             break;
1246         if (sysno)
1247             break;
1248     }
1249     return res;
1250 }
1251
1252 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1253 {
1254     ZebraHandle zh = (ZebraHandle) handle;
1255     struct recExtractCtrl extractCtrl;
1256
1257     if (zebraExplain_curDatabase(zh->reg->zei,
1258                                   rec->info[recInfo_databaseName]))
1259     {
1260         abort();
1261         if (zebraExplain_newDatabase(zh->reg->zei,
1262                                       rec->info[recInfo_databaseName], 0))
1263             abort();
1264     }
1265
1266     zebra_rec_keys_reset(zh->reg->keys);
1267     zebra_rec_keys_reset(zh->reg->sortKeys);
1268
1269     extractCtrl.init = extract_init;
1270     extractCtrl.tokenAdd = extract_token_add;
1271     extractCtrl.schemaAdd = extract_schema_add;
1272     extractCtrl.dh = zh->reg->dh;
1273
1274     init_extractCtrl(zh, &extractCtrl);
1275
1276     extractCtrl.flagShowRecords = 0;
1277     extractCtrl.match_criteria[0] = '\0';
1278     extractCtrl.staticrank = 0;
1279     extractCtrl.action = action_update;
1280
1281     extractCtrl.handle = handle;
1282     extractCtrl.first_record = 1;
1283     
1284     extract_set_store_data_prepare(&extractCtrl);
1285
1286     if (n)
1287         grs_extract_tree(&extractCtrl, n);
1288
1289     if (rec->size[recInfo_delKeys])
1290     {
1291         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1292         
1293         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1294
1295         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1296                                rec->size[recInfo_delKeys],
1297                                0);
1298 #if FLUSH2
1299         extract_flush_record_keys2(zh, rec->sysno, 
1300                                    zh->reg->keys, 0, delkeys, 0);
1301 #else
1302         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1303         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1304 #endif
1305         zebra_rec_keys_close(delkeys);
1306
1307         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1308                                rec->size[recInfo_sortKeys],
1309                                0);
1310
1311         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1312         zebra_rec_keys_close(sortkeys);
1313     }
1314     else
1315     {
1316 #if FLUSH2
1317         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1318 #else
1319         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1320 #endif
1321     }
1322     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1323     
1324     xfree(rec->info[recInfo_delKeys]);
1325     zebra_rec_keys_get_buf(zh->reg->keys,
1326                            &rec->info[recInfo_delKeys], 
1327                            &rec->size[recInfo_delKeys]);
1328
1329     xfree(rec->info[recInfo_sortKeys]);
1330     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1331                            &rec->info[recInfo_sortKeys],
1332                            &rec->size[recInfo_sortKeys]);
1333     return ZEBRA_OK;
1334 }
1335
1336 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1337                            const char *str, size_t slen, NMEM nmem, int level)
1338 {
1339     char keystr[200]; /* room for zints to print */
1340     char *dst_term = 0;
1341     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1342     const char *index_type;
1343     int i;
1344     const char *string_index;
1345     
1346     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1347                             0/* db */, &string_index);
1348     assert(index_type);
1349     zebra_term_untrans_iconv(zh, nmem, index_type,
1350                              &dst_term, str);
1351     *keystr = '\0';
1352     for (i = 0; i < key->len; i++)
1353     {
1354         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1355     }
1356     
1357     if (*str < CHR_BASE_CHAR)
1358     {
1359         int i;
1360         char dst_buf[200]; /* room for special chars */
1361         
1362         strcpy(dst_buf , "?");
1363         
1364         if (!strcmp(str, ""))
1365             strcpy(dst_buf, "alwaysmatches");
1366         if (!strcmp(str, FIRST_IN_FIELD_STR))
1367             strcpy(dst_buf, "firstinfield");
1368         else if (!strcmp(str, CHR_UNKNOWN))
1369             strcpy(dst_buf, "unknown");
1370         else if (!strcmp(str, CHR_SPACE))
1371             strcpy(dst_buf, "space");
1372         
1373         for (i = 0; i<slen; i++)
1374         {
1375             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1376         }
1377         yaz_log(level, "%s%s %s %s", keystr, index_type,
1378                 string_index, dst_buf);
1379         
1380     }
1381     else
1382         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1383                 string_index, dst_term);
1384 }
1385
1386 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1387                           zebra_rec_keys_t reckeys,
1388                           int level)
1389 {
1390     if (zebra_rec_keys_rewind(reckeys))
1391     {
1392         size_t slen;
1393         const char *str;
1394         struct it_key key;
1395         NMEM nmem = nmem_create();
1396
1397         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1398         {
1399             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1400             nmem_reset(nmem);
1401         }
1402         nmem_destroy(nmem);
1403     }
1404 }
1405
1406 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1407                              zebra_rec_keys_t reckeys)
1408 {
1409     ZebraExplainInfo zei = zh->reg->zei;
1410     struct ord_stat {
1411         int no;
1412         int ord;
1413         struct ord_stat *next;
1414     };
1415
1416     if (zebra_rec_keys_rewind(reckeys))
1417     {
1418         struct ord_stat *ord_list = 0;
1419         struct ord_stat *p;
1420         size_t slen;
1421         const char *str;
1422         struct it_key key_in;
1423         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1424         {
1425             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1426
1427             for (p = ord_list; p ; p = p->next)
1428                 if (p->ord == ord)
1429                 {
1430                     p->no++;
1431                     break;
1432                 }
1433             if (!p)
1434             {
1435                 p = xmalloc(sizeof(*p));
1436                 p->no = 1;
1437                 p->ord = ord;
1438                 p->next = ord_list;
1439                 ord_list = p;
1440             }
1441         }
1442
1443         p = ord_list;
1444         while (p)
1445         {
1446             struct ord_stat *p1 = p;
1447
1448             if (is_insert)
1449                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1450             else
1451                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1452             p = p->next;
1453             xfree(p1);
1454         }
1455     }
1456 }
1457
1458 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1459                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1460                                 zebra_rec_keys_t del_keys, zint del_rank)
1461 {
1462     ZebraExplainInfo zei = zh->reg->zei;
1463     int normal = 0;
1464     int optimized = 0;
1465
1466     if (!zh->reg->key_block)
1467     {
1468         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1469         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1470         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1471         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1472     }
1473
1474     if (ins_keys)
1475     {
1476         extract_rec_keys_adjust(zh, 1, ins_keys);
1477         if (!del_keys)
1478             zebraExplain_recordCountIncrement(zei, 1);
1479         zebra_rec_keys_rewind(ins_keys);
1480     }
1481     if (del_keys)
1482     {
1483         extract_rec_keys_adjust(zh, 0, del_keys);
1484         if (!ins_keys)
1485             zebraExplain_recordCountIncrement(zei, -1);
1486         zebra_rec_keys_rewind(del_keys);
1487     }
1488
1489     while (1)
1490     {
1491         size_t del_slen;
1492         const char *del_str;
1493         struct it_key del_key_in;
1494         int del = 0;
1495
1496         size_t ins_slen;
1497         const char *ins_str;
1498         struct it_key ins_key_in;
1499         int ins = 0;
1500
1501         if (del_keys)
1502             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1503                                       &del_key_in);
1504         if (ins_keys)
1505             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1506                                       &ins_key_in);
1507
1508         if (del && ins && ins_rank == del_rank
1509             && !key_compare(&del_key_in, &ins_key_in) 
1510             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1511         {
1512             optimized++;
1513             continue;
1514         }
1515         if (!del && !ins)
1516             break;
1517         
1518         normal++;
1519         if (del)
1520             key_block_write(zh->reg->key_block, sysno, 
1521                             &del_key_in, 0, del_str, del_slen,
1522                             del_rank, zh->m_staticrank);
1523         if (ins)
1524             key_block_write(zh->reg->key_block, sysno, 
1525                             &ins_key_in, 1, ins_str, ins_slen,
1526                             ins_rank, zh->m_staticrank);
1527     }
1528     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1529 }
1530
1531
1532 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1533                                      zebra_rec_keys_t reckeys,
1534                                      zebra_snippets *snippets)
1535 {
1536     NMEM nmem = nmem_create();
1537     if (zebra_rec_keys_rewind(reckeys)) 
1538     {
1539         const char *str;
1540         size_t slen;
1541         struct it_key key;
1542         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1543         {
1544             char *dst_term = 0;
1545             int ord;
1546             zint seqno;
1547             const char *index_type;
1548
1549             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1550             seqno = key.mem[key.len-1];
1551             ord = CAST_ZINT_TO_INT(key.mem[0]);
1552             
1553             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1554                                     0/* db */, 0 /* string_index */);
1555             assert(index_type);
1556             zebra_term_untrans_iconv(zh, nmem, index_type,
1557                                      &dst_term, str);
1558             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1559             nmem_reset(nmem);
1560         }
1561     }
1562     nmem_destroy(nmem);
1563     return ZEBRA_OK;
1564 }
1565
1566 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1567 {
1568     yaz_log(YLOG_LOG, "print_rec_keys");
1569     if (zebra_rec_keys_rewind(reckeys))
1570     {
1571         const char *str;
1572         size_t slen;
1573         struct it_key key;
1574         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1575         {
1576             char dst_buf[IT_MAX_WORD];
1577             zint seqno;
1578             const char *index_type;
1579             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1580             const char *db = 0;
1581             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1582
1583             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1584             
1585             seqno = key.mem[key.len-1];
1586             
1587             zebra_term_untrans(zh, index_type, dst_buf, str);
1588             
1589             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1590                     " term=%s", ord, seqno, dst_buf); 
1591         }
1592     }
1593 }
1594
1595 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1596                                      const char *str, int length)
1597 {
1598     struct it_key key;
1599     ZebraHandle zh = p->extractCtrl->handle;
1600     ZebraExplainInfo zei = zh->reg->zei;
1601     int ch, i;
1602
1603     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1604     if (ch < 0)
1605         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1606
1607     i = 0;
1608     key.mem[i++] = ch;
1609     key.mem[i++] = p->record_id;
1610     key.mem[i++] = p->section_id;
1611
1612     if (zh->m_segment_indexing)
1613         key.mem[i++] = p->segment;
1614     key.mem[i++] = p->seqno;
1615     key.len = i;
1616
1617     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1618 }
1619
1620 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1621 {
1622     struct it_key key;
1623     ZebraHandle zh = p->extractCtrl->handle;
1624     ZebraExplainInfo zei = zh->reg->zei;
1625     int ch;
1626     zinfo_index_category_t cat = zinfo_index_category_sort;
1627
1628     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1629     if (ch < 0)
1630         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1631     key.len = 3;
1632     key.mem[0] = ch;
1633     key.mem[1] = p->record_id;
1634     key.mem[2] = p->section_id;
1635
1636     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1637 }
1638
1639 static void extract_add_staticrank_string(RecWord *p,
1640                                           const char *str, int length)
1641 {
1642     char valz[40];
1643     struct recExtractCtrl *ctrl = p->extractCtrl;
1644
1645     if (length > sizeof(valz)-1)
1646         length = sizeof(valz)-1;
1647
1648     memcpy(valz, str, length);
1649     valz[length] = '\0';
1650     ctrl->staticrank = atozint(valz);
1651 }
1652
1653 static void extract_add_string(RecWord *p, zebra_map_t zm,
1654                                const char *string, int length)
1655 {
1656     assert(length > 0);
1657
1658     if (!p->index_name)
1659         return;
1660     if (log_level_details)
1661     {
1662
1663         WRBUF w = wrbuf_alloc();
1664         
1665         wrbuf_write_escaped(w, string, length);
1666         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1667         wrbuf_destroy(w);
1668     }
1669     if (zebra_maps_is_index(zm))
1670     {
1671         extract_add_index_string(p, zinfo_index_category_index,
1672                                  string, length);
1673         if (zebra_maps_is_alwaysmatches(zm))
1674         {
1675             RecWord word;
1676             memcpy(&word, p, sizeof(word));
1677
1678             word.seqno = 1;
1679             extract_add_index_string(
1680                 &word, zinfo_index_category_alwaysmatches, "", 0);
1681         }
1682     }
1683     else if (zebra_maps_is_sort(zm))
1684     {
1685         extract_add_sort_string(p, string, length);
1686     }
1687     else if (zebra_maps_is_staticrank(zm))
1688     {
1689         extract_add_staticrank_string(p, string, length);
1690     }
1691 }
1692
1693 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1694 {
1695     const char *b = p->term_buf;
1696     int remain = p->term_len;
1697     int first = 1;
1698     const char **map = 0;
1699     
1700     if (remain > 0)
1701         map = zebra_maps_input(zm, &b, remain, 0);
1702
1703     while (map)
1704     {
1705         char buf[IT_MAX_WORD+1];
1706         int i, remain;
1707
1708         /* Skip spaces */
1709         while (map && *map && **map == *CHR_SPACE)
1710         {
1711             remain = p->term_len - (b - p->term_buf);
1712             if (remain > 0)
1713                 map = zebra_maps_input(zm, &b, remain, 0);
1714             else
1715                 map = 0;
1716         }
1717         if (!map)
1718             break;
1719         i = 0;
1720         while (map && *map && **map != *CHR_SPACE)
1721         {
1722             const char *cp = *map;
1723
1724             while (i < IT_MAX_WORD && *cp)
1725                 buf[i++] = *(cp++);
1726             remain = p->term_len - (b - p->term_buf);
1727             if (remain > 0)
1728                 map = zebra_maps_input(zm, &b, remain, 0);
1729             else
1730                 map = 0;
1731         }
1732         if (!i)
1733             return;
1734
1735         if (first)
1736         {   
1737             first = 0;
1738             if (zebra_maps_is_first_in_field(zm))
1739             {
1740                 /* first in field marker */
1741                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1742                 p->seqno++;
1743             }
1744         }
1745         extract_add_string(p, zm, buf, i);
1746         p->seqno++;
1747     }
1748 }
1749
1750 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1751 {
1752     const char *b = p->term_buf;
1753     char buf[IT_MAX_WORD+1];
1754     const char **map = 0;
1755     int i = 0, remain = p->term_len;
1756
1757     if (remain > 0)
1758         map = zebra_maps_input(zm, &b, remain, 1);
1759
1760     while (remain > 0 && i < IT_MAX_WORD)
1761     {
1762         while (map && *map && **map == *CHR_SPACE)
1763         {
1764             remain = p->term_len - (b - p->term_buf);
1765
1766             if (remain > 0)
1767             {
1768                 int first = i ? 0 : 1;  /* first position */
1769                 map = zebra_maps_input(zm, &b, remain, first);
1770             }
1771             else
1772                 map = 0;
1773         }
1774         if (!map)
1775             break;
1776
1777         if (i && i < IT_MAX_WORD)
1778             buf[i++] = *CHR_SPACE;
1779         while (map && *map && **map != *CHR_SPACE)
1780         {
1781             const char *cp = *map;
1782
1783             if (**map == *CHR_CUT)
1784             {
1785                 i = 0;
1786             }
1787             else
1788             {
1789                 if (i >= IT_MAX_WORD)
1790                     break;
1791                 while (i < IT_MAX_WORD && *cp)
1792                     buf[i++] = *(cp++);
1793             }
1794             remain = p->term_len  - (b - p->term_buf);
1795             if (remain > 0)
1796             {
1797                 map = zebra_maps_input(zm, &b, remain, 0);
1798             }
1799             else
1800                 map = 0;
1801         }
1802     }
1803     if (!i)
1804         return;
1805     extract_add_string(p, zm, buf, i);
1806 }
1807
1808 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1809 {
1810     const char *res_buf = 0;
1811     size_t res_len = 0;
1812
1813     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1814     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1815     {
1816         extract_add_string(p, zm, res_buf, res_len);
1817         p->seqno++;
1818     }
1819 }
1820
1821
1822 /** \brief top-level indexing handler for recctrl system
1823     \param p token data to be indexed
1824
1825     Call sequence:
1826     extract_token_add
1827     extract_add_{in}_complete / extract_add_icu
1828     extract_add_string
1829     
1830     extract_add_index_string
1831     or
1832     extract_add_sort_string
1833     or
1834     extract_add_staticrank_string
1835     
1836 */
1837 static void extract_token_add(RecWord *p)
1838 {
1839     ZebraHandle zh = p->extractCtrl->handle;
1840     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1841     WRBUF wrbuf;
1842
1843     if (log_level_details)
1844     {
1845         yaz_log(log_level_details, "extract_token_add "
1846                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1847                 p->index_type, p->index_name, 
1848                 p->seqno, p->term_len, p->term_buf);
1849     }
1850     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1851     {
1852         p->term_buf = wrbuf_buf(wrbuf);
1853         p->term_len = wrbuf_len(wrbuf);
1854     }
1855     if (zebra_maps_is_icu(zm))
1856     {
1857         extract_add_icu(p, zm);
1858     }
1859     else
1860     {
1861         if (zebra_maps_is_complete(zm))
1862             extract_add_complete_field(p, zm);
1863         else
1864             extract_add_incomplete_field(p, zm);
1865     }
1866 }
1867
1868 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1869                                       void *buf, size_t sz)
1870 {
1871     ZebraHandle zh = (ZebraHandle) p->handle;
1872
1873     xfree(zh->store_data_buf);
1874     zh->store_data_buf = 0;
1875     zh->store_data_size = 0;
1876     if (buf && sz)
1877     {
1878         zh->store_data_buf = xmalloc(sz);
1879         zh->store_data_size = sz;
1880         memcpy(zh->store_data_buf, buf, sz);
1881     }
1882 }
1883
1884 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1885 {
1886     ZebraHandle zh = (ZebraHandle) p->handle;
1887     xfree(zh->store_data_buf);
1888     zh->store_data_buf = 0;
1889     zh->store_data_size = 0;
1890     p->setStoreData = extract_set_store_data_cb;
1891 }
1892
1893 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1894 {
1895     ZebraHandle zh = (ZebraHandle) p->handle;
1896     zebraExplain_addSchema(zh->reg->zei, oid);
1897 }
1898
1899 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1900                              int cmd, zebra_rec_keys_t reckeys)
1901 {
1902 #if 0
1903     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1904             cmd, sysno);
1905     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1906 #endif
1907
1908     if (zebra_rec_keys_rewind(reckeys))
1909     {
1910         zebra_sort_index_t si = zh->reg->sort_index;
1911         size_t slen;
1912         const char *str;
1913         struct it_key key_in;
1914
1915         NMEM nmem = nmem_create();
1916         struct sort_add_ent {
1917             int ord;
1918             int cmd;
1919             struct sort_add_ent *next;
1920             WRBUF wrbuf;
1921             zint sysno;
1922             zint section_id;
1923         };
1924         struct sort_add_ent *sort_ent_list = 0;
1925
1926         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1927         {
1928             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1929             zint filter_sysno = key_in.mem[1];
1930             zint section_id = key_in.mem[2];
1931
1932             struct sort_add_ent **e = &sort_ent_list;
1933             for (; *e; e = &(*e)->next)
1934                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1935                     break;
1936             if (!*e)
1937             {
1938                 *e = nmem_malloc(nmem, sizeof(**e));
1939                 (*e)->next = 0;
1940                 (*e)->wrbuf = wrbuf_alloc();
1941                 (*e)->ord = ord;
1942                 (*e)->cmd = cmd;
1943                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1944                 (*e)->section_id = section_id;
1945             }
1946             
1947             wrbuf_write((*e)->wrbuf, str, slen);
1948             wrbuf_putc((*e)->wrbuf, '\0');
1949         }
1950         if (sort_ent_list)
1951         {
1952             zint last_sysno = 0;
1953             struct sort_add_ent *e = sort_ent_list;
1954             for (; e; e = e->next)
1955             {
1956                 if (last_sysno != e->sysno)
1957                 {
1958                     zebra_sort_sysno(si, e->sysno);
1959                     last_sysno = e->sysno;
1960                 }
1961                 zebra_sort_type(si, e->ord);
1962                 if (e->cmd == 1)
1963                     zebra_sort_add(si, e->section_id, e->wrbuf);
1964                 else
1965                     zebra_sort_delete(si, e->section_id);
1966                 wrbuf_destroy(e->wrbuf);
1967             }
1968         }
1969         nmem_destroy(nmem);
1970     }
1971 }
1972
1973 /*
1974  * Local variables:
1975  * c-basic-offset: 4
1976  * c-file-style: "Stroustrup"
1977  * indent-tabs-mode: nil
1978  * End:
1979  * vim: shiftwidth=4 tabstop=8 expandtab
1980  */
1981