configure produces config.h
[idzebra-moved-to-github.git] / index / extract.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 /** \file
21     \brief indexes records and extract tokens for indexing and sorting
22 */
23
24 #if HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
/* Log levels used by this file. They stay 0 until zebra_init_log_level()
   resolves them from the "extract" and "indexdetails" log modules;
   log_level_initialized records that this has been done. */
45 static int log_level_extract = 0;
46 static int log_level_details = 0;
47 static int log_level_initialized = 0;
48
49 /* 1 if we eliminate identical delete/insert keys */
50 /* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52
53 #if FLUSH2
54 static void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
55                                        zebra_rec_keys_t ins_keys,
56                                        zint ins_rank,
57                                        zebra_rec_keys_t del_keys,
58                                        zint del_rank);
59 #else
60 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
61                                       int cmd,
62                                       zebra_rec_keys_t reckeys,
63                                       zint staticrank);
64 #endif
65
66 static void zebra_init_log_level(void)
67 {
68     if (!log_level_initialized)
69     {
70         log_level_initialized = 1;
71
72         log_level_extract = yaz_log_module_level("extract");
73         log_level_details = yaz_log_module_level("indexdetails");
74     }
75 }
76
77 static WRBUF wrbuf_hex_str(const char *cstr)
78 {
79     size_t i;
80     WRBUF w = wrbuf_alloc();
81     for (i = 0; cstr[i]; i++)
82     {
83         if (cstr[i] < ' ' || cstr[i] > 126)
84             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
85         else
86             wrbuf_putc(w, cstr[i]);
87     }
88     return w;
89 }
90
91
92 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
93                                     int cmd, zebra_rec_keys_t skp);
94 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
95 static void extract_token_add(RecWord *p);
96
97 static void check_log_limit(ZebraHandle zh)
98 {
99     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
100     {
101         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
102                 zh->m_file_verbose_limit);
103     }
104 }
105
106 static void logRecord(ZebraHandle zh)
107 {
108     check_log_limit(zh);
109     ++zh->records_processed;
110     if (!(zh->records_processed % 1000))
111     {
112         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
113                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
114                 zh->records_processed, zh->records_inserted, 
115                 zh->records_updated, zh->records_deleted);
116     }
117 }
118
119 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
120 {
121     ctrl->flagShowRecords = !zh->m_flag_rw;
122 }
123
124
125 static void extract_add_index_string(RecWord *p, 
126                                       zinfo_index_category_t cat,
127                                       const char *str, int length);
128
129 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
130
131 static void extract_init(struct recExtractCtrl *p, RecWord *w)
132 {
133     w->seqno = 1;
134     w->index_name = "any";
135     w->index_type = "w";
136     w->extractCtrl = p;
137     w->record_id = 0;
138     w->section_id = 0;
139     w->segment = 0;
140 }
141
/* Context installed as recExtractCtrl.handle by extract_snippet() and read
   back by the snippet_* token callbacks. */
142 struct snip_rec_info {
143     ZebraHandle zh;               /* handle whose register (maps, explain info) is consulted */
144     zebra_snippets *snippets;     /* destination that zebra_snippets_appendn() writes to */
145 };
146
147
/* Adds at most one snippet covering the term of p, treated as one
   complete field.

   The term buffer is fed through zebra_maps_input() chunk by chunk.
   Chunks that map to CHR_SPACE are skipped; other chunks are copied into
   buf, up to IT_MAX_WORD bytes. buf itself is never read afterwards: only
   the count i matters, to tell whether anything was collected.
   start/last track the source range [start, last) of the non-space
   content. That range is appended under ordinal ord at p->seqno when the
   map is an index map and something was collected.

   p   - word being extracted; p->extractCtrl->handle must be a
         struct snip_rec_info
   ord - ordinal passed through to zebra_snippets_appendn
   zm  - map used to normalize the input
*/
148 static void snippet_add_complete_field(RecWord *p, int ord,
149                                        zebra_map_t zm)
150 {
151     struct snip_rec_info *h = p->extractCtrl->handle;
152
153     const char *b = p->term_buf;
154     char buf[IT_MAX_WORD+1];
155     const char **map = 0;
156     int i = 0, remain = p->term_len;
157     const char *start = b;
158     const char *last = 0;
159
160     if (remain > 0)
161         map = zebra_maps_input(zm, &b, remain, 1);
162
163     while (remain > 0 && i < IT_MAX_WORD)
164     {
            /* skip a run of space-mapped input; remain is recomputed from b,
               which zebra_maps_input moves forward through the term buffer */
165         while (map && *map && **map == *CHR_SPACE)
166         {
167             remain = p->term_len - (b - p->term_buf);
168
169             if (i == 0)
170                 start = b;  /* set to first non-ws area */
171             if (remain > 0)
172             {
173                 int first = i ? 0 : 1;  /* first position */
174
175                 map = zebra_maps_input(zm, &b, remain, first);
176             }
177             else
178                 map = 0;
179         }
180         if (!map)
181             break;
182
            /* separate this word from the previous one with a single space */
183         if (i && i < IT_MAX_WORD)
184             buf[i++] = *CHR_SPACE;
185         while (map && *map && **map != *CHR_SPACE)
186         {
187             const char *cp = *map;
188
189             if (**map == *CHR_CUT)
190             {
                    /* CHR_CUT discards everything collected so far */
191                 i = 0;
192             }
193             else
194             {
195                 if (i >= IT_MAX_WORD)
196                     break;
197                 while (i < IT_MAX_WORD && *cp)
198                     buf[i++] = *(cp++);
199             }
                /* remember the source position just past the last non-space chunk */
200             last = b;
201             remain = p->term_len  - (b - p->term_buf);
202             if (remain > 0)
203             {
204                 map = zebra_maps_input(zm, &b, remain, 0);
205             }
206             else
207                 map = 0;
208         }
209     }
        /* nothing collected: no snippet */
210     if (!i)
211         return;
212     if (last && start != last && zebra_maps_is_index(zm))
213         zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
214                                start, last - start);
215 }
216
/* Adds snippets for the term of p, split into words.

   The term buffer is fed through zebra_maps_input(). For every iteration
   of the outer loop a run of CHR_SPACE-mapped input is skipped and, if
   non-empty, appended as a snippet with third argument 1. Then one word of
   non-space input is collected and appended with third argument 0, and
   p->seqno is incremented.
   NOTE(review): the third argument presumably flags whitespace; confirm
   against zebra_snippets_appendn.
   Snippets are only appended when zebra_maps_is_index(zm) is true.
   If the map reports zebra_maps_is_first_in_field(), seqno is bumped once
   extra before the first word.

   buf is written but never read here; only the count i is used, to stop
   when a word turns out empty.

   p   - word being extracted; p->extractCtrl->handle must be a
         struct snip_rec_info
   ord - ordinal passed through to zebra_snippets_appendn
   zm  - map used to normalize the input
*/
217 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
218 {
219     struct snip_rec_info *h = p->extractCtrl->handle;
220     const char *b = p->term_buf;
221     int remain = p->term_len;
222     int first = 1;
223     const char **map = 0;
224     const char *start = b;
225     const char *last = b;
226
227     if (remain > 0)
228         map = zebra_maps_input(zm, &b, remain, 0);
229
230     while (map)
231     {
232         char buf[IT_MAX_WORD+1];
            /* note: this remain shadows the function-level remain declared above */
233         int i, remain;
234
235         /* Skip spaces */
236         while (map && *map && **map == *CHR_SPACE)
237         {
238             remain = p->term_len - (b - p->term_buf);
239             last = b;
240             if (remain > 0)
241                 map = zebra_maps_input(zm, &b, remain, 0);
242             else
243                 map = 0;
244         }
245         if (!map)
246             break;
            /* emit the skipped range [start, last) as its own snippet */
247         if (start != last && zebra_maps_is_index(zm))
248         {
249             zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
250                                    start, last - start);
251
252         }
253         start = last;
254
            /* collect one word: consecutive chunks that do not map to space */
255         i = 0;
256         while (map && *map && **map != *CHR_SPACE)
257         {
258             const char *cp = *map;
259
260             while (i < IT_MAX_WORD && *cp)
261                 buf[i++] = *(cp++);
262             remain = p->term_len - (b - p->term_buf);
263             last = b;
264             if (remain > 0)
265                 map = zebra_maps_input(zm, &b, remain, 0);
266             else
267                 map = 0;
268         }
            /* an empty word ends processing of the whole term */
269         if (!i)
270             return;
271
272         if (first)
273         {   
274             first = 0;
275             if (zebra_maps_is_first_in_field(zm))
276             {
277                 /* first in field marker */
278                 p->seqno++;
279             }
280         }
            /* emit the word's source range and advance to the next position */
281         if (start != last && zebra_maps_is_index(zm))
282             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
283                                    start, last - start);
284         start = last;
285         p->seqno++;
286     }
287
288 }
289
290 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
291 {
292     struct snip_rec_info *h = p->extractCtrl->handle;
293
294     const char *res_buf = 0;
295     size_t res_len = 0;
296
297     const char *display_buf = 0;
298     size_t display_len = 0;
299
300     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
301     while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
302                                    &display_buf, &display_len))
303     {
304         if (zebra_maps_is_index(zm))
305             zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
306                                    display_buf, display_len);
307         p->seqno++;
308     }
309 }
310
311 static void snippet_token_add(RecWord *p)
312 {
313     struct snip_rec_info *h = p->extractCtrl->handle;
314     ZebraHandle zh = h->zh;
315     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
316
317     if (zm)
318     {
319         ZebraExplainInfo zei = zh->reg->zei;
320         int ch = zebraExplain_lookup_attr_str(
321             zei, zinfo_index_category_index, p->index_type, p->index_name);
322
323         if (zebra_maps_is_icu(zm))
324             snippet_add_icu(p, ch, zm);
325         else
326         {
327             if (zebra_maps_is_complete(zm))
328                 snippet_add_complete_field(p, ch, zm);
329             else
330                 snippet_add_incomplete_field(p, ch, zm);
331         }
332     }
333 }
334
335 static void snippet_schema_add(
336     struct recExtractCtrl *p, Odr_oid *oid)
337 {
338
339 }
340
341 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
342                      struct ZebraRecStream *stream,
343                      RecType rt, void *recTypeClientData)
344 {
345     struct recExtractCtrl extractCtrl;
346     struct snip_rec_info info;
347     int r;
348
349     extractCtrl.stream = stream;
350     extractCtrl.first_record = 1;
351     extractCtrl.init = extract_init;
352     extractCtrl.tokenAdd = snippet_token_add;
353     extractCtrl.schemaAdd = snippet_schema_add;
354     assert(zh->reg);
355     assert(zh->reg->dh);
356
357     extractCtrl.dh = zh->reg->dh;
358     
359     info.zh = zh;
360     info.snippets = sn;
361     extractCtrl.handle = &info;
362     extractCtrl.match_criteria[0] = '\0';
363     extractCtrl.staticrank = 0;
364     extractCtrl.action = action_insert;
365     
366     init_extractCtrl(zh, &extractCtrl);
367
368     extractCtrl.setStoreData = 0;
369
370     r = (*rt->extract)(recTypeClientData, &extractCtrl);
371
372 }
373
374 static void searchRecordKey(ZebraHandle zh,
375                             zebra_rec_keys_t reckeys,
376                             const char *index_name,
377                             const char **ws, int ws_length)
378 {
379     int i;
380     int ch = -1;
381     zinfo_index_category_t cat = zinfo_index_category_index;
382
383     for (i = 0; i<ws_length; i++)
384         ws[i] = NULL;
385
386     if (ch < 0)
387         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
388     if (ch < 0)
389         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
390     if (ch < 0)
391         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
392
393     if (ch < 0)
394         return ;
395
396     if (zebra_rec_keys_rewind(reckeys))
397     {
398         zint startSeq = -1;
399         const char *str;
400         size_t slen;
401         struct it_key key;
402         zint seqno;
403         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
404         {
405             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
406
407             seqno = key.mem[key.len-1];
408             
409             if (key.mem[0] == ch)
410             {
411                 zint woff;
412                 
413                 if (startSeq == -1)
414                     startSeq = seqno;
415                 woff = seqno - startSeq;
416                 if (woff >= 0 && woff < ws_length)
417                     ws[woff] = str;
418             }
419         }
420     }
421 }
422
423 #define FILE_MATCH_BLANK "\t "
424
425 static char *get_match_from_spec(ZebraHandle zh,
426                           zebra_rec_keys_t reckeys,
427                           const char *fname, const char *spec)
428 {
429     static char dstBuf[2048];      /* static here ??? */
430     char *dst = dstBuf;
431     const char *s = spec;
432
433     while (1)
434     {
435         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
436             ;
437         if (!*s)
438             break;
439         if (*s == '(')
440         {
441             const char *ws[32];
442             char attset_str[64], attname_str[64];
443             int i;
444             int first = 1;
445             
446             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
447                 ;
448             for (i = 0; *s && *s != ',' && *s != ')' && 
449                      !strchr(FILE_MATCH_BLANK, *s); s++)
450                 if (i+1 < sizeof(attset_str))
451                     attset_str[i++] = *s;
452             attset_str[i] = '\0';
453             
454             for (; strchr(FILE_MATCH_BLANK, *s); s++)
455                 ;
456             if (*s != ',')
457                 strcpy(attname_str, attset_str);
458             else
459             {
460                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
461                     ;
462                 for (i = 0; *s && *s != ')' && 
463                          !strchr(FILE_MATCH_BLANK, *s); s++)
464                     if (i+1 < sizeof(attname_str))
465                         attname_str[i++] = *s;
466                 attname_str[i] = '\0';
467             }
468             if (*s != ')')
469             {
470                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
471                       spec, zh->m_group ? zh->m_group : "none");
472                 return NULL;
473             }
474             s++;
475
476             searchRecordKey(zh, reckeys, attname_str, ws, 32);
477             if (0) /* for debugging */
478             {   
479                 for (i = 0; i<32; i++)
480                 {
481                     if (ws[i])
482                     {
483                         WRBUF w = wrbuf_hex_str(ws[i]);
484                         yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
485                         wrbuf_destroy(w);
486                     }
487                 }
488             }
489
490             for (i = 0; i<32; i++)
491                 if (ws[i])
492                 {
493                     if (first)
494                     {
495                         *dst++ = ' ';
496                         first = 0;
497                     }
498                     strcpy(dst, ws[i]);
499                     dst += strlen(ws[i]);
500                 }
501             if (first)
502             {
503                 yaz_log(YLOG_WARN, "Record didn't contain match"
504                       " fields in (%s,%s)", attset_str, attname_str);
505                 return NULL;
506             }
507         }
508         else if (*s == '$')
509         {
510             int spec_len;
511             char special[64];
512             const char *spec_src = NULL;
513             const char *s1 = ++s;
514             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
515                 s1++;
516
517             spec_len = s1 - s;
518             if (spec_len > sizeof(special)-1)
519                 spec_len = sizeof(special)-1;
520             memcpy(special, s, spec_len);
521             special[spec_len] = '\0';
522             s = s1;
523
524             if (!strcmp(special, "group"))
525                 spec_src = zh->m_group;
526             else if (!strcmp(special, "database"))
527                 spec_src = zh->basenames[0];
528             else if (!strcmp(special, "filename")) {
529                 spec_src = fname;
530             }
531             else if (!strcmp(special, "type"))
532                 spec_src = zh->m_record_type;
533             else 
534                 spec_src = NULL;
535             if (spec_src)
536             {
537                 strcpy(dst, spec_src);
538                 dst += strlen(spec_src);
539             }
540         }
541         else if (*s == '\"' || *s == '\'')
542         {
543             int stopMarker = *s++;
544             char tmpString[64];
545             int i = 0;
546
547             while (*s && *s != stopMarker)
548             {
549                 if (i+1 < sizeof(tmpString))
550                     tmpString[i++] = *s++;
551             }
552             if (*s)
553                 s++;
554             tmpString[i] = '\0';
555             strcpy(dst, tmpString);
556             dst += strlen(tmpString);
557         }
558         else
559         {
560             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
561                   spec, zh->m_group ? zh->m_group : "none");
562             return NULL;
563         }
564         *dst++ = 1;
565     }
566     if (dst == dstBuf)
567     {
568         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
569               fname, zh->m_group ? zh->m_group : "none");
570         return NULL;
571     }
572     *dst = '\0';
573
574     if (0) /* for debugging */
575     {
576         WRBUF w = wrbuf_hex_str(dstBuf);
577         yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
578         wrbuf_destroy(w);
579     }
580
581     return dstBuf;
582 }
583
/* Record logging context.
   NOTE(review): not referenced in the visible part of this file; the field
   descriptions below are inferred from the names only - confirm before
   relying on them. */
584 struct recordLogInfo {
585     const char *fname;             /* presumably the name of the file being processed */
586     int recordOffset;              /* presumably the record's offset within that file */
587     struct recordGroup *rGroup;    /* presumably the record group settings in effect */
588 };
589
590 /** \brief add the always-matches index entry and map to real record ID
591     \param ctrl record control
592     \param record_id custom record ID
593     \param sysno system record ID
594     
595     This function serves two purposes.. It adds the always matches
596     entry and makes a pointer from the custom record ID (if defined)
597     back to the system record ID (sysno)
598     See zebra_recid_to_sysno .
599   */
600 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
601                             zint sysno)
602 {
603     RecWord word;
604     extract_init(ctrl, &word);
605     word.record_id = record_id;
606     /* we use the seqno as placeholder for a way to get back to
607        record database from _ALLRECORDS.. This is used if a custom
608        RECORD was defined */
609     word.seqno = sysno;
610     word.index_name = "_ALLRECORDS";
611     word.index_type = "w";
612
613     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
614                               "", 0);
615 }
616
617 /* forward declaration */
618 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
619                                        struct ZebraRecStream *stream,
620                                        enum zebra_recctrl_action_t action,
621                                        const char *recordType,
622                                        zint *sysno,
623                                        const char *match_criteria,
624                                        const char *fname,
625                                        RecType recType,
626                                        void *recTypeClientData);
627
628
629 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
630                              enum zebra_recctrl_action_t action)
631 {
632     ZEBRA_RES r = ZEBRA_OK;
633     int i, fd;
634     char gprefix[128];
635     char ext[128];
636     char ext_res[128];
637     struct file_read_info *fi = 0;
638     const char *original_record_type = 0;
639     RecType recType;
640     void *recTypeClientData;
641     struct ZebraRecStream stream, *streamp;
642
643     zebra_init_log_level();
644
645     if (!zh->m_group || !*zh->m_group)
646         *gprefix = '\0';
647     else
648         sprintf(gprefix, "%s.", zh->m_group);
649     
650     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
651
652     /* determine file extension */
653     *ext = '\0';
654     for (i = strlen(fname); --i >= 0; )
655         if (fname[i] == '/')
656             break;
657         else if (fname[i] == '.')
658         {
659             strcpy(ext, fname+i+1);
660             break;
661         }
662     /* determine file type - depending on extension */
663     original_record_type = zh->m_record_type;
664     if (!zh->m_record_type)
665     {
666         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
667         zh->m_record_type = res_get(zh->res, ext_res);
668     }
669     if (!zh->m_record_type)
670     {
671         check_log_limit(zh);
672         if (zh->records_processed + zh->records_skipped
673             < zh->m_file_verbose_limit)
674             yaz_log(YLOG_LOG, "? %s", fname);
675         zh->records_skipped++;
676         return 0;
677     }
678     /* determine match criteria */
679     if (!zh->m_record_id)
680     {
681         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
682         zh->m_record_id = res_get(zh->res, ext_res);
683     }
684
685     if (!(recType =
686           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
687                           &recTypeClientData)))
688     {
689         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
690         return ZEBRA_FAIL;
691     }
692
693     switch(recType->version)
694     {
695     case 0:
696         break;
697     default:
698         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
699     }
700     if (sysno && (action == action_delete || action == action_a_delete))
701     {
702         streamp = 0;
703         fi = 0;
704     }
705     else
706     {
707         char full_rep[1024];
708
709         if (zh->path_reg && !yaz_is_abspath(fname))
710         {
711             strcpy(full_rep, zh->path_reg);
712             strcat(full_rep, "/");
713             strcat(full_rep, fname);
714         }
715         else
716             strcpy(full_rep, fname);
717         
718         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
719         {
720             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
721             zh->m_record_type = original_record_type;
722             return ZEBRA_FAIL;
723         }
724         streamp = &stream;
725         zebra_create_stream_fd(streamp, fd, 0);
726     }
727     r = zebra_extract_records_stream(zh, streamp,
728                                      action,
729                                      zh->m_record_type,
730                                      sysno,
731                                      0, /*match_criteria */
732                                      fname,
733                                      recType, recTypeClientData);
734     if (streamp)
735         stream.destroy(streamp);
736     zh->m_record_type = original_record_type;
737     return r;
738 }
739
740 /*
741   If sysno is provided, then it's used to identify the record.
742   If not, and match_criteria is provided, then sysno is guessed.
743   If not, and a record is provided, then sysno is taken from there.
744   
745  */
746
747 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
748                                       const char *buf, size_t buf_size,
749                                       enum zebra_recctrl_action_t action,
750                                       const char *recordType,
751                                       zint *sysno,
752                                       const char *match_criteria,
753                                       const char *fname)
754 {
755     struct ZebraRecStream stream;
756     ZEBRA_RES res;
757     void *clientData;
758     RecType recType = 0;
759
760     if (recordType && *recordType)
761     {
762         yaz_log(log_level_extract,
763                 "Record type explicitly specified: %s", recordType);
764         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
765                                   &clientData);
766     } 
767     else
768     {
769         if (!(zh->m_record_type))
770         {
771             yaz_log(YLOG_WARN, "No such record type defined");
772             return ZEBRA_FAIL;
773         }
774         yaz_log(log_level_extract, "Get record type from rgroup: %s",
775                 zh->m_record_type);
776         recType = recType_byName(zh->reg->recTypes, zh->res,
777                                   zh->m_record_type, &clientData);
778         recordType = zh->m_record_type;
779     }
780     
781     if (!recType)
782     {
783         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
784         return ZEBRA_FAIL;
785     }
786
787     zebra_create_stream_mem(&stream, buf, buf_size);
788
789     res = zebra_extract_records_stream(zh, &stream,
790                                        action,
791                                        recordType,
792                                        sysno,
793                                        match_criteria,
794                                        fname,
795                                        recType, clientData);
796     stream.destroy(&stream);
797     return res;
798 }
799
800 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
801                                              struct ZebraRecStream *stream,
802                                              enum zebra_recctrl_action_t action,
803                                              const char *recordType,
804                                              zint *sysno,
805                                              const char *match_criteria,
806                                              const char *fname,
807                                              RecType recType,
808                                              void *recTypeClientData,
809                                              int *more)
810     
811 {
812     zint sysno0 = 0;
813     RecordAttr *recordAttr;
814     struct recExtractCtrl extractCtrl;
815     int r;
816     const char *matchStr = 0;
817     Record rec;
818     off_t start_offset = 0, end_offset = 0;
819     const char *pr_fname = fname;  /* filename to print .. */
820     int show_progress = zh->records_processed + zh->records_skipped 
821         < zh->m_file_verbose_limit ? 1:0;
822
823     zebra_init_log_level();
824
825     if (!pr_fname)
826         pr_fname = "<no file>";  /* make it printable if file is omitted */
827
828     zebra_rec_keys_reset(zh->reg->keys);
829     zebra_rec_keys_reset(zh->reg->sortKeys);
830
831     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
832     {
833         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
834                                       zh->m_explain_database))
835             return ZEBRA_FAIL;
836     }
837
838     if (stream)
839     {
840         off_t null_offset = 0;
841         extractCtrl.stream = stream;
842
843         start_offset = stream->tellf(stream);
844
845         extractCtrl.first_record = start_offset ? 0 : 1;
846         
847         stream->endf(stream, &null_offset);;
848
849         extractCtrl.init = extract_init;
850         extractCtrl.tokenAdd = extract_token_add;
851         extractCtrl.schemaAdd = extract_schema_add;
852         extractCtrl.dh = zh->reg->dh;
853         extractCtrl.handle = zh;
854         extractCtrl.match_criteria[0] = '\0';
855         extractCtrl.staticrank = 0;
856         extractCtrl.action = action;
857
858         init_extractCtrl(zh, &extractCtrl);
859
860         extract_set_store_data_prepare(&extractCtrl);
861         
862         r = (*recType->extract)(recTypeClientData, &extractCtrl);
863
864         if (action == action_update)
865         {
866             action = extractCtrl.action;
867         }
868         
869         switch (r)
870         {
871         case RECCTRL_EXTRACT_EOF:
872             return ZEBRA_FAIL;
873         case RECCTRL_EXTRACT_ERROR_GENERIC:
874             /* error occured during extraction ... */
875             yaz_log(YLOG_WARN, "extract error: generic");
876             return ZEBRA_FAIL;
877         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
878             /* error occured during extraction ... */
879             yaz_log(YLOG_WARN, "extract error: no such filter");
880             return ZEBRA_FAIL;
881         case RECCTRL_EXTRACT_SKIP:
882             if (show_progress)
883                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
884                          recordType, pr_fname, (zint) start_offset);
885             *more = 1;
886             
887             end_offset = stream->endf(stream, 0);
888             if (end_offset)
889                 stream->seekf(stream, end_offset);
890
891             return ZEBRA_OK;
892         case RECCTRL_EXTRACT_OK:
893             break;
894         default:
895             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
896             return ZEBRA_FAIL;
897         }
898         end_offset = stream->endf(stream, 0);
899         if (end_offset)
900             stream->seekf(stream, end_offset);
901         else
902             end_offset = stream->tellf(stream);
903
904         if (extractCtrl.match_criteria[0])
905             match_criteria = extractCtrl.match_criteria;
906     }
907
908     *more = 1;
909
910     if (zh->m_flag_rw == 0)
911     {
912         yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
913                 pr_fname, (zint) start_offset);
914         /* test mode .. Do not perform match */
915         return ZEBRA_OK;
916     }
917         
918     if (!sysno)
919     {
920         sysno = &sysno0;
921         
922         if (match_criteria && *match_criteria)
923             matchStr = match_criteria;
924         else
925         {
926             if (zh->m_record_id && *zh->m_record_id)
927             {
928                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
929                                                zh->m_record_id);
930                 if (!matchStr)
931                 {
932                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
933                              pr_fname, (zint) start_offset);
934                     return ZEBRA_FAIL;
935                 }
936                 if (0 && matchStr)
937                 {
938                     WRBUF w = wrbuf_alloc();
939                     size_t i;
940                     for (i = 0; i < strlen(matchStr); i++)
941                     {
942                         wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
943                     }
944                     yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
945                     wrbuf_destroy(w);
946                 }
947             }
948         }
949         if (matchStr) 
950         {
951             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
952             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
953                                           matchStr);
954
955             
956             if (log_level_extract)
957             {
958                 WRBUF w = wrbuf_hex_str(matchStr);
959                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
960                 wrbuf_destroy(w);
961             }
962             if (rinfo)
963             {
964                 assert(*rinfo == sizeof(*sysno));
965                 memcpy(sysno, rinfo+1, sizeof(*sysno));
966             }
967        }
968     }
969
970     if (! *sysno)
971     {
972         /* new record AKA does not exist already */
973         if (action == action_delete)
974         {
975             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
976                     pr_fname, (zint) start_offset);
977             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
978             return ZEBRA_FAIL;
979         }
980         else if (action == action_a_delete)
981         {
982             if (show_progress)
983                 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
984                         pr_fname, (zint) start_offset);
985             return ZEBRA_OK;
986         }
987         else if (action == action_replace)
988         {
989             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
990                          pr_fname, (zint) start_offset);
991             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
992             return ZEBRA_FAIL;
993         }
994         if (show_progress)
995             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
996                      (zint) start_offset);
997         rec = rec_new(zh->reg->records);
998
999         *sysno = rec->sysno;
1000
1001
1002         if (stream)
1003         {
1004             all_matches_add(&extractCtrl,
1005                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1006                             *sysno);
1007         }
1008
1009
1010         recordAttr = rec_init_attr(zh->reg->zei, rec);
1011         if (extractCtrl.staticrank < 0)
1012         {
1013             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1014             extractCtrl.staticrank = 0;
1015         }
1016
1017         if (matchStr)
1018         {
1019             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1020             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1021                             sizeof(*sysno), sysno);
1022         }
1023
1024         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1025 #if FLUSH2
1026         extract_flush_record_keys2(zh, *sysno,
1027                                    zh->reg->keys, extractCtrl.staticrank,
1028                                    0, recordAttr->staticrank);
1029 #else
1030         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1031                                   extractCtrl.staticrank);
1032 #endif
1033         recordAttr->staticrank = extractCtrl.staticrank;
1034         zh->records_inserted++;
1035     } 
1036     else
1037     {
1038         /* record already exists */
1039         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1040         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1041         if (action == action_insert)
1042         {
1043             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1044                          recordType, pr_fname, (zint) start_offset);
1045             logRecord(zh);
1046             return ZEBRA_FAIL;
1047         }
1048
1049         rec = rec_get(zh->reg->records, *sysno);
1050         assert(rec);
1051
1052         if (stream)
1053         {
1054             all_matches_add(&extractCtrl,
1055                             zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1056                             *sysno);
1057         }
1058         
1059         recordAttr = rec_init_attr(zh->reg->zei, rec);
1060
1061         /* decrease total size */
1062         zebraExplain_recordBytesIncrement(zh->reg->zei,
1063                                            - recordAttr->recordSize);
1064
1065         zebra_rec_keys_set_buf(delkeys,
1066                                rec->info[recInfo_delKeys],
1067                                rec->size[recInfo_delKeys],
1068                                0);
1069         zebra_rec_keys_set_buf(sortKeys,
1070                                rec->info[recInfo_sortKeys],
1071                                rec->size[recInfo_sortKeys],
1072                                0);
1073
1074         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1075 #if !FLUSH2
1076         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1077                                   recordAttr->staticrank);
1078 #endif
1079         if (action == action_delete || action == action_a_delete)
1080         {
1081             /* record going to be deleted */
1082 #if FLUSH2
1083             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1084                                        delkeys, recordAttr->staticrank);
1085 #endif       
1086             if (zebra_rec_keys_empty(delkeys))
1087             {
1088                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1089                         pr_fname, (zint) start_offset);
1090                 yaz_log(YLOG_WARN, "cannot delete file above, "
1091                         "storeKeys false (3)");
1092             }
1093             else
1094             {
1095                 if (show_progress)
1096                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1097                             pr_fname, (zint) start_offset);
1098                 zh->records_deleted++;
1099                 if (matchStr)
1100                 {
1101                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1102                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1103                 }
1104                 rec_del(zh->reg->records, &rec);
1105             }
1106             zebra_rec_keys_close(delkeys);
1107             zebra_rec_keys_close(sortKeys);
1108             rec_free(&rec);
1109             logRecord(zh);
1110             return ZEBRA_OK;
1111         }
1112         else
1113         {   /* update or special_update */
1114             if (show_progress)
1115                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1116                         pr_fname, (zint) start_offset);
1117             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1118
1119 #if FLUSH2
1120             extract_flush_record_keys2(zh, *sysno,
1121                                        zh->reg->keys, extractCtrl.staticrank,
1122                                        delkeys, recordAttr->staticrank);
1123 #else
1124             extract_flush_record_keys(zh, *sysno, 1, 
1125                                       zh->reg->keys, extractCtrl.staticrank);
1126 #endif
1127             recordAttr->staticrank = extractCtrl.staticrank;
1128             zh->records_updated++;
1129         }
1130         zebra_rec_keys_close(delkeys);
1131         zebra_rec_keys_close(sortKeys);
1132     }
1133     /* update file type */
1134     xfree(rec->info[recInfo_fileType]);
1135     rec->info[recInfo_fileType] =
1136         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1137
1138     /* update filename */
1139     xfree(rec->info[recInfo_filename]);
1140     rec->info[recInfo_filename] =
1141         rec_strdup(fname, &rec->size[recInfo_filename]);
1142
1143     /* update delete keys */
1144     xfree(rec->info[recInfo_delKeys]);
1145     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1146     {
1147         zebra_rec_keys_get_buf(zh->reg->keys,
1148                                &rec->info[recInfo_delKeys],
1149                                &rec->size[recInfo_delKeys]);
1150     }
1151     else
1152     {
1153         rec->info[recInfo_delKeys] = NULL;
1154         rec->size[recInfo_delKeys] = 0;
1155     }
1156     /* update sort keys */
1157     xfree(rec->info[recInfo_sortKeys]);
1158
1159     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1160                            &rec->info[recInfo_sortKeys],
1161                            &rec->size[recInfo_sortKeys]);
1162
1163     if (stream)
1164     {
1165         recordAttr->recordSize = end_offset - start_offset;
1166         zebraExplain_recordBytesIncrement(zh->reg->zei,
1167                                           recordAttr->recordSize);
1168     }
1169
1170     /* set run-number for this record */
1171     recordAttr->runNumber =
1172         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1173
1174     /* update store data */
1175     xfree(rec->info[recInfo_storeData]);
1176
1177     /* update store data */
1178     if (zh->store_data_buf)
1179     {
1180         rec->size[recInfo_storeData] = zh->store_data_size;
1181         rec->info[recInfo_storeData] = zh->store_data_buf;
1182         zh->store_data_buf = 0;
1183         recordAttr->recordSize = zh->store_data_size;
1184     }
1185     else if (zh->m_store_data)
1186     {
1187         off_t cur_offset = stream->tellf(stream);
1188
1189         rec->size[recInfo_storeData] = recordAttr->recordSize;
1190         rec->info[recInfo_storeData] = (char *)
1191             xmalloc(recordAttr->recordSize);
1192         stream->seekf(stream, start_offset);
1193         stream->readf(stream, rec->info[recInfo_storeData],
1194                       recordAttr->recordSize);
1195         stream->seekf(stream, cur_offset);
1196     }
1197     else
1198     {
1199         rec->info[recInfo_storeData] = NULL;
1200         rec->size[recInfo_storeData] = 0;
1201     }
1202     /* update database name */
1203     xfree(rec->info[recInfo_databaseName]);
1204     rec->info[recInfo_databaseName] =
1205         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1206
1207     /* update offset */
1208     recordAttr->recordOffset = start_offset;
1209     
1210     /* commit this record */
1211     rec_put(zh->reg->records, &rec);
1212     logRecord(zh);
1213     return ZEBRA_OK;
1214 }
1215
1216 /** \brief extracts records from stream
1217     \param zh Zebra Handle
1218     \param stream stream that we read from
1219     \param action (action_insert, action_replace, action_delete, ..)
1220     \param recordType Record filter type "grs.xml", etc.
1221     \param sysno pointer to sysno if already known; NULL otherwise
1222     \param match_criteria (NULL if not already given)
1223     \param fname filename that we read from (for logging purposes only)
1224     \param recType record type
1225     \param recTypeClientData client data for record type
1226     \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
1227 */
1228 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
1229                                        struct ZebraRecStream *stream,
1230                                        enum zebra_recctrl_action_t action,
1231                                        const char *recordType,
1232                                        zint *sysno,
1233                                        const char *match_criteria,
1234                                        const char *fname,
1235                                        RecType recType,
1236                                        void *recTypeClientData)
1237 {
1238     ZEBRA_RES res = ZEBRA_OK;
1239     while (1)
1240     {
1241         int more = 0;
1242         res = zebra_extract_record_stream(zh, stream,
1243                                           action,
1244                                           recordType,
1245                                           sysno,
1246                                           match_criteria,
1247                                           fname,
1248                                           recType, recTypeClientData, &more);
1249         if (!more)
1250         {
1251             res = ZEBRA_OK;
1252             break;
1253         }
1254         if (res != ZEBRA_OK)
1255             break;
1256         if (sysno)
1257             break;
1258     }
1259     return res;
1260 }
1261
1262 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1263 {
1264     ZebraHandle zh = (ZebraHandle) handle;
1265     struct recExtractCtrl extractCtrl;
1266
1267     if (zebraExplain_curDatabase(zh->reg->zei,
1268                                   rec->info[recInfo_databaseName]))
1269     {
1270         abort();
1271         if (zebraExplain_newDatabase(zh->reg->zei,
1272                                       rec->info[recInfo_databaseName], 0))
1273             abort();
1274     }
1275
1276     zebra_rec_keys_reset(zh->reg->keys);
1277     zebra_rec_keys_reset(zh->reg->sortKeys);
1278
1279     extractCtrl.init = extract_init;
1280     extractCtrl.tokenAdd = extract_token_add;
1281     extractCtrl.schemaAdd = extract_schema_add;
1282     extractCtrl.dh = zh->reg->dh;
1283
1284     init_extractCtrl(zh, &extractCtrl);
1285
1286     extractCtrl.flagShowRecords = 0;
1287     extractCtrl.match_criteria[0] = '\0';
1288     extractCtrl.staticrank = 0;
1289     extractCtrl.action = action_update;
1290
1291     extractCtrl.handle = handle;
1292     extractCtrl.first_record = 1;
1293     
1294     extract_set_store_data_prepare(&extractCtrl);
1295
1296     if (n)
1297         grs_extract_tree(&extractCtrl, n);
1298
1299     if (rec->size[recInfo_delKeys])
1300     {
1301         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1302         
1303         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1304
1305         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1306                                rec->size[recInfo_delKeys],
1307                                0);
1308 #if FLUSH2
1309         extract_flush_record_keys2(zh, rec->sysno, 
1310                                    zh->reg->keys, 0, delkeys, 0);
1311 #else
1312         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1313         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1314 #endif
1315         zebra_rec_keys_close(delkeys);
1316
1317         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1318                                rec->size[recInfo_sortKeys],
1319                                0);
1320
1321         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1322         zebra_rec_keys_close(sortkeys);
1323     }
1324     else
1325     {
1326 #if FLUSH2
1327         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1328 #else
1329         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1330 #endif
1331     }
1332     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1333     
1334     xfree(rec->info[recInfo_delKeys]);
1335     zebra_rec_keys_get_buf(zh->reg->keys,
1336                            &rec->info[recInfo_delKeys], 
1337                            &rec->size[recInfo_delKeys]);
1338
1339     xfree(rec->info[recInfo_sortKeys]);
1340     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1341                            &rec->info[recInfo_sortKeys],
1342                            &rec->size[recInfo_sortKeys]);
1343     return ZEBRA_OK;
1344 }
1345
1346 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1347                            const char *str, size_t slen, NMEM nmem, int level)
1348 {
1349     char keystr[200]; /* room for zints to print */
1350     char *dst_term = 0;
1351     int ord = CAST_ZINT_TO_INT(key->mem[0]);
1352     const char *index_type;
1353     int i;
1354     const char *string_index;
1355     
1356     zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1357                             0/* db */, &string_index);
1358     assert(index_type);
1359     zebra_term_untrans_iconv(zh, nmem, index_type,
1360                              &dst_term, str);
1361     *keystr = '\0';
1362     for (i = 0; i < key->len; i++)
1363     {
1364         sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1365     }
1366     
1367     if (*str < CHR_BASE_CHAR)
1368     {
1369         int i;
1370         char dst_buf[200]; /* room for special chars */
1371         
1372         strcpy(dst_buf , "?");
1373         
1374         if (!strcmp(str, ""))
1375             strcpy(dst_buf, "alwaysmatches");
1376         if (!strcmp(str, FIRST_IN_FIELD_STR))
1377             strcpy(dst_buf, "firstinfield");
1378         else if (!strcmp(str, CHR_UNKNOWN))
1379             strcpy(dst_buf, "unknown");
1380         else if (!strcmp(str, CHR_SPACE))
1381             strcpy(dst_buf, "space");
1382         
1383         for (i = 0; i<slen; i++)
1384         {
1385             sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1386         }
1387         yaz_log(level, "%s%s %s %s", keystr, index_type,
1388                 string_index, dst_buf);
1389         
1390     }
1391     else
1392         yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1393                 string_index, dst_term);
1394 }
1395
1396 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1397                           zebra_rec_keys_t reckeys,
1398                           int level)
1399 {
1400     if (zebra_rec_keys_rewind(reckeys))
1401     {
1402         size_t slen;
1403         const char *str;
1404         struct it_key key;
1405         NMEM nmem = nmem_create();
1406
1407         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1408         {
1409             zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
1410             nmem_reset(nmem);
1411         }
1412         nmem_destroy(nmem);
1413     }
1414 }
1415
1416 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1417                              zebra_rec_keys_t reckeys)
1418 {
1419     ZebraExplainInfo zei = zh->reg->zei;
1420     struct ord_stat {
1421         int no;
1422         int ord;
1423         struct ord_stat *next;
1424     };
1425
1426     if (zebra_rec_keys_rewind(reckeys))
1427     {
1428         struct ord_stat *ord_list = 0;
1429         struct ord_stat *p;
1430         size_t slen;
1431         const char *str;
1432         struct it_key key_in;
1433         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1434         {
1435             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1436
1437             for (p = ord_list; p ; p = p->next)
1438                 if (p->ord == ord)
1439                 {
1440                     p->no++;
1441                     break;
1442                 }
1443             if (!p)
1444             {
1445                 p = xmalloc(sizeof(*p));
1446                 p->no = 1;
1447                 p->ord = ord;
1448                 p->next = ord_list;
1449                 ord_list = p;
1450             }
1451         }
1452
1453         p = ord_list;
1454         while (p)
1455         {
1456             struct ord_stat *p1 = p;
1457
1458             if (is_insert)
1459                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1460             else
1461                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1462             p = p->next;
1463             xfree(p1);
1464         }
1465     }
1466 }
1467
1468 #if FLUSH2
1469 static void extract_flush_record_keys2(
1470     ZebraHandle zh, zint sysno,
1471     zebra_rec_keys_t ins_keys, zint ins_rank,
1472     zebra_rec_keys_t del_keys, zint del_rank)
1473 {
1474     ZebraExplainInfo zei = zh->reg->zei;
1475     int normal = 0;
1476     int optimized = 0;
1477
1478     if (!zh->reg->key_block)
1479     {
1480         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1481         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1482         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1483         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1484     }
1485
1486     if (ins_keys)
1487     {
1488         extract_rec_keys_adjust(zh, 1, ins_keys);
1489         if (!del_keys)
1490             zebraExplain_recordCountIncrement(zei, 1);
1491         zebra_rec_keys_rewind(ins_keys);
1492     }
1493     if (del_keys)
1494     {
1495         extract_rec_keys_adjust(zh, 0, del_keys);
1496         if (!ins_keys)
1497             zebraExplain_recordCountIncrement(zei, -1);
1498         zebra_rec_keys_rewind(del_keys);
1499     }
1500
1501     while (1)
1502     {
1503         size_t del_slen;
1504         const char *del_str;
1505         struct it_key del_key_in;
1506         int del = 0;
1507
1508         size_t ins_slen;
1509         const char *ins_str;
1510         struct it_key ins_key_in;
1511         int ins = 0;
1512
1513         if (del_keys)
1514             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1515                                       &del_key_in);
1516         if (ins_keys)
1517             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1518                                       &ins_key_in);
1519
1520         if (del && ins && ins_rank == del_rank
1521             && !key_compare(&del_key_in, &ins_key_in) 
1522             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1523         {
1524             optimized++;
1525             continue;
1526         }
1527         if (!del && !ins)
1528             break;
1529         
1530         normal++;
1531         if (del)
1532             key_block_write(zh->reg->key_block, sysno, 
1533                             &del_key_in, 0, del_str, del_slen,
1534                             del_rank, zh->m_staticrank);
1535         if (ins)
1536             key_block_write(zh->reg->key_block, sysno, 
1537                             &ins_key_in, 1, ins_str, ins_slen,
1538                             ins_rank, zh->m_staticrank);
1539     }
1540     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1541 }
1542 #else
1543 static void extract_flush_record_keys(
1544     ZebraHandle zh, zint sysno, int cmd,
1545     zebra_rec_keys_t reckeys,
1546     zint staticrank)
1547 {
1548     ZebraExplainInfo zei = zh->reg->zei;
1549
1550     extract_rec_keys_adjust(zh, cmd, reckeys);
1551
1552     if (log_level_details)
1553     {
1554         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1555                 sysno, cmd ? "insert" : "delete");
1556         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1557     }
1558
1559     if (!zh->reg->key_block)
1560     {
1561         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1562         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1563         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1564         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1565     }
1566     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1567
1568 #if 0
1569     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1570     print_rec_keys(zh, reckeys);
1571 #endif
1572     if (zebra_rec_keys_rewind(reckeys))
1573     {
1574         size_t slen;
1575         const char *str;
1576         struct it_key key_in;
1577         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1578         {
1579             key_block_write(zh->reg->key_block, sysno, 
1580                             &key_in, cmd, str, slen,
1581                             staticrank, zh->m_staticrank);
1582         }
1583     }
1584 }
1585 #endif
1586
1587 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1588                                      zebra_rec_keys_t reckeys,
1589                                      zebra_snippets *snippets)
1590 {
1591     NMEM nmem = nmem_create();
1592     if (zebra_rec_keys_rewind(reckeys)) 
1593     {
1594         const char *str;
1595         size_t slen;
1596         struct it_key key;
1597         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1598         {
1599             char *dst_term = 0;
1600             int ord;
1601             zint seqno;
1602             const char *index_type;
1603
1604             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1605             seqno = key.mem[key.len-1];
1606             ord = CAST_ZINT_TO_INT(key.mem[0]);
1607             
1608             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1609                                     0/* db */, 0 /* string_index */);
1610             assert(index_type);
1611             zebra_term_untrans_iconv(zh, nmem, index_type,
1612                                      &dst_term, str);
1613             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1614             nmem_reset(nmem);
1615         }
1616     }
1617     nmem_destroy(nmem);
1618     return ZEBRA_OK;
1619 }
1620
1621 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1622 {
1623     yaz_log(YLOG_LOG, "print_rec_keys");
1624     if (zebra_rec_keys_rewind(reckeys))
1625     {
1626         const char *str;
1627         size_t slen;
1628         struct it_key key;
1629         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1630         {
1631             char dst_buf[IT_MAX_WORD];
1632             zint seqno;
1633             const char *index_type;
1634             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1635             const char *db = 0;
1636             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1637
1638             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1639             
1640             seqno = key.mem[key.len-1];
1641             
1642             zebra_term_untrans(zh, index_type, dst_buf, str);
1643             
1644             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1645                     " term=%s", ord, seqno, dst_buf); 
1646         }
1647     }
1648 }
1649
1650 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1651                                      const char *str, int length)
1652 {
1653     struct it_key key;
1654     ZebraHandle zh = p->extractCtrl->handle;
1655     ZebraExplainInfo zei = zh->reg->zei;
1656     int ch, i;
1657
1658     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1659     if (ch < 0)
1660         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1661
1662     i = 0;
1663     key.mem[i++] = ch;
1664     key.mem[i++] = p->record_id;
1665     key.mem[i++] = p->section_id;
1666
1667     if (zh->m_segment_indexing)
1668         key.mem[i++] = p->segment;
1669     key.mem[i++] = p->seqno;
1670     key.len = i;
1671
1672     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1673 }
1674
1675 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1676 {
1677     struct it_key key;
1678     ZebraHandle zh = p->extractCtrl->handle;
1679     ZebraExplainInfo zei = zh->reg->zei;
1680     int ch;
1681     zinfo_index_category_t cat = zinfo_index_category_sort;
1682
1683     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1684     if (ch < 0)
1685         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1686     key.len = 3;
1687     key.mem[0] = ch;
1688     key.mem[1] = p->record_id;
1689     key.mem[2] = p->section_id;
1690
1691     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1692 }
1693
1694 static void extract_add_staticrank_string(RecWord *p,
1695                                           const char *str, int length)
1696 {
1697     char valz[40];
1698     struct recExtractCtrl *ctrl = p->extractCtrl;
1699
1700     if (length > sizeof(valz)-1)
1701         length = sizeof(valz)-1;
1702
1703     memcpy(valz, str, length);
1704     valz[length] = '\0';
1705     ctrl->staticrank = atozint(valz);
1706 }
1707
1708 static void extract_add_string(RecWord *p, zebra_map_t zm,
1709                                const char *string, int length)
1710 {
1711     assert(length > 0);
1712
1713     if (!p->index_name)
1714         return;
1715     if (log_level_details)
1716     {
1717
1718         WRBUF w = wrbuf_alloc();
1719         
1720         wrbuf_write_escaped(w, string, length);
1721         yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1722         wrbuf_destroy(w);
1723     }
1724     if (zebra_maps_is_index(zm))
1725     {
1726         extract_add_index_string(p, zinfo_index_category_index,
1727                                  string, length);
1728         if (zebra_maps_is_alwaysmatches(zm))
1729         {
1730             RecWord word;
1731             memcpy(&word, p, sizeof(word));
1732
1733             word.seqno = 1;
1734             extract_add_index_string(
1735                 &word, zinfo_index_category_alwaysmatches, "", 0);
1736         }
1737     }
1738     else if (zebra_maps_is_sort(zm))
1739     {
1740         extract_add_sort_string(p, string, length);
1741     }
1742     else if (zebra_maps_is_staticrank(zm))
1743     {
1744         extract_add_staticrank_string(p, string, length);
1745     }
1746 }
1747
1748 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1749 {
1750     const char *b = p->term_buf;
1751     int remain = p->term_len;
1752     int first = 1;
1753     const char **map = 0;
1754     
1755     if (remain > 0)
1756         map = zebra_maps_input(zm, &b, remain, 0);
1757
1758     while (map)
1759     {
1760         char buf[IT_MAX_WORD+1];
1761         int i, remain;
1762
1763         /* Skip spaces */
1764         while (map && *map && **map == *CHR_SPACE)
1765         {
1766             remain = p->term_len - (b - p->term_buf);
1767             if (remain > 0)
1768                 map = zebra_maps_input(zm, &b, remain, 0);
1769             else
1770                 map = 0;
1771         }
1772         if (!map)
1773             break;
1774         i = 0;
1775         while (map && *map && **map != *CHR_SPACE)
1776         {
1777             const char *cp = *map;
1778
1779             while (i < IT_MAX_WORD && *cp)
1780                 buf[i++] = *(cp++);
1781             remain = p->term_len - (b - p->term_buf);
1782             if (remain > 0)
1783                 map = zebra_maps_input(zm, &b, remain, 0);
1784             else
1785                 map = 0;
1786         }
1787         if (!i)
1788             return;
1789
1790         if (first)
1791         {   
1792             first = 0;
1793             if (zebra_maps_is_first_in_field(zm))
1794             {
1795                 /* first in field marker */
1796                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1797                 p->seqno++;
1798             }
1799         }
1800         extract_add_string(p, zm, buf, i);
1801         p->seqno++;
1802     }
1803 }
1804
1805 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1806 {
1807     const char *b = p->term_buf;
1808     char buf[IT_MAX_WORD+1];
1809     const char **map = 0;
1810     int i = 0, remain = p->term_len;
1811
1812     if (remain > 0)
1813         map = zebra_maps_input(zm, &b, remain, 1);
1814
1815     while (remain > 0 && i < IT_MAX_WORD)
1816     {
1817         while (map && *map && **map == *CHR_SPACE)
1818         {
1819             remain = p->term_len - (b - p->term_buf);
1820
1821             if (remain > 0)
1822             {
1823                 int first = i ? 0 : 1;  /* first position */
1824                 map = zebra_maps_input(zm, &b, remain, first);
1825             }
1826             else
1827                 map = 0;
1828         }
1829         if (!map)
1830             break;
1831
1832         if (i && i < IT_MAX_WORD)
1833             buf[i++] = *CHR_SPACE;
1834         while (map && *map && **map != *CHR_SPACE)
1835         {
1836             const char *cp = *map;
1837
1838             if (**map == *CHR_CUT)
1839             {
1840                 i = 0;
1841             }
1842             else
1843             {
1844                 if (i >= IT_MAX_WORD)
1845                     break;
1846                 while (i < IT_MAX_WORD && *cp)
1847                     buf[i++] = *(cp++);
1848             }
1849             remain = p->term_len  - (b - p->term_buf);
1850             if (remain > 0)
1851             {
1852                 map = zebra_maps_input(zm, &b, remain, 0);
1853             }
1854             else
1855                 map = 0;
1856         }
1857     }
1858     if (!i)
1859         return;
1860     extract_add_string(p, zm, buf, i);
1861 }
1862
1863 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1864 {
1865     const char *res_buf = 0;
1866     size_t res_len = 0;
1867
1868     zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
1869     while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1870     {
1871         extract_add_string(p, zm, res_buf, res_len);
1872         p->seqno++;
1873     }
1874 }
1875
1876
1877 /** \brief top-level indexing handler for recctrl system
1878     \param p token data to be indexed
1879
1880     Call sequence:
1881     extract_token_add
1882     extract_add_{in}_complete / extract_add_icu
1883     extract_add_string
1884     
1885     extract_add_index_string
1886     or
1887     extract_add_sort_string
1888     or
1889     extract_add_staticrank_string
1890     
1891 */
1892 static void extract_token_add(RecWord *p)
1893 {
1894     ZebraHandle zh = p->extractCtrl->handle;
1895     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
1896     WRBUF wrbuf;
1897
1898     if (log_level_details)
1899     {
1900         yaz_log(log_level_details, "extract_token_add "
1901                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1902                 p->index_type, p->index_name, 
1903                 p->seqno, p->term_len, p->term_buf);
1904     }
1905     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1906     {
1907         p->term_buf = wrbuf_buf(wrbuf);
1908         p->term_len = wrbuf_len(wrbuf);
1909     }
1910     if (zebra_maps_is_icu(zm))
1911     {
1912         extract_add_icu(p, zm);
1913     }
1914     else
1915     {
1916         if (zebra_maps_is_complete(zm))
1917             extract_add_complete_field(p, zm);
1918         else
1919             extract_add_incomplete_field(p, zm);
1920     }
1921 }
1922
1923 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1924                                       void *buf, size_t sz)
1925 {
1926     ZebraHandle zh = (ZebraHandle) p->handle;
1927
1928     xfree(zh->store_data_buf);
1929     zh->store_data_buf = 0;
1930     zh->store_data_size = 0;
1931     if (buf && sz)
1932     {
1933         zh->store_data_buf = xmalloc(sz);
1934         zh->store_data_size = sz;
1935         memcpy(zh->store_data_buf, buf, sz);
1936     }
1937 }
1938
1939 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1940 {
1941     ZebraHandle zh = (ZebraHandle) p->handle;
1942     xfree(zh->store_data_buf);
1943     zh->store_data_buf = 0;
1944     zh->store_data_size = 0;
1945     p->setStoreData = extract_set_store_data_cb;
1946 }
1947
1948 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1949 {
1950     ZebraHandle zh = (ZebraHandle) p->handle;
1951     zebraExplain_addSchema(zh->reg->zei, oid);
1952 }
1953
1954 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1955                              int cmd, zebra_rec_keys_t reckeys)
1956 {
1957 #if 0
1958     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1959             cmd, sysno);
1960     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1961 #endif
1962
1963     if (zebra_rec_keys_rewind(reckeys))
1964     {
1965         zebra_sort_index_t si = zh->reg->sort_index;
1966         size_t slen;
1967         const char *str;
1968         struct it_key key_in;
1969
1970         NMEM nmem = nmem_create();
1971         struct sort_add_ent {
1972             int ord;
1973             int cmd;
1974             struct sort_add_ent *next;
1975             WRBUF wrbuf;
1976             zint sysno;
1977             zint section_id;
1978         };
1979         struct sort_add_ent *sort_ent_list = 0;
1980
1981         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1982         {
1983             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1984             zint filter_sysno = key_in.mem[1];
1985             zint section_id = key_in.mem[2];
1986
1987             struct sort_add_ent **e = &sort_ent_list;
1988             for (; *e; e = &(*e)->next)
1989                 if ((*e)->ord == ord && section_id == (*e)->section_id)
1990                     break;
1991             if (!*e)
1992             {
1993                 *e = nmem_malloc(nmem, sizeof(**e));
1994                 (*e)->next = 0;
1995                 (*e)->wrbuf = wrbuf_alloc();
1996                 (*e)->ord = ord;
1997                 (*e)->cmd = cmd;
1998                 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1999                 (*e)->section_id = section_id;
2000             }
2001             
2002             wrbuf_write((*e)->wrbuf, str, slen);
2003             wrbuf_putc((*e)->wrbuf, '\0');
2004         }
2005         if (sort_ent_list)
2006         {
2007             zint last_sysno = 0;
2008             struct sort_add_ent *e = sort_ent_list;
2009             for (; e; e = e->next)
2010             {
2011                 if (last_sysno != e->sysno)
2012                 {
2013                     zebra_sort_sysno(si, e->sysno);
2014                     last_sysno = e->sysno;
2015                 }
2016                 zebra_sort_type(si, e->ord);
2017                 if (e->cmd == 1)
2018                     zebra_sort_add(si, e->section_id, e->wrbuf);
2019                 else
2020                     zebra_sort_delete(si, e->section_id);
2021                 wrbuf_destroy(e->wrbuf);
2022             }
2023         }
2024         nmem_destroy(nmem);
2025     }
2026 }
2027
2028 /*
2029  * Local variables:
2030  * c-basic-offset: 4
2031  * c-file-style: "Stroustrup"
2032  * indent-tabs-mode: nil
2033  * End:
2034  * vim: shiftwidth=4 tabstop=8 expandtab
2035  */
2036