66eaff1f8ee1bfe9cf9271a4a33d41399858eb12
[idzebra-moved-to-github.git] / index / extract.c
1 /* $Id: extract.c,v 1.266 2007-10-30 19:17:15 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 /** \file
24     \brief indexes records and extract tokens for indexing and sorting
25 */
26
27 #include <stdio.h>
28 #include <assert.h>
29 #include <ctype.h>
30 #ifdef WIN32
31 #include <io.h>
32 #endif
33 #if HAVE_UNISTD_H
34 #include <unistd.h>
35 #endif
36 #include <fcntl.h>
37
38
39 #include "index.h"
40 #include "orddict.h"
41 #include <direntz.h>
42 #include <charmap.h>
43 #include <yaz/snprintf.h>
44
45 static int log_level_extract = 0;
46 static int log_level_details = 0;
47 static int log_level_initialized = 0;
48
49 /* 1 if we eliminate identical delete/insert keys */
50 /* eventually the 0-case code will be removed */
51 #define FLUSH2 1
52
53 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
54                                 zebra_rec_keys_t ins_keys,
55                                 zint ins_rank,
56                                 zebra_rec_keys_t del_keys,
57                                 zint del_rank);
58
59 static void zebra_init_log_level(void)
60 {
61     if (!log_level_initialized)
62     {
63         log_level_initialized = 1;
64
65         log_level_extract = yaz_log_module_level("extract");
66         log_level_details = yaz_log_module_level("indexdetails");
67     }
68 }
69
70 static void extract_flush_record_keys(ZebraHandle zh, zint sysno,
71                                       int cmd, zebra_rec_keys_t reckeys,
72                                       zint staticrank);
73 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
74                                     int cmd, zebra_rec_keys_t skp);
75 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
76 static void extract_token_add(RecWord *p);
77 static void extract_token_add2(RecWord *p);
78
79 static void check_log_limit(ZebraHandle zh)
80 {
81     if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
82     {
83         yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
84                 zh->m_file_verbose_limit);
85     }
86 }
87
88 static void logRecord(ZebraHandle zh)
89 {
90     check_log_limit(zh);
91     ++zh->records_processed;
92     if (!(zh->records_processed % 1000))
93     {
94         yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
95                 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, 
96                 zh->records_processed, zh->records_inserted, 
97                 zh->records_updated, zh->records_deleted);
98     }
99 }
100
101 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
102 {
103     int i;
104     for (i = 0; i<256; i++)
105     {
106         zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, i);
107         if (zebra_maps_is_positioned(zm))
108             ctrl->seqno[i] = 1;
109         else
110             ctrl->seqno[i] = 0;
111     }
112     ctrl->flagShowRecords = !zh->m_flag_rw;
113 }
114
115
116 static void extract_add_index_string(RecWord *p, 
117                                       zinfo_index_category_t cat,
118                                       const char *str, int length);
119
120 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
121
122 static void extract_init(struct recExtractCtrl *p, RecWord *w)
123 {
124     w->seqno = 1;
125     w->index_name = "any";
126     w->index_type = "w";
127     w->extractCtrl = p;
128     w->record_id = 0;
129     w->section_id = 0;
130     w->segment = 0;
131 }
132
/* Context handed to the snippet token callbacks through
   recExtractCtrl.handle (set up by extract_snippet). */
struct snip_rec_info {
    ZebraHandle zh;             /* handle used to reach maps and explain info */
    zebra_snippets *snippets;   /* collection that snippets are appended to */
};
137
138
/* Add one snippet covering a "complete" field.

   The term in p->term_buf / p->term_len is fed through zebra_maps_input()
   piece by piece. Leading space-mapped input is skipped, and the span of
   source bytes from the first non-space position (start) up to the position
   after the last non-space piece (last) is appended to the snippet
   collection as a single entry with p->seqno and index ordinal ord.

   Nothing is appended when no mapped output was produced (i == 0) or when
   the span is empty. */
static void snippet_add_complete_field(RecWord *p, int ord,
                                       zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;

    const char *b = p->term_buf;
    /* buf receives the mapped characters but is never read in this function;
       only the fill count i influences the result. */
    char buf[IT_MAX_WORD+1];
    const char **map = 0;
    int i = 0, remain = p->term_len;
    const char *start = b;
    const char *last = 0;

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 1);

    /* NOTE(review): if zebra_maps_input can return a non-NULL map whose
       first entry is NULL, neither inner loop advances and this loop would
       not terminate - confirm against the maps implementation. */
    while (remain > 0 && i < IT_MAX_WORD)
    {
        /* skip a run of input that maps to space */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);

            if (i == 0)
                start = b;  /* set to first non-ws area */
            if (remain > 0)
            {
                int first = i ? 0 : 1;  /* first position */

                map = zebra_maps_input(zm, &b, remain, first);
            }
            else
                map = 0;
        }
        if (!map)
            break;

        /* separate this word from the previous one with a single space */
        if (i && i < IT_MAX_WORD)
            buf[i++] = *CHR_SPACE;
        /* consume a run of non-space input */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            if (**map == *CHR_CUT)
            {
                /* cut marker: discard everything collected so far */
                i = 0;
            }
            else
            {
                if (i >= IT_MAX_WORD)
                    break;
                while (i < IT_MAX_WORD && *cp)
                    buf[i++] = *(cp++);
            }
            last = b;   /* source position just after this piece */
            remain = p->term_len  - (b - p->term_buf);
            if (remain > 0)
            {
                map = zebra_maps_input(zm, &b, remain, 0);
            }
            else
                map = 0;
        }
    }
    if (!i)
        return;
    /* the snippet text is the raw source span, not the mapped buf */
    if (last && start != last)
        zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                               start, last - start);
}
207
/* Add snippets for an "incomplete" (tokenised) field.

   The term in p->term_buf / p->term_len is fed through zebra_maps_input().
   Each run of space-mapped input is appended as a snippet with the third
   argument of zebra_snippets_appendn set to 1, and each following run of
   non-space input (a token) is appended with that argument set to 0
   (presumably a whitespace flag - confirm against zebra_snippets_appendn).
   p->seqno is incremented after every token, and once more before the first
   token when zebra_maps_is_first_in_field(zm) is true.

   The function returns early if a token run yields no mapped characters. */
static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
{
    struct snip_rec_info *h = p->extractCtrl->handle;
    const char *b = p->term_buf;
    int remain = p->term_len;
    int first = 1;
    const char **map = 0;
    const char *start = b;   /* start of the source span not yet emitted */
    const char *last = b;    /* end of the source span scanned so far */

    if (remain > 0)
        map = zebra_maps_input(zm, &b, remain, 0);

    while (map)
    {
        /* buf receives the mapped token but is never read here; only the
           count i is tested below. */
        char buf[IT_MAX_WORD+1];
        /* note: this remain shadows the function-level remain above */
        int i, remain;

        /* Skip spaces */
        while (map && *map && **map == *CHR_SPACE)
        {
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        if (!map)
            break;
        /* emit the skipped span, if any */
        if (start != last)
        {
            zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
                                   start, last - start);

        }
        start = last;

        i = 0;
        /* collect one token: a run of input that does not map to space */
        while (map && *map && **map != *CHR_SPACE)
        {
            const char *cp = *map;

            while (i < IT_MAX_WORD && *cp)
                buf[i++] = *(cp++);
            remain = p->term_len - (b - p->term_buf);
            last = b;
            if (remain > 0)
                map = zebra_maps_input(zm, &b, remain, 0);
            else
                map = 0;
        }
        if (!i)
            return;

        if (first)
        {   
            first = 0;
            if (zebra_maps_is_first_in_field(zm))
            {
                /* first in field marker */
                p->seqno++;
            }
        }
        /* emit the token's source span */
        if (start != last)
            zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
                                   start, last - start);
        start = last;
        p->seqno++;
    }

}
280
281 static void snippet_token_add(RecWord *p)
282 {
283     struct snip_rec_info *h = p->extractCtrl->handle;
284     ZebraHandle zh = h->zh;
285     zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, *p->index_type);
286
287     if (zm && zebra_maps_is_index(zm))
288     {
289         ZebraExplainInfo zei = zh->reg->zei;
290         int ch = zebraExplain_lookup_attr_str(
291             zei, zinfo_index_category_index, p->index_type, p->index_name);
292
293         if (zebra_maps_is_complete(zm))
294             snippet_add_complete_field(p, ch, zm);
295         else
296             snippet_add_incomplete_field(p, ch, zm);
297     }
298 }
299
300 static void snippet_schema_add(
301     struct recExtractCtrl *p, Odr_oid *oid)
302 {
303
304 }
305
306 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
307                      struct ZebraRecStream *stream,
308                      RecType rt, void *recTypeClientData)
309 {
310     struct recExtractCtrl extractCtrl;
311     struct snip_rec_info info;
312     int r;
313
314     extractCtrl.stream = stream;
315     extractCtrl.first_record = 1;
316     extractCtrl.init = extract_init;
317     extractCtrl.tokenAdd = snippet_token_add;
318     extractCtrl.schemaAdd = snippet_schema_add;
319     assert(zh->reg);
320     assert(zh->reg->dh);
321
322     extractCtrl.dh = zh->reg->dh;
323     
324     info.zh = zh;
325     info.snippets = sn;
326     extractCtrl.handle = &info;
327     extractCtrl.match_criteria[0] = '\0';
328     extractCtrl.staticrank = 0;
329     extractCtrl.action = action_insert;
330     
331     init_extractCtrl(zh, &extractCtrl);
332
333     extractCtrl.setStoreData = 0;
334
335     r = (*rt->extract)(recTypeClientData, &extractCtrl);
336
337 }
338
339 static void searchRecordKey(ZebraHandle zh,
340                             zebra_rec_keys_t reckeys,
341                             const char *index_name,
342                             const char **ws, int ws_length)
343 {
344     int i;
345     int ch = -1;
346     zinfo_index_category_t cat = zinfo_index_category_index;
347
348     for (i = 0; i<ws_length; i++)
349         ws[i] = NULL;
350
351     if (ch < 0)
352         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
353     if (ch < 0)
354         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
355     if (ch < 0)
356         ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
357
358     if (ch < 0)
359         return ;
360
361     if (zebra_rec_keys_rewind(reckeys))
362     {
363         zint startSeq = -1;
364         const char *str;
365         size_t slen;
366         struct it_key key;
367         zint seqno;
368         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
369         {
370             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
371
372             seqno = key.mem[key.len-1];
373             
374             if (key.mem[0] == ch)
375             {
376                 zint woff;
377                 
378                 if (startSeq == -1)
379                     startSeq = seqno;
380                 woff = seqno - startSeq;
381                 if (woff >= 0 && woff < ws_length)
382                     ws[woff] = str;
383             }
384         }
385     }
386 }
387
388 #define FILE_MATCH_BLANK "\t "
389
390 static char *get_match_from_spec(ZebraHandle zh,
391                           zebra_rec_keys_t reckeys,
392                           const char *fname, const char *spec)
393 {
394     static char dstBuf[2048];      /* static here ??? */
395     char *dst = dstBuf;
396     const char *s = spec;
397
398     while (1)
399     {
400         for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
401             ;
402         if (!*s)
403             break;
404         if (*s == '(')
405         {
406             const char *ws[32];
407             char attset_str[64], attname_str[64];
408             int i;
409             int first = 1;
410             
411             for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
412                 ;
413             for (i = 0; *s && *s != ',' && *s != ')' && 
414                      !strchr(FILE_MATCH_BLANK, *s); s++)
415                 if (i+1 < sizeof(attset_str))
416                     attset_str[i++] = *s;
417             attset_str[i] = '\0';
418             
419             for (; strchr(FILE_MATCH_BLANK, *s); s++)
420                 ;
421             if (*s != ',')
422                 strcpy(attname_str, attset_str);
423             else
424             {
425                 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
426                     ;
427                 for (i = 0; *s && *s != ')' && 
428                          !strchr(FILE_MATCH_BLANK, *s); s++)
429                     if (i+1 < sizeof(attname_str))
430                         attname_str[i++] = *s;
431                 attname_str[i] = '\0';
432             }
433
434             searchRecordKey(zh, reckeys, attname_str, ws, 32);
435
436             if (*s != ')')
437             {
438                 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
439                       spec, zh->m_group ? zh->m_group : "none");
440                 return NULL;
441             }
442             s++;
443
444             for (i = 0; i<32; i++)
445                 if (ws[i])
446                 {
447                     if (first)
448                     {
449                         *dst++ = ' ';
450                         first = 0;
451                     }
452                     strcpy(dst, ws[i]);
453                     dst += strlen(ws[i]);
454                 }
455             if (first)
456             {
457                 yaz_log(YLOG_WARN, "Record didn't contain match"
458                       " fields in (%s,%s)", attset_str, attname_str);
459                 return NULL;
460             }
461         }
462         else if (*s == '$')
463         {
464             int spec_len;
465             char special[64];
466             const char *spec_src = NULL;
467             const char *s1 = ++s;
468             while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
469                 s1++;
470
471             spec_len = s1 - s;
472             if (spec_len > sizeof(special)-1)
473                 spec_len = sizeof(special)-1;
474             memcpy(special, s, spec_len);
475             special[spec_len] = '\0';
476             s = s1;
477
478             if (!strcmp(special, "group"))
479                 spec_src = zh->m_group;
480             else if (!strcmp(special, "database"))
481                 spec_src = zh->basenames[0];
482             else if (!strcmp(special, "filename")) {
483                 spec_src = fname;
484             }
485             else if (!strcmp(special, "type"))
486                 spec_src = zh->m_record_type;
487             else 
488                 spec_src = NULL;
489             if (spec_src)
490             {
491                 strcpy(dst, spec_src);
492                 dst += strlen(spec_src);
493             }
494         }
495         else if (*s == '\"' || *s == '\'')
496         {
497             int stopMarker = *s++;
498             char tmpString[64];
499             int i = 0;
500
501             while (*s && *s != stopMarker)
502             {
503                 if (i+1 < sizeof(tmpString))
504                     tmpString[i++] = *s++;
505             }
506             if (*s)
507                 s++;
508             tmpString[i] = '\0';
509             strcpy(dst, tmpString);
510             dst += strlen(tmpString);
511         }
512         else
513         {
514             yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
515                   spec, zh->m_group ? zh->m_group : "none");
516             return NULL;
517         }
518         *dst++ = 1;
519     }
520     if (dst == dstBuf)
521     {
522         yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
523               fname, zh->m_group ? zh->m_group : "none");
524         return NULL;
525     }
526     *dst = '\0';
527     return dstBuf;
528 }
529
/* Record logging context.
   NOTE(review): no use of this struct is visible in this part of the file;
   the field descriptions below are inferred from the names only. */
struct recordLogInfo {
    const char *fname;            /* presumably the file being processed */
    int recordOffset;             /* presumably the record's offset in fname */
    struct recordGroup *rGroup;   /* presumably the group the record belongs to */
};
535
536 static void all_matches_add(struct recExtractCtrl *ctrl)
537 {
538     RecWord word;
539     extract_init(ctrl, &word);
540     word.index_name = "_ALLRECORDS";
541     word.index_type = "w";
542     word.seqno = 1;
543     extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
544                               "", 0);
545 }
546
547 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
548                                        struct ZebraRecStream *stream,
549                                        enum zebra_recctrl_action_t action,
550                                        int test_mode, 
551                                        const char *recordType,
552                                        zint *sysno,
553                                        const char *match_criteria,
554                                        const char *fname,
555                                        RecType recType,
556                                        void *recTypeClientData);
557
558
559 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, 
560                              int deleteFlag)
561 {
562     ZEBRA_RES r = ZEBRA_OK;
563     int i, fd;
564     char gprefix[128];
565     char ext[128];
566     char ext_res[128];
567     struct file_read_info *fi = 0;
568     const char *original_record_type = 0;
569     RecType recType;
570     void *recTypeClientData;
571     struct ZebraRecStream stream, *streamp;
572
573     zebra_init_log_level();
574
575     if (!zh->m_group || !*zh->m_group)
576         *gprefix = '\0';
577     else
578         sprintf(gprefix, "%s.", zh->m_group);
579     
580     yaz_log(log_level_extract, "zebra_extract_file %s", fname);
581
582     /* determine file extension */
583     *ext = '\0';
584     for (i = strlen(fname); --i >= 0; )
585         if (fname[i] == '/')
586             break;
587         else if (fname[i] == '.')
588         {
589             strcpy(ext, fname+i+1);
590             break;
591         }
592     /* determine file type - depending on extension */
593     original_record_type = zh->m_record_type;
594     if (!zh->m_record_type)
595     {
596         sprintf(ext_res, "%srecordType.%s", gprefix, ext);
597         zh->m_record_type = res_get(zh->res, ext_res);
598     }
599     if (!zh->m_record_type)
600     {
601         check_log_limit(zh);
602         if (zh->records_processed + zh->records_skipped
603             < zh->m_file_verbose_limit)
604             yaz_log(YLOG_LOG, "? %s", fname);
605         zh->records_skipped++;
606         return 0;
607     }
608     /* determine match criteria */
609     if (!zh->m_record_id)
610     {
611         sprintf(ext_res, "%srecordId.%s", gprefix, ext);
612         zh->m_record_id = res_get(zh->res, ext_res);
613     }
614
615     if (!(recType =
616           recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
617                           &recTypeClientData)))
618     {
619         yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
620         return ZEBRA_FAIL;
621     }
622
623     switch(recType->version)
624     {
625     case 0:
626         break;
627     default:
628         yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
629     }
630     if (sysno && deleteFlag)
631     {
632         streamp = 0;
633         fi = 0;
634     }
635     else
636     {
637         char full_rep[1024];
638
639         if (zh->path_reg && !yaz_is_abspath(fname))
640         {
641             strcpy(full_rep, zh->path_reg);
642             strcat(full_rep, "/");
643             strcat(full_rep, fname);
644         }
645         else
646             strcpy(full_rep, fname);
647         
648         if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
649         {
650             yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
651             zh->m_record_type = original_record_type;
652             return ZEBRA_FAIL;
653         }
654         streamp = &stream;
655         zebra_create_stream_fd(streamp, fd, 0);
656     }
657     r = zebra_extract_records_stream(zh, streamp,
658                                      deleteFlag ? 
659                                      action_delete : action_update,
660                                      0, /* tst_mode */
661                                      zh->m_record_type,
662                                      sysno,
663                                      0, /*match_criteria */
664                                      fname,
665                                      recType, recTypeClientData);
666     if (streamp)
667         stream.destroy(streamp);
668     zh->m_record_type = original_record_type;
669     return r;
670 }
671
672 /*
673   If sysno is provided, then it is used to identify the record.
674   If not, and match_criteria is provided, then sysno is guessed from it.
675   If not, and a record is provided, then sysno is obtained from the record.
676   
677  */
678
679 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, 
680                                       const char *buf, size_t buf_size,
681                                       enum zebra_recctrl_action_t action,
682                                       int test_mode, 
683                                       const char *recordType,
684                                       zint *sysno,
685                                       const char *match_criteria,
686                                       const char *fname)
687 {
688     struct ZebraRecStream stream;
689     ZEBRA_RES res;
690     void *clientData;
691     RecType recType = 0;
692
693     if (recordType && *recordType)
694     {
695         yaz_log(log_level_extract,
696                 "Record type explicitly specified: %s", recordType);
697         recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
698                                   &clientData);
699     } 
700     else
701     {
702         if (!(zh->m_record_type))
703         {
704             yaz_log(YLOG_WARN, "No such record type defined");
705             return ZEBRA_FAIL;
706         }
707         yaz_log(log_level_extract, "Get record type from rgroup: %s",
708                 zh->m_record_type);
709         recType = recType_byName(zh->reg->recTypes, zh->res,
710                                   zh->m_record_type, &clientData);
711         recordType = zh->m_record_type;
712     }
713     
714     if (!recType)
715     {
716         yaz_log(YLOG_WARN, "No such record type: %s", recordType);
717         return ZEBRA_FAIL;
718     }
719
720     zebra_create_stream_mem(&stream, buf, buf_size);
721
722     res = zebra_extract_records_stream(zh, &stream,
723                                        action,
724                                        test_mode, 
725                                        recordType,
726                                        sysno,
727                                        match_criteria,
728                                        fname,
729                                        recType, clientData);
730     stream.destroy(&stream);
731     return res;
732 }
733
734 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, 
735                                        struct ZebraRecStream *stream,
736                                        enum zebra_recctrl_action_t action,
737                                        int test_mode, 
738                                        const char *recordType,
739                                        zint *sysno,
740                                        const char *match_criteria,
741                                        const char *fname,
742                                        RecType recType,
743                                        void *recTypeClientData)
744 {
745     ZEBRA_RES res = ZEBRA_OK;
746     while (1)
747     {
748         int more = 0;
749         res = zebra_extract_record_stream(zh, stream,
750                                           action,
751                                           test_mode, 
752                                           recordType,
753                                           sysno,
754                                           match_criteria,
755                                           fname,
756                                           recType, recTypeClientData, &more);
757         if (!more)
758         {
759             res = ZEBRA_OK;
760             break;
761         }
762         if (res != ZEBRA_OK)
763             break;
764         if (sysno)
765             break;
766     }
767     return res;
768 }
769
770
771 static WRBUF wrbuf_hex_str(const char *cstr)
772 {
773     size_t i;
774     WRBUF w = wrbuf_alloc();
775     for (i = 0; cstr[i]; i++)
776     {
777         if (cstr[i] < ' ' || cstr[i] > 126)
778             wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
779         else
780             wrbuf_putc(w, cstr[i]);
781     }
782     return w;
783 }
784
785 ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 
786                                       struct ZebraRecStream *stream,
787                                       enum zebra_recctrl_action_t action,
788                                       int test_mode, 
789                                       const char *recordType,
790                                       zint *sysno,
791                                       const char *match_criteria,
792                                       const char *fname,
793                                       RecType recType,
794                                       void *recTypeClientData,
795                                       int *more)
796
797 {
798     zint sysno0 = 0;
799     RecordAttr *recordAttr;
800     struct recExtractCtrl extractCtrl;
801     int r;
802     const char *matchStr = 0;
803     Record rec;
804     off_t start_offset = 0, end_offset = 0;
805     const char *pr_fname = fname;  /* filename to print .. */
806     int show_progress = zh->records_processed + zh->records_skipped 
807         < zh->m_file_verbose_limit ? 1:0;
808
809     zebra_init_log_level();
810
811     if (!pr_fname)
812         pr_fname = "<no file>";  /* make it printable if file is omitted */
813
814     zebra_rec_keys_reset(zh->reg->keys);
815     zebra_rec_keys_reset(zh->reg->sortKeys);
816
817     if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
818     {
819         if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], 
820                                       zh->m_explain_database))
821             return ZEBRA_FAIL;
822     }
823
824     if (stream)
825     {
826         off_t null_offset = 0;
827         extractCtrl.stream = stream;
828
829         start_offset = stream->tellf(stream);
830
831         extractCtrl.first_record = start_offset ? 0 : 1;
832         
833         stream->endf(stream, &null_offset);;
834
835         extractCtrl.init = extract_init;
836         if (zh->reg->index_types)
837         {
838             extractCtrl.tokenAdd = extract_token_add2;
839         }
840         else
841         {
842             extractCtrl.tokenAdd = extract_token_add;
843         }
844         extractCtrl.schemaAdd = extract_schema_add;
845         extractCtrl.dh = zh->reg->dh;
846         extractCtrl.handle = zh;
847         extractCtrl.match_criteria[0] = '\0';
848         extractCtrl.staticrank = 0;
849         extractCtrl.action = action;
850
851         init_extractCtrl(zh, &extractCtrl);
852
853         extract_set_store_data_prepare(&extractCtrl);
854         
855         r = (*recType->extract)(recTypeClientData, &extractCtrl);
856
857         if (action == action_update)
858         {
859             action = extractCtrl.action;
860         }
861         
862         switch (r)
863         {
864         case RECCTRL_EXTRACT_EOF:
865             return ZEBRA_FAIL;
866         case RECCTRL_EXTRACT_ERROR_GENERIC:
867             /* error occured during extraction ... */
868             yaz_log(YLOG_WARN, "extract error: generic");
869             return ZEBRA_FAIL;
870         case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
871             /* error occured during extraction ... */
872             yaz_log(YLOG_WARN, "extract error: no such filter");
873             return ZEBRA_FAIL;
874         case RECCTRL_EXTRACT_SKIP:
875             if (show_progress)
876                 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
877                          recordType, pr_fname, (zint) start_offset);
878             *more = 1;
879             
880             end_offset = stream->endf(stream, 0);
881             if (end_offset)
882                 stream->seekf(stream, end_offset);
883
884             return ZEBRA_OK;
885         case RECCTRL_EXTRACT_OK:
886             break;
887         default:
888             yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
889             return ZEBRA_FAIL;
890         }
891         end_offset = stream->endf(stream, 0);
892         if (end_offset)
893             stream->seekf(stream, end_offset);
894         else
895             end_offset = stream->tellf(stream);
896
897         all_matches_add(&extractCtrl);
898         
899         if (extractCtrl.match_criteria[0])
900             match_criteria = extractCtrl.match_criteria;
901     }
902
903     *more = 1;
904     if (!sysno)
905     {
906         sysno = &sysno0;
907
908         if (match_criteria && *match_criteria) {
909             matchStr = match_criteria;
910         } else {
911             if (zh->m_record_id && *zh->m_record_id) {
912                 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, 
913                                                zh->m_record_id);
914                 if (!matchStr)
915                 {
916                     yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
917                              pr_fname, (zint) start_offset);
918                     return ZEBRA_FAIL;
919                 }
920             }
921         }
922         if (matchStr) 
923         {
924             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
925             char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
926                                           matchStr);
927
928             
929             if (log_level_extract)
930             {
931                 WRBUF w = wrbuf_hex_str(matchStr);
932                 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
933                 wrbuf_destroy(w);
934             }
935             if (rinfo)
936             {
937                 assert(*rinfo == sizeof(*sysno));
938                 memcpy(sysno, rinfo+1, sizeof(*sysno));
939             }
940        }
941     }
942     if (zebra_rec_keys_empty(zh->reg->keys))
943     {
944         /* the extraction process returned no information - the record
945            is probably empty - unless flagShowRecords is in use */
946         if (test_mode)
947             return ZEBRA_OK;
948     }
949
950     if (! *sysno)
951     {
952         /* new record */
953         if (action == action_delete)
954         {
955             yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
956                          pr_fname, (zint) start_offset);
957             yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
958             return ZEBRA_FAIL;
959         }
960         else if (action == action_replace)
961         {
962             yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
963                          pr_fname, (zint) start_offset);
964             yaz_log(YLOG_WARN, "cannot update record above (seems new)");
965             return ZEBRA_FAIL;
966         }
967         if (show_progress)
968             yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
969                      (zint) start_offset);
970         rec = rec_new(zh->reg->records);
971
972         *sysno = rec->sysno;
973
974         recordAttr = rec_init_attr(zh->reg->zei, rec);
975         if (extractCtrl.staticrank < 0)
976         {
977             yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
978             extractCtrl.staticrank = 0;
979         }
980
981         if (matchStr)
982         {
983             int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
984             dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
985                             sizeof(*sysno), sysno);
986         }
987
988         extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
989 #if FLUSH2
990         extract_flush_record_keys2(zh, *sysno,
991                                    zh->reg->keys, extractCtrl.staticrank,
992                                    0, recordAttr->staticrank);
993 #else
994         extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
995                                   extractCtrl.staticrank);
996 #endif
997         recordAttr->staticrank = extractCtrl.staticrank;
998         zh->records_inserted++;
999     } 
1000     else
1001     {
1002         /* record already exists */
1003         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1004         zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1005         if (action == action_insert)
1006         {
1007             yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, 
1008                          recordType, pr_fname, (zint) start_offset);
1009             logRecord(zh);
1010             return ZEBRA_FAIL;
1011         }
1012
1013         rec = rec_get(zh->reg->records, *sysno);
1014         assert(rec);
1015         
1016         recordAttr = rec_init_attr(zh->reg->zei, rec);
1017
1018         /* decrease total size */
1019         zebraExplain_recordBytesIncrement(zh->reg->zei,
1020                                            - recordAttr->recordSize);
1021
1022         zebra_rec_keys_set_buf(delkeys,
1023                                rec->info[recInfo_delKeys],
1024                                rec->size[recInfo_delKeys],
1025                                0);
1026         zebra_rec_keys_set_buf(sortKeys,
1027                                rec->info[recInfo_sortKeys],
1028                                rec->size[recInfo_sortKeys],
1029                                0);
1030
1031         extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1032 #if !FLUSH2
1033         extract_flush_record_keys(zh, *sysno, 0, delkeys,
1034                                   recordAttr->staticrank);
1035 #endif
1036         if (action == action_delete)
1037         {
1038             /* record going to be deleted */
1039 #if FLUSH2
1040             extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1041                                        delkeys, recordAttr->staticrank);
1042 #endif       
1043             if (zebra_rec_keys_empty(delkeys))
1044             {
1045                 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1046                         pr_fname, (zint) start_offset);
1047                 yaz_log(YLOG_WARN, "cannot delete file above, "
1048                         "storeKeys false (3)");
1049             }
1050             else
1051             {
1052                 if (show_progress)
1053                     yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1054                             pr_fname, (zint) start_offset);
1055                 zh->records_deleted++;
1056                 if (matchStr)
1057                 {
1058                     int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1059                     dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1060                 }
1061                 rec_del(zh->reg->records, &rec);
1062             }
1063             zebra_rec_keys_close(delkeys);
1064             zebra_rec_keys_close(sortKeys);
1065             rec_free(&rec);
1066             logRecord(zh);
1067             return ZEBRA_OK;
1068         }
1069         else
1070         {   /* update or special_update */
1071             if (show_progress)
1072                 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1073                         pr_fname, (zint) start_offset);
1074             extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1075
1076 #if FLUSH2
1077             extract_flush_record_keys2(zh, *sysno,
1078                                        zh->reg->keys, extractCtrl.staticrank,
1079                                        delkeys, recordAttr->staticrank);
1080 #else
1081             extract_flush_record_keys(zh, *sysno, 1, 
1082                                       zh->reg->keys, extractCtrl.staticrank);
1083 #endif
1084             recordAttr->staticrank = extractCtrl.staticrank;
1085             zh->records_updated++;
1086         }
1087         zebra_rec_keys_close(delkeys);
1088         zebra_rec_keys_close(sortKeys);
1089     }
1090     /* update file type */
1091     xfree(rec->info[recInfo_fileType]);
1092     rec->info[recInfo_fileType] =
1093         rec_strdup(recordType, &rec->size[recInfo_fileType]);
1094
1095     /* update filename */
1096     xfree(rec->info[recInfo_filename]);
1097     rec->info[recInfo_filename] =
1098         rec_strdup(fname, &rec->size[recInfo_filename]);
1099
1100     /* update delete keys */
1101     xfree(rec->info[recInfo_delKeys]);
1102     if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1103     {
1104         zebra_rec_keys_get_buf(zh->reg->keys,
1105                                &rec->info[recInfo_delKeys],
1106                                &rec->size[recInfo_delKeys]);
1107     }
1108     else
1109     {
1110         rec->info[recInfo_delKeys] = NULL;
1111         rec->size[recInfo_delKeys] = 0;
1112     }
1113     /* update sort keys */
1114     xfree(rec->info[recInfo_sortKeys]);
1115
1116     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1117                            &rec->info[recInfo_sortKeys],
1118                            &rec->size[recInfo_sortKeys]);
1119
1120     if (stream)
1121     {
1122         recordAttr->recordSize = end_offset - start_offset;
1123         zebraExplain_recordBytesIncrement(zh->reg->zei,
1124                                           recordAttr->recordSize);
1125     }
1126
1127     /* set run-number for this record */
1128     recordAttr->runNumber =
1129         zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1130
1131     /* update store data */
1132     xfree(rec->info[recInfo_storeData]);
1133
1134     /* update store data */
1135     if (zh->store_data_buf)
1136     {
1137         rec->size[recInfo_storeData] = zh->store_data_size;
1138         rec->info[recInfo_storeData] = zh->store_data_buf;
1139         zh->store_data_buf = 0;
1140         recordAttr->recordSize = zh->store_data_size;
1141     }
1142     else if (zh->m_store_data)
1143     {
1144         off_t cur_offset = stream->tellf(stream);
1145
1146         rec->size[recInfo_storeData] = recordAttr->recordSize;
1147         rec->info[recInfo_storeData] = (char *)
1148             xmalloc(recordAttr->recordSize);
1149         stream->seekf(stream, start_offset);
1150         stream->readf(stream, rec->info[recInfo_storeData],
1151                       recordAttr->recordSize);
1152         stream->seekf(stream, cur_offset);
1153     }
1154     else
1155     {
1156         rec->info[recInfo_storeData] = NULL;
1157         rec->size[recInfo_storeData] = 0;
1158     }
1159     /* update database name */
1160     xfree(rec->info[recInfo_databaseName]);
1161     rec->info[recInfo_databaseName] =
1162         rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); 
1163
1164     /* update offset */
1165     recordAttr->recordOffset = start_offset;
1166     
1167     /* commit this record */
1168     rec_put(zh->reg->records, &rec);
1169     logRecord(zh);
1170     return ZEBRA_OK;
1171 }
1172
1173 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1174 {
1175     ZebraHandle zh = (ZebraHandle) handle;
1176     struct recExtractCtrl extractCtrl;
1177
1178     if (zebraExplain_curDatabase(zh->reg->zei,
1179                                   rec->info[recInfo_databaseName]))
1180     {
1181         abort();
1182         if (zebraExplain_newDatabase(zh->reg->zei,
1183                                       rec->info[recInfo_databaseName], 0))
1184             abort();
1185     }
1186
1187     zebra_rec_keys_reset(zh->reg->keys);
1188     zebra_rec_keys_reset(zh->reg->sortKeys);
1189
1190     extractCtrl.init = extract_init;
1191     extractCtrl.tokenAdd = extract_token_add;
1192     extractCtrl.schemaAdd = extract_schema_add;
1193     extractCtrl.dh = zh->reg->dh;
1194
1195     init_extractCtrl(zh, &extractCtrl);
1196
1197     extractCtrl.flagShowRecords = 0;
1198     extractCtrl.match_criteria[0] = '\0';
1199     extractCtrl.staticrank = 0;
1200     extractCtrl.action = action_update;
1201
1202     extractCtrl.handle = handle;
1203     extractCtrl.first_record = 1;
1204     
1205     extract_set_store_data_prepare(&extractCtrl);
1206
1207     if (n)
1208         grs_extract_tree(&extractCtrl, n);
1209
1210     if (rec->size[recInfo_delKeys])
1211     {
1212         zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1213         
1214         zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1215
1216         zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1217                                rec->size[recInfo_delKeys],
1218                                0);
1219 #if FLUSH2
1220         extract_flush_record_keys2(zh, rec->sysno, 
1221                                    zh->reg->keys, 0, delkeys, 0);
1222 #else
1223         extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1224         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1225 #endif
1226         zebra_rec_keys_close(delkeys);
1227
1228         zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1229                                rec->size[recInfo_sortKeys],
1230                                0);
1231
1232         extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1233         zebra_rec_keys_close(sortkeys);
1234     }
1235     else
1236     {
1237 #if FLUSH2
1238         extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1239 #else
1240         extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);        
1241 #endif
1242     }
1243     extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
1244     
1245     xfree(rec->info[recInfo_delKeys]);
1246     zebra_rec_keys_get_buf(zh->reg->keys,
1247                            &rec->info[recInfo_delKeys], 
1248                            &rec->size[recInfo_delKeys]);
1249
1250     xfree(rec->info[recInfo_sortKeys]);
1251     zebra_rec_keys_get_buf(zh->reg->sortKeys,
1252                            &rec->info[recInfo_sortKeys],
1253                            &rec->size[recInfo_sortKeys]);
1254     return ZEBRA_OK;
1255 }
1256
1257 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1258                           zebra_rec_keys_t reckeys,
1259                           int level)
1260 {
1261     if (zebra_rec_keys_rewind(reckeys))
1262     {
1263         size_t slen;
1264         const char *str;
1265         struct it_key key;
1266         NMEM nmem = nmem_create();
1267
1268         while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1269         {
1270             char keystr[200]; /* room for zints to print */
1271             char *dst_term = 0;
1272             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1273             const char *index_type;
1274             int i;
1275             const char *string_index;
1276             
1277             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1278                                     0/* db */, &string_index);
1279             assert(index_type);
1280             zebra_term_untrans_iconv(zh, nmem, *index_type,
1281                                      &dst_term, str);
1282             *keystr = '\0';
1283             for (i = 0; i<key.len; i++)
1284             {
1285                 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key.mem[i]);
1286             }
1287
1288             if (*str < CHR_BASE_CHAR)
1289             {
1290                 int i;
1291                 char dst_buf[200]; /* room for special chars */
1292
1293                 strcpy(dst_buf , "?");
1294
1295                 if (!strcmp(str, ""))
1296                     strcpy(dst_buf, "alwaysmatches");
1297                 if (!strcmp(str, FIRST_IN_FIELD_STR))
1298                     strcpy(dst_buf, "firstinfield");
1299                 else if (!strcmp(str, CHR_UNKNOWN))
1300                     strcpy(dst_buf, "unknown");
1301                 else if (!strcmp(str, CHR_SPACE))
1302                     strcpy(dst_buf, "space");
1303                 
1304                 for (i = 0; i<slen; i++)
1305                 {
1306                     sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1307                 }
1308                 yaz_log(level, "%s%s %s %s", keystr, index_type,
1309                         string_index, dst_buf);
1310                 
1311             }
1312             else
1313                 yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1314                         string_index, dst_term);
1315
1316             nmem_reset(nmem);
1317         }
1318         nmem_destroy(nmem);
1319     }
1320 }
1321
1322 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1323                              zebra_rec_keys_t reckeys)
1324 {
1325     ZebraExplainInfo zei = zh->reg->zei;
1326     struct ord_stat {
1327         int no;
1328         int ord;
1329         struct ord_stat *next;
1330     };
1331
1332     if (zebra_rec_keys_rewind(reckeys))
1333     {
1334         struct ord_stat *ord_list = 0;
1335         struct ord_stat *p;
1336         size_t slen;
1337         const char *str;
1338         struct it_key key_in;
1339         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1340         {
1341             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1342
1343             for (p = ord_list; p ; p = p->next)
1344                 if (p->ord == ord)
1345                 {
1346                     p->no++;
1347                     break;
1348                 }
1349             if (!p)
1350             {
1351                 p = xmalloc(sizeof(*p));
1352                 p->no = 1;
1353                 p->ord = ord;
1354                 p->next = ord_list;
1355                 ord_list = p;
1356             }
1357         }
1358
1359         p = ord_list;
1360         while (p)
1361         {
1362             struct ord_stat *p1 = p;
1363
1364             if (is_insert)
1365                 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1366             else
1367                 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
1368             p = p->next;
1369             xfree(p1);
1370         }
1371     }
1372 }
1373
1374 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1375                                 zebra_rec_keys_t ins_keys, zint ins_rank,
1376                                 zebra_rec_keys_t del_keys, zint del_rank)
1377 {
1378     ZebraExplainInfo zei = zh->reg->zei;
1379     int normal = 0;
1380     int optimized = 0;
1381
1382     if (!zh->reg->key_block)
1383     {
1384         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1385         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1386         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1387         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1388     }
1389
1390     if (ins_keys)
1391     {
1392         extract_rec_keys_adjust(zh, 1, ins_keys);
1393         if (!del_keys)
1394             zebraExplain_recordCountIncrement(zei, 1);
1395         zebra_rec_keys_rewind(ins_keys);
1396     }
1397     if (del_keys)
1398     {
1399         extract_rec_keys_adjust(zh, 0, del_keys);
1400         if (!ins_keys)
1401             zebraExplain_recordCountIncrement(zei, -1);
1402         zebra_rec_keys_rewind(del_keys);
1403     }
1404
1405     while (1)
1406     {
1407         size_t del_slen;
1408         const char *del_str;
1409         struct it_key del_key_in;
1410         int del = 0;
1411
1412         size_t ins_slen;
1413         const char *ins_str;
1414         struct it_key ins_key_in;
1415         int ins = 0;
1416
1417         if (del_keys)
1418             del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1419                                       &del_key_in);
1420         if (ins_keys)
1421             ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
1422                                       &ins_key_in);
1423
1424         if (del && ins && ins_rank == del_rank
1425             && !key_compare(&del_key_in, &ins_key_in) 
1426             && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1427         {
1428             optimized++;
1429             continue;
1430         }
1431         if (!del && !ins)
1432             break;
1433         
1434         normal++;
1435         if (del)
1436             key_block_write(zh->reg->key_block, sysno, 
1437                             &del_key_in, 0, del_str, del_slen,
1438                             del_rank, zh->m_staticrank);
1439         if (ins)
1440             key_block_write(zh->reg->key_block, sysno, 
1441                             &ins_key_in, 1, ins_str, ins_slen,
1442                             ins_rank, zh->m_staticrank);
1443     }
1444     yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
1445 }
1446
1447 void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd,
1448                                zebra_rec_keys_t reckeys,
1449                                zint staticrank)
1450 {
1451     ZebraExplainInfo zei = zh->reg->zei;
1452
1453     extract_rec_keys_adjust(zh, cmd, reckeys);
1454
1455     if (log_level_details)
1456     {
1457         yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s",
1458                 sysno, cmd ? "insert" : "delete");
1459         extract_rec_keys_log(zh, cmd, reckeys, log_level_details);
1460     }
1461
1462     if (!zh->reg->key_block)
1463     {
1464         int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1465         const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1466         int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1467         zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1468     }
1469     zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1);
1470
1471 #if 0
1472     yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd);
1473     print_rec_keys(zh, reckeys);
1474 #endif
1475     if (zebra_rec_keys_rewind(reckeys))
1476     {
1477         size_t slen;
1478         const char *str;
1479         struct it_key key_in;
1480         while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1481         {
1482             key_block_write(zh->reg->key_block, sysno, 
1483                             &key_in, cmd, str, slen,
1484                             staticrank, zh->m_staticrank);
1485         }
1486     }
1487 }
1488
1489 ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh,
1490                                      zebra_rec_keys_t reckeys,
1491                                      zebra_snippets *snippets)
1492 {
1493     NMEM nmem = nmem_create();
1494     if (zebra_rec_keys_rewind(reckeys)) 
1495     {
1496         const char *str;
1497         size_t slen;
1498         struct it_key key;
1499         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1500         {
1501             char *dst_term = 0;
1502             int ord;
1503             zint seqno;
1504             const char *index_type;
1505
1506             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1507             seqno = key.mem[key.len-1];
1508             ord = CAST_ZINT_TO_INT(key.mem[0]);
1509             
1510             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1511                                     0/* db */, 0 /* string_index */);
1512             assert(index_type);
1513             zebra_term_untrans_iconv(zh, nmem, *index_type,
1514                                      &dst_term, str);
1515             zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
1516             nmem_reset(nmem);
1517         }
1518     }
1519     nmem_destroy(nmem);
1520     return ZEBRA_OK;
1521 }
1522
1523 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1524 {
1525     yaz_log(YLOG_LOG, "print_rec_keys");
1526     if (zebra_rec_keys_rewind(reckeys))
1527     {
1528         const char *str;
1529         size_t slen;
1530         struct it_key key;
1531         while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1532         {
1533             char dst_buf[IT_MAX_WORD];
1534             zint seqno;
1535             const char *index_type;
1536             int ord = CAST_ZINT_TO_INT(key.mem[0]);
1537             const char *db = 0;
1538             assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1539
1540             zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1541             
1542             seqno = key.mem[key.len-1];
1543             
1544             zebra_term_untrans(zh, *index_type, dst_buf, str);
1545             
1546             yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT 
1547                     " term=%s", ord, seqno, dst_buf); 
1548         }
1549     }
1550 }
1551
1552 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1553                                      const char *str, int length)
1554 {
1555     struct it_key key;
1556     ZebraHandle zh = p->extractCtrl->handle;
1557     ZebraExplainInfo zei = zh->reg->zei;
1558     int ch, i;
1559
1560     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1561     if (ch < 0)
1562         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1563
1564     i = 0;
1565     key.mem[i++] = ch;
1566     key.mem[i++] = p->record_id;
1567     key.mem[i++] = p->section_id;
1568
1569     if (zh->m_segment_indexing)
1570         key.mem[i++] = p->segment;
1571     key.mem[i++] = p->seqno;
1572     key.len = i;
1573
1574     zebra_rec_keys_write(zh->reg->keys, str, length, &key);
1575 }
1576
1577 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1578 {
1579     struct it_key key;
1580     ZebraHandle zh = p->extractCtrl->handle;
1581     ZebraExplainInfo zei = zh->reg->zei;
1582     int ch;
1583     zinfo_index_category_t cat = zinfo_index_category_sort;
1584
1585     ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1586     if (ch < 0)
1587         ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1588     key.len = 2;
1589     key.mem[0] = ch;
1590     key.mem[1] = p->record_id;
1591
1592     zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
1593 }
1594
1595 static void extract_add_staticrank_string(RecWord *p,
1596                                           const char *str, int length)
1597 {
1598     char valz[40];
1599     struct recExtractCtrl *ctrl = p->extractCtrl;
1600
1601     if (length > sizeof(valz)-1)
1602         length = sizeof(valz)-1;
1603
1604     memcpy(valz, str, length);
1605     valz[length] = '\0';
1606     ctrl->staticrank = atozint(valz);
1607 }
1608
1609 static void extract_add_string(RecWord *p, zebra_map_t zm,
1610                                const char *string, int length)
1611 {
1612     assert(length > 0);
1613
1614     if (!p->index_name)
1615         return;
1616
1617     if (zebra_maps_is_index(zm))
1618     {
1619         extract_add_index_string(p, zinfo_index_category_index,
1620                                  string, length);
1621         if (zebra_maps_is_alwaysmatches(zm))
1622         {
1623             RecWord word;
1624             memcpy(&word, p, sizeof(word));
1625
1626             word.seqno = 1;
1627             extract_add_index_string(
1628                 &word, zinfo_index_category_alwaysmatches, "", 0);
1629         }
1630     }
1631     else if (zebra_maps_is_sort(zm))
1632     {
1633         extract_add_sort_string(p, string, length);
1634     }
1635     else if (zebra_maps_is_staticrank(zm))
1636     {
1637         extract_add_staticrank_string(p, string, length);
1638     }
1639 }
1640
1641 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
1642 {
1643     const char *b = p->term_buf;
1644     int remain = p->term_len;
1645     int first = 1;
1646     const char **map = 0;
1647     
1648     if (remain > 0)
1649         map = zebra_maps_input(zm, &b, remain, 0);
1650
1651     while (map)
1652     {
1653         char buf[IT_MAX_WORD+1];
1654         int i, remain;
1655
1656         /* Skip spaces */
1657         while (map && *map && **map == *CHR_SPACE)
1658         {
1659             remain = p->term_len - (b - p->term_buf);
1660             if (remain > 0)
1661                 map = zebra_maps_input(zm, &b, remain, 0);
1662             else
1663                 map = 0;
1664         }
1665         if (!map)
1666             break;
1667         i = 0;
1668         while (map && *map && **map != *CHR_SPACE)
1669         {
1670             const char *cp = *map;
1671
1672             while (i < IT_MAX_WORD && *cp)
1673                 buf[i++] = *(cp++);
1674             remain = p->term_len - (b - p->term_buf);
1675             if (remain > 0)
1676                 map = zebra_maps_input(zm, &b, remain, 0);
1677             else
1678                 map = 0;
1679         }
1680         if (!i)
1681             return;
1682
1683         if (first)
1684         {   
1685             first = 0;
1686             if (zebra_maps_is_first_in_field(zm))
1687             {
1688                 /* first in field marker */
1689                 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1690                 p->seqno++;
1691             }
1692         }
1693         extract_add_string(p, zm, buf, i);
1694         p->seqno++;
1695     }
1696 }
1697
1698 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
1699 {
1700     const char *b = p->term_buf;
1701     char buf[IT_MAX_WORD+1];
1702     const char **map = 0;
1703     int i = 0, remain = p->term_len;
1704
1705     if (remain > 0)
1706         map = zebra_maps_input(zm, &b, remain, 1);
1707
1708     while (remain > 0 && i < IT_MAX_WORD)
1709     {
1710         while (map && *map && **map == *CHR_SPACE)
1711         {
1712             remain = p->term_len - (b - p->term_buf);
1713
1714             if (remain > 0)
1715             {
1716                 int first = i ? 0 : 1;  /* first position */
1717                 map = zebra_maps_input(zm, &b, remain, first);
1718             }
1719             else
1720                 map = 0;
1721         }
1722         if (!map)
1723             break;
1724
1725         if (i && i < IT_MAX_WORD)
1726             buf[i++] = *CHR_SPACE;
1727         while (map && *map && **map != *CHR_SPACE)
1728         {
1729             const char *cp = *map;
1730
1731             if (**map == *CHR_CUT)
1732             {
1733                 i = 0;
1734             }
1735             else
1736             {
1737                 if (i >= IT_MAX_WORD)
1738                     break;
1739                 while (i < IT_MAX_WORD && *cp)
1740                     buf[i++] = *(cp++);
1741             }
1742             remain = p->term_len  - (b - p->term_buf);
1743             if (remain > 0)
1744             {
1745                 map = zebra_maps_input(zm, &b, remain, 0);
1746             }
1747             else
1748                 map = 0;
1749         }
1750     }
1751     if (!i)
1752         return;
1753     extract_add_string(p, zm, buf, i);
1754 }
1755
1756 static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type,
1757                                      RecWord *p)
1758 {
1759     struct it_key key;
1760     const char *res_buf = 0;
1761     size_t res_len = 0;
1762     int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len,
1763                                       &res_buf, &res_len);
1764     int cat = zinfo_index_category_index;
1765     int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1766     if (ch < 0)
1767         ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name);
1768     while (r)
1769     {
1770         int i = 0;
1771         key.mem[i++] = ch;
1772         key.mem[i++] = p->record_id;
1773         key.mem[i++] = p->section_id;
1774         
1775         if (zh->m_segment_indexing)
1776             key.mem[i++] = p->segment;
1777         key.mem[i++] = p->seqno;
1778         key.len = i;
1779
1780         yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf);
1781         zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
1782         
1783         p->seqno++;
1784         r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len);
1785     }
1786 }
1787
1788 static void extract_token_add2(RecWord *p)
1789 {
1790     ZebraHandle zh = p->extractCtrl->handle;
1791     zebra_index_type_t type = zebra_index_type_get(zh->reg->index_types, p->index_type);
1792     if (type)
1793     {
1794         if (zebra_index_type_is_index(type))
1795         {
1796             extract_token_add2_index(zh, type, p);
1797         }
1798         else if (zebra_index_type_is_sort(type))
1799         {
1800             ;
1801             
1802         }
1803     }
1804 }
1805
1806 /** \brief top-level indexing handler for recctrl system
1807     \param p token data to be indexed
1808
1809     Call sequence:
1810     extract_token
1811     zebra_add_{in}_complete
1812     extract_add_string
1813     
1814     extract_add_index_string
1815     or
1816     extract_add_sort_string
1817     or
1818     extract_add_staticrank_string
1819     
1820 */
1821 static void extract_token_add(RecWord *p)
1822 {
1823     ZebraHandle zh = p->extractCtrl->handle;
1824     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, *p->index_type);
1825     WRBUF wrbuf;
1826
1827     if (log_level_details)
1828     {
1829         yaz_log(log_level_details, "extract_token_add "
1830                 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1831                 p->index_type, p->index_name, 
1832                 p->seqno, p->term_len, p->term_buf);
1833     }
1834     if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1835     {
1836         p->term_buf = wrbuf_buf(wrbuf);
1837         p->term_len = wrbuf_len(wrbuf);
1838     }
1839     if (zebra_maps_is_complete(zm))
1840         extract_add_complete_field(p, zm);
1841     else
1842         extract_add_incomplete_field(p, zm);
1843 }
1844
1845 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1846                                       void *buf, size_t sz)
1847 {
1848     ZebraHandle zh = (ZebraHandle) p->handle;
1849
1850     xfree(zh->store_data_buf);
1851     zh->store_data_buf = 0;
1852     zh->store_data_size = 0;
1853     if (buf && sz)
1854     {
1855         zh->store_data_buf = xmalloc(sz);
1856         zh->store_data_size = sz;
1857         memcpy(zh->store_data_buf, buf, sz);
1858     }
1859 }
1860
1861 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1862 {
1863     ZebraHandle zh = (ZebraHandle) p->handle;
1864     xfree(zh->store_data_buf);
1865     zh->store_data_buf = 0;
1866     zh->store_data_size = 0;
1867     p->setStoreData = extract_set_store_data_cb;
1868 }
1869
1870 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1871 {
1872     ZebraHandle zh = (ZebraHandle) p->handle;
1873     zebraExplain_addSchema(zh->reg->zei, oid);
1874 }
1875
1876 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1877                              int cmd, zebra_rec_keys_t reckeys)
1878 {
1879 #if 0
1880     yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1881             cmd, sysno);
1882     extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
1883 #endif
1884
1885     if (zebra_rec_keys_rewind(reckeys))
1886     {
1887         zebra_sort_index_t si = zh->reg->sort_index;
1888         size_t slen;
1889         const char *str;
1890         struct it_key key_in;
1891
1892         zebra_sort_sysno(si, sysno);
1893
1894         while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1895         {
1896             int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1897             
1898             zebra_sort_type(si, ord);
1899             if (cmd == 1)
1900                 zebra_sort_add(si, str, slen);
1901             else
1902                 zebra_sort_delete(si);
1903         }
1904     }
1905 }
1906
1907 /*
1908  * Local variables:
1909  * c-basic-offset: 4
1910  * indent-tabs-mode: nil
1911  * End:
1912  * vim: shiftwidth=4 tabstop=8 expandtab
1913  */
1914