Using NMEM rather than ODR for memory stuff.
[idzebra-moved-to-github.git] / index / mod_dom.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1995-2008 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #include <stdio.h>
21 #include <assert.h>
22 #include <ctype.h>
23 #include <stdarg.h>
24
25 #include <yaz/diagbib1.h>
26 #include <yaz/tpath.h>
27 #include <yaz/snprintf.h>
28
29 #include <libxml/xmlversion.h>
30 #include <libxml/parser.h>
31 #include <libxml/tree.h>
32 #include <libxml/xmlIO.h>
33 #include <libxml/xmlreader.h>
34 #include <libxslt/transform.h>
35 #include <libxslt/xsltutils.h>
36
37 #if YAZ_HAVE_EXSLT
38 #include <libexslt/exslt.h>
39 #endif
40
41 #include <idzebra/util.h>
42 #include <idzebra/recctrl.h>
43 #include <yaz/oid_db.h>
44
/* Namespace URI that marks Zebra indexing elements (<index>, <record>)
   in documents processed by the DOM filter */
#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
static const char *zebra_dom_ns = ZEBRA_DOM_NS;

/* Target name of the XML processing instructions the DOM filter reacts to */
#define ZEBRA_PI_NAME "zebra-2.0"
static const char *zebra_pi_name = ZEBRA_PI_NAME;
52
53
54
/* One step of a conversion chain, built from an <xslt> config element */
struct convert_s {
    const char *stylesheet;           /* value of the @stylesheet attribute */
    xsltStylesheetPtr stylesheet_xsp; /* result of xsltParseStylesheetFile */
    struct convert_s *next;           /* following step, 0 at end of chain */
};
60
/* Settings taken from the <extract> config element */
struct filter_extract {
    const char *name;          /* value of @name, 0 if absent */
    struct convert_s *convert; /* chain parsed from the child <xslt> elements */
};
65
/* Settings taken from the <store> config element */
struct filter_store {
    struct convert_s *convert; /* chain parsed from the child <xslt> elements */
};
69
/* Settings taken from one <retrieve> config element; kept in a list in
   the order the elements appear in the config file */
struct filter_retrieve {
    const char *name;             /* value of @name, 0 if absent */
    const char *identifier;       /* value of @identifier, 0 if absent */
    struct convert_s *convert;    /* chain parsed from the child <xslt> elements */
    struct filter_retrieve *next; /* next <retrieve>, 0 at end of list */
};
76
/* Values for filter_input.type; they select the active member of
   filter_input.u */
#define DOM_INPUT_XMLREADER 1
#define DOM_INPUT_MARC 2
/* One input source description, created for a <marc> or <xmlreader>
   element inside <input>, or as a default when no <input> is configured */
struct filter_input {
    const char *syntax;        /* initialised to 0 by new_input */
    const char *name;          /* initialised to 0 by new_input */
    struct convert_s *convert; /* chain parsed from following <xslt> siblings */
    int type;                  /* DOM_INPUT_XMLREADER or DOM_INPUT_MARC */
    union {
        struct {
            xmlTextReaderPtr reader; /* freed with xmlFreeTextReader if set */
            int split_level;         /* value of @level, 0 if absent */
        } xmlreader;
        struct {
            const char *input_charset;
            yaz_marc_t handle;       /* from yaz_marc_create */
            yaz_iconv_t iconv;       /* converts the input charset to utf-8 */
        } marc;
    } u;
    struct filter_input *next; /* next input, 0 at end of list */
};
97   
/* Per-filter state, allocated by filter_init and released by filter_destroy */
struct filter_info {
    char *fname;              /* config file name as given to filter_config */
    char *full_name;          /* fname resolved against profile_path */
    const char *profile_path; /* value of the "profilePath" resource */
    NMEM nmem_record;         /* NOTE(review): presumably per-record storage;
                                 not used in the visible part of the file */
    NMEM nmem_config;         /* owns all config structs; reset by destroy_dom */
    xmlDocPtr doc_config;     /* parsed config file; kept because the config
                                 structs point into its attribute content */
    struct filter_extract *extract;        /* from <extract>, 0 if absent */
    struct filter_retrieve *retrieve_list; /* from <retrieve> elements */
    struct filter_input *input_list;       /* from <input> elements */
    struct filter_store *store;            /* from <store>, 0 if absent */
    int record_info_invoked;  /* incremented by each set_record_info call */
};
111
112
113
/* strcmp/strlen wrappers for libxml2 strings (xmlChar is unsigned char).
   Arguments are parenthesized so that arbitrary expressions may be passed. */
#define XML_STRCMP(a,b)   strcmp((char*)(a), (b))
#define XML_STRLEN(a) strlen((char*)(a))


/* Iterate over ptr and its following siblings, running the statement that
   follows only for element nodes. ptr itself is advanced, and the macro
   expands to "for (...) if (...)", so the body must be a braced block and
   must not be followed directly by an "else". */
#define FOR_EACH_ELEMENT(ptr) for (; (ptr); (ptr) = (ptr)->next) if ((ptr)->type == XML_ELEMENT_NODE)
119
120 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
121                     const char *fmt, ...)
122 #ifdef __GNUC__
123     __attribute__ ((format (printf, 4, 5)))
124 #endif
125     ;
126
127 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
128                     const char *fmt, ...)
129 {
130     va_list ap;
131     char buf[4096];
132
133     va_start(ap, fmt);
134     yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
135     if (ptr)
136     {
137         yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none", 
138                 xmlGetLineNo(ptr), buf);
139     }
140     else
141     {
142         yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
143     }
144     va_end(ap);
145 }
146
147
148 static void set_param_str(const char **params, const char *name,
149                           const char *value, NMEM nmem)
150 {
151     char *quoted = nmem_malloc(nmem, 3 + strlen(value));
152     sprintf(quoted, "'%s'", value);
153     while (*params)
154         params++;
155     params[0] = name;
156     params[1] = quoted;
157     params[2] = 0;
158 }
159
160 static void set_param_int(const char **params, const char *name,
161                           zint value, NMEM nmem)
162 {
163     char *quoted = nmem_malloc(nmem, 30); /* 25 digits enough for 2^64 */
164     while (*params)
165         params++;
166     sprintf(quoted, "'" ZINT_FORMAT "'", value);
167     params[0] = name;
168     params[1] = quoted;
169     params[2] = 0;
170 }
171
172 static void *filter_init(Res res, RecType recType)
173 {
174     struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
175     tinfo->fname = 0;
176     tinfo->full_name = 0;
177     tinfo->profile_path = 0;
178     tinfo->nmem_record = nmem_create();
179     tinfo->nmem_config = nmem_create();
180     tinfo->extract = 0;
181     tinfo->retrieve_list = 0;
182     tinfo->input_list = 0;
183     tinfo->store = 0;
184     tinfo->doc_config = 0;
185     tinfo->record_info_invoked = 0;
186
187 #if YAZ_HAVE_EXSLT
188     exsltRegisterAll(); 
189 #endif
190
191     return tinfo;
192 }
193
194 static int attr_content(struct _xmlAttr *attr, const char *name,
195                         const char **dst_content)
196 {
197     if (!XML_STRCMP(attr->name, name) && attr->children 
198         && attr->children->type == XML_TEXT_NODE)
199     {
200         *dst_content = (const char *)(attr->children->content);
201         return 1;
202     }
203     return 0;
204 }
205
206 static void destroy_xsp(struct convert_s *c)
207 {
208     while(c)
209     {
210         if (c->stylesheet_xsp)
211             xsltFreeStylesheet(c->stylesheet_xsp);
212         c = c->next;
213     }
214 }
215
216 static void destroy_dom(struct filter_info *tinfo)
217 {
218     if (tinfo->extract)
219     {
220         destroy_xsp(tinfo->extract->convert);
221         tinfo->extract = 0;
222     }
223     if (tinfo->store)
224     {
225         destroy_xsp(tinfo->store->convert);
226         tinfo->store = 0;
227     }
228     if (tinfo->input_list)
229     {
230         struct filter_input *i_ptr;
231         for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
232         {
233             switch(i_ptr->type)
234             {
235             case DOM_INPUT_XMLREADER:
236                 if (i_ptr->u.xmlreader.reader)
237                     xmlFreeTextReader(i_ptr->u.xmlreader.reader);
238                 break;
239             case DOM_INPUT_MARC:
240                 yaz_iconv_close(i_ptr->u.marc.iconv);
241                 yaz_marc_destroy(i_ptr->u.marc.handle);
242                 break;
243             }
244             destroy_xsp(i_ptr->convert);
245         }
246         tinfo->input_list = 0;
247     }
248     if (tinfo->retrieve_list)
249     {
250         struct filter_retrieve *r_ptr;
251         for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
252             destroy_xsp(r_ptr->convert);
253         tinfo->retrieve_list = 0;
254     }
255
256     if (tinfo->doc_config)
257     {
258         xmlFreeDoc(tinfo->doc_config);
259         tinfo->doc_config = 0;
260     }
261     nmem_reset(tinfo->nmem_config);
262 }
263
264 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
265                                struct convert_s **l)
266 {
267     *l = 0;
268     FOR_EACH_ELEMENT(ptr) {
269         if (!XML_STRCMP(ptr->name, "xslt"))
270         {
271             struct _xmlAttr *attr;
272             struct convert_s *p 
273                 = nmem_malloc(tinfo->nmem_config, sizeof(*p));
274             
275             p->next = 0;
276             p->stylesheet = 0;
277             p->stylesheet_xsp = 0;
278             
279             for (attr = ptr->properties; attr; attr = attr->next)
280                 if (attr_content(attr, "stylesheet", &p->stylesheet))
281                     ;
282                 else
283                 {
284                     dom_log(YLOG_WARN, tinfo, ptr,
285                             "bad attribute @%s", attr->name);
286                 }
287             if (p->stylesheet)
288             {
289                 char tmp_xslt_full_name[1024];
290                 if (!yaz_filepath_resolve(p->stylesheet, 
291                                           tinfo->profile_path,
292                                           NULL, 
293                                           tmp_xslt_full_name))
294                 {
295                     dom_log(YLOG_WARN, tinfo, 0,
296                             "stylesheet %s not found in "
297                             "path %s",
298                             p->stylesheet, 
299                             tinfo->profile_path);
300                     return ZEBRA_FAIL;
301                 }
302                 
303                 p->stylesheet_xsp
304                     = xsltParseStylesheetFile((const xmlChar*) 
305                                               tmp_xslt_full_name);
306                 if (!p->stylesheet_xsp)
307                 {
308                     dom_log(YLOG_WARN, tinfo, 0,
309                             "could not parse xslt stylesheet %s",
310                             tmp_xslt_full_name);
311                     return ZEBRA_FAIL;
312                 }
313                 }
314                 else
315                 {
316                     dom_log(YLOG_WARN, tinfo, ptr,
317                             "missing attribute 'stylesheet' ");
318                     return ZEBRA_FAIL;
319                 }
320                 *l = p;
321                 l = &p->next;
322         }
323         else
324         {
325             dom_log(YLOG_WARN, tinfo, ptr,
326                     "bad element '%s', expected <xslt>", ptr->name);
327             return ZEBRA_FAIL;
328         }
329     }
330     return ZEBRA_OK;
331 }
332
333 static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
334                                  struct recExtractCtrl *extctr,
335                                  struct convert_s *convert,
336                                  const char **params,
337                                  xmlDocPtr *doc,
338                                  xsltStylesheetPtr *last_xsp)
339 {
340     for (; convert; convert = convert->next)
341     {
342         xmlChar *buf_out = 0;
343         int len_out = 0;
344         xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
345                                                 *doc, params);
346         if (last_xsp)
347             *last_xsp = convert->stylesheet_xsp;
348         
349         if (!res_doc)
350             break;
351
352         /* now saving into buffer and re-reading into DOM to avoid annoing
353            XSLT problem with thrown-out indentation text nodes */
354         xsltSaveResultToString(&buf_out, &len_out, res_doc,
355                                convert->stylesheet_xsp); 
356         xmlFreeDoc(res_doc);
357
358         xmlFreeDoc(*doc);
359
360         *doc = xmlParseMemory((const char *) buf_out, len_out);
361
362         /* writing debug info out */
363         if (extctr && extctr->flagShowRecords)
364             yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s", 
365                     tinfo->fname ? tinfo->fname : "(none)", 
366                     convert->stylesheet,
367                     len_out, buf_out);
368         
369         xmlFree(buf_out);
370     }
371     return ZEBRA_OK;
372 }
373
374 static struct filter_input *new_input(struct filter_info *tinfo, int type)
375 {
376     struct filter_input *p;
377     struct filter_input **np = &tinfo->input_list;
378     for (;*np; np = &(*np)->next)
379         ;
380     p = *np = nmem_malloc(tinfo->nmem_config, sizeof(*p));
381     p->next = 0;
382     p->syntax = 0;
383     p->name = 0;
384     p->convert = 0;
385     p->type = type;
386     return p;
387 }
388
389 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
390                              const char *syntax, const char *name)
391 {
392     FOR_EACH_ELEMENT(ptr) {
393         if (!XML_STRCMP(ptr->name, "marc"))
394         {
395             yaz_iconv_t iconv = 0;
396             const char *input_charset = "marc-8";
397             struct _xmlAttr *attr;
398             
399             for (attr = ptr->properties; attr; attr = attr->next)
400             {
401                 if (attr_content(attr, "inputcharset", &input_charset))
402                     ;
403                 else
404                 {
405                     dom_log(YLOG_WARN, tinfo, ptr,
406                             "bad attribute @%s, expected @inputcharset",
407                             attr->name);
408                 }
409             }
410             iconv = yaz_iconv_open("utf-8", input_charset);
411             if (!iconv)
412             {
413                 dom_log(YLOG_WARN, tinfo, ptr, 
414                         "unsupported @charset '%s'", input_charset);
415                 return ZEBRA_FAIL;
416             }
417             else
418             {
419                 struct filter_input *p 
420                     = new_input(tinfo, DOM_INPUT_MARC);
421                 p->u.marc.handle = yaz_marc_create();
422                 p->u.marc.iconv = iconv;
423                 
424                 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
425                 
426                 ptr = ptr->next;
427                 
428                 parse_convert(tinfo, ptr, &p->convert);
429             }
430             break;
431
432         }
433         else if (!XML_STRCMP(ptr->name, "xmlreader"))
434         {
435             struct filter_input *p 
436                 = new_input(tinfo, DOM_INPUT_XMLREADER);
437             struct _xmlAttr *attr;
438             const char *level_str = 0;
439
440             p->u.xmlreader.split_level = 0;
441             p->u.xmlreader.reader = 0;
442
443             for (attr = ptr->properties; attr; attr = attr->next)
444             {
445                 if (attr_content(attr, "level", &level_str))
446                     ;
447                 else
448                 {
449                     dom_log(YLOG_WARN, tinfo, ptr,
450                             "bad attribute @%s, expected @level",
451                             attr->name);
452                 }
453             }
454             if (level_str)
455                 p->u.xmlreader.split_level = atoi(level_str);
456                 
457             ptr = ptr->next;
458
459             parse_convert(tinfo, ptr, &p->convert);
460             break;
461         }
462         else
463         {
464             dom_log(YLOG_WARN, tinfo, ptr,
465                     "bad element <%s>, expected <marc>|<xmlreader>",
466                     ptr->name);
467             return ZEBRA_FAIL;
468         }
469     }
470     return ZEBRA_OK;
471 }
472
473 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
474 {
475     char tmp_full_name[1024];
476     xmlNodePtr ptr;
477     xmlDocPtr doc;
478
479     tinfo->fname = nmem_strdup(tinfo->nmem_config, fname);
480     
481     if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path, 
482                              NULL, tmp_full_name))
483         tinfo->full_name = nmem_strdup(tinfo->nmem_config, tmp_full_name);
484     else
485         tinfo->full_name = nmem_strdup(tinfo->nmem_config, tinfo->fname);
486     
487     yaz_log(YLOG_LOG, "%s dom filter: "
488             "loading config file %s", tinfo->fname, tinfo->full_name);
489
490     doc = xmlParseFile(tinfo->full_name);
491     if (!doc)
492     {
493         yaz_log(YLOG_WARN, "%s: dom filter: "
494                 "failed to parse config file %s",
495                 tinfo->fname, tinfo->full_name);
496         return ZEBRA_FAIL;
497     }
498     /* save because we store ptrs to the content */ 
499     tinfo->doc_config = doc;
500     
501     ptr = xmlDocGetRootElement(doc);
502     if (!ptr || ptr->type != XML_ELEMENT_NODE 
503         || XML_STRCMP(ptr->name, "dom"))
504     {
505         dom_log(YLOG_WARN, tinfo, ptr,
506                 "bad root element <%s>, expected root element <dom>", 
507                 ptr->name);  
508         return ZEBRA_FAIL;
509     }
510
511     ptr = ptr->children;
512     FOR_EACH_ELEMENT(ptr) {
513         if (!XML_STRCMP(ptr->name, "extract"))
514         {
515             /*
516               <extract name="index">
517               <xslt stylesheet="first.xsl"/>
518               <xslt stylesheet="second.xsl"/>
519               </extract>
520             */
521             struct _xmlAttr *attr;
522             struct filter_extract *f =
523                 nmem_malloc(tinfo->nmem_config, sizeof(*f));
524             
525             tinfo->extract = f;
526             f->name = 0;
527             f->convert = 0;
528             for (attr = ptr->properties; attr; attr = attr->next)
529             {
530                 if (attr_content(attr, "name", &f->name))
531                     ;
532                 else
533                 {
534                     dom_log(YLOG_WARN, tinfo, ptr,
535                             "bad attribute @%s, expected @name",
536                             attr->name);
537                 }
538             }
539             parse_convert(tinfo, ptr->children, &f->convert);
540         }
541         else if (!XML_STRCMP(ptr->name, "retrieve"))
542         {  
543             /* 
544                <retrieve name="F">
545                <xslt stylesheet="some.xsl"/>
546                <xslt stylesheet="some.xsl"/>
547                </retrieve>
548             */
549             struct _xmlAttr *attr;
550             struct filter_retrieve **fp = &tinfo->retrieve_list;
551             struct filter_retrieve *f =
552                 nmem_malloc(tinfo->nmem_config, sizeof(*f));
553             
554             while (*fp)
555                 fp = &(*fp)->next;
556
557             *fp = f;
558             f->name = 0;
559             f->identifier = 0;
560             f->convert = 0;
561             f->next = 0;
562
563             for (attr = ptr->properties; attr; attr = attr->next)
564             {
565                 if (attr_content(attr, "identifier", 
566                                  &f->identifier))
567                     ;
568                 else if (attr_content(attr, "name", &f->name))
569                     ;
570                 else
571                 {
572                     dom_log(YLOG_WARN, tinfo, ptr,
573                             "bad attribute @%s,  expected @identifier|@name",
574                             attr->name);
575                 }
576             }
577             parse_convert(tinfo, ptr->children, &f->convert);
578         }
579         else if (!XML_STRCMP(ptr->name, "store"))
580         {
581             /*
582               <store name="F">
583               <xslt stylesheet="some.xsl"/>
584               <xslt stylesheet="some.xsl"/>
585               </retrieve>
586             */
587             struct filter_store *f =
588                 nmem_malloc(tinfo->nmem_config, sizeof(*f));
589             
590             tinfo->store = f;
591             f->convert = 0;
592             parse_convert(tinfo, ptr->children, &f->convert);
593         }
594         else if (!XML_STRCMP(ptr->name, "input"))
595         {
596             /*
597               <input syntax="xml">
598               <xmlreader level="1"/>
599               </input>
600               <input syntax="usmarc">
601               <marc inputcharset="marc-8"/>
602               </input>
603             */
604             struct _xmlAttr *attr;
605             const char  *syntax = 0;
606             const char *name = 0;
607             for (attr = ptr->properties; attr; attr = attr->next)
608             {
609                 if (attr_content(attr, "syntax", &syntax))
610                     ;
611                 else if (attr_content(attr, "name", &name))
612                     ;
613                 else
614                 {
615                     dom_log(YLOG_WARN, tinfo, ptr,
616                             "bad attribute @%s,  expected @syntax|@name",
617                             attr->name);
618                 }
619             }
620             parse_input(tinfo, ptr->children, syntax, name);
621         }
622         else
623         {
624             dom_log(YLOG_WARN, tinfo, ptr,
625                     "bad element <%s>, "
626                     "expected <extract>|<input>|<retrieve>|<store>",
627                     ptr->name);
628             return ZEBRA_FAIL;
629         }
630     }
631     if (!tinfo->input_list)
632     {
633         struct filter_input *p 
634             = new_input(tinfo, DOM_INPUT_XMLREADER);
635         p->u.xmlreader.split_level = 0;
636         p->u.xmlreader.reader = 0;
637     }
638     return ZEBRA_OK;
639 }
640
641 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
642                                                const char *est)
643 {
644     struct filter_retrieve *f = tinfo->retrieve_list;
645
646     /* return first schema if no est is provided */
647     if (!est)
648         return f;
649     for (; f; f = f->next)
650     { 
651         /* find requested schema */
652         if (est) 
653         {    
654             if (f->identifier && !strcmp(f->identifier, est))
655                 return f;
656             if (f->name && !strcmp(f->name, est))
657                 return f;
658         } 
659     }
660     return 0;
661 }
662
663 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
664 {
665     struct filter_info *tinfo = clientData;
666     if (!args || !*args)
667     {
668         yaz_log(YLOG_WARN, "dom filter: need config file");
669         return ZEBRA_FAIL;
670     }
671
672     if (tinfo->fname && !strcmp(args, tinfo->fname))
673         return ZEBRA_OK;
674     
675     tinfo->profile_path = res_get(res, "profilePath");
676
677     destroy_dom(tinfo);
678     return parse_dom(tinfo, args);
679 }
680
681 static void filter_destroy(void *clientData)
682 {
683     struct filter_info *tinfo = clientData;
684     destroy_dom(tinfo);
685     nmem_destroy(tinfo->nmem_config);
686     nmem_destroy(tinfo->nmem_record);
687     xfree(tinfo);
688 }
689
690 static int ioread_ex(void *context, char *buffer, int len)
691 {
692     struct recExtractCtrl *p = context;
693     return p->stream->readf(p->stream, buffer, len);
694 }
695
/* Close callback counterpart of ioread_ex. It does nothing with the
   context and always returns 0. */
static int ioclose_ex(void *context)
{
    (void) context;
    return 0;
}
700
701
702 /* DOM filter style indexing */
703 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
704                             const char **dst_content)
705 {
706     if (0 == XML_STRCMP(attr->name, name) && attr->children 
707         && attr->children->type == XML_TEXT_NODE)
708     {
709         *dst_content = (const char *) (attr->children->content);
710         return 1;
711     }
712     return 0;
713 }
714
715
716 /* DOM filter style indexing */
717 static void index_value_of(struct filter_info *tinfo, 
718                            struct recExtractCtrl *extctr,
719                            RecWord* recword, 
720                            xmlNodePtr node, 
721                            const char *index_p)
722 {
723     if (tinfo->record_info_invoked == 1)
724     {
725         xmlChar *text = xmlNodeGetContent(node);
726         size_t text_len = strlen((const char *)text);
727        
728         /* if there is no text, we do not need to proceed */
729         if (text_len)
730         {            
731             const char *look = index_p;
732             const char *bval;
733             const char *eval;
734
735             xmlChar index[256];
736             xmlChar type[256];
737
738             /* assingning text to be indexed */
739             recword->term_buf = (const char *)text;
740             recword->term_len = text_len;
741
742             /* parsing all index name/type pairs */
743             /* may not start with ' ' or ':' */
744             while (*look && ' ' != *look && ':' != *look)
745             {
746                 /* setting name and type to zero */
747                 *index = '\0';
748                 *type = '\0';
749     
750                 /* parsing one index name */
751                 bval = look;
752                 while (*look && ':' != *look && ' ' != *look)
753                 {
754                     look++;
755                 }
756                 eval = look;
757                 strncpy((char *)index, (const char *)bval, eval - bval);
758                 index[eval - bval] = '\0';
759     
760     
761                 /* parsing one index type, if existing */
762                 if (':' == *look)
763                 {
764                     look++;
765       
766                     bval = look;
767                     while (*look && ' ' != *look)
768                     {
769                         look++;
770                     }
771                     eval = look;
772                     strncpy((char *)type, (const char *)bval, eval - bval);
773                     type[eval - bval] = '\0';
774                 }
775
776                 /* actually indexing the text given */
777
778                 recword->index_name = (const char *)index;
779                 if (*type)
780                     recword->index_type = (const char *) type;
781
782                 /* writing debug out */
783                 if (extctr->flagShowRecords)
784                     dom_log(YLOG_LOG, tinfo, 0, 
785                             "INDEX '%s:%s' '%s'", 
786                             (const char *) index,
787                             (const char *) type, 
788                             (const char *) text);
789                 
790                 (extctr->tokenAdd)(recword);
791
792                 /* eat whitespaces */
793                 if (*look && ' ' == *look)
794                 {
795                     look++;
796                 } 
797             }
798         }
799         xmlFree(text); 
800     }
801 }
802
803
804 /* DOM filter style indexing */
805 static void set_record_info(struct filter_info *tinfo, 
806                             struct recExtractCtrl *extctr, 
807                             xmlNodePtr node, 
808                             const char * id_p, 
809                             const char * rank_p, 
810                             const char * type_p)
811 {
812     /* writing debug info out */
813     if (extctr && extctr->flagShowRecords)
814         dom_log(YLOG_LOG, tinfo, node,
815                 "RECORD id=%s rank=%s type=%s", 
816                 id_p ? (const char *) id_p : "(null)",
817                 rank_p ? (const char *) rank_p : "(null)",
818                 type_p ? (const char *) type_p : "(null)");
819     
820
821     if (id_p && *id_p)
822         sscanf((const char *)id_p, "%255s", extctr->match_criteria);
823
824     if (rank_p && *rank_p)
825         extctr->staticrank = atozint((const char *)rank_p);
826
827     if (type_p && *type_p)
828     {
829         enum zebra_recctrl_action_t action = action_update;
830         if (!strcmp(type_p, "insert"))
831             action = action_insert;
832         else if (!strcmp(type_p, "delete"))
833             action = action_delete;
834         else if (!strcmp(type_p, "replace"))
835             action = action_replace;
836         else if (!strcmp(type_p, "update"))
837             action = action_update;
838         else
839             dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
840         extctr->action = action;
841     }
842
843     if (tinfo->record_info_invoked == 1)
844     {
845         /* warn about multiple only once */
846         dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
847     }
848     tinfo->record_info_invoked++;
849
850 }
851
852
853 /* DOM filter style indexing */
854 static void process_xml_element_zebra_node(struct filter_info *tinfo, 
855                                            struct recExtractCtrl *extctr, 
856                                            RecWord* recword, 
857                                            xmlNodePtr node)
858 {
859     if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
860         && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
861     {
862          if (0 == XML_STRCMP(node->name, "index"))
863          {
864             const char *index_p = 0;
865
866             struct _xmlAttr *attr;      
867             for (attr = node->properties; attr; attr = attr->next)
868             {
869                 if (attr_content_xml(attr, "name", &index_p))
870                 {
871                     index_value_of(tinfo, extctr, recword, node, index_p);
872                 }  
873                 else
874                 {
875                     dom_log(YLOG_WARN, tinfo, node,
876                             "bad attribute @%s, expected @name",
877                             attr->name);
878                 }
879             }
880         }
881         else if (0 == XML_STRCMP(node->name, "record"))
882         {
883             const char *id_p = 0;
884             const char *rank_p = 0;
885             const char *type_p = 0;
886
887             struct _xmlAttr *attr;
888             for (attr = node->properties; attr; attr = attr->next)
889             {
890                 if (attr_content_xml(attr, "id", &id_p))
891                     ;
892                 else if (attr_content_xml(attr, "rank", &rank_p))
893                     ;
894                 else if (attr_content_xml(attr, "type", &type_p))
895                     ;
896                 else
897                 {
898                     dom_log(YLOG_WARN, tinfo, node,
899                             "bad attribute @%s, expected @id|@rank|@type",
900                             attr->name);
901                 }
902             }
903             set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
904         } 
905         else
906         {
907             dom_log(YLOG_WARN, tinfo, node,
908                     "bad element <%s>,"
909                     " expected <record>|<index> in namespace '%s'",
910                     node->name, zebra_dom_ns);
911         }
912     }
913 }
914
/* Try to read one "name=value" item from the text at *c_ptr.

   Leading spaces are skipped. If the text then starts with name followed
   by '=', the characters up to the next space or the end of the string
   are copied to value (truncated to value_max-1 bytes, always
   NUL-terminated when value_max > 0) and 1 is returned. Otherwise 0 is
   returned and value is left untouched, so that a caller probing several
   names in turn does not lose a value parsed earlier.

   In both cases *c_ptr is advanced past what was consumed and past any
   spaces that follow. On a mismatch only spaces are consumed. */
static int attr_content_pi(const char **c_ptr, const char *name,
                           char *value, size_t value_max)
{
    size_t name_len = strlen(name);
    const char *look = *c_ptr;
    int ret = 0;

    while (' ' == *look)
        look++;

    /* a strncmp match guarantees that look[name_len] is readable */
    if (!strncmp(look, name, name_len) && look[name_len] == '=')
    {
        size_t i = 0;

        look += name_len + 1;
        while (*look && ' ' != *look)
        {
            /* keep one byte for the terminator; excess input is dropped */
            if (i + 1 < value_max)
                value[i++] = *look;
            look++;
        }
        if (value_max)
            value[i] = '\0';
        ret = 1;
    }

    while (' ' == *look)
        look++;
    *c_ptr = look;
    return ret;
}
946
947 /* DOM filter style indexing */
948 static void process_xml_pi_node(struct filter_info *tinfo, 
949                                 struct recExtractCtrl *extctr, 
950                                 xmlNodePtr node,
951                                 const char **index_pp)
952 {
953     /* if right PI name, continue parsing PI */
954     if (0 == strcmp(zebra_pi_name, (const char *)node->name))
955     {
956         xmlChar *pi_p =  node->content;
957         const char *look = (const char *) node->content;
958     
959         /* parsing PI record instructions */
960         if (0 == strncmp((const char *)look, "record", 6))
961         {
962             char id[256];
963             char rank[256];
964             char type[256];
965             
966             *id = '\0';
967             *rank = '\0';
968             *type = '\0';
969             look += 6;
970             while (*look)
971                 if (attr_content_pi(&look, "id", id, sizeof(id)))
972                     ;
973                 else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
974                     ;
975                 else if (attr_content_pi(&look, "type", type, sizeof(type)))
976                 {
977                     dom_log(YLOG_WARN, tinfo, node,
978                             "content '%s', can not parse '%s'",
979                             pi_p, look);
980                     break;
981                 }
982             set_record_info(tinfo, extctr, node, id, rank, type);
983         } 
984         /* parsing index instruction */
985         else if (0 == strncmp((const char *)look, "index", 5))
986         {
987             look += 5;
988       
989             /* eat whitespace */
990             while (*look && ' ' == *look)
991                 look++;
992
993             /* export index instructions to outside */
994             *index_pp = look;
995         } 
996         else 
997         {
998             dom_log(YLOG_WARN, tinfo, node,
999                     "content '%s', can not parse '%s'",
1000                     pi_p, look);
1001         }
1002     }
1003 }
1004
1005 /* DOM filter style indexing */
1006 static void process_xml_element_node(struct filter_info *tinfo, 
1007                                      struct recExtractCtrl *extctr, 
1008                                      RecWord* recword, 
1009                                      xmlNodePtr node)
1010 {
1011     /* remember indexing instruction from PI to next element node */
1012     const char *index_p = 0;
1013
1014     /* check if we are an element node in the special zebra namespace 
1015        and either set record data or index value-of node content*/
1016     process_xml_element_zebra_node(tinfo, extctr, recword, node);
1017   
1018     /* loop through kid nodes */
1019     for (node = node->children; node; node = node->next)
1020     {
1021         /* check and set PI record and index index instructions */
1022         if (node->type == XML_PI_NODE)
1023         {
1024             process_xml_pi_node(tinfo, extctr, node, &index_p);
1025         }
1026         else if (node->type == XML_ELEMENT_NODE)
1027         {
1028             /* if there was a PI index instruction before this element */
1029             if (index_p)
1030             {
1031                 index_value_of(tinfo, extctr, recword, node, index_p);
1032                 index_p = 0;
1033             }
1034             process_xml_element_node(tinfo, extctr, recword,node);
1035         }
1036         else
1037             continue;
1038     }
1039 }
1040
1041
1042 /* DOM filter style indexing */
1043 static void extract_dom_doc_node(struct filter_info *tinfo, 
1044                                  struct recExtractCtrl *extctr, 
1045                                  xmlDocPtr doc)
1046 {
1047     /* only need to do the initialization once, reuse recword for all terms */
1048     RecWord recword;
1049     (*extctr->init)(extctr, &recword);
1050
1051     process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
1052 }
1053
1054
1055
1056
1057 static int convert_extract_doc(struct filter_info *tinfo, 
1058                                struct filter_input *input,
1059                                struct recExtractCtrl *p, 
1060                                xmlDocPtr doc)
1061
1062 {
1063     xmlChar *buf_out;
1064     int len_out;
1065     const char *params[10];
1066     xsltStylesheetPtr last_xsp = 0;
1067     xmlDocPtr store_doc = 0;
1068
1069     /* per default do not ingest record */
1070     tinfo->record_info_invoked = 0;
1071
1072     /* exit if empty document given */
1073     if (!doc)
1074         return RECCTRL_EXTRACT_SKIP;
1075
1076     /* we actuallu have a document which needs to be processed further */
1077     params[0] = 0;
1078     set_param_str(params, "schema", zebra_dom_ns, tinfo->nmem_record);
1079
1080     if (p && p->flagShowRecords)
1081     {
1082         xmlChar *buf_out;
1083         int len_out;
1084 #if 0 
1085         FILE *outf = fopen("extract.xml", "w");
1086         xmlDocDumpMemory(doc, &buf_out, &len_out);
1087         fwrite(buf_out, 1, len_out, outf);
1088 #endif
1089         yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
1090 #if 0
1091         fclose(outf);
1092 #endif
1093     }
1094
1095     /* input conversion */
1096     perform_convert(tinfo, p, input->convert, params, &doc, 0);
1097
1098
1099     if (tinfo->store)
1100     {
1101         /* store conversion */
1102         store_doc = xmlCopyDoc(doc, 1);
1103         perform_convert(tinfo, p, tinfo->store->convert,
1104                         params, &store_doc, &last_xsp);
1105     }
1106     
1107     /* saving either store doc or original doc in case no store doc exists */
1108     if (last_xsp)
1109         xsltSaveResultToString(&buf_out, &len_out, 
1110                                store_doc ? store_doc : doc, last_xsp);
1111     else
1112         xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1113
1114     if (p->setStoreData)
1115         (*p->setStoreData)(p, buf_out, len_out);
1116     xmlFree(buf_out);
1117
1118     if (store_doc)
1119         xmlFreeDoc(store_doc);
1120
1121     /* extract conversion */
1122     perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
1123
1124
1125     /* finally, do the indexing */
1126     if (doc){
1127         extract_dom_doc_node(tinfo, p, doc);
1128         xmlFreeDoc(doc);
1129     }
1130     
1131     /* there was nothing to index, so there is no inserted/updated record */
1132     if (tinfo->record_info_invoked == 0)
1133         return RECCTRL_EXTRACT_SKIP;
1134
1135     return RECCTRL_EXTRACT_OK;
1136 }
1137
1138 static int extract_xml_split(struct filter_info *tinfo,
1139                              struct filter_input *input,
1140                              struct recExtractCtrl *p)
1141 {
1142     int ret;
1143
1144     if (p->first_record)
1145     {
1146         if (input->u.xmlreader.reader)
1147             xmlFreeTextReader(input->u.xmlreader.reader);
1148         input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1149                                                    p /* I/O handler */,
1150                                                    0 /* URL */, 
1151                                                    0 /* encoding */,
1152                                                    XML_PARSE_XINCLUDE
1153                                                    | XML_PARSE_NOENT
1154                                                    | XML_PARSE_NONET);
1155     }
1156     if (!input->u.xmlreader.reader)
1157         return RECCTRL_EXTRACT_ERROR_GENERIC;
1158
1159     ret = xmlTextReaderRead(input->u.xmlreader.reader);
1160     while (ret == 1)
1161     {
1162         int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1163         int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1164
1165         if (type == XML_READER_TYPE_ELEMENT && 
1166             input->u.xmlreader.split_level == depth)
1167         {
1168             xmlNodePtr ptr;
1169
1170             /* per default do not ingest record */
1171             tinfo->record_info_invoked = 0;
1172             
1173             ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1174             if (ptr)
1175                 {
1176                 /* we have a new document */
1177
1178                 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1179                 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1180                 
1181                 xmlDocSetRootElement(doc, ptr2);
1182                 
1183                 /* writing debug info out */
1184                 if (p->flagShowRecords)
1185                 {
1186                     xmlChar *buf_out = 0;
1187                     int len_out = 0;
1188                     xmlDocDumpMemory(doc, &buf_out, &len_out);
1189                     yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s", 
1190                             tinfo->fname ? tinfo->fname : "(none)",
1191                             depth, len_out, buf_out); 
1192                     xmlFree(buf_out);
1193                 }
1194                 
1195                 return convert_extract_doc(tinfo, input, p, doc);
1196             }
1197             else
1198             {
1199                 xmlFreeTextReader(input->u.xmlreader.reader);
1200                 input->u.xmlreader.reader = 0;
1201                 return RECCTRL_EXTRACT_ERROR_GENERIC;
1202             }
1203         }
1204         ret = xmlTextReaderRead(input->u.xmlreader.reader);
1205     }
1206     xmlFreeTextReader(input->u.xmlreader.reader);
1207     input->u.xmlreader.reader = 0;
1208     return RECCTRL_EXTRACT_EOF;
1209 }
1210
1211 static int extract_xml_full(struct filter_info *tinfo, 
1212                             struct filter_input *input,
1213                             struct recExtractCtrl *p)
1214 {
1215     if (p->first_record) /* only one record per stream */
1216     {
1217         xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, 
1218                                   p /* I/O handler */,
1219                                   0 /* URL */,
1220                                   0 /* encoding */,
1221                                   XML_PARSE_XINCLUDE
1222                                   | XML_PARSE_NOENT
1223                                   | XML_PARSE_NONET);
1224         if (!doc)
1225         {
1226             return RECCTRL_EXTRACT_ERROR_GENERIC;
1227         }
1228         return convert_extract_doc(tinfo, input, p, doc);
1229     }
1230     else
1231         return RECCTRL_EXTRACT_EOF;
1232 }
1233
1234 static int extract_iso2709(struct filter_info *tinfo,
1235                            struct filter_input *input,
1236                            struct recExtractCtrl *p)
1237 {
1238     char buf[100000];
1239     int record_length;
1240     int read_bytes, r;
1241
1242     if (p->stream->readf(p->stream, buf, 5) != 5)
1243         return RECCTRL_EXTRACT_EOF;
1244     while (*buf < '0' || *buf > '9')
1245     {
1246         int i;
1247
1248         dom_log(YLOG_WARN, tinfo, 0,
1249                 "MARC: Skipping bad byte %d (0x%02X)",
1250                 *buf & 0xff, *buf & 0xff);
1251         for (i = 0; i<4; i++)
1252             buf[i] = buf[i+1];
1253
1254         if (p->stream->readf(p->stream, buf+4, 1) != 1)
1255             return RECCTRL_EXTRACT_EOF;
1256     }
1257     record_length = atoi_n (buf, 5);
1258     if (record_length < 25)
1259     {
1260         dom_log(YLOG_WARN, tinfo, 0,
1261                 "MARC record length < 25, is %d",  record_length);
1262         return RECCTRL_EXTRACT_ERROR_GENERIC;
1263     }
1264     read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1265     if (read_bytes < record_length-5)
1266     {
1267         dom_log(YLOG_WARN, tinfo, 0,
1268                 "couldn't read whole MARC record");
1269         return RECCTRL_EXTRACT_ERROR_GENERIC;
1270     }
1271     r = yaz_marc_read_iso2709(input->u.marc.handle,  buf, record_length);
1272     if (r < record_length)
1273     {
1274         dom_log (YLOG_WARN, tinfo, 0,
1275                  "parsing of MARC record failed r=%d length=%d",
1276                  r, record_length);
1277         return RECCTRL_EXTRACT_ERROR_GENERIC;
1278     }
1279     else
1280     {
1281         xmlDocPtr rdoc;
1282         xmlNode *root_ptr;
1283         yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 
1284                            "http://www.loc.gov/MARC21/slim", 0, 0);
1285         rdoc = xmlNewDoc((const xmlChar*) "1.0");
1286         xmlDocSetRootElement(rdoc, root_ptr);
1287         return convert_extract_doc(tinfo, input, p, rdoc);        
1288     }
1289     return RECCTRL_EXTRACT_OK;
1290 }
1291
1292 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1293 {
1294     struct filter_info *tinfo = clientData;
1295     struct filter_input *input = tinfo->input_list;
1296
1297     if (!input)
1298         return RECCTRL_EXTRACT_ERROR_GENERIC;
1299     
1300     nmem_reset(tinfo->nmem_record);
1301
1302     if (p->setStoreData == 0)
1303         return extract_xml_full(tinfo, input, p);
1304     switch(input->type)
1305     {
1306     case DOM_INPUT_XMLREADER:
1307         if (input->u.xmlreader.split_level == 0)
1308             return extract_xml_full(tinfo, input, p);
1309         else
1310             return extract_xml_split(tinfo, input, p);
1311         break;
1312     case DOM_INPUT_MARC:
1313         return extract_iso2709(tinfo, input, p);
1314     }
1315     return RECCTRL_EXTRACT_ERROR_GENERIC;
1316 }
1317
1318 static int ioread_ret(void *context, char *buffer, int len)
1319 {
1320     struct recRetrieveCtrl *p = context;
1321     return p->stream->readf(p->stream, buffer, len);
1322 }
1323
/*
 * Close callback given to xmlReadIO() in filter_retrieve(): there is
 * nothing to release here, so it only reports success.
 */
static int ioclose_ret(void *context)
{
    (void) context; /* unused */
    return 0;
}
1328
1329 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1330 {
1331     /* const char *esn = zebra_dom_ns; */
1332     const char *esn = 0;
1333     const char *params[32];
1334     struct filter_info *tinfo = clientData;
1335     xmlDocPtr doc;
1336     struct filter_retrieve *retrieve;
1337     xsltStylesheetPtr last_xsp = 0;
1338
1339     if (p->comp)
1340     {
1341         if (p->comp->which == Z_RecordComp_simple
1342             && p->comp->u.simple->which == Z_ElementSetNames_generic)
1343         {
1344             esn = p->comp->u.simple->u.generic;
1345         }
1346         else if (p->comp->which == Z_RecordComp_complex 
1347                  && p->comp->u.complex->generic->elementSpec
1348                  && p->comp->u.complex->generic->elementSpec->which ==
1349                  Z_ElementSpec_elementSetName)
1350         {
1351             esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1352         }
1353     }
1354     retrieve = lookup_retrieve(tinfo, esn);
1355     if (!retrieve)
1356     {
1357         p->diagnostic =
1358             YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1359         p->addinfo = odr_strdup(p->odr, esn);
1360         return 0;
1361     }
1362
1363     params[0] = 0;
1364     set_param_int(params, "id", p->localno, p->odr->mem);
1365     if (p->fname)
1366         set_param_str(params, "filename", p->fname, p->odr->mem);
1367     if (p->staticrank >= 0)
1368         set_param_int(params, "rank", p->staticrank, p->odr->mem);
1369
1370     if (esn)
1371         set_param_str(params, "schema", esn, p->odr->mem);
1372     else
1373         if (retrieve->name)
1374             set_param_str(params, "schema", retrieve->name, p->odr->mem);
1375         else if (retrieve->identifier)
1376             set_param_str(params, "schema", retrieve->identifier, p->odr->mem);
1377         else
1378             set_param_str(params, "schema", "", p->odr->mem);
1379
1380     if (p->score >= 0)
1381         set_param_int(params, "score", p->score, p->odr->mem);
1382     set_param_int(params, "size", p->recordSize, p->odr->mem);
1383
1384     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1385                     0 /* URL */,
1386                     0 /* encoding */,
1387                     XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1388     if (!doc)
1389     {
1390         p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1391         return 0;
1392     }
1393
1394     /* retrieve conversion */
1395     perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
1396     if (!doc)
1397     {
1398         p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1399     }
1400     else if (!p->input_format
1401              || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
1402     {
1403         xmlChar *buf_out;
1404         int len_out;
1405
1406         if (last_xsp)
1407             xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1408         else
1409             xmlDocDumpMemory(doc, &buf_out, &len_out);            
1410
1411         p->output_format = yaz_oid_recsyn_xml;
1412         p->rec_len = len_out;
1413         p->rec_buf = odr_malloc(p->odr, p->rec_len);
1414         memcpy(p->rec_buf, buf_out, p->rec_len);
1415         xmlFree(buf_out);
1416     }
1417     else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
1418     {
1419         xmlChar *buf_out;
1420         int len_out;
1421
1422         if (last_xsp)
1423             xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1424         else
1425             xmlDocDumpMemory(doc, &buf_out, &len_out);            
1426         
1427         p->output_format = yaz_oid_recsyn_sutrs;
1428         p->rec_len = len_out;
1429         p->rec_buf = odr_malloc(p->odr, p->rec_len);
1430         memcpy(p->rec_buf, buf_out, p->rec_len);
1431         
1432         xmlFree(buf_out);
1433     }
1434     else
1435     {
1436         p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1437     }
1438     xmlFreeDoc(doc);
1439     return 0;
1440 }
1441
/*
 * Record type descriptor of this filter: its name together with the entry
 * points defined in this file.  The initialiser is positional, so the
 * order of the entries must match the member order of struct recType.
 */
static struct recType filter_type = {
    0,                /* NOTE(review): meaning of this leading field is not
                         visible here - confirm against struct recType */
    "dom",            /* presumably the name the filter is selected by */
    filter_init,
    filter_config,
    filter_destroy,
    filter_extract,   /* extraction/indexing entry point */
    filter_retrieve   /* record retrieval entry point */
};
1451
/*
 * Zero-terminated table of the record types provided by this module.
 * The symbol is named idzebra_filter_dom when IDZEBRA_STATIC_DOM is
 * defined and idzebra_filter otherwise (presumably static versus
 * loadable-module builds - confirm against the build setup).
 */
RecType
#ifdef IDZEBRA_STATIC_DOM
idzebra_filter_dom
#else
idzebra_filter
#endif

[] = {
    &filter_type,
    0,                /* end of table */
};
1463 /*
1464  * Local variables:
1465  * c-basic-offset: 4
1466  * indent-tabs-mode: nil
1467  * End:
1468  * vim: shiftwidth=4 tabstop=8 expandtab
1469  */
1470