Fixed bug #898: xslt tests fail on several platforms. Problem was
[idzebra-moved-to-github.git] / index / mod_dom.c
1 /* $Id: mod_dom.c,v 1.16 2007-02-18 21:53:22 adam Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5    This file is part of the Zebra server.
6
7    Zebra is free software; you can redistribute it and/or modify it under
8    the terms of the GNU General Public License as published by the Free
9    Software Foundation; either version 2, or (at your option) any later
10    version.
11
12    Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or
14    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15    for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>

#include <yaz/diagbib1.h>
#include <yaz/tpath.h>

#include <libxml/xmlversion.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xmlIO.h>
#include <libxml/xmlreader.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>

#if YAZ_HAVE_EXSLT
#include <libexslt/exslt.h>
#endif

#include <idzebra/util.h>
#include <idzebra/recctrl.h>
44
/* Namespace URI that an element must carry for the DOM filter to treat it
   as an <index> or <record> instruction. */
#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
static const char *zebra_dom_ns = ZEBRA_DOM_NS;

/* Name of the processing instructions that the DOM filter interprets. */
#define ZEBRA_PI_NAME "zebra-2.0"
static const char *zebra_pi_name = ZEBRA_PI_NAME;
52
53
54
55 struct convert_s {
56     const char *stylesheet;
57     xsltStylesheetPtr stylesheet_xsp;
58     struct convert_s *next;
59 };
60
/* Settings read from the <extract> element of the configuration. */
struct filter_extract {
    /* value of @name, 0 if the attribute is absent */
    const char *name;
    /* chain built from the <xslt> children of <extract> */
    struct convert_s *convert;
};
65
/* Settings read from the <store> element of the configuration. */
struct filter_store {
    /* chain built from the <xslt> children of <store> */
    struct convert_s *convert;
};
69
/* Settings read from one <retrieve> element; all <retrieve> elements are
   kept in a singly linked list in configuration order. */
struct filter_retrieve {
    /* value of @name, 0 if absent; matched by lookup_retrieve() */
    const char *name;
    /* value of @identifier, 0 if absent; matched by lookup_retrieve() */
    const char *identifier;
    /* chain built from the <xslt> children of <retrieve> */
    struct convert_s *convert;
    /* next <retrieve> entry, 0 for the last one */
    struct filter_retrieve *next;
};
76
77 #define DOM_INPUT_XMLREADER 1
78 #define DOM_INPUT_MARC 2
79 struct filter_input {
80     const char *syntax;
81     const char *name;
82     struct convert_s *convert;
83     int type;
84     union {
85         struct {
86             const char *input_charset;
87             yaz_marc_t handle;
88             yaz_iconv_t iconv;
89         } marc;
90         struct {
91             xmlTextReaderPtr reader;
92             int split_level;
93         } xmlreader;
94     } u;
95     struct filter_input *next;
96 };
97   
98 struct filter_info {
99     char *fname;
100     char *full_name;
101     const char *profile_path;
102     ODR odr_record;
103     ODR odr_config;
104     xmlDocPtr doc_config;
105     struct filter_extract *extract;
106     struct filter_retrieve *retrieve_list;
107     struct filter_input *input_list;
108     struct filter_store *store;
109 };
110
/* strcmp()/strlen() for libxml2 strings (xmlChar *).
   The arguments are parenthesised so that arbitrary expressions can be
   passed, and the cast keeps the const qualifier of the argument. */
#define XML_STRCMP(a,b)   strcmp((const char *)(a), (b))
#define XML_STRLEN(a) strlen((const char *)(a))
113
114
115
116
117 static void set_param_str(const char **params, const char *name,
118                           const char *value, ODR odr)
119 {
120     char *quoted = odr_malloc(odr, 3 + strlen(value));
121     sprintf(quoted, "'%s'", value);
122     while (*params)
123         params++;
124     params[0] = name;
125     params[1] = quoted;
126     params[2] = 0;
127 }
128
129 static void set_param_int(const char **params, const char *name,
130                           zint value, ODR odr)
131 {
132     char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
133     while (*params)
134         params++;
135     sprintf(quoted, "'" ZINT_FORMAT "'", value);
136     params[0] = name;
137     params[1] = quoted;
138     params[2] = 0;
139 }
140
141 static void *filter_init(Res res, RecType recType)
142 {
143     struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
144     tinfo->fname = 0;
145     tinfo->full_name = 0;
146     tinfo->profile_path = 0;
147     tinfo->odr_record = odr_createmem(ODR_ENCODE);
148     tinfo->odr_config = odr_createmem(ODR_ENCODE);
149     tinfo->extract = 0;
150     tinfo->retrieve_list = 0;
151     tinfo->input_list = 0;
152     tinfo->store = 0;
153     tinfo->doc_config = 0;
154
155 #if YAZ_HAVE_EXSLT
156     exsltRegisterAll(); 
157 #endif
158
159     return tinfo;
160 }
161
162 static int attr_content(struct _xmlAttr *attr, const char *name,
163                         const char **dst_content)
164 {
165     if (!XML_STRCMP(attr->name, name) && attr->children 
166         && attr->children->type == XML_TEXT_NODE)
167     {
168         *dst_content = (const char *)(attr->children->content);
169         return 1;
170     }
171     return 0;
172 }
173
174 static void destroy_xsp(struct convert_s *c)
175 {
176     while(c)
177     {
178         if (c->stylesheet_xsp)
179             xsltFreeStylesheet(c->stylesheet_xsp);
180         c = c->next;
181     }
182 }
183
184 static void destroy_dom(struct filter_info *tinfo)
185 {
186     if (tinfo->extract)
187     {
188         destroy_xsp(tinfo->extract->convert);
189         tinfo->extract = 0;
190     }
191     if (tinfo->store)
192     {
193         destroy_xsp(tinfo->store->convert);
194         tinfo->store = 0;
195     }
196     if (tinfo->input_list)
197     {
198         struct filter_input *i_ptr;
199         for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
200         {
201             switch(i_ptr->type)
202             {
203             case DOM_INPUT_XMLREADER:
204                 if (i_ptr->u.xmlreader.reader)
205                     xmlFreeTextReader(i_ptr->u.xmlreader.reader);
206                 break;
207             case DOM_INPUT_MARC:
208                 yaz_iconv_close(i_ptr->u.marc.iconv);
209                 yaz_marc_destroy(i_ptr->u.marc.handle);
210                 break;
211             }
212             destroy_xsp(i_ptr->convert);
213         }
214         tinfo->input_list = 0;
215     }
216     if (tinfo->retrieve_list)
217     {
218         struct filter_retrieve *r_ptr;
219         for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
220             destroy_xsp(r_ptr->convert);
221         tinfo->retrieve_list = 0;
222     }
223
224     if (tinfo->doc_config)
225     {
226         xmlFreeDoc(tinfo->doc_config);
227         tinfo->doc_config = 0;
228     }
229     odr_reset(tinfo->odr_config);
230 }
231
232 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
233                                struct convert_s **l)
234 {
235     *l = 0;
236     for(; ptr; ptr = ptr->next)
237     {
238         if (ptr->type != XML_ELEMENT_NODE)
239             continue;
240         if (!XML_STRCMP(ptr->name, "xslt"))
241         {
242             struct _xmlAttr *attr;
243             struct convert_s *p 
244                 = odr_malloc(tinfo->odr_config, sizeof(*p));
245             
246             p->next = 0;
247             p->stylesheet = 0;
248             p->stylesheet_xsp = 0;
249             
250             for (attr = ptr->properties; attr; attr = attr->next)
251                 if (attr_content(attr, "stylesheet", &p->stylesheet))
252                     ;
253                 else
254                 {
255                     xmlChar *node_path = xmlGetNodePath(ptr);
256                     yaz_log(YLOG_WARN, "%s: dom filter: "
257                             "%s bad attribute @%s, "
258                             "expected @stylesheet",
259                             tinfo->fname, 
260                             node_path, attr->name);
261                     xmlFree(node_path);
262                 }
263             if (p->stylesheet)
264             {
265                 char tmp_xslt_full_name[1024];
266                 if (!yaz_filepath_resolve(p->stylesheet, 
267                                           tinfo->profile_path,
268                                           NULL, 
269                                           tmp_xslt_full_name))
270                 {
271                     yaz_log(YLOG_WARN, "%s: dom filter: "
272                             "stylesheet %s not found in "
273                             "path %s",
274                             tinfo->fname,
275                             p->stylesheet, 
276                             tinfo->profile_path);
277                     return ZEBRA_FAIL;
278                 }
279                 
280                 p->stylesheet_xsp
281                     = xsltParseStylesheetFile((const xmlChar*) 
282                                               tmp_xslt_full_name);
283                 if (!p->stylesheet_xsp)
284                 {
285                     yaz_log(YLOG_WARN, "%s: dom filter: "
286                             "could not parse xslt "
287                             "stylesheet %s",
288                             tinfo->fname, tmp_xslt_full_name);
289                     return ZEBRA_FAIL;
290                 }
291             }
292             else
293             {
294                 xmlChar *node_path = xmlGetNodePath(ptr);
295                 yaz_log(YLOG_WARN, "%s: dom filter: "
296                         "%s missing attribute 'stylesheet' ", 
297                         tinfo->fname, node_path);
298                 xmlFree(node_path);
299                 return ZEBRA_FAIL;
300             }
301             *l = p;
302             l = &p->next;
303         }
304         else
305         {
306             xmlChar *node_path = xmlGetNodePath(ptr);
307             yaz_log(YLOG_LOG, 
308                     "%s: dom filter: "
309                     "%s bad node '%s'",
310                     tinfo->fname, node_path, ptr->name);
311             xmlFree(node_path);
312             return ZEBRA_FAIL;
313         }
314     }
315     return ZEBRA_OK;
316 }
317
318 static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
319                                  struct convert_s *convert,
320                                  const char **params,
321                                  xmlDocPtr *doc,
322                                  xsltStylesheetPtr *last_xsp)
323 {
324     for (; convert; convert = convert->next)
325     {
326         xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
327                                                 *doc, params);
328         if (last_xsp)
329             *last_xsp = convert->stylesheet_xsp;
330         xmlFreeDoc(*doc);
331         *doc = res_doc;
332     }
333     return ZEBRA_OK;
334 }
335
336 static struct filter_input *new_input(struct filter_info *tinfo, int type)
337 {
338     struct filter_input *p;
339     struct filter_input **np = &tinfo->input_list;
340     for (;*np; np = &(*np)->next)
341         ;
342     p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
343     p->next = 0;
344     p->syntax = 0;
345     p->name = 0;
346     p->convert = 0;
347     p->type = type;
348     return p;
349 }
350
351 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
352                              const char *syntax,
353                              const char *name)
354 {
355     for (; ptr; ptr = ptr->next)
356     {
357         if (ptr->type != XML_ELEMENT_NODE)
358             continue;
359         if (!XML_STRCMP(ptr->name, "marc"))
360         {
361             yaz_iconv_t iconv = 0;
362             const char *input_charset = "marc-8";
363             struct _xmlAttr *attr;
364             
365             for (attr = ptr->properties; attr; attr = attr->next)
366             {
367                 if (attr_content(attr, "charset", &input_charset))
368                     ;
369                 else
370                 {
371                     xmlChar *node_path = xmlGetNodePath(ptr);
372                     yaz_log(YLOG_WARN, "%s: dom filter: "
373                             "%s bad attribute @%s,"
374                             " expected @charset",
375                             tinfo->fname, 
376                             node_path, attr->name);
377                     xmlFree(node_path);
378                 }
379             }
380             iconv = yaz_iconv_open("utf-8", input_charset);
381             if (!iconv)
382             {
383                 xmlChar *node_path = xmlGetNodePath(ptr);
384                 yaz_log(YLOG_WARN, "%s: dom filter: "
385                         "%s unsupported @charset '%s'", 
386                         tinfo->fname, node_path,
387                         input_charset);
388                 xmlFree(node_path);
389                 return ZEBRA_FAIL;
390             }
391             else
392             {
393                 struct filter_input *p 
394                     = new_input(tinfo, DOM_INPUT_MARC);
395                 p->u.marc.handle = yaz_marc_create();
396                 p->u.marc.iconv = iconv;
397                 
398                 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
399                 
400                 ptr = ptr->next;
401                 
402                 parse_convert(tinfo, ptr, &p->convert);
403             }
404             break;
405
406         }
407         else if (!XML_STRCMP(ptr->name, "xmlreader"))
408         {
409             struct filter_input *p 
410                 = new_input(tinfo, DOM_INPUT_XMLREADER);
411             struct _xmlAttr *attr;
412             const char *level_str = 0;
413
414             p->u.xmlreader.split_level = 0;
415             p->u.xmlreader.reader = 0;
416
417             for (attr = ptr->properties; attr; attr = attr->next)
418             {
419                 if (attr_content(attr, "level", &level_str))
420                     ;
421                 else
422                 {
423                     xmlChar *node_path = xmlGetNodePath(ptr);
424                     yaz_log(YLOG_WARN, "%s: dom filter: "
425                             "%s bad attribute @%s,"
426                             " expected @level",
427                             tinfo->fname, node_path,
428                             attr->name);
429                     xmlFree(node_path);
430                 }
431             }
432             if (level_str)
433                 p->u.xmlreader.split_level = atoi(level_str);
434                 
435             ptr = ptr->next;
436
437             parse_convert(tinfo, ptr, &p->convert);
438             break;
439         }
440         else
441         {
442             xmlChar *node_path = xmlGetNodePath(ptr);
443             yaz_log(YLOG_WARN, "%s: dom filter: "
444                     "%s bad element <%s>,"
445                     " expected <marc>|<xmlreader>",
446                     tinfo->fname, node_path, ptr->name);
447             xmlFree(node_path);
448             return ZEBRA_FAIL;
449         }
450     }
451     return ZEBRA_OK;
452 }
453
454 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
455 {
456     char tmp_full_name[1024];
457     xmlNodePtr ptr;
458     xmlDocPtr doc;
459
460     tinfo->fname = odr_strdup(tinfo->odr_config, fname);
461     
462     if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path, 
463                              NULL, tmp_full_name))
464         tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
465     else
466         tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
467     
468     yaz_log(YLOG_LOG, "%s dom filter: "
469             "loading config file %s", tinfo->fname, tinfo->full_name);
470     
471     doc = xmlParseFile(tinfo->full_name);
472     if (!doc)
473     {
474         yaz_log(YLOG_WARN, "%s: dom filter: "
475                 "failed to parse config file %s",
476                 tinfo->fname, tinfo->full_name);
477         return ZEBRA_FAIL;
478     }
479     /* save because we store ptrs to the content */ 
480     tinfo->doc_config = doc;
481     
482     ptr = xmlDocGetRootElement(doc);
483     if (!ptr || ptr->type != XML_ELEMENT_NODE 
484         || XML_STRCMP(ptr->name, "dom"))
485     {
486         xmlChar *node_path = xmlGetNodePath(ptr);
487         yaz_log(YLOG_WARN, "%s: dom filter: "
488                 "%s bad root element <%s>,"
489                 " expected root element <dom>", 
490                 tinfo->fname, node_path, ptr->name);  
491         xmlFree(node_path);
492         return ZEBRA_FAIL;
493     }
494
495     for (ptr = ptr->children; ptr; ptr = ptr->next)
496     {
497         if (ptr->type != XML_ELEMENT_NODE)
498             continue;
499         if (!XML_STRCMP(ptr->name, "extract"))
500         {
501             /*
502               <extract name="index">
503               <xslt stylesheet="first.xsl"/>
504               <xslt stylesheet="second.xsl"/>
505               </extract>
506             */
507             struct _xmlAttr *attr;
508             struct filter_extract *f =
509                 odr_malloc(tinfo->odr_config, sizeof(*f));
510             
511             tinfo->extract = f;
512             f->name = 0;
513             f->convert = 0;
514             for (attr = ptr->properties; attr; attr = attr->next)
515             {
516                 if (attr_content(attr, "name", &f->name))
517                     ;
518                 else
519                 {
520                     xmlChar *node_path = xmlGetNodePath(ptr);
521                     yaz_log(YLOG_WARN, "%s: dom filter: "
522                             "%s bad attribute @%s"
523                             " expected @name",
524                             tinfo->fname, 
525                             node_path, attr->name);
526                     xmlFree(node_path);
527                 }
528             }
529             parse_convert(tinfo, ptr->children, &f->convert);
530         }
531         else if (!XML_STRCMP(ptr->name, "retrieve"))
532         {  
533             /* 
534                <retrieve name="F">
535                <xslt stylesheet="some.xsl"/>
536                <xslt stylesheet="some.xsl"/>
537                </retrieve>
538             */
539             struct _xmlAttr *attr;
540             struct filter_retrieve **fp = &tinfo->retrieve_list;
541             struct filter_retrieve *f =
542                 odr_malloc(tinfo->odr_config, sizeof(*f));
543             
544             while (*fp)
545                 fp = &(*fp)->next;
546
547             *fp = f;
548             f->name = 0;
549             f->identifier = 0;
550             f->convert = 0;
551             f->next = 0;
552
553             for (attr = ptr->properties; attr; attr = attr->next)
554             {
555                 if (attr_content(attr, "identifier", 
556                                  &f->identifier))
557                     ;
558                 else if (attr_content(attr, "name", &f->name))
559                     ;
560                 else
561                 {
562                     xmlChar *node_path = xmlGetNodePath(ptr);
563                     yaz_log(YLOG_WARN, "%s: dom filter: "
564                             "%s bad attribute @%s"
565                             " expected @identifier|@name",
566                             tinfo->fname, 
567                             node_path, attr->name);
568                     xmlFree(node_path);
569                 }
570             }
571             parse_convert(tinfo, ptr->children, &f->convert);
572         }
573         else if (!XML_STRCMP(ptr->name, "store"))
574         {
575             /*
576               <store name="F">
577               <xslt stylesheet="some.xsl"/>
578               <xslt stylesheet="some.xsl"/>
579               </retrieve>
580             */
581             struct filter_store *f =
582                 odr_malloc(tinfo->odr_config, sizeof(*f));
583             
584             tinfo->store = f;
585             f->convert = 0;
586             parse_convert(tinfo, ptr->children, &f->convert);
587         }
588         else if (!XML_STRCMP(ptr->name, "input"))
589         {
590             /*
591               <input syntax="xml">
592               <xmlreader level="1"/>
593               </input>
594               <input syntax="usmarc">
595               <marc inputcharset="marc-8"/>
596               </input>
597             */
598             struct _xmlAttr *attr;
599             const char  *syntax = 0;
600             const char *name = 0;
601             for (attr = ptr->properties; attr; attr = attr->next)
602             {
603                 if (attr_content(attr, "syntax", &syntax))
604                     ;
605                 else if (attr_content(attr, "name", &name))
606                     ;
607                 else
608                 {
609                     xmlChar *node_path = xmlGetNodePath(ptr);
610                     yaz_log(YLOG_WARN, "%s: dom filter: "
611                             "%s bad attribute @%s"
612                             " expected @syntax|@name",
613                             tinfo->fname, 
614                             node_path, attr->name);
615                     xmlFree(node_path);
616                 }
617             }
618             parse_input(tinfo, ptr->children, syntax, name);
619         }
620         else
621         {
622             xmlChar *node_path = xmlGetNodePath(ptr);
623             yaz_log(YLOG_WARN, "%s: dom filter: "
624                     "%s bad element <%s>,"
625                     " expected <extract>|<input>|<retrieve>|<store>",
626                     tinfo->fname, node_path, ptr->name);
627             xmlFree(node_path);
628             return ZEBRA_FAIL;
629         }
630     }
631     return ZEBRA_OK;
632 }
633
634 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
635                                                const char *est)
636 {
637     struct filter_retrieve *f = tinfo->retrieve_list;
638
639     /* return first schema if no est is provided */
640     if (!est)
641         return f;
642     for (; f; f = f->next)
643     { 
644         /* find requested schema */
645         if (est) 
646         {    
647             if (f->identifier && !strcmp(f->identifier, est))
648                 return f;
649             if (f->name && !strcmp(f->name, est))
650                 return f;
651         } 
652     }
653     return 0;
654 }
655
656 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
657 {
658     struct filter_info *tinfo = clientData;
659     if (!args || !*args)
660     {
661         yaz_log(YLOG_WARN, "dom filter: need config file");
662         return ZEBRA_FAIL;
663     }
664
665     if (tinfo->fname && !strcmp(args, tinfo->fname))
666         return ZEBRA_OK;
667     
668     tinfo->profile_path = res_get(res, "profilePath");
669
670     destroy_dom(tinfo);
671     return parse_dom(tinfo, args);
672 }
673
674 static void filter_destroy(void *clientData)
675 {
676     struct filter_info *tinfo = clientData;
677     destroy_dom(tinfo);
678     odr_destroy(tinfo->odr_config);
679     odr_destroy(tinfo->odr_record);
680     xfree(tinfo);
681 }
682
683 static int ioread_ex(void *context, char *buffer, int len)
684 {
685     struct recExtractCtrl *p = context;
686     return p->stream->readf(p->stream, buffer, len);
687 }
688
/*
 * Close callback matching ioread_ex(): there is nothing to close, so it
 * ignores CONTEXT and always returns 0.
 */
static int ioclose_ex(void *context)
{
    (void) context;
    return 0;
}
693
694
695 /* DOM filter style indexing */
696 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
697                             xmlChar **dst_content)
698 {
699     if (0 == XML_STRCMP(attr->name, name) && attr->children 
700         && attr->children->type == XML_TEXT_NODE)
701     {
702         *dst_content = (attr->children->content);
703         return 1;
704     }
705     return 0;
706 }
707
708
709 /* DOM filter style indexing */
710 static void index_value_of(struct filter_info *tinfo, 
711                            struct recExtractCtrl *extctr,
712                            RecWord* recword, 
713                            xmlNodePtr node, 
714                            xmlChar * index_p)
715 {
716     xmlChar *text = xmlNodeGetContent(node);
717     size_t text_len = strlen((const char *)text);
718
719
720     /* if there is no text, we do not need to proceed */
721     if (text_len)
722     {            
723         xmlChar *look = index_p;
724         xmlChar *bval;
725         xmlChar *eval;
726
727         xmlChar index[256];
728         xmlChar type[256];
729
730         /* assingning text to be indexed */
731         recword->term_buf = (const char *)text;
732         recword->term_len = text_len;
733
734         /* parsing all index name/type pairs */
735         /* may not start with ' ' or ':' */
736         while (*look && ' ' != *look && ':' != *look)
737         {
738             /* setting name and type to zero */
739             *index = '\0';
740             *type = '\0';
741     
742             /* parsing one index name */
743             bval = look;
744             while (*look && ':' != *look && ' ' != *look)
745             {
746                 look++;
747             }
748             eval = look;
749             strncpy((char *)index, (const char *)bval, eval - bval);
750             index[eval - bval] = '\0';
751     
752     
753             /* parsing one index type, if existing */
754             if (':' == *look)
755             {
756                 look++;
757       
758                 bval = look;
759                 while (*look && ' ' != *look)
760                 {
761                     look++;
762                 }
763                 eval = look;
764                 strncpy((char *)type, (const char *)bval, eval - bval);
765                 type[eval - bval] = '\0';
766             }
767
768             /* actually indexing the text given */
769             yaz_log(YLOG_DEBUG, "%s dom filter: "
770                     "INDEX  '%s:%s' '%s'", 
771                     tinfo->fname, index, type, text);
772
773             recword->index_name = (const char *)index;
774             if (type && *type)
775                 recword->index_type = *type;
776             (extctr->tokenAdd)(recword);
777
778             /* eat whitespaces */
779             if (*look && ' ' == *look && *(look+1))
780             {
781                 look++;
782             } 
783         }
784     }
785     
786     xmlFree(text); 
787 }
788
789
790 /* DOM filter style indexing */
791 static void set_record_info(struct filter_info *tinfo, 
792                             struct recExtractCtrl *extctr, 
793                             xmlChar * id_p, 
794                             xmlChar * rank_p, 
795                             xmlChar * type_p)
796 {
797     yaz_log(YLOG_DEBUG, "%s dom filter: "
798             "RECORD id=%s rank=%s type=%s", 
799             tinfo->fname,  id_p, rank_p, type_p);
800     
801     if (id_p)
802         sscanf((const char *)id_p, "%255s", extctr->match_criteria);
803
804     if (rank_p)
805         extctr->staticrank = atozint((const char *)rank_p);
806
807     /*     if (!strcmp("update", type_str)) */
808     /*         index_node(tinfo, ctrl, ptr, recword); */
809     /*     else if (!strcmp("delete", type_str)) */
810     /*         yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
811     /*     else */
812     /*         yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'",  */
813     /*                 type_str); */
814
815 }
816
817
818 /* DOM filter style indexing */
819 static void process_xml_element_zebra_node(struct filter_info *tinfo, 
820                                            struct recExtractCtrl *extctr, 
821                                            RecWord* recword, 
822                                            xmlNodePtr node)
823 {
824     if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
825         && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
826     {
827          if (0 == XML_STRCMP(node->name, "index"))
828          {
829             xmlChar *index_p = 0;
830
831             struct _xmlAttr *attr;      
832             for (attr = node->properties; attr; attr = attr->next)
833             {
834                 if (attr_content_xml(attr, "name", &index_p))
835                 {
836                     index_value_of(tinfo, extctr, recword,node, index_p);
837                 }  
838                 else
839                 {
840                     xmlChar *node_path = xmlGetNodePath(node);
841                     yaz_log(YLOG_WARN,"%s dom filter: "
842                             "%s bad attribute @%s, expected @name",
843                             tinfo->fname, node_path, attr->name);
844                     xmlFree(node_path);
845                 }
846             }
847         }
848         else if (0 == XML_STRCMP(node->name, "record"))
849         {
850             xmlChar *id_p = 0;
851             xmlChar *rank_p = 0;
852             xmlChar *type_p = 0;
853
854             struct _xmlAttr *attr;
855             for (attr = node->properties; attr; attr = attr->next)
856             {
857                 if (attr_content_xml(attr, "id", &id_p))
858                     ;
859                 else if (attr_content_xml(attr, "rank", &rank_p))
860                     ;
861                 else if (attr_content_xml(attr, "type", &type_p))
862                     ;
863                 else
864                 {
865                     xmlChar *node_path = xmlGetNodePath(node);
866                     yaz_log(YLOG_WARN,"%s dom filter: "
867                             "%s bad attribute @%s,"
868                             " expected @id|@rank|@type",
869                             tinfo->fname, node_path, attr->name);
870                     xmlFree(node_path);
871                 }
872
873                 if (type_p && 0 != strcmp("update", (const char *)type_p))
874                 {
875                     xmlChar *node_path = xmlGetNodePath(node);
876                     yaz_log(YLOG_WARN,"%s dom filter: "
877                             "%s attribute @%s,"
878                             " only implemented '@type='update'",
879                             tinfo->fname, node_path, attr->name);
880                     xmlFree(node_path);
881                 }
882           
883
884             }
885             set_record_info(tinfo, extctr, id_p, rank_p, type_p);
886         } 
887         else
888         {
889             xmlChar *node_path = xmlGetNodePath(node);
890             yaz_log(YLOG_WARN,"%s dom filter: "
891                     "%s bad element <%s>,"
892                     " expected <record>|<index> in namespace '%s'",
893                     tinfo->fname, node_path, 
894                     node->name, zebra_dom_ns);
895             xmlFree(node_path);
896         }
897     }
898 }
899
900
901 /* DOM filter style indexing */
902 static void process_xml_pi_node(struct filter_info *tinfo, 
903                                 struct recExtractCtrl *extctr, 
904                                 xmlNodePtr node,
905                                 xmlChar **index_pp)
906 {
907     /* if right PI name, continue parsing PI */
908     if (0 == strcmp(zebra_pi_name, (const char *)node->name))
909     {
910         xmlChar *pi_p =  node->content;
911         xmlChar *look = pi_p;
912     
913         xmlChar *bval;
914         xmlChar *eval;
915
916         /* parsing PI record instructions */
917         if (0 == strncmp((const char *)look, "record", 6))
918         {
919             xmlChar id[256];
920             xmlChar rank[256];
921             xmlChar type[256];
922
923             *id = '\0';
924             *rank = '\0';
925             *type = '\0';
926       
927             look += 6;
928       
929             /* eat whitespace */
930             while (*look && ' ' == *look && *(look+1))
931                 look++;
932
933             /* parse possible id */
934             if (*look && 0 == strncmp((const char *)look, "id=", 3))
935             {
936                 look += 3;
937                 bval = look;
938                 while (*look && ' ' != *look)
939                     look++;
940                 eval = look;
941                 strncpy((char *)id, (const char *)bval, eval - bval);
942                 id[eval - bval] = '\0';
943             }
944       
945             /* eat whitespace */
946             while (*look && ' ' == *look && *(look+1))
947                 look++;
948       
949             /* parse possible rank */
950             if (*look && 0 == strncmp((const char *)look, "rank=", 5))
951             {
952                 look += 6;
953                 bval = look;
954                 while (*look && ' ' != *look)
955                     look++;
956                 eval = look;
957                 strncpy((char *)rank, (const char *)bval, eval - bval);
958                 rank[eval - bval] = '\0';
959             }
960
961             /* eat whitespace */
962             while (*look && ' ' == *look && *(look+1))
963                 look++;
964
965             if (look && '\0' != *look)
966             {
967                 xmlChar *node_path = xmlGetNodePath(node);
968                 yaz_log(YLOG_WARN,"%s dom filter: "
969                         "%s content '%s', can not parse '%s'",
970                         tinfo->fname, node_path, pi_p, look);
971                 xmlFree(node_path);
972             }
973             else 
974                 set_record_info(tinfo, extctr, id, rank, 0);
975
976         } 
977         /* parsing index instruction */
978         else if (0 == strncmp((const char *)look, "index", 5))
979         {
980             look += 5;
981       
982             /* eat whitespace */
983             while (*look && ' ' == *look && *(look+1))
984                 look++;
985
986             /* export index instructions to outside */
987             *index_pp = look;
988         } 
989         else 
990         {
991             xmlChar *node_path = xmlGetNodePath(node);
992             yaz_log(YLOG_WARN,"%s dom filter: "
993                     "%s content '%s', can not parse '%s'",
994                     tinfo->fname, node_path, pi_p, look);
995             xmlFree(node_path);
996         }
997     }
998 }
999
1000 /* DOM filter style indexing */
1001 static void process_xml_element_node(struct filter_info *tinfo, 
1002                                      struct recExtractCtrl *extctr, 
1003                                      RecWord* recword, 
1004                                      xmlNodePtr node)
1005 {
1006     /* remember indexing instruction from PI to next element node */
1007     xmlChar *index_p = 0;
1008
1009     /* check if we are an element node in the special zebra namespace 
1010        and either set record data or index value-of node content*/
1011     process_xml_element_zebra_node(tinfo, extctr, recword, node);
1012   
1013     /* loop through kid nodes */
1014     for (node = node->children; node; node = node->next)
1015     {
1016         /* check and set PI record and index index instructions */
1017         if (node->type == XML_PI_NODE)
1018         {
1019             process_xml_pi_node(tinfo, extctr, node, &index_p);
1020         }
1021         else if (node->type == XML_ELEMENT_NODE)
1022         {
1023             /* if there was a PI index instruction before this element */
1024             if (index_p)
1025             {
1026                 index_value_of(tinfo, extctr, recword, node, index_p);
1027                 index_p = 0;
1028             }
1029             process_xml_element_node(tinfo, extctr, recword,node);
1030         }
1031         else
1032             continue;
1033     }
1034 }
1035
1036
1037 /* DOM filter style indexing */
1038 static void extract_dom_doc_node(struct filter_info *tinfo, 
1039                                  struct recExtractCtrl *extctr, 
1040                                  xmlDocPtr doc)
1041 {
1042     xmlChar *buf_out;
1043     int len_out;
1044
1045     /* only need to do the initialization once, reuse recword for all terms */
1046     RecWord recword;
1047     (*extctr->init)(extctr, &recword);
1048
1049     if (extctr->flagShowRecords)
1050     {
1051         xmlDocDumpMemory(doc, &buf_out, &len_out);
1052         fwrite(buf_out, len_out, 1, stdout);
1053         xmlFree(buf_out);
1054     }
1055
1056     process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
1057 }
1058
1059
1060
1061
1062 static int convert_extract_doc(struct filter_info *tinfo, 
1063                                struct filter_input *input,
1064                                struct recExtractCtrl *p, 
1065                                xmlDocPtr doc)
1066
1067 {
1068     xmlChar *buf_out;
1069     int len_out;
1070     const char *params[10];
1071     xsltStylesheetPtr last_xsp = 0;
1072     xmlDocPtr store_doc = 0;
1073
1074     params[0] = 0;
1075     set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1076
1077     /* input conversion */
1078     perform_convert(tinfo, input->convert, params, &doc, 0);
1079
1080     if (tinfo->store)
1081     {
1082         /* store conversion */
1083         store_doc = xmlCopyDoc(doc, 1);
1084         perform_convert(tinfo, tinfo->store->convert,
1085                         params, &store_doc, &last_xsp);
1086     }
1087     
1088     if (last_xsp)
1089         xsltSaveResultToString(&buf_out, &len_out, 
1090                                store_doc ? store_doc : doc, last_xsp);
1091     else
1092         xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1093     if (p->flagShowRecords)
1094         fwrite(buf_out, len_out, 1, stdout);
1095     (*p->setStoreData)(p, buf_out, len_out);
1096     xmlFree(buf_out);
1097
1098     if (store_doc)
1099         xmlFreeDoc(store_doc);
1100
1101     /* extract conversion */
1102     perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1103
1104     /* finally, do the indexing */
1105     if (doc)
1106     {
1107         extract_dom_doc_node(tinfo, p, doc);
1108         /* extract_doc_alvis(tinfo, p, doc); */
1109         xmlFreeDoc(doc);
1110     }
1111
1112     return RECCTRL_EXTRACT_OK;
1113 }
1114
1115 static int extract_xml_split(struct filter_info *tinfo,
1116                              struct filter_input *input,
1117                              struct recExtractCtrl *p)
1118 {
1119     int ret;
1120
1121     if (p->first_record)
1122     {
1123         if (input->u.xmlreader.reader)
1124             xmlFreeTextReader(input->u.xmlreader.reader);
1125         input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1126                                                    p /* I/O handler */,
1127                                                    0 /* URL */, 
1128                                                    0 /* encoding */,
1129                                                    XML_PARSE_XINCLUDE|
1130                                                    XML_PARSE_NOENT);
1131     }
1132     if (!input->u.xmlreader.reader)
1133         return RECCTRL_EXTRACT_ERROR_GENERIC;
1134
1135     ret = xmlTextReaderRead(input->u.xmlreader.reader);
1136     while (ret == 1)
1137     {
1138         int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1139         int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1140         if (type == XML_READER_TYPE_ELEMENT && 
1141             input->u.xmlreader.split_level == depth)
1142         {
1143             xmlNodePtr ptr
1144                 = xmlTextReaderExpand(input->u.xmlreader.reader);
1145             if (ptr)
1146             {
1147                 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1148                 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1149                 
1150                 xmlDocSetRootElement(doc, ptr2);
1151                 
1152                 return convert_extract_doc(tinfo, input, p, doc);
1153             }
1154             else
1155             {
1156                 xmlFreeTextReader(input->u.xmlreader.reader);
1157                 input->u.xmlreader.reader = 0;
1158                 return RECCTRL_EXTRACT_ERROR_GENERIC;
1159             }
1160         }
1161         ret = xmlTextReaderRead(input->u.xmlreader.reader);
1162     }
1163     xmlFreeTextReader(input->u.xmlreader.reader);
1164     input->u.xmlreader.reader = 0;
1165     return RECCTRL_EXTRACT_EOF;
1166 }
1167
1168 static int extract_xml_full(struct filter_info *tinfo, 
1169                             struct filter_input *input,
1170                             struct recExtractCtrl *p)
1171 {
1172     if (p->first_record) /* only one record per stream */
1173     {
1174         xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, 
1175                                   p /* I/O handler */,
1176                                   0 /* URL */,
1177                                   0 /* encoding */,
1178                                   XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1179         if (!doc)
1180         {
1181             return RECCTRL_EXTRACT_ERROR_GENERIC;
1182         }
1183         return convert_extract_doc(tinfo, input, p, doc);
1184     }
1185     else
1186         return RECCTRL_EXTRACT_EOF;
1187 }
1188
1189 static int extract_iso2709(struct filter_info *tinfo,
1190                            struct filter_input *input,
1191                            struct recExtractCtrl *p)
1192 {
1193     char buf[100000];
1194     int record_length;
1195     int read_bytes, r;
1196
1197     if (p->stream->readf(p->stream, buf, 5) != 5)
1198         return RECCTRL_EXTRACT_EOF;
1199     while (*buf < '0' || *buf > '9')
1200     {
1201         int i;
1202
1203         yaz_log(YLOG_WARN, "%s dom filter: "
1204                 "MARC: Skipping bad byte %d (0x%02X)",
1205                 tinfo->fname, *buf & 0xff, *buf & 0xff);
1206         for (i = 0; i<4; i++)
1207             buf[i] = buf[i+1];
1208
1209         if (p->stream->readf(p->stream, buf+4, 1) != 1)
1210             return RECCTRL_EXTRACT_EOF;
1211     }
1212     record_length = atoi_n (buf, 5);
1213     if (record_length < 25)
1214     {
1215         yaz_log (YLOG_WARN, "%s dom filter: "
1216                  "MARC record length < 25, is %d", 
1217                  tinfo->fname, record_length);
1218         return RECCTRL_EXTRACT_ERROR_GENERIC;
1219     }
1220     read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1221     if (read_bytes < record_length-5)
1222     {
1223         yaz_log (YLOG_WARN, "%s dom filter: "
1224                  "Couldn't read whole MARC record",
1225                  tinfo->fname);
1226         return RECCTRL_EXTRACT_ERROR_GENERIC;
1227     }
1228     r = yaz_marc_read_iso2709(input->u.marc.handle,  buf, record_length);
1229     if (r < record_length)
1230     {
1231         yaz_log (YLOG_WARN, "%s dom filter: "
1232                  "Parsing of MARC record failed r=%d length=%d",
1233                  tinfo->fname, r, record_length);
1234         return RECCTRL_EXTRACT_ERROR_GENERIC;
1235     }
1236     else
1237     {
1238         xmlDocPtr rdoc;
1239         xmlNode *root_ptr;
1240         yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1241         rdoc = xmlNewDoc((const xmlChar*) "1.0");
1242         xmlDocSetRootElement(rdoc, root_ptr);
1243         return convert_extract_doc(tinfo, input, p, rdoc);        
1244     }
1245     return RECCTRL_EXTRACT_OK;
1246 }
1247
1248 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1249 {
1250     struct filter_info *tinfo = clientData;
1251     struct filter_input *input = tinfo->input_list;
1252
1253     if (!input)
1254         return RECCTRL_EXTRACT_ERROR_GENERIC;
1255
1256     odr_reset(tinfo->odr_record);
1257     switch(input->type)
1258     {
1259     case DOM_INPUT_XMLREADER:
1260         if (input->u.xmlreader.split_level == 0)
1261             return extract_xml_full(tinfo, input, p);
1262         else
1263             return extract_xml_split(tinfo, input, p);
1264         break;
1265     case DOM_INPUT_MARC:
1266         return extract_iso2709(tinfo, input, p);
1267     }
1268     return RECCTRL_EXTRACT_ERROR_GENERIC;
1269 }
1270
1271 static int ioread_ret(void *context, char *buffer, int len)
1272 {
1273     struct recRetrieveCtrl *p = context;
1274     return p->stream->readf(p->stream, buffer, len);
1275 }
1276
/* libxml2 close callback used by filter_retrieve(): nothing is closed
   here, the call always reports success (0). */
static int ioclose_ret(void *context)
{
    (void)context;
    return 0;
}
1281
1282 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1283 {
1284     /* const char *esn = zebra_dom_ns; */
1285     const char *esn = 0;
1286     const char *params[32];
1287     struct filter_info *tinfo = clientData;
1288     xmlDocPtr doc;
1289     struct filter_retrieve *retrieve;
1290     xsltStylesheetPtr last_xsp = 0;
1291
1292     if (p->comp)
1293     {
1294         if (p->comp->which == Z_RecordComp_simple
1295             && p->comp->u.simple->which == Z_ElementSetNames_generic)
1296         {
1297             esn = p->comp->u.simple->u.generic;
1298         }
1299         else if (p->comp->which == Z_RecordComp_complex 
1300                  && p->comp->u.complex->generic->elementSpec
1301                  && p->comp->u.complex->generic->elementSpec->which ==
1302                  Z_ElementSpec_elementSetName)
1303         {
1304             esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1305         }
1306     }
1307     retrieve = lookup_retrieve(tinfo, esn);
1308     if (!retrieve)
1309     {
1310         p->diagnostic =
1311             YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1312         return 0;
1313     }
1314
1315     params[0] = 0;
1316     set_param_int(params, "id", p->localno, p->odr);
1317     if (p->fname)
1318         set_param_str(params, "filename", p->fname, p->odr);
1319     if (p->staticrank >= 0)
1320         set_param_int(params, "rank", p->staticrank, p->odr);
1321
1322     if (esn)
1323         set_param_str(params, "schema", esn, p->odr);
1324     else
1325         if (retrieve->name)
1326             set_param_str(params, "schema", retrieve->name, p->odr);
1327         else if (retrieve->identifier)
1328             set_param_str(params, "schema", retrieve->identifier, p->odr);
1329         else
1330             set_param_str(params, "schema", "", p->odr);
1331
1332     if (p->score >= 0)
1333         set_param_int(params, "score", p->score, p->odr);
1334     set_param_int(params, "size", p->recordSize, p->odr);
1335
1336     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1337                     0 /* URL */,
1338                     0 /* encoding */,
1339                     XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1340     if (!doc)
1341     {
1342         p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1343         return 0;
1344     }
1345
1346     /* retrieve conversion */
1347     perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1348     if (!doc)
1349     {
1350         p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1351     }
1352     else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1353     {
1354         xmlChar *buf_out;
1355         int len_out;
1356
1357         if (last_xsp)
1358             xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1359         else
1360             xmlDocDumpMemory(doc, &buf_out, &len_out);            
1361
1362         p->output_format = VAL_TEXT_XML;
1363         p->rec_len = len_out;
1364         p->rec_buf = odr_malloc(p->odr, p->rec_len);
1365         memcpy(p->rec_buf, buf_out, p->rec_len);
1366         xmlFree(buf_out);
1367     }
1368     else if (p->output_format == VAL_SUTRS)
1369     {
1370         xmlChar *buf_out;
1371         int len_out;
1372
1373         if (last_xsp)
1374             xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1375         else
1376             xmlDocDumpMemory(doc, &buf_out, &len_out);            
1377         
1378         p->output_format = VAL_SUTRS;
1379         p->rec_len = len_out;
1380         p->rec_buf = odr_malloc(p->odr, p->rec_len);
1381         memcpy(p->rec_buf, buf_out, p->rec_len);
1382         
1383         xmlFree(buf_out);
1384     }
1385     else
1386     {
1387         p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1388     }
1389     xmlFreeDoc(doc);
1390     return 0;
1391 }
1392
/* Record type descriptor of the "dom" filter.  The initializer is
   positional: a leading 0, the filter name, and then the init, config,
   destroy, extract and retrieve handlers of this file.
   NOTE(review): the meaning of the leading 0 is defined by struct
   recType in idzebra/recctrl.h -- confirm there. */
static struct recType filter_type = {
    0,
    "dom",
    filter_init,
    filter_config,
    filter_destroy,
    filter_extract,
    filter_retrieve
};
1402
/* Zero terminated table of the record types provided by this file.
   The exported symbol is named idzebra_filter_dom when
   IDZEBRA_STATIC_DOM is defined and idzebra_filter otherwise.
   NOTE(review): presumably this distinguishes a statically linked
   filter from a loadable module -- confirm against the build setup. */
RecType
#ifdef IDZEBRA_STATIC_DOM
idzebra_filter_dom
#else
idzebra_filter
#endif

[] = {
    &filter_type,
    0,
};
1414 /*
1415  * Local variables:
1416  * c-basic-offset: 4
1417  * indent-tabs-mode: nil
1418  * End:
1419  * vim: shiftwidth=4 tabstop=8 expandtab
1420  */
1421