38ac3d498e9a6094e37af76800c136e48be61dfd
[idzebra-moved-to-github.git] / index / mod_dom.c
1 /* $Id: mod_dom.c,v 1.14 2007-02-15 14:44:48 marc Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5    This file is part of the Zebra server.
6
7    Zebra is free software; you can redistribute it and/or modify it under
8    the terms of the GNU General Public License as published by the Free
9    Software Foundation; either version 2, or (at your option) any later
10    version.
11
12    Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or
14    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15    for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 #include <stdio.h>
24 #include <assert.h>
25 #include <ctype.h>
26
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
29
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
37
38 #if YAZ_HAVE_EXSLT
39 #include <libexslt/exslt.h>
40 #endif
41
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
44
/* XML namespace identifying the <index> and <record> directive elements of the DOM filter */
46 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
47 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
48
/* Target name of the processing instruction that carries DOM filter directives */
50 #define ZEBRA_PI_NAME "zebra-2.0"
51 static const char *zebra_pi_name = ZEBRA_PI_NAME;
52
53
54
55 struct convert_s {
56     const char *stylesheet;
57     xsltStylesheetPtr stylesheet_xsp;
58     struct convert_s *next;
59 };
60
/* Settings read from the <extract> config element. */
struct filter_extract
{
    const char *name;           /* value of @name, 0 if absent */
    struct convert_s *convert;  /* conversion chain built from the children */
};
65
/* Settings read from the <store> config element. */
struct filter_store
{
    struct convert_s *convert;  /* conversion chain built from the children */
};
69
/* Settings read from one <retrieve> config element; elements are kept in
   a singly linked list in configuration order. */
struct filter_retrieve
{
    const char *name;              /* value of @name, 0 if absent */
    const char *identifier;        /* value of @identifier, 0 if absent */
    struct convert_s *convert;     /* conversion chain built from the children */
    struct filter_retrieve *next;  /* following entry, 0 at the end */
};
76
77 #define DOM_INPUT_XMLREADER 1
78 #define DOM_INPUT_MARC 2
79 struct filter_input {
80     const char *syntax;
81     const char *name;
82     struct convert_s *convert;
83     int type;
84     union {
85         struct {
86             const char *input_charset;
87             yaz_marc_t handle;
88             yaz_iconv_t iconv;
89         } marc;
90         struct {
91             xmlTextReaderPtr reader;
92             int split_level;
93         } xmlreader;
94     } u;
95     struct filter_input *next;
96 };
97   
98 struct filter_info {
99     char *fname;
100     char *full_name;
101     const char *profile_path;
102     ODR odr_record;
103     ODR odr_config;
104     xmlDocPtr doc_config;
105     struct filter_extract *extract;
106     struct filter_retrieve *retrieve_list;
107     struct filter_input *input_list;
108     struct filter_store *store;
109 };
110
/* Apply the plain C string functions to libxml2 xmlChar strings.
   The arguments are parenthesized so that any expression may be passed,
   and the cast keeps const-qualified strings const. */
#define XML_STRCMP(a,b)   strcmp((const char *)(a), (b))
#define XML_STRLEN(a) strlen((const char *)(a))
113
114
115
116
117 static void set_param_str(const char **params, const char *name,
118                           const char *value, ODR odr)
119 {
120     char *quoted = odr_malloc(odr, 3 + strlen(value));
121     sprintf(quoted, "'%s'", value);
122     while (*params)
123         params++;
124     params[0] = name;
125     params[1] = quoted;
126     params[2] = 0;
127 }
128
129 static void set_param_int(const char **params, const char *name,
130                           zint value, ODR odr)
131 {
132     char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
133     while (*params)
134         params++;
135     sprintf(quoted, "'" ZINT_FORMAT "'", value);
136     params[0] = name;
137     params[1] = quoted;
138     params[2] = 0;
139 }
140
141 static void *filter_init(Res res, RecType recType)
142 {
143     struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
144     tinfo->fname = 0;
145     tinfo->full_name = 0;
146     tinfo->profile_path = 0;
147     tinfo->odr_record = odr_createmem(ODR_ENCODE);
148     tinfo->odr_config = odr_createmem(ODR_ENCODE);
149     tinfo->extract = 0;
150     tinfo->retrieve_list = 0;
151     tinfo->input_list = 0;
152     tinfo->store = 0;
153     tinfo->doc_config = 0;
154
155 #if YAZ_HAVE_EXSLT
156     exsltRegisterAll(); 
157 #endif
158
159     return tinfo;
160 }
161
162 static int attr_content(struct _xmlAttr *attr, const char *name,
163                         const char **dst_content)
164 {
165     if (!XML_STRCMP(attr->name, name) && attr->children 
166         && attr->children->type == XML_TEXT_NODE)
167         {
168             *dst_content = (const char *)(attr->children->content);
169             return 1;
170         }
171     return 0;
172 }
173
174 static void destroy_xsp(struct convert_s *c)
175 {
176     while(c)
177         {
178             if (c->stylesheet_xsp)
179                 xsltFreeStylesheet(c->stylesheet_xsp);
180             c = c->next;
181         }
182 }
183
184 static void destroy_dom(struct filter_info *tinfo)
185 {
186     if (tinfo->extract)
187         {
188             destroy_xsp(tinfo->extract->convert);
189             tinfo->extract = 0;
190         }
191     if (tinfo->store)
192         {
193             destroy_xsp(tinfo->store->convert);
194             tinfo->store = 0;
195         }
196     if (tinfo->input_list)
197         {
198             struct filter_input *i_ptr;
199             for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
200                 {
201                     switch(i_ptr->type)
202                         {
203                         case DOM_INPUT_XMLREADER:
204                             if (i_ptr->u.xmlreader.reader)
205                                 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
206                             break;
207                         case DOM_INPUT_MARC:
208                             yaz_iconv_close(i_ptr->u.marc.iconv);
209                             yaz_marc_destroy(i_ptr->u.marc.handle);
210                             break;
211                         }
212                     destroy_xsp(i_ptr->convert);
213                 }
214             tinfo->input_list = 0;
215         }
216     if (tinfo->retrieve_list)
217         {
218             struct filter_retrieve *r_ptr;
219             for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
220                 destroy_xsp(r_ptr->convert);
221             tinfo->retrieve_list = 0;
222         }
223
224     if (tinfo->doc_config)
225         {
226             xmlFreeDoc(tinfo->doc_config);
227             tinfo->doc_config = 0;
228         }
229     odr_reset(tinfo->odr_config);
230 }
231
232 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
233                                struct convert_s **l)
234 {
235     *l = 0;
236     for(; ptr; ptr = ptr->next)
237         {
238             if (ptr->type != XML_ELEMENT_NODE)
239                 continue;
240             if (!XML_STRCMP(ptr->name, "xslt"))
241                 {
242                     struct _xmlAttr *attr;
243                     struct convert_s *p 
244                         = odr_malloc(tinfo->odr_config, sizeof(*p));
245
246                     p->next = 0;
247                     p->stylesheet = 0;
248                     p->stylesheet_xsp = 0;
249
250                     for (attr = ptr->properties; attr; attr = attr->next)
251                         if (attr_content(attr, "stylesheet", &p->stylesheet))
252                             ;
253                         else
254                             yaz_log(YLOG_WARN, "%s: dom filter: "
255                                     "%s bad attribute @%s, "
256                                     "expected @stylesheet",
257                                     tinfo->fname, 
258                                     xmlGetNodePath(ptr), attr->name);
259                     if (p->stylesheet)
260                         {
261                             char tmp_xslt_full_name[1024];
262                             if (!yaz_filepath_resolve(p->stylesheet, 
263                                                       tinfo->profile_path,
264                                                       NULL, 
265                                                       tmp_xslt_full_name))
266                                 {
267                                     yaz_log(YLOG_WARN, "%s: dom filter: "
268                                             "stylesheet %s not found in "
269                                             "path %s",
270                                             tinfo->fname,
271                                             p->stylesheet, 
272                                             tinfo->profile_path);
273                                     return ZEBRA_FAIL;
274                                 }
275                 
276                             p->stylesheet_xsp
277                                 = xsltParseStylesheetFile((const xmlChar*) 
278                                                           tmp_xslt_full_name);
279                             if (!p->stylesheet_xsp)
280                                 {
281                                     yaz_log(YLOG_WARN, "%s: dom filter: "
282                                             "could not parse xslt "
283                                             "stylesheet %s",
284                                             tinfo->fname, tmp_xslt_full_name);
285                                     return ZEBRA_FAIL;
286                                 }
287                         }
288                     else
289                         {
290                             yaz_log(YLOG_WARN, "%s: dom filter: "
291                                     "%s missing attribute 'stylesheet' ", 
292                                     tinfo->fname, xmlGetNodePath(ptr));
293                             return ZEBRA_FAIL;
294                         }
295                     *l = p;
296                     l = &p->next;
297                 }
298             else
299                 {
300                     yaz_log(YLOG_LOG, 
301                             "%s: dom filter: "
302                             "%s bad node '%s'",
303                             tinfo->fname, xmlGetNodePath(ptr), ptr->name);
304                     return ZEBRA_FAIL;
305                 }
306         
307         }
308     return ZEBRA_OK;
309 }
310
311 static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
312                                  struct convert_s *convert,
313                                  const char **params,
314                                  xmlDocPtr *doc,
315                                  xsltStylesheetPtr *last_xsp)
316 {
317     for (; convert; convert = convert->next)
318         {
319             xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
320                                                     *doc, params);
321             if (last_xsp)
322                 *last_xsp = convert->stylesheet_xsp;
323             xmlFreeDoc(*doc);
324             *doc = res_doc;
325         }
326     return ZEBRA_OK;
327 }
328
329 static struct filter_input *new_input(struct filter_info *tinfo, int type)
330 {
331     struct filter_input *p;
332     struct filter_input **np = &tinfo->input_list;
333     for (;*np; np = &(*np)->next)
334         ;
335     p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
336     p->next = 0;
337     p->syntax = 0;
338     p->name = 0;
339     p->convert = 0;
340     p->type = type;
341     return p;
342 }
343
344 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
345                              const char *syntax,
346                              const char *name)
347 {
348     for (; ptr; ptr = ptr->next)
349         {
350             if (ptr->type != XML_ELEMENT_NODE)
351                 continue;
352             if (!XML_STRCMP(ptr->name, "marc"))
353                 {
354                     yaz_iconv_t iconv = 0;
355                     const char *input_charset = "marc-8";
356                     struct _xmlAttr *attr;
357             
358                     for (attr = ptr->properties; attr; attr = attr->next)
359                         {
360                             if (attr_content(attr, "charset", &input_charset))
361                                 ;
362                             else
363                                 yaz_log(YLOG_WARN, "%s: dom filter: "
364                                         "%s bad attribute @%s,"
365                                         " expected @charset",
366                                         tinfo->fname, 
367                                         xmlGetNodePath(ptr), attr->name);
368                         }
369                     iconv = yaz_iconv_open("utf-8", input_charset);
370                     if (!iconv)
371                         {
372                             yaz_log(YLOG_WARN, "%s: dom filter: "
373                                     "%s unsupported @charset '%s'", 
374                                     tinfo->fname, xmlGetNodePath(ptr),
375                                     input_charset);
376                             return ZEBRA_FAIL;
377                         }
378                     else
379                         {
380                             struct filter_input *p 
381                                 = new_input(tinfo, DOM_INPUT_MARC);
382                             p->u.marc.handle = yaz_marc_create();
383                             p->u.marc.iconv = iconv;
384                 
385                             yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
386                 
387                             ptr = ptr->next;
388                 
389                             parse_convert(tinfo, ptr, &p->convert);
390                         }
391                     break;
392
393                 }
394             else if (!XML_STRCMP(ptr->name, "xmlreader"))
395                 {
396                     struct filter_input *p 
397                         = new_input(tinfo, DOM_INPUT_XMLREADER);
398                     struct _xmlAttr *attr;
399                     const char *level_str = 0;
400
401                     p->u.xmlreader.split_level = 0;
402                     p->u.xmlreader.reader = 0;
403
404                     for (attr = ptr->properties; attr; attr = attr->next)
405                         {
406                             if (attr_content(attr, "level", &level_str))
407                                 ;
408                             else
409                                 yaz_log(YLOG_WARN, "%s: dom filter: "
410                                         "%s bad attribute @%s,"
411                                         " expected @level",
412                                         tinfo->fname, xmlGetNodePath(ptr),
413                                         attr->name);
414                         }
415                     if (level_str)
416                         p->u.xmlreader.split_level = atoi(level_str);
417                 
418                     ptr = ptr->next;
419
420                     parse_convert(tinfo, ptr, &p->convert);
421                     break;
422                 }
423             else
424                 {
425                     yaz_log(YLOG_WARN, "%s: dom filter: "
426                             "%s bad element <%s>,"
427                             " expected <marc>|<xmlreader>",
428                             tinfo->fname, xmlGetNodePath(ptr), ptr->name);
429                     return ZEBRA_FAIL;
430                 }
431         }
432     return ZEBRA_OK;
433 }
434
435 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
436 {
437     char tmp_full_name[1024];
438     xmlNodePtr ptr;
439     xmlDocPtr doc;
440
441     tinfo->fname = odr_strdup(tinfo->odr_config, fname);
442     
443     if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path, 
444                              NULL, tmp_full_name))
445         tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
446     else
447         tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
448     
449     yaz_log(YLOG_LOG, "%s dom filter: "
450             "loading config file %s", tinfo->fname, tinfo->full_name);
451     
452     doc = xmlParseFile(tinfo->full_name);
453     if (!doc)
454         {
455             yaz_log(YLOG_WARN, "%s: dom filter: "
456                     "failed to parse config file %s",
457                     tinfo->fname, tinfo->full_name);
458             return ZEBRA_FAIL;
459         }
460     /* save because we store ptrs to the content */ 
461     tinfo->doc_config = doc;
462     
463     ptr = xmlDocGetRootElement(doc);
464     if (!ptr || ptr->type != XML_ELEMENT_NODE 
465         || XML_STRCMP(ptr->name, "dom"))
466         {
467             yaz_log(YLOG_WARN, "%s: dom filter: "
468                     "%s bad root element <%s>,"
469                     " expected root element <dom>", 
470                     tinfo->fname, xmlGetNodePath(ptr), ptr->name);  
471             return ZEBRA_FAIL;
472         }
473
474     for (ptr = ptr->children; ptr; ptr = ptr->next)
475         {
476             if (ptr->type != XML_ELEMENT_NODE)
477                 continue;
478             if (!XML_STRCMP(ptr->name, "extract"))
479                 {
480                     /*
481                       <extract name="index">
482                       <xslt stylesheet="first.xsl"/>
483                       <xslt stylesheet="second.xsl"/>
484                       </extract>
485                     */
486                     struct _xmlAttr *attr;
487                     struct filter_extract *f =
488                         odr_malloc(tinfo->odr_config, sizeof(*f));
489             
490                     tinfo->extract = f;
491                     f->name = 0;
492                     f->convert = 0;
493                     for (attr = ptr->properties; attr; attr = attr->next)
494                         {
495                             if (attr_content(attr, "name", &f->name))
496                                 ;
497                             else
498                                 yaz_log(YLOG_WARN, "%s: dom filter: "
499                                         "%s bad attribute @%s"
500                                         " expected @name",
501                                         tinfo->fname, 
502                                         xmlGetNodePath(ptr),attr->name);
503
504                         }
505                     parse_convert(tinfo, ptr->children, &f->convert);
506                 }
507             else if (!XML_STRCMP(ptr->name, "retrieve"))
508                 {  
509                     /* 
510                        <retrieve name="F">
511                        <xslt stylesheet="some.xsl"/>
512                        <xslt stylesheet="some.xsl"/>
513                        </retrieve>
514                     */
515                     struct _xmlAttr *attr;
516                     struct filter_retrieve **fp = &tinfo->retrieve_list;
517                     struct filter_retrieve *f =
518                         odr_malloc(tinfo->odr_config, sizeof(*f));
519             
520                     while (*fp)
521                         fp = &(*fp)->next;
522
523                     *fp = f;
524                     f->name = 0;
525                     f->identifier = 0;
526                     f->convert = 0;
527                     f->next = 0;
528
529                     for (attr = ptr->properties; attr; attr = attr->next)
530                         {
531                             if (attr_content(attr, "identifier", 
532                                              &f->identifier))
533                                 ;
534                             else if (attr_content(attr, "name", &f->name))
535                                 ;
536                             else
537                                 yaz_log(YLOG_WARN, "%s: dom filter: "
538                                         "%s bad attribute @%s"
539                                         " expected @identifier|@name",
540                                         tinfo->fname, 
541                                         xmlGetNodePath(ptr),attr->name);
542                         }
543                     parse_convert(tinfo, ptr->children, &f->convert);
544                 }
545             else if (!XML_STRCMP(ptr->name, "store"))
546                 {
547                     /*
548                       <store name="F">
549                       <xslt stylesheet="some.xsl"/>
550                       <xslt stylesheet="some.xsl"/>
551                       </retrieve>
552                     */
553                     struct filter_store *f =
554                         odr_malloc(tinfo->odr_config, sizeof(*f));
555             
556                     tinfo->store = f;
557                     f->convert = 0;
558                     parse_convert(tinfo, ptr->children, &f->convert);
559                 }
560             else if (!XML_STRCMP(ptr->name, "input"))
561                 {
562                     /*
563                       <input syntax="xml">
564                       <xmlreader level="1"/>
565                       </input>
566                       <input syntax="usmarc">
567                       <marc inputcharset="marc-8"/>
568                       </input>
569                     */
570                     struct _xmlAttr *attr;
571                     const char  *syntax = 0;
572                     const char *name = 0;
573                     for (attr = ptr->properties; attr; attr = attr->next)
574                         {
575                             if (attr_content(attr, "syntax", &syntax))
576                                 ;
577                             else if (attr_content(attr, "name", &name))
578                                 ;
579                             else
580                                 yaz_log(YLOG_WARN, "%s: dom filter: "
581                                         "%s bad attribute @%s"
582                                         " expected @syntax|@name",
583                                         tinfo->fname, 
584                                         xmlGetNodePath(ptr),attr->name);
585                         }
586                     parse_input(tinfo, ptr->children, syntax, name);
587                 }
588             else
589                 {
590                     yaz_log(YLOG_WARN, "%s: dom filter: "
591                             "%s bad element <%s>,"
592                             " expected <extract>|<input>|<retrieve>|<store>",
593                             tinfo->fname, xmlGetNodePath(ptr), ptr->name);
594                     return ZEBRA_FAIL;
595                 }
596         }
597     return ZEBRA_OK;
598 }
599
600 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
601                                                const char *est)
602 {
603     struct filter_retrieve *f = tinfo->retrieve_list;
604
605     /* return first schema if no est is provided */
606     if (!est)
607         return f;
608     for (; f; f = f->next)
609         { 
610             /* find requested schema */
611             if (est) 
612                 {    
613                     if (f->identifier && !strcmp(f->identifier, est))
614                         return f;
615                     if (f->name && !strcmp(f->name, est))
616                         return f;
617                 } 
618         }
619     return 0;
620 }
621
622 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
623 {
624     struct filter_info *tinfo = clientData;
625     if (!args || !*args)
626         {
627             yaz_log(YLOG_WARN, "dom filter: need config file");
628             return ZEBRA_FAIL;
629         }
630
631     if (tinfo->fname && !strcmp(args, tinfo->fname))
632         return ZEBRA_OK;
633     
634     tinfo->profile_path = res_get(res, "profilePath");
635
636     destroy_dom(tinfo);
637     return parse_dom(tinfo, args);
638 }
639
640 static void filter_destroy(void *clientData)
641 {
642     struct filter_info *tinfo = clientData;
643     destroy_dom(tinfo);
644     odr_destroy(tinfo->odr_config);
645     odr_destroy(tinfo->odr_record);
646     xfree(tinfo);
647 }
648
649 static int ioread_ex(void *context, char *buffer, int len)
650 {
651     struct recExtractCtrl *p = context;
652     return p->stream->readf(p->stream, buffer, len);
653 }
654
/* Close callback matching ioread_ex: there is nothing to close, so it
   ignores its context and reports success (0). */
static int ioclose_ex(void *context)
{
    (void) context;
    return 0;
}
659
660
661 /* DOM filter style indexing */
662 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
663                             xmlChar **dst_content)
664 {
665     if (0 == XML_STRCMP(attr->name, name) && attr->children 
666         && attr->children->type == XML_TEXT_NODE)
667         {
668             *dst_content = (attr->children->content);
669             return 1;
670         }
671     return 0;
672 }
673
674
675 /* DOM filter style indexing */
676 static void index_value_of(struct filter_info *tinfo, 
677                            struct recExtractCtrl *extctr, 
678                            xmlNodePtr node, 
679                            xmlChar * index_p)
680 {
681     xmlChar *text = xmlNodeGetContent(node);
682     size_t text_len = strlen((const char *)text);
683
684
685     /* if there is no text, we do not need to proceed */
686     if (text_len)
687         {            
688             xmlChar *look = index_p;
689             xmlChar *bval;
690             xmlChar *eval;
691
692             xmlChar index[256];
693             xmlChar type[256];
694
695             /* assingning text to be indexed */
696             RecWord recWord;
697             (*extctr->init)(extctr, &recWord);
698             recWord.term_buf = (const char *)text;
699             recWord.term_len = text_len;
700
701             /* parsing all index name/type pairs */
702             /* may not start with ' ' or ':' */
703             while (*look && ' ' != *look && ':' != *look){
704     
705                 /* setting name and type to zero */
706                 *index = '\0';
707                 *type = '\0';
708     
709                 /* parsing one index name */
710                 bval = look;
711                 while (*look && ':' != *look && ' ' != *look){
712                     look++;
713                 }
714                 eval = look;
715                 strncpy((char *)index, (const char *)bval, eval - bval);
716                 index[eval - bval] = '\0';
717     
718     
719                 /* parsing one index type, if existing */
720                 if (':' == *look){
721                     look++;
722       
723                     bval = look;
724                     while (*look && ' ' != *look){
725                         look++;
726                     }
727                     eval = look;
728                     strncpy((char *)type, (const char *)bval, eval - bval);
729                     type[eval - bval] = '\0';
730                 }
731
732                 /* actually indexing the text given */
733                 yaz_log(YLOG_DEBUG, "%s dom filter: "
734                         "INDEX  '%s:%s' '%s'", 
735                         tinfo->fname, index, type, text);
736
737                 recWord.index_name = (const char *)index;
738                 if (type && *type)
739                     recWord.index_type = *type;
740                 (extctr->tokenAdd)(&recWord);
741
742                 /* eat whitespaces */
743                 if (*look && ' ' == *look && *(look+1)){
744                     look++;
745                 } 
746             }
747         }
748     
749     xmlFree(text); 
750 }
751
752
753 /* DOM filter style indexing */
754 static void set_record_info(struct filter_info *tinfo, 
755                             struct recExtractCtrl *extctr, 
756                             xmlChar * id_p, 
757                             xmlChar * rank_p, 
758                             xmlChar * type_p)
759 {
760     yaz_log(YLOG_DEBUG, "%s dom filter: "
761             "RECORD id=%s rank=%s type=%s", 
762             tinfo->fname,  id_p, rank_p, type_p);
763     
764     if (id_p)
765         sscanf((const char *)id_p, "%255s", extctr->match_criteria);
766
767     if (rank_p)
768         extctr->staticrank = atozint((const char *)rank_p);
769
770     /*     if (!strcmp("update", type_str)) */
771     /*         index_node(tinfo, ctrl, ptr, recWord); */
772     /*     else if (!strcmp("delete", type_str)) */
773     /*         yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
774     /*     else */
775     /*         yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'",  */
776     /*                 type_str); */
777
778 }
779
780
781 /* DOM filter style indexing */
782 static void process_xml_element_zebra_node(struct filter_info *tinfo, 
783                                            struct recExtractCtrl *extctr, 
784                                            xmlNodePtr node)
785 {
786     if (node->type == XML_ELEMENT_NODE 
787         && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)){
788     
789         if (0 == XML_STRCMP(node->name, "index")){
790             xmlChar *index_p = 0;
791
792             struct _xmlAttr *attr;      
793             for (attr = node->properties; attr; attr = attr->next){
794                 if (attr_content_xml(attr, "name", &index_p)){
795                     index_value_of(tinfo, extctr, node, index_p);        
796                 }  
797                 else
798                     yaz_log(YLOG_WARN,"%s dom filter: "
799                             "%s bad attribute @%s, expected @name",
800                             tinfo->fname, xmlGetNodePath(node), attr->name);
801             }
802         }
803         else if (0 == XML_STRCMP(node->name, "record")){
804             xmlChar *id_p = 0;
805             xmlChar *rank_p = 0;
806             xmlChar *type_p = 0;
807
808             struct _xmlAttr *attr;
809             for (attr = node->properties; attr; attr = attr->next){
810                 if (attr_content_xml(attr, "id", &id_p))
811                     ;
812                 else if (attr_content_xml(attr, "rank", &rank_p))
813                     ;
814                 else if (attr_content_xml(attr, "type", &type_p))
815                    ;
816                 else
817                     yaz_log(YLOG_WARN,"%s dom filter: "
818                             "%s bad attribute @%s,"
819                            " expected @id|@rank|@type",
820                            tinfo->fname, xmlGetNodePath(node), attr->name);
821
822                 if (type_p && 0 != strcmp("update", (const char *)type_p))
823                     yaz_log(YLOG_WARN,"%s dom filter: "
824                             "%s attribute @%s,"
825                             " only implemented '@type='update'",
826                             tinfo->fname, xmlGetNodePath(node), attr->name);
827           
828
829             }
830             set_record_info(tinfo, extctr, id_p, rank_p, type_p);
831         } else {
832             yaz_log(YLOG_WARN,"%s dom filter: "
833                     "%s bad element <%s>,"
834                     " expected <record>|<index> in namespace '%s'",
835                     tinfo->fname, xmlGetNodePath(node), 
836                     node->name, zebra_dom_ns);
837       
838         }
839     }
840 }
841
842
843 /* DOM filter style indexing */
844 static void process_xml_pi_node(struct filter_info *tinfo, 
845                                 struct recExtractCtrl *extctr, 
846                                 xmlNodePtr node,
847                                 xmlChar **index_pp)
848 {
849
850     /* yaz_log(YLOG_DEBUG,"PI     %s\n", xmlGetNodePath(node)); */
851
852     /* if right PI name, continue parsing PI */
853     if (0 == strcmp(zebra_pi_name, (const char *)node->name)){
854         xmlChar *pi_p =  node->content;
855         xmlChar *look = pi_p;
856     
857         xmlChar *bval;
858         xmlChar *eval;
859
860         /* parsing PI record instructions */
861         if (0 == strncmp((const char *)look, "record", 6)){
862             xmlChar id[256];
863             xmlChar rank[256];
864             xmlChar type[256];
865
866             *id = '\0';
867             *rank = '\0';
868             *type = '\0';
869       
870             look += 6;
871       
872             /* eat whitespace */
873             while (*look && ' ' == *look && *(look+1))
874                 look++;
875
876             /* parse possible id */
877             if (*look && 0 == strncmp((const char *)look, "id=", 3)){
878                 look += 3;
879                 bval = look;
880                 while (*look && ' ' != *look)
881                     look++;
882                 eval = look;
883                 strncpy((char *)id, (const char *)bval, eval - bval);
884                 id[eval - bval] = '\0';
885             }
886       
887             /* eat whitespace */
888             while (*look && ' ' == *look && *(look+1))
889                 look++;
890       
891             /* parse possible rank */
892             if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
893                 look += 6;
894                 bval = look;
895                 while (*look && ' ' != *look)
896                     look++;
897                 eval = look;
898                 strncpy((char *)rank, (const char *)bval, eval - bval);
899                 rank[eval - bval] = '\0';
900             }
901
902             /* eat whitespace */
903             while (*look && ' ' == *look && *(look+1))
904                 look++;
905
906             if (look && '\0' != *look)
907                 yaz_log(YLOG_WARN,"%s dom filter: "
908                         "%s content '%s', can not parse '%s'",
909                         tinfo->fname, xmlGetNodePath(node), pi_p, look);
910             else 
911                 set_record_info(tinfo, extctr, id, rank, 0);
912
913         } 
914    
915         /* parsing index instruction */
916         else   if (0 == strncmp((const char *)look, "index", 5)){
917             look += 5;
918       
919             /* eat whitespace */
920             while (*look && ' ' == *look && *(look+1))
921                 look++;
922
923             /* export index instructions to outside */
924             *index_pp = look;
925         } 
926         else 
927             yaz_log(YLOG_WARN,"%s dom filter: "
928                     "%s content '%s', can not parse '%s'",
929                     tinfo->fname, xmlGetNodePath(node), pi_p, look);
930     }
931 }
932
933 /* DOM filter style indexing */
934 static void process_xml_element_node(struct filter_info *tinfo, 
935                                      struct recExtractCtrl *extctr, 
936                                      xmlNodePtr node)
937 {
938     /* remember indexing instruction from PI to next element node */
939     xmlChar *index_p = 0;
940
941     /* yaz_log(YLOG_DEBUG,"ELEM   %s\n", xmlGetNodePath(node)); */
942
943     /* check if we are an element node in the special zebra namespace 
944        and either set record data or index value-of node content*/
945     process_xml_element_zebra_node(tinfo, extctr, node);
946   
947     /* loop through kid nodes */
948     for (node = node->children; node; node = node->next)
949         {
950             /* check and set PI record and index index instructions */
951             if (node->type == XML_PI_NODE){
952                 process_xml_pi_node(tinfo, extctr, node, &index_p);
953             }
954             else if (node->type == XML_ELEMENT_NODE){
955                 /* if there was a PI index instruction before this element */
956                 if (index_p){
957                     index_value_of(tinfo, extctr, node, index_p);            
958                     index_p = 0;
959                 }
960                 process_xml_element_node(tinfo, extctr, node);
961             }
962             else
963                 continue;
964         }
965 }
966
967
968 /* DOM filter style indexing */
969 static void extract_dom_doc_node(struct filter_info *tinfo, 
970                                  struct recExtractCtrl *extctr, 
971                                  xmlDocPtr doc)
972 {
973     /* yaz_log(YLOG_DEBUG,"DOC    %s\n", xmlGetNodePath((xmlNodePtr)doc)); */
974
975     xmlChar *buf_out;
976     int len_out;
977     if (extctr->flagShowRecords){
978         xmlDocDumpMemory(doc, &buf_out, &len_out);
979         fwrite(buf_out, len_out, 1, stdout);
980         xmlFree(buf_out);
981     }
982
983     process_xml_element_node(tinfo, extctr, (xmlNodePtr)doc);
984 }
985
986
987
988
989 static int convert_extract_doc(struct filter_info *tinfo, 
990                                struct filter_input *input,
991                                struct recExtractCtrl *p, 
992                                xmlDocPtr doc)
993
994 {
995     xmlChar *buf_out;
996     int len_out;
997     const char *params[10];
998     xsltStylesheetPtr last_xsp = 0;
999     xmlDocPtr store_doc = 0;
1000
1001     params[0] = 0;
1002     set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1003
1004     /* input conversion */
1005     perform_convert(tinfo, input->convert, params, &doc, 0);
1006
1007     if (tinfo->store)
1008         {
1009             /* store conversion */
1010             store_doc = xmlCopyDoc(doc, 1);
1011             perform_convert(tinfo, tinfo->store->convert,
1012                             params, &store_doc, &last_xsp);
1013         }
1014     
1015     if (last_xsp)
1016         xsltSaveResultToString(&buf_out, &len_out, 
1017                                store_doc ? store_doc : doc, last_xsp);
1018     else
1019         xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1020     if (p->flagShowRecords)
1021         fwrite(buf_out, len_out, 1, stdout);
1022     (*p->setStoreData)(p, buf_out, len_out);
1023     xmlFree(buf_out);
1024
1025     if (store_doc)
1026         xmlFreeDoc(store_doc);
1027
1028     /* extract conversion */
1029     perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1030
1031     /* finally, do the indexing */
1032     if (doc){
1033         extract_dom_doc_node(tinfo, p, doc);
1034         /* extract_doc_alvis(tinfo, p, doc); */
1035         xmlFreeDoc(doc);
1036     }
1037
1038     return RECCTRL_EXTRACT_OK;
1039 }
1040
1041 static int extract_xml_split(struct filter_info *tinfo,
1042                              struct filter_input *input,
1043                              struct recExtractCtrl *p)
1044 {
1045     int ret;
1046
1047     if (p->first_record)
1048         {
1049             if (input->u.xmlreader.reader)
1050                 xmlFreeTextReader(input->u.xmlreader.reader);
1051             input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1052                                                        p /* I/O handler */,
1053                                                        0 /* URL */, 
1054                                                        0 /* encoding */,
1055                                                        XML_PARSE_XINCLUDE|
1056                                                        XML_PARSE_NOENT);
1057         }
1058     if (!input->u.xmlreader.reader)
1059         return RECCTRL_EXTRACT_ERROR_GENERIC;
1060
1061     ret = xmlTextReaderRead(input->u.xmlreader.reader);
1062     while (ret == 1)
1063         {
1064             int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1065             int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1066             if (type == XML_READER_TYPE_ELEMENT && 
1067                 input->u.xmlreader.split_level == depth)
1068                 {
1069                     xmlNodePtr ptr
1070                         = xmlTextReaderExpand(input->u.xmlreader.reader);
1071                     if (ptr)
1072                         {
1073                             xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1074                             xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1075                 
1076                             xmlDocSetRootElement(doc, ptr2);
1077                 
1078                             return convert_extract_doc(tinfo, input, p, doc);
1079                         }
1080                     else
1081                         {
1082                             xmlFreeTextReader(input->u.xmlreader.reader);
1083                             input->u.xmlreader.reader = 0;
1084                             return RECCTRL_EXTRACT_ERROR_GENERIC;
1085                         }
1086                 }
1087             ret = xmlTextReaderRead(input->u.xmlreader.reader);
1088         }
1089     xmlFreeTextReader(input->u.xmlreader.reader);
1090     input->u.xmlreader.reader = 0;
1091     return RECCTRL_EXTRACT_EOF;
1092 }
1093
1094 static int extract_xml_full(struct filter_info *tinfo, 
1095                             struct filter_input *input,
1096                             struct recExtractCtrl *p)
1097 {
1098     if (p->first_record) /* only one record per stream */
1099         {
1100             xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, 
1101                                       p /* I/O handler */,
1102                                       0 /* URL */,
1103                                       0 /* encoding */,
1104                                       XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1105             if (!doc)
1106                 {
1107                     return RECCTRL_EXTRACT_ERROR_GENERIC;
1108                 }
1109             return convert_extract_doc(tinfo, input, p, doc);
1110         }
1111     else
1112         return RECCTRL_EXTRACT_EOF;
1113 }
1114
1115 static int extract_iso2709(struct filter_info *tinfo,
1116                            struct filter_input *input,
1117                            struct recExtractCtrl *p)
1118 {
1119     char buf[100000];
1120     int record_length;
1121     int read_bytes, r;
1122
1123     if (p->stream->readf(p->stream, buf, 5) != 5)
1124         return RECCTRL_EXTRACT_EOF;
1125     while (*buf < '0' || *buf > '9')
1126         {
1127             int i;
1128
1129             yaz_log(YLOG_WARN, "%s dom filter: "
1130                     "MARC: Skipping bad byte %d (0x%02X)",
1131                     tinfo->fname, *buf & 0xff, *buf & 0xff);
1132             for (i = 0; i<4; i++)
1133                 buf[i] = buf[i+1];
1134
1135             if (p->stream->readf(p->stream, buf+4, 1) != 1)
1136                 return RECCTRL_EXTRACT_EOF;
1137         }
1138     record_length = atoi_n (buf, 5);
1139     if (record_length < 25)
1140         {
1141             yaz_log (YLOG_WARN, "%s dom filter: "
1142                      "MARC record length < 25, is %d", 
1143                      tinfo->fname, record_length);
1144             return RECCTRL_EXTRACT_ERROR_GENERIC;
1145         }
1146     read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1147     if (read_bytes < record_length-5)
1148         {
1149             yaz_log (YLOG_WARN, "%s dom filter: "
1150                      "Couldn't read whole MARC record",
1151                      tinfo->fname);
1152             return RECCTRL_EXTRACT_ERROR_GENERIC;
1153         }
1154     r = yaz_marc_read_iso2709(input->u.marc.handle,  buf, record_length);
1155     if (r < record_length)
1156         {
1157             yaz_log (YLOG_WARN, "%s dom filter: "
1158                      "Parsing of MARC record failed r=%d length=%d",
1159                      tinfo->fname, r, record_length);
1160             return RECCTRL_EXTRACT_ERROR_GENERIC;
1161         }
1162     else
1163         {
1164             xmlDocPtr rdoc;
1165             xmlNode *root_ptr;
1166             yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1167             rdoc = xmlNewDoc((const xmlChar*) "1.0");
1168             xmlDocSetRootElement(rdoc, root_ptr);
1169             return convert_extract_doc(tinfo, input, p, rdoc);        
1170         }
1171     return RECCTRL_EXTRACT_OK;
1172 }
1173
1174 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1175 {
1176     struct filter_info *tinfo = clientData;
1177     struct filter_input *input = tinfo->input_list;
1178
1179     if (!input)
1180         return RECCTRL_EXTRACT_ERROR_GENERIC;
1181
1182     odr_reset(tinfo->odr_record);
1183     switch(input->type)
1184         {
1185         case DOM_INPUT_XMLREADER:
1186             if (input->u.xmlreader.split_level == 0)
1187                 return extract_xml_full(tinfo, input, p);
1188             else
1189                 return extract_xml_split(tinfo, input, p);
1190             break;
1191         case DOM_INPUT_MARC:
1192             return extract_iso2709(tinfo, input, p);
1193         }
1194     return RECCTRL_EXTRACT_ERROR_GENERIC;
1195 }
1196
1197 static int ioread_ret(void *context, char *buffer, int len)
1198 {
1199     struct recRetrieveCtrl *p = context;
1200     return p->stream->readf(p->stream, buffer, len);
1201 }
1202
/* Close callback handed to xmlReadIO() during retrieval.  There is
   nothing to release here, so it just reports success (0). */
static int ioclose_ret(void *context)
{
    (void) context;     /* intentionally unused */
    return 0;
}
1207
1208 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1209 {
1210     /* const char *esn = zebra_dom_ns; */
1211     const char *esn = 0;
1212     const char *params[32];
1213     struct filter_info *tinfo = clientData;
1214     xmlDocPtr doc;
1215     struct filter_retrieve *retrieve;
1216     xsltStylesheetPtr last_xsp = 0;
1217
1218     if (p->comp)
1219         {
1220             if (p->comp->which == Z_RecordComp_simple
1221                 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1222                 {
1223                     esn = p->comp->u.simple->u.generic;
1224                 }
1225             else if (p->comp->which == Z_RecordComp_complex 
1226                      && p->comp->u.complex->generic->elementSpec
1227                      && p->comp->u.complex->generic->elementSpec->which ==
1228                      Z_ElementSpec_elementSetName)
1229                 {
1230                     esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1231                 }
1232         }
1233     retrieve = lookup_retrieve(tinfo, esn);
1234     if (!retrieve)
1235         {
1236             p->diagnostic =
1237                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1238             return 0;
1239         }
1240
1241     params[0] = 0;
1242     set_param_int(params, "id", p->localno, p->odr);
1243     if (p->fname)
1244         set_param_str(params, "filename", p->fname, p->odr);
1245     if (p->staticrank >= 0)
1246         set_param_int(params, "rank", p->staticrank, p->odr);
1247
1248     if (esn)
1249         set_param_str(params, "schema", esn, p->odr);
1250     else
1251         if (retrieve->name)
1252             set_param_str(params, "schema", retrieve->name, p->odr);
1253         else if (retrieve->identifier)
1254             set_param_str(params, "schema", retrieve->identifier, p->odr);
1255         else
1256             set_param_str(params, "schema", "", p->odr);
1257
1258     if (p->score >= 0)
1259         set_param_int(params, "score", p->score, p->odr);
1260     set_param_int(params, "size", p->recordSize, p->odr);
1261
1262     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1263                     0 /* URL */,
1264                     0 /* encoding */,
1265                     XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1266     if (!doc)
1267         {
1268             p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1269             return 0;
1270         }
1271
1272     /* retrieve conversion */
1273     perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1274     if (!doc)
1275         {
1276             p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1277         }
1278     else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1279         {
1280             xmlChar *buf_out;
1281             int len_out;
1282
1283             if (last_xsp)
1284                 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1285             else
1286                 xmlDocDumpMemory(doc, &buf_out, &len_out);            
1287
1288             p->output_format = VAL_TEXT_XML;
1289             p->rec_len = len_out;
1290             p->rec_buf = odr_malloc(p->odr, p->rec_len);
1291             memcpy(p->rec_buf, buf_out, p->rec_len);
1292             xmlFree(buf_out);
1293         }
1294     else if (p->output_format == VAL_SUTRS)
1295         {
1296             xmlChar *buf_out;
1297             int len_out;
1298
1299             if (last_xsp)
1300                 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1301             else
1302                 xmlDocDumpMemory(doc, &buf_out, &len_out);            
1303         
1304             p->output_format = VAL_SUTRS;
1305             p->rec_len = len_out;
1306             p->rec_buf = odr_malloc(p->odr, p->rec_len);
1307             memcpy(p->rec_buf, buf_out, p->rec_len);
1308         
1309             xmlFree(buf_out);
1310         }
1311     else
1312         {
1313             p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1314         }
1315     xmlFreeDoc(doc);
1316     return 0;
1317 }
1318
/* Record type descriptor of this filter: a name string plus the callbacks
   defined in this file.  The initializer is positional, so the entries
   must stay in the member order of struct recType, which is declared
   outside this file (presumably in idzebra/recctrl.h - TODO confirm). */
1319 static struct recType filter_type = {
1320     0,                 /* NOTE(review): meaning of this leading slot is not visible here - confirm against struct recType */
1321     "dom",             /* name string of the filter */
1322     filter_init,       /* defined earlier in this file */
1323     filter_config,     /* defined earlier in this file */
1324     filter_destroy,    /* defined earlier in this file */
1325     filter_extract,    /* extraction entry point */
1326     filter_retrieve    /* retrieval entry point */
1327 };
1328
/* Zero-terminated table of the record types provided by this file.
   The table is named idzebra_filter_dom when IDZEBRA_STATIC_DOM is
   defined and idzebra_filter otherwise (presumably static linking versus
   a loadable module - TODO confirm against the build setup). */
1329 RecType
1330 #ifdef IDZEBRA_STATIC_DOM
1331 idzebra_filter_dom
1332 #else
1333 idzebra_filter
1334 #endif
1335
1336 [] = {
1337     &filter_type,      /* the only record type defined here */
1338     0,                 /* end of table marker */
1339 };
1340 /*
1341  * Local variables:
1342  * c-basic-offset: 4
1343  * indent-tabs-mode: nil
1344  * End:
1345  * vim: shiftwidth=4 tabstop=8 expandtab
1346  */
1347