indenting entire file according to the rules stated in the very end of
[idzebra-moved-to-github.git] / index / mod_dom.c
1 /* $Id: mod_dom.c,v 1.9 2007-02-14 16:31:37 marc Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5    This file is part of the Zebra server.
6
7    Zebra is free software; you can redistribute it and/or modify it under
8    the terms of the GNU General Public License as published by the Free
9    Software Foundation; either version 2, or (at your option) any later
10    version.
11
12    Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or
14    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15    for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 #include <stdio.h>
24 #include <assert.h>
25 #include <ctype.h>
26
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
29
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
37
38 #if YAZ_HAVE_EXSLT
39 #include <libexslt/exslt.h>
40 #endif
41
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
44
45 struct convert_s {
46     const char *stylesheet;
47     xsltStylesheetPtr stylesheet_xsp;
48     struct convert_s *next;
49 };
50
/* Settings read from the <extract> element of the filter configuration. */
struct filter_extract {
    /* value of the "name" attribute; 0 when the attribute is absent */
    const char *name;
    /* conversion chain built from the element's children */
    struct convert_s *convert;
};
55
/* Settings read from the <store> element of the filter configuration. */
struct filter_store {
    /* conversion chain built from the element's children */
    struct convert_s *convert;
};
59
/* One <retrieve> element of the filter configuration (singly linked). */
struct filter_retrieve {
    /* value of the "name" attribute; 0 when absent */
    const char *name;
    /* value of the "identifier" attribute; 0 when absent */
    const char *identifier;
    /* conversion chain built from the element's children */
    struct convert_s *convert;
    /* following <retrieve> entry; 0 for the last one */
    struct filter_retrieve *next;
};
66
67 #define DOM_INPUT_XMLREADER 1
68 #define DOM_INPUT_MARC 2
69 struct filter_input {
70     const char *syntax;
71     const char *name;
72     struct convert_s *convert;
73     int type;
74     union {
75         struct {
76             const char *input_charset;
77             yaz_marc_t handle;
78             yaz_iconv_t iconv;
79         } marc;
80         struct {
81             xmlTextReaderPtr reader;
82             int split_level;
83         } xmlreader;
84     } u;
85     struct filter_input *next;
86 };
87   
88 struct filter_info {
89     char *fname;
90     char *full_name;
91     const char *profile_path;
92     ODR odr_record;
93     ODR odr_config;
94     xmlDocPtr doc_config;
95     struct filter_extract *extract;
96     struct filter_retrieve *retrieve_list;
97     struct filter_input *input_list;
98     struct filter_store *store;
99 };
100
/*
 * Convenience wrappers for applying the C string functions to libxml2
 * strings (xmlChar is an unsigned character type).  Every argument is
 * parenthesized so that an expression can be passed safely, and the
 * casts keep the const qualifier.
 */
#define XML_STRCMP(a,b)   strcmp((const char *)(a), (b))
#define XML_STRLEN(a) strlen((const char *)(a))
103
104
105
106
107 static void set_param_str(const char **params, const char *name,
108                           const char *value, ODR odr)
109 {
110     char *quoted = odr_malloc(odr, 3 + strlen(value));
111     sprintf(quoted, "'%s'", value);
112     while (*params)
113         params++;
114     params[0] = name;
115     params[1] = quoted;
116     params[2] = 0;
117 }
118
119 static void set_param_int(const char **params, const char *name,
120                           zint value, ODR odr)
121 {
122     char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
123     while (*params)
124         params++;
125     sprintf(quoted, "'" ZINT_FORMAT "'", value);
126     params[0] = name;
127     params[1] = quoted;
128     params[2] = 0;
129 }
130
131 static void *filter_init(Res res, RecType recType)
132 {
133     struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
134     tinfo->fname = 0;
135     tinfo->full_name = 0;
136     tinfo->profile_path = 0;
137     tinfo->odr_record = odr_createmem(ODR_ENCODE);
138     tinfo->odr_config = odr_createmem(ODR_ENCODE);
139     tinfo->extract = 0;
140     tinfo->retrieve_list = 0;
141     tinfo->input_list = 0;
142     tinfo->store = 0;
143     tinfo->doc_config = 0;
144
145 #if YAZ_HAVE_EXSLT
146     exsltRegisterAll(); 
147 #endif
148
149     return tinfo;
150 }
151
152 static int attr_content(struct _xmlAttr *attr, const char *name,
153                         const char **dst_content)
154 {
155     if (!XML_STRCMP(attr->name, name) && attr->children 
156         && attr->children->type == XML_TEXT_NODE)
157         {
158             *dst_content = (const char *)(attr->children->content);
159             return 1;
160         }
161     return 0;
162 }
163
164 static void destroy_xsp(struct convert_s *c)
165 {
166     while(c)
167         {
168             if (c->stylesheet_xsp)
169                 xsltFreeStylesheet(c->stylesheet_xsp);
170             c = c->next;
171         }
172 }
173
174 static void destroy_dom(struct filter_info *tinfo)
175 {
176     if (tinfo->extract)
177         {
178             destroy_xsp(tinfo->extract->convert);
179             tinfo->extract = 0;
180         }
181     if (tinfo->store)
182         {
183             destroy_xsp(tinfo->store->convert);
184             tinfo->store = 0;
185         }
186     if (tinfo->input_list)
187         {
188             struct filter_input *i_ptr;
189             for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
190                 {
191                     switch(i_ptr->type)
192                         {
193                         case DOM_INPUT_XMLREADER:
194                             if (i_ptr->u.xmlreader.reader)
195                                 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
196                             break;
197                         case DOM_INPUT_MARC:
198                             yaz_iconv_close(i_ptr->u.marc.iconv);
199                             yaz_marc_destroy(i_ptr->u.marc.handle);
200                             break;
201                         }
202                     destroy_xsp(i_ptr->convert);
203                 }
204             tinfo->input_list = 0;
205         }
206     if (tinfo->retrieve_list)
207         {
208             struct filter_retrieve *r_ptr;
209             for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
210                 destroy_xsp(r_ptr->convert);
211             tinfo->retrieve_list = 0;
212         }
213
214     if (tinfo->doc_config)
215         {
216             xmlFreeDoc(tinfo->doc_config);
217             tinfo->doc_config = 0;
218         }
219     odr_reset(tinfo->odr_config);
220 }
221
222 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
223                                struct convert_s **l)
224 {
225     *l = 0;
226     for(; ptr; ptr = ptr->next)
227         {
228             if (ptr->type != XML_ELEMENT_NODE)
229                 continue;
230             if (!XML_STRCMP(ptr->name, "xslt"))
231                 {
232                     struct _xmlAttr *attr;
233                     struct convert_s *p 
234                         = odr_malloc(tinfo->odr_config, sizeof(*p));
235
236                     p->next = 0;
237                     p->stylesheet = 0;
238                     p->stylesheet_xsp = 0;
239
240                     for (attr = ptr->properties; attr; attr = attr->next)
241                         if (attr_content(attr, "stylesheet", &p->stylesheet))
242                             ;
243                         else
244                             yaz_log(YLOG_WARN, "%s: dom filter: "
245                                     "bad attribute %s"
246                                     " for <xslt>",
247                                     tinfo->fname, attr->name);
248                     if (p->stylesheet)
249                         {
250                             char tmp_xslt_full_name[1024];
251                             if (!yaz_filepath_resolve(p->stylesheet, 
252                                                       tinfo->profile_path,
253                                                       NULL, 
254                                                       tmp_xslt_full_name))
255                                 {
256                                     yaz_log(YLOG_WARN,
257                                             "%s: dom filter: "
258                                             "stylesheet %s not found in "
259                                             "path %s",
260                                             tinfo->fname,
261                                             p->stylesheet, 
262                                             tinfo->profile_path);
263                                     return ZEBRA_FAIL;
264                                 }
265                 
266                             p->stylesheet_xsp
267                                 = xsltParseStylesheetFile((const xmlChar*) 
268                                                           tmp_xslt_full_name);
269                             if (!p->stylesheet_xsp)
270                                 {
271                                     yaz_log(YLOG_WARN,
272                                             "%s: dom filter: "
273                                             "could not parse xslt "
274                                             "stylesheet %s",
275                                             tinfo->fname, tmp_xslt_full_name);
276                                     return ZEBRA_FAIL;
277                                 }
278                         }
279                     else
280                         {
281                             yaz_log(YLOG_WARN,
282                                     "%s: dom filter: "
283                                     "missing attribute 'stylesheet' "
284                                     "for element 'xslt'", tinfo->fname);
285                             return ZEBRA_FAIL;
286                         }
287                     *l = p;
288                     l = &p->next;
289                 }
290             else
291                 {
292                     yaz_log(YLOG_LOG, 
293                             "%s: dom filter: bad node '%s' for <conv>",
294                             tinfo->fname, ptr->name);
295                     return ZEBRA_FAIL;
296                 }
297         
298         }
299     return ZEBRA_OK;
300 }
301
302 static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
303                                  struct convert_s *convert,
304                                  const char **params,
305                                  xmlDocPtr *doc,
306                                  xsltStylesheetPtr *last_xsp)
307 {
308     for (; convert; convert = convert->next)
309         {
310             xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
311                                                     *doc, params);
312             if (last_xsp)
313                 *last_xsp = convert->stylesheet_xsp;
314             xmlFreeDoc(*doc);
315             *doc = res_doc;
316         }
317     return ZEBRA_OK;
318 }
319
320 static struct filter_input *new_input(struct filter_info *tinfo, int type)
321 {
322     struct filter_input *p;
323     struct filter_input **np = &tinfo->input_list;
324     for (;*np; np = &(*np)->next)
325         ;
326     p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
327     p->next = 0;
328     p->syntax = 0;
329     p->name = 0;
330     p->convert = 0;
331     p->type = type;
332     return p;
333 }
334
335 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
336                              const char *syntax,
337                              const char *name)
338 {
339     for (; ptr; ptr = ptr->next)
340         {
341             if (ptr->type != XML_ELEMENT_NODE)
342                 continue;
343             if (!XML_STRCMP(ptr->name, "marc"))
344                 {
345                     yaz_iconv_t iconv = 0;
346                     const char *input_charset = "marc-8";
347                     struct _xmlAttr *attr;
348             
349                     for (attr = ptr->properties; attr; attr = attr->next)
350                         {
351                             if (attr_content(attr, "charset", &input_charset))
352                                 ;
353                             else
354                                 yaz_log(YLOG_WARN, 
355                                         "%s: dom filter: bad attribute %s"
356                                         " for <marc>",
357                                         tinfo->fname, attr->name);
358                         }
359                     iconv = yaz_iconv_open("utf-8", input_charset);
360                     if (!iconv)
361                         {
362                             yaz_log(YLOG_WARN, 
363                                     "%s: dom filter: unsupported charset "
364                                     "'%s' for <marc>", 
365                                     tinfo->fname,  input_charset);
366                             return ZEBRA_FAIL;
367                         }
368                     else
369                         {
370                             struct filter_input *p 
371                                 = new_input(tinfo, DOM_INPUT_MARC);
372                             p->u.marc.handle = yaz_marc_create();
373                             p->u.marc.iconv = iconv;
374                 
375                             yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
376                 
377                             ptr = ptr->next;
378                 
379                             parse_convert(tinfo, ptr, &p->convert);
380                         }
381                     break;
382
383                 }
384             else if (!XML_STRCMP(ptr->name, "xmlreader"))
385                 {
386                     struct filter_input *p 
387                         = new_input(tinfo, DOM_INPUT_XMLREADER);
388                     struct _xmlAttr *attr;
389                     const char *level_str = 0;
390
391                     p->u.xmlreader.split_level = 0;
392                     p->u.xmlreader.reader = 0;
393
394                     for (attr = ptr->properties; attr; attr = attr->next)
395                         {
396                             if (attr_content(attr, "level", &level_str))
397                                 ;
398                             else
399                                 yaz_log(YLOG_WARN, 
400                                         "%s: dom filter: bad attribute %s"
401                                         " for <xmlreader>",
402                                         tinfo->fname, attr->name);
403                         }
404                     if (level_str)
405                         p->u.xmlreader.split_level = atoi(level_str);
406                 
407                     ptr = ptr->next;
408
409                     parse_convert(tinfo, ptr, &p->convert);
410                     break;
411                 }
412             else
413                 {
414                     yaz_log(YLOG_WARN, "%s: dom filter: bad input type %s",
415                             tinfo->fname, ptr->name);
416                     return ZEBRA_FAIL;
417                 }
418         }
419     return ZEBRA_OK;
420 }
421
422 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
423 {
424     char tmp_full_name[1024];
425     xmlNodePtr ptr;
426     xmlDocPtr doc;
427
428     tinfo->fname = odr_strdup(tinfo->odr_config, fname);
429     
430     if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path, 
431                              NULL, tmp_full_name))
432         tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
433     else
434         tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
435     
436     yaz_log(YLOG_LOG, "dom filter: loading config file %s", tinfo->full_name);
437     
438     doc = xmlParseFile(tinfo->full_name);
439     if (!doc)
440         {
441             yaz_log(YLOG_WARN, 
442                     "%s: dom filter: failed to parse config file %s",
443                     tinfo->fname, tinfo->full_name);
444             return ZEBRA_FAIL;
445         }
446     /* save because we store ptrs to the content */ 
447     tinfo->doc_config = doc;
448     
449     ptr = xmlDocGetRootElement(doc);
450     if (!ptr || ptr->type != XML_ELEMENT_NODE 
451         || XML_STRCMP(ptr->name, "dom"))
452         {
453             yaz_log(YLOG_WARN, 
454                     "%s: dom filter: expected root element <dom>", 
455                     tinfo->fname);  
456             return ZEBRA_FAIL;
457         }
458
459     for (ptr = ptr->children; ptr; ptr = ptr->next)
460         {
461             if (ptr->type != XML_ELEMENT_NODE)
462                 continue;
463             if (!XML_STRCMP(ptr->name, "extract"))
464                 {
465                     /*
466                       <extract name="index">
467                       <xslt stylesheet="first.xsl"/>
468                       <xslt stylesheet="second.xsl"/>
469                       </extract>
470                     */
471                     struct _xmlAttr *attr;
472                     struct filter_extract *f =
473                         odr_malloc(tinfo->odr_config, sizeof(*f));
474             
475                     tinfo->extract = f;
476                     f->name = 0;
477                     f->convert = 0;
478                     for (attr = ptr->properties; attr; attr = attr->next)
479                         {
480                             if (attr_content(attr, "name", &f->name))
481                                 ;
482                             else
483                                 yaz_log(YLOG_WARN, 
484                                         "%s: dom filter: bad attribute %s"
485                                         " for <extract>",
486                                         tinfo->fname, attr->name);
487
488                         }
489                     parse_convert(tinfo, ptr->children, &f->convert);
490                 }
491             else if (!XML_STRCMP(ptr->name, "retrieve"))
492                 {  
493                     /* 
494                        <retrieve name="F">
495                        <xslt stylesheet="some.xsl"/>
496                        <xslt stylesheet="some.xsl"/>
497                        </retrieve>
498                     */
499                     struct _xmlAttr *attr;
500                     struct filter_retrieve **fp = &tinfo->retrieve_list;
501                     struct filter_retrieve *f =
502                         odr_malloc(tinfo->odr_config, sizeof(*f));
503             
504                     while (*fp)
505                         fp = &(*fp)->next;
506
507                     *fp = f;
508                     f->name = 0;
509                     f->identifier = 0;
510                     f->convert = 0;
511                     f->next = 0;
512
513                     for (attr = ptr->properties; attr; attr = attr->next)
514                         {
515                             if (attr_content(attr, "identifier", 
516                                              &f->identifier))
517                                 ;
518                             else if (attr_content(attr, "name", &f->name))
519                                 ;
520                             else
521                                 yaz_log(YLOG_WARN, 
522                                         "%s: dom filter: bad attribute %s"
523                                         " for <retrieve>",
524                                         tinfo->fname, attr->name);
525                         }
526                     parse_convert(tinfo, ptr->children, &f->convert);
527                 }
528             else if (!XML_STRCMP(ptr->name, "store"))
529                 {
530                     /*
531                       <retrieve name="F">
532                       <xslt stylesheet="some.xsl"/>
533                       <xslt stylesheet="some.xsl"/>
534                       </retrieve>
535                     */
536                     struct filter_store *f =
537                         odr_malloc(tinfo->odr_config, sizeof(*f));
538             
539                     tinfo->store = f;
540                     f->convert = 0;
541                     parse_convert(tinfo, ptr->children, &f->convert);
542                 }
543             else if (!XML_STRCMP(ptr->name, "input"))
544                 {
545                     /*
546                       <input syntax="xml">
547                       <xmlreader level="1"/>
548                       </input>
549                       <input syntax="usmarc">
550                       <marc inputcharset="marc-8"/>
551                       </input>
552                     */
553                     struct _xmlAttr *attr;
554                     const char  *syntax = 0;
555                     const char *name = 0;
556                     for (attr = ptr->properties; attr; attr = attr->next)
557                         {
558                             if (attr_content(attr, "syntax", &syntax))
559                                 ;
560                             else if (attr_content(attr, "name", &name))
561                                 ;
562                             else
563                                 yaz_log(YLOG_WARN, 
564                                         "%s: dom filter: bad attribute %s"
565                                         " for <input>",
566                                         tinfo->fname, attr->name);
567                         }
568                     parse_input(tinfo, ptr->children, syntax, name);
569                 }
570             else
571                 {
572                     yaz_log(YLOG_WARN, "%s: dom filter: bad element %s",
573                             tinfo->fname, ptr->name);
574                     return ZEBRA_FAIL;
575                 }
576         }
577     return ZEBRA_OK;
578 }
579
580 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
581                                                const char *est)
582 {
583     struct filter_retrieve *f = tinfo->retrieve_list;
584
585     /* return first schema if no est is provided */
586     if (!est)
587         return f;
588     for (; f; f = f->next)
589         { 
590             /* find requested schema */
591             if (est) 
592                 {    
593                     if (f->identifier && !strcmp(f->identifier, est))
594                         return f;
595                     if (f->name && !strcmp(f->name, est))
596                         return f;
597                 } 
598         }
599     return 0;
600 }
601
602 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
603 {
604     struct filter_info *tinfo = clientData;
605     if (!args || !*args)
606         {
607             yaz_log(YLOG_WARN, "dom filter: need config file");
608             return ZEBRA_FAIL;
609         }
610
611     if (tinfo->fname && !strcmp(args, tinfo->fname))
612         return ZEBRA_OK;
613     
614     tinfo->profile_path = res_get(res, "profilePath");
615
616     destroy_dom(tinfo);
617     return parse_dom(tinfo, args);
618 }
619
620 static void filter_destroy(void *clientData)
621 {
622     struct filter_info *tinfo = clientData;
623     destroy_dom(tinfo);
624     odr_destroy(tinfo->odr_config);
625     odr_destroy(tinfo->odr_record);
626     xfree(tinfo);
627 }
628
629 static int ioread_ex(void *context, char *buffer, int len)
630 {
631     struct recExtractCtrl *p = context;
632     return p->stream->readf(p->stream, buffer, len);
633 }
634
/* Close callback: there is nothing to close, so it always returns 0. */
static int ioclose_ex(void *context)
{
    (void) context; /* intentionally unused */
    return 0;
}
639
640
641 /* Alvis style indexing */
/* Namespace URI that marks the Zebra indexing elements (record, index). */
#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"

/* The same URI as a variable, for use in string comparisons. */
static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
644
645 /* Alvis style indexing */
646 static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
647                         xmlNodePtr ptr, RecWord *recWord)
648 {
649     for(; ptr; ptr = ptr->next)
650         {
651             index_cdata(tinfo, ctrl, ptr->children, recWord);
652             if (ptr->type != XML_TEXT_NODE)
653                 continue;
654             recWord->term_buf = (const char *)ptr->content;
655             recWord->term_len = XML_STRLEN(ptr->content);
656             (*ctrl->tokenAdd)(recWord);
657         }
658 }
659
660 /* Alvis style indexing */
661 static void index_node(struct filter_info *tinfo,  struct recExtractCtrl *ctrl,
662                        xmlNodePtr ptr, RecWord *recWord)
663 {
664     for(; ptr; ptr = ptr->next)
665         {
666             index_node(tinfo, ctrl, ptr->children, recWord);
667             if (ptr->type != XML_ELEMENT_NODE || !ptr->ns ||
668                 XML_STRCMP(ptr->ns->href, zebra_xslt_ns))
669                 continue;
670             if (!XML_STRCMP(ptr->name, "index"))
671                 {
672                     const char *name_str = 0;
673                     const char *type_str = 0;
674                     const char *xpath_str = 0;
675                     struct _xmlAttr *attr;
676                     for (attr = ptr->properties; attr; attr = attr->next)
677                         {
678                             if (attr_content(attr, "name", &name_str))
679                                 ;
680                             else if (attr_content(attr, "xpath", &xpath_str))
681                                 ;
682                             else if (attr_content(attr, "type", &type_str))
683                                 ;
684                             else
685                                 yaz_log(YLOG_WARN, 
686                                         "%s: dom filter: bad attribute %s"
687                                         " for <index>",
688                                         tinfo->fname, attr->name);
689                         }
690                     if (name_str)
691                         {
692                             /* save default type */
693                             int prev_type = recWord->index_type; 
694
695                             /* type was given */
696                             if (type_str && *type_str)
697                                 recWord->index_type = *type_str; 
698
699                             recWord->index_name = name_str;
700                             index_cdata(tinfo, ctrl, ptr->children, recWord);
701
702                             /* restore it again */
703                             recWord->index_type = prev_type;     
704                         }
705                 }
706         }
707 }
708
709 /* Alvis style indexing */
710 static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
711                          xmlNodePtr ptr, RecWord *recWord)
712 {
713     const char *type_str = "update";
714
715     if (ptr && ptr->type == XML_ELEMENT_NODE && ptr->ns &&
716         !XML_STRCMP(ptr->ns->href, zebra_xslt_ns)
717         && !XML_STRCMP(ptr->name, "record"))
718         {
719             const char *id_str = 0;
720             const char *rank_str = 0;
721             struct _xmlAttr *attr;
722             for (attr = ptr->properties; attr; attr = attr->next)
723                 {
724                     if (attr_content(attr, "type", &type_str))
725                         ;
726                     else if (attr_content(attr, "id", &id_str))
727                         ;
728                     else if (attr_content(attr, "rank", &rank_str))
729                         ;
730                     else
731                         yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
732                                 " for <record>",
733                                 tinfo->fname, attr->name);
734                 }
735             if (id_str)
736                 sscanf(id_str, "%255s", ctrl->match_criteria);
737
738             if (rank_str)
739                 ctrl->staticrank = atozint(rank_str);
740             ptr = ptr->children;
741         }
742
743     if (!strcmp("update", type_str))
744         index_node(tinfo, ctrl, ptr, recWord);
745     else if (!strcmp("delete", type_str))
746         yaz_log(YLOG_WARN, "dom filter delete: to be implemented");
747     else
748         yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", 
749                 type_str);
750 }
751
752
753 /* Alvis style indexing */
754 static void extract_doc_alvis(struct filter_info *tinfo, 
755                               struct recExtractCtrl *recctr, 
756                               xmlDocPtr doc)
757 {
758     if (doc){
759         RecWord recWord;
760         xmlChar *buf_out;
761         int len_out;
762         xmlNodePtr root_ptr;
763
764         (*recctr->init)(recctr, &recWord);
765         
766         if (recctr->flagShowRecords){
767             xmlDocDumpMemory(doc, &buf_out, &len_out);
768             fwrite(buf_out, len_out, 1, stdout);
769             xmlFree(buf_out);
770         }
771         root_ptr = xmlDocGetRootElement(doc);
772         if (root_ptr)
773             index_record(tinfo, recctr, root_ptr, &recWord);
774         else
775             yaz_log(YLOG_WARN, "No root for index XML record");
776     }
777 }
778
779
780 /* DOM filter style indexing */
781 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
782                             xmlChar **dst_content)
783 {
784     if (0 == XML_STRCMP(attr->name, name) && attr->children 
785         && attr->children->type == XML_TEXT_NODE)
786         {
787             *dst_content = (attr->children->content);
788             return 1;
789         }
790     return 0;
791 }
792
793 /* DOM filter style indexing */
794 /* #define ZEBRA_XSLT_NS "http://indexdata.com/zebra-2.0" */
795 /* static const char *zebra_xslt_ns = ZEBRA_XSLT_NS; */
796
797 /* DOM filter style indexing */
/* Name used for Zebra processing instructions. */
#define ZEBRA_PI_NAME "zebra-2.0"

/* The same name as a variable, for use in string comparisons. */
static const char *zebra_pi_name = ZEBRA_PI_NAME;
800
801
802 /* DOM filter style indexing */
803 void index_value_of(struct filter_info *tinfo, 
804                     struct recExtractCtrl *recctr, 
805                     xmlNodePtr node, 
806                     xmlChar * index_p)
807 {
808     xmlChar *text = xmlNodeGetContent(node);
809
810     xmlChar *look = index_p;
811     xmlChar *bval;
812     xmlChar *eval;
813
814     xmlChar index[256];
815     xmlChar type[256];
816
817     /* parsing all index name/type pairs - may not start with ' ' or ':' */
818     while (*look && ' ' != *look && ':' != *look){
819     
820         /* setting name and type to zero */
821         *index = '\0';
822         *type = '\0';
823     
824         /* parsing one index name */
825         bval = look;
826         while (*look && ':' != *look && ' ' != *look){
827             look++;
828         }
829         eval = look;
830         strncpy((char *)index, (const char *)bval, eval - bval);
831         index[eval - bval] = '\0';
832     
833     
834         /* parsing one index type, if existing */
835         if (':' == *look){
836             look++;
837       
838             bval = look;
839             while (*look && ' ' != *look){
840                 look++;
841             }
842             eval = look;
843             strncpy((char *)type, (const char *)bval, eval - bval);
844             type[eval - bval] = '\0';
845         }
846
847         printf("INDEX  '%s:%s' '%s'\n", index, type, text);
848     
849         if (*look && ' ' == *look && *(look+1)){
850             look++;
851         } 
852     }
853
854     xmlFree(text);
855
856     /*   //recWord->term_buf = (const char *)ptr->content; */
857     /*   //recWord->term_len = XML_STRLEN(ptr->content); */
858     /*   //  if (type_str && *type_str) */
859     /*   //  recWord->index_type = *type_str; /\* type was given *\/ */
860     /*   //  recWord->index_name = name_str; */
861     /*   // recWord->index_type = prev_type;     /\* restore it again *\/ */
862 }
863
864
865 /* DOM filter style indexing */
866 void set_record_info(struct filter_info *tinfo, 
867                      struct recExtractCtrl *recctr, 
868                      xmlChar * id_p, 
869                      xmlChar * rank_p, 
870                      xmlChar * action_p)
871 {
872     printf("RECORD id=%s rank=%s action=%s\n", id_p, rank_p, action_p);
873 }
874
875
876 /* DOM filter style indexing */
877 void process_xml_element_zebra_node(struct filter_info *tinfo, 
878                                     struct recExtractCtrl *recctr, 
879                                     xmlNodePtr node)
880 {
881     if (node->type == XML_ELEMENT_NODE 
882         && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_xslt_ns)){
883     
884         if (0 == XML_STRCMP(node->name, "index")){
885             xmlChar *index_p = 0;
886
887             struct _xmlAttr *attr;      
888             for (attr = node->properties; attr; attr = attr->next){
889                 if (attr_content_xml(attr, "name", &index_p)){
890                     index_value_of(tinfo, recctr, node, index_p);        
891                 }  
892                 else
893                     // printf("%s: dom filter: s% bad attribute %s",
894                     // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
895                     printf("dom filter: %s bad attribute @%s, "
896                            "expected @name\n",
897                            xmlGetNodePath(node), attr->name);
898             }
899         }
900         else if (0 == XML_STRCMP(node->name, "record")){
901             xmlChar *id_p = 0;
902             xmlChar *rank_p = 0;
903             xmlChar *action_p = 0;
904
905             struct _xmlAttr *attr;
906             for (attr = node->properties; attr; attr = attr->next){
907                 if (attr_content_xml(attr, "id", &id_p))
908                     ;
909                 else if (attr_content_xml(attr, "rank", &rank_p))
910                     ;
911                 else if (attr_content_xml(attr, "acton", &action_p))
912                     ;
913                 else
914                     // printf("%s: dom filter: s% bad attribute %s",
915                     // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
916                     printf("dom filter: %s bad attribute @%s,"
917                            " expected @id|@rank|@action\n",
918                            xmlGetNodePath(node), attr->name);
919
920                 if (action_p && 0 != strcmp("update", (const char *)action_p))
921                     printf("dom filter: %s attribute @%s,"
922                            " only implemented '@action=\"update\"\n",
923                            xmlGetNodePath(node), attr->name);
924           
925
926             }
927             set_record_info(tinfo, recctr, id_p, rank_p, action_p);
928         } else {
929             //  printf("%s: dom filter: s% bad attribute %s",
930             //  tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
931             printf("dom filter: %s bad element <%s>,"
932                    " expected <record>|<index> in namespace '%s'\n",
933                    xmlGetNodePath(node), node->name, zebra_xslt_ns);
934       
935         }
936     }
937 }
938
939
940 /* DOM filter style indexing */
941 void process_xml_pi_node(struct filter_info *tinfo, 
942                          struct recExtractCtrl *recctr, 
943                          xmlNodePtr node,
944                          xmlChar **index_pp)
945 {
946
947     /* printf("PI     %s\n", xmlGetNodePath(node)); */
948
949     /* if right PI name, continue parsing PI */
950     if (0 == strcmp(zebra_pi_name, (const char *)node->name)){
951         xmlChar *pi_p =  node->content;
952         xmlChar *look = pi_p;
953     
954         xmlChar *bval;
955         xmlChar *eval;
956
957         /* parsing PI record instructions */
958         if (0 == strncmp((const char *)look, "record", 6)){
959             xmlChar id[256];
960             xmlChar rank[256];
961             xmlChar action[256];
962
963             *id = '\0';
964             *rank = '\0';
965             *action = '\0';
966       
967             look += 6;
968       
969             /* eat whitespace */
970             while (*look && ' ' == *look && *(look+1))
971                 look++;
972
973             /* parse possible id */
974             if (*look && 0 == strncmp((const char *)look, "id=", 3)){
975                 look += 3;
976                 bval = look;
977                 while (*look && ' ' != *look)
978                     look++;
979                 eval = look;
980                 strncpy((char *)id, (const char *)bval, eval - bval);
981                 id[eval - bval] = '\0';
982             }
983       
984             /* eat whitespace */
985             while (*look && ' ' == *look && *(look+1))
986                 look++;
987       
988             /* parse possible rank */
989             if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
990                 look += 6;
991                 bval = look;
992                 while (*look && ' ' != *look)
993                     look++;
994                 eval = look;
995                 strncpy((char *)rank, (const char *)bval, eval - bval);
996                 rank[eval - bval] = '\0';
997             }
998
999             /* eat whitespace */
1000             while (*look && ' ' == *look && *(look+1))
1001                 look++;
1002
1003             if (look && '\0' != *look){
1004                 printf ("ERROR %s: content '%s'; can not parse '%s'\n", 
1005                         xmlGetNodePath(node), pi_p, look);
1006             } else {
1007                 /* set_record_info(id, rank, action); */
1008                 set_record_info(tinfo, recctr, id, rank, 0);
1009             }
1010
1011         } 
1012    
1013         /* parsing index instruction */
1014         else   if (0 == strncmp((const char *)look, "index", 5)){
1015             look += 5;
1016       
1017             /* eat whitespace */
1018             while (*look && ' ' == *look && *(look+1))
1019                 look++;
1020
1021             /* export index instructions to outside */
1022             *index_pp = look;
1023
1024             /* nor record, neither index */ 
1025         } else {
1026     
1027             printf ("ERROR %s: content '%s'; can not parse '%s'\n", 
1028                     xmlGetNodePath(node), pi_p, look);
1029         }  
1030     }
1031 }
1032
1033 /* DOM filter style indexing */
1034 void process_xml_element_node(struct filter_info *tinfo, 
1035                               struct recExtractCtrl *recctr, 
1036                               xmlNodePtr node)
1037 {
1038     /* remember indexing instruction from PI to next element node */
1039     xmlChar *index_p = 0;
1040
1041     /* printf("ELEM   %s\n", xmlGetNodePath(node)); */
1042
1043     /* check if we are an element node in the special zebra namespace 
1044        and either set record data or index value-of node content*/
1045     process_xml_element_zebra_node(tinfo, recctr, node);
1046   
1047     /* loop through kid nodes */
1048     for (node = node->children; node; node = node->next)
1049         {
1050             /* check and set PI record and index index instructions */
1051             if (node->type == XML_PI_NODE){
1052                 process_xml_pi_node(tinfo, recctr, node, &index_p);
1053             }
1054             else if (node->type == XML_ELEMENT_NODE){
1055                 /* if there was a PI index instruction before this element */
1056                 if (index_p){
1057                     index_value_of(tinfo, recctr, node, index_p);            
1058                     index_p = 0;
1059                 }
1060                 process_xml_element_node(tinfo, recctr, node);
1061             }
1062             else
1063                 continue;
1064         }
1065 }
1066
1067
1068 /* DOM filter style indexing */
1069 void extract_dom_doc_node(struct filter_info *tinfo, 
1070                           struct recExtractCtrl *recctr, 
1071                           xmlDocPtr doc)
1072 {
1073     /* printf("DOC    %s\n", xmlGetNodePath((xmlNodePtr)doc)); */
1074
1075     process_xml_element_node(tinfo, recctr, (xmlNodePtr)doc);
1076 }
1077
1078
1079
1080
1081 static int convert_extract_doc(struct filter_info *tinfo, 
1082                                struct filter_input *input,
1083                                struct recExtractCtrl *p, 
1084                                xmlDocPtr doc)
1085
1086 {
1087     /* RecWord recWord; */
1088     xmlChar *buf_out;
1089     int len_out;
1090     const char *params[10];
1091     xsltStylesheetPtr last_xsp = 0;
1092     xmlDocPtr store_doc = 0;
1093
1094     params[0] = 0;
1095     set_param_str(params, "schema", zebra_xslt_ns, tinfo->odr_record);
1096
1097     /* input conversion */
1098     perform_convert(tinfo, input->convert, params, &doc, 0);
1099
1100     if (tinfo->store)
1101         {
1102             /* store conversion */
1103             store_doc = xmlCopyDoc(doc, 1);
1104             perform_convert(tinfo, tinfo->store->convert,
1105                             params, &store_doc, &last_xsp);
1106         }
1107     
1108     if (last_xsp)
1109         xsltSaveResultToString(&buf_out, &len_out, 
1110                                store_doc ? store_doc : doc, last_xsp);
1111     else
1112         xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1113     if (p->flagShowRecords)
1114         fwrite(buf_out, len_out, 1, stdout);
1115     (*p->setStoreData)(p, buf_out, len_out);
1116     xmlFree(buf_out);
1117
1118     if (store_doc)
1119         xmlFreeDoc(store_doc);
1120
1121     /* extract conversion */
1122     perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1123
1124     /* finally, do the indexing */
1125     if (doc){
1126         extract_dom_doc_node(tinfo, p, doc);
1127         extract_doc_alvis(tinfo, p, doc);
1128         xmlFreeDoc(doc);
1129     }
1130
1131     return RECCTRL_EXTRACT_OK;
1132 }
1133
1134 static int extract_xml_split(struct filter_info *tinfo,
1135                              struct filter_input *input,
1136                              struct recExtractCtrl *p)
1137 {
1138     int ret;
1139
1140     if (p->first_record)
1141         {
1142             if (input->u.xmlreader.reader)
1143                 xmlFreeTextReader(input->u.xmlreader.reader);
1144             input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1145                                                        p /* I/O handler */,
1146                                                        0 /* URL */, 
1147                                                        0 /* encoding */,
1148                                                        XML_PARSE_XINCLUDE|
1149                                                        XML_PARSE_NOENT);
1150         }
1151     if (!input->u.xmlreader.reader)
1152         return RECCTRL_EXTRACT_ERROR_GENERIC;
1153
1154     ret = xmlTextReaderRead(input->u.xmlreader.reader);
1155     while (ret == 1)
1156         {
1157             int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1158             int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1159             if (type == XML_READER_TYPE_ELEMENT && 
1160                 input->u.xmlreader.split_level == depth)
1161                 {
1162                     xmlNodePtr ptr
1163                         = xmlTextReaderExpand(input->u.xmlreader.reader);
1164                     if (ptr)
1165                         {
1166                             xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1167                             xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1168                 
1169                             xmlDocSetRootElement(doc, ptr2);
1170                 
1171                             return convert_extract_doc(tinfo, input, p, doc);
1172                         }
1173                     else
1174                         {
1175                             xmlFreeTextReader(input->u.xmlreader.reader);
1176                             input->u.xmlreader.reader = 0;
1177                             return RECCTRL_EXTRACT_ERROR_GENERIC;
1178                         }
1179                 }
1180             ret = xmlTextReaderRead(input->u.xmlreader.reader);
1181         }
1182     xmlFreeTextReader(input->u.xmlreader.reader);
1183     input->u.xmlreader.reader = 0;
1184     return RECCTRL_EXTRACT_EOF;
1185 }
1186
1187 static int extract_xml_full(struct filter_info *tinfo, 
1188                             struct filter_input *input,
1189                             struct recExtractCtrl *p)
1190 {
1191     if (p->first_record) /* only one record per stream */
1192         {
1193             xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, 
1194                                       p /* I/O handler */,
1195                                       0 /* URL */,
1196                                       0 /* encoding */,
1197                                       XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1198             if (!doc)
1199                 {
1200                     return RECCTRL_EXTRACT_ERROR_GENERIC;
1201                 }
1202             return convert_extract_doc(tinfo, input, p, doc);
1203         }
1204     else
1205         return RECCTRL_EXTRACT_EOF;
1206 }
1207
1208 static int extract_iso2709(struct filter_info *tinfo,
1209                            struct filter_input *input,
1210                            struct recExtractCtrl *p)
1211 {
1212     char buf[100000];
1213     int record_length;
1214     int read_bytes, r;
1215
1216     if (p->stream->readf(p->stream, buf, 5) != 5)
1217         return RECCTRL_EXTRACT_EOF;
1218     while (*buf < '0' || *buf > '9')
1219         {
1220             int i;
1221
1222             yaz_log(YLOG_WARN, "MARC: Skipping bad byte %d (0x%02X)",
1223                     *buf & 0xff, *buf & 0xff);
1224             for (i = 0; i<4; i++)
1225                 buf[i] = buf[i+1];
1226
1227             if (p->stream->readf(p->stream, buf+4, 1) != 1)
1228                 return RECCTRL_EXTRACT_EOF;
1229         }
1230     record_length = atoi_n (buf, 5);
1231     if (record_length < 25)
1232         {
1233             yaz_log (YLOG_WARN, "MARC record length < 25, is %d", 
1234                      record_length);
1235             return RECCTRL_EXTRACT_ERROR_GENERIC;
1236         }
1237     read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1238     if (read_bytes < record_length-5)
1239         {
1240             yaz_log (YLOG_WARN, "Couldn't read whole MARC record");
1241             return RECCTRL_EXTRACT_ERROR_GENERIC;
1242         }
1243     r = yaz_marc_read_iso2709(input->u.marc.handle,  buf, record_length);
1244     if (r < record_length)
1245         {
1246             yaz_log (YLOG_WARN, "Parsing of MARC record failed r=%d length=%d",
1247                      r, record_length);
1248             return RECCTRL_EXTRACT_ERROR_GENERIC;
1249         }
1250     else
1251         {
1252             xmlDocPtr rdoc;
1253             xmlNode *root_ptr;
1254             yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1255             rdoc = xmlNewDoc((const xmlChar*) "1.0");
1256             xmlDocSetRootElement(rdoc, root_ptr);
1257             return convert_extract_doc(tinfo, input, p, rdoc);        
1258         }
1259     return RECCTRL_EXTRACT_OK;
1260 }
1261
1262 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1263 {
1264     struct filter_info *tinfo = clientData;
1265     struct filter_input *input = tinfo->input_list;
1266
1267     if (!input)
1268         return RECCTRL_EXTRACT_ERROR_GENERIC;
1269
1270     odr_reset(tinfo->odr_record);
1271     switch(input->type)
1272         {
1273         case DOM_INPUT_XMLREADER:
1274             if (input->u.xmlreader.split_level == 0)
1275                 return extract_xml_full(tinfo, input, p);
1276             else
1277                 return extract_xml_split(tinfo, input, p);
1278             break;
1279         case DOM_INPUT_MARC:
1280             return extract_iso2709(tinfo, input, p);
1281         }
1282     return RECCTRL_EXTRACT_ERROR_GENERIC;
1283 }
1284
1285 static int ioread_ret(void *context, char *buffer, int len)
1286 {
1287     struct recRetrieveCtrl *p = context;
1288     return p->stream->readf(p->stream, buffer, len);
1289 }
1290
1291 static int ioclose_ret(void *context)
1292 {
1293     return 0;
1294 }
1295
1296 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1297 {
1298     /* const char *esn = zebra_xslt_ns; */
1299     const char *esn = 0;
1300     const char *params[32];
1301     struct filter_info *tinfo = clientData;
1302     xmlDocPtr doc;
1303     struct filter_retrieve *retrieve;
1304     xsltStylesheetPtr last_xsp = 0;
1305
1306     if (p->comp)
1307         {
1308             if (p->comp->which == Z_RecordComp_simple
1309                 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1310                 {
1311                     esn = p->comp->u.simple->u.generic;
1312                 }
1313             else if (p->comp->which == Z_RecordComp_complex 
1314                      && p->comp->u.complex->generic->elementSpec
1315                      && p->comp->u.complex->generic->elementSpec->which ==
1316                      Z_ElementSpec_elementSetName)
1317                 {
1318                     esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1319                 }
1320         }
1321     retrieve = lookup_retrieve(tinfo, esn);
1322     if (!retrieve)
1323         {
1324             p->diagnostic =
1325                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1326             return 0;
1327         }
1328
1329     params[0] = 0;
1330     set_param_int(params, "id", p->localno, p->odr);
1331     if (p->fname)
1332         set_param_str(params, "filename", p->fname, p->odr);
1333     if (p->staticrank >= 0)
1334         set_param_int(params, "rank", p->staticrank, p->odr);
1335
1336     if (esn)
1337         set_param_str(params, "schema", esn, p->odr);
1338     else
1339         if (retrieve->name)
1340             set_param_str(params, "schema", retrieve->name, p->odr);
1341         else if (retrieve->identifier)
1342             set_param_str(params, "schema", retrieve->identifier, p->odr);
1343         else
1344             set_param_str(params, "schema", "", p->odr);
1345
1346     if (p->score >= 0)
1347         set_param_int(params, "score", p->score, p->odr);
1348     set_param_int(params, "size", p->recordSize, p->odr);
1349
1350     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1351                     0 /* URL */,
1352                     0 /* encoding */,
1353                     XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1354     if (!doc)
1355         {
1356             p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1357             return 0;
1358         }
1359
1360     /* retrieve conversion */
1361     perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1362     if (!doc)
1363         {
1364             p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1365         }
1366     else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1367         {
1368             xmlChar *buf_out;
1369             int len_out;
1370
1371             if (last_xsp)
1372                 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1373             else
1374                 xmlDocDumpMemory(doc, &buf_out, &len_out);            
1375
1376             p->output_format = VAL_TEXT_XML;
1377             p->rec_len = len_out;
1378             p->rec_buf = odr_malloc(p->odr, p->rec_len);
1379             memcpy(p->rec_buf, buf_out, p->rec_len);
1380             xmlFree(buf_out);
1381         }
1382     else if (p->output_format == VAL_SUTRS)
1383         {
1384             xmlChar *buf_out;
1385             int len_out;
1386
1387             if (last_xsp)
1388                 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1389             else
1390                 xmlDocDumpMemory(doc, &buf_out, &len_out);            
1391         
1392             p->output_format = VAL_SUTRS;
1393             p->rec_len = len_out;
1394             p->rec_buf = odr_malloc(p->odr, p->rec_len);
1395             memcpy(p->rec_buf, buf_out, p->rec_len);
1396         
1397             xmlFree(buf_out);
1398         }
1399     else
1400         {
1401             p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1402         }
1403     xmlFreeDoc(doc);
1404     return 0;
1405 }
1406
1407 static struct recType filter_type = {
1408     0,
1409     "dom",
1410     filter_init,
1411     filter_config,
1412     filter_destroy,
1413     filter_extract,
1414     filter_retrieve
1415 };
1416
1417 RecType
1418 #ifdef IDZEBRA_STATIC_DOM
1419 idzebra_filter_dom
1420 #else
1421 idzebra_filter
1422 #endif
1423
1424 [] = {
1425     &filter_type,
1426     0,
1427 };
1428 /*
1429  * Local variables:
1430  * c-basic-offset: 4
1431  * indent-tabs-mode: nil
1432  * End:
1433  * vim: shiftwidth=4 tabstop=8 expandtab
1434  */
1435