optimized code such that the RecWord structure recword is only
[idzebra-moved-to-github.git] / index / mod_dom.c
1 /* $Id: mod_dom.c,v 1.15 2007-02-15 15:08:41 marc Exp $
2    Copyright (C) 1995-2007
3    Index Data ApS
4
5    This file is part of the Zebra server.
6
7    Zebra is free software; you can redistribute it and/or modify it under
8    the terms of the GNU General Public License as published by the Free
9    Software Foundation; either version 2, or (at your option) any later
10    version.
11
12    Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or
14    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15    for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20
21 */
22
23 #include <stdio.h>
24 #include <assert.h>
25 #include <ctype.h>
26
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
29
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
37
38 #if YAZ_HAVE_EXSLT
39 #include <libexslt/exslt.h>
40 #endif
41
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
44
/* XML namespace URI: elements in this namespace are handled as indexing
   instructions by process_xml_element_zebra_node() */
#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
static const char *zebra_dom_ns = ZEBRA_DOM_NS;

/* Name of the processing instruction that process_xml_pi_node() reacts to */
#define ZEBRA_PI_NAME "zebra-2.0"
static const char *zebra_pi_name = ZEBRA_PI_NAME;
52
53
54
/* One step in a chain of XSLT conversions, built by parse_convert() */
struct convert_s {
    const char *stylesheet;           /* value of the @stylesheet attribute */
    xsltStylesheetPtr stylesheet_xsp; /* parsed stylesheet, freed by destroy_xsp() */
    struct convert_s *next;           /* next step in the chain, 0 at the end */
};
60
/* Configuration taken from the <extract> element */
struct filter_extract {
    const char *name;            /* value of @name, 0 if absent */
    struct convert_s *convert;   /* XSLT chain from the <xslt> children */
};
65
/* Configuration taken from the <store> element */
struct filter_store {
    struct convert_s *convert;   /* XSLT chain from the <xslt> children */
};
69
/* Configuration taken from one <retrieve> element; entries form a list
   in the order they appear in the configuration file */
struct filter_retrieve {
    const char *name;              /* value of @name, 0 if absent */
    const char *identifier;        /* value of @identifier, 0 if absent */
    struct convert_s *convert;     /* XSLT chain from the <xslt> children */
    struct filter_retrieve *next;  /* next <retrieve> entry, 0 at the end */
};
76
/* Values for filter_input.type; they select the active member of u */
#define DOM_INPUT_XMLREADER 1
#define DOM_INPUT_MARC 2
/* One input source created by parse_input() / new_input() */
struct filter_input {
    const char *syntax;          /* set to 0 by new_input() */
    const char *name;            /* set to 0 by new_input() */
    struct convert_s *convert;   /* XSLT chain parsed from following siblings */
    int type;                    /* DOM_INPUT_XMLREADER or DOM_INPUT_MARC */
    union {
        struct {
            const char *input_charset;
            yaz_marc_t handle;   /* created with yaz_marc_create() */
            yaz_iconv_t iconv;   /* converts from the configured charset to utf-8 */
        } marc;
        struct {
            xmlTextReaderPtr reader; /* 0 until a reader is attached */
            int split_level;         /* from @level of <xmlreader>, default 0 */
        } xmlreader;
    } u;
    struct filter_input *next;   /* next input, 0 at the end */
};
97   
/* Per-filter state created by filter_init() and released by filter_destroy() */
struct filter_info {
    char *fname;                 /* config file name as given to parse_dom() */
    char *full_name;             /* fname resolved against profile_path,
                                    or a copy of fname if that fails */
    const char *profile_path;    /* "profilePath" resource, see filter_config() */
    ODR odr_record;              /* created in filter_init() */
    ODR odr_config;              /* allocator for all configuration data;
                                    reset by destroy_dom() */
    xmlDocPtr doc_config;        /* parsed config file; kept because the
                                    structures below point into its content */
    struct filter_extract *extract;          /* from <extract>, 0 if none */
    struct filter_retrieve *retrieve_list;   /* from <retrieve> elements */
    struct filter_input *input_list;         /* from <input> elements */
    struct filter_store *store;              /* from <store>, 0 if none */
};
110
/* Compare a libxml2 string (xmlChar *) with a plain C string; the result
   has the same meaning as that of strcmp().  Arguments are parenthesised
   so that any expression can be passed safely. */
#define XML_STRCMP(a,b)   strcmp((const char *)(a), (b))
/* Length in bytes of a libxml2 string (xmlChar *) */
#define XML_STRLEN(a) strlen((const char *)(a))
113
114
115
116
117 static void set_param_str(const char **params, const char *name,
118                           const char *value, ODR odr)
119 {
120     char *quoted = odr_malloc(odr, 3 + strlen(value));
121     sprintf(quoted, "'%s'", value);
122     while (*params)
123         params++;
124     params[0] = name;
125     params[1] = quoted;
126     params[2] = 0;
127 }
128
129 static void set_param_int(const char **params, const char *name,
130                           zint value, ODR odr)
131 {
132     char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
133     while (*params)
134         params++;
135     sprintf(quoted, "'" ZINT_FORMAT "'", value);
136     params[0] = name;
137     params[1] = quoted;
138     params[2] = 0;
139 }
140
141 static void *filter_init(Res res, RecType recType)
142 {
143     struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
144     tinfo->fname = 0;
145     tinfo->full_name = 0;
146     tinfo->profile_path = 0;
147     tinfo->odr_record = odr_createmem(ODR_ENCODE);
148     tinfo->odr_config = odr_createmem(ODR_ENCODE);
149     tinfo->extract = 0;
150     tinfo->retrieve_list = 0;
151     tinfo->input_list = 0;
152     tinfo->store = 0;
153     tinfo->doc_config = 0;
154
155 #if YAZ_HAVE_EXSLT
156     exsltRegisterAll(); 
157 #endif
158
159     return tinfo;
160 }
161
162 static int attr_content(struct _xmlAttr *attr, const char *name,
163                         const char **dst_content)
164 {
165     if (!XML_STRCMP(attr->name, name) && attr->children 
166         && attr->children->type == XML_TEXT_NODE)
167         {
168             *dst_content = (const char *)(attr->children->content);
169             return 1;
170         }
171     return 0;
172 }
173
174 static void destroy_xsp(struct convert_s *c)
175 {
176     while(c)
177         {
178             if (c->stylesheet_xsp)
179                 xsltFreeStylesheet(c->stylesheet_xsp);
180             c = c->next;
181         }
182 }
183
184 static void destroy_dom(struct filter_info *tinfo)
185 {
186     if (tinfo->extract)
187         {
188             destroy_xsp(tinfo->extract->convert);
189             tinfo->extract = 0;
190         }
191     if (tinfo->store)
192         {
193             destroy_xsp(tinfo->store->convert);
194             tinfo->store = 0;
195         }
196     if (tinfo->input_list)
197         {
198             struct filter_input *i_ptr;
199             for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
200                 {
201                     switch(i_ptr->type)
202                         {
203                         case DOM_INPUT_XMLREADER:
204                             if (i_ptr->u.xmlreader.reader)
205                                 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
206                             break;
207                         case DOM_INPUT_MARC:
208                             yaz_iconv_close(i_ptr->u.marc.iconv);
209                             yaz_marc_destroy(i_ptr->u.marc.handle);
210                             break;
211                         }
212                     destroy_xsp(i_ptr->convert);
213                 }
214             tinfo->input_list = 0;
215         }
216     if (tinfo->retrieve_list)
217         {
218             struct filter_retrieve *r_ptr;
219             for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
220                 destroy_xsp(r_ptr->convert);
221             tinfo->retrieve_list = 0;
222         }
223
224     if (tinfo->doc_config)
225         {
226             xmlFreeDoc(tinfo->doc_config);
227             tinfo->doc_config = 0;
228         }
229     odr_reset(tinfo->odr_config);
230 }
231
232 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
233                                struct convert_s **l)
234 {
235     *l = 0;
236     for(; ptr; ptr = ptr->next)
237         {
238             if (ptr->type != XML_ELEMENT_NODE)
239                 continue;
240             if (!XML_STRCMP(ptr->name, "xslt"))
241                 {
242                     struct _xmlAttr *attr;
243                     struct convert_s *p 
244                         = odr_malloc(tinfo->odr_config, sizeof(*p));
245
246                     p->next = 0;
247                     p->stylesheet = 0;
248                     p->stylesheet_xsp = 0;
249
250                     for (attr = ptr->properties; attr; attr = attr->next)
251                         if (attr_content(attr, "stylesheet", &p->stylesheet))
252                             ;
253                         else
254                             yaz_log(YLOG_WARN, "%s: dom filter: "
255                                     "%s bad attribute @%s, "
256                                     "expected @stylesheet",
257                                     tinfo->fname, 
258                                     xmlGetNodePath(ptr), attr->name);
259                     if (p->stylesheet)
260                         {
261                             char tmp_xslt_full_name[1024];
262                             if (!yaz_filepath_resolve(p->stylesheet, 
263                                                       tinfo->profile_path,
264                                                       NULL, 
265                                                       tmp_xslt_full_name))
266                                 {
267                                     yaz_log(YLOG_WARN, "%s: dom filter: "
268                                             "stylesheet %s not found in "
269                                             "path %s",
270                                             tinfo->fname,
271                                             p->stylesheet, 
272                                             tinfo->profile_path);
273                                     return ZEBRA_FAIL;
274                                 }
275                 
276                             p->stylesheet_xsp
277                                 = xsltParseStylesheetFile((const xmlChar*) 
278                                                           tmp_xslt_full_name);
279                             if (!p->stylesheet_xsp)
280                                 {
281                                     yaz_log(YLOG_WARN, "%s: dom filter: "
282                                             "could not parse xslt "
283                                             "stylesheet %s",
284                                             tinfo->fname, tmp_xslt_full_name);
285                                     return ZEBRA_FAIL;
286                                 }
287                         }
288                     else
289                         {
290                             yaz_log(YLOG_WARN, "%s: dom filter: "
291                                     "%s missing attribute 'stylesheet' ", 
292                                     tinfo->fname, xmlGetNodePath(ptr));
293                             return ZEBRA_FAIL;
294                         }
295                     *l = p;
296                     l = &p->next;
297                 }
298             else
299                 {
300                     yaz_log(YLOG_LOG, 
301                             "%s: dom filter: "
302                             "%s bad node '%s'",
303                             tinfo->fname, xmlGetNodePath(ptr), ptr->name);
304                     return ZEBRA_FAIL;
305                 }
306         
307         }
308     return ZEBRA_OK;
309 }
310
311 static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
312                                  struct convert_s *convert,
313                                  const char **params,
314                                  xmlDocPtr *doc,
315                                  xsltStylesheetPtr *last_xsp)
316 {
317     for (; convert; convert = convert->next)
318         {
319             xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
320                                                     *doc, params);
321             if (last_xsp)
322                 *last_xsp = convert->stylesheet_xsp;
323             xmlFreeDoc(*doc);
324             *doc = res_doc;
325         }
326     return ZEBRA_OK;
327 }
328
329 static struct filter_input *new_input(struct filter_info *tinfo, int type)
330 {
331     struct filter_input *p;
332     struct filter_input **np = &tinfo->input_list;
333     for (;*np; np = &(*np)->next)
334         ;
335     p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
336     p->next = 0;
337     p->syntax = 0;
338     p->name = 0;
339     p->convert = 0;
340     p->type = type;
341     return p;
342 }
343
344 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
345                              const char *syntax,
346                              const char *name)
347 {
348     for (; ptr; ptr = ptr->next)
349         {
350             if (ptr->type != XML_ELEMENT_NODE)
351                 continue;
352             if (!XML_STRCMP(ptr->name, "marc"))
353                 {
354                     yaz_iconv_t iconv = 0;
355                     const char *input_charset = "marc-8";
356                     struct _xmlAttr *attr;
357             
358                     for (attr = ptr->properties; attr; attr = attr->next)
359                         {
360                             if (attr_content(attr, "charset", &input_charset))
361                                 ;
362                             else
363                                 yaz_log(YLOG_WARN, "%s: dom filter: "
364                                         "%s bad attribute @%s,"
365                                         " expected @charset",
366                                         tinfo->fname, 
367                                         xmlGetNodePath(ptr), attr->name);
368                         }
369                     iconv = yaz_iconv_open("utf-8", input_charset);
370                     if (!iconv)
371                         {
372                             yaz_log(YLOG_WARN, "%s: dom filter: "
373                                     "%s unsupported @charset '%s'", 
374                                     tinfo->fname, xmlGetNodePath(ptr),
375                                     input_charset);
376                             return ZEBRA_FAIL;
377                         }
378                     else
379                         {
380                             struct filter_input *p 
381                                 = new_input(tinfo, DOM_INPUT_MARC);
382                             p->u.marc.handle = yaz_marc_create();
383                             p->u.marc.iconv = iconv;
384                 
385                             yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
386                 
387                             ptr = ptr->next;
388                 
389                             parse_convert(tinfo, ptr, &p->convert);
390                         }
391                     break;
392
393                 }
394             else if (!XML_STRCMP(ptr->name, "xmlreader"))
395                 {
396                     struct filter_input *p 
397                         = new_input(tinfo, DOM_INPUT_XMLREADER);
398                     struct _xmlAttr *attr;
399                     const char *level_str = 0;
400
401                     p->u.xmlreader.split_level = 0;
402                     p->u.xmlreader.reader = 0;
403
404                     for (attr = ptr->properties; attr; attr = attr->next)
405                         {
406                             if (attr_content(attr, "level", &level_str))
407                                 ;
408                             else
409                                 yaz_log(YLOG_WARN, "%s: dom filter: "
410                                         "%s bad attribute @%s,"
411                                         " expected @level",
412                                         tinfo->fname, xmlGetNodePath(ptr),
413                                         attr->name);
414                         }
415                     if (level_str)
416                         p->u.xmlreader.split_level = atoi(level_str);
417                 
418                     ptr = ptr->next;
419
420                     parse_convert(tinfo, ptr, &p->convert);
421                     break;
422                 }
423             else
424                 {
425                     yaz_log(YLOG_WARN, "%s: dom filter: "
426                             "%s bad element <%s>,"
427                             " expected <marc>|<xmlreader>",
428                             tinfo->fname, xmlGetNodePath(ptr), ptr->name);
429                     return ZEBRA_FAIL;
430                 }
431         }
432     return ZEBRA_OK;
433 }
434
435 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
436 {
437     char tmp_full_name[1024];
438     xmlNodePtr ptr;
439     xmlDocPtr doc;
440
441     tinfo->fname = odr_strdup(tinfo->odr_config, fname);
442     
443     if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path, 
444                              NULL, tmp_full_name))
445         tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
446     else
447         tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
448     
449     yaz_log(YLOG_LOG, "%s dom filter: "
450             "loading config file %s", tinfo->fname, tinfo->full_name);
451     
452     doc = xmlParseFile(tinfo->full_name);
453     if (!doc)
454         {
455             yaz_log(YLOG_WARN, "%s: dom filter: "
456                     "failed to parse config file %s",
457                     tinfo->fname, tinfo->full_name);
458             return ZEBRA_FAIL;
459         }
460     /* save because we store ptrs to the content */ 
461     tinfo->doc_config = doc;
462     
463     ptr = xmlDocGetRootElement(doc);
464     if (!ptr || ptr->type != XML_ELEMENT_NODE 
465         || XML_STRCMP(ptr->name, "dom"))
466         {
467             yaz_log(YLOG_WARN, "%s: dom filter: "
468                     "%s bad root element <%s>,"
469                     " expected root element <dom>", 
470                     tinfo->fname, xmlGetNodePath(ptr), ptr->name);  
471             return ZEBRA_FAIL;
472         }
473
474     for (ptr = ptr->children; ptr; ptr = ptr->next)
475         {
476             if (ptr->type != XML_ELEMENT_NODE)
477                 continue;
478             if (!XML_STRCMP(ptr->name, "extract"))
479                 {
480                     /*
481                       <extract name="index">
482                       <xslt stylesheet="first.xsl"/>
483                       <xslt stylesheet="second.xsl"/>
484                       </extract>
485                     */
486                     struct _xmlAttr *attr;
487                     struct filter_extract *f =
488                         odr_malloc(tinfo->odr_config, sizeof(*f));
489             
490                     tinfo->extract = f;
491                     f->name = 0;
492                     f->convert = 0;
493                     for (attr = ptr->properties; attr; attr = attr->next)
494                         {
495                             if (attr_content(attr, "name", &f->name))
496                                 ;
497                             else
498                                 yaz_log(YLOG_WARN, "%s: dom filter: "
499                                         "%s bad attribute @%s"
500                                         " expected @name",
501                                         tinfo->fname, 
502                                         xmlGetNodePath(ptr),attr->name);
503
504                         }
505                     parse_convert(tinfo, ptr->children, &f->convert);
506                 }
507             else if (!XML_STRCMP(ptr->name, "retrieve"))
508                 {  
509                     /* 
510                        <retrieve name="F">
511                        <xslt stylesheet="some.xsl"/>
512                        <xslt stylesheet="some.xsl"/>
513                        </retrieve>
514                     */
515                     struct _xmlAttr *attr;
516                     struct filter_retrieve **fp = &tinfo->retrieve_list;
517                     struct filter_retrieve *f =
518                         odr_malloc(tinfo->odr_config, sizeof(*f));
519             
520                     while (*fp)
521                         fp = &(*fp)->next;
522
523                     *fp = f;
524                     f->name = 0;
525                     f->identifier = 0;
526                     f->convert = 0;
527                     f->next = 0;
528
529                     for (attr = ptr->properties; attr; attr = attr->next)
530                         {
531                             if (attr_content(attr, "identifier", 
532                                              &f->identifier))
533                                 ;
534                             else if (attr_content(attr, "name", &f->name))
535                                 ;
536                             else
537                                 yaz_log(YLOG_WARN, "%s: dom filter: "
538                                         "%s bad attribute @%s"
539                                         " expected @identifier|@name",
540                                         tinfo->fname, 
541                                         xmlGetNodePath(ptr),attr->name);
542                         }
543                     parse_convert(tinfo, ptr->children, &f->convert);
544                 }
545             else if (!XML_STRCMP(ptr->name, "store"))
546                 {
547                     /*
548                       <store name="F">
549                       <xslt stylesheet="some.xsl"/>
550                       <xslt stylesheet="some.xsl"/>
551                       </retrieve>
552                     */
553                     struct filter_store *f =
554                         odr_malloc(tinfo->odr_config, sizeof(*f));
555             
556                     tinfo->store = f;
557                     f->convert = 0;
558                     parse_convert(tinfo, ptr->children, &f->convert);
559                 }
560             else if (!XML_STRCMP(ptr->name, "input"))
561                 {
562                     /*
563                       <input syntax="xml">
564                       <xmlreader level="1"/>
565                       </input>
566                       <input syntax="usmarc">
567                       <marc inputcharset="marc-8"/>
568                       </input>
569                     */
570                     struct _xmlAttr *attr;
571                     const char  *syntax = 0;
572                     const char *name = 0;
573                     for (attr = ptr->properties; attr; attr = attr->next)
574                         {
575                             if (attr_content(attr, "syntax", &syntax))
576                                 ;
577                             else if (attr_content(attr, "name", &name))
578                                 ;
579                             else
580                                 yaz_log(YLOG_WARN, "%s: dom filter: "
581                                         "%s bad attribute @%s"
582                                         " expected @syntax|@name",
583                                         tinfo->fname, 
584                                         xmlGetNodePath(ptr),attr->name);
585                         }
586                     parse_input(tinfo, ptr->children, syntax, name);
587                 }
588             else
589                 {
590                     yaz_log(YLOG_WARN, "%s: dom filter: "
591                             "%s bad element <%s>,"
592                             " expected <extract>|<input>|<retrieve>|<store>",
593                             tinfo->fname, xmlGetNodePath(ptr), ptr->name);
594                     return ZEBRA_FAIL;
595                 }
596         }
597     return ZEBRA_OK;
598 }
599
600 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
601                                                const char *est)
602 {
603     struct filter_retrieve *f = tinfo->retrieve_list;
604
605     /* return first schema if no est is provided */
606     if (!est)
607         return f;
608     for (; f; f = f->next)
609         { 
610             /* find requested schema */
611             if (est) 
612                 {    
613                     if (f->identifier && !strcmp(f->identifier, est))
614                         return f;
615                     if (f->name && !strcmp(f->name, est))
616                         return f;
617                 } 
618         }
619     return 0;
620 }
621
622 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
623 {
624     struct filter_info *tinfo = clientData;
625     if (!args || !*args)
626         {
627             yaz_log(YLOG_WARN, "dom filter: need config file");
628             return ZEBRA_FAIL;
629         }
630
631     if (tinfo->fname && !strcmp(args, tinfo->fname))
632         return ZEBRA_OK;
633     
634     tinfo->profile_path = res_get(res, "profilePath");
635
636     destroy_dom(tinfo);
637     return parse_dom(tinfo, args);
638 }
639
640 static void filter_destroy(void *clientData)
641 {
642     struct filter_info *tinfo = clientData;
643     destroy_dom(tinfo);
644     odr_destroy(tinfo->odr_config);
645     odr_destroy(tinfo->odr_record);
646     xfree(tinfo);
647 }
648
649 static int ioread_ex(void *context, char *buffer, int len)
650 {
651     struct recExtractCtrl *p = context;
652     return p->stream->readf(p->stream, buffer, len);
653 }
654
/*
 * Close callback paired with ioread_ex(): does nothing and always
 * reports success (0).
 */
static int ioclose_ex(void *context)
{
    (void) context; /* nothing to release */
    return 0;
}
659
660
661 /* DOM filter style indexing */
662 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
663                             xmlChar **dst_content)
664 {
665     if (0 == XML_STRCMP(attr->name, name) && attr->children 
666         && attr->children->type == XML_TEXT_NODE)
667         {
668             *dst_content = (attr->children->content);
669             return 1;
670         }
671     return 0;
672 }
673
674
675 /* DOM filter style indexing */
676 static void index_value_of(struct filter_info *tinfo, 
677                            struct recExtractCtrl *extctr,
678                            RecWord* recword, 
679                            xmlNodePtr node, 
680                            xmlChar * index_p)
681 {
682     xmlChar *text = xmlNodeGetContent(node);
683     size_t text_len = strlen((const char *)text);
684
685
686     /* if there is no text, we do not need to proceed */
687     if (text_len)
688         {            
689             xmlChar *look = index_p;
690             xmlChar *bval;
691             xmlChar *eval;
692
693             xmlChar index[256];
694             xmlChar type[256];
695
696             /* assingning text to be indexed */
697             recword->term_buf = (const char *)text;
698             recword->term_len = text_len;
699
700             /* parsing all index name/type pairs */
701             /* may not start with ' ' or ':' */
702             while (*look && ' ' != *look && ':' != *look){
703     
704                 /* setting name and type to zero */
705                 *index = '\0';
706                 *type = '\0';
707     
708                 /* parsing one index name */
709                 bval = look;
710                 while (*look && ':' != *look && ' ' != *look){
711                     look++;
712                 }
713                 eval = look;
714                 strncpy((char *)index, (const char *)bval, eval - bval);
715                 index[eval - bval] = '\0';
716     
717     
718                 /* parsing one index type, if existing */
719                 if (':' == *look){
720                     look++;
721       
722                     bval = look;
723                     while (*look && ' ' != *look){
724                         look++;
725                     }
726                     eval = look;
727                     strncpy((char *)type, (const char *)bval, eval - bval);
728                     type[eval - bval] = '\0';
729                 }
730
731                 /* actually indexing the text given */
732                 yaz_log(YLOG_DEBUG, "%s dom filter: "
733                         "INDEX  '%s:%s' '%s'", 
734                         tinfo->fname, index, type, text);
735
736                 recword->index_name = (const char *)index;
737                 if (type && *type)
738                     recword->index_type = *type;
739                 (extctr->tokenAdd)(recword);
740
741                 /* eat whitespaces */
742                 if (*look && ' ' == *look && *(look+1)){
743                     look++;
744                 } 
745             }
746         }
747     
748     xmlFree(text); 
749 }
750
751
752 /* DOM filter style indexing */
753 static void set_record_info(struct filter_info *tinfo, 
754                             struct recExtractCtrl *extctr, 
755                             xmlChar * id_p, 
756                             xmlChar * rank_p, 
757                             xmlChar * type_p)
758 {
759     yaz_log(YLOG_DEBUG, "%s dom filter: "
760             "RECORD id=%s rank=%s type=%s", 
761             tinfo->fname,  id_p, rank_p, type_p);
762     
763     if (id_p)
764         sscanf((const char *)id_p, "%255s", extctr->match_criteria);
765
766     if (rank_p)
767         extctr->staticrank = atozint((const char *)rank_p);
768
769     /*     if (!strcmp("update", type_str)) */
770     /*         index_node(tinfo, ctrl, ptr, recword); */
771     /*     else if (!strcmp("delete", type_str)) */
772     /*         yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
773     /*     else */
774     /*         yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'",  */
775     /*                 type_str); */
776
777 }
778
779
780 /* DOM filter style indexing */
781 static void process_xml_element_zebra_node(struct filter_info *tinfo, 
782                                            struct recExtractCtrl *extctr, 
783                                            RecWord* recword, 
784                                            xmlNodePtr node)
785 {
786     if (node->type == XML_ELEMENT_NODE 
787         && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)){
788     
789         if (0 == XML_STRCMP(node->name, "index")){
790             xmlChar *index_p = 0;
791
792             struct _xmlAttr *attr;      
793             for (attr = node->properties; attr; attr = attr->next){
794                 if (attr_content_xml(attr, "name", &index_p)){
795                     index_value_of(tinfo, extctr, recword,node, index_p);
796                 }  
797                 else
798                     yaz_log(YLOG_WARN,"%s dom filter: "
799                             "%s bad attribute @%s, expected @name",
800                             tinfo->fname, xmlGetNodePath(node), attr->name);
801             }
802         }
803         else if (0 == XML_STRCMP(node->name, "record")){
804             xmlChar *id_p = 0;
805             xmlChar *rank_p = 0;
806             xmlChar *type_p = 0;
807
808             struct _xmlAttr *attr;
809             for (attr = node->properties; attr; attr = attr->next){
810                 if (attr_content_xml(attr, "id", &id_p))
811                     ;
812                 else if (attr_content_xml(attr, "rank", &rank_p))
813                     ;
814                 else if (attr_content_xml(attr, "type", &type_p))
815                    ;
816                 else
817                     yaz_log(YLOG_WARN,"%s dom filter: "
818                             "%s bad attribute @%s,"
819                            " expected @id|@rank|@type",
820                            tinfo->fname, xmlGetNodePath(node), attr->name);
821
822                 if (type_p && 0 != strcmp("update", (const char *)type_p))
823                     yaz_log(YLOG_WARN,"%s dom filter: "
824                             "%s attribute @%s,"
825                             " only implemented '@type='update'",
826                             tinfo->fname, xmlGetNodePath(node), attr->name);
827           
828
829             }
830             set_record_info(tinfo, extctr, id_p, rank_p, type_p);
831         } else {
832             yaz_log(YLOG_WARN,"%s dom filter: "
833                     "%s bad element <%s>,"
834                     " expected <record>|<index> in namespace '%s'",
835                     tinfo->fname, xmlGetNodePath(node), 
836                     node->name, zebra_dom_ns);
837       
838         }
839     }
840 }
841
842
843 /* DOM filter style indexing */
844 static void process_xml_pi_node(struct filter_info *tinfo, 
845                                 struct recExtractCtrl *extctr, 
846                                 xmlNodePtr node,
847                                 xmlChar **index_pp)
848 {
849
850     /* yaz_log(YLOG_DEBUG,"PI     %s\n", xmlGetNodePath(node)); */
851
852     /* if right PI name, continue parsing PI */
853     if (0 == strcmp(zebra_pi_name, (const char *)node->name)){
854         xmlChar *pi_p =  node->content;
855         xmlChar *look = pi_p;
856     
857         xmlChar *bval;
858         xmlChar *eval;
859
860         /* parsing PI record instructions */
861         if (0 == strncmp((const char *)look, "record", 6)){
862             xmlChar id[256];
863             xmlChar rank[256];
864             xmlChar type[256];
865
866             *id = '\0';
867             *rank = '\0';
868             *type = '\0';
869       
870             look += 6;
871       
872             /* eat whitespace */
873             while (*look && ' ' == *look && *(look+1))
874                 look++;
875
876             /* parse possible id */
877             if (*look && 0 == strncmp((const char *)look, "id=", 3)){
878                 look += 3;
879                 bval = look;
880                 while (*look && ' ' != *look)
881                     look++;
882                 eval = look;
883                 strncpy((char *)id, (const char *)bval, eval - bval);
884                 id[eval - bval] = '\0';
885             }
886       
887             /* eat whitespace */
888             while (*look && ' ' == *look && *(look+1))
889                 look++;
890       
891             /* parse possible rank */
892             if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
893                 look += 6;
894                 bval = look;
895                 while (*look && ' ' != *look)
896                     look++;
897                 eval = look;
898                 strncpy((char *)rank, (const char *)bval, eval - bval);
899                 rank[eval - bval] = '\0';
900             }
901
902             /* eat whitespace */
903             while (*look && ' ' == *look && *(look+1))
904                 look++;
905
906             if (look && '\0' != *look)
907                 yaz_log(YLOG_WARN,"%s dom filter: "
908                         "%s content '%s', can not parse '%s'",
909                         tinfo->fname, xmlGetNodePath(node), pi_p, look);
910             else 
911                 set_record_info(tinfo, extctr, id, rank, 0);
912
913         } 
914    
915         /* parsing index instruction */
916         else   if (0 == strncmp((const char *)look, "index", 5)){
917             look += 5;
918       
919             /* eat whitespace */
920             while (*look && ' ' == *look && *(look+1))
921                 look++;
922
923             /* export index instructions to outside */
924             *index_pp = look;
925         } 
926         else 
927             yaz_log(YLOG_WARN,"%s dom filter: "
928                     "%s content '%s', can not parse '%s'",
929                     tinfo->fname, xmlGetNodePath(node), pi_p, look);
930     }
931 }
932
933 /* DOM filter style indexing */
934 static void process_xml_element_node(struct filter_info *tinfo, 
935                                      struct recExtractCtrl *extctr, 
936                                      RecWord* recword, 
937                                      xmlNodePtr node)
938 {
939     /* remember indexing instruction from PI to next element node */
940     xmlChar *index_p = 0;
941
942     /* check if we are an element node in the special zebra namespace 
943        and either set record data or index value-of node content*/
944     process_xml_element_zebra_node(tinfo, extctr, recword, node);
945   
946     /* loop through kid nodes */
947     for (node = node->children; node; node = node->next)
948         {
949             /* check and set PI record and index index instructions */
950             if (node->type == XML_PI_NODE){
951                 process_xml_pi_node(tinfo, extctr, node, &index_p);
952             }
953             else if (node->type == XML_ELEMENT_NODE){
954                 /* if there was a PI index instruction before this element */
955                 if (index_p){
956                     index_value_of(tinfo, extctr, recword, node, index_p);
957                     index_p = 0;
958                 }
959                 process_xml_element_node(tinfo, extctr, recword,node);
960             }
961             else
962                 continue;
963         }
964 }
965
966
967 /* DOM filter style indexing */
968 static void extract_dom_doc_node(struct filter_info *tinfo, 
969                                  struct recExtractCtrl *extctr, 
970                                  xmlDocPtr doc)
971 {
972     xmlChar *buf_out;
973     int len_out;
974
975     /* only need to do the initialization once, reuse recword for all terms */
976     RecWord recword;
977     (*extctr->init)(extctr, &recword);
978
979     if (extctr->flagShowRecords){
980         xmlDocDumpMemory(doc, &buf_out, &len_out);
981         fwrite(buf_out, len_out, 1, stdout);
982         xmlFree(buf_out);
983     }
984
985     process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
986 }
987
988
989
990
991 static int convert_extract_doc(struct filter_info *tinfo, 
992                                struct filter_input *input,
993                                struct recExtractCtrl *p, 
994                                xmlDocPtr doc)
995
996 {
997     xmlChar *buf_out;
998     int len_out;
999     const char *params[10];
1000     xsltStylesheetPtr last_xsp = 0;
1001     xmlDocPtr store_doc = 0;
1002
1003     params[0] = 0;
1004     set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1005
1006     /* input conversion */
1007     perform_convert(tinfo, input->convert, params, &doc, 0);
1008
1009     if (tinfo->store)
1010         {
1011             /* store conversion */
1012             store_doc = xmlCopyDoc(doc, 1);
1013             perform_convert(tinfo, tinfo->store->convert,
1014                             params, &store_doc, &last_xsp);
1015         }
1016     
1017     if (last_xsp)
1018         xsltSaveResultToString(&buf_out, &len_out, 
1019                                store_doc ? store_doc : doc, last_xsp);
1020     else
1021         xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1022     if (p->flagShowRecords)
1023         fwrite(buf_out, len_out, 1, stdout);
1024     (*p->setStoreData)(p, buf_out, len_out);
1025     xmlFree(buf_out);
1026
1027     if (store_doc)
1028         xmlFreeDoc(store_doc);
1029
1030     /* extract conversion */
1031     perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1032
1033     /* finally, do the indexing */
1034     if (doc){
1035         extract_dom_doc_node(tinfo, p, doc);
1036         /* extract_doc_alvis(tinfo, p, doc); */
1037         xmlFreeDoc(doc);
1038     }
1039
1040     return RECCTRL_EXTRACT_OK;
1041 }
1042
1043 static int extract_xml_split(struct filter_info *tinfo,
1044                              struct filter_input *input,
1045                              struct recExtractCtrl *p)
1046 {
1047     int ret;
1048
1049     if (p->first_record)
1050         {
1051             if (input->u.xmlreader.reader)
1052                 xmlFreeTextReader(input->u.xmlreader.reader);
1053             input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1054                                                        p /* I/O handler */,
1055                                                        0 /* URL */, 
1056                                                        0 /* encoding */,
1057                                                        XML_PARSE_XINCLUDE|
1058                                                        XML_PARSE_NOENT);
1059         }
1060     if (!input->u.xmlreader.reader)
1061         return RECCTRL_EXTRACT_ERROR_GENERIC;
1062
1063     ret = xmlTextReaderRead(input->u.xmlreader.reader);
1064     while (ret == 1)
1065         {
1066             int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1067             int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1068             if (type == XML_READER_TYPE_ELEMENT && 
1069                 input->u.xmlreader.split_level == depth)
1070                 {
1071                     xmlNodePtr ptr
1072                         = xmlTextReaderExpand(input->u.xmlreader.reader);
1073                     if (ptr)
1074                         {
1075                             xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1076                             xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1077                 
1078                             xmlDocSetRootElement(doc, ptr2);
1079                 
1080                             return convert_extract_doc(tinfo, input, p, doc);
1081                         }
1082                     else
1083                         {
1084                             xmlFreeTextReader(input->u.xmlreader.reader);
1085                             input->u.xmlreader.reader = 0;
1086                             return RECCTRL_EXTRACT_ERROR_GENERIC;
1087                         }
1088                 }
1089             ret = xmlTextReaderRead(input->u.xmlreader.reader);
1090         }
1091     xmlFreeTextReader(input->u.xmlreader.reader);
1092     input->u.xmlreader.reader = 0;
1093     return RECCTRL_EXTRACT_EOF;
1094 }
1095
1096 static int extract_xml_full(struct filter_info *tinfo, 
1097                             struct filter_input *input,
1098                             struct recExtractCtrl *p)
1099 {
1100     if (p->first_record) /* only one record per stream */
1101         {
1102             xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, 
1103                                       p /* I/O handler */,
1104                                       0 /* URL */,
1105                                       0 /* encoding */,
1106                                       XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1107             if (!doc)
1108                 {
1109                     return RECCTRL_EXTRACT_ERROR_GENERIC;
1110                 }
1111             return convert_extract_doc(tinfo, input, p, doc);
1112         }
1113     else
1114         return RECCTRL_EXTRACT_EOF;
1115 }
1116
1117 static int extract_iso2709(struct filter_info *tinfo,
1118                            struct filter_input *input,
1119                            struct recExtractCtrl *p)
1120 {
1121     char buf[100000];
1122     int record_length;
1123     int read_bytes, r;
1124
1125     if (p->stream->readf(p->stream, buf, 5) != 5)
1126         return RECCTRL_EXTRACT_EOF;
1127     while (*buf < '0' || *buf > '9')
1128         {
1129             int i;
1130
1131             yaz_log(YLOG_WARN, "%s dom filter: "
1132                     "MARC: Skipping bad byte %d (0x%02X)",
1133                     tinfo->fname, *buf & 0xff, *buf & 0xff);
1134             for (i = 0; i<4; i++)
1135                 buf[i] = buf[i+1];
1136
1137             if (p->stream->readf(p->stream, buf+4, 1) != 1)
1138                 return RECCTRL_EXTRACT_EOF;
1139         }
1140     record_length = atoi_n (buf, 5);
1141     if (record_length < 25)
1142         {
1143             yaz_log (YLOG_WARN, "%s dom filter: "
1144                      "MARC record length < 25, is %d", 
1145                      tinfo->fname, record_length);
1146             return RECCTRL_EXTRACT_ERROR_GENERIC;
1147         }
1148     read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1149     if (read_bytes < record_length-5)
1150         {
1151             yaz_log (YLOG_WARN, "%s dom filter: "
1152                      "Couldn't read whole MARC record",
1153                      tinfo->fname);
1154             return RECCTRL_EXTRACT_ERROR_GENERIC;
1155         }
1156     r = yaz_marc_read_iso2709(input->u.marc.handle,  buf, record_length);
1157     if (r < record_length)
1158         {
1159             yaz_log (YLOG_WARN, "%s dom filter: "
1160                      "Parsing of MARC record failed r=%d length=%d",
1161                      tinfo->fname, r, record_length);
1162             return RECCTRL_EXTRACT_ERROR_GENERIC;
1163         }
1164     else
1165         {
1166             xmlDocPtr rdoc;
1167             xmlNode *root_ptr;
1168             yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1169             rdoc = xmlNewDoc((const xmlChar*) "1.0");
1170             xmlDocSetRootElement(rdoc, root_ptr);
1171             return convert_extract_doc(tinfo, input, p, rdoc);        
1172         }
1173     return RECCTRL_EXTRACT_OK;
1174 }
1175
1176 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1177 {
1178     struct filter_info *tinfo = clientData;
1179     struct filter_input *input = tinfo->input_list;
1180
1181     if (!input)
1182         return RECCTRL_EXTRACT_ERROR_GENERIC;
1183
1184     odr_reset(tinfo->odr_record);
1185     switch(input->type)
1186         {
1187         case DOM_INPUT_XMLREADER:
1188             if (input->u.xmlreader.split_level == 0)
1189                 return extract_xml_full(tinfo, input, p);
1190             else
1191                 return extract_xml_split(tinfo, input, p);
1192             break;
1193         case DOM_INPUT_MARC:
1194             return extract_iso2709(tinfo, input, p);
1195         }
1196     return RECCTRL_EXTRACT_ERROR_GENERIC;
1197 }
1198
1199 static int ioread_ret(void *context, char *buffer, int len)
1200 {
1201     struct recRetrieveCtrl *p = context;
1202     return p->stream->readf(p->stream, buffer, len);
1203 }
1204
/* Close callback for xmlReadIO() during retrieval: does nothing and
   always reports success (0). */
static int ioclose_ret(void *context)
{
    (void) context;
    return 0;
}
1209
1210 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1211 {
1212     /* const char *esn = zebra_dom_ns; */
1213     const char *esn = 0;
1214     const char *params[32];
1215     struct filter_info *tinfo = clientData;
1216     xmlDocPtr doc;
1217     struct filter_retrieve *retrieve;
1218     xsltStylesheetPtr last_xsp = 0;
1219
1220     if (p->comp)
1221         {
1222             if (p->comp->which == Z_RecordComp_simple
1223                 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1224                 {
1225                     esn = p->comp->u.simple->u.generic;
1226                 }
1227             else if (p->comp->which == Z_RecordComp_complex 
1228                      && p->comp->u.complex->generic->elementSpec
1229                      && p->comp->u.complex->generic->elementSpec->which ==
1230                      Z_ElementSpec_elementSetName)
1231                 {
1232                     esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1233                 }
1234         }
1235     retrieve = lookup_retrieve(tinfo, esn);
1236     if (!retrieve)
1237         {
1238             p->diagnostic =
1239                 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1240             return 0;
1241         }
1242
1243     params[0] = 0;
1244     set_param_int(params, "id", p->localno, p->odr);
1245     if (p->fname)
1246         set_param_str(params, "filename", p->fname, p->odr);
1247     if (p->staticrank >= 0)
1248         set_param_int(params, "rank", p->staticrank, p->odr);
1249
1250     if (esn)
1251         set_param_str(params, "schema", esn, p->odr);
1252     else
1253         if (retrieve->name)
1254             set_param_str(params, "schema", retrieve->name, p->odr);
1255         else if (retrieve->identifier)
1256             set_param_str(params, "schema", retrieve->identifier, p->odr);
1257         else
1258             set_param_str(params, "schema", "", p->odr);
1259
1260     if (p->score >= 0)
1261         set_param_int(params, "score", p->score, p->odr);
1262     set_param_int(params, "size", p->recordSize, p->odr);
1263
1264     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1265                     0 /* URL */,
1266                     0 /* encoding */,
1267                     XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1268     if (!doc)
1269         {
1270             p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1271             return 0;
1272         }
1273
1274     /* retrieve conversion */
1275     perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1276     if (!doc)
1277         {
1278             p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1279         }
1280     else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1281         {
1282             xmlChar *buf_out;
1283             int len_out;
1284
1285             if (last_xsp)
1286                 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1287             else
1288                 xmlDocDumpMemory(doc, &buf_out, &len_out);            
1289
1290             p->output_format = VAL_TEXT_XML;
1291             p->rec_len = len_out;
1292             p->rec_buf = odr_malloc(p->odr, p->rec_len);
1293             memcpy(p->rec_buf, buf_out, p->rec_len);
1294             xmlFree(buf_out);
1295         }
1296     else if (p->output_format == VAL_SUTRS)
1297         {
1298             xmlChar *buf_out;
1299             int len_out;
1300
1301             if (last_xsp)
1302                 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1303             else
1304                 xmlDocDumpMemory(doc, &buf_out, &len_out);            
1305         
1306             p->output_format = VAL_SUTRS;
1307             p->rec_len = len_out;
1308             p->rec_buf = odr_malloc(p->odr, p->rec_len);
1309             memcpy(p->rec_buf, buf_out, p->rec_len);
1310         
1311             xmlFree(buf_out);
1312         }
1313     else
1314         {
1315             p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1316         }
1317     xmlFreeDoc(doc);
1318     return 0;
1319 }
1320
1321 static struct recType filter_type = {
1322     0,
1323     "dom",
1324     filter_init,
1325     filter_config,
1326     filter_destroy,
1327     filter_extract,
1328     filter_retrieve
1329 };
1330
1331 RecType
1332 #ifdef IDZEBRA_STATIC_DOM
1333 idzebra_filter_dom
1334 #else
1335 idzebra_filter
1336 #endif
1337
1338 [] = {
1339     &filter_type,
1340     0,
1341 };
1342 /*
1343  * Local variables:
1344  * c-basic-offset: 4
1345  * indent-tabs-mode: nil
1346  * End:
1347  * vim: shiftwidth=4 tabstop=8 expandtab
1348  */
1349