1 /* $Id: mod_dom.c,v 1.8 2007-02-14 16:16:15 marc Exp $
2 Copyright (C) 1995-2007
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
39 #include <libexslt/exslt.h>
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
46 const char *stylesheet;
47 xsltStylesheetPtr stylesheet_xsp;
48 struct convert_s *next;
51 struct filter_extract {
53 struct convert_s *convert;
57 struct convert_s *convert;
60 struct filter_retrieve {
62 const char *identifier;
63 struct convert_s *convert;
64 struct filter_retrieve *next;
67 #define DOM_INPUT_XMLREADER 1
68 #define DOM_INPUT_MARC 2
72 struct convert_s *convert;
76 const char *input_charset;
81 xmlTextReaderPtr reader;
85 struct filter_input *next;
91 const char *profile_path;
95 struct filter_extract *extract;
96 struct filter_retrieve *retrieve_list;
97 struct filter_input *input_list;
98 struct filter_store *store;
101 #define XML_STRCMP(a,b) strcmp((char*)a, b)
102 #define XML_STRLEN(a) strlen((char*)a)
107 static void set_param_str(const char **params, const char *name,
108 const char *value, ODR odr)
110 char *quoted = odr_malloc(odr, 3 + strlen(value));
111 sprintf(quoted, "'%s'", value);
119 static void set_param_int(const char **params, const char *name,
122 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
125 sprintf(quoted, "'" ZINT_FORMAT "'", value);
131 static void *filter_init(Res res, RecType recType)
133 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
135 tinfo->full_name = 0;
136 tinfo->profile_path = 0;
137 tinfo->odr_record = odr_createmem(ODR_ENCODE);
138 tinfo->odr_config = odr_createmem(ODR_ENCODE);
140 tinfo->retrieve_list = 0;
141 tinfo->input_list = 0;
143 tinfo->doc_config = 0;
/* Match one XML attribute against an expected attribute name.
 *
 * attr         attribute node to inspect
 * name         attribute name to compare against attr->name
 * dst_content  on a match, receives a pointer to the attribute's text
 *
 * The attribute only matches when it also has a first child that is a text
 * node.  No copy is made: *dst_content points straight into the libxml2
 * tree, so it is only valid while that document is alive.
 * NOTE(review): the return statements are not visible in this excerpt;
 * callers test the result as a "matched" flag. */
152 static int attr_content(struct _xmlAttr *attr, const char *name,
153 const char **dst_content)
155 if (!XML_STRCMP(attr->name, name) && attr->children
156 && attr->children->type == XML_TEXT_NODE)
158 *dst_content = (const char *)(attr->children->content);
164 static void destroy_xsp(struct convert_s *c)
168 if (c->stylesheet_xsp)
169 xsltFreeStylesheet(c->stylesheet_xsp);
174 static void destroy_dom(struct filter_info *tinfo)
178 destroy_xsp(tinfo->extract->convert);
183 destroy_xsp(tinfo->store->convert);
186 if (tinfo->input_list)
188 struct filter_input *i_ptr;
189 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
193 case DOM_INPUT_XMLREADER:
194 if (i_ptr->u.xmlreader.reader)
195 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
198 yaz_iconv_close(i_ptr->u.marc.iconv);
199 yaz_marc_destroy(i_ptr->u.marc.handle);
202 destroy_xsp(i_ptr->convert);
204 tinfo->input_list = 0;
206 if (tinfo->retrieve_list)
208 struct filter_retrieve *r_ptr;
209 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
210 destroy_xsp(r_ptr->convert);
211 tinfo->retrieve_list = 0;
214 if (tinfo->doc_config)
216 xmlFreeDoc(tinfo->doc_config);
217 tinfo->doc_config = 0;
219 odr_reset(tinfo->odr_config);
222 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
223 struct convert_s **l)
226 for(; ptr; ptr = ptr->next)
228 if (ptr->type != XML_ELEMENT_NODE)
230 if (!XML_STRCMP(ptr->name, "xslt"))
232 struct _xmlAttr *attr;
233 struct convert_s *p = odr_malloc(tinfo->odr_config, sizeof(*p));
237 p->stylesheet_xsp = 0;
239 for (attr = ptr->properties; attr; attr = attr->next)
240 if (attr_content(attr, "stylesheet", &p->stylesheet))
243 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
245 tinfo->fname, attr->name);
248 char tmp_xslt_full_name[1024];
249 if (!yaz_filepath_resolve(p->stylesheet, tinfo->profile_path,
250 NULL, tmp_xslt_full_name))
253 "%s: dom filter: stylesheet %s not found in "
256 p->stylesheet, tinfo->profile_path);
261 = xsltParseStylesheetFile((const xmlChar*) tmp_xslt_full_name);
262 if (!p->stylesheet_xsp)
265 "%s: dom filter: could not parse xslt "
267 tinfo->fname, tmp_xslt_full_name);
274 "%s: dom filter: missing attribute 'stylesheet' "
275 "for element 'xslt'", tinfo->fname);
283 yaz_log(YLOG_LOG, "%s: dom filter: bad node '%s' for <conv>",
284 tinfo->fname, ptr->name);
292 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
293 struct convert_s *convert,
296 xsltStylesheetPtr *last_xsp)
298 for (; convert; convert = convert->next)
300 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
303 *last_xsp = convert->stylesheet_xsp;
310 static struct filter_input *new_input(struct filter_info *tinfo, int type)
312 struct filter_input *p;
313 struct filter_input **np = &tinfo->input_list;
314 for (;*np; np = &(*np)->next)
316 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
325 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
329 for (; ptr; ptr = ptr->next)
331 if (ptr->type != XML_ELEMENT_NODE)
333 if (!XML_STRCMP(ptr->name, "marc"))
335 yaz_iconv_t iconv = 0;
336 const char *input_charset = "marc-8";
337 struct _xmlAttr *attr;
339 for (attr = ptr->properties; attr; attr = attr->next)
341 if (attr_content(attr, "charset", &input_charset))
344 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
346 tinfo->fname, attr->name);
348 iconv = yaz_iconv_open("utf-8", input_charset);
351 yaz_log(YLOG_WARN, "%s: dom filter: unsupported charset "
353 tinfo->fname, input_charset);
358 struct filter_input *p = new_input(tinfo, DOM_INPUT_MARC);
359 p->u.marc.handle = yaz_marc_create();
360 p->u.marc.iconv = iconv;
362 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
366 parse_convert(tinfo, ptr, &p->convert);
371 else if (!XML_STRCMP(ptr->name, "xmlreader"))
373 struct filter_input *p = new_input(tinfo, DOM_INPUT_XMLREADER);
374 struct _xmlAttr *attr;
375 const char *level_str = 0;
377 p->u.xmlreader.split_level = 0;
378 p->u.xmlreader.reader = 0;
380 for (attr = ptr->properties; attr; attr = attr->next)
382 if (attr_content(attr, "level", &level_str))
385 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
387 tinfo->fname, attr->name);
390 p->u.xmlreader.split_level = atoi(level_str);
394 parse_convert(tinfo, ptr, &p->convert);
399 yaz_log(YLOG_WARN, "%s: dom filter: bad input type %s",
400 tinfo->fname, ptr->name);
407 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
409 char tmp_full_name[1024];
413 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
415 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
416 NULL, tmp_full_name))
417 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
419 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
421 yaz_log(YLOG_LOG, "dom filter: loading config file %s", tinfo->full_name);
423 doc = xmlParseFile(tinfo->full_name);
426 yaz_log(YLOG_WARN, "%s: dom filter: failed to parse config file %s",
427 tinfo->fname, tinfo->full_name);
430 /* save because we store ptrs to the content */
431 tinfo->doc_config = doc;
433 ptr = xmlDocGetRootElement(doc);
434 if (!ptr || ptr->type != XML_ELEMENT_NODE
435 || XML_STRCMP(ptr->name, "dom"))
438 "%s: dom filter: expected root element <dom>",
443 for (ptr = ptr->children; ptr; ptr = ptr->next)
445 if (ptr->type != XML_ELEMENT_NODE)
447 if (!XML_STRCMP(ptr->name, "extract"))
450 <extract name="index">
451 <xslt stylesheet="first.xsl"/>
452 <xslt stylesheet="second.xsl"/>
455 struct _xmlAttr *attr;
456 struct filter_extract *f =
457 odr_malloc(tinfo->odr_config, sizeof(*f));
462 for (attr = ptr->properties; attr; attr = attr->next)
464 if (attr_content(attr, "name", &f->name))
467 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
469 tinfo->fname, attr->name);
472 parse_convert(tinfo, ptr->children, &f->convert);
474 else if (!XML_STRCMP(ptr->name, "retrieve"))
478 <xslt stylesheet="some.xsl"/>
479 <xslt stylesheet="some.xsl"/>
482 struct _xmlAttr *attr;
483 struct filter_retrieve **fp = &tinfo->retrieve_list;
484 struct filter_retrieve *f =
485 odr_malloc(tinfo->odr_config, sizeof(*f));
496 for (attr = ptr->properties; attr; attr = attr->next)
498 if (attr_content(attr, "identifier", &f->identifier))
500 else if (attr_content(attr, "name", &f->name))
503 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
505 tinfo->fname, attr->name);
507 parse_convert(tinfo, ptr->children, &f->convert);
509 else if (!XML_STRCMP(ptr->name, "store"))
513 <xslt stylesheet="some.xsl"/>
514 <xslt stylesheet="some.xsl"/>
517 struct filter_store *f =
518 odr_malloc(tinfo->odr_config, sizeof(*f));
522 parse_convert(tinfo, ptr->children, &f->convert);
524 else if (!XML_STRCMP(ptr->name, "input"))
528 <xmlreader level="1"/>
530 <input syntax="usmarc">
531 <marc inputcharset="marc-8"/>
534 struct _xmlAttr *attr;
535 const char *syntax = 0;
536 const char *name = 0;
537 for (attr = ptr->properties; attr; attr = attr->next)
539 if (attr_content(attr, "syntax", &syntax))
541 else if (attr_content(attr, "name", &name))
544 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
546 tinfo->fname, attr->name);
548 parse_input(tinfo, ptr->children, syntax, name);
552 yaz_log(YLOG_WARN, "%s: dom filter: bad element %s",
553 tinfo->fname, ptr->name);
/* Find the <retrieve> configuration entry for a requested element set.
 *
 * Walks tinfo->retrieve_list in order and compares the requested string
 * (est) against each entry's identifier and name; either field may be
 * unset, hence the NULL checks before strcmp().
 * NOTE(review): the remaining parameter line and the return statements are
 * not visible in this excerpt. */
560 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
563 struct filter_retrieve *f = tinfo->retrieve_list;
565 /* return first schema if no est is provided */
568 for (; f; f = f->next)
570 /* find requested schema */
573 if (f->identifier && !strcmp(f->identifier, est))
575 if (f->name && !strcmp(f->name, est))
582 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
584 struct filter_info *tinfo = clientData;
587 yaz_log(YLOG_WARN, "dom filter: need config file");
591 if (tinfo->fname && !strcmp(args, tinfo->fname))
594 tinfo->profile_path = res_get(res, "profilePath");
597 return parse_dom(tinfo, args);
600 static void filter_destroy(void *clientData)
602 struct filter_info *tinfo = clientData;
604 odr_destroy(tinfo->odr_config);
605 odr_destroy(tinfo->odr_record);
/* Read callback handed to libxml2 (xmlReaderForIO / xmlReadIO) during
 * extraction.  context is the struct recExtractCtrl that was passed as the
 * I/O handler; the request is forwarded unchanged to the record stream's
 * readf function and its result is returned as-is. */
609 static int ioread_ex(void *context, char *buffer, int len)
611 struct recExtractCtrl *p = context;
612 return p->stream->readf(p->stream, buffer, len);
615 static int ioclose_ex(void *context)
621 /* Alvis style indexing */
622 #define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
623 static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
625 /* Alvis style indexing */
/* Feed the text below a node list to the indexer.
 *
 * For every node in the sibling list starting at ptr, the function first
 * recurses into the node's children, then, for text nodes, points
 * recWord->term_buf / term_len at the node content and hands recWord to
 * ctrl->tokenAdd.  The term buffer aliases the libxml2 node content; it is
 * not copied here. */
626 static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
627 xmlNodePtr ptr, RecWord *recWord)
629 for(; ptr; ptr = ptr->next)
631 index_cdata(tinfo, ctrl, ptr->children, recWord);
632 if (ptr->type != XML_TEXT_NODE)
634 recWord->term_buf = (const char *)ptr->content;
635 recWord->term_len = XML_STRLEN(ptr->content);
636 (*ctrl->tokenAdd)(recWord);
640 /* Alvis style indexing */
641 static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
642 xmlNodePtr ptr, RecWord *recWord)
644 for(; ptr; ptr = ptr->next)
646 index_node(tinfo, ctrl, ptr->children, recWord);
647 if (ptr->type != XML_ELEMENT_NODE || !ptr->ns ||
648 XML_STRCMP(ptr->ns->href, zebra_xslt_ns))
650 if (!XML_STRCMP(ptr->name, "index"))
652 const char *name_str = 0;
653 const char *type_str = 0;
654 const char *xpath_str = 0;
655 struct _xmlAttr *attr;
656 for (attr = ptr->properties; attr; attr = attr->next)
658 if (attr_content(attr, "name", &name_str))
660 else if (attr_content(attr, "xpath", &xpath_str))
662 else if (attr_content(attr, "type", &type_str))
665 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
667 tinfo->fname, attr->name);
671 int prev_type = recWord->index_type; /* save default type */
673 if (type_str && *type_str)
674 recWord->index_type = *type_str; /* type was given */
675 recWord->index_name = name_str;
676 index_cdata(tinfo, ctrl, ptr->children, recWord);
678 recWord->index_type = prev_type; /* restore it again */
684 /* Alvis style indexing */
685 static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
686 xmlNodePtr ptr, RecWord *recWord)
688 const char *type_str = "update";
690 if (ptr && ptr->type == XML_ELEMENT_NODE && ptr->ns &&
691 !XML_STRCMP(ptr->ns->href, zebra_xslt_ns)
692 && !XML_STRCMP(ptr->name, "record"))
694 const char *id_str = 0;
695 const char *rank_str = 0;
696 struct _xmlAttr *attr;
697 for (attr = ptr->properties; attr; attr = attr->next)
699 if (attr_content(attr, "type", &type_str))
701 else if (attr_content(attr, "id", &id_str))
703 else if (attr_content(attr, "rank", &rank_str))
706 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
708 tinfo->fname, attr->name);
711 sscanf(id_str, "%255s", ctrl->match_criteria);
714 ctrl->staticrank = atozint(rank_str);
718 if (!strcmp("update", type_str))
719 index_node(tinfo, ctrl, ptr, recWord);
720 else if (!strcmp("delete", type_str))
721 yaz_log(YLOG_WARN, "dom filter delete: to be implemented");
723 yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'",
728 /* Alvis style indexing */
729 static void extract_doc_alvis(struct filter_info *tinfo,
730 struct recExtractCtrl *recctr,
739 (*recctr->init)(recctr, &recWord);
741 if (recctr->flagShowRecords){
742 xmlDocDumpMemory(doc, &buf_out, &len_out);
743 fwrite(buf_out, len_out, 1, stdout);
746 root_ptr = xmlDocGetRootElement(doc);
748 index_record(tinfo, recctr, root_ptr, &recWord);
750 yaz_log(YLOG_WARN, "No root for index XML record");
755 /* DOM filter style indexing */
756 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
757 xmlChar **dst_content)
759 if (0 == XML_STRCMP(attr->name, name) && attr->children
760 && attr->children->type == XML_TEXT_NODE)
762 *dst_content = (attr->children->content);
768 /* DOM filter style indexing */
769 /* #define ZEBRA_XSLT_NS "http://indexdata.com/zebra-2.0" */
770 /* static const char *zebra_xslt_ns = ZEBRA_XSLT_NS; */
772 /* DOM filter style indexing */
773 #define ZEBRA_PI_NAME "zebra-2.0"
774 static const char *zebra_pi_name = ZEBRA_PI_NAME;
777 /* DOM filter style indexing */
778 void index_value_of(struct filter_info *tinfo,
779 struct recExtractCtrl *recctr,
783 xmlChar *text = xmlNodeGetContent(node);
785 xmlChar *look = index_p;
792 /* parsing all index name/type pairs - may not start with ' ' or ':' */
793 while (*look && ' ' != *look && ':' != *look){
795 /* setting name and type to zero */
799 /* parsing one index name */
801 while (*look && ':' != *look && ' ' != *look){
805 strncpy((char *)index, (const char *)bval, eval - bval);
806 index[eval - bval] = '\0';
809 /* parsing one index type, if existing */
814 while (*look && ' ' != *look){
818 strncpy((char *)type, (const char *)bval, eval - bval);
819 type[eval - bval] = '\0';
822 printf("INDEX '%s:%s' '%s'\n", index, type, text);
824 if (*look && ' ' == *look && *(look+1)){
831 /* //recWord->term_buf = (const char *)ptr->content; */
832 /* //recWord->term_len = XML_STRLEN(ptr->content); */
833 /* // if (type_str && *type_str) */
834 /* // recWord->index_type = *type_str; /\* type was given *\/ */
835 /* // recWord->index_name = name_str; */
836 /* // recWord->index_type = prev_type; /\* restore it again *\/ */
840 /* DOM filter style indexing */
841 void set_record_info(struct filter_info *tinfo,
842 struct recExtractCtrl *recctr,
847 printf("RECORD id=%s rank=%s action=%s\n", id_p, rank_p, action_p);
851 /* DOM filter style indexing */
/* Handle an element that lives in the zebra XSLT namespace.
 *
 * Only element nodes whose namespace href equals zebra_xslt_ns are
 * considered:
 *   <index name="...">   the element is passed to index_value_of() together
 *                        with the value of its name attribute;
 *   <record id= rank= action=>
 *                        the three attribute values are collected and passed
 *                        to set_record_info();
 *   anything else        is reported on stdout as a bad element.
 * Unknown attributes are reported on stdout as well.
 *
 * NOTE(review): libxml2 documents xmlGetNodePath() as returning a newly
 * allocated string owned by the caller; every call below passes the result
 * straight to printf(), so it is presumably leaked - confirm and release it
 * with xmlFree(). */
852 void process_xml_element_zebra_node(struct filter_info *tinfo,
853 struct recExtractCtrl *recctr,
856 if (node->type == XML_ELEMENT_NODE
857 && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_xslt_ns)){
859 if (0 == XML_STRCMP(node->name, "index")){
860 xmlChar *index_p = 0;
862 struct _xmlAttr *attr;
863 for (attr = node->properties; attr; attr = attr->next){
864 if (attr_content_xml(attr, "name", &index_p)){
865 index_value_of(tinfo, recctr, node, index_p);
868 // printf("%s: dom filter: s% bad attribute %s",
869 // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
870 printf("dom filter: %s bad attribute @%s, expected @name\n",
871 xmlGetNodePath(node), attr->name);
874 else if (0 == XML_STRCMP(node->name, "record")){
877 xmlChar *action_p = 0;
879 struct _xmlAttr *attr;
880 for (attr = node->properties; attr; attr = attr->next){
881 if (attr_content_xml(attr, "id", &id_p))
883 else if (attr_content_xml(attr, "rank", &rank_p))
/* The attribute is spelled "action", matching the diagnostic below; the
 * earlier spelling "acton" meant a real action attribute was never captured
 * and was reported as a bad attribute instead. */
885 else if (attr_content_xml(attr, "action", &action_p))
888 // printf("%s: dom filter: s% bad attribute %s",
889 // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
890 printf("dom filter: %s bad attribute @%s,"
891 " expected @id|@rank|@action\n",
892 xmlGetNodePath(node), attr->name);
894 if (action_p && 0 != strcmp("update", (const char *)action_p))
895 printf("dom filter: %s attribute @%s,"
896 " only implemented '@action=\"update\"\n",
897 xmlGetNodePath(node), attr->name);
901 set_record_info(tinfo, recctr, id_p, rank_p, action_p);
903 // printf("%s: dom filter: s% bad attribute %s",
904 // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
905 printf("dom filter: %s bad element <%s>,"
906 " expected <record>|<index> in namespace '%s'\n",
907 xmlGetNodePath(node), node->name, zebra_xslt_ns);
914 /* DOM filter style indexing */
915 void process_xml_pi_node(struct filter_info *tinfo,
916 struct recExtractCtrl *recctr,
921 /* printf("PI %s\n", xmlGetNodePath(node)); */
923 /* if right PI name, continue parsing PI */
924 if (0 == strcmp(zebra_pi_name, (const char *)node->name)){
925 xmlChar *pi_p = node->content;
926 xmlChar *look = pi_p;
931 /* parsing PI record instructions */
932 if (0 == strncmp((const char *)look, "record", 6)){
944 while (*look && ' ' == *look && *(look+1))
947 /* parse possible id */
948 if (*look && 0 == strncmp((const char *)look, "id=", 3)){
951 while (*look && ' ' != *look)
954 strncpy((char *)id, (const char *)bval, eval - bval);
955 id[eval - bval] = '\0';
959 while (*look && ' ' == *look && *(look+1))
962 /* parse possible rank */
963 if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
966 while (*look && ' ' != *look)
969 strncpy((char *)rank, (const char *)bval, eval - bval);
970 rank[eval - bval] = '\0';
974 while (*look && ' ' == *look && *(look+1))
977 if (look && '\0' != *look){
978 printf ("ERROR %s: content '%s'; can not parse '%s'\n",
979 xmlGetNodePath(node), pi_p, look);
981 /* set_record_info(id, rank, action); */
982 set_record_info(tinfo, recctr, id, rank, 0);
987 /* parsing index instruction */
988 else if (0 == strncmp((const char *)look, "index", 5)){
992 while (*look && ' ' == *look && *(look+1))
995 /* export index instructions to outside */
998 /* neither record nor index */
1001 printf ("ERROR %s: content '%s'; can not parse '%s'\n",
1002 xmlGetNodePath(node), pi_p, look);
1007 /* DOM filter style indexing */
1008 void process_xml_element_node(struct filter_info *tinfo,
1009 struct recExtractCtrl *recctr,
1012 /* remember indexing instruction from PI to next element node */
1013 xmlChar *index_p = 0;
1015 /* printf("ELEM %s\n", xmlGetNodePath(node)); */
1017 /* check if we are an element node in the special zebra namespace
1018 and either set record data or index value-of node content*/
1019 process_xml_element_zebra_node(tinfo, recctr, node);
1021 /* loop through kid nodes */
1022 for (node = node->children; node; node = node->next)
1024 /* check and set PI record and index instructions */
1025 if (node->type == XML_PI_NODE){
1026 process_xml_pi_node(tinfo, recctr, node, &index_p);
1028 else if (node->type == XML_ELEMENT_NODE){
1029 /* if there was a PI index instruction before this element node */
1031 index_value_of(tinfo, recctr, node, index_p);
1034 process_xml_element_node(tinfo, recctr, node);
1045 /* DOM filter style indexing */
1046 void extract_dom_doc_node(struct filter_info *tinfo,
1047 struct recExtractCtrl *recctr,
1050 printf("DOC %s\n", xmlGetNodePath((xmlNodePtr)doc));
1052 process_xml_element_node(tinfo, recctr, (xmlNodePtr)doc);
1058 static int convert_extract_doc(struct filter_info *tinfo,
1059 struct filter_input *input,
1060 struct recExtractCtrl *p,
1064 /* RecWord recWord; */
1067 const char *params[10];
1068 xsltStylesheetPtr last_xsp = 0;
1069 xmlDocPtr store_doc = 0;
1072 set_param_str(params, "schema", zebra_xslt_ns, tinfo->odr_record);
1074 /* input conversion */
1075 perform_convert(tinfo, input->convert, params, &doc, 0);
1079 /* store conversion */
1080 store_doc = xmlCopyDoc(doc, 1);
1081 perform_convert(tinfo, tinfo->store->convert,
1082 params, &store_doc, &last_xsp);
1086 xsltSaveResultToString(&buf_out, &len_out,
1087 store_doc ? store_doc : doc, last_xsp);
1089 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1090 if (p->flagShowRecords)
1091 fwrite(buf_out, len_out, 1, stdout);
1092 (*p->setStoreData)(p, buf_out, len_out);
1096 xmlFreeDoc(store_doc);
1098 /* extract conversion */
1099 perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1101 /* finally, do the indexing */
1103 extract_dom_doc_node(tinfo, p, doc);
1104 extract_doc_alvis(tinfo, p, doc);
1108 return RECCTRL_EXTRACT_OK;
1111 static int extract_xml_split(struct filter_info *tinfo,
1112 struct filter_input *input,
1113 struct recExtractCtrl *p)
1117 if (p->first_record)
1119 if (input->u.xmlreader.reader)
1120 xmlFreeTextReader(input->u.xmlreader.reader);
1121 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1122 p /* I/O handler */,
1128 if (!input->u.xmlreader.reader)
1129 return RECCTRL_EXTRACT_ERROR_GENERIC;
1131 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1134 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1135 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1136 if (type == XML_READER_TYPE_ELEMENT &&
1137 input->u.xmlreader.split_level == depth)
1139 xmlNodePtr ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1142 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1143 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1145 xmlDocSetRootElement(doc, ptr2);
1147 return convert_extract_doc(tinfo, input, p, doc);
1151 xmlFreeTextReader(input->u.xmlreader.reader);
1152 input->u.xmlreader.reader = 0;
1153 return RECCTRL_EXTRACT_ERROR_GENERIC;
1156 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1158 xmlFreeTextReader(input->u.xmlreader.reader);
1159 input->u.xmlreader.reader = 0;
1160 return RECCTRL_EXTRACT_EOF;
1163 static int extract_xml_full(struct filter_info *tinfo,
1164 struct filter_input *input,
1165 struct recExtractCtrl *p)
1167 if (p->first_record) /* only one record per stream */
1169 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */,
1172 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1175 return RECCTRL_EXTRACT_ERROR_GENERIC;
1177 return convert_extract_doc(tinfo, input, p, doc);
1180 return RECCTRL_EXTRACT_EOF;
/* Read one ISO2709 (MARC) record from the input stream and index it.
 *
 * Steps visible in this excerpt:
 *   1. read the 5-byte record-length field; a short read means end of input;
 *   2. while the first byte is not a decimal digit, log it as a bad byte,
 *      shift the window and read one more byte (resynchronisation);
 *   3. decode the length with atoi_n() and reject anything below 25;
 *   4. read the remaining record_length-5 bytes;
 *   5. decode with yaz_marc_read_iso2709() and treat a result shorter than
 *      record_length as a parse failure;
 *   6. serialise the record into an XML node, wrap it in a new document and
 *      hand it to convert_extract_doc().
 *
 * Returns RECCTRL_EXTRACT_EOF, RECCTRL_EXTRACT_ERROR_GENERIC, or whatever
 * convert_extract_doc() returns.
 * NOTE(review): the declaration of buf (and therefore its capacity) is not
 * visible here - confirm it can hold the largest 5-digit record length.
 * NOTE(review): the trailing RECCTRL_EXTRACT_OK return follows an
 * unconditional return in this excerpt; presumably it belongs to a
 * conditional-compilation branch that is not shown. */
1183 static int extract_iso2709(struct filter_info *tinfo,
1184 struct filter_input *input,
1185 struct recExtractCtrl *p)
1191 if (p->stream->readf(p->stream, buf, 5) != 5)
1192 return RECCTRL_EXTRACT_EOF;
1193 while (*buf < '0' || *buf > '9')
1197 yaz_log(YLOG_WARN, "MARC: Skipping bad byte %d (0x%02X)",
1198 *buf & 0xff, *buf & 0xff);
1199 for (i = 0; i<4; i++)
1202 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1203 return RECCTRL_EXTRACT_EOF;
1205 record_length = atoi_n (buf, 5);
1206 if (record_length < 25)
1208 yaz_log (YLOG_WARN, "MARC record length < 25, is %d", record_length);
1209 return RECCTRL_EXTRACT_ERROR_GENERIC;
1211 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1212 if (read_bytes < record_length-5)
1214 yaz_log (YLOG_WARN, "Couldn't read whole MARC record");
1215 return RECCTRL_EXTRACT_ERROR_GENERIC;
1217 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1218 if (r < record_length)
1220 yaz_log (YLOG_WARN, "Parsing of MARC record failed r=%d length=%d",
1222 return RECCTRL_EXTRACT_ERROR_GENERIC;
1228 yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1229 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1230 xmlDocSetRootElement(rdoc, root_ptr);
1231 return convert_extract_doc(tinfo, input, p, rdoc);
1233 return RECCTRL_EXTRACT_OK;
/* Extraction entry point of the filter.
 *
 * clientData is the struct filter_info for this filter instance.  The
 * per-record ODR memory (odr_record) is reset on every call, then the work
 * is dispatched on the type of the first entry in tinfo->input_list:
 *   DOM_INPUT_XMLREADER, split_level == 0  -> extract_xml_full()
 *   DOM_INPUT_XMLREADER, split_level != 0  -> extract_xml_split()
 *   DOM_INPUT_MARC                         -> extract_iso2709()
 * Every other case visible here yields RECCTRL_EXTRACT_ERROR_GENERIC.
 * NOTE(review): only the head of input_list is consulted in the visible
 * lines; further configured inputs appear to be ignored - confirm. */
1236 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1238 struct filter_info *tinfo = clientData;
1239 struct filter_input *input = tinfo->input_list;
1242 return RECCTRL_EXTRACT_ERROR_GENERIC;
1244 odr_reset(tinfo->odr_record);
1247 case DOM_INPUT_XMLREADER:
1248 if (input->u.xmlreader.split_level == 0)
1249 return extract_xml_full(tinfo, input, p);
1251 return extract_xml_split(tinfo, input, p);
1253 case DOM_INPUT_MARC:
1254 return extract_iso2709(tinfo, input, p);
1256 return RECCTRL_EXTRACT_ERROR_GENERIC;
1259 static int ioread_ret(void *context, char *buffer, int len)
1261 struct recRetrieveCtrl *p = context;
1262 return p->stream->readf(p->stream, buffer, len);
1265 static int ioclose_ret(void *context)
/* Retrieval entry point of the filter.
 *
 * clientData  struct filter_info for this filter instance
 * p           retrieval control block; supplies the request (comp,
 *             input_format, stream, record metadata) and receives the
 *             result (output_format, rec_buf, rec_len) or p->diagnostic
 *
 * Visible flow:
 *   1. take the element set name from p->comp (simple/generic, or
 *      complex with an elementSetName spec);
 *   2. look up the matching <retrieve> configuration;
 *   3. publish record metadata as XSLT parameters: id, filename, rank
 *      (only when staticrank >= 0), schema, score and size;
 *   4. parse the record from p->stream with xmlReadIO();
 *   5. run the retrieve conversion chain;
 *   6. serialise the result and copy it into memory owned by p->odr,
 *      as XML or as SUTRS depending on the requested record syntax;
 *      any other syntax yields YAZ_BIB1_RECORD_SYNTAX_UNSUPP. */
1270 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1272 /* const char *esn = zebra_xslt_ns; */
1273 const char *esn = 0;
1274 const char *params[32];
1275 struct filter_info *tinfo = clientData;
1277 struct filter_retrieve *retrieve;
1278 xsltStylesheetPtr last_xsp = 0;
1282 if (p->comp->which == Z_RecordComp_simple
1283 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1285 esn = p->comp->u.simple->u.generic;
1287 else if (p->comp->which == Z_RecordComp_complex
1288 && p->comp->u.complex->generic->elementSpec
1289 && p->comp->u.complex->generic->elementSpec->which ==
1290 Z_ElementSpec_elementSetName)
1292 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1295 retrieve = lookup_retrieve(tinfo, esn);
1299 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1304 set_param_int(params, "id", p->localno, p->odr);
1306 set_param_str(params, "filename", p->fname, p->odr);
1307 if (p->staticrank >= 0)
1308 set_param_int(params, "rank", p->staticrank, p->odr);
1311 set_param_str(params, "schema", esn, p->odr);
1314 set_param_str(params, "schema", retrieve->name, p->odr);
1315 else if (retrieve->identifier)
1316 set_param_str(params, "schema", retrieve->identifier, p->odr);
1318 set_param_str(params, "schema", "", p->odr);
1321 set_param_int(params, "score", p->score, p->odr);
1322 set_param_int(params, "size", p->recordSize, p->odr);
1324 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1327 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1330 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1334 /* retrieve conversion */
1335 perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1338 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1340 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1346 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1348 xmlDocDumpMemory(doc, &buf_out, &len_out);
1350 p->output_format = VAL_TEXT_XML;
1351 p->rec_len = len_out;
1352 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1353 memcpy(p->rec_buf, buf_out, p->rec_len);
/* Select on the *requested* syntax (input_format), like the XML branch
 * above.  output_format is the field this function fills in, so testing it
 * here made the SUTRS branch depend on whatever the caller left in it. */
1356 else if (p->input_format == VAL_SUTRS)
1362 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1364 xmlDocDumpMemory(doc, &buf_out, &len_out);
1366 p->output_format = VAL_SUTRS;
1367 p->rec_len = len_out;
1368 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1369 memcpy(p->rec_buf, buf_out, p->rec_len);
1375 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1381 static struct recType filter_type = {
1392 #ifdef IDZEBRA_STATIC_DOM
1405 * indent-tabs-mode: nil
1407 * vim: shiftwidth=4 tabstop=8 expandtab