1 /* This file is part of the Zebra server.
2 Copyright (C) Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 #include <yaz/diagbib1.h>
29 #include <yaz/tpath.h>
30 #include <yaz/snprintf.h>
32 #include <libxml/xmlversion.h>
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/xmlreader.h>
37 #include <libxslt/transform.h>
38 #include <libxslt/xsltutils.h>
41 #include <libexslt/exslt.h>
44 #include <idzebra/util.h>
45 #include <idzebra/recctrl.h>
46 #include <yaz/oid_db.h>
48 /* DOM filter style indexing */
/* Namespace URI of the Zebra DOM-filter vocabulary. process_meta() and
 * process_xml_element_zebra_node() compare node->ns->href against
 * zebra_dom_ns, and convert_extract_doc() passes it as the schema parameter. */
49 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
50 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
52 /* DOM filter style indexing */
/* Target name of the processing instructions handled by this filter;
 * process_xml_pi_node() compares node->name against zebra_pi_name. */
53 #define ZEBRA_PI_NAME "zebra-2.0"
54 static const char *zebra_pi_name = ZEBRA_PI_NAME;
/* Members of the conversion-step types (struct headers and braces are not
 * visible in this excerpt).
 * stylesheet / stylesheet_xsp: stylesheet file name from the configuration
 *   and its parsed form; accessed elsewhere as u.xslt.stylesheet and
 *   u.xslt.stylesheet_xsp.
 * which: selects the step kind (convert_xslt_type or convert_meta_type are
 *   the values assigned in parse_convert()).
 * next: links steps into a singly linked chain walked by perform_convert(). */
62 const char *stylesheet;
63 xsltStylesheetPtr stylesheet_xsp;
71 enum convert_type which;
73 struct convert_xslt xslt;
74 struct convert_meta meta;
76 struct convert_s *next;
/* Configuration records for the <extract>, <store> and <retrieve> sections.
 * Each one carries a conversion chain (convert). A retrieve entry
 * additionally has an identifier and a next pointer; lookup_retrieve()
 * matches a requested element set name against identifier and name.
 * NOTE(review): several member lines and the header of the struct that owns
 * the second convert member are not visible in this excerpt -- presumably
 * struct filter_store; confirm. */
79 struct filter_extract {
81 struct convert_s *convert;
85 struct convert_s *convert;
88 struct filter_retrieve {
90 const char *identifier;
91 struct convert_s *convert;
92 struct filter_retrieve *next;
/* Input source kinds passed to new_input() and tested in destroy_dom() and
 * filter_extract(), followed by members of the per-input record.
 * convert: conversion chain run on each parsed input document.
 * reader: libxml2 text reader used by extract_xml_split().
 * input_charset: source character set name of a MARC input.
 * next: links inputs into tinfo->input_list.
 * NOTE(review): the struct header, the union layout and the remaining
 * members are not visible in this excerpt. */
95 #define DOM_INPUT_XMLREADER 1
96 #define DOM_INPUT_MARC 2
100 struct convert_s *convert;
104 xmlTextReaderPtr reader;
108 const char *input_charset;
113 struct filter_input *next;
/* Members of the per-filter state (struct header not visible here).
 * profile_path: search path handed to yaz_filepath_resolve().
 * doc_config: parsed configuration document; kept alive because attribute
 *   strings inside it are referenced by pointer (see parse_dom()).
 * extract / retrieve_list / input_list / store: parsed configuration
 *   sections.
 * record_info_invoked: counter reset per record and incremented by
 *   set_record_info(); zero means the record is skipped. */
119 const char *profile_path;
122 xmlDocPtr doc_config;
123 struct filter_extract *extract;
124 struct filter_retrieve *retrieve_list;
125 struct filter_input *input_list;
126 struct filter_store *store;
127 int record_info_invoked;
/* Convenience wrappers that cast xmlChar strings to char for the C string
 * functions, plus an iteration macro.
 * FOR_EACH_ELEMENT(ptr) advances the caller-supplied ptr variable itself
 * along the sibling chain (ptr ends up NULL after the loop) and runs the
 * statement that follows only for XML_ELEMENT_NODE nodes.
 * NOTE(review): the macro arguments are not parenthesised and the macro ends
 * in a bare if, so an else written after the loop body would bind to it. */
132 #define XML_STRCMP(a,b) strcmp((char*)a, b)
133 #define XML_STRLEN(a) strlen((char*)a)
136 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
/* dom_log: printf-style logging helper used throughout this filter.
 * The prototype carries a printf format attribute (format argument 4,
 * variadic arguments from 5) so the compiler can check call sites.
 * The message is formatted into buf with yaz_vsnprintf() and handed to
 * yaz_log(), prefixed with tinfo->fname or the text none when fname is unset.
 * Two yaz_log() calls are visible: one that also prints xmlGetLineNo(ptr)
 * and one without a line number.
 * NOTE(review): the condition choosing between the two calls, the buf
 * declaration and the va_start/va_end lines are not visible in this
 * excerpt -- presumably the first form is used when ptr is non-NULL. */
138 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
139 const char *fmt, ...)
141 __attribute__ ((format (printf, 4, 5)))
145 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
146 const char *fmt, ...)
152 yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
155 yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
156 xmlGetLineNo(ptr), buf);
160 yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
/* set_param_str: prepare a string-valued parameter for the params array
 * that callers pass on to perform_convert().
 * Allocates strlen(value) + 3 bytes from nmem (two quote characters plus the
 * terminating NUL) and writes value wrapped in single quotes.
 * NOTE(review): value is copied verbatim; a value that itself contains a
 * single quote is not escaped here.
 * NOTE(review): the lines that store name and the quoted string into params
 * are not visible in this excerpt. */
166 static void set_param_str(const char **params, const char *name,
167 const char *value, NMEM nmem)
169 char *quoted = nmem_malloc(nmem, 3 + strlen(value));
170 sprintf(quoted, "'%s'", value);
/* set_param_int: integer counterpart of set_param_str().
 * Allocates a fixed 30-byte buffer from nmem and formats value with
 * ZINT_FORMAT wrapped in single quotes.
 * NOTE(review): the lines that store name and the quoted string into params
 * are not visible in this excerpt. */
178 static void set_param_int(const char **params, const char *name,
179 zint value, NMEM nmem)
181 char *quoted = nmem_malloc(nmem, 30); /* 25 digits enough for 2^64 */
184 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* filter_init: allocate and zero-initialise the per-filter state.
 * The filter_info is obtained with xmalloc() and two NMEM pools are created:
 * nmem_record (reset for every record in filter_extract()) and nmem_config
 * (reset by destroy_dom() when the configuration is dropped).
 * NOTE(review): initialisation of the remaining members and the return
 * statement are not visible in this excerpt; res and recType are not used in
 * the visible lines. */
190 static void *filter_init(Res res, RecType recType)
192 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
194 tinfo->full_name = 0;
195 tinfo->profile_path = 0;
196 tinfo->nmem_record = nmem_create();
197 tinfo->nmem_config = nmem_create();
199 tinfo->retrieve_list = 0;
200 tinfo->input_list = 0;
202 tinfo->doc_config = 0;
203 tinfo->record_info_invoked = 0;
/* attr_content: fetch the text value of an attribute if it has the given
 * name.
 * When attr->name equals name and the first child of the attribute is a text
 * node, *dst_content is set to point at that text inside the libxml2 tree
 * (no copy is made, so the document must outlive the pointer).
 * Callers use the result as a boolean.
 * NOTE(review): the return statements are not visible in this excerpt. */
212 static int attr_content(struct _xmlAttr *attr, const char *name,
213 const char **dst_content)
215 if (!XML_STRCMP(attr->name, name) && attr->children
216 && attr->children->type == XML_TEXT_NODE)
218 *dst_content = (const char *)(attr->children->content);
/* destroy_xsp: free parsed stylesheets held by conversion steps.
 * For a step of kind convert_xslt_type with a non-NULL stylesheet_xsp the
 * stylesheet is released with xsltFreeStylesheet().
 * NOTE(review): the loop that walks the chain starting at c is not visible
 * in this excerpt -- presumably every step reachable via next is visited. */
224 static void destroy_xsp(struct convert_s *c)
228 if (c->which == convert_xslt_type)
230 if (c->u.xslt.stylesheet_xsp)
231 xsltFreeStylesheet(c->u.xslt.stylesheet_xsp);
/* destroy_dom: release everything that parse_dom() built.
 * Visible steps, in order:
 *   - free the stylesheets of the extract and store conversion chains;
 *   - for each input: free the xmlTextReader of a DOM_INPUT_XMLREADER input,
 *     close the iconv handle and destroy the MARC handle in the other
 *     visible case, then free the stylesheets of the input chain;
 *   - free the stylesheets of every retrieve entry;
 *   - free the parsed configuration document;
 *   - reset nmem_config, the pool the list nodes were allocated from.
 * NOTE(review): the guards around tinfo->extract and tinfo->store, the
 * switch header and the second case label are not visible in this excerpt. */
237 static void destroy_dom(struct filter_info *tinfo)
241 destroy_xsp(tinfo->extract->convert);
246 destroy_xsp(tinfo->store->convert);
249 if (tinfo->input_list)
251 struct filter_input *i_ptr;
252 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
256 case DOM_INPUT_XMLREADER:
257 if (i_ptr->u.xmlreader.reader)
258 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
261 yaz_iconv_close(i_ptr->u.marc.iconv);
262 yaz_marc_destroy(i_ptr->u.marc.handle);
265 destroy_xsp(i_ptr->convert);
267 tinfo->input_list = 0;
269 if (tinfo->retrieve_list)
271 struct filter_retrieve *r_ptr;
272 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
273 destroy_xsp(r_ptr->convert);
274 tinfo->retrieve_list = 0;
277 if (tinfo->doc_config)
279 xmlFreeDoc(tinfo->doc_config);
280 tinfo->doc_config = 0;
282 nmem_reset(tinfo->nmem_config);
/* parse_convert: build a chain of conversion steps from configuration nodes.
 * Walks ptr and its following siblings (element nodes only) and handles:
 *   xslt          - requires a stylesheet attribute; the file is located
 *                   with yaz_filepath_resolve() and parsed with
 *                   xsltParseStylesheetFile();
 *   process-meta  - a convert_meta_type step; every attribute on it is
 *                   reported as bad.
 * Step records are allocated from nmem_config. Unknown attributes, unknown
 * elements, a missing stylesheet attribute, an unresolvable path and an
 * unparsable stylesheet are reported through dom_log(YLOG_WARN, ...).
 * NOTE(review): how steps are linked through l and which ZEBRA_RES value
 * each path returns are not visible in this excerpt.
 * NOTE(review): the message for an unknown element only mentions xslt
 * although process-meta is accepted as well. */
285 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
286 struct convert_s **l)
289 FOR_EACH_ELEMENT(ptr) {
290 if (!XML_STRCMP(ptr->name, "xslt"))
292 struct _xmlAttr *attr;
293 struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p));
296 p->which = convert_xslt_type;
297 p->u.xslt.stylesheet = 0;
298 p->u.xslt.stylesheet_xsp = 0;
300 for (attr = ptr->properties; attr; attr = attr->next)
301 if (attr_content(attr, "stylesheet", &p->u.xslt.stylesheet))
305 dom_log(YLOG_WARN, tinfo, ptr,
306 "bad attribute @%s", attr->name);
308 if (p->u.xslt.stylesheet)
310 char tmp_xslt_full_name[1024];
311 if (!yaz_filepath_resolve(p->u.xslt.stylesheet,
316 dom_log(YLOG_WARN, tinfo, 0,
317 "stylesheet %s not found in "
319 p->u.xslt.stylesheet,
320 tinfo->profile_path);
324 p->u.xslt.stylesheet_xsp
325 = xsltParseStylesheetFile((const xmlChar*)
327 if (!p->u.xslt.stylesheet_xsp)
329 dom_log(YLOG_WARN, tinfo, 0,
330 "could not parse xslt stylesheet %s",
337 dom_log(YLOG_WARN, tinfo, ptr,
338 "missing attribute 'stylesheet'");
344 else if (!XML_STRCMP(ptr->name, "process-meta"))
346 struct _xmlAttr *attr;
347 struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p));
350 p->which = convert_meta_type;
352 for (attr = ptr->properties; attr; attr = attr->next)
353 dom_log(YLOG_WARN, tinfo, ptr,
354 "bad attribute @%s", attr->name);
360 dom_log(YLOG_WARN, tinfo, ptr,
361 "bad element '%s', expected <xslt>", ptr->name);
/* process_meta: expand meta elements of the Zebra namespace during
 * retrieval.
 * For an element named meta in namespace zebra_dom_ns that has a name
 * attribute, retctr->special_fetch() is called with XML as input format.
 * The fetched buffer is parsed with xmlParseMemory() and a deep copy of its
 * root element replaces the meta node via xmlReplaceNode(). Both work
 * buffers are destroyed afterwards. Children of node are then processed
 * recursively.
 * NOTE(review): the success test on ret, the release of sub_doc and of the
 * replaced node, and the return value are not visible in this excerpt. */
368 static int process_meta(struct filter_info *tinfo, xmlDocPtr doc, xmlNodePtr node,
369 struct recRetrieveCtrl *retctr)
372 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href &&
373 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
375 if (0 == XML_STRCMP(node->name, "meta"))
377 const char *element_set_name = 0;
379 struct _xmlAttr *attr;
380 for (attr = node->properties; attr; attr = attr->next)
382 if (attr_content(attr, "name", &element_set_name))
386 dom_log(YLOG_WARN, tinfo, node,
387 "bad attribute @%s, expected @name", attr->name);
390 if (element_set_name)
392 WRBUF result = wrbuf_alloc();
393 WRBUF addinfo = wrbuf_alloc();
394 const Odr_oid *input_format = yaz_oid_recsyn_xml;
395 const Odr_oid *output_format = 0;
398 ret = retctr->special_fetch(retctr->handle,
400 input_format, &output_format,
405 xmlParseMemory(wrbuf_buf(result), wrbuf_len(result));
408 xmlNodePtr t = xmlDocGetRootElement(sub_doc);
409 xmlReplaceNode(node, xmlCopyNode(t, 1));
413 wrbuf_destroy(result);
414 wrbuf_destroy(addinfo);
418 for (node = node->children; node; node = node->next)
419 process_meta(tinfo, doc, node, retctr);
/* perform_convert: run a chain of conversion steps over *doc.
 * convert_xslt_type: apply the stylesheet with xsltApplyStylesheet(), note
 *   it in *last_xsp, serialise the result with xsltSaveResultToString() and
 *   parse that text again into *doc. When extctr is given and its
 *   flagShowRecords is set, the stylesheet name and output are logged.
 * convert_meta_type: call process_meta() on the root element, but only when
 *   retctr is given, i.e. on retrieval.
 * NOTE(review): some callers pass 0 for last_xsp; the guard in front of the
 * assignment through last_xsp, the handling of a failed transformation, the
 * freeing of intermediate documents and the return values are not visible in
 * this excerpt. */
423 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
424 struct recExtractCtrl *extctr,
425 struct recRetrieveCtrl *retctr,
426 struct convert_s *convert,
429 xsltStylesheetPtr *last_xsp)
431 for (; convert; convert = convert->next)
433 if (convert->which == convert_xslt_type)
435 xmlChar *buf_out = 0;
437 xmlDocPtr res_doc = xsltApplyStylesheet(convert->u.xslt.stylesheet_xsp,
440 *last_xsp = convert->u.xslt.stylesheet_xsp;
445 /* now saving into buffer and re-reading into DOM to avoid annoying
446 XSLT problem with thrown-out indentation text nodes */
447 xsltSaveResultToString(&buf_out, &len_out, res_doc,
448 convert->u.xslt.stylesheet_xsp);
453 *doc = xmlParseMemory((const char *) buf_out, len_out);
455 /* writing debug info out */
456 if (extctr && extctr->flagShowRecords)
457 yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
458 tinfo->fname ? tinfo->fname : "(none)",
459 convert->u.xslt.stylesheet,
464 else if (convert->which == convert_meta_type)
466 if (retctr) /* only execute meta on retrieval */
468 process_meta(tinfo, *doc, xmlDocGetRootElement(*doc), retctr);
470 /* last stylesheet absent */
/* new_input: append a fresh input record to tinfo->input_list.
 * np is advanced to the next-pointer at the tail of the list, and a record
 * allocated from nmem_config is stored there, so inputs keep their
 * configuration order.
 * NOTE(review): the initialisation of the new record (including how the
 * type argument is stored) and the return statement are not visible in this
 * excerpt. */
479 static struct filter_input *new_input(struct filter_info *tinfo, int type)
481 struct filter_input *p;
482 struct filter_input **np = &tinfo->input_list;
483 for (;*np; np = &(*np)->next)
485 p = *np = nmem_malloc(tinfo->nmem_config, sizeof(*p));
/* parse_input: interpret the children of an input configuration element.
 * marc: optional inputcharset attribute (default marc-8). A converter to
 *   utf-8 is opened with yaz_iconv_open(); a new DOM_INPUT_MARC input gets a
 *   MARC handle wired to that converter, and a conversion chain is parsed
 *   via parse_convert().
 * xmlreader: optional level attribute, converted with atoi() into
 *   split_level (0 by default); a new DOM_INPUT_XMLREADER input is created
 *   and a conversion chain is parsed via parse_convert().
 * Anything else is reported as a bad element.
 * NOTE(review): the warning for a failed yaz_iconv_open() names the
 * attribute @charset while the attribute actually read is inputcharset.
 * NOTE(review): the guard in front of atoi(level_str), the arguments not
 * shown for parse_convert(), and the return values are not visible in this
 * excerpt; syntax and name are unused in the visible lines. */
494 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
495 const char *syntax, const char *name)
497 FOR_EACH_ELEMENT(ptr) {
498 if (!XML_STRCMP(ptr->name, "marc"))
500 yaz_iconv_t iconv = 0;
501 const char *input_charset = "marc-8";
502 struct _xmlAttr *attr;
504 for (attr = ptr->properties; attr; attr = attr->next)
506 if (attr_content(attr, "inputcharset", &input_charset))
510 dom_log(YLOG_WARN, tinfo, ptr,
511 "bad attribute @%s, expected @inputcharset",
515 iconv = yaz_iconv_open("utf-8", input_charset);
518 dom_log(YLOG_WARN, tinfo, ptr,
519 "unsupported @charset '%s'", input_charset);
524 struct filter_input *p
525 = new_input(tinfo, DOM_INPUT_MARC);
526 p->u.marc.handle = yaz_marc_create();
527 p->u.marc.iconv = iconv;
529 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
533 parse_convert(tinfo, ptr, &p->convert);
538 else if (!XML_STRCMP(ptr->name, "xmlreader"))
540 struct filter_input *p
541 = new_input(tinfo, DOM_INPUT_XMLREADER);
542 struct _xmlAttr *attr;
543 const char *level_str = 0;
545 p->u.xmlreader.split_level = 0;
546 p->u.xmlreader.reader = 0;
548 for (attr = ptr->properties; attr; attr = attr->next)
550 if (attr_content(attr, "level", &level_str))
554 dom_log(YLOG_WARN, tinfo, ptr,
555 "bad attribute @%s, expected @level",
560 p->u.xmlreader.split_level = atoi(level_str);
564 parse_convert(tinfo, ptr, &p->convert);
569 dom_log(YLOG_WARN, tinfo, ptr,
570 "bad element <%s>, expected <marc>|<xmlreader>",
/* parse_dom: load and interpret the filter configuration file.
 * fname is copied into nmem_config and resolved against profile_path with
 * yaz_filepath_resolve(); the resolved name (or fname itself when resolution
 * fails) becomes full_name and is parsed with xmlParseFile(). The document
 * is kept in tinfo->doc_config because attribute values are referenced by
 * pointer rather than copied.
 * The root element must be named dom. Its element children are dispatched
 * by name:
 *   extract  - name attribute, then a conversion chain;
 *   retrieve - identifier and name attributes, then a conversion chain,
 *              entered into retrieve_list;
 *   store    - a conversion chain;
 *   input    - syntax and name attributes, children handled by
 *              parse_input().
 * Unknown attributes and elements are reported via dom_log(YLOG_WARN, ...).
 * If no input was configured, a default DOM_INPUT_XMLREADER input with
 * split_level 0 is created.
 * NOTE(review): the lines that descend from the root element to its
 * children, that attach the extract/store/retrieve records to tinfo, and the
 * return values are not visible in this excerpt. The bare XML lines below
 * are remnants of example comments whose delimiters are not visible here. */
578 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
580 char tmp_full_name[1024];
584 tinfo->fname = nmem_strdup(tinfo->nmem_config, fname);
586 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
587 NULL, tmp_full_name))
588 tinfo->full_name = nmem_strdup(tinfo->nmem_config, tmp_full_name);
590 tinfo->full_name = nmem_strdup(tinfo->nmem_config, tinfo->fname);
592 yaz_log(YLOG_LOG, "%s dom filter: "
593 "loading config file %s", tinfo->fname, tinfo->full_name);
595 doc = xmlParseFile(tinfo->full_name);
598 yaz_log(YLOG_WARN, "%s: dom filter: "
599 "failed to parse config file %s",
600 tinfo->fname, tinfo->full_name);
603 /* save because we store ptrs to the content */
604 tinfo->doc_config = doc;
606 ptr = xmlDocGetRootElement(doc);
607 if (!ptr || ptr->type != XML_ELEMENT_NODE
608 || XML_STRCMP(ptr->name, "dom"))
610 dom_log(YLOG_WARN, tinfo, ptr,
611 "bad root element <%s>, expected root element <dom>",
617 FOR_EACH_ELEMENT(ptr) {
618 if (!XML_STRCMP(ptr->name, "extract"))
621 <extract name="index">
622 <xslt stylesheet="first.xsl"/>
623 <xslt stylesheet="second.xsl"/>
626 struct _xmlAttr *attr;
627 struct filter_extract *f =
628 nmem_malloc(tinfo->nmem_config, sizeof(*f));
633 for (attr = ptr->properties; attr; attr = attr->next)
635 if (attr_content(attr, "name", &f->name))
639 dom_log(YLOG_WARN, tinfo, ptr,
640 "bad attribute @%s, expected @name",
644 parse_convert(tinfo, ptr->children, &f->convert);
646 else if (!XML_STRCMP(ptr->name, "retrieve"))
650 <xslt stylesheet="some.xsl"/>
651 <xslt stylesheet="some.xsl"/>
654 struct _xmlAttr *attr;
655 struct filter_retrieve **fp = &tinfo->retrieve_list;
656 struct filter_retrieve *f =
657 nmem_malloc(tinfo->nmem_config, sizeof(*f));
668 for (attr = ptr->properties; attr; attr = attr->next)
670 if (attr_content(attr, "identifier",
673 else if (attr_content(attr, "name", &f->name))
677 dom_log(YLOG_WARN, tinfo, ptr,
678 "bad attribute @%s, expected @identifier|@name",
682 parse_convert(tinfo, ptr->children, &f->convert);
684 else if (!XML_STRCMP(ptr->name, "store"))
688 <xslt stylesheet="some.xsl"/>
689 <xslt stylesheet="some.xsl"/>
692 struct filter_store *f =
693 nmem_malloc(tinfo->nmem_config, sizeof(*f));
697 parse_convert(tinfo, ptr->children, &f->convert);
699 else if (!XML_STRCMP(ptr->name, "input"))
703 <xmlreader level="1"/>
705 <input syntax="usmarc">
706 <marc inputcharset="marc-8"/>
709 struct _xmlAttr *attr;
710 const char *syntax = 0;
711 const char *name = 0;
712 for (attr = ptr->properties; attr; attr = attr->next)
714 if (attr_content(attr, "syntax", &syntax))
716 else if (attr_content(attr, "name", &name))
720 dom_log(YLOG_WARN, tinfo, ptr,
721 "bad attribute @%s, expected @syntax|@name",
725 parse_input(tinfo, ptr->children, syntax, name);
729 dom_log(YLOG_WARN, tinfo, ptr,
731 "expected <extract>|<input>|<retrieve>|<store>",
736 if (!tinfo->input_list)
738 struct filter_input *p
739 = new_input(tinfo, DOM_INPUT_XMLREADER);
740 p->u.xmlreader.split_level = 0;
741 p->u.xmlreader.reader = 0;
/* lookup_retrieve: find the retrieve configuration for an element set name.
 * Entries of tinfo->retrieve_list are scanned in order; an entry matches
 * when its identifier or its name equals est (identifier is tested first).
 * Per the comment below, the first entry is used when no est is supplied.
 * NOTE(review): the second parameter line, the test for a missing est and
 * the return statements are not visible in this excerpt. */
746 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
749 struct filter_retrieve *f = tinfo->retrieve_list;
751 /* return first schema if no est is provided */
754 for (; f; f = f->next)
756 /* find requested schema */
759 if (f->identifier && !strcmp(f->identifier, est))
761 if (f->name && !strcmp(f->name, est))
/* filter_config: configure the filter from its argument string, which is the
 * name of the configuration file.
 * A warning is logged when a config file is required but missing. When args
 * equals the file name already loaded (tinfo->fname), that case is detected
 * separately. Otherwise profilePath is read from res and the file is parsed
 * with parse_dom().
 * NOTE(review): the test that triggers the warning, the action taken for an
 * unchanged file name and any teardown of a previous configuration are not
 * visible in this excerpt. */
768 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
770 struct filter_info *tinfo = clientData;
773 yaz_log(YLOG_WARN, "dom filter: need config file");
777 if (tinfo->fname && !strcmp(args, tinfo->fname))
780 tinfo->profile_path = res_get(res, "profilePath");
783 return parse_dom(tinfo, args);
/* filter_destroy: tear down the filter state created by filter_init().
 * Both NMEM pools are destroyed.
 * NOTE(review): any call that releases the parsed configuration and the
 * filter_info itself is not visible in this excerpt. */
786 static void filter_destroy(void *clientData)
788 struct filter_info *tinfo = clientData;
790 nmem_destroy(tinfo->nmem_config);
791 nmem_destroy(tinfo->nmem_record);
/* ioread_ex: libxml2 read callback for extraction.
 * context is the recExtractCtrl handed to xmlReaderForIO() / xmlReadIO();
 * the request is forwarded to the readf function of its stream and the
 * result of readf is returned unchanged. */
795 static int ioread_ex(void *context, char *buffer, int len)
797 struct recExtractCtrl *p = context;
798 return p->stream->readf(p->stream, buffer, len);
/* ioclose_ex: libxml2 close callback paired with ioread_ex().
 * NOTE(review): the body is not visible in this excerpt. */
801 static int ioclose_ex(void *context)
808 /* DOM filter style indexing */
/* index_value_of: index the text content of node under one or more indexes.
 * index_p is a space separated list of name or name:type items. Work is only
 * done when exactly one record element has been seen
 * (record_info_invoked == 1). The node text from xmlNodeGetContent() is used
 * as the term for every item. Before each item recword->seqno is rewound to
 * the value it had on entry, so all items share sequence numbers; the
 * highest seqno reached by any item is written back at the end.
 * Each item is handed to extctr->tokenAdd().
 * NOTE(review): the declarations and sizes of the index and type buffers,
 * any bound check before the strncpy() calls, and the release of text are
 * not visible in this excerpt. */
809 static void index_value_of(struct filter_info *tinfo,
810 struct recExtractCtrl *extctr,
815 if (tinfo->record_info_invoked == 1)
817 xmlChar *text = xmlNodeGetContent(node);
818 size_t text_len = strlen((const char *)text);
820 /* if there is no text, we do not need to proceed */
823 /* keep seqno base so that all text will have
824 identical seqno's for multiple fields , e.g
825 <z:index name="title:w any:w title:p">.. */
827 zint seqno_base = recword->seqno;
828 zint seqno_max = recword->seqno;
831 const char *look = index_p;
838 /* assigning text to be indexed */
839 recword->term_buf = (const char *)text;
840 recword->term_len = text_len;
842 /* parsing all index name/type pairs */
843 /* may not start with ' ' or ':' */
844 while (*look && ' ' != *look && ':' != *look)
846 /* setting name and type to zero */
850 /* parsing one index name */
852 while (*look && ':' != *look && ' ' != *look)
857 strncpy((char *)index, (const char *)bval, eval - bval);
858 index[eval - bval] = '\0';
861 /* parsing one index type, if existing */
867 while (*look && ' ' != *look)
872 strncpy((char *)type, (const char *)bval, eval - bval);
873 type[eval - bval] = '\0';
876 /* actually indexing the text given */
878 recword->seqno = seqno_base;
879 recword->index_name = (const char *)index;
881 recword->index_type = (const char *) type;
883 /* writing debug out */
884 if (extctr->flagShowRecords)
885 dom_log(YLOG_LOG, tinfo, 0,
886 "INDEX '%s:%s' '%s'",
887 (const char *) index,
889 (const char *) text);
891 (extctr->tokenAdd)(recword);
893 if (seqno_max < recword->seqno)
894 seqno_max = recword->seqno;
896 /* eat whitespaces */
897 if (*look && ' ' == *look)
902 recword->seqno = seqno_max;
909 /* DOM filter style indexing */
/* set_record_info: apply record-level instructions to the extract control.
 * id_p:   copied into extctr->match_criteria, truncated to fit and always
 *         NUL-terminated.
 * rank_p: when non-empty, parsed with atozint() into extctr->staticrank.
 * type_p: when non-empty, mapped to an action: insert, delete, replace,
 *         update or adelete. Any other value is logged as bad and the
 *         default action_update is used.
 * record_info_invoked is incremented on every call; a warning about
 * multiple record elements is logged when the counter is already 1 on
 * entry, i.e. once, on the second call.
 * NOTE(review): the remaining parameter lines and the guard around the id
 * copy are not visible in this excerpt. The debug log at the top tests
 * extctr for NULL while later lines dereference it without a visible
 * check. */
910 static void set_record_info(struct filter_info *tinfo,
911 struct recExtractCtrl *extctr,
917 /* writing debug info out */
918 if (extctr && extctr->flagShowRecords)
919 dom_log(YLOG_LOG, tinfo, node,
920 "RECORD id=%s rank=%s type=%s",
921 id_p ? (const char *) id_p : "(null)",
922 rank_p ? (const char *) rank_p : "(null)",
923 type_p ? (const char *) type_p : "(null)");
928 size_t l = strlen(id_p);
929 if (l >= sizeof(extctr->match_criteria))
930 l = sizeof(extctr->match_criteria)-1;
931 memcpy(extctr->match_criteria, id_p, l);
932 extctr->match_criteria[l] = '\0';
935 if (rank_p && *rank_p)
936 extctr->staticrank = atozint((const char *)rank_p);
938 if (type_p && *type_p)
940 enum zebra_recctrl_action_t action = action_update;
941 if (!strcmp(type_p, "insert"))
942 action = action_insert;
943 else if (!strcmp(type_p, "delete"))
944 action = action_delete;
945 else if (!strcmp(type_p, "replace"))
946 action = action_replace;
947 else if (!strcmp(type_p, "update"))
948 action = action_update;
949 else if (!strcmp(type_p, "adelete"))
950 action = action_a_delete;
952 dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
953 extctr->action = action;
956 if (tinfo->record_info_invoked == 1)
958 /* warn about multiple only once */
959 dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
961 tinfo->record_info_invoked++;
966 /* DOM filter style indexing */
/* process_xml_element_zebra_node: act on an element in the Zebra namespace.
 * Only element nodes whose namespace href equals zebra_dom_ns are handled:
 *   index  - the name attribute is passed to index_value_of() together with
 *            the node; other attributes are reported as bad;
 *   record - id, rank and type attributes are collected and passed to
 *            set_record_info(); other attributes are reported as bad;
 *   anything else in that namespace is reported as a bad element.
 * NOTE(review): the remaining parameter lines are not visible in this
 * excerpt. */
967 static void process_xml_element_zebra_node(struct filter_info *tinfo,
968 struct recExtractCtrl *extctr,
972 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
973 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
975 if (0 == XML_STRCMP(node->name, "index"))
977 const char *index_p = 0;
979 struct _xmlAttr *attr;
980 for (attr = node->properties; attr; attr = attr->next)
982 if (attr_content(attr, "name", &index_p))
984 index_value_of(tinfo, extctr, recword, node, index_p);
988 dom_log(YLOG_WARN, tinfo, node,
989 "bad attribute @%s, expected @name",
994 else if (0 == XML_STRCMP(node->name, "record"))
996 const char *id_p = 0;
997 const char *rank_p = 0;
998 const char *type_p = 0;
1000 struct _xmlAttr *attr;
1001 for (attr = node->properties; attr; attr = attr->next)
1003 if (attr_content(attr, "id", &id_p))
1005 else if (attr_content(attr, "rank", &rank_p))
1007 else if (attr_content(attr, "type", &type_p))
1011 dom_log(YLOG_WARN, tinfo, node,
1012 "bad attribute @%s, expected @id|@rank|@type",
1016 set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
1020 dom_log(YLOG_WARN, tinfo, node,
1022 " expected <record>|<index> in namespace '%s'",
1023 node->name, zebra_dom_ns);
/* attr_content_pi: parse one name=value item from processing-instruction
 * text.
 * *c_ptr is the current parse position. A match requires the remaining text
 * to be longer than name, to start with name and to have an equals sign
 * directly after it. The value runs up to the next space or end of string;
 * at most value_max - 1 characters are accepted into value.
 * NOTE(review): the copy statement, the NUL termination, the update of
 * *c_ptr and the return values are not visible in this excerpt; callers use
 * the result as a boolean. */
1028 static int attr_content_pi(const char **c_ptr, const char *name,
1029 char *value, size_t value_max)
1031 size_t name_len = strlen(name);
1032 const char *look = *c_ptr;
1035 if (strlen(look) > name_len)
1037 if (look[name_len] == '=' && !memcmp(look, name, name_len))
1041 while (*look && ' ' != *look)
1043 if (i < value_max-1)
1055 /* DOM filter style indexing */
/* process_xml_pi_node: interpret a processing instruction addressed to this
 * filter.
 * Only PIs whose target equals zebra_pi_name are examined. The content must
 * start with one of two keywords:
 *   record - followed by id=, rank= and type= items parsed with
 *            attr_content_pi(); the collected values go to
 *            set_record_info();
 *   index  - the remainder after whitespace is exported through index_pp so
 *            that the caller can apply it to the next element.
 * Unparsable content is reported via dom_log(YLOG_WARN, ...).
 * NOTE(review): the declarations of id, rank and type, the loop around the
 * item parsing and the assignment to *index_pp are not visible in this
 * excerpt. */
1056 static void process_xml_pi_node(struct filter_info *tinfo,
1057 struct recExtractCtrl *extctr,
1059 const char **index_pp)
1061 /* if right PI name, continue parsing PI */
1062 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
1064 xmlChar *pi_p = node->content;
1065 const char *look = (const char *) node->content;
1067 /* parsing PI record instructions */
1068 if (0 == strncmp((const char *)look, "record", 6))
1080 /* eat whitespace */
1081 while (' ' == *look)
1085 if (attr_content_pi(&look, "id", id, sizeof(id)))
1087 else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
1089 else if (attr_content_pi(&look, "type", type, sizeof(type)))
1093 dom_log(YLOG_WARN, tinfo, node,
1094 "content '%s', can not parse '%s'",
1099 set_record_info(tinfo, extctr, node, id, rank, type);
1101 /* parsing index instruction */
1102 else if (0 == strncmp((const char *)look, "index", 5))
1106 /* eat whitespace */
1107 while (*look && ' ' == *look)
1110 /* export index instructions to outside */
1115 dom_log(YLOG_WARN, tinfo, node,
1116 "content '%s', can not parse '%s'",
1122 /* DOM filter style indexing */
/* process_xml_element_node: recursive walk that performs the indexing.
 * The node itself is first offered to process_xml_element_zebra_node().
 * Its children are then visited in document order: a processing instruction
 * may set index_p through process_xml_pi_node(); an element child is
 * indexed with index_value_of() using that pending instruction and is then
 * walked recursively.
 * NOTE(review): the condition guarding the index_value_of() call and any
 * reset of index_p afterwards are not visible in this excerpt. */
1123 static void process_xml_element_node(struct filter_info *tinfo,
1124 struct recExtractCtrl *extctr,
1128 /* remember indexing instruction from PI to next element node */
1129 const char *index_p = 0;
1131 /* check if we are an element node in the special zebra namespace
1132 and either set record data or index value-of node content*/
1133 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1135 /* loop through kid nodes */
1136 for (node = node->children; node; node = node->next)
1138 /* check and set PI record and index instructions */
1139 if (node->type == XML_PI_NODE)
1141 process_xml_pi_node(tinfo, extctr, node, &index_p);
1143 else if (node->type == XML_ELEMENT_NODE)
1145 /* if there was a PI index instruction before this element */
1148 index_value_of(tinfo, extctr, recword, node, index_p);
1151 process_xml_element_node(tinfo, extctr, recword,node);
1159 /* DOM filter style indexing */
/* extract_dom_doc_node: index a whole document.
 * A single recword is initialised through extctr->init and reused for all
 * terms; the document node itself (cast to xmlNodePtr) is the start of the
 * recursive walk in process_xml_element_node().
 * NOTE(review): the doc parameter line and the recword declaration are not
 * visible in this excerpt. */
1160 static void extract_dom_doc_node(struct filter_info *tinfo,
1161 struct recExtractCtrl *extctr,
1164 /* only need to do the initialization once, reuse recword for all terms */
1166 (*extctr->init)(extctr, &recword);
1168 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* convert_extract_doc: run one parsed input document through the
 * configured pipelines and index it.
 * Steps visible here:
 *   1. reset record_info_invoked; return RECCTRL_EXTRACT_SKIP for an empty
 *      document;
 *   2. set the schema parameter to zebra_dom_ns;
 *   3. optional debug dump of the document when flagShowRecords is set;
 *   4. input conversion chain (input->convert);
 *   5. store conversion on a copy of the document; the serialised result
 *      (or the unconverted document when no store document exists) is
 *      handed to p->setStoreData;
 *   6. extract conversion chain, then indexing via extract_dom_doc_node();
 *   7. RECCTRL_EXTRACT_SKIP when no record instruction was seen, otherwise
 *      RECCTRL_EXTRACT_OK.
 * NOTE(review): the result of fopen() is passed to fwrite() on the next
 * line without a visible NULL check; confirm whether that debug code is
 * compiled in. The first debug test checks p for NULL while later lines
 * dereference p unconditionally. Guards around tinfo->store and
 * tinfo->extract, the release of buffers and documents, and the doc
 * parameter line are not visible in this excerpt. */
1172 static int convert_extract_doc(struct filter_info *tinfo,
1173 struct filter_input *input,
1174 struct recExtractCtrl *p,
1179 const char *params[10];
1180 xsltStylesheetPtr last_xsp = 0;
1182 /* per default do not ingest record */
1183 tinfo->record_info_invoked = 0;
1185 /* exit if empty document given */
1187 return RECCTRL_EXTRACT_SKIP;
1189 /* we actually have a document which needs to be processed further */
1191 set_param_str(params, "schema", zebra_dom_ns, tinfo->nmem_record);
1193 if (p && p->flagShowRecords)
1197 xmlDocDumpMemory(doc, &buf_out, &len_out);
1199 FILE *outf = fopen("extract.xml", "w");
1200 fwrite(buf_out, 1, len_out, outf);
1203 yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
1206 if (p->setStoreData)
1208 xmlDocPtr store_doc = 0;
1210 /* input conversion */
1211 perform_convert(tinfo, p, 0, input->convert, params, &doc, 0);
1215 /* store conversion */
1216 store_doc = xmlCopyDoc(doc, 1);
1217 perform_convert(tinfo, p, 0, tinfo->store->convert,
1218 params, &store_doc, &last_xsp);
1221 /* saving either store doc or original doc in case no store doc exists */
1223 xsltSaveResultToString(&buf_out, &len_out,
1224 store_doc ? store_doc : doc, last_xsp);
1226 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1228 if (p->setStoreData)
1229 (*p->setStoreData)(p, buf_out, len_out);
1232 xmlFreeDoc(store_doc);
1236 /* extract conversion */
1237 perform_convert(tinfo, p, 0, tinfo->extract->convert, params, &doc, 0);
1240 /* finally, do the indexing */
1242 extract_dom_doc_node(tinfo, p, doc);
1246 /* there was nothing to index, so there is no inserted/updated record */
1247 if (tinfo->record_info_invoked == 0)
1248 return RECCTRL_EXTRACT_SKIP;
1250 return RECCTRL_EXTRACT_OK;
/* extract_xml_split: read the next record from a stream that holds many
 * records in one XML document.
 * On the first record of a stream any previous reader is freed and a new
 * xmlTextReader is created over the stream callbacks. The reader is then
 * advanced node by node; the first element found at depth split_level is
 * expanded, deep-copied into a fresh document and passed to
 * convert_extract_doc().
 * When expansion fails the reader is freed and
 * RECCTRL_EXTRACT_ERROR_GENERIC is returned. When the reader runs out of
 * nodes it is freed and RECCTRL_EXTRACT_EOF is returned.
 * NOTE(review): the loop header, the remaining xmlReaderForIO() arguments
 * and the release of debug buffers are not visible in this excerpt. */
1253 static int extract_xml_split(struct filter_info *tinfo,
1254 struct filter_input *input,
1255 struct recExtractCtrl *p)
1259 if (p->first_record)
1261 if (input->u.xmlreader.reader)
1262 xmlFreeTextReader(input->u.xmlreader.reader);
1263 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1264 p /* I/O handler */,
1271 if (!input->u.xmlreader.reader)
1272 return RECCTRL_EXTRACT_ERROR_GENERIC;
1274 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1277 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1278 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1280 if (type == XML_READER_TYPE_ELEMENT &&
1281 input->u.xmlreader.split_level == depth)
1285 /* per default do not ingest record */
1286 tinfo->record_info_invoked = 0;
1288 ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1291 /* we have a new document */
1293 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1294 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1296 xmlDocSetRootElement(doc, ptr2);
1298 /* writing debug info out */
1299 if (p->flagShowRecords)
1301 xmlChar *buf_out = 0;
1303 xmlDocDumpMemory(doc, &buf_out, &len_out);
1304 yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1305 tinfo->fname ? tinfo->fname : "(none)",
1306 depth, len_out, buf_out);
1310 return convert_extract_doc(tinfo, input, p, doc);
1314 xmlFreeTextReader(input->u.xmlreader.reader);
1315 input->u.xmlreader.reader = 0;
1316 return RECCTRL_EXTRACT_ERROR_GENERIC;
1319 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1321 xmlFreeTextReader(input->u.xmlreader.reader);
1322 input->u.xmlreader.reader = 0;
1323 return RECCTRL_EXTRACT_EOF;
/* extract_xml_full: treat the whole stream as one XML record.
 * Only the first call for a stream (p->first_record) parses anything: the
 * stream is read with xmlReadIO() and the resulting document goes to
 * convert_extract_doc(). A parse failure yields
 * RECCTRL_EXTRACT_ERROR_GENERIC; every later call yields
 * RECCTRL_EXTRACT_EOF.
 * NOTE(review): the remaining xmlReadIO() arguments and the failure test
 * are not visible in this excerpt. */
1326 static int extract_xml_full(struct filter_info *tinfo,
1327 struct filter_input *input,
1328 struct recExtractCtrl *p)
1330 if (p->first_record) /* only one record per stream */
1332 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1333 p /* I/O handler */,
1341 return RECCTRL_EXTRACT_ERROR_GENERIC;
1343 return convert_extract_doc(tinfo, input, p, doc);
1346 return RECCTRL_EXTRACT_EOF;
/* extract_iso2709: read one ISO2709 (MARC) record from the stream and index
 * it.
 * The first five bytes hold the record length as decimal digits. While the
 * first byte is not a digit it is reported as a bad byte, the buffer is
 * shifted (loop over four positions) and one more byte is read. A length
 * below 25 or a short read of the remaining record_length - 5 bytes is an
 * error. The record is decoded with yaz_marc_read_iso2709(), written out as
 * MARCXML in the MARC21 slim namespace, wrapped in a new document and passed
 * to convert_extract_doc().
 * End of stream while reading the length yields RECCTRL_EXTRACT_EOF.
 * NOTE(review): the declaration and size of buf are not visible in this
 * excerpt, so it cannot be confirmed here that record_length always fits;
 * the body of the shifting loop is not visible either. */
1349 static int extract_iso2709(struct filter_info *tinfo,
1350 struct filter_input *input,
1351 struct recExtractCtrl *p)
1357 if (p->stream->readf(p->stream, buf, 5) != 5)
1358 return RECCTRL_EXTRACT_EOF;
1359 while (*buf < '0' || *buf > '9')
1363 dom_log(YLOG_WARN, tinfo, 0,
1364 "MARC: Skipping bad byte %d (0x%02X)",
1365 *buf & 0xff, *buf & 0xff);
1366 for (i = 0; i < 4; i++)
1369 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1370 return RECCTRL_EXTRACT_EOF;
1372 record_length = atoi_n (buf, 5);
1373 if (record_length < 25)
1375 dom_log(YLOG_WARN, tinfo, 0,
1376 "MARC record length < 25, is %d", record_length);
1377 return RECCTRL_EXTRACT_ERROR_GENERIC;
1379 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1380 if (read_bytes < record_length-5)
1382 dom_log(YLOG_WARN, tinfo, 0,
1383 "couldn't read whole MARC record");
1384 return RECCTRL_EXTRACT_ERROR_GENERIC;
1386 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1387 if (r < record_length)
1389 dom_log (YLOG_WARN, tinfo, 0,
1390 "parsing of MARC record failed r=%d length=%d",
1392 return RECCTRL_EXTRACT_ERROR_GENERIC;
1398 yaz_marc_write_xml(input->u.marc.handle, &root_ptr,
1399 "http://www.loc.gov/MARC21/slim", 0, 0);
1400 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1401 xmlDocSetRootElement(rdoc, root_ptr);
1402 return convert_extract_doc(tinfo, input, p, rdoc);
1404 return RECCTRL_EXTRACT_OK;
/* filter_extract: extraction entry point of the filter.
 * Uses the first configured input. nmem_record is reset for every call.
 * When p->setStoreData is not set the stream is always handled by
 * extract_xml_full(). Otherwise the input kind decides:
 *   DOM_INPUT_XMLREADER - extract_xml_full() for split_level 0, else
 *                         extract_xml_split();
 *   DOM_INPUT_MARC      - extract_iso2709().
 * Any other case yields RECCTRL_EXTRACT_ERROR_GENERIC.
 * NOTE(review): the test that leads to the first error return and the
 * switch header are not visible in this excerpt. */
1407 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1409 struct filter_info *tinfo = clientData;
1410 struct filter_input *input = tinfo->input_list;
1413 return RECCTRL_EXTRACT_ERROR_GENERIC;
1415 nmem_reset(tinfo->nmem_record);
1417 if (p->setStoreData == 0)
1418 return extract_xml_full(tinfo, input, p);
1421 case DOM_INPUT_XMLREADER:
1422 if (input->u.xmlreader.split_level == 0)
1423 return extract_xml_full(tinfo, input, p);
1425 return extract_xml_split(tinfo, input, p);
1427 case DOM_INPUT_MARC:
1428 return extract_iso2709(tinfo, input, p);
1430 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* ioread_ret: libxml2 read callback for retrieval.
 * context is the recRetrieveCtrl handed to xmlReadIO(); the request is
 * forwarded to the readf function of its stream.
 * NOTE(review): the return statement is not visible in this excerpt --
 * presumably r is returned. */
1433 static int ioread_ret(void *context, char *buffer, int len)
1435 struct recRetrieveCtrl *p = context;
1436 int r = p->stream->readf(p->stream, buffer, len);
/* ioclose_ret: libxml2 close callback paired with ioread_ret().
 * NOTE(review): the body is not visible in this excerpt. */
1440 static int ioclose_ret(void *context)
/* filter_retrieve: retrieval entry point of the filter.
 * 1. The element set name (esn) is taken from p->comp, either from a simple
 *    generic element set name or from a complex element spec.
 * 2. lookup_retrieve() selects the matching retrieve configuration; when
 *    none matches, a diagnostic is set with esn as additional info.
 * 3. Stylesheet parameters are prepared from the record control: id,
 *    filename, rank (only when staticrank is not negative), schema, score
 *    and size. All parameter strings are allocated from p->odr->mem.
 * 4. The stored record is parsed with xmlReadIO() (XINCLUDE, NOENT and NONET
 *    options) and run through the retrieve conversion chain.
 * 5. The result is serialised, with xsltSaveResultToString() or
 *    xmlDocDumpMemory(), and copied into ODR memory as XML or SUTRS. Other
 *    requested syntaxes yield YAZ_BIB1_RECORD_SYNTAX_UNSUPP; parse or
 *    conversion failure yields YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS.
 * NOTE(review): the XML branch tests p->input_format while the SUTRS branch
 * tests p->output_format; confirm this asymmetry is intended. The
 * conditions selecting between the schema variants and between the two
 * serialisers, the release of buf_out and doc, and the return value are not
 * visible in this excerpt. */
1445 static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
1447 /* const char *esn = zebra_dom_ns; */
1448 const char *esn = 0;
1449 const char *params[32];
1450 struct filter_info *tinfo = clientData;
1452 struct filter_retrieve *retrieve;
1453 xsltStylesheetPtr last_xsp = 0;
1457 if (p->comp->which == Z_RecordComp_simple
1458 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1460 esn = p->comp->u.simple->u.generic;
1462 else if (p->comp->which == Z_RecordComp_complex
1463 && p->comp->u.complex->generic->elementSpec
1464 && p->comp->u.complex->generic->elementSpec->which ==
1465 Z_ElementSpec_elementSetName)
1467 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1470 retrieve = lookup_retrieve(tinfo, esn);
1474 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1475 p->addinfo = odr_strdup_null(p->odr, esn);
1480 set_param_int(params, "id", p->localno, p->odr->mem);
1482 set_param_str(params, "filename", p->fname, p->odr->mem);
1483 if (p->staticrank >= 0)
1484 set_param_int(params, "rank", p->staticrank, p->odr->mem);
1487 set_param_str(params, "schema", esn, p->odr->mem);
1490 set_param_str(params, "schema", retrieve->name, p->odr->mem);
1491 else if (retrieve->identifier)
1492 set_param_str(params, "schema", retrieve->identifier, p->odr->mem);
1494 set_param_str(params, "schema", "", p->odr->mem);
1497 set_param_int(params, "score", p->score, p->odr->mem);
1498 set_param_int(params, "size", p->recordSize, p->odr->mem);
1500 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1503 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1506 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1510 /* retrieve conversion */
1511 perform_convert(tinfo, 0, p, retrieve->convert, params, &doc, &last_xsp);
1514 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1516 else if (!p->input_format
1517 || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
1523 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1525 xmlDocDumpMemory(doc, &buf_out, &len_out);
1527 p->output_format = yaz_oid_recsyn_xml;
1528 p->rec_len = len_out;
1529 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1530 memcpy(p->rec_buf, buf_out, p->rec_len);
1533 else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
1539 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1541 xmlDocDumpMemory(doc, &buf_out, &len_out);
1543 p->output_format = yaz_oid_recsyn_sutrs;
1544 p->rec_len = len_out;
1545 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1546 memcpy(p->rec_buf, buf_out, p->rec_len);
1552 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* filter_type: the recType descriptor for this filter.
 * NOTE(review): the initialiser entries are not visible in this excerpt --
 * presumably it lists the filter_* entry points defined above; the
 * IDZEBRA_STATIC_DOM conditional that follows presumably selects how the
 * descriptor is exported. Confirm against the full file. */
1558 static struct recType filter_type = {
1569 #if IDZEBRA_STATIC_DOM
1582 * c-file-style: "Stroustrup"
1583 * indent-tabs-mode: nil
1585 * vim: shiftwidth=4 tabstop=8 expandtab