2 /* $Id: mod_dom.c,v 1.31 2007-03-08 17:19:12 marc Exp $
3 Copyright (C) 1995-2007
6 This file is part of the Zebra server.
8 Zebra is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
29 #include <yaz/diagbib1.h>
30 #include <yaz/tpath.h>
31 #include <yaz/snprintf.h>
33 #include <libxml/xmlversion.h>
34 #include <libxml/parser.h>
35 #include <libxml/tree.h>
36 #include <libxml/xmlIO.h>
37 #include <libxml/xmlreader.h>
38 #include <libxslt/transform.h>
39 #include <libxslt/xsltutils.h>
42 #include <libexslt/exslt.h>
45 #include <idzebra/util.h>
46 #include <idzebra/recctrl.h>
48 /* XML namespace URI of the Zebra indexing vocabulary; element nodes are matched against it via node->ns->href */
49 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
50 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
52 /* name of the processing instruction that carries "record" / "index" instructions; compared with the PI node name */
53 #define ZEBRA_PI_NAME "zebra-2.0"
54 static const char *zebra_pi_name = ZEBRA_PI_NAME;
/* Conversion step: stylesheet file name, its parsed libxslt form, and the
   link to the next step of the chain. */
59 const char *stylesheet;
60 xsltStylesheetPtr stylesheet_xsp;
61 struct convert_s *next;
/* Conversion chain parsed from an <extract> configuration element. */
64 struct filter_extract {
66 struct convert_s *convert;
70 struct convert_s *convert;
/* One <retrieve> configuration entry; lookup_retrieve() compares identifier
   (and name) with the requested element set name. Entries are chained via next. */
73 struct filter_retrieve {
75 const char *identifier;
76 struct convert_s *convert;
77 struct filter_retrieve *next;
/* Input kinds passed to new_input() and used as case labels in destroy_dom()
   and filter_extract(). */
80 #define DOM_INPUT_DOM 0
81 #define DOM_INPUT_XMLREADER 1
82 #define DOM_INPUT_MARC 2
86 struct convert_s *convert;
93 xmlTextReaderPtr reader;
97 const char *input_charset;
102 struct filter_input *next;
/* value of the "profilePath" resource, assigned in filter_config() */
108 const char *profile_path;
/* parsed configuration document; kept because attribute strings point into it */
111 xmlDocPtr doc_config;
112 struct filter_extract *extract;
113 struct filter_retrieve *retrieve_list;
114 struct filter_input *input_list;
115 struct filter_store *store;
/* incremented by set_record_info(); reset to 0 before each record is converted */
116 int record_info_invoked;
/* strcmp()/strlen() wrappers that cast their first argument to char*; used
   with libxml2 strings such as ptr->name and attr->name. */
121 #define XML_STRCMP(a,b) strcmp((char*)a, b)
122 #define XML_STRLEN(a) strlen((char*)a)
/* Loop header: advances ptr itself along ->next and executes the statement
   that follows only for nodes of type XML_ELEMENT_NODE. The macro has no
   initialisation clause, so iteration starts at the current value of ptr. */
125 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
/* Logging helper of this filter: formats a printf-style message into buf and
   writes it with yaz_log() at the given level, prefixed by tinfo->fname
   ("none" when fname is unset). One of the two yaz_log() calls additionally
   prints the line number of ptr obtained from xmlGetLineNo(); presumably
   that form is chosen when ptr is non-NULL -- confirm. */
127 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
128 const char *fmt, ...)
/* printf format attribute: argument 4 is the format string, the variadic arguments start at position 5 */
130 __attribute__ ((format (printf, 4, 5)))
134 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
135 const char *fmt, ...)
/* the size passed is one less than the buffer size */
141 yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
144 yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
145 xmlGetLineNo(ptr), buf);
149 yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
/* Builds the single-quoted form of value ('value') in memory taken from odr,
   for use as a stylesheet parameter named name.
   NOTE(review): value is copied verbatim -- a single quote inside value is
   not escaped here, and value is passed to strlen() without a visible NULL
   check. */
155 static void set_param_str(const char **params, const char *name,
156 const char *value, ODR odr)
/* 3 = opening quote + closing quote + terminating NUL */
158 char *quoted = odr_malloc(odr, 3 + strlen(value));
159 sprintf(quoted, "'%s'", value);
/* Integer counterpart of set_param_str(): formats value with ZINT_FORMAT,
   wrapped in single quotes, into a 30-byte buffer taken from odr. */
167 static void set_param_int(const char **params, const char *name,
170 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
173 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocates a filter_info with xmalloc() and puts it into its empty state:
   no names or paths, no parsed configuration, empty retrieve and input lists,
   and two freshly created ODR memory handles. */
179 static void *filter_init(Res res, RecType recType)
181 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
183 tinfo->full_name = 0;
184 tinfo->profile_path = 0;
/* odr_record is reset for every extracted record (see filter_extract) */
185 tinfo->odr_record = odr_createmem(ODR_ENCODE);
/* odr_config holds the configuration structures (reset in destroy_dom) */
186 tinfo->odr_config = odr_createmem(ODR_ENCODE);
188 tinfo->retrieve_list = 0;
189 tinfo->input_list = 0;
191 tinfo->doc_config = 0;
192 tinfo->record_info_invoked = 0;
/* Tests whether attr is called name and has a text node as first child; if
   so, *dst_content is pointed at that text. The pointer refers to memory of
   the XML document itself (no copy is made). Callers use the return value as
   a "matched" flag in if conditions. */
201 static int attr_content(struct _xmlAttr *attr, const char *name,
202 const char **dst_content)
204 if (!XML_STRCMP(attr->name, name) && attr->children
205 && attr->children->type == XML_TEXT_NODE)
207 *dst_content = (const char *)(attr->children->content);
/* Frees the parsed stylesheet attached to conversion step c, when there is
   one, using xsltFreeStylesheet(). */
213 static void destroy_xsp(struct convert_s *c)
217 if (c->stylesheet_xsp)
218 xsltFreeStylesheet(c->stylesheet_xsp);
/* Releases what was built from the configuration file: parsed stylesheets of
   the extract, store, input and retrieve chains, per-input reader / MARC
   resources and the configuration document. The list heads are set to 0 and
   odr_config, from which the list nodes were allocated, is reset. */
223 static void destroy_dom(struct filter_info *tinfo)
227 destroy_xsp(tinfo->extract->convert);
232 destroy_xsp(tinfo->store->convert);
235 if (tinfo->input_list)
237 struct filter_input *i_ptr;
238 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
/* xmlreader inputs own a text reader once extraction has started */
244 case DOM_INPUT_XMLREADER:
245 if (i_ptr->u.xmlreader.reader)
246 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
/* iconv and MARC handles created in parse_input(); presumably the DOM_INPUT_MARC case -- confirm */
249 yaz_iconv_close(i_ptr->u.marc.iconv);
250 yaz_marc_destroy(i_ptr->u.marc.handle);
253 destroy_xsp(i_ptr->convert);
255 tinfo->input_list = 0;
257 if (tinfo->retrieve_list)
259 struct filter_retrieve *r_ptr;
260 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
261 destroy_xsp(r_ptr->convert);
262 tinfo->retrieve_list = 0;
265 if (tinfo->doc_config)
267 xmlFreeDoc(tinfo->doc_config);
268 tinfo->doc_config = 0;
270 odr_reset(tinfo->odr_config);
/* Builds a conversion chain from configuration nodes. Starting at ptr, every
   sibling element is inspected: an <xslt> element yields a convert_s
   allocated from odr_config whose @stylesheet attribute names the stylesheet
   file; the file is resolved with yaz_filepath_resolve() and parsed with
   xsltParseStylesheetFile(). Unknown attributes, a missing @stylesheet, an
   unresolvable or unparsable stylesheet, and elements other than <xslt> are
   reported through dom_log() at YLOG_WARN level. */
273 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
274 struct convert_s **l)
277 FOR_EACH_ELEMENT(ptr) {
278 if (!XML_STRCMP(ptr->name, "xslt"))
280 struct _xmlAttr *attr;
282 = odr_malloc(tinfo->odr_config, sizeof(*p));
286 p->stylesheet_xsp = 0;
/* p->stylesheet ends up pointing into the configuration document */
288 for (attr = ptr->properties; attr; attr = attr->next)
289 if (attr_content(attr, "stylesheet", &p->stylesheet))
293 dom_log(YLOG_WARN, tinfo, ptr,
294 "bad attribute @%s", attr->name);
/* receives the resolved stylesheet path */
298 char tmp_xslt_full_name[1024];
299 if (!yaz_filepath_resolve(p->stylesheet,
304 dom_log(YLOG_WARN, tinfo, 0,
305 "stylesheet %s not found in "
308 tinfo->profile_path);
313 = xsltParseStylesheetFile((const xmlChar*)
315 if (!p->stylesheet_xsp)
317 dom_log(YLOG_WARN, tinfo, 0,
318 "could not parse xslt stylesheet %s",
325 dom_log(YLOG_WARN, tinfo, ptr,
326 "missing attribute 'stylesheet' ");
334 dom_log(YLOG_WARN, tinfo, ptr,
335 "bad element '%s', expected <xslt>", ptr->name);
/* Runs a conversion chain over *doc. For each step the stylesheet is applied
   with xsltApplyStylesheet(), the result is serialised with
   xsltSaveResultToString() and parsed again with xmlParseMemory(), and *doc
   is replaced by the re-parsed document. The stylesheet of the step is
   reported through *last_xsp. Callers pass 0 for extctr (filter_retrieve)
   and for last_xsp (convert_extract_doc); extctr is checked before use
   below, the guard for last_xsp is presumably in place as well -- confirm. */
342 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
343 struct recExtractCtrl *extctr,
344 struct convert_s *convert,
347 xsltStylesheetPtr *last_xsp)
349 for (; convert; convert = convert->next)
351 xmlChar *buf_out = 0;
353 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
356 *last_xsp = convert->stylesheet_xsp;
361 /* now saving into buffer and re-reading into DOM to avoid annoying
362 XSLT problem with thrown-out indentation text nodes */
363 xsltSaveResultToString(&buf_out, &len_out, res_doc,
364 convert->stylesheet_xsp);
369 *doc = xmlParseMemory((const char *) buf_out, len_out);
371 /* writing debug info out */
372 if (extctr && extctr->flagShowRecords)
373 yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
374 tinfo->fname ? tinfo->fname : "(none)",
/* Appends a new filter_input to tinfo->input_list: walks the next-pointers
   to the terminating null link and stores there a node allocated from
   odr_config. */
383 static struct filter_input *new_input(struct filter_info *tinfo, int type)
385 struct filter_input *p;
/* np always addresses the link that would hold the next node */
386 struct filter_input **np = &tinfo->input_list;
387 for (;*np; np = &(*np)->next)
389 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
/* Parses the children of an <input> configuration element. Each sibling
   element starting at ptr creates one filter_input:
     <marc>       DOM_INPUT_MARC, with an iconv handle converting from
                  @inputcharset (default "marc-8") to utf-8
     <xmlreader>  DOM_INPUT_XMLREADER, with @level as split level
     <xslt>       DOM_INPUT_DOM
   Any other element or attribute is reported with a warning. */
398 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
399 const char *syntax, const char *name)
401 FOR_EACH_ELEMENT(ptr) {
402 if (!XML_STRCMP(ptr->name, "marc"))
404 yaz_iconv_t iconv = 0;
405 const char *input_charset = "marc-8";
406 struct _xmlAttr *attr;
408 for (attr = ptr->properties; attr; attr = attr->next)
410 if (attr_content(attr, "inputcharset", &input_charset))
414 dom_log(YLOG_WARN, tinfo, ptr,
415 "bad attribute @%s, expected @inputcharset",
419 iconv = yaz_iconv_open("utf-8", input_charset);
/* NOTE(review): the message below says @charset although the attribute read above is @inputcharset */
422 dom_log(YLOG_WARN, tinfo, ptr,
423 "unsupported @charset '%s'", input_charset);
428 struct filter_input *p
429 = new_input(tinfo, DOM_INPUT_MARC);
430 p->u.marc.handle = yaz_marc_create();
431 p->u.marc.iconv = iconv;
/* attach the character set conversion to the MARC handle */
433 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
437 parse_convert(tinfo, ptr, &p->convert);
442 else if (!XML_STRCMP(ptr->name, "xmlreader"))
444 struct filter_input *p
445 = new_input(tinfo, DOM_INPUT_XMLREADER);
446 struct _xmlAttr *attr;
447 const char *level_str = 0;
/* split level 0 makes filter_extract() read the whole stream as one document */
449 p->u.xmlreader.split_level = 0;
450 p->u.xmlreader.reader = 0;
452 for (attr = ptr->properties; attr; attr = attr->next)
454 if (attr_content(attr, "level", &level_str))
458 dom_log(YLOG_WARN, tinfo, ptr,
459 "bad attribute @%s, expected @level",
464 p->u.xmlreader.split_level = atoi(level_str);
468 parse_convert(tinfo, ptr, &p->convert);
471 else if (!XML_STRCMP(ptr->name, "xslt")){
472 struct filter_input *p
473 = new_input(tinfo, DOM_INPUT_DOM);
474 parse_convert(tinfo, ptr, &p->convert);
479 dom_log(YLOG_WARN, tinfo, ptr,
480 "bad element <%s>, expected <marc>|<xmlreader>|<xslt>",
/* Loads the filter configuration file fname. The name is resolved with
   yaz_filepath_resolve() using tinfo->profile_path (the plain name is used
   when resolution fails), the file is parsed with xmlParseFile(), and the
   <dom> root element is walked: <extract>, <retrieve>, <store> and <input>
   children are turned into the corresponding structures allocated from
   odr_config. Other elements and unexpected attributes are reported with a
   warning. When no <input> was configured a DOM_INPUT_DOM input is added. */
488 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
490 char tmp_full_name[1024];
494 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
496 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
497 NULL, tmp_full_name))
498 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
500 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
502 yaz_log(YLOG_LOG, "%s dom filter: "
503 "loading config file %s", tinfo->fname, tinfo->full_name);
505 doc = xmlParseFile(tinfo->full_name);
508 yaz_log(YLOG_WARN, "%s: dom filter: "
509 "failed to parse config file %s",
510 tinfo->fname, tinfo->full_name);
513 /* save because we store ptrs to the content */
514 tinfo->doc_config = doc;
516 ptr = xmlDocGetRootElement(doc);
517 if (!ptr || ptr->type != XML_ELEMENT_NODE
518 || XML_STRCMP(ptr->name, "dom"))
520 dom_log(YLOG_WARN, tinfo, ptr,
521 "bad root element <%s>, expected root element <dom>",
527 FOR_EACH_ELEMENT(ptr) {
528 if (!XML_STRCMP(ptr->name, "extract"))
531 <extract name="index">
532 <xslt stylesheet="first.xsl"/>
533 <xslt stylesheet="second.xsl"/>
536 struct _xmlAttr *attr;
537 struct filter_extract *f =
538 odr_malloc(tinfo->odr_config, sizeof(*f));
543 for (attr = ptr->properties; attr; attr = attr->next)
545 if (attr_content(attr, "name", &f->name))
549 dom_log(YLOG_WARN, tinfo, ptr,
550 "bad attribute @%s, expected @name",
554 parse_convert(tinfo, ptr->children, &f->convert);
556 else if (!XML_STRCMP(ptr->name, "retrieve"))
560 <xslt stylesheet="some.xsl"/>
561 <xslt stylesheet="some.xsl"/>
564 struct _xmlAttr *attr;
565 struct filter_retrieve **fp = &tinfo->retrieve_list;
566 struct filter_retrieve *f =
567 odr_malloc(tinfo->odr_config, sizeof(*f));
578 for (attr = ptr->properties; attr; attr = attr->next)
580 if (attr_content(attr, "identifier",
583 else if (attr_content(attr, "name", &f->name))
587 dom_log(YLOG_WARN, tinfo, ptr,
588 "bad attribute @%s, expected @identifier|@name",
592 parse_convert(tinfo, ptr->children, &f->convert);
594 else if (!XML_STRCMP(ptr->name, "store"))
598 <xslt stylesheet="some.xsl"/>
599 <xslt stylesheet="some.xsl"/>
602 struct filter_store *f =
603 odr_malloc(tinfo->odr_config, sizeof(*f));
607 parse_convert(tinfo, ptr->children, &f->convert);
609 else if (!XML_STRCMP(ptr->name, "input"))
613 <xmlreader level="1"/>
615 <input syntax="usmarc">
616 <marc inputcharset="marc-8"/>
619 struct _xmlAttr *attr;
620 const char *syntax = 0;
621 const char *name = 0;
622 for (attr = ptr->properties; attr; attr = attr->next)
624 if (attr_content(attr, "syntax", &syntax))
626 else if (attr_content(attr, "name", &name))
630 dom_log(YLOG_WARN, tinfo, ptr,
631 "bad attribute @%s, expected @syntax|@name",
635 parse_input(tinfo, ptr->children, syntax, name);
639 dom_log(YLOG_WARN, tinfo, ptr,
641 "expected <extract>|<input>|<retrieve>|<store>",
647 /* adding an empty DOM dummy type if no <input> list has been defined */
648 if (! tinfo->input_list){
649 struct filter_input *p
650 = new_input(tinfo, DOM_INPUT_DOM);
/* Finds the <retrieve> entry for the element set name est by walking
   tinfo->retrieve_list and comparing est with each entry's identifier and
   name. Without an est the first entry is used (see the comment below). */
656 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
659 struct filter_retrieve *f = tinfo->retrieve_list;
661 /* return first schema if no est is provided */
664 for (; f; f = f->next)
666 /* find requested schema */
669 if (f->identifier && !strcmp(f->identifier, est))
671 if (f->name && !strcmp(f->name, est))
/* Configuration entry point of the filter: args names the configuration
   file. A missing file name is reported with a warning. When args equals the
   file name that is already loaded, the comparison below matches (presumably
   the call then returns without re-parsing -- confirm). Otherwise the
   "profilePath" resource is fetched and the file is parsed by parse_dom(). */
678 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
680 struct filter_info *tinfo = clientData;
683 yaz_log(YLOG_WARN, "dom filter: need config file");
687 if (tinfo->fname && !strcmp(args, tinfo->fname))
690 tinfo->profile_path = res_get(res, "profilePath");
693 return parse_dom(tinfo, args);
/* Tears the filter instance down; both ODR memory handles created in
   filter_init() are destroyed here. */
696 static void filter_destroy(void *clientData)
698 struct filter_info *tinfo = clientData;
700 odr_destroy(tinfo->odr_config);
701 odr_destroy(tinfo->odr_record);
/* libxml2 read callback used during extraction: context is the
   recExtractCtrl, and up to len bytes are fetched through its stream's readf
   function. */
705 static int ioread_ex(void *context, char *buffer, int len)
707 struct recExtractCtrl *p = context;
708 return p->stream->readf(p->stream, buffer, len);
/* matching close callback, passed together with ioread_ex to xmlReaderForIO() and xmlReadIO() */
711 static int ioclose_ex(void *context)
717 /* DOM filter style indexing */
/* Variant of attr_content() delivering the attribute text as xmlChar*:
   when attr is called name and its first child is a text node, *dst_content
   is pointed at that node's content (no copy is made). */
718 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
719 xmlChar **dst_content)
721 if (0 == XML_STRCMP(attr->name, name) && attr->children
722 && attr->children->type == XML_TEXT_NODE)
724 *dst_content = (attr->children->content);
731 /* DOM filter style indexing */
/* Indexes the text content of node under every index listed in index_p.
   index_p is a blank-separated list whose entries are either "name" or
   "name:type"; for each entry the same text is handed to extctr->tokenAdd
   with index_name set to the name and index_type to the first character of
   the type. Work is only done while record_info_invoked equals 1. */
732 static void index_value_of(struct filter_info *tinfo,
733 struct recExtractCtrl *extctr,
738 if (tinfo->record_info_invoked == 1)
/* NOTE(review): the result of xmlNodeGetContent() is passed to strlen() on the next line without a NULL check */
740 xmlChar *text = xmlNodeGetContent(node);
741 size_t text_len = strlen((const char *)text);
743 /* if there is no text, we do not need to proceed */
746 xmlChar *look = index_p;
753 /* assigning text to be indexed */
754 recword->term_buf = (const char *)text;
755 recword->term_len = text_len;
757 /* parsing all index name/type pairs */
758 /* may not start with ' ' or ':' */
759 while (*look && ' ' != *look && ':' != *look)
761 /* setting name and type to zero */
765 /* parsing one index name */
767 while (*look && ':' != *look && ' ' != *look)
/* NOTE(review): eval - bval bytes are copied; confirm the destination buffers can hold arbitrary attribute content */
772 strncpy((char *)index, (const char *)bval, eval - bval);
773 index[eval - bval] = '\0';
776 /* parsing one index type, if existing */
782 while (*look && ' ' != *look)
787 strncpy((char *)type, (const char *)bval, eval - bval);
788 type[eval - bval] = '\0';
791 /* actually indexing the text given */
792 dom_log(YLOG_DEBUG, tinfo, 0,
793 "INDEX '%s:%s' '%s'",
794 index ? (const char *) index : "null",
795 type ? (const char *) type : "null",
796 text ? (const char *) text : "null");
798 recword->index_name = (const char *)index;
800 recword->index_type = *type;
802 /* writing debug out */
803 if (extctr->flagShowRecords)
804 dom_log(YLOG_LOG, tinfo, 0,
805 "INDEX '%s:%s' '%s'",
806 index ? (const char *) index : "null",
807 type ? (const char *) type : "null",
808 text ? (const char *) text : "null");
810 /* actually indexing the text given */
811 recword->index_name = (const char *)index;
813 recword->index_type = *type;
814 (extctr->tokenAdd)(recword);
816 /* eat whitespaces */
817 if (*look && ' ' == *look && *(look+1))
828 /* DOM filter style indexing */
/* Applies the record-level instructions found in the document: id_p is
   scanned into extctr->match_criteria (at most 255 characters, up to the
   first whitespace), rank_p is converted with atozint() into
   extctr->staticrank, and record_info_invoked is incremented so that a
   second record element can be detected and warned about. type_p is only
   logged here. */
829 static void set_record_info(struct filter_info *tinfo,
830 struct recExtractCtrl *extctr,
837 /* writing debug info out */
838 if (extctr->flagShowRecords)
839 dom_log(YLOG_LOG, tinfo, 0,
840 "RECORD id=%s rank=%s type=%s",
841 id_p ? (const char *) id_p : "(null)",
842 rank_p ? (const char *) rank_p : "(null)",
843 type_p ? (const char *) type_p : "(null)");
847 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
850 extctr->staticrank = atozint((const char *)rank_p);
852 /* if (!strcmp("update", type_str)) */
853 /* index_node(tinfo, ctrl, ptr, recword); */
854 /* else if (!strcmp("delete", type_str)) */
855 /* dom_log(YLOG_WARN, tinfo, ptr, "dom filter delete: to be implemented"); */
857 /* dom_log(YLOG_WARN, tinfo, ptr, "dom filter: unknown record type '%s'", */
859 if (tinfo->record_info_invoked == 1)
861 /* warn about multiple only once */
862 dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
864 tinfo->record_info_invoked++;
869 /* DOM filter style indexing */
/* Handles one element node in the Zebra namespace (zebra_dom_ns):
     <index name="...">  indexes the element's text via index_value_of()
     <record id= rank= type=>  passes the attributes to set_record_info()
   Unexpected attributes, a @type other than "update", and other element
   names in this namespace are reported with a warning. Nodes outside the
   namespace do not enter the outer if. */
870 static void process_xml_element_zebra_node(struct filter_info *tinfo,
871 struct recExtractCtrl *extctr,
875 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
876 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
878 if (0 == XML_STRCMP(node->name, "index"))
880 xmlChar *index_p = 0;
882 struct _xmlAttr *attr;
883 for (attr = node->properties; attr; attr = attr->next)
885 if (attr_content_xml(attr, "name", &index_p))
887 index_value_of(tinfo, extctr, recword,node, index_p);
891 dom_log(YLOG_WARN, tinfo, node,
892 "bad attribute @%s, expected @name",
897 else if (0 == XML_STRCMP(node->name, "record"))
903 struct _xmlAttr *attr;
904 for (attr = node->properties; attr; attr = attr->next)
906 if (attr_content_xml(attr, "id", &id_p))
908 else if (attr_content_xml(attr, "rank", &rank_p))
910 else if (attr_content_xml(attr, "type", &type_p))
914 dom_log(YLOG_WARN, tinfo, node,
915 "bad attribute @%s, expected @id|@rank|@type",
/* only record type "update" is implemented */
919 if (type_p && 0 != strcmp("update", (const char *)type_p))
921 dom_log(YLOG_WARN, tinfo, node,
922 "attribute @%s, only implemented '@type='update'",
926 set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
930 dom_log(YLOG_WARN, tinfo, node,
932 " expected <record>|<index> in namespace '%s'",
933 node->name, zebra_dom_ns);
939 /* DOM filter style indexing */
/* Interprets a processing instruction whose name equals zebra_pi_name. The
   PI content starts either with "record", optionally followed by id=... and
   rank=... (each value ends at the next blank) and forwarded to
   set_record_info() with a null type, or with "index", whose remaining text
   is an index instruction handed back to the caller. Content that cannot be
   parsed is reported with a warning. */
940 static void process_xml_pi_node(struct filter_info *tinfo,
941 struct recExtractCtrl *extctr,
945 /* if right PI name, continue parsing PI */
946 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
948 xmlChar *pi_p = node->content;
949 xmlChar *look = pi_p;
954 /* parsing PI record instructions */
955 if (0 == strncmp((const char *)look, "record", 6))
/* the whitespace loops stop in front of a trailing blank, because they require a character after the blank */
968 while (*look && ' ' == *look && *(look+1))
971 /* parse possible id */
972 if (*look && 0 == strncmp((const char *)look, "id=", 3))
976 while (*look && ' ' != *look)
/* NOTE(review): eval - bval bytes are copied; confirm the id and rank buffers can hold arbitrary PI content */
979 strncpy((char *)id, (const char *)bval, eval - bval);
980 id[eval - bval] = '\0';
984 while (*look && ' ' == *look && *(look+1))
987 /* parse possible rank */
988 if (*look && 0 == strncmp((const char *)look, "rank=", 5))
992 while (*look && ' ' != *look)
995 strncpy((char *)rank, (const char *)bval, eval - bval);
996 rank[eval - bval] = '\0';
1000 while (*look && ' ' == *look && *(look+1))
/* anything left over at this point could not be parsed */
1003 if (look && '\0' != *look)
1005 dom_log(YLOG_WARN, tinfo, node,
1006 "content '%s', can not parse '%s'",
1010 set_record_info(tinfo, extctr, node, id, rank, 0);
1013 /* parsing index instruction */
1014 else if (0 == strncmp((const char *)look, "index", 5))
1018 /* eat whitespace */
1019 while (*look && ' ' == *look && *(look+1))
1022 /* export index instructions to outside */
1027 dom_log(YLOG_WARN, tinfo, node,
1028 "content '%s', can not parse '%s'",
1034 /* DOM filter style indexing */
/* Recursive tree walk that performs the indexing: the node itself is given
   to process_xml_element_zebra_node(), then its children are visited in
   document order. Processing instructions go to process_xml_pi_node(),
   which may leave an index instruction in index_p; element children are
   indexed with that instruction and then processed recursively. */
1035 static void process_xml_element_node(struct filter_info *tinfo,
1036 struct recExtractCtrl *extctr,
1040 /* remember indexing instruction from PI to next element node */
1041 xmlChar *index_p = 0;
1043 /* check if we are an element node in the special zebra namespace
1044 and either set record data or index value-of node content*/
1045 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1047 /* loop through kid nodes */
1048 for (node = node->children; node; node = node->next)
1050 /* check and set PI record and index index instructions */
1051 if (node->type == XML_PI_NODE)
1053 process_xml_pi_node(tinfo, extctr, node, &index_p);
1055 else if (node->type == XML_ELEMENT_NODE)
1057 /* if there was a PI index instruction before this element */
1060 index_value_of(tinfo, extctr, recword, node, index_p);
1063 process_xml_element_node(tinfo, extctr, recword,node);
1071 /* DOM filter style indexing */
/* Indexes a whole document: initialises one recword through extctr->init
   and starts the recursive walk at the document node itself (doc cast to
   xmlNodePtr). */
1072 static void extract_dom_doc_node(struct filter_info *tinfo,
1073 struct recExtractCtrl *extctr,
1076 /* only need to do the initialization once, reuse recword for all terms */
1078 (*extctr->init)(extctr, &recword);
1080 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* Processes one parsed input document:
     1. input conversion chain of the given input
     2. store conversion on a copy of the document; the serialised result
        (or the unconverted document) is handed to p->setStoreData
     3. extract conversion chain
     4. indexing of the resulting tree via extract_dom_doc_node()
   Returns RECCTRL_EXTRACT_SKIP for an empty document or when no record
   instruction was seen during indexing, RECCTRL_EXTRACT_OK otherwise. */
1086 static int convert_extract_doc(struct filter_info *tinfo,
1087 struct filter_input *input,
1088 struct recExtractCtrl *p,
1094 const char *params[10];
1095 xsltStylesheetPtr last_xsp = 0;
1096 xmlDocPtr store_doc = 0;
1098 /* per default do not ingest record */
1099 tinfo->record_info_invoked = 0;
1101 /* exit if empty document given */
1103 return RECCTRL_EXTRACT_SKIP;
1105 /* we actually have a document which needs to be processed further */
1107 set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1109 /* input conversion */
1110 perform_convert(tinfo, p, input->convert, params, &doc, 0);
1114 /* store conversion */
1115 store_doc = xmlCopyDoc(doc, 1);
1116 perform_convert(tinfo, p, tinfo->store->convert,
1117 params, &store_doc, &last_xsp);
1120 /* saving either store doc or original doc in case no store doc exists */
1122 xsltSaveResultToString(&buf_out, &len_out,
1123 store_doc ? store_doc : doc, last_xsp);
1125 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1127 (*p->setStoreData)(p, buf_out, len_out);
1131 xmlFreeDoc(store_doc);
1133 /* extract conversion */
1134 perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
1137 /* finally, do the indexing */
1139 extract_dom_doc_node(tinfo, p, doc);
1143 /* there was nothing to index, so there is no inserted/updated record */
1144 if (tinfo->record_info_invoked == 0)
1145 return RECCTRL_EXTRACT_SKIP;
1147 return RECCTRL_EXTRACT_OK;
/* Extracts the next record from an XML stream that is split at a fixed
   element depth. On the first record of a stream a new xmlTextReader is
   created over the record stream (an earlier reader is freed first). The
   reader is advanced until an element at depth split_level is found; that
   element is expanded, copied into a new document and passed to
   convert_extract_doc(). Returns RECCTRL_EXTRACT_ERROR_GENERIC when no
   reader is available and on the error path below, RECCTRL_EXTRACT_EOF when
   the reader is exhausted; the reader is freed in the latter two cases. */
1150 static int extract_xml_split(struct filter_info *tinfo,
1151 struct filter_input *input,
1152 struct recExtractCtrl *p)
1156 if (p->first_record)
1158 if (input->u.xmlreader.reader)
1159 xmlFreeTextReader(input->u.xmlreader.reader);
1160 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1161 p /* I/O handler */,
1168 if (!input->u.xmlreader.reader)
1169 return RECCTRL_EXTRACT_ERROR_GENERIC;
1171 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1174 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1175 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1177 if (type == XML_READER_TYPE_ELEMENT &&
1178 input->u.xmlreader.split_level == depth)
1182 /* per default do not ingest record */
1183 tinfo->record_info_invoked = 0;
1185 ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1188 /* we have a new document */
/* the expanded node is deep-copied and the copy becomes the root of a new document */
1190 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1191 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1193 xmlDocSetRootElement(doc, ptr2);
1195 /* writing debug info out */
1196 if (p->flagShowRecords)
1198 xmlChar *buf_out = 0;
1200 xmlDocDumpMemory(doc, &buf_out, &len_out);
1201 yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1202 tinfo->fname ? tinfo->fname : "(none)",
1203 depth, len_out, buf_out);
1207 return convert_extract_doc(tinfo, input, p, doc);
1211 xmlFreeTextReader(input->u.xmlreader.reader);
1212 input->u.xmlreader.reader = 0;
1213 return RECCTRL_EXTRACT_ERROR_GENERIC;
1216 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1218 xmlFreeTextReader(input->u.xmlreader.reader);
1219 input->u.xmlreader.reader = 0;
1220 return RECCTRL_EXTRACT_EOF;
/* Treats the complete stream as a single XML record: on the first call for
   a stream the document is read with xmlReadIO() and passed to
   convert_extract_doc(); a parse failure yields
   RECCTRL_EXTRACT_ERROR_GENERIC. Subsequent calls return
   RECCTRL_EXTRACT_EOF. */
1223 static int extract_xml_full(struct filter_info *tinfo,
1224 struct filter_input *input,
1225 struct recExtractCtrl *p)
1227 if (p->first_record) /* only one record per stream */
1229 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1230 p /* I/O handler */,
1238 return RECCTRL_EXTRACT_ERROR_GENERIC;
1240 return convert_extract_doc(tinfo, input, p, doc);
1243 return RECCTRL_EXTRACT_EOF;
/* Reads one ISO2709 (MARC) record from the stream and indexes it. The first
   five bytes hold the record length in decimal; leading bytes that are not
   digits are skipped with a warning. The rest of the record is read, decoded
   with yaz_marc_read_iso2709(), written as an XML tree with
   yaz_marc_write_xml(), wrapped in a new document and passed to
   convert_extract_doc(). Returns RECCTRL_EXTRACT_EOF when the stream ends
   while the length field is being read, RECCTRL_EXTRACT_ERROR_GENERIC for a
   length below 25, a short read or a decoding failure. */
1246 static int extract_iso2709(struct filter_info *tinfo,
1247 struct filter_input *input,
1248 struct recExtractCtrl *p)
1254 if (p->stream->readf(p->stream, buf, 5) != 5)
1255 return RECCTRL_EXTRACT_EOF;
/* resynchronise: while the first byte is not a digit, warn and read one more byte into buf[4] */
1256 while (*buf < '0' || *buf > '9')
1260 dom_log(YLOG_WARN, tinfo, 0,
1261 "MARC: Skipping bad byte %d (0x%02X)",
1262 *buf & 0xff, *buf & 0xff);
1263 for (i = 0; i<4; i++)
1266 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1267 return RECCTRL_EXTRACT_EOF;
1269 record_length = atoi_n (buf, 5);
1270 if (record_length < 25)
1272 dom_log(YLOG_WARN, tinfo, 0,
1273 "MARC record length < 25, is %d", record_length);
1274 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* the five length bytes are already in buf; read the remainder behind them */
1276 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1277 if (read_bytes < record_length-5)
1279 dom_log(YLOG_WARN, tinfo, 0,
1280 "couldn't read whole MARC record");
1281 return RECCTRL_EXTRACT_ERROR_GENERIC;
1283 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1284 if (r < record_length)
1286 dom_log (YLOG_WARN, tinfo, 0,
1287 "parsing of MARC record failed r=%d length=%d",
1289 return RECCTRL_EXTRACT_ERROR_GENERIC;
1295 yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1296 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1297 xmlDocSetRootElement(rdoc, root_ptr);
1298 return convert_extract_doc(tinfo, input, p, rdoc);
1300 return RECCTRL_EXTRACT_OK;
/* Extraction entry point of the filter. Uses the first configured input,
   resets the per-record ODR memory and dispatches on the input kind: whole
   stream as one XML document, XML split at a reader depth (when the split
   level is not 0), or ISO2709. Returns RECCTRL_EXTRACT_ERROR_GENERIC when
   none of the handlers applies. */
1303 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1305 struct filter_info *tinfo = clientData;
1306 struct filter_input *input = tinfo->input_list;
1310 return RECCTRL_EXTRACT_ERROR_GENERIC;
1312 odr_reset(tinfo->odr_record);
1316 return extract_xml_full(tinfo, input, p);
1318 case DOM_INPUT_XMLREADER:
1319 if (input->u.xmlreader.split_level == 0)
1320 return extract_xml_full(tinfo, input, p);
1322 return extract_xml_split(tinfo, input, p);
1324 case DOM_INPUT_MARC:
1325 return extract_iso2709(tinfo, input, p);
1327 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* libxml2 read callback used during retrieval: context is the
   recRetrieveCtrl, and up to len bytes are fetched through its stream's
   readf function. */
1330 static int ioread_ret(void *context, char *buffer, int len)
1332 struct recRetrieveCtrl *p = context;
1333 return p->stream->readf(p->stream, buffer, len);
/* matching close callback, passed together with ioread_ret to xmlReadIO() */
1336 static int ioclose_ret(void *context)
/* Retrieval entry point of the filter. The element set name is taken from
   the simple or complex record composition of the request and mapped to a
   <retrieve> entry by lookup_retrieve(). The stylesheet parameters id,
   filename, rank (when staticrank >= 0), schema, score and size are set up,
   the stored record is parsed with xmlReadIO() and run through the entry's
   conversion chain. The result is serialised into p->rec_buf (allocated
   from p->odr) as XML or SUTRS; an unknown element set, a parse or
   conversion failure, or an unsupported syntax is reported through
   p->diagnostic. */
1341 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1343 /* const char *esn = zebra_dom_ns; */
1344 const char *esn = 0;
1345 const char *params[32];
1346 struct filter_info *tinfo = clientData;
1348 struct filter_retrieve *retrieve;
1349 xsltStylesheetPtr last_xsp = 0;
1353 if (p->comp->which == Z_RecordComp_simple
1354 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1356 esn = p->comp->u.simple->u.generic;
1358 else if (p->comp->which == Z_RecordComp_complex
1359 && p->comp->u.complex->generic->elementSpec
1360 && p->comp->u.complex->generic->elementSpec->which ==
1361 Z_ElementSpec_elementSetName)
1363 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1366 retrieve = lookup_retrieve(tinfo, esn);
1370 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1375 set_param_int(params, "id", p->localno, p->odr);
1377 set_param_str(params, "filename", p->fname, p->odr);
1378 if (p->staticrank >= 0)
1379 set_param_int(params, "rank", p->staticrank, p->odr);
/* schema parameter: candidates below are the requested name, the entry's name, its identifier, or the empty string */
1382 set_param_str(params, "schema", esn, p->odr);
1385 set_param_str(params, "schema", retrieve->name, p->odr);
1386 else if (retrieve->identifier)
1387 set_param_str(params, "schema", retrieve->identifier, p->odr);
1389 set_param_str(params, "schema", "", p->odr);
1392 set_param_int(params, "score", p->score, p->odr);
1393 set_param_int(params, "size", p->recordSize, p->odr);
1395 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1398 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1401 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1405 /* retrieve conversion */
1406 perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
1409 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1411 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1417 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1419 xmlDocDumpMemory(doc, &buf_out, &len_out);
1421 p->output_format = VAL_TEXT_XML;
1422 p->rec_len = len_out;
1423 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1424 memcpy(p->rec_buf, buf_out, p->rec_len);
/* NOTE(review): this branch tests p->output_format while the XML branch above tests p->input_format -- confirm the asymmetry is intended */
1427 else if (p->output_format == VAL_SUTRS)
1433 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1435 xmlDocDumpMemory(doc, &buf_out, &len_out);
1437 p->output_format = VAL_SUTRS;
1438 p->rec_len = len_out;
1439 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1440 memcpy(p->rec_buf, buf_out, p->rec_len);
1446 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1452 static struct recType filter_type = {
1463 #ifdef IDZEBRA_STATIC_DOM
1476 * indent-tabs-mode: nil
1478 * vim: shiftwidth=4 tabstop=8 expandtab