X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=index%2Fmod_dom.c;h=3340ec454832c68936461c46e570f0e3248c92f5;hp=5805efb4545f8d6f19f8e94e688e5dace75b3afa;hb=426d07a60c57c3555934655a78437cf4677c65c8;hpb=f1d5d3a64682b741757c91311f3362c9d43f288c diff --git a/index/mod_dom.c b/index/mod_dom.c index 5805efb..3340ec4 100644 --- a/index/mod_dom.c +++ b/index/mod_dom.c @@ -1,31 +1,30 @@ -/* $Id: mod_dom.c,v 1.9 2007-02-14 16:31:37 marc Exp $ - Copyright (C) 1995-2007 - Index Data ApS +/* This file is part of the Zebra server. + Copyright (C) 1994-2009 Index Data - This file is part of the Zebra server. +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. - Zebra is free software; you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation; either version 2, or (at your option) any later - version. +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. - Zebra is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include +#include #include #include +#include #include #include @@ -41,10 +40,36 @@ #include #include +#include -struct convert_s { +/* DOM filter style indexing */ +#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0" +static const char *zebra_dom_ns = ZEBRA_DOM_NS; + +/* DOM filter style indexing */ +#define ZEBRA_PI_NAME "zebra-2.0" +static const char *zebra_pi_name = ZEBRA_PI_NAME; + +enum convert_type { + convert_xslt_type, + convert_meta_type +}; + +struct convert_xslt { const char *stylesheet; xsltStylesheetPtr stylesheet_xsp; +}; + +struct convert_meta { + int dummy; +}; + +struct convert_s { + enum convert_type which; + union { + struct convert_xslt xslt; + struct convert_meta meta; + } u; struct convert_s *next; }; @@ -73,14 +98,14 @@ struct filter_input { int type; union { struct { + xmlTextReaderPtr reader; + int split_level; + } xmlreader; + struct { const char *input_charset; yaz_marc_t handle; yaz_iconv_t iconv; } marc; - struct { - xmlTextReaderPtr reader; - int split_level; - } xmlreader; } u; struct filter_input *next; }; @@ -89,25 +114,56 @@ struct filter_info { char *fname; char *full_name; const char *profile_path; - ODR odr_record; - ODR odr_config; + NMEM nmem_record; + NMEM nmem_config; xmlDocPtr doc_config; struct filter_extract *extract; struct filter_retrieve *retrieve_list; struct filter_input *input_list; struct filter_store *store; + int record_info_invoked; }; + + #define XML_STRCMP(a,b) strcmp((char*)a, b) #define XML_STRLEN(a) strlen((char*)a) +#define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE) + +static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr, + const char *fmt, ...) +#ifdef __GNUC__ + __attribute__ ((format (printf, 4, 5))) +#endif + ; + +static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr, + const char *fmt, ...) +{ + va_list ap; + char buf[4096]; + + va_start(ap, fmt); + yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap); + if (ptr) + { + yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none", + xmlGetLineNo(ptr), buf); + } + else + { + yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf); + } + va_end(ap); +} static void set_param_str(const char **params, const char *name, - const char *value, ODR odr) + const char *value, NMEM nmem) { - char *quoted = odr_malloc(odr, 3 + strlen(value)); + char *quoted = nmem_malloc(nmem, 3 + strlen(value)); sprintf(quoted, "'%s'", value); while (*params) params++; @@ -117,9 +173,9 @@ static void set_param_str(const char **params, const char *name, } static void set_param_int(const char **params, const char *name, - zint value, ODR odr) + zint value, NMEM nmem) { - char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */ + char *quoted = nmem_malloc(nmem, 30); /* 25 digits enough for 2^64 */ while (*params) params++; sprintf(quoted, "'" ZINT_FORMAT "'", value); @@ -134,13 +190,14 @@ static void *filter_init(Res res, RecType recType) tinfo->fname = 0; tinfo->full_name = 0; tinfo->profile_path = 0; - tinfo->odr_record = odr_createmem(ODR_ENCODE); - tinfo->odr_config = odr_createmem(ODR_ENCODE); + tinfo->nmem_record = nmem_create(); + tinfo->nmem_config = nmem_create(); tinfo->extract = 0; tinfo->retrieve_list = 0; tinfo->input_list = 0; tinfo->store = 0; tinfo->doc_config = 0; + tinfo->record_info_invoked = 0; #if YAZ_HAVE_EXSLT exsltRegisterAll(); @@ -154,166 +211,265 @@ static int attr_content(struct _xmlAttr *attr, const char *name, { if (!XML_STRCMP(attr->name, name) && attr->children && attr->children->type == XML_TEXT_NODE) - { - *dst_content = (const char *)(attr->children->content); - return 1; - } + { + *dst_content = (const char *)(attr->children->content); + return 1; + } return 0; } static void destroy_xsp(struct convert_s *c) { - while(c) + while (c) + { + if (c->which == convert_xslt_type) { - if (c->stylesheet_xsp) - xsltFreeStylesheet(c->stylesheet_xsp); - c = c->next; + if (c->u.xslt.stylesheet_xsp) + xsltFreeStylesheet(c->u.xslt.stylesheet_xsp); } + c = c->next; + } } static void destroy_dom(struct filter_info *tinfo) { if (tinfo->extract) - { - destroy_xsp(tinfo->extract->convert); - tinfo->extract = 0; - } + { + destroy_xsp(tinfo->extract->convert); + tinfo->extract = 0; + } if (tinfo->store) - { - destroy_xsp(tinfo->store->convert); - tinfo->store = 0; - } + { + destroy_xsp(tinfo->store->convert); + tinfo->store = 0; + } if (tinfo->input_list) + { + struct filter_input *i_ptr; + for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next) { - struct filter_input *i_ptr; - for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next) - { - switch(i_ptr->type) - { - case DOM_INPUT_XMLREADER: - if (i_ptr->u.xmlreader.reader) - xmlFreeTextReader(i_ptr->u.xmlreader.reader); - break; - case DOM_INPUT_MARC: - yaz_iconv_close(i_ptr->u.marc.iconv); - yaz_marc_destroy(i_ptr->u.marc.handle); - break; - } - destroy_xsp(i_ptr->convert); - } - tinfo->input_list = 0; + switch(i_ptr->type) + { + case DOM_INPUT_XMLREADER: + if (i_ptr->u.xmlreader.reader) + xmlFreeTextReader(i_ptr->u.xmlreader.reader); + break; + case DOM_INPUT_MARC: + yaz_iconv_close(i_ptr->u.marc.iconv); + yaz_marc_destroy(i_ptr->u.marc.handle); + break; + } + destroy_xsp(i_ptr->convert); } + tinfo->input_list = 0; + } if (tinfo->retrieve_list) - { - struct filter_retrieve *r_ptr; - for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next) - destroy_xsp(r_ptr->convert); - tinfo->retrieve_list = 0; - } + { + struct filter_retrieve *r_ptr; + for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next) + destroy_xsp(r_ptr->convert); + tinfo->retrieve_list = 0; + } if (tinfo->doc_config) - { - xmlFreeDoc(tinfo->doc_config); - tinfo->doc_config = 0; - } - odr_reset(tinfo->odr_config); + { + xmlFreeDoc(tinfo->doc_config); + tinfo->doc_config = 0; + } + nmem_reset(tinfo->nmem_config); } static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr, struct convert_s **l) { *l = 0; - for(; ptr; ptr = ptr->next) + FOR_EACH_ELEMENT(ptr) { + if (!XML_STRCMP(ptr->name, "xslt")) { - if (ptr->type != XML_ELEMENT_NODE) - continue; - if (!XML_STRCMP(ptr->name, "xslt")) + struct _xmlAttr *attr; + struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p)); + + p->next = 0; + p->which = convert_xslt_type; + p->u.xslt.stylesheet = 0; + p->u.xslt.stylesheet_xsp = 0; + + for (attr = ptr->properties; attr; attr = attr->next) + if (attr_content(attr, "stylesheet", &p->u.xslt.stylesheet)) + ; + else { - struct _xmlAttr *attr; - struct convert_s *p - = odr_malloc(tinfo->odr_config, sizeof(*p)); - - p->next = 0; - p->stylesheet = 0; - p->stylesheet_xsp = 0; - - for (attr = ptr->properties; attr; attr = attr->next) - if (attr_content(attr, "stylesheet", &p->stylesheet)) - ; - else - yaz_log(YLOG_WARN, "%s: dom filter: " - "bad attribute %s" - " for ", - tinfo->fname, attr->name); - if (p->stylesheet) - { - char tmp_xslt_full_name[1024]; - if (!yaz_filepath_resolve(p->stylesheet, - tinfo->profile_path, - NULL, - tmp_xslt_full_name)) - { - yaz_log(YLOG_WARN, - "%s: dom filter: " - "stylesheet %s not found in " - "path %s", - tinfo->fname, - p->stylesheet, - tinfo->profile_path); - return ZEBRA_FAIL; - } - - p->stylesheet_xsp - = xsltParseStylesheetFile((const xmlChar*) - tmp_xslt_full_name); - if (!p->stylesheet_xsp) - { - yaz_log(YLOG_WARN, - "%s: dom filter: " - "could not parse xslt " - "stylesheet %s", - tinfo->fname, tmp_xslt_full_name); - return ZEBRA_FAIL; - } - } - else - { - yaz_log(YLOG_WARN, - "%s: dom filter: " - "missing attribute 'stylesheet' " - "for element 'xslt'", tinfo->fname); - return ZEBRA_FAIL; - } - *l = p; - l = &p->next; + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s", attr->name); } - else + if (p->u.xslt.stylesheet) + { + char tmp_xslt_full_name[1024]; + if (!yaz_filepath_resolve(p->u.xslt.stylesheet, + tinfo->profile_path, + NULL, + tmp_xslt_full_name)) { - yaz_log(YLOG_LOG, - "%s: dom filter: bad node '%s' for ", - tinfo->fname, ptr->name); + dom_log(YLOG_WARN, tinfo, 0, + "stylesheet %s not found in " + "path %s", + p->u.xslt.stylesheet, + tinfo->profile_path); return ZEBRA_FAIL; } - + + p->u.xslt.stylesheet_xsp + = xsltParseStylesheetFile((const xmlChar*) + tmp_xslt_full_name); + if (!p->u.xslt.stylesheet_xsp) + { + dom_log(YLOG_WARN, tinfo, 0, + "could not parse xslt stylesheet %s", + tmp_xslt_full_name); + return ZEBRA_FAIL; + } + } + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "missing attribute 'stylesheet'"); + return ZEBRA_FAIL; + } + *l = p; + l = &p->next; } + else if (!XML_STRCMP(ptr->name, "process-meta")) + { + struct _xmlAttr *attr; + struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p)); + + p->next = 0; + p->which = convert_meta_type; + + for (attr = ptr->properties; attr; attr = attr->next) + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s", attr->name); + *l = p; + l = &p->next; + } + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad element '%s', expected ", ptr->name); + return ZEBRA_FAIL; + } + } return ZEBRA_OK; } +static int process_meta(struct filter_info *tinfo, xmlDocPtr doc, xmlNodePtr node, + struct recRetrieveCtrl *retctr) +{ + + if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href && + 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)) + { + if (0 == XML_STRCMP(node->name, "meta")) + { + const char *element_set_name = 0; + + struct _xmlAttr *attr; + for (attr = node->properties; attr; attr = attr->next) + { + if (attr_content(attr, "name", &element_set_name)) + ; + else + { + dom_log(YLOG_WARN, tinfo, node, + "bad attribute @%s, expected @name", attr->name); + } + } + if (element_set_name) + { + WRBUF result = wrbuf_alloc(); + WRBUF addinfo = wrbuf_alloc(); + const Odr_oid *input_format = yaz_oid_recsyn_xml; + const Odr_oid *output_format = 0; + int ret; + + ret = retctr->special_fetch(retctr->handle, + element_set_name, + input_format, &output_format, + result, addinfo); + if (ret == 0) + { + xmlDocPtr sub_doc = + xmlParseMemory(wrbuf_buf(result), wrbuf_len(result)); + if (sub_doc) + { + xmlNodePtr t = xmlDocGetRootElement(sub_doc); + xmlReplaceNode(node, xmlCopyNode(t, 1)); + xmlFreeDoc(sub_doc); + } + } + wrbuf_destroy(result); + wrbuf_destroy(addinfo); + } + } + } + for (node = node->children; node; node = node->next) + process_meta(tinfo, doc, node, retctr); + return 0; +} + static ZEBRA_RES perform_convert(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + struct recRetrieveCtrl *retctr, struct convert_s *convert, const char **params, xmlDocPtr *doc, xsltStylesheetPtr *last_xsp) { for (; convert; convert = convert->next) + { + if (convert->which == convert_xslt_type) { - xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp, + xmlChar *buf_out = 0; + int len_out = 0; + xmlDocPtr res_doc = xsltApplyStylesheet(convert->u.xslt.stylesheet_xsp, *doc, params); if (last_xsp) - *last_xsp = convert->stylesheet_xsp; + *last_xsp = convert->u.xslt.stylesheet_xsp; + + if (!res_doc) + break; + + /* now saving into buffer and re-reading into DOM to avoid annoing + XSLT problem with thrown-out indentation text nodes */ + xsltSaveResultToString(&buf_out, &len_out, res_doc, + convert->u.xslt.stylesheet_xsp); + xmlFreeDoc(res_doc); + xmlFreeDoc(*doc); - *doc = res_doc; + + *doc = xmlParseMemory((const char *) buf_out, len_out); + + /* writing debug info out */ + if (extctr && extctr->flagShowRecords) + yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s", + tinfo->fname ? tinfo->fname : "(none)", + convert->u.xslt.stylesheet, + len_out, buf_out); + + xmlFree(buf_out); } + else if (convert->which == convert_meta_type) + { + if (retctr) /* only execute meta on retrieval */ + { + process_meta(tinfo, *doc, xmlDocGetRootElement(*doc), retctr); + + /* last stylesheet absent */ + if (last_xsp) + *last_xsp = 0; + } + } + } return ZEBRA_OK; } @@ -323,7 +479,7 @@ static struct filter_input *new_input(struct filter_info *tinfo, int type) struct filter_input **np = &tinfo->input_list; for (;*np; np = &(*np)->next) ; - p = *np = odr_malloc(tinfo->odr_config, sizeof(*p)); + p = *np = nmem_malloc(tinfo->nmem_config, sizeof(*p)); p->next = 0; p->syntax = 0; p->name = 0; @@ -333,89 +489,86 @@ static struct filter_input *new_input(struct filter_info *tinfo, int type) } static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr, - const char *syntax, - const char *name) + const char *syntax, const char *name) { - for (; ptr; ptr = ptr->next) + FOR_EACH_ELEMENT(ptr) { + if (!XML_STRCMP(ptr->name, "marc")) { - if (ptr->type != XML_ELEMENT_NODE) - continue; - if (!XML_STRCMP(ptr->name, "marc")) - { - yaz_iconv_t iconv = 0; - const char *input_charset = "marc-8"; - struct _xmlAttr *attr; + yaz_iconv_t iconv = 0; + const char *input_charset = "marc-8"; + struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "charset", &input_charset)) - ; - else - yaz_log(YLOG_WARN, - "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - iconv = yaz_iconv_open("utf-8", input_charset); - if (!iconv) - { - yaz_log(YLOG_WARN, - "%s: dom filter: unsupported charset " - "'%s' for ", - tinfo->fname, input_charset); - return ZEBRA_FAIL; - } - else - { - struct filter_input *p - = new_input(tinfo, DOM_INPUT_MARC); - p->u.marc.handle = yaz_marc_create(); - p->u.marc.iconv = iconv; + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "inputcharset", &input_charset)) + ; + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @inputcharset", + attr->name); + } + } + iconv = yaz_iconv_open("utf-8", input_charset); + if (!iconv) + { + dom_log(YLOG_WARN, tinfo, ptr, + "unsupported @charset '%s'", input_charset); + return ZEBRA_FAIL; + } + else + { + struct filter_input *p + = new_input(tinfo, DOM_INPUT_MARC); + p->u.marc.handle = yaz_marc_create(); + p->u.marc.iconv = iconv; - yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv); + yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv); - ptr = ptr->next; + ptr = ptr->next; - parse_convert(tinfo, ptr, &p->convert); - } - break; + parse_convert(tinfo, ptr, &p->convert); + } + break; - } - else if (!XML_STRCMP(ptr->name, "xmlreader")) - { - struct filter_input *p - = new_input(tinfo, DOM_INPUT_XMLREADER); - struct _xmlAttr *attr; - const char *level_str = 0; - - p->u.xmlreader.split_level = 0; - p->u.xmlreader.reader = 0; - - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "level", &level_str)) - ; - else - yaz_log(YLOG_WARN, - "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - if (level_str) - p->u.xmlreader.split_level = atoi(level_str); - - ptr = ptr->next; + } + else if (!XML_STRCMP(ptr->name, "xmlreader")) + { + struct filter_input *p + = new_input(tinfo, DOM_INPUT_XMLREADER); + struct _xmlAttr *attr; + const char *level_str = 0; - parse_convert(tinfo, ptr, &p->convert); - break; - } - else + p->u.xmlreader.split_level = 0; + p->u.xmlreader.reader = 0; + + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "level", &level_str)) + ; + else { - yaz_log(YLOG_WARN, "%s: dom filter: bad input type %s", - tinfo->fname, ptr->name); - return ZEBRA_FAIL; + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @level", + attr->name); } + } + if (level_str) + p->u.xmlreader.split_level = atoi(level_str); + + ptr = ptr->next; + + parse_convert(tinfo, ptr, &p->convert); + break; } + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad element <%s>, expected |", + ptr->name); + return ZEBRA_FAIL; + } + } return ZEBRA_OK; } @@ -425,155 +578,165 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) xmlNodePtr ptr; xmlDocPtr doc; - tinfo->fname = odr_strdup(tinfo->odr_config, fname); + tinfo->fname = nmem_strdup(tinfo->nmem_config, fname); if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path, NULL, tmp_full_name)) - tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name); + tinfo->full_name = nmem_strdup(tinfo->nmem_config, tmp_full_name); else - tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname); - - yaz_log(YLOG_LOG, "dom filter: loading config file %s", tinfo->full_name); + tinfo->full_name = nmem_strdup(tinfo->nmem_config, tinfo->fname); + yaz_log(YLOG_LOG, "%s dom filter: " + "loading config file %s", tinfo->fname, tinfo->full_name); + doc = xmlParseFile(tinfo->full_name); if (!doc) - { - yaz_log(YLOG_WARN, - "%s: dom filter: failed to parse config file %s", - tinfo->fname, tinfo->full_name); - return ZEBRA_FAIL; - } + { + yaz_log(YLOG_WARN, "%s: dom filter: " + "failed to parse config file %s", + tinfo->fname, tinfo->full_name); + return ZEBRA_FAIL; + } /* save because we store ptrs to the content */ tinfo->doc_config = doc; ptr = xmlDocGetRootElement(doc); if (!ptr || ptr->type != XML_ELEMENT_NODE || XML_STRCMP(ptr->name, "dom")) - { - yaz_log(YLOG_WARN, - "%s: dom filter: expected root element ", - tinfo->fname); - return ZEBRA_FAIL; - } + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad root element <%s>, expected root element ", + ptr->name); + return ZEBRA_FAIL; + } - for (ptr = ptr->children; ptr; ptr = ptr->next) + ptr = ptr->children; + FOR_EACH_ELEMENT(ptr) { + if (!XML_STRCMP(ptr->name, "extract")) { - if (ptr->type != XML_ELEMENT_NODE) - continue; - if (!XML_STRCMP(ptr->name, "extract")) - { - /* - - - - - */ - struct _xmlAttr *attr; - struct filter_extract *f = - odr_malloc(tinfo->odr_config, sizeof(*f)); - - tinfo->extract = f; - f->name = 0; - f->convert = 0; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "name", &f->name)) - ; - else - yaz_log(YLOG_WARN, - "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - - } - parse_convert(tinfo, ptr->children, &f->convert); - } - else if (!XML_STRCMP(ptr->name, "retrieve")) - { - /* - - - - - */ - struct _xmlAttr *attr; - struct filter_retrieve **fp = &tinfo->retrieve_list; - struct filter_retrieve *f = - odr_malloc(tinfo->odr_config, sizeof(*f)); + /* + + + + + */ + struct _xmlAttr *attr; + struct filter_extract *f = + nmem_malloc(tinfo->nmem_config, sizeof(*f)); - while (*fp) - fp = &(*fp)->next; - - *fp = f; - f->name = 0; - f->identifier = 0; - f->convert = 0; - f->next = 0; - - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "identifier", - &f->identifier)) - ; - else if (attr_content(attr, "name", &f->name)) - ; - else - yaz_log(YLOG_WARN, - "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - parse_convert(tinfo, ptr->children, &f->convert); - } - else if (!XML_STRCMP(ptr->name, "store")) + tinfo->extract = f; + f->name = 0; + f->convert = 0; + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "name", &f->name)) + ; + else { - /* - - - - - */ - struct filter_store *f = - odr_malloc(tinfo->odr_config, sizeof(*f)); - - tinfo->store = f; - f->convert = 0; - parse_convert(tinfo, ptr->children, &f->convert); + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @name", + attr->name); } - else if (!XML_STRCMP(ptr->name, "input")) + } + parse_convert(tinfo, ptr->children, &f->convert); + } + else if (!XML_STRCMP(ptr->name, "retrieve")) + { + /* + + + + + */ + struct _xmlAttr *attr; + struct filter_retrieve **fp = &tinfo->retrieve_list; + struct filter_retrieve *f = + nmem_malloc(tinfo->nmem_config, sizeof(*f)); + + while (*fp) + fp = &(*fp)->next; + + *fp = f; + f->name = 0; + f->identifier = 0; + f->convert = 0; + f->next = 0; + + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "identifier", + &f->identifier)) + ; + else if (attr_content(attr, "name", &f->name)) + ; + else { - /* - - - - - - - */ - struct _xmlAttr *attr; - const char *syntax = 0; - const char *name = 0; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "syntax", &syntax)) - ; - else if (attr_content(attr, "name", &name)) - ; - else - yaz_log(YLOG_WARN, - "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - parse_input(tinfo, ptr->children, syntax, name); + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @identifier|@name", + attr->name); } - else + } + parse_convert(tinfo, ptr->children, &f->convert); + } + else if (!XML_STRCMP(ptr->name, "store")) + { + /* + + + + + */ + struct filter_store *f = + nmem_malloc(tinfo->nmem_config, sizeof(*f)); + + tinfo->store = f; + f->convert = 0; + parse_convert(tinfo, ptr->children, &f->convert); + } + else if (!XML_STRCMP(ptr->name, "input")) + { + /* + + + + + + + */ + struct _xmlAttr *attr; + const char *syntax = 0; + const char *name = 0; + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "syntax", &syntax)) + ; + else if (attr_content(attr, "name", &name)) + ; + else { - yaz_log(YLOG_WARN, "%s: dom filter: bad element %s", - tinfo->fname, ptr->name); - return ZEBRA_FAIL; + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @syntax|@name", + attr->name); } + } + parse_input(tinfo, ptr->children, syntax, name); + } + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad element <%s>, " + "expected |||", + ptr->name); + return ZEBRA_FAIL; } + } + if (!tinfo->input_list) + { + struct filter_input *p + = new_input(tinfo, DOM_INPUT_XMLREADER); + p->u.xmlreader.split_level = 0; + p->u.xmlreader.reader = 0; + } return ZEBRA_OK; } @@ -586,16 +749,16 @@ static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo, if (!est) return f; for (; f; f = f->next) - { - /* find requested schema */ - if (est) - { - if (f->identifier && !strcmp(f->identifier, est)) - return f; - if (f->name && !strcmp(f->name, est)) - return f; - } - } + { + /* find requested schema */ + if (est) + { + if (f->identifier && !strcmp(f->identifier, est)) + return f; + if (f->name && !strcmp(f->name, est)) + return f; + } + } return 0; } @@ -603,10 +766,10 @@ static ZEBRA_RES filter_config(void *clientData, Res res, const char *args) { struct filter_info *tinfo = clientData; if (!args || !*args) - { - yaz_log(YLOG_WARN, "dom filter: need config file"); - return ZEBRA_FAIL; - } + { + yaz_log(YLOG_WARN, "dom filter: need config file"); + return ZEBRA_FAIL; + } if (tinfo->fname && !strcmp(args, tinfo->fname)) return ZEBRA_OK; @@ -621,8 +784,8 @@ static void filter_destroy(void *clientData) { struct filter_info *tinfo = clientData; destroy_dom(tinfo); - odr_destroy(tinfo->odr_config); - odr_destroy(tinfo->odr_record); + nmem_destroy(tinfo->nmem_config); + nmem_destroy(tinfo->nmem_record); xfree(tinfo); } @@ -638,495 +801,436 @@ static int ioclose_ex(void *context) } -/* Alvis style indexing */ -#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1" -static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS; - -/* Alvis style indexing */ -static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl, - xmlNodePtr ptr, RecWord *recWord) -{ - for(; ptr; ptr = ptr->next) - { - index_cdata(tinfo, ctrl, ptr->children, recWord); - if (ptr->type != XML_TEXT_NODE) - continue; - recWord->term_buf = (const char *)ptr->content; - recWord->term_len = XML_STRLEN(ptr->content); - (*ctrl->tokenAdd)(recWord); - } -} -/* Alvis style indexing */ -static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl, - xmlNodePtr ptr, RecWord *recWord) +/* DOM filter style indexing */ +static void index_value_of(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + RecWord* recword, + xmlNodePtr node, + const char *index_p) { - for(; ptr; ptr = ptr->next) - { - index_node(tinfo, ctrl, ptr->children, recWord); - if (ptr->type != XML_ELEMENT_NODE || !ptr->ns || - XML_STRCMP(ptr->ns->href, zebra_xslt_ns)) - continue; - if (!XML_STRCMP(ptr->name, "index")) + if (tinfo->record_info_invoked == 1) + { + xmlChar *text = xmlNodeGetContent(node); + size_t text_len = strlen((const char *)text); + + /* if there is no text, we do not need to proceed */ + if (text_len) + { + /* keep seqno base so that all text will have + identical seqno's for multiple fields , e.g + .. */ + + zint seqno_base = recword->seqno; + zint seqno_max = recword->seqno; + + + const char *look = index_p; + const char *bval; + const char *eval; + + xmlChar index[256]; + xmlChar type[256]; + + /* assingning text to be indexed */ + recword->term_buf = (const char *)text; + recword->term_len = text_len; + + /* parsing all index name/type pairs */ + /* may not start with ' ' or ':' */ + while (*look && ' ' != *look && ':' != *look) + { + /* setting name and type to zero */ + *index = '\0'; + *type = '\0'; + + /* parsing one index name */ + bval = look; + while (*look && ':' != *look && ' ' != *look) { - const char *name_str = 0; - const char *type_str = 0; - const char *xpath_str = 0; - struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "name", &name_str)) - ; - else if (attr_content(attr, "xpath", &xpath_str)) - ; - else if (attr_content(attr, "type", &type_str)) - ; - else - yaz_log(YLOG_WARN, - "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - if (name_str) - { - /* save default type */ - int prev_type = recWord->index_type; - - /* type was given */ - if (type_str && *type_str) - recWord->index_type = *type_str; - - recWord->index_name = name_str; - index_cdata(tinfo, ctrl, ptr->children, recWord); - - /* restore it again */ - recWord->index_type = prev_type; - } + look++; } - } -} - -/* Alvis style indexing */ -static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl, - xmlNodePtr ptr, RecWord *recWord) -{ - const char *type_str = "update"; - - if (ptr && ptr->type == XML_ELEMENT_NODE && ptr->ns && - !XML_STRCMP(ptr->ns->href, zebra_xslt_ns) - && !XML_STRCMP(ptr->name, "record")) - { - const char *id_str = 0; - const char *rank_str = 0; - struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) + eval = look; + strncpy((char *)index, (const char *)bval, eval - bval); + index[eval - bval] = '\0'; + + + /* parsing one index type, if existing */ + if (':' == *look) { - if (attr_content(attr, "type", &type_str)) - ; - else if (attr_content(attr, "id", &id_str)) - ; - else if (attr_content(attr, "rank", &rank_str)) - ; - else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); + look++; + + bval = look; + while (*look && ' ' != *look) + { + look++; + } + eval = look; + strncpy((char *)type, (const char *)bval, eval - bval); + type[eval - bval] = '\0'; } - if (id_str) - sscanf(id_str, "%255s", ctrl->match_criteria); - if (rank_str) - ctrl->staticrank = atozint(rank_str); - ptr = ptr->children; - } + /* actually indexing the text given */ - if (!strcmp("update", type_str)) - index_node(tinfo, ctrl, ptr, recWord); - else if (!strcmp("delete", type_str)) - yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); - else - yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", - type_str); -} + recword->seqno = seqno_base; + recword->index_name = (const char *)index; + if (*type) + recword->index_type = (const char *) type; + /* writing debug out */ + if (extctr->flagShowRecords) + dom_log(YLOG_LOG, tinfo, 0, + "INDEX '%s:%s' '%s'", + (const char *) index, + (const char *) type, + (const char *) text); + + (extctr->tokenAdd)(recword); -/* Alvis style indexing */ -static void extract_doc_alvis(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlDocPtr doc) -{ - if (doc){ - RecWord recWord; - xmlChar *buf_out; - int len_out; - xmlNodePtr root_ptr; + if (seqno_max < recword->seqno) + seqno_max = recword->seqno; - (*recctr->init)(recctr, &recWord); - - if (recctr->flagShowRecords){ - xmlDocDumpMemory(doc, &buf_out, &len_out); - fwrite(buf_out, len_out, 1, stdout); - xmlFree(buf_out); - } - root_ptr = xmlDocGetRootElement(doc); - if (root_ptr) - index_record(tinfo, recctr, root_ptr, &recWord); - else - yaz_log(YLOG_WARN, "No root for index XML record"); + /* eat whitespaces */ + if (*look && ' ' == *look) + { + look++; + } + } + recword->seqno = seqno_max; + } + xmlFree(text); } } /* DOM filter style indexing */ -static int attr_content_xml(struct _xmlAttr *attr, const char *name, - xmlChar **dst_content) +static void set_record_info(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + xmlNodePtr node, + const char * id_p, + const char * rank_p, + const char * type_p) { - if (0 == XML_STRCMP(attr->name, name) && attr->children - && attr->children->type == XML_TEXT_NODE) - { - *dst_content = (attr->children->content); - return 1; - } - return 0; -} - -/* DOM filter style indexing */ -/* #define ZEBRA_XSLT_NS "http://indexdata.com/zebra-2.0" */ -/* static const char *zebra_xslt_ns = ZEBRA_XSLT_NS; */ - -/* DOM filter style indexing */ -#define ZEBRA_PI_NAME "zebra-2.0" -static const char *zebra_pi_name = ZEBRA_PI_NAME; - - -/* DOM filter style indexing */ -void index_value_of(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlNodePtr node, - xmlChar * index_p) -{ - xmlChar *text = xmlNodeGetContent(node); - - xmlChar *look = index_p; - xmlChar *bval; - xmlChar *eval; - - xmlChar index[256]; - xmlChar type[256]; - - /* parsing all index name/type pairs - may not start with ' ' or ':' */ - while (*look && ' ' != *look && ':' != *look){ + /* writing debug info out */ + if (extctr && extctr->flagShowRecords) + dom_log(YLOG_LOG, tinfo, node, + "RECORD id=%s rank=%s type=%s", + id_p ? (const char *) id_p : "(null)", + rank_p ? (const char *) rank_p : "(null)", + type_p ? (const char *) type_p : "(null)"); - /* setting name and type to zero */ - *index = '\0'; - *type = '\0'; - - /* parsing one index name */ - bval = look; - while (*look && ':' != *look && ' ' != *look){ - look++; - } - eval = look; - strncpy((char *)index, (const char *)bval, eval - bval); - index[eval - bval] = '\0'; - - - /* parsing one index type, if existing */ - if (':' == *look){ - look++; - - bval = look; - while (*look && ' ' != *look){ - look++; - } - eval = look; - strncpy((char *)type, (const char *)bval, eval - bval); - type[eval - bval] = '\0'; - } - printf("INDEX '%s:%s' '%s'\n", index, type, text); - - if (*look && ' ' == *look && *(look+1)){ - look++; - } + if (id_p && *id_p) + sscanf((const char *)id_p, "%255s", extctr->match_criteria); + + if (rank_p && *rank_p) + extctr->staticrank = atozint((const char *)rank_p); + + if (type_p && *type_p) + { + enum zebra_recctrl_action_t action = action_update; + if (!strcmp(type_p, "insert")) + action = action_insert; + else if (!strcmp(type_p, "delete")) + action = action_delete; + else if (!strcmp(type_p, "replace")) + action = action_replace; + else if (!strcmp(type_p, "update")) + action = action_update; + else + dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p); + extctr->action = action; } - xmlFree(text); - - /* //recWord->term_buf = (const char *)ptr->content; */ - /* //recWord->term_len = XML_STRLEN(ptr->content); */ - /* // if (type_str && *type_str) */ - /* // recWord->index_type = *type_str; /\* type was given *\/ */ - /* // recWord->index_name = name_str; */ - /* // recWord->index_type = prev_type; /\* restore it again *\/ */ -} - + if (tinfo->record_info_invoked == 1) + { + /* warn about multiple only once */ + dom_log(YLOG_WARN, tinfo, node, "multiple record elements"); + } + tinfo->record_info_invoked++; -/* DOM filter style indexing */ -void set_record_info(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlChar * id_p, - xmlChar * rank_p, - xmlChar * action_p) -{ - printf("RECORD id=%s rank=%s action=%s\n", id_p, rank_p, action_p); } /* DOM filter style indexing */ -void process_xml_element_zebra_node(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlNodePtr node) +static void process_xml_element_zebra_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + RecWord* recword, + xmlNodePtr node) { - if (node->type == XML_ELEMENT_NODE - && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_xslt_ns)){ - - if (0 == XML_STRCMP(node->name, "index")){ - xmlChar *index_p = 0; + if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href + && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)) + { + if (0 == XML_STRCMP(node->name, "index")) + { + const char *index_p = 0; struct _xmlAttr *attr; - for (attr = node->properties; attr; attr = attr->next){ - if (attr_content_xml(attr, "name", &index_p)){ - index_value_of(tinfo, recctr, node, index_p); + for (attr = node->properties; attr; attr = attr->next) + { + if (attr_content(attr, "name", &index_p)) + { + index_value_of(tinfo, extctr, recword, node, index_p); } else - // printf("%s: dom filter: s% bad attribute %s", - // tinfo->fname, xmlGetNodePath(node)), nodeattr->name); - printf("dom filter: %s bad attribute @%s, " - "expected @name\n", - xmlGetNodePath(node), attr->name); + { + dom_log(YLOG_WARN, tinfo, node, + "bad attribute @%s, expected @name", + attr->name); + } } } - else if (0 == XML_STRCMP(node->name, "record")){ - xmlChar *id_p = 0; - xmlChar *rank_p = 0; - xmlChar *action_p = 0; + else if (0 == XML_STRCMP(node->name, "record")) + { + const char *id_p = 0; + const char *rank_p = 0; + const char *type_p = 0; struct _xmlAttr *attr; - for (attr = node->properties; attr; attr = attr->next){ - if (attr_content_xml(attr, "id", &id_p)) + for (attr = node->properties; attr; attr = attr->next) + { + if (attr_content(attr, "id", &id_p)) ; - else if (attr_content_xml(attr, "rank", &rank_p)) + else if (attr_content(attr, "rank", &rank_p)) ; - else if (attr_content_xml(attr, "acton", &action_p)) + else if (attr_content(attr, "type", &type_p)) ; else - // printf("%s: dom filter: s% bad attribute %s", - // tinfo->fname, xmlGetNodePath(node)), nodeattr->name); - printf("dom filter: %s bad attribute @%s," - " expected @id|@rank|@action\n", - xmlGetNodePath(node), attr->name); - - if (action_p && 0 != strcmp("update", (const char *)action_p)) - printf("dom filter: %s attribute @%s," - " only implemented '@action=\"update\"\n", - xmlGetNodePath(node), attr->name); - - + { + dom_log(YLOG_WARN, tinfo, node, + "bad attribute @%s, expected @id|@rank|@type", + attr->name); + } } - set_record_info(tinfo, recctr, id_p, rank_p, action_p); - } else { - // printf("%s: dom filter: s% bad attribute %s", - // tinfo->fname, xmlGetNodePath(node)), nodeattr->name); - printf("dom filter: %s bad element <%s>," - " expected | in namespace '%s'\n", - xmlGetNodePath(node), node->name, zebra_xslt_ns); - + set_record_info(tinfo, extctr, node, id_p, rank_p, type_p); + } + else + { + dom_log(YLOG_WARN, tinfo, node, + "bad element <%s>," + " expected | in namespace '%s'", + node->name, zebra_dom_ns); } } } +static int attr_content_pi(const char **c_ptr, const char *name, + char *value, size_t value_max) +{ + size_t name_len = strlen(name); + const char *look = *c_ptr; + int ret = 0; + + *value = '\0'; + while (*look && ' ' == *look) + look++; + if (strlen(look) > name_len) + { + if (look[name_len] == '=' && !memcmp(look, name, name_len)) + { + size_t i = 0; + look += name_len+1; + while (*look && ' ' != *look) + { + if (i < value_max-1) + value[i++] = *look; + look++; + } + value[i] = '\0'; + ret = 1; + } + } + while (*look && ' ' == *look) + look++; + *c_ptr = look; + return ret; +} /* DOM filter style indexing */ -void process_xml_pi_node(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlNodePtr node, - xmlChar **index_pp) +static void process_xml_pi_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + xmlNodePtr node, + const char **index_pp) { - - /* printf("PI %s\n", xmlGetNodePath(node)); */ - /* if right PI name, continue parsing PI */ - if (0 == strcmp(zebra_pi_name, (const char *)node->name)){ + if (0 == strcmp(zebra_pi_name, (const char *)node->name)) + { xmlChar *pi_p = node->content; - xmlChar *look = pi_p; + const char *look = (const char *) node->content; - xmlChar *bval; - xmlChar *eval; - /* parsing PI record instructions */ - if (0 == strncmp((const char *)look, "record", 6)){ - xmlChar id[256]; - xmlChar rank[256]; - xmlChar action[256]; - + if (0 == strncmp((const char *)look, "record", 6)) + { + char id[256]; + char rank[256]; + char type[256]; + *id = '\0'; *rank = '\0'; - *action = '\0'; - + *type = '\0'; look += 6; - - /* eat whitespace */ - while (*look && ' ' == *look && *(look+1)) - look++; - - /* parse possible id */ - if (*look && 0 == strncmp((const char *)look, "id=", 3)){ - look += 3; - bval = look; - while (*look && ' ' != *look) - look++; - eval = look; - strncpy((char *)id, (const char *)bval, eval - bval); - id[eval - bval] = '\0'; - } - - /* eat whitespace */ - while (*look && ' ' == *look && *(look+1)) - look++; - - /* parse possible rank */ - if (*look && 0 == strncmp((const char *)look, "rank=", 5)){ - look += 6; - bval = look; - while (*look && ' ' != *look) - look++; - eval = look; - strncpy((char *)rank, (const char *)bval, eval - bval); - rank[eval - bval] = '\0'; - } - - /* eat whitespace */ - while (*look && ' ' == *look && *(look+1)) - look++; - - if (look && '\0' != *look){ - printf ("ERROR %s: content '%s'; can not parse '%s'\n", - xmlGetNodePath(node), pi_p, look); - } else { - /* set_record_info(id, rank, action); */ - set_record_info(tinfo, recctr, id, rank, 0); - } - + while (*look) + if (attr_content_pi(&look, "id", id, sizeof(id))) + ; + else if (attr_content_pi(&look, "rank", rank, sizeof(rank))) + ; + else if (attr_content_pi(&look, "type", type, sizeof(type))) + { + dom_log(YLOG_WARN, tinfo, node, + "content '%s', can not parse '%s'", + pi_p, look); + break; + } + set_record_info(tinfo, extctr, node, id, rank, type); } - /* parsing index instruction */ - else if (0 == strncmp((const char *)look, "index", 5)){ + else if (0 == strncmp((const char *)look, "index", 5)) + { look += 5; /* eat whitespace */ - while (*look && ' ' == *look && *(look+1)) + while (*look && ' ' == *look) look++; /* export index instructions to outside */ *index_pp = look; - - /* nor record, neither index */ - } else { - - printf ("ERROR %s: content '%s'; can not parse '%s'\n", - xmlGetNodePath(node), pi_p, look); - } + } + else + { + dom_log(YLOG_WARN, tinfo, node, + "content '%s', can not parse '%s'", + pi_p, look); + } } } /* DOM filter style indexing */ -void process_xml_element_node(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlNodePtr node) +static void process_xml_element_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + RecWord* recword, + xmlNodePtr node) { /* remember indexing instruction from PI to next element node */ - xmlChar *index_p = 0; - - /* printf("ELEM %s\n", xmlGetNodePath(node)); */ + const char *index_p = 0; /* check if we are an element node in the special zebra namespace and either set record data or index value-of node content*/ - process_xml_element_zebra_node(tinfo, recctr, node); + process_xml_element_zebra_node(tinfo, extctr, recword, node); /* loop through kid nodes */ for (node = node->children; node; node = node->next) + { + /* check and set PI record and index index instructions */ + if (node->type == XML_PI_NODE) { - /* check and set PI record and index index instructions */ - if (node->type == XML_PI_NODE){ - process_xml_pi_node(tinfo, recctr, node, &index_p); - } - else if (node->type == XML_ELEMENT_NODE){ - /* if there was a PI index instruction before this element */ - if (index_p){ - index_value_of(tinfo, recctr, node, index_p); - index_p = 0; - } - process_xml_element_node(tinfo, recctr, node); + process_xml_pi_node(tinfo, extctr, node, &index_p); + } + else if (node->type == XML_ELEMENT_NODE) + { + /* if there was a PI index instruction before this element */ + if (index_p) + { + index_value_of(tinfo, extctr, recword, node, index_p); + index_p = 0; } - else - continue; + process_xml_element_node(tinfo, extctr, recword,node); } + else + continue; + } } /* DOM filter style indexing */ -void extract_dom_doc_node(struct filter_info *tinfo, - struct recExtractCtrl *recctr, - xmlDocPtr doc) +static void extract_dom_doc_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + xmlDocPtr doc) { - /* printf("DOC %s\n", xmlGetNodePath((xmlNodePtr)doc)); */ + /* only need to do the initialization once, reuse recword for all terms */ + RecWord recword; + (*extctr->init)(extctr, &recword); - process_xml_element_node(tinfo, recctr, (xmlNodePtr)doc); + process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc); } - - static int convert_extract_doc(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p, xmlDocPtr doc) - { - /* RecWord recWord; */ xmlChar *buf_out; int len_out; const char *params[10]; xsltStylesheetPtr last_xsp = 0; - xmlDocPtr store_doc = 0; + /* per default do not ingest record */ + tinfo->record_info_invoked = 0; + + /* exit if empty document given */ + if (!doc) + return RECCTRL_EXTRACT_SKIP; + + /* we actuallu have a document which needs to be processed further */ params[0] = 0; - set_param_str(params, "schema", zebra_xslt_ns, tinfo->odr_record); + set_param_str(params, "schema", zebra_dom_ns, tinfo->nmem_record); - /* input conversion */ - perform_convert(tinfo, input->convert, params, &doc, 0); + if (p && p->flagShowRecords) + { + xmlChar *buf_out; + int len_out; + xmlDocDumpMemory(doc, &buf_out, &len_out); +#if 0 + FILE *outf = fopen("extract.xml", "w"); + fwrite(buf_out, 1, len_out, outf); + fclose(outf); +#endif + yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out); + } - if (tinfo->store) + if (p->setStoreData) + { + xmlDocPtr store_doc = 0; + + /* input conversion */ + perform_convert(tinfo, p, 0, input->convert, params, &doc, 0); + + if (tinfo->store) { /* store conversion */ store_doc = xmlCopyDoc(doc, 1); - perform_convert(tinfo, tinfo->store->convert, + perform_convert(tinfo, p, 0, tinfo->store->convert, params, &store_doc, &last_xsp); } - - if (last_xsp) - xsltSaveResultToString(&buf_out, &len_out, - store_doc ? store_doc : doc, last_xsp); - else - xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out); - if (p->flagShowRecords) - fwrite(buf_out, len_out, 1, stdout); - (*p->setStoreData)(p, buf_out, len_out); - xmlFree(buf_out); + + /* saving either store doc or original doc in case no store doc exists */ + if (last_xsp) + xsltSaveResultToString(&buf_out, &len_out, + store_doc ? store_doc : doc, last_xsp); + else + xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out); + + if (p->setStoreData) + (*p->setStoreData)(p, buf_out, len_out); + xmlFree(buf_out); + if (store_doc) + xmlFreeDoc(store_doc); + } - if (store_doc) - xmlFreeDoc(store_doc); /* extract conversion */ - perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0); + perform_convert(tinfo, p, 0, tinfo->extract->convert, params, &doc, 0); + /* finally, do the indexing */ if (doc){ extract_dom_doc_node(tinfo, p, doc); - extract_doc_alvis(tinfo, p, doc); xmlFreeDoc(doc); } + + /* there was nothing to index, so there is no inserted/updated record */ + if (tinfo->record_info_invoked == 0) + return RECCTRL_EXTRACT_SKIP; return RECCTRL_EXTRACT_OK; } @@ -1138,47 +1242,67 @@ static int extract_xml_split(struct filter_info *tinfo, int ret; if (p->first_record) - { - if (input->u.xmlreader.reader) - xmlFreeTextReader(input->u.xmlreader.reader); - input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex, - p /* I/O handler */, - 0 /* URL */, - 0 /* encoding */, - XML_PARSE_XINCLUDE| - XML_PARSE_NOENT); - } + { + if (input->u.xmlreader.reader) + xmlFreeTextReader(input->u.xmlreader.reader); + input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex, + p /* I/O handler */, + 0 /* URL */, + 0 /* encoding */, + XML_PARSE_XINCLUDE + | XML_PARSE_NOENT + | XML_PARSE_NONET); + } if (!input->u.xmlreader.reader) return RECCTRL_EXTRACT_ERROR_GENERIC; ret = xmlTextReaderRead(input->u.xmlreader.reader); while (ret == 1) + { + int type = xmlTextReaderNodeType(input->u.xmlreader.reader); + int depth = xmlTextReaderDepth(input->u.xmlreader.reader); + + if (type == XML_READER_TYPE_ELEMENT && + input->u.xmlreader.split_level == depth) { - int type = xmlTextReaderNodeType(input->u.xmlreader.reader); - int depth = xmlTextReaderDepth(input->u.xmlreader.reader); - if (type == XML_READER_TYPE_ELEMENT && - input->u.xmlreader.split_level == depth) - { - xmlNodePtr ptr - = xmlTextReaderExpand(input->u.xmlreader.reader); - if (ptr) - { - xmlNodePtr ptr2 = xmlCopyNode(ptr, 1); - xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0"); + xmlNodePtr ptr; + + /* per default do not ingest record */ + tinfo->record_info_invoked = 0; + + ptr = xmlTextReaderExpand(input->u.xmlreader.reader); + if (ptr) + { + /* we have a new document */ + + xmlNodePtr ptr2 = xmlCopyNode(ptr, 1); + xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0"); - xmlDocSetRootElement(doc, ptr2); + xmlDocSetRootElement(doc, ptr2); - return convert_extract_doc(tinfo, input, p, doc); - } - else - { - xmlFreeTextReader(input->u.xmlreader.reader); - input->u.xmlreader.reader = 0; - return RECCTRL_EXTRACT_ERROR_GENERIC; - } + /* writing debug info out */ + if (p->flagShowRecords) + { + xmlChar *buf_out = 0; + int len_out = 0; + xmlDocDumpMemory(doc, &buf_out, &len_out); + yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s", + tinfo->fname ? tinfo->fname : "(none)", + depth, len_out, buf_out); + xmlFree(buf_out); } - ret = xmlTextReaderRead(input->u.xmlreader.reader); + + return convert_extract_doc(tinfo, input, p, doc); + } + else + { + xmlFreeTextReader(input->u.xmlreader.reader); + input->u.xmlreader.reader = 0; + return RECCTRL_EXTRACT_ERROR_GENERIC; + } } + ret = xmlTextReaderRead(input->u.xmlreader.reader); + } xmlFreeTextReader(input->u.xmlreader.reader); input->u.xmlreader.reader = 0; return RECCTRL_EXTRACT_EOF; @@ -1189,18 +1313,20 @@ static int extract_xml_full(struct filter_info *tinfo, struct recExtractCtrl *p) { if (p->first_record) /* only one record per stream */ + { + xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, + p /* I/O handler */, + 0 /* URL */, + 0 /* encoding */, + XML_PARSE_XINCLUDE + | XML_PARSE_NOENT + | XML_PARSE_NONET); + if (!doc) { - xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, - p /* I/O handler */, - 0 /* URL */, - 0 /* encoding */, - XML_PARSE_XINCLUDE|XML_PARSE_NOENT); - if (!doc) - { - return RECCTRL_EXTRACT_ERROR_GENERIC; - } - return convert_extract_doc(tinfo, input, p, doc); + return RECCTRL_EXTRACT_ERROR_GENERIC; } + return convert_extract_doc(tinfo, input, p, doc); + } else return RECCTRL_EXTRACT_EOF; } @@ -1216,46 +1342,50 @@ static int extract_iso2709(struct filter_info *tinfo, if (p->stream->readf(p->stream, buf, 5) != 5) return RECCTRL_EXTRACT_EOF; while (*buf < '0' || *buf > '9') - { - int i; + { + int i; - yaz_log(YLOG_WARN, "MARC: Skipping bad byte %d (0x%02X)", - *buf & 0xff, *buf & 0xff); - for (i = 0; i<4; i++) - buf[i] = buf[i+1]; + dom_log(YLOG_WARN, tinfo, 0, + "MARC: Skipping bad byte %d (0x%02X)", + *buf & 0xff, *buf & 0xff); + for (i = 0; i<4; i++) + buf[i] = buf[i+1]; - if (p->stream->readf(p->stream, buf+4, 1) != 1) - return RECCTRL_EXTRACT_EOF; - } + if (p->stream->readf(p->stream, buf+4, 1) != 1) + return RECCTRL_EXTRACT_EOF; + } record_length = atoi_n (buf, 5); if (record_length < 25) - { - yaz_log (YLOG_WARN, "MARC record length < 25, is %d", - record_length); - return RECCTRL_EXTRACT_ERROR_GENERIC; - } + { + dom_log(YLOG_WARN, tinfo, 0, + "MARC record length < 25, is %d", record_length); + return RECCTRL_EXTRACT_ERROR_GENERIC; + } read_bytes = p->stream->readf(p->stream, buf+5, record_length-5); if (read_bytes < record_length-5) - { - yaz_log (YLOG_WARN, "Couldn't read whole MARC record"); - return RECCTRL_EXTRACT_ERROR_GENERIC; - } + { + dom_log(YLOG_WARN, tinfo, 0, + "couldn't read whole MARC record"); + return RECCTRL_EXTRACT_ERROR_GENERIC; + } r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length); if (r < record_length) - { - yaz_log (YLOG_WARN, "Parsing of MARC record failed r=%d length=%d", - r, record_length); - return RECCTRL_EXTRACT_ERROR_GENERIC; - } + { + dom_log (YLOG_WARN, tinfo, 0, + "parsing of MARC record failed r=%d length=%d", + r, record_length); + return RECCTRL_EXTRACT_ERROR_GENERIC; + } else - { - xmlDocPtr rdoc; - xmlNode *root_ptr; - yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0); - rdoc = xmlNewDoc((const xmlChar*) "1.0"); - xmlDocSetRootElement(rdoc, root_ptr); - return convert_extract_doc(tinfo, input, p, rdoc); - } + { + xmlDocPtr rdoc; + xmlNode *root_ptr; + yaz_marc_write_xml(input->u.marc.handle, &root_ptr, + "http://www.loc.gov/MARC21/slim", 0, 0); + rdoc = xmlNewDoc((const xmlChar*) "1.0"); + xmlDocSetRootElement(rdoc, root_ptr); + return convert_extract_doc(tinfo, input, p, rdoc); + } return RECCTRL_EXTRACT_OK; } @@ -1266,26 +1396,30 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p) if (!input) return RECCTRL_EXTRACT_ERROR_GENERIC; + + nmem_reset(tinfo->nmem_record); - odr_reset(tinfo->odr_record); + if (p->setStoreData == 0) + return extract_xml_full(tinfo, input, p); switch(input->type) - { - case DOM_INPUT_XMLREADER: - if (input->u.xmlreader.split_level == 0) - return extract_xml_full(tinfo, input, p); - else - return extract_xml_split(tinfo, input, p); - break; - case DOM_INPUT_MARC: - return extract_iso2709(tinfo, input, p); - } + { + case DOM_INPUT_XMLREADER: + if (input->u.xmlreader.split_level == 0) + return extract_xml_full(tinfo, input, p); + else + return extract_xml_split(tinfo, input, p); + break; + case DOM_INPUT_MARC: + return extract_iso2709(tinfo, input, p); + } return RECCTRL_EXTRACT_ERROR_GENERIC; } static int ioread_ret(void *context, char *buffer, int len) { struct recRetrieveCtrl *p = context; - return p->stream->readf(p->stream, buffer, len); + int r = p->stream->readf(p->stream, buffer, len); + return r; } static int ioclose_ret(void *context) @@ -1293,9 +1427,9 @@ static int ioclose_ret(void *context) return 0; } -static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) +static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p) { - /* const char *esn = zebra_xslt_ns; */ + /* const char *esn = zebra_dom_ns; */ const char *esn = 0; const char *params[32]; struct filter_info *tinfo = clientData; @@ -1304,102 +1438,104 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) xsltStylesheetPtr last_xsp = 0; if (p->comp) + { + if (p->comp->which == Z_RecordComp_simple + && p->comp->u.simple->which == Z_ElementSetNames_generic) { - if (p->comp->which == Z_RecordComp_simple - && p->comp->u.simple->which == Z_ElementSetNames_generic) - { - esn = p->comp->u.simple->u.generic; - } - else if (p->comp->which == Z_RecordComp_complex - && p->comp->u.complex->generic->elementSpec - && p->comp->u.complex->generic->elementSpec->which == - Z_ElementSpec_elementSetName) - { - esn = p->comp->u.complex->generic->elementSpec->u.elementSetName; - } + esn = p->comp->u.simple->u.generic; } - retrieve = lookup_retrieve(tinfo, esn); - if (!retrieve) + else if (p->comp->which == Z_RecordComp_complex + && p->comp->u.complex->generic->elementSpec + && p->comp->u.complex->generic->elementSpec->which == + Z_ElementSpec_elementSetName) { - p->diagnostic = - YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; - return 0; + esn = p->comp->u.complex->generic->elementSpec->u.elementSetName; } + } + retrieve = lookup_retrieve(tinfo, esn); + if (!retrieve) + { + p->diagnostic = + YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; + p->addinfo = odr_strdup(p->odr, esn); + return 0; + } params[0] = 0; - set_param_int(params, "id", p->localno, p->odr); + set_param_int(params, "id", p->localno, p->odr->mem); if (p->fname) - set_param_str(params, "filename", p->fname, p->odr); + set_param_str(params, "filename", p->fname, p->odr->mem); if (p->staticrank >= 0) - set_param_int(params, "rank", p->staticrank, p->odr); + set_param_int(params, "rank", p->staticrank, p->odr->mem); if (esn) - set_param_str(params, "schema", esn, p->odr); + set_param_str(params, "schema", esn, p->odr->mem); else if (retrieve->name) - set_param_str(params, "schema", retrieve->name, p->odr); + set_param_str(params, "schema", retrieve->name, p->odr->mem); else if (retrieve->identifier) - set_param_str(params, "schema", retrieve->identifier, p->odr); + set_param_str(params, "schema", retrieve->identifier, p->odr->mem); else - set_param_str(params, "schema", "", p->odr); + set_param_str(params, "schema", "", p->odr->mem); if (p->score >= 0) - set_param_int(params, "score", p->score, p->odr); - set_param_int(params, "size", p->recordSize, p->odr); + set_param_int(params, "score", p->score, p->odr->mem); + set_param_int(params, "size", p->recordSize, p->odr->mem); doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE|XML_PARSE_NOENT); + XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET); if (!doc) - { - p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; - return 0; - } + { + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + return 0; + } /* retrieve conversion */ - perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp); + perform_convert(tinfo, 0, p, retrieve->convert, params, &doc, &last_xsp); if (!doc) - { - p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; - } - else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML) - { - xmlChar *buf_out; - int len_out; + { + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + } + else if (!p->input_format + || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml)) + { + xmlChar *buf_out; + int len_out; - if (last_xsp) - xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp); - else - xmlDocDumpMemory(doc, &buf_out, &len_out); + if (last_xsp) + xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp); + else + xmlDocDumpMemory(doc, &buf_out, &len_out); - p->output_format = VAL_TEXT_XML; - p->rec_len = len_out; - p->rec_buf = odr_malloc(p->odr, p->rec_len); - memcpy(p->rec_buf, buf_out, p->rec_len); - xmlFree(buf_out); - } - else if (p->output_format == VAL_SUTRS) - { - xmlChar *buf_out; - int len_out; + p->output_format = yaz_oid_recsyn_xml; + p->rec_len = len_out; + p->rec_buf = odr_malloc(p->odr, p->rec_len); + memcpy(p->rec_buf, buf_out, p->rec_len); + xmlFree(buf_out); + } + else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs)) + { + xmlChar *buf_out; + int len_out; - if (last_xsp) - xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp); - else - xmlDocDumpMemory(doc, &buf_out, &len_out); + if (last_xsp) + xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp); + else + xmlDocDumpMemory(doc, &buf_out, &len_out); - p->output_format = VAL_SUTRS; - p->rec_len = len_out; - p->rec_buf = odr_malloc(p->odr, p->rec_len); - memcpy(p->rec_buf, buf_out, p->rec_len); + p->output_format = yaz_oid_recsyn_sutrs; + p->rec_len = len_out; + p->rec_buf = odr_malloc(p->odr, p->rec_len); + memcpy(p->rec_buf, buf_out, p->rec_len); - xmlFree(buf_out); - } + xmlFree(buf_out); + } else - { - p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP; - } + { + p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP; + } xmlFreeDoc(doc); return 0; } @@ -1428,6 +1564,7 @@ idzebra_filter /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab