X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fmod_dom.c;h=fca04eb75ce2c6812a49a48b8af6184080a8a4e2;hb=7bc692d5bc6971008fc2e1e37d63c080383d6a3e;hp=102a51af9cf456cdcf95d6e42f8040568c49cff5;hpb=c5365d8095f29747f5998028934cfc034d038673;p=idzebra-moved-to-github.git diff --git a/index/mod_dom.c b/index/mod_dom.c index 102a51a..fca04eb 100644 --- a/index/mod_dom.c +++ b/index/mod_dom.c @@ -1,31 +1,33 @@ -/* $Id: mod_dom.c,v 1.2 2007-02-12 10:33:51 adam Exp $ +/* $Id: mod_dom.c,v 1.26 2007-03-03 21:39:10 adam Exp $ Copyright (C) 1995-2007 Index Data ApS -This file is part of the Zebra server. + This file is part of the Zebra server. -Zebra is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2, or (at your option) any later -version. + Zebra is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. -Zebra is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. + Zebra is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include +#include #include #include +#include #include #include @@ -42,6 +44,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include +/* DOM filter style indexing */ +#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0" +static const char *zebra_dom_ns = ZEBRA_DOM_NS; + +/* DOM filter style indexing */ +#define ZEBRA_PI_NAME "zebra-2.0" +static const char *zebra_pi_name = ZEBRA_PI_NAME; + + + struct convert_s { const char *stylesheet; xsltStylesheetPtr stylesheet_xsp; @@ -96,11 +108,45 @@ struct filter_info { struct filter_retrieve *retrieve_list; struct filter_input *input_list; struct filter_store *store; + int record_info_invoked; }; + + #define XML_STRCMP(a,b) strcmp((char*)a, b) #define XML_STRLEN(a) strlen((char*)a) + +#define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE) + +static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr, + const char *fmt, ...) +#ifdef __GNUC__ + __attribute__ ((format (printf, 4, 5))) +#endif + ; + +static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr, + const char *fmt, ...) +{ + va_list ap; + char buf[4096]; + + va_start(ap, fmt); + yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap); + if (ptr) + { + yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none", + xmlGetLineNo(ptr), buf); + } + else + { + yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf); + } + va_end(ap); +} + + static void set_param_str(const char **params, const char *name, const char *value, ODR odr) { @@ -138,6 +184,7 @@ static void *filter_init(Res res, RecType recType) tinfo->input_list = 0; tinfo->store = 0; tinfo->doc_config = 0; + tinfo->record_info_invoked = 0; #if YAZ_HAVE_EXSLT exsltRegisterAll(); @@ -152,8 +199,8 @@ static int attr_content(struct _xmlAttr *attr, const char *name, if (!XML_STRCMP(attr->name, name) && attr->children && attr->children->type == XML_TEXT_NODE) { - *dst_content = (const char *)(attr->children->content); - return 1; + *dst_content = (const char *)(attr->children->content); + return 1; } return 0; } @@ -220,73 +267,73 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr, struct convert_s **l) { *l = 0; - for(; ptr; ptr = ptr->next) - { - if (ptr->type != XML_ELEMENT_NODE) - continue; + FOR_EACH_ELEMENT(ptr) { if (!XML_STRCMP(ptr->name, "xslt")) { struct _xmlAttr *attr; - struct convert_s *p = odr_malloc(tinfo->odr_config, sizeof(*p)); - + struct convert_s *p + = odr_malloc(tinfo->odr_config, sizeof(*p)); + p->next = 0; p->stylesheet = 0; p->stylesheet_xsp = 0; - + for (attr = ptr->properties; attr; attr = attr->next) if (attr_content(attr, "stylesheet", &p->stylesheet)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s", attr->name); + } if (p->stylesheet) { char tmp_xslt_full_name[1024]; - if (!yaz_filepath_resolve(p->stylesheet, tinfo->profile_path, - NULL, tmp_xslt_full_name)) + if (!yaz_filepath_resolve(p->stylesheet, + tinfo->profile_path, + NULL, + tmp_xslt_full_name)) { - yaz_log(YLOG_WARN, - "%s: dom filter: stylesheet %s not found in " + dom_log(YLOG_WARN, tinfo, 0, + "stylesheet %s not found in " "path %s", - tinfo->fname, - p->stylesheet, tinfo->profile_path); + p->stylesheet, + tinfo->profile_path); return ZEBRA_FAIL; } p->stylesheet_xsp - = xsltParseStylesheetFile((const xmlChar*) tmp_xslt_full_name); + = xsltParseStylesheetFile((const xmlChar*) + tmp_xslt_full_name); if (!p->stylesheet_xsp) { - yaz_log(YLOG_WARN, - "%s: dom filter: could not parse xslt " - "stylesheet %s", - tinfo->fname, tmp_xslt_full_name); + dom_log(YLOG_WARN, tinfo, 0, + "could not parse xslt stylesheet %s", + tmp_xslt_full_name); return ZEBRA_FAIL; } - } - else - { - yaz_log(YLOG_WARN, - "%s: dom filter: missing attribute 'stylesheet' " - "for element 'xslt'", tinfo->fname); - return ZEBRA_FAIL; - } - *l = p; - l = &p->next; + } + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "missing attribute 'stylesheet' "); + return ZEBRA_FAIL; + } + *l = p; + l = &p->next; } else { - yaz_log(YLOG_LOG, "%s: dom filter: bad node '%s' for ", - tinfo->fname, ptr->name); + dom_log(YLOG_WARN, tinfo, ptr, + "bad element '%s', expected ", ptr->name); return ZEBRA_FAIL; } - } return ZEBRA_OK; } static ZEBRA_RES perform_convert(struct filter_info *tinfo, + struct recExtractCtrl *extctr, struct convert_s *convert, const char **params, xmlDocPtr *doc, @@ -294,12 +341,34 @@ static ZEBRA_RES perform_convert(struct filter_info *tinfo, { for (; convert; convert = convert->next) { + xmlChar *buf_out = 0; + int len_out = 0; xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp, - *doc, params); + *doc, params); if (last_xsp) *last_xsp = convert->stylesheet_xsp; + + if (!res_doc) + break; + + /* now saving into buffer and re-reading into DOM to avoid annoing + XSLT problem with thrown-out indentation text nodes */ + xsltSaveResultToString(&buf_out, &len_out, res_doc, + convert->stylesheet_xsp); + xmlFreeDoc(res_doc); + xmlFreeDoc(*doc); - *doc = res_doc; + + *doc = xmlParseMemory((const char *) buf_out, len_out); + + /* writing debug info out */ + if (extctr->flagShowRecords) + yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s", + tinfo->fname ? tinfo->fname : "(none)", + convert->stylesheet, + len_out, buf_out); + + xmlFree(buf_out); } return ZEBRA_OK; } @@ -320,39 +389,37 @@ static struct filter_input *new_input(struct filter_info *tinfo, int type) } static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr, - const char *syntax, - const char *name) + const char *syntax, const char *name) { - for (; ptr; ptr = ptr->next) - { - if (ptr->type != XML_ELEMENT_NODE) - continue; + FOR_EACH_ELEMENT(ptr) { if (!XML_STRCMP(ptr->name, "marc")) { yaz_iconv_t iconv = 0; const char *input_charset = "marc-8"; - struct _xmlAttr *attr; + struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "charset", &input_charset)) + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "inputcharset", &input_charset)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @inputcharset", + attr->name); + } + } iconv = yaz_iconv_open("utf-8", input_charset); if (!iconv) { - yaz_log(YLOG_WARN, "%s: dom filter: unsupported charset " - "'%s' for ", - tinfo->fname, input_charset); + dom_log(YLOG_WARN, tinfo, ptr, + "unsupported @charset '%s'", input_charset); return ZEBRA_FAIL; } else { - struct filter_input *p = new_input(tinfo, DOM_INPUT_MARC); + struct filter_input *p + = new_input(tinfo, DOM_INPUT_MARC); p->u.marc.handle = yaz_marc_create(); p->u.marc.iconv = iconv; @@ -367,22 +434,25 @@ static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr, } else if (!XML_STRCMP(ptr->name, "xmlreader")) { - struct filter_input *p = new_input(tinfo, DOM_INPUT_XMLREADER); - struct _xmlAttr *attr; + struct filter_input *p + = new_input(tinfo, DOM_INPUT_XMLREADER); + struct _xmlAttr *attr; const char *level_str = 0; p->u.xmlreader.split_level = 0; p->u.xmlreader.reader = 0; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "level", &level_str)) + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "level", &level_str)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @level", + attr->name); + } + } if (level_str) p->u.xmlreader.split_level = atoi(level_str); @@ -393,8 +463,9 @@ static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr, } else { - yaz_log(YLOG_WARN, "%s: dom filter: bad input type %s", - tinfo->fname, ptr->name); + dom_log(YLOG_WARN, tinfo, ptr, + "bad element <%s>, expected |", + ptr->name); return ZEBRA_FAIL; } } @@ -415,14 +486,16 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) else tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname); - yaz_log(YLOG_LOG, "dom filter: loading config file %s", tinfo->full_name); - + yaz_log(YLOG_LOG, "%s dom filter: " + "loading config file %s", tinfo->fname, tinfo->full_name); + doc = xmlParseFile(tinfo->full_name); if (!doc) { - yaz_log(YLOG_WARN, "%s: dom filter: failed to parse config file %s", + yaz_log(YLOG_WARN, "%s: dom filter: " + "failed to parse config file %s", tinfo->fname, tinfo->full_name); - return ZEBRA_FAIL; + return ZEBRA_FAIL; } /* save because we store ptrs to the content */ tinfo->doc_config = doc; @@ -431,16 +504,14 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) if (!ptr || ptr->type != XML_ELEMENT_NODE || XML_STRCMP(ptr->name, "dom")) { - yaz_log(YLOG_WARN, - "%s: dom filter: expected root element ", - tinfo->fname); + dom_log(YLOG_WARN, tinfo, ptr, + "bad root element <%s>, expected root element ", + ptr->name); return ZEBRA_FAIL; } - for (ptr = ptr->children; ptr; ptr = ptr->next) - { - if (ptr->type != XML_ELEMENT_NODE) - continue; + ptr = ptr->children; + FOR_EACH_ELEMENT(ptr) { if (!XML_STRCMP(ptr->name, "extract")) { /* @@ -449,34 +520,35 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) */ - struct _xmlAttr *attr; + struct _xmlAttr *attr; struct filter_extract *f = odr_malloc(tinfo->odr_config, sizeof(*f)); tinfo->extract = f; f->name = 0; f->convert = 0; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "name", &f->name)) + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "name", &f->name)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - - } + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @name", + attr->name); + } + } parse_convert(tinfo, ptr->children, &f->convert); } - else if (!XML_STRCMP(ptr->name, "retrieve")) - { + else if (!XML_STRCMP(ptr->name, "retrieve")) + { /* */ - struct _xmlAttr *attr; + struct _xmlAttr *attr; struct filter_retrieve **fp = &tinfo->retrieve_list; struct filter_retrieve *f = odr_malloc(tinfo->odr_config, sizeof(*f)); @@ -490,26 +562,29 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) f->convert = 0; f->next = 0; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "identifier", &f->identifier)) + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "identifier", + &f->identifier)) ; else if (attr_content(attr, "name", &f->name)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @identifier|@name", + attr->name); + } + } parse_convert(tinfo, ptr->children, &f->convert); - } - else if (!XML_STRCMP(ptr->name, "store")) - { + } + else if (!XML_STRCMP(ptr->name, "store")) + { /* - - - - + + + + */ struct filter_store *f = odr_malloc(tinfo->odr_config, sizeof(*f)); @@ -518,8 +593,8 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) f->convert = 0; parse_convert(tinfo, ptr->children, &f->convert); } - else if (!XML_STRCMP(ptr->name, "input")) - { + else if (!XML_STRCMP(ptr->name, "input")) + { /* @@ -528,28 +603,32 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname) */ - struct _xmlAttr *attr; + struct _xmlAttr *attr; const char *syntax = 0; const char *name = 0; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "syntax", &syntax)) + for (attr = ptr->properties; attr; attr = attr->next) + { + if (attr_content(attr, "syntax", &syntax)) ; else if (attr_content(attr, "name", &name)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad attribute @%s, expected @syntax|@name", + attr->name); + } + } parse_input(tinfo, ptr->children, syntax, name); - } - else - { - yaz_log(YLOG_WARN, "%s: dom filter: bad element %s", - tinfo->fname, ptr->name); - return ZEBRA_FAIL; - } + } + else + { + dom_log(YLOG_WARN, tinfo, ptr, + "bad element <%s>, " + "expected |||", + ptr->name); + return ZEBRA_FAIL; + } } return ZEBRA_OK; } @@ -565,13 +644,13 @@ static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo, for (; f; f = f->next) { /* find requested schema */ - if (est) - { - if (f->identifier && !strcmp(f->identifier, est)) + if (est) + { + if (f->identifier && !strcmp(f->identifier, est)) return f; - if (f->name && !strcmp(f->name, est)) + if (f->name && !strcmp(f->name, est)) return f; - } + } } return 0; } @@ -614,133 +693,400 @@ static int ioclose_ex(void *context) return 0; } -static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl, - xmlNodePtr ptr, RecWord *recWord) + +/* DOM filter style indexing */ +static int attr_content_xml(struct _xmlAttr *attr, const char *name, + xmlChar **dst_content) { - for(; ptr; ptr = ptr->next) + if (0 == XML_STRCMP(attr->name, name) && attr->children + && attr->children->type == XML_TEXT_NODE) { - index_cdata(tinfo, ctrl, ptr->children, recWord); - if (ptr->type != XML_TEXT_NODE) - continue; - recWord->term_buf = (const char *)ptr->content; - recWord->term_len = XML_STRLEN(ptr->content); - (*ctrl->tokenAdd)(recWord); + *dst_content = (attr->children->content); + return 1; } + return 0; } -#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1" +/* DOM filter style indexing */ +static void index_value_of(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + RecWord* recword, + xmlNodePtr node, + xmlChar * index_p) +{ + if (tinfo->record_info_invoked == 1) + { + xmlChar *text = xmlNodeGetContent(node); + size_t text_len = strlen((const char *)text); + + /* if there is no text, we do not need to proceed */ + if (text_len) + { + xmlChar *look = index_p; + xmlChar *bval; + xmlChar *eval; + + xmlChar index[256]; + xmlChar type[256]; + + /* assingning text to be indexed */ + recword->term_buf = (const char *)text; + recword->term_len = text_len; + + /* parsing all index name/type pairs */ + /* may not start with ' ' or ':' */ + while (*look && ' ' != *look && ':' != *look) + { + /* setting name and type to zero */ + *index = '\0'; + *type = '\0'; + + /* parsing one index name */ + bval = look; + while (*look && ':' != *look && ' ' != *look) + { + look++; + } + eval = look; + strncpy((char *)index, (const char *)bval, eval - bval); + index[eval - bval] = '\0'; + + + /* parsing one index type, if existing */ + if (':' == *look) + { + look++; + + bval = look; + while (*look && ' ' != *look) + { + look++; + } + eval = look; + strncpy((char *)type, (const char *)bval, eval - bval); + type[eval - bval] = '\0'; + } + + /* actually indexing the text given */ + dom_log(YLOG_DEBUG, tinfo, 0, + "INDEX '%s:%s' '%s'", + index ? (const char *) index : "null", + type ? (const char *) type : "null", + text ? (const char *) text : "null"); + + recword->index_name = (const char *)index; + if (type && *type) + recword->index_type = *type; + + /* writing debug out */ + if (extctr->flagShowRecords) + dom_log(YLOG_LOG, tinfo, 0, + "INDEX '%s:%s' '%s'", + index ? (const char *) index : "null", + type ? (const char *) type : "null", + text ? (const char *) text : "null"); + + /* actually indexing the text given */ + recword->index_name = (const char *)index; + if (type && *type) + recword->index_type = *type; + (extctr->tokenAdd)(recword); + + /* eat whitespaces */ + if (*look && ' ' == *look && *(look+1)) + { + look++; + } + } + } + xmlFree(text); + } +} -static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS; -static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl, - xmlNodePtr ptr, RecWord *recWord) +/* DOM filter style indexing */ +static void set_record_info(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + xmlNodePtr node, + xmlChar * id_p, + xmlChar * rank_p, + xmlChar * type_p) { - for(; ptr; ptr = ptr->next) + + /* writing debug info out */ + if (extctr->flagShowRecords) + dom_log(YLOG_LOG, tinfo, 0, + "RECORD id=%s rank=%s type=%s", + id_p ? (const char *) id_p : "(null)", + rank_p ? (const char *) rank_p : "(null)", + type_p ? (const char *) type_p : "(null)"); + + + if (id_p) + sscanf((const char *)id_p, "%255s", extctr->match_criteria); + + if (rank_p) + extctr->staticrank = atozint((const char *)rank_p); + + /* if (!strcmp("update", type_str)) */ + /* index_node(tinfo, ctrl, ptr, recword); */ + /* else if (!strcmp("delete", type_str)) */ + /* dom_log(YLOG_WARN, tinfo, ptr, "dom filter delete: to be implemented"); */ + /* else */ + /* dom_log(YLOG_WARN, tinfo, ptr, "dom filter: unknown record type '%s'", */ + /* type_str); */ + if (tinfo->record_info_invoked == 1) { - index_node(tinfo, ctrl, ptr->children, recWord); - if (ptr->type != XML_ELEMENT_NODE || !ptr->ns || - XML_STRCMP(ptr->ns->href, zebra_xslt_ns)) - continue; - if (!XML_STRCMP(ptr->name, "index")) - { - const char *name_str = 0; - const char *type_str = 0; - const char *xpath_str = 0; - struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "name", &name_str)) + /* warn about multiple only once */ + dom_log(YLOG_WARN, tinfo, node, "multiple record elements"); + } + tinfo->record_info_invoked++; + +} + + +/* DOM filter style indexing */ +static void process_xml_element_zebra_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + RecWord* recword, + xmlNodePtr node) +{ + if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href + && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)) + { + if (0 == XML_STRCMP(node->name, "index")) + { + xmlChar *index_p = 0; + + struct _xmlAttr *attr; + for (attr = node->properties; attr; attr = attr->next) + { + if (attr_content_xml(attr, "name", &index_p)) + { + index_value_of(tinfo, extctr, recword,node, index_p); + } + else + { + dom_log(YLOG_WARN, tinfo, node, + "bad attribute @%s, expected @name", + attr->name); + } + } + } + else if (0 == XML_STRCMP(node->name, "record")) + { + xmlChar *id_p = 0; + xmlChar *rank_p = 0; + xmlChar *type_p = 0; + + struct _xmlAttr *attr; + for (attr = node->properties; attr; attr = attr->next) + { + if (attr_content_xml(attr, "id", &id_p)) ; - else if (attr_content(attr, "xpath", &xpath_str)) + else if (attr_content_xml(attr, "rank", &rank_p)) ; - else if (attr_content(attr, "type", &type_str)) + else if (attr_content_xml(attr, "type", &type_p)) ; else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - if (name_str) - { - int prev_type = recWord->index_type; /* save default type */ - - if (type_str && *type_str) - recWord->index_type = *type_str; /* type was given */ - recWord->index_name = name_str; - index_cdata(tinfo, ctrl, ptr->children, recWord); - - recWord->index_type = prev_type; /* restore it again */ - } - } + { + dom_log(YLOG_WARN, tinfo, node, + "bad attribute @%s, expected @id|@rank|@type", + attr->name); + } + + if (type_p && 0 != strcmp("update", (const char *)type_p)) + { + dom_log(YLOG_WARN, tinfo, node, + "attribute @%s, only implemented '@type='update'", + attr->name); + } + } + set_record_info(tinfo, extctr, node, id_p, rank_p, type_p); + } + else + { + dom_log(YLOG_WARN, tinfo, node, + "bad element <%s>," + " expected | in namespace '%s'", + node->name, zebra_dom_ns); + } + } +} + + +/* DOM filter style indexing */ +static void process_xml_pi_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + xmlNodePtr node, + xmlChar **index_pp) +{ + /* if right PI name, continue parsing PI */ + if (0 == strcmp(zebra_pi_name, (const char *)node->name)) + { + xmlChar *pi_p = node->content; + xmlChar *look = pi_p; + + xmlChar *bval; + xmlChar *eval; + + /* parsing PI record instructions */ + if (0 == strncmp((const char *)look, "record", 6)) + { + xmlChar id[256]; + xmlChar rank[256]; + xmlChar type[256]; + + *id = '\0'; + *rank = '\0'; + *type = '\0'; + + look += 6; + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + /* parse possible id */ + if (*look && 0 == strncmp((const char *)look, "id=", 3)) + { + look += 3; + bval = look; + while (*look && ' ' != *look) + look++; + eval = look; + strncpy((char *)id, (const char *)bval, eval - bval); + id[eval - bval] = '\0'; + } + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + /* parse possible rank */ + if (*look && 0 == strncmp((const char *)look, "rank=", 5)) + { + look += 6; + bval = look; + while (*look && ' ' != *look) + look++; + eval = look; + strncpy((char *)rank, (const char *)bval, eval - bval); + rank[eval - bval] = '\0'; + } + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + if (look && '\0' != *look) + { + dom_log(YLOG_WARN, tinfo, node, + "content '%s', can not parse '%s'", + pi_p, look); + } + else + set_record_info(tinfo, extctr, node, id, rank, 0); + + } + /* parsing index instruction */ + else if (0 == strncmp((const char *)look, "index", 5)) + { + look += 5; + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + /* export index instructions to outside */ + *index_pp = look; + } + else + { + dom_log(YLOG_WARN, tinfo, node, + "content '%s', can not parse '%s'", + pi_p, look); + } } } -static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl, - xmlNodePtr ptr, RecWord *recWord) +/* DOM filter style indexing */ +static void process_xml_element_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + RecWord* recword, + xmlNodePtr node) { - const char *type_str = "update"; + /* remember indexing instruction from PI to next element node */ + xmlChar *index_p = 0; - if (ptr && ptr->type == XML_ELEMENT_NODE && ptr->ns && - !XML_STRCMP(ptr->ns->href, zebra_xslt_ns) - && !XML_STRCMP(ptr->name, "record")) + /* check if we are an element node in the special zebra namespace + and either set record data or index value-of node content*/ + process_xml_element_zebra_node(tinfo, extctr, recword, node); + + /* loop through kid nodes */ + for (node = node->children; node; node = node->next) { - const char *id_str = 0; - const char *rank_str = 0; - struct _xmlAttr *attr; - for (attr = ptr->properties; attr; attr = attr->next) - { - if (attr_content(attr, "type", &type_str)) - ; - else if (attr_content(attr, "id", &id_str)) - ; - else if (attr_content(attr, "rank", &rank_str)) - ; - else - yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s" - " for ", - tinfo->fname, attr->name); - } - if (id_str) - sscanf(id_str, "%255s", ctrl->match_criteria); - - if (rank_str) - ctrl->staticrank = atozint(rank_str); - ptr = ptr->children; + /* check and set PI record and index index instructions */ + if (node->type == XML_PI_NODE) + { + process_xml_pi_node(tinfo, extctr, node, &index_p); + } + else if (node->type == XML_ELEMENT_NODE) + { + /* if there was a PI index instruction before this element */ + if (index_p) + { + index_value_of(tinfo, extctr, recword, node, index_p); + index_p = 0; + } + process_xml_element_node(tinfo, extctr, recword,node); + } + else + continue; } +} - if (!strcmp("update", type_str)) - index_node(tinfo, ctrl, ptr, recWord); - else if (!strcmp("delete", type_str)) - yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); - else - yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", - type_str); + +/* DOM filter style indexing */ +static void extract_dom_doc_node(struct filter_info *tinfo, + struct recExtractCtrl *extctr, + xmlDocPtr doc) +{ + /* only need to do the initialization once, reuse recword for all terms */ + RecWord recword; + (*extctr->init)(extctr, &recword); + + tinfo->record_info_invoked = 0; + process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc); } - -static int extract_doc(struct filter_info *tinfo, struct filter_input *input, - struct recExtractCtrl *p, xmlDocPtr doc) + + + + +static int convert_extract_doc(struct filter_info *tinfo, + struct filter_input *input, + struct recExtractCtrl *p, + xmlDocPtr doc) + { - RecWord recWord; - const char *params[10]; xmlChar *buf_out; int len_out; + const char *params[10]; xsltStylesheetPtr last_xsp = 0; xmlDocPtr store_doc = 0; params[0] = 0; - set_param_str(params, "schema", zebra_xslt_ns, tinfo->odr_record); + set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record); /* input conversion */ - perform_convert(tinfo, input->convert, params, &doc, 0); - - (*p->init)(p, &recWord); + perform_convert(tinfo, p, input->convert, params, &doc, 0); if (tinfo->store) { /* store conversion */ store_doc = xmlCopyDoc(doc, 1); - perform_convert(tinfo, tinfo->store->convert, + perform_convert(tinfo, p, tinfo->store->convert, params, &store_doc, &last_xsp); } @@ -749,8 +1095,10 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input, store_doc ? store_doc : doc, last_xsp); else xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out); - if (p->flagShowRecords) - fwrite(buf_out, len_out, 1, stdout); + + /* if (p->flagShowRecords) + fwrite(buf_out, len_out, 1, stdout); */ + (*p->setStoreData)(p, buf_out, len_out); xmlFree(buf_out); @@ -758,25 +1106,17 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input, xmlFreeDoc(store_doc); /* extract conversion */ - perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0); + perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0); + + /* finally, do the indexing */ + if (doc) + extract_dom_doc_node(tinfo, p, doc); + if (doc) - { - xmlNodePtr root_ptr; - if (p->flagShowRecords) - { - xmlDocDumpMemory(doc, &buf_out, &len_out); - fwrite(buf_out, len_out, 1, stdout); - xmlFree(buf_out); - } - root_ptr = xmlDocGetRootElement(doc); - if (root_ptr) - index_record(tinfo, p, root_ptr, &recWord); - else - { - yaz_log(YLOG_WARN, "No root for index XML record"); - } xmlFreeDoc(doc); - } + + if (tinfo->record_info_invoked == 0) + return RECCTRL_EXTRACT_SKIP; return RECCTRL_EXTRACT_OK; } @@ -788,9 +1128,9 @@ static int extract_xml_split(struct filter_info *tinfo, if (p->first_record) { - if (input->u.xmlreader.reader) - xmlFreeTextReader(input->u.xmlreader.reader); - input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex, + if (input->u.xmlreader.reader) + xmlFreeTextReader(input->u.xmlreader.reader); + input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, @@ -803,12 +1143,13 @@ static int extract_xml_split(struct filter_info *tinfo, ret = xmlTextReaderRead(input->u.xmlreader.reader); while (ret == 1) { - int type = xmlTextReaderNodeType(input->u.xmlreader.reader); - int depth = xmlTextReaderDepth(input->u.xmlreader.reader); - if (type == XML_READER_TYPE_ELEMENT && + int type = xmlTextReaderNodeType(input->u.xmlreader.reader); + int depth = xmlTextReaderDepth(input->u.xmlreader.reader); + if (type == XML_READER_TYPE_ELEMENT && input->u.xmlreader.split_level == depth) - { - xmlNodePtr ptr = xmlTextReaderExpand(input->u.xmlreader.reader); + { + xmlNodePtr ptr + = xmlTextReaderExpand(input->u.xmlreader.reader); if (ptr) { xmlNodePtr ptr2 = xmlCopyNode(ptr, 1); @@ -816,7 +1157,19 @@ static int extract_xml_split(struct filter_info *tinfo, xmlDocSetRootElement(doc, ptr2); - return extract_doc(tinfo, input, p, doc); + /* writing debug info out */ + if (p->flagShowRecords) + { + xmlChar *buf_out = 0; + int len_out = 0; + xmlDocDumpMemory(doc, &buf_out, &len_out); + yaz_log(YLOG_LOG, "%s: XMLREADER depth: %i\n%.*s", + tinfo->fname ? tinfo->fname : "(none)", + depth, len_out, buf_out); + xmlFree(buf_out); + } + + return convert_extract_doc(tinfo, input, p, doc); } else { @@ -824,8 +1177,8 @@ static int extract_xml_split(struct filter_info *tinfo, input->u.xmlreader.reader = 0; return RECCTRL_EXTRACT_ERROR_GENERIC; } - } - ret = xmlTextReaderRead(input->u.xmlreader.reader); + } + ret = xmlTextReaderRead(input->u.xmlreader.reader); } xmlFreeTextReader(input->u.xmlreader.reader); input->u.xmlreader.reader = 0; @@ -838,7 +1191,8 @@ static int extract_xml_full(struct filter_info *tinfo, { if (p->first_record) /* only one record per stream */ { - xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */, + xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, + p /* I/O handler */, 0 /* URL */, 0 /* encoding */, XML_PARSE_XINCLUDE|XML_PARSE_NOENT); @@ -846,7 +1200,7 @@ static int extract_xml_full(struct filter_info *tinfo, { return RECCTRL_EXTRACT_ERROR_GENERIC; } - return extract_doc(tinfo, input, p, doc); + return convert_extract_doc(tinfo, input, p, doc); } else return RECCTRL_EXTRACT_EOF; @@ -866,7 +1220,8 @@ static int extract_iso2709(struct filter_info *tinfo, { int i; - yaz_log(YLOG_WARN, "MARC: Skipping bad byte %d (0x%02X)", + dom_log(YLOG_WARN, tinfo, 0, + "MARC: Skipping bad byte %d (0x%02X)", *buf & 0xff, *buf & 0xff); for (i = 0; i<4; i++) buf[i] = buf[i+1]; @@ -877,19 +1232,22 @@ static int extract_iso2709(struct filter_info *tinfo, record_length = atoi_n (buf, 5); if (record_length < 25) { - yaz_log (YLOG_WARN, "MARC record length < 25, is %d", record_length); + dom_log(YLOG_WARN, tinfo, 0, + "MARC record length < 25, is %d", record_length); return RECCTRL_EXTRACT_ERROR_GENERIC; } read_bytes = p->stream->readf(p->stream, buf+5, record_length-5); if (read_bytes < record_length-5) { - yaz_log (YLOG_WARN, "Couldn't read whole MARC record"); + dom_log(YLOG_WARN, tinfo, 0, + "couldn't read whole MARC record"); return RECCTRL_EXTRACT_ERROR_GENERIC; } r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length); if (r < record_length) { - yaz_log (YLOG_WARN, "Parsing of MARC record failed r=%d length=%d", + dom_log (YLOG_WARN, tinfo, 0, + "parsing of MARC record failed r=%d length=%d", r, record_length); return RECCTRL_EXTRACT_ERROR_GENERIC; } @@ -900,7 +1258,7 @@ static int extract_iso2709(struct filter_info *tinfo, yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0); rdoc = xmlNewDoc((const xmlChar*) "1.0"); xmlDocSetRootElement(rdoc, root_ptr); - return extract_doc(tinfo, input, p, rdoc); + return convert_extract_doc(tinfo, input, p, rdoc); } return RECCTRL_EXTRACT_OK; } @@ -941,7 +1299,7 @@ static int ioclose_ret(void *context) static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) { - /* const char *esn = zebra_xslt_ns; */ + /* const char *esn = zebra_dom_ns; */ const char *esn = 0; const char *params[32]; struct filter_info *tinfo = clientData; @@ -951,25 +1309,25 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) if (p->comp) { - if (p->comp->which == Z_RecordComp_simple - && p->comp->u.simple->which == Z_ElementSetNames_generic) - { - esn = p->comp->u.simple->u.generic; - } - else if (p->comp->which == Z_RecordComp_complex - && p->comp->u.complex->generic->elementSpec - && p->comp->u.complex->generic->elementSpec->which == - Z_ElementSpec_elementSetName) - { - esn = p->comp->u.complex->generic->elementSpec->u.elementSetName; - } + if (p->comp->which == Z_RecordComp_simple + && p->comp->u.simple->which == Z_ElementSetNames_generic) + { + esn = p->comp->u.simple->u.generic; + } + else if (p->comp->which == Z_RecordComp_complex + && p->comp->u.complex->generic->elementSpec + && p->comp->u.complex->generic->elementSpec->which == + Z_ElementSpec_elementSetName) + { + esn = p->comp->u.complex->generic->elementSpec->u.elementSetName; + } } retrieve = lookup_retrieve(tinfo, esn); if (!retrieve) { - p->diagnostic = - YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; - return 0; + p->diagnostic = + YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; + return 0; } params[0] = 0; @@ -999,52 +1357,52 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) XML_PARSE_XINCLUDE|XML_PARSE_NOENT); if (!doc) { - p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; - return 0; + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + return 0; } /* retrieve conversion */ - perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp); + perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp); if (!doc) { - p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; } else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML) { - xmlChar *buf_out; - int len_out; + xmlChar *buf_out; + int len_out; if (last_xsp) xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp); else - xmlDocDumpMemory(doc, &buf_out, &len_out); + xmlDocDumpMemory(doc, &buf_out, &len_out); - p->output_format = VAL_TEXT_XML; - p->rec_len = len_out; - p->rec_buf = odr_malloc(p->odr, p->rec_len); - memcpy(p->rec_buf, buf_out, p->rec_len); - xmlFree(buf_out); + p->output_format = VAL_TEXT_XML; + p->rec_len = len_out; + p->rec_buf = odr_malloc(p->odr, p->rec_len); + memcpy(p->rec_buf, buf_out, p->rec_len); + xmlFree(buf_out); } else if (p->output_format == VAL_SUTRS) { - xmlChar *buf_out; - int len_out; + xmlChar *buf_out; + int len_out; if (last_xsp) xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp); else - xmlDocDumpMemory(doc, &buf_out, &len_out); + xmlDocDumpMemory(doc, &buf_out, &len_out); - p->output_format = VAL_SUTRS; - p->rec_len = len_out; - p->rec_buf = odr_malloc(p->odr, p->rec_len); - memcpy(p->rec_buf, buf_out, p->rec_len); + p->output_format = VAL_SUTRS; + p->rec_len = len_out; + p->rec_buf = odr_malloc(p->odr, p->rec_len); + memcpy(p->rec_buf, buf_out, p->rec_len); - xmlFree(buf_out); + xmlFree(buf_out); } else { - p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP; + p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP; } xmlFreeDoc(doc); return 0;