X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fmod_dom.c;h=d36c7e80a4576bc28fdaa088bc0c9a48f5cbd9ff;hb=85c5e09eacc5c89eda6e1ffa6f039fa1e8dd7221;hp=d42d80b0ecaad8fd5e1943a3cfbc838bb69e0172;hpb=97dc097858772a66c8e90e8b07f77c9c20450131;p=idzebra-moved-to-github.git diff --git a/index/mod_dom.c b/index/mod_dom.c index d42d80b..d36c7e8 100644 --- a/index/mod_dom.c +++ b/index/mod_dom.c @@ -1,4 +1,4 @@ -/* $Id: mod_dom.c,v 1.1 2007-02-07 12:08:54 adam Exp $ +/* $Id: mod_dom.c,v 1.7 2007-02-14 15:42:24 marc Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -101,6 +101,9 @@ struct filter_info { #define XML_STRCMP(a,b) strcmp((char*)a, b) #define XML_STRLEN(a) strlen((char*)a) + + + static void set_param_str(const char **params, const char *name, const char *value, ODR odr) { @@ -614,6 +617,12 @@ static int ioclose_ex(void *context) return 0; } + +/* Alvis style indexing */ +#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1" +static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS; + +/* Alvis style indexing */ static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl, xmlNodePtr ptr, RecWord *recWord) { @@ -628,11 +637,7 @@ static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl, } } -#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1" - - -static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS; - +/* Alvis style indexing */ static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl, xmlNodePtr ptr, RecWord *recWord) { @@ -676,6 +681,7 @@ static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl, } } +/* Alvis style indexing */ static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl, xmlNodePtr ptr, RecWord *recWord) { @@ -717,14 +723,330 @@ static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl, yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", type_str); } + + +/* Alvis style indexing */ +static void extract_doc_alvis(struct filter_info *tinfo, + struct recExtractCtrl *recctr, + xmlDocPtr doc) +{ + if (doc){ + RecWord recWord; + xmlChar *buf_out; + int len_out; + xmlNodePtr root_ptr; + + (*recctr->init)(recctr, &recWord); + + if (recctr->flagShowRecords){ + xmlDocDumpMemory(doc, &buf_out, &len_out); + fwrite(buf_out, len_out, 1, stdout); + xmlFree(buf_out); + } + root_ptr = xmlDocGetRootElement(doc); + if (root_ptr) + index_record(tinfo, recctr, root_ptr, &recWord); + else + yaz_log(YLOG_WARN, "No root for index XML record"); + } +} + + +/* DOM filter style indexing */ +static int attr_content_xml(struct _xmlAttr *attr, const char *name, + xmlChar **dst_content) +{ + if (0 == XML_STRCMP(attr->name, name) && attr->children + && attr->children->type == XML_TEXT_NODE) + { + *dst_content = (attr->children->content); + return 1; + } + return 0; +} + +/* DOM filter style indexing */ +/* #define ZEBRA_XSLT_NS "http://indexdata.com/zebra-2.0" */ +/* static const char *zebra_xslt_ns = ZEBRA_XSLT_NS; */ + +/* DOM filter style indexing */ +#define ZEBRA_PI_NAME "zebra-2.0" +static const char *zebra_pi_name = ZEBRA_PI_NAME; + + +/* DOM filter style indexing */ +void index_value_of(xmlNodePtr node, xmlChar * index_p){ + xmlChar *text = xmlNodeGetContent(node); + + xmlChar *look = index_p; + xmlChar *bval; + xmlChar *eval; + + xmlChar index[256]; + xmlChar type[256]; + + /* parsing all index name/type pairs - may not start with ' ' or ':' */ + while (*look && ' ' != *look && ':' != *look){ + + /* setting name and type to zero */ + *index = '\0'; + *type = '\0'; + + /* parsing one index name */ + bval = look; + while (*look && ':' != *look && ' ' != *look){ + look++; + } + eval = look; + strncpy((char *)index, (const char *)bval, eval - bval); + index[eval - bval] = '\0'; + + + /* parsing one index type, if existing */ + if (':' == *look){ + look++; + + bval = look; + while (*look && ' ' != *look){ + look++; + } + eval = look; + strncpy((char *)type, (const char *)bval, eval - bval); + type[eval - bval] = '\0'; + } + + printf("INDEX '%s:%s' '%s'\n", index, type, text); + + if (*look && ' ' == *look && *(look+1)){ + look++; + } + } + + xmlFree(text); + +/* //recWord->term_buf = (const char *)ptr->content; */ +/* //recWord->term_len = XML_STRLEN(ptr->content); */ +/* // if (type_str && *type_str) */ +/* // recWord->index_type = *type_str; /\* type was given *\/ */ +/* // recWord->index_name = name_str; */ +/* // recWord->index_type = prev_type; /\* restore it again *\/ */ +} + + +/* DOM filter style indexing */ +void set_record_info(xmlChar * id_p, xmlChar * rank_p, xmlChar * action_p){ + printf("RECORD id=%s rank=%s action=%s\n", id_p, rank_p, action_p); +} + + +/* DOM filter style indexing */ +void process_xml_element_zebra_node(xmlNodePtr node, xmlChar **record_p) +{ + if (node->type == XML_ELEMENT_NODE + && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_xslt_ns)){ -static int extract_doc(struct filter_info *tinfo, struct filter_input *input, - struct recExtractCtrl *p, xmlDocPtr doc) + if (0 == XML_STRCMP(node->name, "index")){ + xmlChar *index_p = 0; + + struct _xmlAttr *attr; + for (attr = node->properties; attr; attr = attr->next){ + if (attr_content_xml(attr, "name", &index_p)){ + index_value_of(node, index_p); + } + else + // printf("%s: dom filter: s% bad attribute %s", + // tinfo->fname, xmlGetNodePath(node)), nodeattr->name); + printf("dom filter: %s bad attribute @%s, expected @name\n", + xmlGetNodePath(node), attr->name); + } + } + else if (0 == XML_STRCMP(node->name, "record")){ + xmlChar *id_p = 0; + xmlChar *rank_p = 0; + xmlChar *action_p = 0; + + struct _xmlAttr *attr; + for (attr = node->properties; attr; attr = attr->next){ + if (attr_content_xml(attr, "id", &id_p)) + ; + else if (attr_content_xml(attr, "rank", &rank_p)) + ; + else if (attr_content_xml(attr, "acton", &action_p)) + ; + else + // printf("%s: dom filter: s% bad attribute %s", + // tinfo->fname, xmlGetNodePath(node)), nodeattr->name); + printf("dom filter: %s bad attribute @%s," + " expected @id|@rank|@action\n", + xmlGetNodePath(node), attr->name); + + if (action_p && 0 != strcmp("update", (const char *)action_p)) + printf("dom filter: %s attribute @%s," + " only implemented '@action=\"update\"\n", + xmlGetNodePath(node), attr->name); + + + } + set_record_info(id_p, rank_p, action_p); + } else { + // printf("%s: dom filter: s% bad attribute %s", + // tinfo->fname, xmlGetNodePath(node)), nodeattr->name); + printf("dom filter: %s bad element <%s>," + " expected | in namespace '%s'\n", + xmlGetNodePath(node), node->name, zebra_xslt_ns); + + } + } +} + + +/* DOM filter style indexing */ +void process_xml_pi_node(xmlNodePtr node, xmlChar **record_pp, + xmlChar **index_pp) { - RecWord recWord; - const char *params[10]; + printf("PI %s\n", xmlGetNodePath(node)); + + /* if right PI name, continue parsing PI */ + if (0 == strcmp(zebra_pi_name, (const char *)node->name)){ + xmlChar *pi_p = node->content; + xmlChar *look = pi_p; + + xmlChar *bval; + xmlChar *eval; + + /* parsing PI record instructions */ + if (0 == strncmp((const char *)look, "record", 6)){ + xmlChar id[256]; + xmlChar rank[256]; + xmlChar action[256]; + + *id = '\0'; + *rank = '\0'; + *action = '\0'; + + look += 6; + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + /* parse possible id */ + if (*look && 0 == strncmp((const char *)look, "id=", 3)){ + look += 3; + bval = look; + while (*look && ' ' != *look) + look++; + eval = look; + strncpy((char *)id, (const char *)bval, eval - bval); + id[eval - bval] = '\0'; + } + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + /* parse possible rank */ + if (*look && 0 == strncmp((const char *)look, "rank=", 5)){ + look += 6; + bval = look; + while (*look && ' ' != *look) + look++; + eval = look; + strncpy((char *)rank, (const char *)bval, eval - bval); + rank[eval - bval] = '\0'; + } + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + if (look && '\0' != *look){ + printf ("ERROR %s: content '%s'; can not parse '%s'\n", + xmlGetNodePath(node), pi_p, look); + } else { + /* set_record_info(id, rank, action); */ + set_record_info(id, rank, 0); + } + + } + + /* parsing index instruction */ + else if (0 == strncmp((const char *)look, "index", 5)){ + look += 5; + + /* eat whitespace */ + while (*look && ' ' == *look && *(look+1)) + look++; + + /* export index instructions to outside */ + *index_pp = look; + + /* nor record, neither index */ + } else { + + printf ("ERROR %s: content '%s'; can not parse '%s'\n", + xmlGetNodePath(node), pi_p, look); + } + } +} + +/* DOM filter style indexing */ +void process_xml_element_node(xmlNodePtr node, xmlChar **record_pp) +{ + /* remember indexing instruction from PI to next element node */ + xmlChar *index_p = 0; + + printf("ELEM %s\n", xmlGetNodePath(node)); + + /* check if we are an element node in the special zebra namespace + and either set record data or index value-of node content*/ + process_xml_element_zebra_node(node, record_pp); + + /* loop through kid nodes */ + for (node = node->children; node; node = node->next) + { + /* check and set PI record and index index instructions */ + if (node->type == XML_PI_NODE){ + process_xml_pi_node(node, record_pp, &index_p); + } + else if (node->type == XML_ELEMENT_NODE){ + /* if there was a PI index instruction before this element node */ + if (index_p){ + index_value_of(node, index_p); + index_p = 0; + } + process_xml_element_node(node, record_pp); + } + else + continue; + } +} + + + +/* DOM filter style indexing */ +void process_xml_doc_node(xmlDocPtr doc) +{ + xmlChar *record_pp; + + printf("DOC %s\n", xmlGetNodePath((xmlNodePtr)doc)); + + process_xml_element_node((xmlNodePtr)doc, &record_pp); +} + + + + +static int convert_extract_doc(struct filter_info *tinfo, + struct filter_input *input, + struct recExtractCtrl *p, + xmlDocPtr doc) + +{ + /* RecWord recWord; */ xmlChar *buf_out; int len_out; + const char *params[10]; xsltStylesheetPtr last_xsp = 0; xmlDocPtr store_doc = 0; @@ -734,8 +1056,6 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input, /* input conversion */ perform_convert(tinfo, input->convert, params, &doc, 0); - (*p->init)(p, &recWord); - if (tinfo->store) { /* store conversion */ @@ -759,24 +1079,12 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input, /* extract conversion */ perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0); - if (doc) - { - xmlNodePtr root_ptr; - if (p->flagShowRecords) - { - xmlDocDumpMemory(doc, &buf_out, &len_out); - fwrite(buf_out, len_out, 1, stdout); - xmlFree(buf_out); - } - root_ptr = xmlDocGetRootElement(doc); - if (root_ptr) - index_record(tinfo, p, root_ptr, &recWord); - else - { - yaz_log(YLOG_WARN, "No root for index XML record"); - } + + if (doc){ + extract_doc_alvis(tinfo, p, doc); xmlFreeDoc(doc); - } + } + return RECCTRL_EXTRACT_OK; } @@ -794,7 +1102,8 @@ static int extract_xml_split(struct filter_info *tinfo, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE); + XML_PARSE_XINCLUDE| + XML_PARSE_NOENT); } if (!input->u.xmlreader.reader) return RECCTRL_EXTRACT_ERROR_GENERIC; @@ -815,7 +1124,7 @@ static int extract_xml_split(struct filter_info *tinfo, xmlDocSetRootElement(doc, ptr2); - return extract_doc(tinfo, input, p, doc); + return convert_extract_doc(tinfo, input, p, doc); } else { @@ -840,12 +1149,12 @@ static int extract_xml_full(struct filter_info *tinfo, xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE); + XML_PARSE_XINCLUDE|XML_PARSE_NOENT); if (!doc) { return RECCTRL_EXTRACT_ERROR_GENERIC; } - return extract_doc(tinfo, input, p, doc); + return convert_extract_doc(tinfo, input, p, doc); } else return RECCTRL_EXTRACT_EOF; @@ -899,7 +1208,7 @@ static int extract_iso2709(struct filter_info *tinfo, yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0); rdoc = xmlNewDoc((const xmlChar*) "1.0"); xmlDocSetRootElement(rdoc, root_ptr); - return extract_doc(tinfo, input, p, rdoc); + return convert_extract_doc(tinfo, input, p, rdoc); } return RECCTRL_EXTRACT_OK; } @@ -995,7 +1304,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, - XML_PARSE_XINCLUDE); + XML_PARSE_XINCLUDE|XML_PARSE_NOENT); if (!doc) { p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;