X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=recctrl%2Fxslt.c;h=ab7a18e3ce2ab28909ba294bdb0573e16df71f88;hb=9eebf93dc2525854867cbc43920ea8ba4a199ab5;hp=62c8dd04793e920a3e4dae1d0e20dea219fb8ded;hpb=0f3b8bcc6fe2e3beeec7c834d9a64dca48a4f1b7;p=idzebra-moved-to-github.git diff --git a/recctrl/xslt.c b/recctrl/xslt.c index 62c8dd0..ab7a18e 100644 --- a/recctrl/xslt.c +++ b/recctrl/xslt.c @@ -1,4 +1,4 @@ -/* $Id: xslt.c,v 1.1 2005-04-28 08:20:40 adam Exp $ +/* $Id: xslt.c,v 1.11 2005-06-23 06:45:47 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -25,54 +25,255 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include +#include +#include +#include +#include #include #include #include #include -struct filter_info { +struct filter_schema { + const char *name; + const char *identifier; + const char *stylesheet; + struct filter_schema *next; + const char *default_schema; + const char *include_snippet; xsltStylesheetPtr stylesheet_xsp; - xmlTextReaderPtr reader; +}; + +struct filter_info { + xmlDocPtr doc; char *fname; - int split_depth; + const char *split_level; + const char *split_path; + ODR odr; + struct filter_schema *schemas; + xmlTextReaderPtr reader; }; -static const char *zebra_index_ns = "http://indexdata.dk/zebra/indexing/1"; +#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1" + +static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS; + +static void set_param_xml(const char **params, const char *name, + const char *value, ODR odr) +{ + while (*params) + params++; + params[0] = name; + params[1] = value; + params[2] = 0; +} -static void *filter_init (Res res, RecType recType) +static void set_param_str(const char **params, const char *name, + const char *value, ODR odr) +{ + char *quoted = odr_malloc(odr, 3 + strlen(value)); + sprintf(quoted, "'%s'", value); + while (*params) + params++; + params[0] = name; + params[1] = quoted; + params[2] = 0; +} + +static void set_param_int(const char **params, const char *name, + zint value, ODR odr) +{ + char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */ + while (*params) + params++; + sprintf(quoted, "'" ZINT_FORMAT "'", value); + params[0] = name; + params[1] = quoted; + params[2] = 0; +} + + +int zebra_xmlInputMatchCallback (char const *filename) +{ + yaz_log(YLOG_LOG, "match %s", filename); + return 0; +} + + +void * zebra_xmlInputOpenCallback (char const *filename) +{ + return 0; +} + +int zebra_xmlInputReadCallback (void * context, char * buffer, int len) +{ + return 0; +} + +int zebra_xmlInputCloseCallback (void * context) +{ + return 0; +} + + + + + +static void *filter_init_xslt(Res res, RecType recType) { struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo)); - tinfo->stylesheet_xsp = 0; tinfo->reader = 0; tinfo->fname = 0; - tinfo->split_depth = 1; + tinfo->split_level = 0; + tinfo->split_path = 0; + tinfo->odr = odr_createmem(ODR_ENCODE); + tinfo->doc = 0; + tinfo->schemas = 0; + +#if 0 + xmlRegisterDefaultInputCallbacks(); + xmlRegisterInputCallbacks(zebra_xmlInputMatchCallback, + zebra_xmlInputOpenCallback, + zebra_xmlInputReadCallback, + zebra_xmlInputCloseCallback); +#endif return tinfo; } +static void *filter_init_xslt1(Res res, RecType recType) +{ + struct filter_info *tinfo = (struct filter_info *) + filter_init_xslt(res, recType); + tinfo->split_level = "1"; + return tinfo; +} + +static int attr_content(struct _xmlAttr *attr, const char *name, + const char **dst_content) +{ + if (!strcmp(attr->name, name) && attr->children && + attr->children->type == XML_TEXT_NODE) + { + *dst_content = attr->children->content; + return 1; + } + return 0; +} + +static void destroy_schemas(struct filter_info *tinfo) +{ + struct filter_schema *schema = tinfo->schemas; + while (schema) + { + struct filter_schema *schema_next = schema->next; + if (schema->stylesheet_xsp) + xsltFreeStylesheet(schema->stylesheet_xsp); + xfree(schema); + schema = schema_next; + } + tinfo->schemas = 0; + xfree(tinfo->fname); + if (tinfo->doc) + xmlFreeDoc(tinfo->doc); + tinfo->doc = 0; +} + +static ZEBRA_RES create_schemas(struct filter_info *tinfo, const char *fname) +{ + xmlNodePtr ptr; + tinfo->fname = xstrdup(fname); + tinfo->doc = xmlParseFile(tinfo->fname); + if (!tinfo->doc) + return ZEBRA_FAIL; + ptr = xmlDocGetRootElement(tinfo->doc); + if (!ptr || ptr->type != XML_ELEMENT_NODE || + strcmp(ptr->name, "schemaInfo")) + return ZEBRA_FAIL; + for (ptr = ptr->children; ptr; ptr = ptr->next) + { + if (ptr->type != XML_ELEMENT_NODE) + continue; + if (!strcmp(ptr->name, "schema")) + { + struct _xmlAttr *attr; + struct filter_schema *schema = xmalloc(sizeof(*schema)); + schema->name = 0; + schema->identifier = 0; + schema->stylesheet = 0; + schema->default_schema = 0; + schema->next = tinfo->schemas; + schema->stylesheet_xsp = 0; + schema->include_snippet = 0; + tinfo->schemas = schema; + for (attr = ptr->properties; attr; attr = attr->next) + { + attr_content(attr, "identifier", &schema->identifier); + attr_content(attr, "name", &schema->name); + attr_content(attr, "stylesheet", &schema->stylesheet); + attr_content(attr, "default", &schema->default_schema); + attr_content(attr, "snippet", &schema->include_snippet); + } + if (schema->stylesheet) + schema->stylesheet_xsp = + xsltParseStylesheetFile( + (const xmlChar*) schema->stylesheet); + } + else if (!strcmp(ptr->name, "split")) + { + struct _xmlAttr *attr; + for (attr = ptr->properties; attr; attr = attr->next) + { + attr_content(attr, "level", &tinfo->split_level); + attr_content(attr, "path", &tinfo->split_path); + } + } + else + { + yaz_log(YLOG_WARN, "Bad element %s in %s", ptr->name, fname); + return ZEBRA_FAIL; + } + } + return ZEBRA_OK; +} + +static struct filter_schema *lookup_schema(struct filter_info *tinfo, + const char *est) +{ + struct filter_schema *schema; + for (schema = tinfo->schemas; schema; schema = schema->next) + { + if (est) + { + if (schema->identifier && !strcmp(schema->identifier, est)) + return schema; + if (schema->name && !strcmp(schema->name, est)) + return schema; + } + if (schema->default_schema) + return schema; + } + return 0; +} + static void filter_config(void *clientData, Res res, const char *args) { struct filter_info *tinfo = clientData; if (!args || !*args) - args = "default.xsl"; - if (!tinfo->fname || strcmp(args, tinfo->fname)) - { - /* different filename so must reread stylesheet */ - xfree(tinfo->fname); - tinfo->fname = xstrdup(args); - if (tinfo->stylesheet_xsp) - xsltFreeStylesheet(tinfo->stylesheet_xsp); - tinfo->stylesheet_xsp = - xsltParseStylesheetFile((const xmlChar*) tinfo->fname); - } + args = "xsltfilter.xml"; + if (tinfo->fname && !strcmp(args, tinfo->fname)) + return; + destroy_schemas(tinfo); + create_schemas(tinfo, args); } static void filter_destroy(void *clientData) { struct filter_info *tinfo = clientData; - if (tinfo->stylesheet_xsp) - xsltFreeStylesheet(tinfo->stylesheet_xsp); - xfree(tinfo->fname); + destroy_schemas(tinfo); + if (tinfo->reader) + xmlFreeTextReader(tinfo->reader); + odr_destroy(tinfo->odr); xfree(tinfo); } @@ -87,12 +288,12 @@ static int ioclose_ex(void *context) return 0; } -static void index_field(struct filter_info *tinfo, struct recExtractCtrl *ctrl, +static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl, xmlNodePtr ptr, RecWord *recWord) { for(; ptr; ptr = ptr->next) { - index_field(tinfo, ctrl, ptr->children, recWord); + index_cdata(tinfo, ctrl, ptr->children, recWord); if (ptr->type != XML_TEXT_NODE) continue; recWord->term_buf = ptr->content; @@ -108,41 +309,84 @@ static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl, { index_node(tinfo, ctrl, ptr->children, recWord); if (ptr->type != XML_ELEMENT_NODE || !ptr->ns || - strcmp(ptr->ns->href, zebra_index_ns)) + strcmp(ptr->ns->href, zebra_xslt_ns)) continue; if (!strcmp(ptr->name, "index")) { - char *field_str = 0; + char *name_str = 0; + const char *type_str = 0; const char *xpath_str = 0; struct _xmlAttr *attr; for (attr = ptr->properties; attr; attr = attr->next) { - if (!strcmp(attr->name, "field") + if (!strcmp(attr->name, "name") && attr->children && attr->children->type == XML_TEXT_NODE) - field_str = attr->children->content; + name_str = attr->children->content; if (!strcmp(attr->name, "xpath") && attr->children && attr->children->type == XML_TEXT_NODE) xpath_str = attr->children->content; + if (!strcmp(attr->name, "type") + && attr->children && attr->children->type == XML_TEXT_NODE) + type_str = attr->children->content; } - if (field_str) + if (name_str) { - recWord->attrStr = field_str; - index_field(tinfo, ctrl, ptr->children, recWord); + int prev_type = recWord->index_type; /* save default type */ + + if (type_str && *type_str) + recWord->index_type = *type_str; /* type was given */ + recWord->index_name = name_str; + index_cdata(tinfo, ctrl, ptr->children, recWord); + + recWord->index_type = prev_type; /* restore it again */ } } } } -static int filter_extract(void *clientData, struct recExtractCtrl *p) +static int extract_doc(struct filter_info *tinfo, struct recExtractCtrl *p, + xmlDocPtr doc) { - static const char *params[] = { - "schema", "'http://indexdata.dk/zebra/indexing/1'", - 0 - }; - struct filter_info *tinfo = clientData; RecWord recWord; - int ret; + const char *params[10]; + xmlChar *buf_out; + int len_out; + + struct filter_schema *schema = lookup_schema(tinfo, zebra_xslt_ns); + + params[0] = 0; + set_param_str(params, "schema", zebra_xslt_ns, tinfo->odr); + + (*p->init)(p, &recWord); + if (schema && schema->stylesheet_xsp) + { + xmlDocPtr resDoc = + xsltApplyStylesheet(schema->stylesheet_xsp, + doc, params); + if (p->flagShowRecords) + { + xmlDocDumpMemory(resDoc, &buf_out, &len_out); + fwrite(buf_out, len_out, 1, stdout); + xmlFree(buf_out); + } + index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord); + xmlFreeDoc(resDoc); + } + xmlDocDumpMemory(doc, &buf_out, &len_out); + if (p->flagShowRecords) + fwrite(buf_out, len_out, 1, stdout); + (*p->setStoreData)(p, buf_out, len_out); + xmlFree(buf_out); + + xmlFreeDoc(doc); + return RECCTRL_EXTRACT_OK; +} + +static int extract_split(struct filter_info *tinfo, struct recExtractCtrl *p) +{ + int ret; + int split_depth = 0; if (p->first_record) { if (tinfo->reader) @@ -156,50 +400,23 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p) if (!tinfo->reader) return RECCTRL_EXTRACT_ERROR_GENERIC; - if (!tinfo->stylesheet_xsp) - return RECCTRL_EXTRACT_ERROR_GENERIC; - - (*p->init)(p, &recWord); - recWord.reg_type = 'w'; - + if (tinfo->split_level) + split_depth = atoi(tinfo->split_level); ret = xmlTextReaderRead(tinfo->reader); while (ret == 1) { int type = xmlTextReaderNodeType(tinfo->reader); int depth = xmlTextReaderDepth(tinfo->reader); - if (tinfo->split_depth == 0 || - (type == XML_READER_TYPE_ELEMENT && tinfo->split_depth == depth)) + if (split_depth == 0 || + (split_depth > 0 && + type == XML_READER_TYPE_ELEMENT && split_depth == depth)) { - xmlChar *buf_out; - int len_out; - xmlNodePtr ptr = xmlTextReaderExpand(tinfo->reader); xmlNodePtr ptr2 = xmlCopyNode(ptr, 1); xmlDocPtr doc = xmlNewDoc("1.0"); xmlDocSetRootElement(doc, ptr2); - - if (tinfo->stylesheet_xsp) - { - xmlDocPtr resDoc = - xsltApplyStylesheet(tinfo->stylesheet_xsp, - doc, params); - if (p->flagShowRecords) - { - xmlDocDumpMemory(resDoc, &buf_out, &len_out); - fwrite(buf_out, len_out, 1, stdout); - xmlFree(buf_out); - } - index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord); - xmlFreeDoc(resDoc); - } - xmlDocDumpMemory(doc, &buf_out, &len_out); - if (p->flagShowRecords) - fwrite(buf_out, len_out, 1, stdout); - (*p->setStoreData)(p, buf_out, len_out); - xmlFree(buf_out); - xmlFreeDoc(doc); - return RECCTRL_EXTRACT_OK; + return extract_doc(tinfo, p, doc); } ret = xmlTextReaderRead(tinfo->reader); } @@ -208,6 +425,38 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p) return RECCTRL_EXTRACT_EOF; } +static int extract_full(struct filter_info *tinfo, struct recExtractCtrl *p) +{ + if (p->first_record) /* only one record per stream */ + { + xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */, + 0 /* URL */, + 0 /* encoding */, + XML_PARSE_XINCLUDE); + if (!doc) + { + return RECCTRL_EXTRACT_ERROR_GENERIC; + } + return extract_doc(tinfo, p, doc); + } + else + return RECCTRL_EXTRACT_EOF; +} + +static int filter_extract(void *clientData, struct recExtractCtrl *p) +{ + struct filter_info *tinfo = clientData; + + odr_reset(tinfo->odr); + + if (tinfo->split_level == 0 && tinfo->split_path == 0) + return extract_full(tinfo, p); + else + { + return extract_split(tinfo, p); + } +} + static int ioread_ret(void *context, char *buffer, int len) { struct recRetrieveCtrl *p = context; @@ -219,20 +468,66 @@ static int ioclose_ret(void *context) return 0; } + +static const char *snippet_doc(struct recRetrieveCtrl *p, int text_mode, + int window_size) +{ + const char *xml_doc_str; + int ord = 0; + WRBUF wrbuf = wrbuf_alloc(); + zebra_snippets *res = + zebra_snippets_window(p->doc_snippet, p->hit_snippet, window_size); + zebra_snippet_word *w = zebra_snippets_list(res); + + if (text_mode) + wrbuf_printf(wrbuf, "\'"); + else + wrbuf_printf(wrbuf, "\n", zebra_xslt_ns); + for (; w; w = w->next) + { + if (ord == 0) + ord = w->ord; + else if (ord != w->ord) + + break; + if (text_mode) + wrbuf_printf(wrbuf, "%s%s%s ", + w->match ? "*" : "", + w->term, + w->match ? "*" : ""); + else + { + wrbuf_printf(wrbuf, " ", + w->ord, w->seqno, + (w->match ? "match='1'" : "")); + wrbuf_xmlputs(wrbuf, w->term); + wrbuf_printf(wrbuf, "\n"); + } + } + if (text_mode) + wrbuf_printf(wrbuf, "\'"); + else + wrbuf_printf(wrbuf, "\n"); + + xml_doc_str = odr_strdup(p->odr, wrbuf_buf(wrbuf)); + + zebra_snippets_destroy(res); + wrbuf_free(wrbuf, 1); + return xml_doc_str; +} + static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) { - static const char *params[] = { - "schema", "'F'", - 0 - }; + const char *esn = zebra_xslt_ns; + const char *params[10]; struct filter_info *tinfo = clientData; xmlDocPtr resDoc; xmlDocPtr doc; + struct filter_schema *schema; + int window_size = -1; if (p->comp) { - const char *esn; - char *esn_quoted; if (p->comp->which != Z_RecordComp_simple || p->comp->u.simple->which != Z_ElementSetNames_generic) { @@ -240,15 +535,29 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) return 0; } esn = p->comp->u.simple->u.generic; - esn_quoted = odr_malloc(p->odr, 3 + strlen(esn)); - sprintf(esn_quoted, "'%s'", esn); - params[1] = esn_quoted; } - if (!tinfo->stylesheet_xsp) + schema = lookup_schema(tinfo, esn); + if (!schema) { - p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + p->diagnostic = + YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; return 0; } + + if (schema->include_snippet) + window_size = atoi(schema->include_snippet); + + params[0] = 0; + set_param_str(params, "schema", esn, p->odr); + if (p->fname) + set_param_str(params, "filename", p->fname, p->odr); + if (p->score >= 0) + set_param_int(params, "score", p->score, p->odr); + set_param_int(params, "size", p->recordSize, p->odr); + + if (window_size >= 0) + set_param_xml(params, "snippet", snippet_doc(p, 1, window_size), + p->odr); doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, @@ -258,8 +567,22 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; return 0; } - resDoc = xsltApplyStylesheet(tinfo->stylesheet_xsp, - doc, params); + + if (window_size >= 0) + { + xmlNodePtr node = xmlDocGetRootElement(doc); + const char *snippet_str = snippet_doc(p, 0, window_size); + xmlDocPtr snippet_doc = xmlParseMemory(snippet_str, strlen(snippet_str)); + xmlAddChild(node, xmlDocGetRootElement(snippet_doc)); + } + if (!schema->stylesheet_xsp) + resDoc = doc; + else + { + resDoc = xsltApplyStylesheet(schema->stylesheet_xsp, + doc, params); + xmlFreeDoc(doc); + } if (!resDoc) { p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; @@ -295,14 +618,23 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP; } xmlFreeDoc(resDoc); - xmlFreeDoc(doc); return 0; } -static struct recType filter_type = { +static struct recType filter_type_xslt = { 0, "xslt", - filter_init, + filter_init_xslt, + filter_config, + filter_destroy, + filter_extract, + filter_retrieve +}; + +static struct recType filter_type_xslt1 = { + 0, + "xslt1", + filter_init_xslt1, filter_config, filter_destroy, filter_extract, @@ -317,6 +649,9 @@ idzebra_filter #endif [] = { - &filter_type, + &filter_type_xslt, +#ifdef LIBXML_READER_ENABLED + &filter_type_xslt1, +#endif 0, };