Allow split path/level to be specified in XSLT conf.
[idzebra-moved-to-github.git] / recctrl / xslt.c
index 62c8dd0..9fe383a 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: xslt.c,v 1.1 2005-04-28 08:20:40 adam Exp $
+/* $Id: xslt.c,v 1.7 2005-06-01 07:32:46 adam Exp $
    Copyright (C) 1995-2005
    Index Data ApS
 
@@ -25,54 +25,207 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include <ctype.h>
 
 #include <yaz/diagbib1.h>
+#include <libxml/xmlversion.h>
+#include <libxml/parser.h>
+#include <libxml/tree.h>
 #include <libxml/xmlreader.h>
 #include <libxslt/transform.h>
 
 #include <idzebra/util.h>
 #include <idzebra/recctrl.h>
 
-struct filter_info {
+struct filter_schema {
+    const char *name;
+    const char *identifier;
+    const char *stylesheet;
+    struct filter_schema *next;
+    const char *default_schema;
     xsltStylesheetPtr stylesheet_xsp;
-    xmlTextReaderPtr reader;
+};
+
+struct filter_info {
+    xmlDocPtr doc;
     char *fname;
-    int split_depth;
+    const char *split_level;
+    const char *split_path;
+    ODR odr;
+    struct filter_schema *schemas;
+    xmlTextReaderPtr reader;
 };
 
-static const char *zebra_index_ns = "http://indexdata.dk/zebra/indexing/1";
+#define ZEBRA_INDEX_NS "http://indexdata.dk/zebra/indexing/1"
+#define ZEBRA_SCHEMA_IDENTITY_NS "http://indexdata.dk/zebra/identity/1"
+static const char *zebra_index_ns = ZEBRA_INDEX_NS;
+
+static void set_param_str(const char **params, const char *name,
+                         const char *value, ODR odr)
+{
+    char *quoted = odr_malloc(odr, 3 + strlen(value));
+    sprintf(quoted, "'%s'", value);
+    while (*params)
+       params++;
+    params[0] = name;
+    params[1] = quoted;
+    params[2] = 0;
+}
+
+static void set_param_int(const char **params, const char *name,
+                         zint value, ODR odr)
+{
+    char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
+    while (*params)
+       params++;
+    sprintf(quoted, "'" ZINT_FORMAT "'", value);
+    params[0] = name;
+    params[1] = quoted;
+    params[2] = 0;
+}
+
 
-static void *filter_init (Res res, RecType recType)
+static void *filter_init_xslt(Res res, RecType recType)
 {
     struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
-    tinfo->stylesheet_xsp = 0;
     tinfo->reader = 0;
     tinfo->fname = 0;
-    tinfo->split_depth = 1;
+    tinfo->split_level = 0;
+    tinfo->split_path = 0;
+    tinfo->odr = odr_createmem(ODR_ENCODE);
+    tinfo->doc = 0;
+    tinfo->schemas = 0;
     return tinfo;
 }
 
+static void *filter_init_xslt1(Res res, RecType recType)
+{
+    struct filter_info *tinfo = (struct filter_info *)
+       filter_init_xslt(res, recType);
+    tinfo->split_level = "1";
+    return tinfo;
+}
+
+static int attr_content(struct _xmlAttr *attr, const char *name,
+                       const char **dst_content)
+{
+    if (!strcmp(attr->name, name) && attr->children &&
+       attr->children->type == XML_TEXT_NODE)
+    {
+       *dst_content = attr->children->content;
+       return 1;
+    }
+    return 0;
+}
+
+static void destroy_schemas(struct filter_info *tinfo)
+{
+    struct filter_schema *schema = tinfo->schemas;
+    while (schema)
+    {
+       struct filter_schema *schema_next = schema->next;
+       if (schema->stylesheet_xsp)
+           xsltFreeStylesheet(schema->stylesheet_xsp);
+       xfree(schema);
+       schema = schema_next;
+    }
+    tinfo->schemas = 0;
+    xfree(tinfo->fname);
+    if (tinfo->doc)
+       xmlFreeDoc(tinfo->doc);    
+    tinfo->doc = 0;
+}
+
+static ZEBRA_RES create_schemas(struct filter_info *tinfo, const char *fname)
+{
+    xmlNodePtr ptr;
+    tinfo->fname = xstrdup(fname);
+    tinfo->doc = xmlParseFile(tinfo->fname);
+    if (!tinfo->doc)
+       return ZEBRA_FAIL;
+    ptr = xmlDocGetRootElement(tinfo->doc);
+    if (!ptr || ptr->type != XML_ELEMENT_NODE ||
+       strcmp(ptr->name, "schemaInfo"))
+       return ZEBRA_FAIL;
+    for (ptr = ptr->children; ptr; ptr = ptr->next)
+    {
+       if (ptr->type != XML_ELEMENT_NODE)
+           continue;
+       if (!strcmp(ptr->name, "schema"))
+       {
+           struct _xmlAttr *attr;
+           struct filter_schema *schema = xmalloc(sizeof(*schema));
+           schema->name = 0;
+           schema->identifier = 0;
+           schema->stylesheet = 0;
+           schema->default_schema = 0;
+           schema->next = tinfo->schemas;
+           schema->stylesheet_xsp = 0;
+           tinfo->schemas = schema;
+           for (attr = ptr->properties; attr; attr = attr->next)
+           {
+               attr_content(attr, "identifier", &schema->identifier);
+               attr_content(attr, "name", &schema->name);
+               attr_content(attr, "stylesheet", &schema->stylesheet);
+               attr_content(attr, "default", &schema->default_schema);
+           }
+           if (schema->stylesheet)
+               schema->stylesheet_xsp =
+                   xsltParseStylesheetFile(
+                       (const xmlChar*) schema->stylesheet);
+       }
+       else if (!strcmp(ptr->name, "split"))
+       {
+           struct _xmlAttr *attr;
+           for (attr = ptr->properties; attr; attr = attr->next)
+           {
+               attr_content(attr, "level", &tinfo->split_level);
+               attr_content(attr, "path", &tinfo->split_path);
+           }
+       }
+       else
+       {
+           yaz_log(YLOG_WARN, "Bad element %s in %s", ptr->name, fname);
+           return ZEBRA_FAIL;
+       }
+    }
+    return ZEBRA_OK;
+}
+
+static struct filter_schema *lookup_schema(struct filter_info *tinfo,
+                                          const char *est)
+{
+    struct filter_schema *schema;
+    for (schema = tinfo->schemas; schema; schema = schema->next)
+    {
+       if (est)
+       {
+           if (schema->identifier && !strcmp(schema->identifier, est))
+               return schema;
+           if (schema->name && !strcmp(schema->name, est))
+               return schema;
+       }
+       if (schema->default_schema)
+           return schema;
+    }
+    return 0;
+}
+
 static void filter_config(void *clientData, Res res, const char *args)
 {
     struct filter_info *tinfo = clientData;
     if (!args || !*args)
-       args = "default.xsl";
-    if (!tinfo->fname || strcmp(args, tinfo->fname))
-    {
-       /* different filename so must reread stylesheet */
-       xfree(tinfo->fname);
-       tinfo->fname = xstrdup(args);
-       if (tinfo->stylesheet_xsp)
-           xsltFreeStylesheet(tinfo->stylesheet_xsp);
-       tinfo->stylesheet_xsp =
-           xsltParseStylesheetFile((const xmlChar*) tinfo->fname);
-    }
+       args = "xsltfilter.xml";
+    if (tinfo->fname && !strcmp(args, tinfo->fname))
+       return;
+    destroy_schemas(tinfo);
+    create_schemas(tinfo, args);
 }
 
 static void filter_destroy(void *clientData)
 {
     struct filter_info *tinfo = clientData;
-    if (tinfo->stylesheet_xsp)
-       xsltFreeStylesheet(tinfo->stylesheet_xsp);
-    xfree(tinfo->fname);
+    destroy_schemas(tinfo);
+    if (tinfo->reader)
+       xmlFreeTextReader(tinfo->reader);
+    odr_destroy(tinfo->odr);
     xfree(tinfo);
 }
 
@@ -133,16 +286,50 @@ static void index_node(struct filter_info *tinfo,  struct recExtractCtrl *ctrl,
     }
 }
 
-static int filter_extract(void *clientData, struct recExtractCtrl *p)
+static int extract_doc(struct filter_info *tinfo, struct recExtractCtrl *p,
+                      xmlDocPtr doc)
 {
-    static const char *params[] = {
-       "schema", "'http://indexdata.dk/zebra/indexing/1'",
-       0
-    };
-    struct filter_info *tinfo = clientData;
     RecWord recWord;
-    int ret;
+    const char *params[10];
+    xmlChar *buf_out;
+    int len_out;
+
+    struct filter_schema *schema = lookup_schema(tinfo, ZEBRA_INDEX_NS);
+
+    params[0] = 0;
+    set_param_str(params, "schema", ZEBRA_INDEX_NS, tinfo->odr);
+
+    (*p->init)(p, &recWord);
+    recWord.reg_type = 'w';
+
+    if (schema && schema->stylesheet_xsp)
+    {
+       xmlDocPtr resDoc = 
+           xsltApplyStylesheet(schema->stylesheet_xsp,
+                               doc, params);
+       if (p->flagShowRecords)
+       {
+           xmlDocDumpMemory(resDoc, &buf_out, &len_out);
+           fwrite(buf_out, len_out, 1, stdout);
+           xmlFree(buf_out);
+       }
+       index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord);
+       xmlFreeDoc(resDoc);
+    }
+    xmlDocDumpMemory(doc, &buf_out, &len_out);
+    if (p->flagShowRecords)
+       fwrite(buf_out, len_out, 1, stdout);
+    (*p->setStoreData)(p, buf_out, len_out);
+    xmlFree(buf_out);
+    
+    xmlFreeDoc(doc);
+    return RECCTRL_EXTRACT_OK;
+}
 
+static int extract_split(struct filter_info *tinfo, struct recExtractCtrl *p)
+{
+    int ret;
+    int split_depth = 0;
     if (p->first_record)
     {
        if (tinfo->reader)
@@ -156,50 +343,23 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p)
     if (!tinfo->reader)
        return RECCTRL_EXTRACT_ERROR_GENERIC;
 
-    if (!tinfo->stylesheet_xsp)
-       return RECCTRL_EXTRACT_ERROR_GENERIC;
-
-    (*p->init)(p, &recWord);
-    recWord.reg_type = 'w';
-
+    if (tinfo->split_level)
+       split_depth = atoi(tinfo->split_level);
     ret = xmlTextReaderRead(tinfo->reader);
     while (ret == 1) {
        int type = xmlTextReaderNodeType(tinfo->reader);
        int depth = xmlTextReaderDepth(tinfo->reader);
-       if (tinfo->split_depth == 0 ||
-           (type == XML_READER_TYPE_ELEMENT && tinfo->split_depth == depth))
+       if (split_depth == 0 ||
+           (split_depth > 0 &&
+            type == XML_READER_TYPE_ELEMENT && split_depth == depth))
        {
-           xmlChar *buf_out;
-           int len_out;
-
            xmlNodePtr ptr = xmlTextReaderExpand(tinfo->reader);
            xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
            xmlDocPtr doc = xmlNewDoc("1.0");
 
            xmlDocSetRootElement(doc, ptr2);
-           
-           if (tinfo->stylesheet_xsp)
-           {
-               xmlDocPtr resDoc = 
-                   xsltApplyStylesheet(tinfo->stylesheet_xsp,
-                                       doc, params);
-               if (p->flagShowRecords)
-               {
-                   xmlDocDumpMemory(resDoc, &buf_out, &len_out);
-                   fwrite(buf_out, len_out, 1, stdout);
-                   xmlFree(buf_out);
-               }
-               index_node(tinfo, p, xmlDocGetRootElement(resDoc), &recWord);
-               xmlFreeDoc(resDoc);
-           }
-           xmlDocDumpMemory(doc, &buf_out, &len_out);
-           if (p->flagShowRecords)
-               fwrite(buf_out, len_out, 1, stdout);
-           (*p->setStoreData)(p, buf_out, len_out);
-           xmlFree(buf_out);
 
-           xmlFreeDoc(doc);
-           return RECCTRL_EXTRACT_OK;
+           return extract_doc(tinfo, p, doc);      
        }
        ret = xmlTextReaderRead(tinfo->reader);
     }
@@ -208,6 +368,38 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p)
     return RECCTRL_EXTRACT_EOF;
 }
 
+static int extract_full(struct filter_info *tinfo, struct recExtractCtrl *p)
+{
+    if (p->first_record) /* only one record per stream */
+    {
+       xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */,
+                                 0 /* URL */,
+                                 0 /* encoding */,
+                                 XML_PARSE_XINCLUDE);
+       if (!doc)
+       {
+           return RECCTRL_EXTRACT_ERROR_GENERIC;
+       }
+       return extract_doc(tinfo, p, doc);
+    }
+    else
+       return RECCTRL_EXTRACT_EOF;
+}
+
+static int filter_extract(void *clientData, struct recExtractCtrl *p)
+{
+    struct filter_info *tinfo = clientData;
+
+    odr_reset(tinfo->odr);
+
+    if (tinfo->split_level == 0 && tinfo->split_path == 0)
+       return extract_full(tinfo, p);
+    else
+    {
+       return extract_split(tinfo, p);
+    }
+}
+
 static int ioread_ret(void *context, char *buffer, int len)
 {
     struct recRetrieveCtrl *p = context;
@@ -219,20 +411,18 @@ static int ioclose_ret(void *context)
     return 0;
 }
 
+
 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
 {
-    static const char *params[] = {
-       "schema", "'F'",
-       0
-    };
+    const char *esn = ZEBRA_SCHEMA_IDENTITY_NS;
+    const char *params[10];
     struct filter_info *tinfo = clientData;
     xmlDocPtr resDoc;
     xmlDocPtr doc;
+    struct filter_schema *schema;
 
     if (p->comp)
     {
-       const char *esn;
-       char *esn_quoted;
        if (p->comp->which != Z_RecordComp_simple
            || p->comp->u.simple->which != Z_ElementSetNames_generic)
        {
@@ -240,15 +430,23 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
            return 0;
        }
        esn = p->comp->u.simple->u.generic;
-       esn_quoted = odr_malloc(p->odr, 3 + strlen(esn));
-       sprintf(esn_quoted, "'%s'", esn);
-       params[1] = esn_quoted;
     }
-    if (!tinfo->stylesheet_xsp)
+    schema = lookup_schema(tinfo, esn);
+    if (!schema)
     {
-       p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
+       p->diagnostic =
+           YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
        return 0;
     }
+
+    params[0] = 0;
+    set_param_str(params, "schema", esn, p->odr);
+    if (p->fname)
+       set_param_str(params, "filename", p->fname, p->odr);
+    if (p->score >= 0)
+       set_param_int(params, "score", p->score, p->odr);
+    set_param_int(params, "size", p->recordSize, p->odr);
+    
     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
                    0 /* URL */,
                    0 /* encoding */,
@@ -258,8 +456,15 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
        p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
        return 0;
     }
-    resDoc = xsltApplyStylesheet(tinfo->stylesheet_xsp,
-                                doc, params);
+
+    if (!schema->stylesheet_xsp)
+       resDoc = doc;
+    else
+    {
+       resDoc = xsltApplyStylesheet(schema->stylesheet_xsp,
+                                    doc, params);
+       xmlFreeDoc(doc);
+    }
     if (!resDoc)
     {
        p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
@@ -295,14 +500,23 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
        p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
     }
     xmlFreeDoc(resDoc);
-    xmlFreeDoc(doc);
     return 0;
 }
 
-static struct recType filter_type = {
+static struct recType filter_type_xslt = {
     0,
     "xslt",
-    filter_init,
+    filter_init_xslt,
+    filter_config,
+    filter_destroy,
+    filter_extract,
+    filter_retrieve
+};
+
+static struct recType filter_type_xslt1 = {
+    0,
+    "xslt1",
+    filter_init_xslt1,
     filter_config,
     filter_destroy,
     filter_extract,
@@ -317,6 +531,9 @@ idzebra_filter
 #endif
 
 [] = {
-    &filter_type,
+    &filter_type_xslt,
+#ifdef LIBXML_READER_ENABLED
+    &filter_type_xslt1,
+#endif
     0,
 };