New yaz_xml_get_prop utility YAZ-839
[yaz-moved-to-github.git] / src / record_conv.c
index a272aee..b117340 100644 (file)
@@ -1,5 +1,5 @@
 /* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2012 Index Data
+ * Copyright (C) Index Data
  * See the file LICENSE for details.
  */
 /**
 #include <yaz/nmem.h>
 #include <yaz/tpath.h>
 #include <yaz/z-opac.h>
+#include <yaz/xml_get.h>
 
 #if YAZ_HAVE_XML2
 #include <libxml/parser.h>
 #include <libxml/tree.h>
 #include <libxml/xinclude.h>
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
 #if YAZ_HAVE_XSLT
 #include <libxslt/xsltutils.h>
 #include <libxslt/transform.h>
@@ -57,6 +60,7 @@ struct marc_info {
     const char *output_charset;
     int input_format_mode;
     int output_format_mode;
+    const char *leader_spec;
 };
 
 /** \brief transformation info (rule info) */
@@ -97,11 +101,21 @@ void yaz_record_conv_destroy(yaz_record_conv_t p)
 }
 
 #if YAZ_HAVE_XSLT
+struct xslt_info { /* per-rule state built by construct_xslt for an <xslt> rule */
+    NMEM nmem; /* pool from which this struct and the parameter strings are allocated */
+    xmlDocPtr xsp_doc; /* parsed stylesheet; convert_xslt copies it for every record */
+    const char **xsl_parms; /* name / quoted-value pairs ending in a null entry, handed to xsltApplyStylesheet */
+};
+
 static void *construct_xslt(const xmlNode *ptr,
                             const char *path, WRBUF wr_error)
 {
     struct _xmlAttr *attr;
     const char *stylesheet = 0;
+    struct xslt_info *info = 0;
+    NMEM nmem = 0;
+    int max_parms = 10;
+    int no_parms = 0;
 
     if (strcmp((const char *) ptr->name, "xslt"))
         return 0;
@@ -118,17 +132,77 @@ static void *construct_xslt(const xmlNode *ptr,
             return 0;
         }
     }
+    nmem = nmem_create();
+    info = nmem_malloc(nmem, sizeof(*info));
+    info->nmem = nmem;
+    info->xsl_parms = nmem_malloc(
+        nmem, (2 * max_parms + 1) * sizeof(*info->xsl_parms));
+
+    for (ptr = ptr->children; ptr; ptr = ptr->next)
+    {
+        const char *name = 0;
+        const char *value = 0;
+        char *qvalue = 0;
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        if (strcmp((const char *) ptr->name, "param"))
+        {
+            wrbuf_printf(wr_error, "Bad element '%s'"
+                         "Expected param.", ptr->name);
+            nmem_destroy(nmem);
+            return 0;
+        }
+        for (attr = ptr->properties; attr; attr = attr->next)
+        {
+            if (!xmlStrcmp(attr->name, BAD_CAST "name") &&
+                attr->children && attr->children->type == XML_TEXT_NODE)
+                name = (const char *) attr->children->content;
+            else if (!xmlStrcmp(attr->name, BAD_CAST "value") &&
+                attr->children && attr->children->type == XML_TEXT_NODE)
+                value = (const char *) attr->children->content;
+            else
+            {
+                wrbuf_printf(wr_error, "Bad attribute '%s'"
+                             "Expected name or value.", attr->name);
+                nmem_destroy(nmem);
+                return 0;
+            }
+        }
+        if (!name || !value)
+        {
+            wrbuf_printf(wr_error, "Missing attributes name or value");
+            nmem_destroy(nmem);
+            return 0;
+        }
+        if (no_parms >= max_parms)
+        {
+            wrbuf_printf(wr_error, "Too many parameters given");
+            nmem_destroy(nmem);
+            return 0;
+        }
+
+        qvalue = nmem_malloc(nmem, strlen(value) + 3);
+        strcpy(qvalue, "\'");
+        strcat(qvalue, value);
+        strcat(qvalue, "\'");
+
+        info->xsl_parms[2 * no_parms] = nmem_strdup(nmem, name);
+        info->xsl_parms[2 * no_parms + 1] = qvalue;
+        no_parms++;
+    }
+
+    info->xsl_parms[2 * no_parms] = '\0';
+
     if (!stylesheet)
     {
         wrbuf_printf(wr_error, "Element <xslt>: "
                      "attribute 'stylesheet' expected");
-        return 0;
+        nmem_destroy(nmem);
     }
     else
     {
         char fullpath[1024];
         xsltStylesheetPtr xsp;
-        xmlDocPtr xsp_doc;
         if (!yaz_filepath_resolve(stylesheet, path, 0, fullpath))
         {
             wrbuf_printf(wr_error, "Element <xslt stylesheet=\"%s\"/>:"
@@ -136,21 +210,23 @@ static void *construct_xslt(const xmlNode *ptr,
                          stylesheet, stylesheet);
             if (path)
                 wrbuf_printf(wr_error, " with path '%s'", path);
-                
+
+            nmem_destroy(nmem);
             return 0;
         }
-        xsp_doc = xmlParseFile(fullpath);
-        if (!xsp_doc)
+        info->xsp_doc = xmlParseFile(fullpath);
+        if (!info->xsp_doc)
         {
             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
                          " xml parse failed: %s", stylesheet, fullpath);
             if (path)
                 wrbuf_printf(wr_error, " with path '%s'", path);
+            nmem_destroy(nmem);
             return 0;
         }
         /* need to copy this before passing it to the processor. It will
            be encapsulated in the xsp and destroyed by xsltFreeStylesheet */
-        xsp = xsltParseStylesheetDoc(xmlCopyDoc(xsp_doc, 1));
+        xsp = xsltParseStylesheetDoc(xmlCopyDoc(info->xsp_doc, 1));
         if (!xsp)
         {
             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
@@ -159,27 +235,29 @@ static void *construct_xslt(const xmlNode *ptr,
                 wrbuf_printf(wr_error, " with path '%s'", path);
             wrbuf_printf(wr_error, " ("
 #if YAZ_HAVE_EXSLT
-                         
+
                          "EXSLT enabled"
 #else
                          "EXSLT not supported"
 #endif
                          ")");
-            xmlFreeDoc(xsp_doc);
-            return 0;
+            xmlFreeDoc(info->xsp_doc);
+            nmem_destroy(info->nmem);
         }
         else
         {
             xsltFreeStylesheet(xsp);
-            return xsp_doc;
+            return info;
         }
     }
     return 0;
 }
 
-static int convert_xslt(void *info, WRBUF record, WRBUF wr_error)
+static int convert_xslt(void *vinfo, WRBUF record, WRBUF wr_error)
 {
     int ret = 0;
+    struct xslt_info *info = vinfo;
+
     xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
                                    wrbuf_len(record));
     if (!doc)
@@ -189,14 +267,14 @@ static int convert_xslt(void *info, WRBUF record, WRBUF wr_error)
     }
     else
     {
-        xmlDocPtr xsp_doc = xmlCopyDoc((xmlDocPtr) info, 1);
+        xmlDocPtr xsp_doc = xmlCopyDoc(info->xsp_doc, 1);
         xsltStylesheetPtr xsp = xsltParseStylesheetDoc(xsp_doc);
-        xmlDocPtr res = xsltApplyStylesheet(xsp, doc, 0);
+        xmlDocPtr res = xsltApplyStylesheet(xsp, doc, info->xsl_parms);
         if (res)
         {
             xmlChar *out_buf = 0;
             int out_len;
-            
+
 #if HAVE_XSLTSAVERESULTTOSTRING
             xsltSaveResultToString(&out_buf, &out_len, res, xsp);
 #else
@@ -212,7 +290,7 @@ static int convert_xslt(void *info, WRBUF record, WRBUF wr_error)
             {
                 wrbuf_rewind(record);
                 wrbuf_write(record, (const char *) out_buf, out_len);
-                
+
                 xmlFree(out_buf);
             }
             xmlFreeDoc(res);
@@ -228,18 +306,141 @@ static int convert_xslt(void *info, WRBUF record, WRBUF wr_error)
     return ret;
 }
 
-static void destroy_xslt(void *info)
+static void destroy_xslt(void *vinfo)
 {
+    struct xslt_info *info = vinfo;
+
     if (info)
     {
-        xmlDocPtr xsp_doc = info;
-        xmlFreeDoc(xsp_doc);
+        xmlFreeDoc(info->xsp_doc);
+        nmem_destroy(info->nmem);
     }
 }
 
 /* YAZ_HAVE_XSLT */
 #endif
 
+struct select_info { /* per-rule state built by construct_select for a <select> rule */
+    NMEM nmem; /* pool from which this struct and xpath_expr are allocated */
+    char *xpath_expr; /* expression evaluated by convert_select; 0 if construct_select found none */
+};
+
+/**
+ * \brief builds the rule info for a <select> configuration element
+ * \param ptr configuration element to inspect
+ * \param path not used by this rule type
+ * \param wr_error receives a message when an attribute is rejected
+ * \returns a select_info on success; 0 when ptr is not named "select"
+ *          or when yaz_xml_get_prop hands back an attribute name
+ */
+static void *construct_select(const xmlNode *ptr,
+                              const char *path, WRBUF wr_error)
+{
+    NMEM pool;
+    struct select_info *si;
+    const char *bad_attr;
+    const char *expr = 0;
+
+    /* not our element: let the next rule type have a go */
+    if (strcmp((const char *) ptr->name, "select") != 0)
+        return 0;
+
+    pool = nmem_create();
+    si = nmem_malloc(pool, sizeof(*si));
+    si->nmem = pool;
+
+    /* a non-null result is treated as the name of an offending attribute */
+    bad_attr = yaz_xml_get_prop(ptr, "path%s", &expr);
+    if (bad_attr)
+    {
+        /* NOTE(review): the two literals below are concatenated without a
+           separator and mention "xpath" although the attribute asked for
+           is "path" - confirm before rewording, callers may match it */
+        wrbuf_printf(wr_error, "Bad attribute '%s'"
+                     "Expected xpath.", bad_attr);
+        nmem_destroy(pool);
+        return 0;
+    }
+
+    si->xpath_expr = expr ? nmem_strdup(pool, expr) : 0;
+    return si;
+}
+
+/**
+ * \brief replaces a record by the text selected with the rule's XPath
+ * \param vinfo select_info created by construct_select
+ * \param record XML record on input; rewritten with the selected text when
+ *        the expression yields at least one node, otherwise left as is
+ * \param wr_error receives a message when the record cannot be parsed
+ * \retval 0 record parsed (whether or not anything was selected)
+ * \retval -1 xmlParseMemory failed
+ *
+ * For a selected element node the content of its direct text children is
+ * appended; for any other selected node, the node itself and its following
+ * siblings are scanned for text nodes.
+ */
+static int convert_select(void *vinfo, WRBUF record, WRBUF wr_error)
+{
+    struct select_info *info = vinfo;
+    xmlXPathContextPtr xpathCtx;
+    xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
+                                   wrbuf_len(record));
+    if (!doc)
+    {
+        wrbuf_printf(wr_error, "xmlParseMemory failed");
+        return -1;
+    }
+
+    xpathCtx = xmlXPathNewContext(doc);
+    if (xpathCtx)
+    {
+        if (info->xpath_expr)
+        {
+            xmlXPathObjectPtr xpathObj =
+                xmlXPathEvalExpression((const xmlChar *) info->xpath_expr,
+                                       xpathCtx);
+            if (xpathObj)
+            {
+                xmlNodeSetPtr nodes = xpathObj->nodesetval;
+                if (nodes)
+                {
+                    int i;
+                    /* only discard the input once there is a replacement */
+                    if (nodes->nodeNr > 0)
+                        wrbuf_rewind(record);
+                    for (i = 0; i < nodes->nodeNr; i++)
+                    {
+                        xmlNode *ptr = nodes->nodeTab[i];
+                        if (ptr->type == XML_ELEMENT_NODE)
+                            ptr = ptr->children;
+                        for (; ptr; ptr = ptr->next)
+                            if (ptr->type == XML_TEXT_NODE)
+                                wrbuf_puts(record, (const char *) ptr->content);
+                    }
+                }
+                xmlXPathFreeObject(xpathObj);
+            }
+        }
+        /* released whenever it was created, also for rules that carry no
+           expression; freeing it only alongside the evaluation leaked one
+           context per converted record */
+        xmlXPathFreeContext(xpathCtx);
+    }
+    xmlFreeDoc(doc);
+    return 0;
+}
+
+/** \brief releases the memory pool of a <select> rule; a null info is ignored */
+static void destroy_select(void *vinfo)
+{
+    if (!vinfo)
+        return;
+    /* the pool also holds the select_info itself, so this frees everything */
+    nmem_destroy(((struct select_info *) vinfo)->nmem);
+}
+
+
+/**
+ * \brief recognizes a <solrmarc> configuration element
+ * \param ptr configuration element to inspect
+ * \param path not used by this rule type
+ * \param wr_error doubles as the non-null marker returned on a match
+ * \returns a non-null pointer when ptr is named "solrmarc", 0 otherwise
+ */
+static void *construct_solrmarc(const xmlNode *ptr,
+                                const char *path, WRBUF wr_error)
+{
+    int is_solrmarc = !strcmp((const char *) ptr->name, "solrmarc");
+
+    /* the rule keeps no state: any non-null pointer signals acceptance and
+       it is not used later */
+    return is_solrmarc ? (void *) wr_error : 0;
+}
+
+/**
+ * \brief rewrites "#..;" escape sequences in a record as single characters
+ * \param info not used
+ * \param record buffer holding the record; replaced by the decoded text
+ * \param wr_error not used
+ * \returns 0 always
+ *
+ * A '#' whose next two bytes are accepted by atoi_n_check() and which is
+ * followed by ';' is replaced by the character code atoi_n_check() stored
+ * (presumably the two-digit decimal value - confirm against atoi_n_check).
+ * Every other byte is copied through unchanged.
+ */
+static int convert_solrmarc(void *info, WRBUF record, WRBUF wr_error)
+{
+    WRBUF w = wrbuf_alloc();
+    const char *buf = wrbuf_buf(record);
+    size_t i, sz = wrbuf_len(record);
+    for (i = 0; i < sz; i++)
+    {
+        int ch;
+        /* "i + 3 < sz", not "i < sz - 3": sz is unsigned, so the
+           subtraction wraps for records shorter than 3 bytes and would
+           allow buf[i+3] to be read past the end of the record */
+        if (buf[i] == '#' && i + 3 < sz && buf[i+3] == ';'
+            && atoi_n_check(buf+i+1, 2, &ch))
+            i += 3; /* skip the two digits and ';'; the loop skips '#' */
+        else
+            ch = buf[i];
+        wrbuf_putc(w, ch);
+    }
+    /* swap the decoded text into the caller's buffer */
+    wrbuf_rewind(record);
+    wrbuf_write(record, wrbuf_buf(w), wrbuf_len(w));
+    wrbuf_destroy(w);
+    return 0;
+}
+
+static void destroy_solrmarc(void *info)
+{ /* intentionally empty: construct_solrmarc allocates nothing for the rule */
+}
 
 static void *construct_marc(const xmlNode *ptr,
                             const char *path, WRBUF wr_error)
@@ -255,12 +456,12 @@ static void *construct_marc(const xmlNode *ptr,
         nmem_destroy(nmem);
         return 0;
     }
-
     info->nmem = nmem;
     info->input_charset = 0;
     info->output_charset = 0;
     info->input_format_mode = 0;
     info->output_format_mode = 0;
+    info->leader_spec = 0;
 
     for (attr = ptr->properties; attr; attr = attr->next)
     {
@@ -276,11 +477,15 @@ static void *construct_marc(const xmlNode *ptr,
         else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat") &&
             attr->children && attr->children->type == XML_TEXT_NODE)
             output_format = (const char *) attr->children->content;
+        else if (!xmlStrcmp(attr->name, BAD_CAST "leaderspec") &&
+                 attr->children && attr->children->type == XML_TEXT_NODE)
+            info->leader_spec =
+                nmem_strdup(info->nmem,(const char *) attr->children->content);
         else
         {
             wrbuf_printf(wr_error, "Element <marc>: expected attributes"
                          "'inputformat', 'inputcharset', 'outputformat' or"
-                         " 'outputcharset', got attribute '%s'", 
+                         " 'outputcharset', got attribute '%s'",
                          attr->name);
             nmem_destroy(info->nmem);
             return 0;
@@ -301,24 +506,28 @@ static void *construct_marc(const xmlNode *ptr,
     {
         info->input_format_mode = YAZ_MARC_MARCXML;
         /** Libxml2 generates UTF-8 encoding by default .
-            So we convert from UTF-8 to outputcharset (if defined) 
+            So we convert from UTF-8 to outputcharset (if defined)
         */
         if (!info->input_charset && info->output_charset)
             info->input_charset = "utf-8";
     }
+    else if (!strcmp(input_format, "json"))
+    {
+        info->input_format_mode = YAZ_MARC_JSON;
+    }
     else
     {
         wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
                      " Unsupported input format"
-                     " defined by attribute value", 
+                     " defined by attribute value",
                      input_format);
         nmem_destroy(info->nmem);
         return 0;
     }
-    
+
     if (!output_format)
     {
-        wrbuf_printf(wr_error, 
+        wrbuf_printf(wr_error,
                      "Element <marc>: attribute 'outputformat' required");
         nmem_destroy(info->nmem);
         return 0;
@@ -349,11 +558,17 @@ static void *construct_marc(const xmlNode *ptr,
         if (info->input_charset && !info->output_charset)
             info->output_charset = "utf-8";
     }
+    else if (!strcmp(output_format, "json"))
+    {
+        info->output_format_mode = YAZ_MARC_JSON;
+        if (info->input_charset && !info->output_charset)
+            info->output_charset = "utf-8";
+    }
     else
     {
         wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
                      " Unsupported output format"
-                     " defined by attribute value", 
+                     " defined by attribute value",
                      output_format);
         nmem_destroy(info->nmem);
         return 0;
@@ -364,7 +579,7 @@ static void *construct_marc(const xmlNode *ptr,
                                         info->input_charset);
         if (!cd)
         {
-            wrbuf_printf(wr_error, 
+            wrbuf_printf(wr_error,
                          "Element <marc inputcharset='%s' outputcharset='%s'>:"
                          " Unsupported character set mapping"
                          " defined by attribute values",
@@ -396,21 +611,25 @@ static void *construct_marc(const xmlNode *ptr,
 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
 {
     struct marc_info *mi = info;
+    const char *input_charset = mi->input_charset;
     int ret = 0;
-    
-    yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
     yaz_marc_t mt = yaz_marc_create();
-    
+
     yaz_marc_xml(mt, mi->output_format_mode);
-    
-    if (cd)
-        yaz_marc_iconv(mt, cd);
+    if (mi->leader_spec)
+        yaz_marc_leader_spec(mt, mi->leader_spec);
+
     if (mi->input_format_mode == YAZ_MARC_ISO2709)
     {
         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
                                        wrbuf_len(record));
         if (sz > 0)
+        {
+            if (yaz_marc_check_marc21_coding(input_charset, wrbuf_buf(record),
+                                             wrbuf_len(record)))
+                input_charset = "utf-8";
             ret = 0;
+        }
         else
             ret = -1;
     }
@@ -439,13 +658,18 @@ static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
     }
     if (ret == 0)
     {
+        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, input_charset);
+
+        if (cd)
+            yaz_marc_iconv(mt, cd);
+
         wrbuf_rewind(record);
         ret = yaz_marc_write_mode(mt, record);
         if (ret)
             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
+        if (cd)
+            yaz_iconv_close(cd);
     }
-    if (cd)
-        yaz_iconv_close(cd);
     yaz_marc_destroy(mt);
     return ret;
 }
@@ -453,31 +677,40 @@ static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
 static void destroy_marc(void *info)
 {
     struct marc_info *mi = info;
-    
+
     nmem_destroy(mi->nmem);
 }
 
 int yaz_record_conv_configure_t(yaz_record_conv_t p, const xmlNode *ptr,
                                 struct yaz_record_conv_type *types)
 {
-    struct yaz_record_conv_type bt[2];
-    
+    struct yaz_record_conv_type bt[4];
+    size_t i = 0;
+
     /* register marc */
-    bt[0].construct = construct_marc;
-    bt[0].convert = convert_marc;
-    bt[0].destroy = destroy_marc;
+    bt[i].construct = construct_marc;
+    bt[i].convert = convert_marc;
+    bt[i++].destroy = destroy_marc;
+
+    bt[i-1].next = &bt[i];
+    bt[i].construct = construct_solrmarc;
+    bt[i].convert = convert_solrmarc;
+    bt[i++].destroy = destroy_solrmarc;
+
+    bt[i-1].next = &bt[i];
+    bt[i].construct = construct_select;
+    bt[i].convert = convert_select;
+    bt[i++].destroy = destroy_select;
 
 #if YAZ_HAVE_XSLT
     /* register xslt */
-    bt[0].next = &bt[1];
-    bt[1].next = types;
-    bt[1].construct = construct_xslt;
-    bt[1].convert = convert_xslt;
-    bt[1].destroy = destroy_xslt;
-#else
-    bt[0].next = types;
+    bt[i-1].next = &bt[i];
+    bt[i].construct = construct_xslt;
+    bt[i].convert = convert_xslt;
+    bt[i++].destroy = destroy_xslt;
 #endif
-    
+
+    bt[i-1].next = types;
     yaz_record_conv_reset(p);
 
     /* parsing element children */
@@ -530,7 +763,7 @@ static int yaz_record_conv_record_rule(yaz_record_conv_t p,
     int ret = 0;
     WRBUF record = output_record; /* pointer transfer */
     wrbuf_rewind(p->wr_error);
-    
+
     wrbuf_write(record, input_record_buf, input_record_len);
     for (; ret == 0 && r; r = r->next)
         ret = r->type->convert(r->info, record, p->wr_error);
@@ -544,25 +777,32 @@ int yaz_record_conv_opac_record(yaz_record_conv_t p,
     int ret = 0;
     struct yaz_record_conv_rule *r = p->rules;
     if (!r || r->type->construct != construct_marc)
+    {
+        wrbuf_puts(p->wr_error, "Expecting MARC rule as first rule for OPAC");
         ret = -1; /* no marc rule so we can't do OPAC */
+    }
     else
     {
         struct marc_info *mi = r->info;
+        const char *input_charset = mi->input_charset;
+        yaz_iconv_t cd;
 
         WRBUF res = wrbuf_alloc();
         yaz_marc_t mt = yaz_marc_create();
-        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
-                                        mi->input_charset);
-        
+
+        if (yaz_opac_check_marc21_coding(input_charset, input_record))
+            input_charset = "utf-8";
+        cd = yaz_iconv_open(mi->output_charset, input_charset);
+
         wrbuf_rewind(p->wr_error);
         yaz_marc_xml(mt, mi->output_format_mode);
-        
+
         yaz_marc_iconv(mt, cd);
-        
+
         yaz_opac_decode_wrbuf(mt, input_record, res);
         if (ret != -1)
         {
-            ret = yaz_record_conv_record_rule(p, 
+            ret = yaz_record_conv_record_rule(p,
                                               r->next,
                                               wrbuf_buf(res), wrbuf_len(res),
                                               output_record);
@@ -605,9 +845,6 @@ yaz_record_conv_t yaz_record_conv_create()
     p->wr_error = wrbuf_alloc();
     p->rules = 0;
     p->path = 0;
-#if YAZ_HAVE_EXSLT
-    exsltRegisterAll(); 
-#endif    
     return p;
 }