1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
7 * \brief Record Conversions utility
15 #include <yaz/yaz-iconv.h>
16 #include <yaz/marcdisp.h>
17 #include <yaz/record_conv.h>
18 #include <yaz/wrbuf.h>
19 #include <yaz/xmalloc.h>
21 #include <yaz/tpath.h>
22 #include <yaz/z-opac.h>
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <libxml/xinclude.h>
28 #include <libxml/xpath.h>
29 #include <libxml/xpathInternals.h>
31 #include <libxslt/xsltutils.h>
32 #include <libxslt/transform.h>
35 #include <libexslt/exslt.h>
38 /** \brief The internal structure for yaz_record_conv_t
 *
 * One instance carries what a configured converter needs: the chain of
 * conversion rules, the allocator the chain is built from, an error
 * buffer and the search path handed to the rule constructors.
 * NOTE(review): this listing omits some member declarations; they are
 * referenced elsewhere in the file as p->nmem, p->wr_error and p->path.
 */
39 struct yaz_record_conv_struct {
40 /** \brief memory for configuration; rule nodes are allocated from it
 *  in yaz_record_conv_configure_t */
43 /** \brief conversion rules (allocated using NMEM) */
44 struct yaz_record_conv_rule *rules;
46 /** \brief pointer to last conversion rule pointer in chain;
 *  set to the address of rules by yaz_record_conv_reset and moved to
 *  the next member of each rule appended during configuration */
47 struct yaz_record_conv_rule **rules_p;
49 /** \brief string buffer for error messages; its content is what
 *  yaz_record_conv_get_error returns */
52 /** \brief path for opening files; assigned by yaz_record_conv_set_path
 *  and passed to every rule constructor */
/* The five members below are accessed through struct marc_info pointers
 * in construct_marc, convert_marc and yaz_record_conv_opac_record.
 * NOTE(review): the line that opens that struct is not part of this
 * listing - confirm against the full file. */
/** \brief name of the source character set; construct_marc defaults it
 *  to utf-8 for xml input when only an output charset was given */
58 const char *input_charset;
/** \brief name of the target character set handed to yaz_iconv_open */
59 const char *output_charset;
/** \brief YAZ_MARC_ constant selected from the inputformat attribute */
60 int input_format_mode;
/** \brief YAZ_MARC_ constant selected from the outputformat attribute;
 *  passed to yaz_marc_xml before the record is written */
61 int output_format_mode;
/** \brief value passed to yaz_marc_leader_spec; initialised to 0 by
 *  construct_marc */
62 const char *leader_spec;
65 /** \brief transformation info (rule info)
 *
 * One node of the singly linked rule chain kept in
 * yaz_record_conv_struct.
 * NOTE(review): the rule-private data member used elsewhere in the file
 * as r->info is not visible in this listing.
 */
66 struct yaz_record_conv_rule {
/** \brief callbacks for this rule; yaz_record_conv_configure_t stores a
 *  copy of the matching type descriptor here */
67 struct yaz_record_conv_type *type;
/** \brief following rule in the chain, or null for the last one */
69 struct yaz_record_conv_rule *next;
72 /** \brief reset rules+configuration
 *
 * Invokes the destroy callback of every rule in the chain on the info
 * pointer of that rule, empties the error buffer and points rules_p
 * back at the head pointer so that the next rule appended becomes the
 * first one.
 * NOTE(review): the numbering gaps show that lines of this function
 * are missing from the listing; only the visible statements are
 * described here.
 * \param p converter handle; dereferenced without a null check in the
 *          visible lines
 */
73 static void yaz_record_conv_reset(yaz_record_conv_t p)
76 struct yaz_record_conv_rule *r;
/* release the rule-private data; the visible lines do not free the
   rule nodes themselves */
77 for (r = p->rules; r; r = r->next)
79 r->type->destroy(r->info);
81 wrbuf_rewind(p->wr_error);
/* appending starts again at the head of the chain */
86 p->rules_p = &p->rules;
/** \brief destroys a converter handle
 *
 * Resets the rule chain (which runs the destroy callback of each rule),
 * then releases the NMEM allocator and the error buffer.
 * NOTE(review): lines around these calls are not part of this listing,
 * so a guard on p or the release of other members cannot be confirmed
 * from here.
 * \param p converter handle
 */
89 void yaz_record_conv_destroy(yaz_record_conv_t p)
/* the reset walks the rule chain, whose nodes are allocated from the
   NMEM released on the next line, so it has to run first */
93 yaz_record_conv_reset(p);
94 nmem_destroy(p->nmem);
95 wrbuf_destroy(p->wr_error);
/** \brief parameter list handed to xsltApplyStylesheet: alternating
 *  name and quoted value entries followed by a terminating null entry,
 *  as filled in by construct_xslt.
 *  NOTE(review): the enclosing struct lines are not in this listing. */
106 const char **xsl_parms;
/** \brief constructor callback for an xslt rule
 *
 * Visible behaviour:
 *  - compares the element name against xslt;
 *  - accepts only a stylesheet attribute and reports any other
 *    attribute in wr_error;
 *  - allocates a private NMEM holding the info struct and a parameter
 *    array with room for max_parms name/value pairs plus a terminator;
 *  - reads the name and value attributes of each param child element,
 *    wraps the value in single quotes and stores the pair;
 *  - resolves the stylesheet file with yaz_filepath_resolve, parses it
 *    with xmlParseFile and test-compiles a copy with
 *    xsltParseStylesheetDoc, freeing the compiled result again.
 * NOTE(review): many lines (braces, returns, some declarations and
 * conditions) are missing from this listing, so the exact control flow
 * and the return value on each path must be confirmed in the full file.
 * \param ptr      XML element to inspect
 * \param path     search path used when resolving the stylesheet
 * \param wr_error buffer that receives a message on failure
 */
109 static void *construct_xslt(const xmlNode *ptr,
110 const char *path, WRBUF wr_error)
112 struct _xmlAttr *attr;
113 const char *stylesheet = 0;
114 struct xslt_info *info = 0;
/* strcmp is non-zero when the element is not named xslt */
119 if (strcmp((const char *) ptr->name, "xslt"))
122 for (attr = ptr->properties; attr; attr = attr->next)
124 if (!xmlStrcmp(attr->name, BAD_CAST "stylesheet") &&
125 attr->children && attr->children->type == XML_TEXT_NODE)
/* points into the XML tree; no copy is taken here */
126 stylesheet = (const char *) attr->children->content;
/* NOTE(review): the two literals are joined without a separator, so
   nothing stands between the quoted name and the word Expected */
129 wrbuf_printf(wr_error, "Bad attribute '%s'"
130 "Expected stylesheet.", attr->name);
/* everything owned by this rule is allocated from one private NMEM */
134 nmem = nmem_create();
135 info = nmem_malloc(nmem, sizeof(*info));
/* two slots per parameter plus one slot for the terminating entry */
137 info->xsl_parms = nmem_malloc(
138 nmem, (2 * max_parms + 1) * sizeof(*info->xsl_parms));
/* from here on ptr walks the children of the xslt element */
140 for (ptr = ptr->children; ptr; ptr = ptr->next)
142 const char *name = 0;
143 const char *value = 0;
145 if (ptr->type != XML_ELEMENT_NODE)
147 if (strcmp((const char *) ptr->name, "param"))
149 wrbuf_printf(wr_error, "Bad element '%s'"
150 "Expected param.", ptr->name);
154 for (attr = ptr->properties; attr; attr = attr->next)
156 if (!xmlStrcmp(attr->name, BAD_CAST "name") &&
157 attr->children && attr->children->type == XML_TEXT_NODE)
158 name = (const char *) attr->children->content;
159 else if (!xmlStrcmp(attr->name, BAD_CAST "value") &&
160 attr->children && attr->children->type == XML_TEXT_NODE)
161 value = (const char *) attr->children->content;
164 wrbuf_printf(wr_error, "Bad attribute '%s'"
165 "Expected name or value.", attr->name);
172 wrbuf_printf(wr_error, "Missing attributes name or value");
/* the array was sized for max_parms pairs */
176 if (no_parms >= max_parms)
178 wrbuf_printf(wr_error, "Too many parameters given");
/* room for the value, the two quote characters and the terminator.
   NOTE(review): a quote character inside value is copied as is. */
183 qvalue = nmem_malloc(nmem, strlen(value) + 3);
184 strcpy(qvalue, "\'");
185 strcat(qvalue, value);
186 strcat(qvalue, "\'");
/* the name is copied into the NMEM as well, so the stored pair does
   not point into the XML tree */
188 info->xsl_parms[2 * no_parms] = nmem_strdup(nmem, name);
189 info->xsl_parms[2 * no_parms + 1] = qvalue;
/* terminate the parameter list; a character constant with value zero
   serves as the null pointer here */
193 info->xsl_parms[2 * no_parms] = '\0';
197 wrbuf_printf(wr_error, "Element <xslt>: "
198 "attribute 'stylesheet' expected");
204 xsltStylesheetPtr xsp;
/* a zero result from yaz_filepath_resolve is reported as not found */
205 if (!yaz_filepath_resolve(stylesheet, path, 0, fullpath))
207 wrbuf_printf(wr_error, "Element <xslt stylesheet=\"%s\"/>:"
208 " could not locate stylesheet '%s'",
209 stylesheet, stylesheet);
211 wrbuf_printf(wr_error, " with path '%s'", path);
/* the parsed document is kept in info for later conversions */
216 info->xsp_doc = xmlParseFile(fullpath);
219 wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
220 " xml parse failed: %s", stylesheet, fullpath);
222 wrbuf_printf(wr_error, " with path '%s'", path);
226 /* need to copy this before passing it to the processor. It will
227 be encapsulated in the xsp and destroyed by xsltFreeStylesheet */
228 xsp = xsltParseStylesheetDoc(xmlCopyDoc(info->xsp_doc, 1));
231 wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
232 " xslt parse failed: %s", stylesheet, fullpath);
234 wrbuf_printf(wr_error, " with path '%s'", path);
235 wrbuf_printf(wr_error, " ("
240 "EXSLT not supported"
/* compile check failed: release the parsed document and everything
   allocated from the private NMEM */
243 xmlFreeDoc(info->xsp_doc);
244 nmem_destroy(info->nmem);
/* the compiled stylesheet only served as a check; convert_xslt
   compiles its own copy for each record */
248 xsltFreeStylesheet(xsp);
/** \brief convert callback for an xslt rule
 *
 * Parses the record as XML, compiles a fresh copy of the stylesheet
 * document stored in the rule info, applies it with the stored
 * parameter list and, when a result is produced, replaces the content
 * of record with the serialised result.
 * NOTE(review): lines are missing from this listing (conditions,
 * returns, releases of doc, res and out_buf); only the visible
 * statements are described.
 * \param vinfo    the struct xslt_info built by construct_xslt
 * \param record   buffer converted in place
 * \param wr_error buffer that receives a message on failure
 */
255 static int convert_xslt(void *vinfo, WRBUF record, WRBUF wr_error)
258 struct xslt_info *info = vinfo;
260 xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
264 wrbuf_printf(wr_error, "xmlParseMemory failed");
/* the stylesheet is recompiled from a document copy on every call.
   NOTE(review): no null check on xsp is visible before it is used. */
269 xmlDocPtr xsp_doc = xmlCopyDoc(info->xsp_doc, 1);
270 xsltStylesheetPtr xsp = xsltParseStylesheetDoc(xsp_doc);
271 xmlDocPtr res = xsltApplyStylesheet(xsp, doc, info->xsl_parms);
274 xmlChar *out_buf = 0;
/* two serialisers: the libxslt one when the build provides it,
   otherwise the plain libxml2 document dump */
277 #if HAVE_XSLTSAVERESULTTOSTRING
278 xsltSaveResultToString(&out_buf, &out_len, res, xsp);
280 xmlDocDumpFormatMemory (res, &out_buf, &out_len, 1);
284 wrbuf_printf(wr_error,
285 "xsltSaveResultToString failed");
/* the serialised result replaces the previous content of record */
290 wrbuf_rewind(record);
291 wrbuf_write(record, (const char *) out_buf, out_len);
299 wrbuf_printf(wr_error, "xsltApplyStylesheet failed");
303 xsltFreeStylesheet(xsp); /* frees xsp_doc too */
/** \brief destroy callback for an xslt rule
 *
 * Frees the parsed stylesheet document and then the private NMEM.
 * NOTE(review): lines between the declaration and the two calls are
 * not part of this listing.
 * \param vinfo the struct xslt_info built by construct_xslt
 */
308 static void destroy_xslt(void *vinfo)
310 struct xslt_info *info = vinfo;
/* the document is freed before the NMEM is destroyed, because info is
   still dereferenced by the nmem_destroy argument */
314 xmlFreeDoc(info->xsp_doc);
315 nmem_destroy(info->nmem);
/** \brief constructor callback for a select rule
 *
 * Compares the element name against select, allocates a private NMEM
 * with a select_info struct, clears xpath_expr and scans the
 * attributes: the content of a path attribute is duplicated into the
 * NMEM, any other attribute is reported in wr_error.
 * NOTE(review): the line that receives the nmem_strdup result and the
 * return statements are missing from this listing; presumably the copy
 * is stored in info->xpath_expr - confirm in the full file.
 * \param ptr      XML element to inspect
 * \param path     not referenced in the visible lines
 * \param wr_error buffer that receives a message on failure
 */
327 static void *construct_select(const xmlNode *ptr,
328 const char *path, WRBUF wr_error)
330 if (strcmp((const char *) ptr->name, "select"))
334 struct _xmlAttr *attr;
335 NMEM nmem = nmem_create();
336 struct select_info *info = nmem_malloc(nmem, sizeof(*info));
/* stays 0 when no path attribute is present; convert_select tests
   for that */
339 info->xpath_expr = 0;
340 for (attr = ptr->properties; attr; attr = attr->next)
342 if (!xmlStrcmp(attr->name, BAD_CAST "path") &&
343 attr->children && attr->children->type == XML_TEXT_NODE)
345 nmem_strdup(nmem, (const char *) attr->children->content);
/* NOTE(review): the message names xpath although the attribute tested
   above is called path; the literals are also joined without a
   separator */
348 wrbuf_printf(wr_error, "Bad attribute '%s'"
349 "Expected xpath.", attr->name);
/** \brief convert callback for a select rule
 *
 * Parses the record as XML and, when an XPath context could be created
 * and an expression is configured, evaluates the expression. If the
 * resulting node set is non-empty the record is rewound, and text node
 * content reached from each result node is appended to it. The XPath
 * object and context are freed afterwards.
 * NOTE(review): lines are missing from this listing, among them the
 * statement executed for element nodes and any null checks on the
 * evaluation result; the record appears to stay unchanged when the
 * node set is empty, which should be confirmed in the full file.
 * \param vinfo    the struct select_info built by construct_select
 * \param record   buffer converted in place
 * \param wr_error buffer that receives a message on failure
 */
358 static int convert_select(void *vinfo, WRBUF record, WRBUF wr_error)
361 struct select_info *info = vinfo;
363 xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
367 wrbuf_printf(wr_error, "xmlParseMemory failed");
372 xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
/* evaluation is skipped when no context or no expression exists */
373 if (xpathCtx && info->xpath_expr)
375 xmlXPathObjectPtr xpathObj =
376 xmlXPathEvalExpression((const xmlChar *) info->xpath_expr,
380 xmlNodeSetPtr nodes = xpathObj->nodesetval;
/* the old content is dropped only when there is something to put in
   its place */
384 if (nodes->nodeNr > 0)
385 wrbuf_rewind(record);
386 for (i = 0; i < nodes->nodeNr; i++)
388 xmlNode *ptr = nodes->nodeTab[i];
389 if (ptr->type == XML_ELEMENT_NODE)
/* walks ptr and its following siblings, copying text nodes only */
391 for (; ptr; ptr = ptr->next)
392 if (ptr->type == XML_TEXT_NODE)
393 wrbuf_puts(record, (const char *) ptr->content);
396 xmlXPathFreeObject(xpathObj);
398 xmlXPathFreeContext(xpathCtx);
/** \brief destroy callback for a select rule
 *
 * Destroys the private NMEM referenced by the rule info.
 * \param vinfo the struct select_info built by construct_select
 */
405 static void destroy_select(void *vinfo)
407 struct select_info *info = vinfo;
410 nmem_destroy(info->nmem);
/** \brief constructor callback for a solrmarc rule
 *
 * Compares the element name against solrmarc and returns the error
 * buffer pointer as a stand-in for rule info, because this rule keeps
 * no state of its own.
 * NOTE(review): the statement taken when the name does not match is
 * missing from this listing.
 * \param ptr      XML element to inspect
 * \param path     not referenced in the visible lines
 * \param wr_error returned as the dummy info pointer
 */
414 static void *construct_solrmarc(const xmlNode *ptr,
415 const char *path, WRBUF wr_error)
417 if (strcmp((const char *) ptr->name, "solrmarc"))
419 return wr_error; /* any non-null ptr will do; we don't use it later*/
/** \brief convert callback for a solrmarc rule
 *
 * Walks the record byte by byte and tests each position for a four
 * byte pattern: a hash sign, two characters accepted by atoi_n_check,
 * and a semicolon. Afterwards the record is rewound and overwritten
 * with the content of the temporary buffer w.
 * NOTE(review): the statements that fill w, the release of w and the
 * return statement are missing from this listing.
 * \param info     not referenced in the visible lines
 * \param record   buffer converted in place
 * \param wr_error not referenced in the visible lines
 */
422 static int convert_solrmarc(void *info, WRBUF record, WRBUF wr_error)
424 WRBUF w = wrbuf_alloc();
425 const char *buf = wrbuf_buf(record);
426 size_t i, sz = wrbuf_len(record);
427 for (i = 0; i < sz; i++)
/* NOTE(review): sz is a size_t, so sz - 3 wraps to a huge value when
   the record holds one or two bytes; the bound test then passes and
   buf[i+3] is read. Writing the test as i + 3 < sz would avoid the
   wrap. */
430 if (buf[i] == '#' && i < sz - 3 && buf[i+3] == ';'
431 && atoi_n_check(buf+i+1, 2, &ch))
/* the converted text replaces the original record content */
437 wrbuf_rewind(record);
438 wrbuf_write(record, wrbuf_buf(w), wrbuf_len(w));
/** \brief destroy callback for a solrmarc rule
 *
 * construct_solrmarc hands out the error buffer pointer as a dummy
 * info value rather than an allocation owned by the rule.
 * NOTE(review): the body of this function is not part of this listing;
 * presumably it releases nothing - confirm in the full file.
 * \param info dummy pointer returned by construct_solrmarc
 */
443 static void destroy_solrmarc(void *info)
/** \brief constructor callback for a marc rule
 *
 * Visible behaviour:
 *  - a private NMEM and the marc_info struct are allocated before the
 *    element name is compared against marc;
 *  - the attributes inputcharset, outputcharset, inputformat,
 *    outputformat and leaderspec are recognised; any other attribute
 *    produces an error message and the NMEM is destroyed;
 *  - inputformat is required and maps marc, xml and json to
 *    YAZ_MARC_ISO2709, YAZ_MARC_MARCXML and YAZ_MARC_JSON; for xml the
 *    input charset defaults to utf-8 when only an output charset is
 *    given;
 *  - outputformat is required and maps line, marcxml, turbomarc, marc,
 *    marcxchange and json to the matching YAZ_MARC_ constant; for
 *    marcxml, turbomarc, marcxchange and json the output charset
 *    defaults to utf-8 when only an input charset is given;
 *  - when both charsets are known the mapping is probed with
 *    yaz_iconv_open; when exactly one is known the missing one is
 *    reported as an error;
 *  - the last visible statements duplicate both charset names into the
 *    private NMEM; until then they point into XML attribute content or
 *    at a string literal.
 * NOTE(review): braces, returns and other short lines are missing from
 * this listing; the return value on each path and the release of the
 * probe handle must be confirmed in the full file.
 * \param ptr      XML element to inspect
 * \param path     not referenced in the visible lines
 * \param wr_error buffer that receives a message on failure
 */
447 static void *construct_marc(const xmlNode *ptr,
448 const char *path, WRBUF wr_error)
450 NMEM nmem = nmem_create();
451 struct marc_info *info = nmem_malloc(nmem, sizeof(*info));
452 struct _xmlAttr *attr;
/* the two format names are only needed while parsing, so they are
   kept in locals instead of the info struct */
453 const char *input_format = 0;
454 const char *output_format = 0;
456 if (strcmp((const char *) ptr->name, "marc"))
/* defaults; a zero format mode or charset means not configured */
462 info->input_charset = 0;
463 info->output_charset = 0;
464 info->input_format_mode = 0;
465 info->output_format_mode = 0;
466 info->leader_spec = 0;
468 for (attr = ptr->properties; attr; attr = attr->next)
470 if (!xmlStrcmp(attr->name, BAD_CAST "inputcharset") &&
471 attr->children && attr->children->type == XML_TEXT_NODE)
472 info->input_charset = (const char *) attr->children->content;
473 else if (!xmlStrcmp(attr->name, BAD_CAST "outputcharset") &&
474 attr->children && attr->children->type == XML_TEXT_NODE)
475 info->output_charset = (const char *) attr->children->content;
476 else if (!xmlStrcmp(attr->name, BAD_CAST "inputformat") &&
477 attr->children && attr->children->type == XML_TEXT_NODE)
478 input_format = (const char *) attr->children->content;
479 else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat") &&
480 attr->children && attr->children->type == XML_TEXT_NODE)
481 output_format = (const char *) attr->children->content;
482 else if (!xmlStrcmp(attr->name, BAD_CAST "leaderspec") &&
483 attr->children && attr->children->type == XML_TEXT_NODE)
/* NOTE(review): the line receiving this copy is missing from the
   listing; presumably it is info->leader_spec */
485 nmem_strdup(info->nmem,(const char *) attr->children->content);
/* NOTE(review): the message lists four attribute names although
   leaderspec is accepted above as well, and the first two literals
   are joined without a space */
488 wrbuf_printf(wr_error, "Element <marc>: expected attributes"
489 "'inputformat', 'inputcharset', 'outputformat' or"
490 " 'outputcharset', got attribute '%s'",
492 nmem_destroy(info->nmem);
498 wrbuf_printf(wr_error, "Element <marc>: "
499 "attribute 'inputformat' required");
500 nmem_destroy(info->nmem);
503 else if (!strcmp(input_format, "marc"))
505 info->input_format_mode = YAZ_MARC_ISO2709;
507 else if (!strcmp(input_format, "xml"))
509 info->input_format_mode = YAZ_MARC_MARCXML;
510 /** Libxml2 generates UTF-8 encoding by default .
511 So we convert from UTF-8 to outputcharset (if defined)
513 if (!info->input_charset && info->output_charset)
514 info->input_charset = "utf-8";
516 else if (!strcmp(input_format, "json"))
518 info->input_format_mode = YAZ_MARC_JSON;
522 wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
523 " Unsupported input format"
524 " defined by attribute value",
526 nmem_destroy(info->nmem);
532 wrbuf_printf(wr_error,
533 "Element <marc>: attribute 'outputformat' required");
534 nmem_destroy(info->nmem);
537 else if (!strcmp(output_format, "line"))
539 info->output_format_mode = YAZ_MARC_LINE;
541 else if (!strcmp(output_format, "marcxml"))
543 info->output_format_mode = YAZ_MARC_MARCXML;
544 if (info->input_charset && !info->output_charset)
545 info->output_charset = "utf-8";
547 else if (!strcmp(output_format, "turbomarc"))
549 info->output_format_mode = YAZ_MARC_TURBOMARC;
550 if (info->input_charset && !info->output_charset)
551 info->output_charset = "utf-8";
553 else if (!strcmp(output_format, "marc"))
555 info->output_format_mode = YAZ_MARC_ISO2709;
557 else if (!strcmp(output_format, "marcxchange"))
559 info->output_format_mode = YAZ_MARC_XCHANGE;
560 if (info->input_charset && !info->output_charset)
561 info->output_charset = "utf-8";
563 else if (!strcmp(output_format, "json"))
565 info->output_format_mode = YAZ_MARC_JSON;
566 if (info->input_charset && !info->output_charset)
567 info->output_charset = "utf-8";
571 wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
572 " Unsupported output format"
573 " defined by attribute value",
575 nmem_destroy(info->nmem);
578 if (info->input_charset && info->output_charset)
580 yaz_iconv_t cd = yaz_iconv_open(info->output_charset,
581 info->input_charset);
584 wrbuf_printf(wr_error,
585 "Element <marc inputcharset='%s' outputcharset='%s'>:"
586 " Unsupported character set mapping"
587 " defined by attribute values",
588 info->input_charset, info->output_charset);
589 nmem_destroy(info->nmem);
594 else if (!info->output_charset)
596 wrbuf_printf(wr_error, "Element <marc>: "
597 "attribute 'outputcharset' missing");
598 nmem_destroy(info->nmem);
601 else if (!info->input_charset)
603 wrbuf_printf(wr_error, "Element <marc>: "
604 "attribute 'inputcharset' missing");
605 nmem_destroy(info->nmem);
608 info->input_charset = nmem_strdup(info->nmem, info->input_charset);
609 info->output_charset = nmem_strdup(info->nmem, info->output_charset);
/** \brief convert callback for a marc rule
 *
 * Creates a yaz_marc_t handle, selects the configured output mode and
 * leader spec, reads the record either as ISO2709 or as XML, installs
 * a character set conversion from the input to the output charset and
 * writes the result back into record.
 * For ISO2709 input the input charset is switched to utf-8 when
 * yaz_marc_check_marc21_coding returns non-zero.
 * NOTE(review): construct_marc as shown also accepts json as input
 * format, but no JSON branch is visible here; such input appears to
 * end in the unsupported input format message - confirm in the full
 * file. Lines are missing from this listing (conditions, returns,
 * release of doc and cd).
 * \param info     the struct marc_info built by construct_marc
 * \param record   buffer converted in place
 * \param wr_error buffer that receives a message on failure
 */
613 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
615 struct marc_info *mi = info;
/* local copy: it may be replaced below without touching the rule */
616 const char *input_charset = mi->input_charset;
618 yaz_marc_t mt = yaz_marc_create();
620 yaz_marc_xml(mt, mi->output_format_mode);
622 yaz_marc_leader_spec(mt, mi->leader_spec);
624 if (mi->input_format_mode == YAZ_MARC_ISO2709)
626 int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
630 if (yaz_marc_check_marc21_coding(input_charset, wrbuf_buf(record),
632 input_charset = "utf-8";
638 else if (mi->input_format_mode == YAZ_MARC_MARCXML ||
639 mi->input_format_mode == YAZ_MARC_TURBOMARC)
641 xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
645 wrbuf_printf(wr_error, "xmlParseMemory failed");
650 ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
652 wrbuf_printf(wr_error, "yaz_marc_read_xml failed");
658 wrbuf_printf(wr_error, "unsupported input format");
/* conversion uses the possibly adjusted input charset */
663 yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, input_charset);
666 yaz_marc_iconv(mt, cd);
/* the written output replaces the original record content */
668 wrbuf_rewind(record);
669 ret = yaz_marc_write_mode(mt, record);
671 wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
675 yaz_marc_destroy(mt);
/** \brief destroy callback for a marc rule
 *
 * Destroys the private NMEM referenced by the rule info.
 * \param info the struct marc_info built by construct_marc
 */
679 static void destroy_marc(void *info)
681 struct marc_info *mi = info;
683 nmem_destroy(mi->nmem);
/** \brief configures a converter from XML, with optional extra types
 *
 * Builds a local chain of the built-in rule types in the order marc,
 * solrmarc, select, xslt, links the caller supplied types after the
 * last built-in one, resets the converter and then, for every element
 * child of ptr, offers the element to each type constructor in turn.
 * The first constructor that returns info or leaves an error message
 * ends the search. A rule node is allocated from the NMEM of the
 * converter for the element and appended to the chain.
 * NOTE(review): lines are missing from this listing (declarations,
 * the error returns, the stores into the new rule); the return values
 * must be confirmed in the full file.
 * \param p     converter handle
 * \param ptr   XML element whose children describe the rules
 * \param types additional rule types, or 0 for none
 */
686 int yaz_record_conv_configure_t(yaz_record_conv_t p, const xmlNode *ptr,
687 struct yaz_record_conv_type *types)
/* room for the four built-in types set up below */
689 struct yaz_record_conv_type bt[4];
693 bt[i].construct = construct_marc;
694 bt[i].convert = convert_marc;
695 bt[i++].destroy = destroy_marc;
/* i was advanced above, so bt[i-1] is the entry just filled in */
697 bt[i-1].next = &bt[i];
698 bt[i].construct = construct_solrmarc;
699 bt[i].convert = convert_solrmarc;
700 bt[i++].destroy = destroy_solrmarc;
702 bt[i-1].next = &bt[i];
703 bt[i].construct = construct_select;
704 bt[i].convert = convert_select;
705 bt[i++].destroy = destroy_select;
709 bt[i-1].next = &bt[i];
710 bt[i].construct = construct_xslt;
711 bt[i].convert = convert_xslt;
712 bt[i++].destroy = destroy_xslt;
/* caller supplied types are tried after the built-in ones */
715 bt[i-1].next = types;
/* any previous configuration is dropped before parsing */
716 yaz_record_conv_reset(p);
718 /* parsing element children */
719 for (ptr = ptr->children; ptr; ptr = ptr->next)
721 struct yaz_record_conv_type *t;
722 struct yaz_record_conv_rule *r;
724 if (ptr->type != XML_ELEMENT_NODE)
726 for (t = &bt[0]; t; t = t->next)
/* start each attempt with an empty error buffer, so that a message
   found afterwards belongs to the constructor just tried */
728 wrbuf_rewind(p->wr_error);
729 info = t->construct(ptr, p->path, p->wr_error);
731 if (info || wrbuf_len(p->wr_error))
733 /* info== 0 and no error reported , ie not handled by it */
/* NOTE(review): the fallback text names only marc and xslt although
   select, solrmarc and caller supplied types are tried as well */
737 if (wrbuf_len(p->wr_error) == 0)
738 wrbuf_printf(p->wr_error, "Element <backend>: expected "
739 "<marc> or <xslt> element, got <%s>"
743 r = (struct yaz_record_conv_rule *) nmem_malloc(p->nmem, sizeof(*r));
/* bt is a local array, so the descriptor is copied into memory that
   lives as long as the configuration */
746 r->type = nmem_malloc(p->nmem, sizeof(*t));
747 memcpy(r->type, t, sizeof(*t));
/* the next rule will be linked behind this one */
749 p->rules_p = &r->next;
/** \brief configures a converter from XML using the built-in types only
 *
 * Forwards to yaz_record_conv_configure_t with no additional types.
 * \param p   converter handle
 * \param ptr XML element whose children describe the rules
 * \returns the result of yaz_record_conv_configure_t
 */
754 int yaz_record_conv_configure(yaz_record_conv_t p, const xmlNode *ptr)
756 return yaz_record_conv_configure_t(p, ptr, 0);
/** \brief runs a chain of rules over one record
 *
 * Clears the error buffer, writes the input bytes into the output
 * buffer and then applies the convert callback of each rule, starting
 * at r, to that buffer in place. The loop stops at the end of the
 * chain or as soon as a callback returns non-zero.
 * NOTE(review): the last parameter line, the declaration of ret and
 * the return statement are missing from this listing.
 * \param p                converter handle; supplies the error buffer
 * \param r                first rule to apply, may be null
 * \param input_record_buf record bytes
 * \param input_record_len number of bytes in input_record_buf
 */
759 static int yaz_record_conv_record_rule(yaz_record_conv_t p,
760 struct yaz_record_conv_rule *r,
761 const char *input_record_buf,
762 size_t input_record_len,
766 WRBUF record = output_record; /* pointer transfer */
767 wrbuf_rewind(p->wr_error);
769 wrbuf_write(record, input_record_buf, input_record_len);
/* every rule sees the output of the previous one */
770 for (; ret == 0 && r; r = r->next)
771 ret = r->type->convert(r->info, record, p->wr_error);
/** \brief converts an OPAC record
 *
 * Requires the first configured rule to be a marc rule, which is
 * recognised by comparing its construct callback with construct_marc.
 * When that holds, the settings of that rule (charsets and output
 * mode) are used to decode the OPAC record into a temporary WRBUF with
 * yaz_opac_decode_wrbuf, and the decoded text is passed on to
 * yaz_record_conv_record_rule.
 * NOTE(review): lines are missing from this listing, among them the
 * rule argument of the final call, the releases of res and cd and the
 * return statement. In the visible lines the error text for a missing
 * marc rule is appended without the buffer being rewound first.
 * \param p            converter handle
 * \param input_record OPAC record to convert
 */
775 int yaz_record_conv_opac_record(yaz_record_conv_t p,
776 Z_OPACRecord *input_record,
780 struct yaz_record_conv_rule *r = p->rules;
781 if (!r || r->type->construct != construct_marc)
783 wrbuf_puts(p->wr_error, "Expecting MARC rule as first rule for OPAC");
784 ret = -1; /* no marc rule so we can't do OPAC */
/* the info of the first rule is known to be a marc_info here because
   of the construct callback comparison above */
788 struct marc_info *mi = r->info;
789 const char *input_charset = mi->input_charset;
792 WRBUF res = wrbuf_alloc();
793 yaz_marc_t mt = yaz_marc_create();
/* a non-zero result of the coding check overrides the configured
   input charset for this record only */
795 if (yaz_opac_check_marc21_coding(input_charset, input_record))
796 input_charset = "utf-8";
797 cd = yaz_iconv_open(mi->output_charset, input_charset);
799 wrbuf_rewind(p->wr_error);
800 yaz_marc_xml(mt, mi->output_format_mode);
802 yaz_marc_iconv(mt, cd);
804 yaz_opac_decode_wrbuf(mt, input_record, res);
807 ret = yaz_record_conv_record_rule(p,
809 wrbuf_buf(res), wrbuf_len(res),
812 yaz_marc_destroy(mt);
/** \brief converts one record with all configured rules
 *
 * Forwards to yaz_record_conv_record_rule starting at the first rule
 * of the converter.
 * NOTE(review): the last parameter line and one argument line of the
 * call are missing from this listing.
 * \param p                converter handle
 * \param input_record_buf record bytes
 * \param input_record_len number of bytes in input_record_buf
 * \returns the result of yaz_record_conv_record_rule
 */
820 int yaz_record_conv_record(yaz_record_conv_t p,
821 const char *input_record_buf,
822 size_t input_record_len,
825 return yaz_record_conv_record_rule(p, p->rules,
827 input_record_len, output_record);
/** \brief returns the current error message
 *
 * The string comes from the error buffer of the converter, which other
 * functions in this file rewind and refill, so the text reflects the
 * most recent configure or convert call.
 * \param p converter handle
 * \returns the content of the error buffer as a C string
 */
830 const char *yaz_record_conv_get_error(yaz_record_conv_t p)
832 return wrbuf_cstr(p->wr_error);
/** \brief sets the search path handed to the rule constructors
 *
 * Stores a private copy of path made with xstrdup.
 * NOTE(review): the lines before the assignment are missing from this
 * listing, so the handling of a previous value and of a null path
 * cannot be confirmed from here.
 * \param p    converter handle
 * \param path search path; copied, the caller keeps ownership
 */
835 void yaz_record_conv_set_path(yaz_record_conv_t p, const char *path)
840 p->path = xstrdup(path);
/** \brief creates a converter handle
 *
 * Allocates the handle with xmalloc and gives it a fresh NMEM and a
 * fresh error buffer.
 * NOTE(review): the initialisation of the remaining members and the
 * return statement are missing from this listing. The empty parameter
 * list is not a prototype in C before C23; (void) would state that no
 * arguments are taken.
 * \returns the new handle (per the declared return type)
 */
843 yaz_record_conv_t yaz_record_conv_create()
845 yaz_record_conv_t p = (yaz_record_conv_t) xmalloc(sizeof(*p));
846 p->nmem = nmem_create();
847 p->wr_error = wrbuf_alloc();
859 * c-file-style: "Stroustrup"
860 * indent-tabs-mode: nil
862 * vim: shiftwidth=4 tabstop=8 expandtab