Improve display of MARC records with multi-byte subfield IDs YAZ-695
[yaz-moved-to-github.git] / src / record_conv.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file record_conv.c
7  * \brief Record Conversions utility
8  */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <string.h>
15 #include <yaz/yaz-iconv.h>
16 #include <yaz/marcdisp.h>
17 #include <yaz/record_conv.h>
18 #include <yaz/wrbuf.h>
19 #include <yaz/xmalloc.h>
20 #include <yaz/nmem.h>
21 #include <yaz/tpath.h>
22 #include <yaz/z-opac.h>
23
24 #if YAZ_HAVE_XML2
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <libxml/xinclude.h>
28 #if YAZ_HAVE_XSLT
29 #include <libxslt/xsltutils.h>
30 #include <libxslt/transform.h>
31 #endif
32 #if YAZ_HAVE_EXSLT
33 #include <libexslt/exslt.h>
34 #endif
35
36 /** \brief The internal structure for yaz_record_conv_t */
37 struct yaz_record_conv_struct {
38     /** \brief memory for configuration */
39     NMEM nmem;
40
41     /** \brief conversion rules (allocated using NMEM) */
42     struct yaz_record_conv_rule *rules;
43
44     /** \brief pointer to last conversion rule pointer in chain */
45     struct yaz_record_conv_rule **rules_p;
46
47     /** \brief string buffer for error messages */
48     WRBUF wr_error;
49
50     /** \brief path for opening files  */
51     char *path;
52 };
53
54 struct marc_info {
55     NMEM nmem;
56     const char *input_charset;
57     const char *output_charset;
58     int input_format_mode;
59     int output_format_mode;
60     const char *leader_spec;
61 };
62
/** \brief transformation info (one rule in the conversion chain) */
struct yaz_record_conv_rule
{
    /** \brief conversion type providing the convert and destroy handlers */
    struct yaz_record_conv_type *type;
    /** \brief handle returned by the type's construct handler */
    void *info;
    /** \brief following rule in the chain; 0 for the last one */
    struct yaz_record_conv_rule *next;
};
69
70 /** \brief reset rules+configuration */
71 static void yaz_record_conv_reset(yaz_record_conv_t p)
72 {
73
74     struct yaz_record_conv_rule *r;
75     for (r = p->rules; r; r = r->next)
76     {
77         r->type->destroy(r->info);
78     }
79     wrbuf_rewind(p->wr_error);
80     nmem_reset(p->nmem);
81
82     p->rules = 0;
83
84     p->rules_p = &p->rules;
85 }
86
87 void yaz_record_conv_destroy(yaz_record_conv_t p)
88 {
89     if (p)
90     {
91         yaz_record_conv_reset(p);
92         nmem_destroy(p->nmem);
93         wrbuf_destroy(p->wr_error);
94
95         xfree(p->path);
96         xfree(p);
97     }
98 }
99
100 #if YAZ_HAVE_XSLT
101 struct xslt_info {
102     NMEM nmem;
103     xmlDocPtr xsp_doc;
104     const char **xsl_parms;
105 };
106
107 static void *construct_xslt(const xmlNode *ptr,
108                             const char *path, WRBUF wr_error)
109 {
110     struct _xmlAttr *attr;
111     const char *stylesheet = 0;
112     struct xslt_info *info = 0;
113     NMEM nmem = 0;
114     int max_parms = 10;
115     int no_parms = 0;
116
117     if (strcmp((const char *) ptr->name, "xslt"))
118         return 0;
119
120     for (attr = ptr->properties; attr; attr = attr->next)
121     {
122         if (!xmlStrcmp(attr->name, BAD_CAST "stylesheet") &&
123             attr->children && attr->children->type == XML_TEXT_NODE)
124             stylesheet = (const char *) attr->children->content;
125         else
126         {
127             wrbuf_printf(wr_error, "Bad attribute '%s'"
128                          "Expected stylesheet.", attr->name);
129             return 0;
130         }
131     }
132     nmem = nmem_create();
133     info = nmem_malloc(nmem, sizeof(*info));
134     info->nmem = nmem;
135     info->xsl_parms = nmem_malloc(
136         nmem, (2 * max_parms + 1) * sizeof(*info->xsl_parms));
137
138     for (ptr = ptr->children; ptr; ptr = ptr->next)
139     {
140         const char *name = 0;
141         const char *value = 0;
142         char *qvalue = 0;
143         if (ptr->type != XML_ELEMENT_NODE)
144             continue;
145         if (strcmp((const char *) ptr->name, "param"))
146         {
147             wrbuf_printf(wr_error, "Bad element '%s'"
148                          "Expected param.", ptr->name);
149             nmem_destroy(nmem);
150             return 0;
151         }
152         for (attr = ptr->properties; attr; attr = attr->next)
153         {
154             if (!xmlStrcmp(attr->name, BAD_CAST "name") &&
155                 attr->children && attr->children->type == XML_TEXT_NODE)
156                 name = (const char *) attr->children->content;
157             else if (!xmlStrcmp(attr->name, BAD_CAST "value") &&
158                 attr->children && attr->children->type == XML_TEXT_NODE)
159                 value = (const char *) attr->children->content;
160             else
161             {
162                 wrbuf_printf(wr_error, "Bad attribute '%s'"
163                              "Expected name or value.", attr->name);
164                 nmem_destroy(nmem);
165                 return 0;
166             }
167         }
168         if (!name || !value)
169         {
170             wrbuf_printf(wr_error, "Missing attributes name or value");
171             nmem_destroy(nmem);
172             return 0;
173         }
174         if (no_parms >= max_parms)
175         {
176             wrbuf_printf(wr_error, "Too many parameters given");
177             nmem_destroy(nmem);
178             return 0;
179         }
180
181         qvalue = nmem_malloc(nmem, strlen(value) + 3);
182         strcpy(qvalue, "\'");
183         strcat(qvalue, value);
184         strcat(qvalue, "\'");
185
186         info->xsl_parms[2 * no_parms] = nmem_strdup(nmem, name);
187         info->xsl_parms[2 * no_parms + 1] = qvalue;
188         no_parms++;
189     }
190
191     info->xsl_parms[2 * no_parms] = '\0';
192
193     if (!stylesheet)
194     {
195         wrbuf_printf(wr_error, "Element <xslt>: "
196                      "attribute 'stylesheet' expected");
197         nmem_destroy(nmem);
198     }
199     else
200     {
201         char fullpath[1024];
202         xsltStylesheetPtr xsp;
203         if (!yaz_filepath_resolve(stylesheet, path, 0, fullpath))
204         {
205             wrbuf_printf(wr_error, "Element <xslt stylesheet=\"%s\"/>:"
206                          " could not locate stylesheet '%s'",
207                          stylesheet, stylesheet);
208             if (path)
209                 wrbuf_printf(wr_error, " with path '%s'", path);
210
211             nmem_destroy(nmem);
212             return 0;
213         }
214         info->xsp_doc = xmlParseFile(fullpath);
215         if (!info->xsp_doc)
216         {
217             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
218                          " xml parse failed: %s", stylesheet, fullpath);
219             if (path)
220                 wrbuf_printf(wr_error, " with path '%s'", path);
221             nmem_destroy(nmem);
222             return 0;
223         }
224         /* need to copy this before passing it to the processor. It will
225            be encapsulated in the xsp and destroyed by xsltFreeStylesheet */
226         xsp = xsltParseStylesheetDoc(xmlCopyDoc(info->xsp_doc, 1));
227         if (!xsp)
228         {
229             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
230                          " xslt parse failed: %s", stylesheet, fullpath);
231             if (path)
232                 wrbuf_printf(wr_error, " with path '%s'", path);
233             wrbuf_printf(wr_error, " ("
234 #if YAZ_HAVE_EXSLT
235
236                          "EXSLT enabled"
237 #else
238                          "EXSLT not supported"
239 #endif
240                          ")");
241             xmlFreeDoc(info->xsp_doc);
242             nmem_destroy(info->nmem);
243         }
244         else
245         {
246             xsltFreeStylesheet(xsp);
247             return info;
248         }
249     }
250     return 0;
251 }
252
253 static int convert_xslt(void *vinfo, WRBUF record, WRBUF wr_error)
254 {
255     int ret = 0;
256     struct xslt_info *info = vinfo;
257
258     xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
259                                    wrbuf_len(record));
260     if (!doc)
261     {
262         wrbuf_printf(wr_error, "xmlParseMemory failed");
263         ret = -1;
264     }
265     else
266     {
267         xmlDocPtr xsp_doc = xmlCopyDoc(info->xsp_doc, 1);
268         xsltStylesheetPtr xsp = xsltParseStylesheetDoc(xsp_doc);
269         xmlDocPtr res = xsltApplyStylesheet(xsp, doc, info->xsl_parms);
270         if (res)
271         {
272             xmlChar *out_buf = 0;
273             int out_len;
274
275 #if HAVE_XSLTSAVERESULTTOSTRING
276             xsltSaveResultToString(&out_buf, &out_len, res, xsp);
277 #else
278             xmlDocDumpFormatMemory (res, &out_buf, &out_len, 1);
279 #endif
280             if (!out_buf)
281             {
282                 wrbuf_printf(wr_error,
283                              "xsltSaveResultToString failed");
284                 ret = -1;
285             }
286             else
287             {
288                 wrbuf_rewind(record);
289                 wrbuf_write(record, (const char *) out_buf, out_len);
290
291                 xmlFree(out_buf);
292             }
293             xmlFreeDoc(res);
294         }
295         else
296         {
297             wrbuf_printf(wr_error, "xsltApplyStylesheet failed");
298             ret = -1;
299         }
300         xmlFreeDoc(doc);
301         xsltFreeStylesheet(xsp); /* frees xsp_doc too */
302     }
303     return ret;
304 }
305
306 static void destroy_xslt(void *vinfo)
307 {
308     struct xslt_info *info = vinfo;
309
310     if (info)
311     {
312         xmlFreeDoc(info->xsp_doc);
313         nmem_destroy(info->nmem);
314     }
315 }
316
317 /* YAZ_HAVE_XSLT */
318 #endif
319
320
321 static void *construct_marc(const xmlNode *ptr,
322                             const char *path, WRBUF wr_error)
323 {
324     NMEM nmem = nmem_create();
325     struct marc_info *info = nmem_malloc(nmem, sizeof(*info));
326     struct _xmlAttr *attr;
327     const char *input_format = 0;
328     const char *output_format = 0;
329
330     if (strcmp((const char *) ptr->name, "marc"))
331     {
332         nmem_destroy(nmem);
333         return 0;
334     }
335
336     info->nmem = nmem;
337     info->input_charset = 0;
338     info->output_charset = 0;
339     info->input_format_mode = 0;
340     info->output_format_mode = 0;
341     info->leader_spec = 0;
342
343     for (attr = ptr->properties; attr; attr = attr->next)
344     {
345         if (!xmlStrcmp(attr->name, BAD_CAST "inputcharset") &&
346             attr->children && attr->children->type == XML_TEXT_NODE)
347             info->input_charset = (const char *) attr->children->content;
348         else if (!xmlStrcmp(attr->name, BAD_CAST "outputcharset") &&
349             attr->children && attr->children->type == XML_TEXT_NODE)
350             info->output_charset = (const char *) attr->children->content;
351         else if (!xmlStrcmp(attr->name, BAD_CAST "inputformat") &&
352             attr->children && attr->children->type == XML_TEXT_NODE)
353             input_format = (const char *) attr->children->content;
354         else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat") &&
355             attr->children && attr->children->type == XML_TEXT_NODE)
356             output_format = (const char *) attr->children->content;
357         else if (!xmlStrcmp(attr->name, BAD_CAST "leaderspec") &&
358                  attr->children && attr->children->type == XML_TEXT_NODE)
359             info->leader_spec =
360                 nmem_strdup(info->nmem,(const char *) attr->children->content);
361         else
362         {
363             wrbuf_printf(wr_error, "Element <marc>: expected attributes"
364                          "'inputformat', 'inputcharset', 'outputformat' or"
365                          " 'outputcharset', got attribute '%s'",
366                          attr->name);
367             nmem_destroy(info->nmem);
368             return 0;
369         }
370     }
371     if (!input_format)
372     {
373         wrbuf_printf(wr_error, "Element <marc>: "
374                      "attribute 'inputformat' required");
375         nmem_destroy(info->nmem);
376         return 0;
377     }
378     else if (!strcmp(input_format, "marc"))
379     {
380         info->input_format_mode = YAZ_MARC_ISO2709;
381     }
382     else if (!strcmp(input_format, "xml"))
383     {
384         info->input_format_mode = YAZ_MARC_MARCXML;
385         /** Libxml2 generates UTF-8 encoding by default .
386             So we convert from UTF-8 to outputcharset (if defined)
387         */
388         if (!info->input_charset && info->output_charset)
389             info->input_charset = "utf-8";
390     }
391     else
392     {
393         wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
394                      " Unsupported input format"
395                      " defined by attribute value",
396                      input_format);
397         nmem_destroy(info->nmem);
398         return 0;
399     }
400
401     if (!output_format)
402     {
403         wrbuf_printf(wr_error,
404                      "Element <marc>: attribute 'outputformat' required");
405         nmem_destroy(info->nmem);
406         return 0;
407     }
408     else if (!strcmp(output_format, "line"))
409     {
410         info->output_format_mode = YAZ_MARC_LINE;
411     }
412     else if (!strcmp(output_format, "marcxml"))
413     {
414         info->output_format_mode = YAZ_MARC_MARCXML;
415         if (info->input_charset && !info->output_charset)
416             info->output_charset = "utf-8";
417     }
418     else if (!strcmp(output_format, "turbomarc"))
419     {
420         info->output_format_mode = YAZ_MARC_TURBOMARC;
421         if (info->input_charset && !info->output_charset)
422             info->output_charset = "utf-8";
423     }
424     else if (!strcmp(output_format, "marc"))
425     {
426         info->output_format_mode = YAZ_MARC_ISO2709;
427     }
428     else if (!strcmp(output_format, "marcxchange"))
429     {
430         info->output_format_mode = YAZ_MARC_XCHANGE;
431         if (info->input_charset && !info->output_charset)
432             info->output_charset = "utf-8";
433     }
434     else
435     {
436         wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
437                      " Unsupported output format"
438                      " defined by attribute value",
439                      output_format);
440         nmem_destroy(info->nmem);
441         return 0;
442     }
443     if (info->input_charset && info->output_charset)
444     {
445         yaz_iconv_t cd = yaz_iconv_open(info->output_charset,
446                                         info->input_charset);
447         if (!cd)
448         {
449             wrbuf_printf(wr_error,
450                          "Element <marc inputcharset='%s' outputcharset='%s'>:"
451                          " Unsupported character set mapping"
452                          " defined by attribute values",
453                          info->input_charset, info->output_charset);
454             nmem_destroy(info->nmem);
455             return 0;
456         }
457         yaz_iconv_close(cd);
458     }
459     else if (!info->output_charset)
460     {
461         wrbuf_printf(wr_error, "Element <marc>: "
462                      "attribute 'outputcharset' missing");
463         nmem_destroy(info->nmem);
464         return 0;
465     }
466     else if (!info->input_charset)
467     {
468         wrbuf_printf(wr_error, "Element <marc>: "
469                      "attribute 'inputcharset' missing");
470         nmem_destroy(info->nmem);
471         return 0;
472     }
473     info->input_charset = nmem_strdup(info->nmem, info->input_charset);
474     info->output_charset = nmem_strdup(info->nmem, info->output_charset);
475     return info;
476 }
477
478 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
479 {
480     struct marc_info *mi = info;
481     int ret = 0;
482
483     yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
484     yaz_marc_t mt = yaz_marc_create();
485
486     yaz_marc_xml(mt, mi->output_format_mode);
487     if (mi->leader_spec)
488         yaz_marc_leader_spec(mt, mi->leader_spec);
489
490     if (cd)
491         yaz_marc_iconv(mt, cd);
492     if (mi->input_format_mode == YAZ_MARC_ISO2709)
493     {
494         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
495                                        wrbuf_len(record));
496         if (sz > 0)
497             ret = 0;
498         else
499             ret = -1;
500     }
501     else if (mi->input_format_mode == YAZ_MARC_MARCXML ||
502              mi->input_format_mode == YAZ_MARC_TURBOMARC)
503     {
504         xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
505                                        wrbuf_len(record));
506         if (!doc)
507         {
508             wrbuf_printf(wr_error, "xmlParseMemory failed");
509             ret = -1;
510         }
511         else
512         {
513             ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
514             if (ret)
515                 wrbuf_printf(wr_error, "yaz_marc_read_xml failed");
516         }
517         xmlFreeDoc(doc);
518     }
519     else
520     {
521         wrbuf_printf(wr_error, "unsupported input format");
522         ret = -1;
523     }
524     if (ret == 0)
525     {
526         wrbuf_rewind(record);
527         ret = yaz_marc_write_mode(mt, record);
528         if (ret)
529             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
530     }
531     if (cd)
532         yaz_iconv_close(cd);
533     yaz_marc_destroy(mt);
534     return ret;
535 }
536
537 static void destroy_marc(void *info)
538 {
539     struct marc_info *mi = info;
540
541     nmem_destroy(mi->nmem);
542 }
543
544 int yaz_record_conv_configure_t(yaz_record_conv_t p, const xmlNode *ptr,
545                                 struct yaz_record_conv_type *types)
546 {
547     struct yaz_record_conv_type bt[2];
548
549     /* register marc */
550     bt[0].construct = construct_marc;
551     bt[0].convert = convert_marc;
552     bt[0].destroy = destroy_marc;
553
554 #if YAZ_HAVE_XSLT
555     /* register xslt */
556     bt[0].next = &bt[1];
557     bt[1].next = types;
558     bt[1].construct = construct_xslt;
559     bt[1].convert = convert_xslt;
560     bt[1].destroy = destroy_xslt;
561 #else
562     bt[0].next = types;
563 #endif
564
565     yaz_record_conv_reset(p);
566
567     /* parsing element children */
568     for (ptr = ptr->children; ptr; ptr = ptr->next)
569     {
570         struct yaz_record_conv_type *t;
571         struct yaz_record_conv_rule *r;
572         void *info = 0;
573         if (ptr->type != XML_ELEMENT_NODE)
574             continue;
575         for (t = &bt[0]; t; t = t->next)
576         {
577             wrbuf_rewind(p->wr_error);
578             info = t->construct(ptr, p->path, p->wr_error);
579
580             if (info || wrbuf_len(p->wr_error))
581                 break;
582             /* info== 0 and no error reported , ie not handled by it */
583         }
584         if (!info)
585         {
586             if (wrbuf_len(p->wr_error) == 0)
587                 wrbuf_printf(p->wr_error, "Element <backend>: expected "
588                              "<marc> or <xslt> element, got <%s>"
589                              , ptr->name);
590             return -1;
591         }
592         r = (struct yaz_record_conv_rule *) nmem_malloc(p->nmem, sizeof(*r));
593         r->next = 0;
594         r->info = info;
595         r->type = nmem_malloc(p->nmem, sizeof(*t));
596         memcpy(r->type, t, sizeof(*t));
597         *p->rules_p = r;
598         p->rules_p = &r->next;
599     }
600     return 0;
601 }
602
603 int yaz_record_conv_configure(yaz_record_conv_t p, const xmlNode *ptr)
604 {
605     return yaz_record_conv_configure_t(p, ptr, 0);
606 }
607
608 static int yaz_record_conv_record_rule(yaz_record_conv_t p,
609                                        struct yaz_record_conv_rule *r,
610                                        const char *input_record_buf,
611                                        size_t input_record_len,
612                                        WRBUF output_record)
613 {
614     int ret = 0;
615     WRBUF record = output_record; /* pointer transfer */
616     wrbuf_rewind(p->wr_error);
617
618     wrbuf_write(record, input_record_buf, input_record_len);
619     for (; ret == 0 && r; r = r->next)
620         ret = r->type->convert(r->info, record, p->wr_error);
621     return ret;
622 }
623
624 int yaz_record_conv_opac_record(yaz_record_conv_t p,
625                                 Z_OPACRecord *input_record,
626                                 WRBUF output_record)
627 {
628     int ret = 0;
629     struct yaz_record_conv_rule *r = p->rules;
630     if (!r || r->type->construct != construct_marc)
631     {
632         wrbuf_puts(p->wr_error, "Expecting MARC rule as first rule for OPAC");
633         ret = -1; /* no marc rule so we can't do OPAC */
634     }
635     else
636     {
637         struct marc_info *mi = r->info;
638
639         WRBUF res = wrbuf_alloc();
640         yaz_marc_t mt = yaz_marc_create();
641         yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
642                                         mi->input_charset);
643
644         wrbuf_rewind(p->wr_error);
645         yaz_marc_xml(mt, mi->output_format_mode);
646
647         yaz_marc_iconv(mt, cd);
648
649         yaz_opac_decode_wrbuf(mt, input_record, res);
650         if (ret != -1)
651         {
652             ret = yaz_record_conv_record_rule(p,
653                                               r->next,
654                                               wrbuf_buf(res), wrbuf_len(res),
655                                               output_record);
656         }
657         yaz_marc_destroy(mt);
658         if (cd)
659             yaz_iconv_close(cd);
660         wrbuf_destroy(res);
661     }
662     return ret;
663 }
664
665 int yaz_record_conv_record(yaz_record_conv_t p,
666                            const char *input_record_buf,
667                            size_t input_record_len,
668                            WRBUF output_record)
669 {
670     return yaz_record_conv_record_rule(p, p->rules,
671                                        input_record_buf,
672                                        input_record_len, output_record);
673 }
674
675 const char *yaz_record_conv_get_error(yaz_record_conv_t p)
676 {
677     return wrbuf_cstr(p->wr_error);
678 }
679
680 void yaz_record_conv_set_path(yaz_record_conv_t p, const char *path)
681 {
682     xfree(p->path);
683     p->path = 0;
684     if (path)
685         p->path = xstrdup(path);
686 }
687
688 yaz_record_conv_t yaz_record_conv_create()
689 {
690     yaz_record_conv_t p = (yaz_record_conv_t) xmalloc(sizeof(*p));
691     p->nmem = nmem_create();
692     p->wr_error = wrbuf_alloc();
693     p->rules = 0;
694     p->path = 0;
695 #if YAZ_HAVE_EXSLT
696     exsltRegisterAll();
697 #endif
698     return p;
699 }
700
701 /* YAZ_HAVE_XML2 */
702 #endif
703
704 /*
705  * Local variables:
706  * c-basic-offset: 4
707  * c-file-style: "Stroustrup"
708  * indent-tabs-mode: nil
709  * End:
710  * vim: shiftwidth=4 tabstop=8 expandtab
711  */
712