MARC-8: allow all characters 0x01-0x20 YAZ-650
[yaz-moved-to-github.git] / src / record_conv.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file record_conv.c
7  * \brief Record Conversions utility
8  */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <string.h>
15 #include <yaz/yaz-iconv.h>
16 #include <yaz/marcdisp.h>
17 #include <yaz/record_conv.h>
18 #include <yaz/wrbuf.h>
19 #include <yaz/xmalloc.h>
20 #include <yaz/nmem.h>
21 #include <yaz/tpath.h>
22 #include <yaz/z-opac.h>
23
24 #if YAZ_HAVE_XML2
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <libxml/xinclude.h>
28 #if YAZ_HAVE_XSLT
29 #include <libxslt/xsltutils.h>
30 #include <libxslt/transform.h>
31 #endif
32 #if YAZ_HAVE_EXSLT
33 #include <libexslt/exslt.h>
34 #endif
35
36 /** \brief The internal structure for yaz_record_conv_t */
37 struct yaz_record_conv_struct {
38     /** \brief memory for configuration */
39     NMEM nmem;
40
41     /** \brief conversion rules (allocated using NMEM) */
42     struct yaz_record_conv_rule *rules;
43
44     /** \brief pointer to last conversion rule pointer in chain */
45     struct yaz_record_conv_rule **rules_p;
46
47     /** \brief string buffer for error messages */
48     WRBUF wr_error;
49
50     /** \brief path for opening files  */
51     char *path;
52 };
53
54 struct marc_info {
55     NMEM nmem;
56     const char *input_charset;
57     const char *output_charset;
58     int input_format_mode;
59     int output_format_mode;
60     const char *leader_spec;
61 };
62
/** \brief transformation info (one node in the chain of rules) */
struct yaz_record_conv_rule {
    /** \brief handlers (construct/convert/destroy) serving this rule */
    struct yaz_record_conv_type *type;
    /** \brief handler-private data returned by the construct handler */
    void *info;
    /** \brief following rule, or 0 at the end of the chain */
    struct yaz_record_conv_rule *next;
};
69
70 /** \brief reset rules+configuration */
71 static void yaz_record_conv_reset(yaz_record_conv_t p)
72 {
73
74     struct yaz_record_conv_rule *r;
75     for (r = p->rules; r; r = r->next)
76     {
77         r->type->destroy(r->info);
78     }
79     wrbuf_rewind(p->wr_error);
80     nmem_reset(p->nmem);
81
82     p->rules = 0;
83
84     p->rules_p = &p->rules;
85 }
86
87 void yaz_record_conv_destroy(yaz_record_conv_t p)
88 {
89     if (p)
90     {
91         yaz_record_conv_reset(p);
92         nmem_destroy(p->nmem);
93         wrbuf_destroy(p->wr_error);
94
95         xfree(p->path);
96         xfree(p);
97     }
98 }
99
100 #if YAZ_HAVE_XSLT
101 struct xslt_info {
102     NMEM nmem;
103     xmlDocPtr xsp_doc;
104     const char **xsl_parms;
105 };
106
107 static void *construct_xslt(const xmlNode *ptr,
108                             const char *path, WRBUF wr_error)
109 {
110     struct _xmlAttr *attr;
111     const char *stylesheet = 0;
112     struct xslt_info *info = 0;
113     NMEM nmem = 0;
114     int max_parms = 10;
115     int no_parms = 0;
116
117     if (strcmp((const char *) ptr->name, "xslt"))
118         return 0;
119
120     for (attr = ptr->properties; attr; attr = attr->next)
121     {
122         if (!xmlStrcmp(attr->name, BAD_CAST "stylesheet") &&
123             attr->children && attr->children->type == XML_TEXT_NODE)
124             stylesheet = (const char *) attr->children->content;
125         else
126         {
127             wrbuf_printf(wr_error, "Bad attribute '%s'"
128                          "Expected stylesheet.", attr->name);
129             return 0;
130         }
131     }
132     nmem = nmem_create();
133     info = nmem_malloc(nmem, sizeof(*info));
134     info->nmem = nmem;
135     info->xsl_parms = nmem_malloc(
136         nmem, (2 * max_parms + 1) * sizeof(*info->xsl_parms));
137
138     for (ptr = ptr->children; ptr; ptr = ptr->next)
139     {
140         const char *name = 0;
141         const char *value = 0;
142         char *qvalue = 0;
143         if (ptr->type != XML_ELEMENT_NODE)
144             continue;
145         if (strcmp((const char *) ptr->name, "param"))
146         {
147             wrbuf_printf(wr_error, "Bad element '%s'"
148                          "Expected param.", ptr->name);
149             nmem_destroy(nmem);
150             return 0;
151         }
152         for (attr = ptr->properties; attr; attr = attr->next)
153         {
154             if (!xmlStrcmp(attr->name, BAD_CAST "name") &&
155                 attr->children && attr->children->type == XML_TEXT_NODE)
156                 name = (const char *) attr->children->content;
157             else if (!xmlStrcmp(attr->name, BAD_CAST "value") &&
158                 attr->children && attr->children->type == XML_TEXT_NODE)
159                 value = (const char *) attr->children->content;
160             else
161             {
162                 wrbuf_printf(wr_error, "Bad attribute '%s'"
163                              "Expected name or value.", attr->name);
164                 nmem_destroy(nmem);
165                 return 0;
166             }
167         }
168         if (!name || !value)
169         {
170             wrbuf_printf(wr_error, "Missing attributes name or value");
171             nmem_destroy(nmem);
172             return 0;
173         }
174         if (no_parms >= max_parms)
175         {
176             wrbuf_printf(wr_error, "Too many parameters given");
177             nmem_destroy(nmem);
178             return 0;
179         }
180
181         qvalue = nmem_malloc(nmem, strlen(value) + 3);
182         strcpy(qvalue, "\'");
183         strcat(qvalue, value);
184         strcat(qvalue, "\'");
185
186         info->xsl_parms[2 * no_parms] = nmem_strdup(nmem, name);
187         info->xsl_parms[2 * no_parms + 1] = qvalue;
188         no_parms++;
189     }
190
191     info->xsl_parms[2 * no_parms] = '\0';
192
193     if (!stylesheet)
194     {
195         wrbuf_printf(wr_error, "Element <xslt>: "
196                      "attribute 'stylesheet' expected");
197         nmem_destroy(nmem);
198         return 0;
199     }
200     else
201     {
202         char fullpath[1024];
203         xsltStylesheetPtr xsp;
204         if (!yaz_filepath_resolve(stylesheet, path, 0, fullpath))
205         {
206             wrbuf_printf(wr_error, "Element <xslt stylesheet=\"%s\"/>:"
207                          " could not locate stylesheet '%s'",
208                          stylesheet, stylesheet);
209             if (path)
210                 wrbuf_printf(wr_error, " with path '%s'", path);
211
212             nmem_destroy(nmem);
213             return 0;
214         }
215         info->xsp_doc = xmlParseFile(fullpath);
216         if (!info->xsp_doc)
217         {
218             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
219                          " xml parse failed: %s", stylesheet, fullpath);
220             if (path)
221                 wrbuf_printf(wr_error, " with path '%s'", path);
222             nmem_destroy(nmem);
223             return 0;
224         }
225         /* need to copy this before passing it to the processor. It will
226            be encapsulated in the xsp and destroyed by xsltFreeStylesheet */
227         xsp = xsltParseStylesheetDoc(xmlCopyDoc(info->xsp_doc, 1));
228         if (!xsp)
229         {
230             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
231                          " xslt parse failed: %s", stylesheet, fullpath);
232             if (path)
233                 wrbuf_printf(wr_error, " with path '%s'", path);
234             wrbuf_printf(wr_error, " ("
235 #if YAZ_HAVE_EXSLT
236
237                          "EXSLT enabled"
238 #else
239                          "EXSLT not supported"
240 #endif
241                          ")");
242             xmlFreeDoc(info->xsp_doc);
243             nmem_destroy(info->nmem);
244             return 0;
245         }
246         else
247         {
248             xsltFreeStylesheet(xsp);
249             return info;
250         }
251     }
252     return 0;
253 }
254
255 static int convert_xslt(void *vinfo, WRBUF record, WRBUF wr_error)
256 {
257     int ret = 0;
258     struct xslt_info *info = vinfo;
259
260     xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
261                                    wrbuf_len(record));
262     if (!doc)
263     {
264         wrbuf_printf(wr_error, "xmlParseMemory failed");
265         ret = -1;
266     }
267     else
268     {
269         xmlDocPtr xsp_doc = xmlCopyDoc(info->xsp_doc, 1);
270         xsltStylesheetPtr xsp = xsltParseStylesheetDoc(xsp_doc);
271         xmlDocPtr res = xsltApplyStylesheet(xsp, doc, info->xsl_parms);
272         if (res)
273         {
274             xmlChar *out_buf = 0;
275             int out_len;
276
277 #if HAVE_XSLTSAVERESULTTOSTRING
278             xsltSaveResultToString(&out_buf, &out_len, res, xsp);
279 #else
280             xmlDocDumpFormatMemory (res, &out_buf, &out_len, 1);
281 #endif
282             if (!out_buf)
283             {
284                 wrbuf_printf(wr_error,
285                              "xsltSaveResultToString failed");
286                 ret = -1;
287             }
288             else
289             {
290                 wrbuf_rewind(record);
291                 wrbuf_write(record, (const char *) out_buf, out_len);
292
293                 xmlFree(out_buf);
294             }
295             xmlFreeDoc(res);
296         }
297         else
298         {
299             wrbuf_printf(wr_error, "xsltApplyStylesheet failed");
300             ret = -1;
301         }
302         xmlFreeDoc(doc);
303         xsltFreeStylesheet(xsp); /* frees xsp_doc too */
304     }
305     return ret;
306 }
307
308 static void destroy_xslt(void *vinfo)
309 {
310     struct xslt_info *info = vinfo;
311
312     if (info)
313     {
314         xmlFreeDoc(info->xsp_doc);
315         nmem_destroy(info->nmem);
316     }
317 }
318
319 /* YAZ_HAVE_XSLT */
320 #endif
321
322
323 static void *construct_marc(const xmlNode *ptr,
324                             const char *path, WRBUF wr_error)
325 {
326     NMEM nmem = nmem_create();
327     struct marc_info *info = nmem_malloc(nmem, sizeof(*info));
328     struct _xmlAttr *attr;
329     const char *input_format = 0;
330     const char *output_format = 0;
331
332     if (strcmp((const char *) ptr->name, "marc"))
333     {
334         nmem_destroy(nmem);
335         return 0;
336     }
337
338     info->nmem = nmem;
339     info->input_charset = 0;
340     info->output_charset = 0;
341     info->input_format_mode = 0;
342     info->output_format_mode = 0;
343     info->leader_spec = 0;
344
345     for (attr = ptr->properties; attr; attr = attr->next)
346     {
347         if (!xmlStrcmp(attr->name, BAD_CAST "inputcharset") &&
348             attr->children && attr->children->type == XML_TEXT_NODE)
349             info->input_charset = (const char *) attr->children->content;
350         else if (!xmlStrcmp(attr->name, BAD_CAST "outputcharset") &&
351             attr->children && attr->children->type == XML_TEXT_NODE)
352             info->output_charset = (const char *) attr->children->content;
353         else if (!xmlStrcmp(attr->name, BAD_CAST "inputformat") &&
354             attr->children && attr->children->type == XML_TEXT_NODE)
355             input_format = (const char *) attr->children->content;
356         else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat") &&
357             attr->children && attr->children->type == XML_TEXT_NODE)
358             output_format = (const char *) attr->children->content;
359         else if (!xmlStrcmp(attr->name, BAD_CAST "leaderspec") &&
360                  attr->children && attr->children->type == XML_TEXT_NODE)
361             info->leader_spec =
362                 nmem_strdup(info->nmem,(const char *) attr->children->content);
363         else
364         {
365             wrbuf_printf(wr_error, "Element <marc>: expected attributes"
366                          "'inputformat', 'inputcharset', 'outputformat' or"
367                          " 'outputcharset', got attribute '%s'",
368                          attr->name);
369             nmem_destroy(info->nmem);
370             return 0;
371         }
372     }
373     if (!input_format)
374     {
375         wrbuf_printf(wr_error, "Element <marc>: "
376                      "attribute 'inputformat' required");
377         nmem_destroy(info->nmem);
378         return 0;
379     }
380     else if (!strcmp(input_format, "marc"))
381     {
382         info->input_format_mode = YAZ_MARC_ISO2709;
383     }
384     else if (!strcmp(input_format, "xml"))
385     {
386         info->input_format_mode = YAZ_MARC_MARCXML;
387         /** Libxml2 generates UTF-8 encoding by default .
388             So we convert from UTF-8 to outputcharset (if defined)
389         */
390         if (!info->input_charset && info->output_charset)
391             info->input_charset = "utf-8";
392     }
393     else
394     {
395         wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
396                      " Unsupported input format"
397                      " defined by attribute value",
398                      input_format);
399         nmem_destroy(info->nmem);
400         return 0;
401     }
402
403     if (!output_format)
404     {
405         wrbuf_printf(wr_error,
406                      "Element <marc>: attribute 'outputformat' required");
407         nmem_destroy(info->nmem);
408         return 0;
409     }
410     else if (!strcmp(output_format, "line"))
411     {
412         info->output_format_mode = YAZ_MARC_LINE;
413     }
414     else if (!strcmp(output_format, "marcxml"))
415     {
416         info->output_format_mode = YAZ_MARC_MARCXML;
417         if (info->input_charset && !info->output_charset)
418             info->output_charset = "utf-8";
419     }
420     else if (!strcmp(output_format, "turbomarc"))
421     {
422         info->output_format_mode = YAZ_MARC_TURBOMARC;
423         if (info->input_charset && !info->output_charset)
424             info->output_charset = "utf-8";
425     }
426     else if (!strcmp(output_format, "marc"))
427     {
428         info->output_format_mode = YAZ_MARC_ISO2709;
429     }
430     else if (!strcmp(output_format, "marcxchange"))
431     {
432         info->output_format_mode = YAZ_MARC_XCHANGE;
433         if (info->input_charset && !info->output_charset)
434             info->output_charset = "utf-8";
435     }
436     else
437     {
438         wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
439                      " Unsupported output format"
440                      " defined by attribute value",
441                      output_format);
442         nmem_destroy(info->nmem);
443         return 0;
444     }
445     if (info->input_charset && info->output_charset)
446     {
447         yaz_iconv_t cd = yaz_iconv_open(info->output_charset,
448                                         info->input_charset);
449         if (!cd)
450         {
451             wrbuf_printf(wr_error,
452                          "Element <marc inputcharset='%s' outputcharset='%s'>:"
453                          " Unsupported character set mapping"
454                          " defined by attribute values",
455                          info->input_charset, info->output_charset);
456             nmem_destroy(info->nmem);
457             return 0;
458         }
459         yaz_iconv_close(cd);
460     }
461     else if (!info->output_charset)
462     {
463         wrbuf_printf(wr_error, "Element <marc>: "
464                      "attribute 'outputcharset' missing");
465         nmem_destroy(info->nmem);
466         return 0;
467     }
468     else if (!info->input_charset)
469     {
470         wrbuf_printf(wr_error, "Element <marc>: "
471                      "attribute 'inputcharset' missing");
472         nmem_destroy(info->nmem);
473         return 0;
474     }
475     info->input_charset = nmem_strdup(info->nmem, info->input_charset);
476     info->output_charset = nmem_strdup(info->nmem, info->output_charset);
477     return info;
478 }
479
480 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
481 {
482     struct marc_info *mi = info;
483     int ret = 0;
484
485     yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
486     yaz_marc_t mt = yaz_marc_create();
487
488     yaz_marc_xml(mt, mi->output_format_mode);
489     if (mi->leader_spec)
490         yaz_marc_leader_spec(mt, mi->leader_spec);
491
492     if (cd)
493         yaz_marc_iconv(mt, cd);
494     if (mi->input_format_mode == YAZ_MARC_ISO2709)
495     {
496         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
497                                        wrbuf_len(record));
498         if (sz > 0)
499             ret = 0;
500         else
501             ret = -1;
502     }
503     else if (mi->input_format_mode == YAZ_MARC_MARCXML ||
504              mi->input_format_mode == YAZ_MARC_TURBOMARC)
505     {
506         xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
507                                        wrbuf_len(record));
508         if (!doc)
509         {
510             wrbuf_printf(wr_error, "xmlParseMemory failed");
511             ret = -1;
512         }
513         else
514         {
515             ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
516             if (ret)
517                 wrbuf_printf(wr_error, "yaz_marc_read_xml failed");
518         }
519         xmlFreeDoc(doc);
520     }
521     else
522     {
523         wrbuf_printf(wr_error, "unsupported input format");
524         ret = -1;
525     }
526     if (ret == 0)
527     {
528         wrbuf_rewind(record);
529         ret = yaz_marc_write_mode(mt, record);
530         if (ret)
531             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
532     }
533     if (cd)
534         yaz_iconv_close(cd);
535     yaz_marc_destroy(mt);
536     return ret;
537 }
538
539 static void destroy_marc(void *info)
540 {
541     struct marc_info *mi = info;
542
543     nmem_destroy(mi->nmem);
544 }
545
546 int yaz_record_conv_configure_t(yaz_record_conv_t p, const xmlNode *ptr,
547                                 struct yaz_record_conv_type *types)
548 {
549     struct yaz_record_conv_type bt[2];
550
551     /* register marc */
552     bt[0].construct = construct_marc;
553     bt[0].convert = convert_marc;
554     bt[0].destroy = destroy_marc;
555
556 #if YAZ_HAVE_XSLT
557     /* register xslt */
558     bt[0].next = &bt[1];
559     bt[1].next = types;
560     bt[1].construct = construct_xslt;
561     bt[1].convert = convert_xslt;
562     bt[1].destroy = destroy_xslt;
563 #else
564     bt[0].next = types;
565 #endif
566
567     yaz_record_conv_reset(p);
568
569     /* parsing element children */
570     for (ptr = ptr->children; ptr; ptr = ptr->next)
571     {
572         struct yaz_record_conv_type *t;
573         struct yaz_record_conv_rule *r;
574         void *info = 0;
575         if (ptr->type != XML_ELEMENT_NODE)
576             continue;
577         for (t = &bt[0]; t; t = t->next)
578         {
579             wrbuf_rewind(p->wr_error);
580             info = t->construct(ptr, p->path, p->wr_error);
581
582             if (info || wrbuf_len(p->wr_error))
583                 break;
584             /* info== 0 and no error reported , ie not handled by it */
585         }
586         if (!info)
587         {
588             if (wrbuf_len(p->wr_error) == 0)
589                 wrbuf_printf(p->wr_error, "Element <backend>: expected "
590                              "<marc> or <xslt> element, got <%s>"
591                              , ptr->name);
592             return -1;
593         }
594         r = (struct yaz_record_conv_rule *) nmem_malloc(p->nmem, sizeof(*r));
595         r->next = 0;
596         r->info = info;
597         r->type = nmem_malloc(p->nmem, sizeof(*t));
598         memcpy(r->type, t, sizeof(*t));
599         *p->rules_p = r;
600         p->rules_p = &r->next;
601     }
602     return 0;
603 }
604
605 int yaz_record_conv_configure(yaz_record_conv_t p, const xmlNode *ptr)
606 {
607     return yaz_record_conv_configure_t(p, ptr, 0);
608 }
609
610 static int yaz_record_conv_record_rule(yaz_record_conv_t p,
611                                        struct yaz_record_conv_rule *r,
612                                        const char *input_record_buf,
613                                        size_t input_record_len,
614                                        WRBUF output_record)
615 {
616     int ret = 0;
617     WRBUF record = output_record; /* pointer transfer */
618     wrbuf_rewind(p->wr_error);
619
620     wrbuf_write(record, input_record_buf, input_record_len);
621     for (; ret == 0 && r; r = r->next)
622         ret = r->type->convert(r->info, record, p->wr_error);
623     return ret;
624 }
625
626 int yaz_record_conv_opac_record(yaz_record_conv_t p,
627                                 Z_OPACRecord *input_record,
628                                 WRBUF output_record)
629 {
630     int ret = 0;
631     struct yaz_record_conv_rule *r = p->rules;
632     if (!r || r->type->construct != construct_marc)
633     {
634         wrbuf_puts(p->wr_error, "Expecting MARC rule as first rule for OPAC");
635         ret = -1; /* no marc rule so we can't do OPAC */
636     }
637     else
638     {
639         struct marc_info *mi = r->info;
640
641         WRBUF res = wrbuf_alloc();
642         yaz_marc_t mt = yaz_marc_create();
643         yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
644                                         mi->input_charset);
645
646         wrbuf_rewind(p->wr_error);
647         yaz_marc_xml(mt, mi->output_format_mode);
648
649         yaz_marc_iconv(mt, cd);
650
651         yaz_opac_decode_wrbuf(mt, input_record, res);
652         if (ret != -1)
653         {
654             ret = yaz_record_conv_record_rule(p,
655                                               r->next,
656                                               wrbuf_buf(res), wrbuf_len(res),
657                                               output_record);
658         }
659         yaz_marc_destroy(mt);
660         if (cd)
661             yaz_iconv_close(cd);
662         wrbuf_destroy(res);
663     }
664     return ret;
665 }
666
667 int yaz_record_conv_record(yaz_record_conv_t p,
668                            const char *input_record_buf,
669                            size_t input_record_len,
670                            WRBUF output_record)
671 {
672     return yaz_record_conv_record_rule(p, p->rules,
673                                        input_record_buf,
674                                        input_record_len, output_record);
675 }
676
677 const char *yaz_record_conv_get_error(yaz_record_conv_t p)
678 {
679     return wrbuf_cstr(p->wr_error);
680 }
681
682 void yaz_record_conv_set_path(yaz_record_conv_t p, const char *path)
683 {
684     xfree(p->path);
685     p->path = 0;
686     if (path)
687         p->path = xstrdup(path);
688 }
689
690 yaz_record_conv_t yaz_record_conv_create()
691 {
692     yaz_record_conv_t p = (yaz_record_conv_t) xmalloc(sizeof(*p));
693     p->nmem = nmem_create();
694     p->wr_error = wrbuf_alloc();
695     p->rules = 0;
696     p->path = 0;
697 #if YAZ_HAVE_EXSLT
698     exsltRegisterAll();
699 #endif
700     return p;
701 }
702
703 /* YAZ_HAVE_XML2 */
704 #endif
705
706 /*
707  * Local variables:
708  * c-basic-offset: 4
709  * c-file-style: "Stroustrup"
710  * indent-tabs-mode: nil
711  * End:
712  * vim: shiftwidth=4 tabstop=8 expandtab
713  */
714