record_conv: extensible conversion
[yaz-moved-to-github.git] / src / record_conv.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2012 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file record_conv.c
7  * \brief Record Conversions utility
8  */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <string.h>
15 #include <yaz/yaz-iconv.h>
16 #include <yaz/marcdisp.h>
17 #include <yaz/record_conv.h>
18 #include <yaz/wrbuf.h>
19 #include <yaz/xmalloc.h>
20 #include <yaz/nmem.h>
21 #include <yaz/tpath.h>
22 #include <yaz/z-opac.h>
23
24 #if YAZ_HAVE_XML2
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <libxml/xinclude.h>
28 #if YAZ_HAVE_XSLT
29 #include <libxslt/xsltutils.h>
30 #include <libxslt/transform.h>
31 #endif
32 #if YAZ_HAVE_EXSLT
33 #include <libexslt/exslt.h>
34 #endif
35
36 /** \brief The internal structure for yaz_record_conv_t */
37 struct yaz_record_conv_struct {
38     /** \brief memory for configuration */
39     NMEM nmem;
40
41     /** \brief conversion rules (allocated using NMEM) */
42     struct yaz_record_conv_rule *rules;
43
44     /** \brief pointer to last conversion rule pointer in chain */
45     struct yaz_record_conv_rule **rules_p;
46
47     /** \brief string buffer for error messages */
48     WRBUF wr_error;
49
50     /** \brief path for opening files  */
51     char *path;
52
53     /** \brief handlers */
54     struct yaz_record_conv_type *types;
55 };
56
57 struct marc_info {
58     NMEM nmem;
59     const char *input_charset;
60     const char *output_charset;
61     int input_format_mode;
62     int output_format_mode;
63 };
64
/** \brief Transformation info (one rule in the conversion chain) */
struct yaz_record_conv_rule {
    /** \brief conversion type whose convert/destroy handlers serve this rule */
    struct yaz_record_conv_type *type;
    /** \brief private data returned by the type's construct handler */
    void *info;
    /** \brief next rule in the chain (0 for the last one) */
    struct yaz_record_conv_rule *next;
};
71
72 /** \brief reset rules+configuration */
73 static void yaz_record_conv_reset(yaz_record_conv_t p)
74 {
75
76     struct yaz_record_conv_rule *r;
77     for (r = p->rules; r; r = r->next)
78     {
79         r->type->destroy(r->info);
80     }
81     wrbuf_rewind(p->wr_error);
82     nmem_reset(p->nmem);
83
84     p->rules = 0;
85
86     p->rules_p = &p->rules;
87 }
88
89 void yaz_record_conv_add_type(yaz_record_conv_t p,
90                               struct yaz_record_conv_type *type)
91 {
92     struct yaz_record_conv_type **tp = &p->types;
93     while (*tp)
94         tp = &(*tp)->next;
95     *tp = xmalloc(sizeof(*type));
96     memcpy(*tp, type, sizeof(*type));
97     (*tp)->next = 0;
98 }
99
100 void yaz_record_conv_destroy(yaz_record_conv_t p)
101 {
102     if (p)
103     {
104         struct yaz_record_conv_type *t = p->types;
105
106         yaz_record_conv_reset(p);
107         nmem_destroy(p->nmem);
108         wrbuf_destroy(p->wr_error);
109
110         while (t)
111         { 
112             struct yaz_record_conv_type *t_next = t->next;
113             xfree(t);
114             t = t_next;
115         }
116         xfree(p->path);
117         xfree(p);
118     }
119 }
120
121 #if YAZ_HAVE_XSLT
122 static void *construct_xslt(yaz_record_conv_t p, const xmlNode *ptr,
123                             const char *path, WRBUF wr_error)
124 {
125     struct _xmlAttr *attr;
126     const char *stylesheet = 0;
127
128     if (strcmp((const char *) ptr->name, "xslt"))
129         return 0;
130
131     for (attr = ptr->properties; attr; attr = attr->next)
132     {
133         if (!xmlStrcmp(attr->name, BAD_CAST "stylesheet") &&
134             attr->children && attr->children->type == XML_TEXT_NODE)
135             stylesheet = (const char *) attr->children->content;
136         else
137         {
138             wrbuf_printf(wr_error, "Bad attribute '%s'"
139                          "Expected stylesheet.", attr->name);
140             return 0;
141         }
142     }
143     if (!stylesheet)
144     {
145         wrbuf_printf(wr_error, "Element <xslt>: "
146                      "attribute 'stylesheet' expected");
147         return 0;
148     }
149     else
150     {
151         char fullpath[1024];
152         xsltStylesheetPtr xsp;
153         xmlDocPtr xsp_doc;
154         if (!yaz_filepath_resolve(stylesheet, path, 0, fullpath))
155         {
156             wrbuf_printf(wr_error, "Element <xslt stylesheet=\"%s\"/>:"
157                          " could not locate stylesheet '%s'",
158                          stylesheet, stylesheet);
159             if (path)
160                 wrbuf_printf(wr_error, " with path '%s'", path);
161                 
162             return 0;
163         }
164         xsp_doc = xmlParseFile(fullpath);
165         if (!xsp_doc)
166         {
167             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
168                          " xml parse failed: %s", stylesheet, fullpath);
169             if (path)
170                 wrbuf_printf(wr_error, " with path '%s'", path);
171             return 0;
172         }
173         /* need to copy this before passing it to the processor. It will
174            be encapsulated in the xsp and destroyed by xsltFreeStylesheet */
175         xsp = xsltParseStylesheetDoc(xmlCopyDoc(xsp_doc, 1));
176         if (!xsp)
177         {
178             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
179                          " xslt parse failed: %s", stylesheet, fullpath);
180             if (path)
181                 wrbuf_printf(wr_error, " with path '%s'", path);
182             wrbuf_printf(wr_error, " ("
183 #if YAZ_HAVE_EXSLT
184                          
185                          "EXSLT enabled"
186 #else
187                          "EXSLT not supported"
188 #endif
189                          ")");
190             xmlFreeDoc(xsp_doc);
191             return 0;
192         }
193         else
194         {
195             xsltFreeStylesheet(xsp);
196             return xsp_doc;
197         }
198     }
199     return 0;
200 }
201
202 static int convert_xslt(void *info, WRBUF record, WRBUF wr_error)
203 {
204     int ret = 0;
205     xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
206                                    wrbuf_len(record));
207     if (!doc)
208     {
209         wrbuf_printf(wr_error, "xmlParseMemory failed");
210         ret = -1;
211     }
212     else
213     {
214         xmlDocPtr xsp_doc = xmlCopyDoc((xmlDocPtr) info, 1);
215         xsltStylesheetPtr xsp = xsltParseStylesheetDoc(xsp_doc);
216         xmlDocPtr res = xsltApplyStylesheet(xsp, doc, 0);
217         if (res)
218         {
219             xmlChar *out_buf = 0;
220             int out_len;
221             
222 #if HAVE_XSLTSAVERESULTTOSTRING
223             xsltSaveResultToString(&out_buf, &out_len, res, xsp);
224 #else
225             xmlDocDumpFormatMemory (res, &out_buf, &out_len, 1);
226 #endif
227             if (!out_buf)
228             {
229                 wrbuf_printf(wr_error,
230                              "xsltSaveResultToString failed");
231                 ret = -1;
232             }
233             else
234             {
235                 wrbuf_rewind(record);
236                 wrbuf_write(record, (const char *) out_buf, out_len);
237                 
238                 xmlFree(out_buf);
239             }
240             xmlFreeDoc(res);
241         }
242         else
243         {
244             wrbuf_printf(wr_error, "xsltApplyStylesheet failed");
245             ret = -1;
246         }
247         xmlFreeDoc(doc);
248         xsltFreeStylesheet(xsp); /* frees xsp_doc too */
249     }
250     return ret;
251 }
252
253 static void destroy_xslt(void *info)
254 {
255     if (info)
256     {
257         xmlDocPtr xsp_doc = info;
258         xmlFreeDoc(xsp_doc);
259     }
260 }
261
262 /* YAZ_HAVE_XSLT */
263 #endif
264
265
266 static void *construct_marc(yaz_record_conv_t p, const xmlNode *ptr,
267                             const char *path, WRBUF wr_error)
268 {
269     NMEM nmem = nmem_create();
270     struct marc_info *info = nmem_malloc(nmem, sizeof(*info));
271     struct _xmlAttr *attr;
272     const char *input_format = 0;
273     const char *output_format = 0;
274
275     if (strcmp((const char *) ptr->name, "marc"))
276     {
277         nmem_destroy(nmem);
278         return 0;
279     }
280
281     info->nmem = nmem;
282     info->input_charset = 0;
283     info->output_charset = 0;
284     info->input_format_mode = 0;
285     info->output_format_mode = 0;
286
287     for (attr = ptr->properties; attr; attr = attr->next)
288     {
289         if (!xmlStrcmp(attr->name, BAD_CAST "inputcharset") &&
290             attr->children && attr->children->type == XML_TEXT_NODE)
291             info->input_charset = (const char *) attr->children->content;
292         else if (!xmlStrcmp(attr->name, BAD_CAST "outputcharset") &&
293             attr->children && attr->children->type == XML_TEXT_NODE)
294             info->output_charset = (const char *) attr->children->content;
295         else if (!xmlStrcmp(attr->name, BAD_CAST "inputformat") &&
296             attr->children && attr->children->type == XML_TEXT_NODE)
297             input_format = (const char *) attr->children->content;
298         else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat") &&
299             attr->children && attr->children->type == XML_TEXT_NODE)
300             output_format = (const char *) attr->children->content;
301         else
302         {
303             wrbuf_printf(wr_error, "Element <marc>: expected attributes"
304                          "'inputformat', 'inputcharset', 'outputformat' or"
305                          " 'outputcharset', got attribute '%s'", 
306                          attr->name);
307             nmem_destroy(info->nmem);
308             return 0;
309         }
310     }
311     if (!input_format)
312     {
313         wrbuf_printf(p->wr_error, "Element <marc>: "
314                      "attribute 'inputformat' required");
315         nmem_destroy(info->nmem);
316         return 0;
317     }
318     else if (!strcmp(input_format, "marc"))
319     {
320         info->input_format_mode = YAZ_MARC_ISO2709;
321     }
322     else if (!strcmp(input_format, "xml"))
323     {
324         info->input_format_mode = YAZ_MARC_MARCXML;
325         /** Libxml2 generates UTF-8 encoding by default .
326             So we convert from UTF-8 to outputcharset (if defined) 
327         */
328         if (!info->input_charset && info->output_charset)
329             info->input_charset = "utf-8";
330     }
331     else
332     {
333         wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
334                      " Unsupported input format"
335                      " defined by attribute value", 
336                      input_format);
337         nmem_destroy(info->nmem);
338         return 0;
339     }
340     
341     if (!output_format)
342     {
343         wrbuf_printf(wr_error, 
344                      "Element <marc>: attribute 'outputformat' required");
345         nmem_destroy(info->nmem);
346         return 0;
347     }
348     else if (!strcmp(output_format, "line"))
349     {
350         info->output_format_mode = YAZ_MARC_LINE;
351     }
352     else if (!strcmp(output_format, "marcxml"))
353     {
354         info->output_format_mode = YAZ_MARC_MARCXML;
355         if (info->input_charset && !info->output_charset)
356             info->output_charset = "utf-8";
357     }
358     else if (!strcmp(output_format, "turbomarc"))
359     {
360         info->output_format_mode = YAZ_MARC_TURBOMARC;
361         if (info->input_charset && !info->output_charset)
362             info->output_charset = "utf-8";
363     }
364     else if (!strcmp(output_format, "marc"))
365     {
366         info->output_format_mode = YAZ_MARC_ISO2709;
367     }
368     else if (!strcmp(output_format, "marcxchange"))
369     {
370         info->output_format_mode = YAZ_MARC_XCHANGE;
371         if (info->input_charset && !info->output_charset)
372             info->output_charset = "utf-8";
373     }
374     else
375     {
376         wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
377                      " Unsupported output format"
378                      " defined by attribute value", 
379                      output_format);
380         nmem_destroy(info->nmem);
381         return 0;
382     }
383     if (info->input_charset && info->output_charset)
384     {
385         yaz_iconv_t cd = yaz_iconv_open(info->output_charset,
386                                         info->input_charset);
387         if (!cd)
388         {
389             wrbuf_printf(p->wr_error, 
390                          "Element <marc inputcharset='%s' outputcharset='%s'>:"
391                          " Unsupported character set mapping"
392                          " defined by attribute values",
393                          info->input_charset, info->output_charset);
394             nmem_destroy(info->nmem);
395             return 0;
396         }
397         yaz_iconv_close(cd);
398     }
399     else if (info->input_charset)
400     {
401         wrbuf_printf(wr_error, "Element <marc>: "
402                      "attribute 'outputcharset' missing");
403         nmem_destroy(info->nmem);
404         return 0;
405     }
406     else if (info->output_charset)
407     {
408         wrbuf_printf(wr_error, "Element <marc>: "
409                      "attribute 'inputcharset' missing");
410         nmem_destroy(info->nmem);
411         return 0;
412     }
413     info->input_charset = nmem_strdup(p->nmem, info->input_charset);
414     info->output_charset = nmem_strdup(p->nmem, info->output_charset);
415     return info;
416 }
417
418 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
419 {
420     struct marc_info *mi = info;
421     int ret = 0;
422     
423     yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
424     yaz_marc_t mt = yaz_marc_create();
425     
426     yaz_marc_xml(mt, mi->output_format_mode);
427     
428     if (cd)
429         yaz_marc_iconv(mt, cd);
430     if (mi->input_format_mode == YAZ_MARC_ISO2709)
431     {
432         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
433                                        wrbuf_len(record));
434         if (sz > 0)
435             ret = 0;
436         else
437             ret = -1;
438     }
439     else if (mi->input_format_mode == YAZ_MARC_MARCXML ||
440              mi->input_format_mode == YAZ_MARC_TURBOMARC)
441     {
442         xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
443                                        wrbuf_len(record));
444         if (!doc)
445         {
446             wrbuf_printf(wr_error, "xmlParseMemory failed");
447             ret = -1;
448         }
449         else
450         {
451             ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
452             if (ret)
453                 wrbuf_printf(wr_error, "yaz_marc_read_xml failed");
454         }
455         xmlFreeDoc(doc);
456     }
457     else
458     {
459         wrbuf_printf(wr_error, "unsupported input format");
460         ret = -1;
461     }
462     if (ret == 0)
463     {
464         wrbuf_rewind(record);
465         ret = yaz_marc_write_mode(mt, record);
466         if (ret)
467             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
468     }
469     if (cd)
470         yaz_iconv_close(cd);
471     yaz_marc_destroy(mt);
472     return ret;
473 }
474
475 static void destroy_marc(void *info)
476 {
477     struct marc_info *mi = info;
478     
479     nmem_destroy(mi->nmem);
480 }
481
482 int yaz_record_conv_configure(yaz_record_conv_t p, const xmlNode *ptr)
483 {
484     yaz_record_conv_reset(p);
485
486     /* parsing element children */
487     for (ptr = ptr->children; ptr; ptr = ptr->next)
488     {
489         struct yaz_record_conv_type *t;
490         struct yaz_record_conv_rule *r;
491         void *info = 0;
492         if (ptr->type != XML_ELEMENT_NODE)
493             continue;
494         for (t = p->types; t; t = t->next)
495         {
496             wrbuf_rewind(p->wr_error);
497             info = t->construct(p, ptr, p->path, p->wr_error);
498             if (info)
499                 break;
500         }
501         if (!info)
502         {
503             wrbuf_printf(p->wr_error, "Element <backend>: expected "
504                          "<marc> or <xslt> element, got <%s>"
505                          , ptr->name);
506             return -1;
507         }
508         r = (struct yaz_record_conv_rule *) nmem_malloc(p->nmem, sizeof(*r));
509         r->next = 0;
510         r->info = info;
511         r->type = t;
512         *p->rules_p = r;
513         p->rules_p = &r->next;
514     }
515     return 0;
516 }
517
518 static int yaz_record_conv_record_rule(yaz_record_conv_t p,
519                                        struct yaz_record_conv_rule *r,
520                                        const char *input_record_buf,
521                                        size_t input_record_len,
522                                        WRBUF output_record)
523 {
524     int ret = 0;
525     WRBUF record = output_record; /* pointer transfer */
526     wrbuf_rewind(p->wr_error);
527     
528     wrbuf_write(record, input_record_buf, input_record_len);
529     for (; ret == 0 && r; r = r->next)
530         ret = r->type->convert(r->info, record, p->wr_error);
531     return ret;
532 }
533
534 int yaz_record_conv_opac_record(yaz_record_conv_t p,
535                                 Z_OPACRecord *input_record,
536                                 WRBUF output_record)
537 {
538     int ret = 0;
539     struct yaz_record_conv_rule *r = p->rules;
540     if (!r || r->type->construct != construct_marc)
541         ret = -1; /* no marc rule so we can't do OPAC */
542     else
543     {
544         struct marc_info *mi = r->info;
545
546         WRBUF res = wrbuf_alloc();
547         yaz_marc_t mt = yaz_marc_create();
548         yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
549                                         mi->input_charset);
550         
551         wrbuf_rewind(p->wr_error);
552         yaz_marc_xml(mt, mi->output_format_mode);
553         
554         yaz_marc_iconv(mt, cd);
555         
556         yaz_opac_decode_wrbuf(mt, input_record, res);
557         if (ret != -1)
558         {
559             ret = yaz_record_conv_record_rule(p, 
560                                               r->next,
561                                               wrbuf_buf(res), wrbuf_len(res),
562                                               output_record);
563         }
564         yaz_marc_destroy(mt);
565         if (cd)
566             yaz_iconv_close(cd);
567         wrbuf_destroy(res);
568     }
569     return ret;
570 }
571
572 int yaz_record_conv_record(yaz_record_conv_t p,
573                            const char *input_record_buf,
574                            size_t input_record_len,
575                            WRBUF output_record)
576 {
577     return yaz_record_conv_record_rule(p, p->rules,
578                                        input_record_buf,
579                                        input_record_len, output_record);
580 }
581
582 const char *yaz_record_conv_get_error(yaz_record_conv_t p)
583 {
584     return wrbuf_cstr(p->wr_error);
585 }
586
587 void yaz_record_conv_set_path(yaz_record_conv_t p, const char *path)
588 {
589     xfree(p->path);
590     p->path = 0;
591     if (path)
592         p->path = xstrdup(path);
593 }
594
595 yaz_record_conv_t yaz_record_conv_create()
596 {
597     yaz_record_conv_t p = (yaz_record_conv_t) xmalloc(sizeof(*p));
598     p->nmem = nmem_create();
599     p->wr_error = wrbuf_alloc();
600     p->rules = 0;
601     p->path = 0;
602     p->types = 0;
603
604 #if YAZ_HAVE_EXSLT
605     exsltRegisterAll(); 
606 #endif    
607     { /* register marc */
608         struct yaz_record_conv_type t;
609
610         t.construct = construct_marc;
611         t.convert = convert_marc;
612         t.destroy = destroy_marc;
613
614         yaz_record_conv_add_type(p, &t);
615     }
616 #if YAZ_HAVE_XSLT
617     { /* register xslt */
618         struct yaz_record_conv_type t;
619         
620         t.construct = construct_xslt;
621         t.convert = convert_xslt;
622         t.destroy = destroy_xslt;
623
624         yaz_record_conv_add_type(p, &t);
625     }
626 #endif
627     return p;
628 }
629
630 /* YAZ_HAVE_XML2 */
631 #endif
632
633 /*
634  * Local variables:
635  * c-basic-offset: 4
636  * c-file-style: "Stroustrup"
637  * indent-tabs-mode: nil
638  * End:
639  * vim: shiftwidth=4 tabstop=8 expandtab
640  */
641