212e96b7a901e40ed13bb1edb8e4cf1e3819a2d5
[yaz-moved-to-github.git] / src / record_conv.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2012 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file record_conv.c
7  * \brief Record Conversions utility
8  */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <string.h>
15 #include <yaz/yaz-iconv.h>
16 #include <yaz/marcdisp.h>
17 #include <yaz/record_conv.h>
18 #include <yaz/wrbuf.h>
19 #include <yaz/xmalloc.h>
20 #include <yaz/nmem.h>
21 #include <yaz/tpath.h>
22 #include <yaz/z-opac.h>
23
24 #if YAZ_HAVE_XML2
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <libxml/xinclude.h>
28 #if YAZ_HAVE_XSLT
29 #include <libxslt/xsltutils.h>
30 #include <libxslt/transform.h>
31 #endif
32 #if YAZ_HAVE_EXSLT
33 #include <libexslt/exslt.h>
34 #endif
35
/** \brief The internal structure for yaz_record_conv_t
 *
 * Keeps the ordered chain of conversion rules together with the
 * resources that back them.
 */
struct yaz_record_conv_struct {
    /** \brief memory for configuration (rule nodes are allocated here) */
    NMEM nmem;

    /** \brief conversion rules (allocated using NMEM); 0 when empty */
    struct yaz_record_conv_rule *rules;

    /** \brief pointer to last conversion rule pointer in chain;
        new rules are appended by storing through it */
    struct yaz_record_conv_rule **rules_p;

    /** \brief string buffer for error messages */
    WRBUF wr_error;

    /** \brief path for opening files; heap copy owned by this object,
        0 when no path has been set */
    char *path;
};
53
/** \brief per-rule data of a <marc> conversion step */
struct marc_info {
    /** \brief memory pool holding this structure and its strings */
    NMEM nmem;
    /** \brief character set of incoming records */
    const char *input_charset;
    /** \brief character set of produced records */
    const char *output_charset;
    /** \brief input format, one of the YAZ_MARC_* constants */
    int input_format_mode;
    /** \brief output format, one of the YAZ_MARC_* constants */
    int output_format_mode;
    /** \brief optional leader specification handed to
        yaz_marc_leader_spec; 0 when not configured */
    const char *leader_spec;
};
62
/** \brief transformation info (rule info); one node per conversion step */
struct yaz_record_conv_rule {
    /** \brief handlers (construct/convert/destroy) for this step */
    struct yaz_record_conv_type *type;
    /** \brief private data returned by the type's construct handler */
    void *info;
    /** \brief next step in the chain, 0 for the last one */
    struct yaz_record_conv_rule *next;
};
69
70 /** \brief reset rules+configuration */
71 static void yaz_record_conv_reset(yaz_record_conv_t p)
72 {
73
74     struct yaz_record_conv_rule *r;
75     for (r = p->rules; r; r = r->next)
76     {
77         r->type->destroy(r->info);
78     }
79     wrbuf_rewind(p->wr_error);
80     nmem_reset(p->nmem);
81
82     p->rules = 0;
83
84     p->rules_p = &p->rules;
85 }
86
87 void yaz_record_conv_destroy(yaz_record_conv_t p)
88 {
89     if (p)
90     {
91         yaz_record_conv_reset(p);
92         nmem_destroy(p->nmem);
93         wrbuf_destroy(p->wr_error);
94
95         xfree(p->path);
96         xfree(p);
97     }
98 }
99
100 #if YAZ_HAVE_XSLT
101 static void *construct_xslt(const xmlNode *ptr,
102                             const char *path, WRBUF wr_error)
103 {
104     struct _xmlAttr *attr;
105     const char *stylesheet = 0;
106
107     if (strcmp((const char *) ptr->name, "xslt"))
108         return 0;
109
110     for (attr = ptr->properties; attr; attr = attr->next)
111     {
112         if (!xmlStrcmp(attr->name, BAD_CAST "stylesheet") &&
113             attr->children && attr->children->type == XML_TEXT_NODE)
114             stylesheet = (const char *) attr->children->content;
115         else
116         {
117             wrbuf_printf(wr_error, "Bad attribute '%s'"
118                          "Expected stylesheet.", attr->name);
119             return 0;
120         }
121     }
122     if (!stylesheet)
123     {
124         wrbuf_printf(wr_error, "Element <xslt>: "
125                      "attribute 'stylesheet' expected");
126         return 0;
127     }
128     else
129     {
130         char fullpath[1024];
131         xsltStylesheetPtr xsp;
132         xmlDocPtr xsp_doc;
133         if (!yaz_filepath_resolve(stylesheet, path, 0, fullpath))
134         {
135             wrbuf_printf(wr_error, "Element <xslt stylesheet=\"%s\"/>:"
136                          " could not locate stylesheet '%s'",
137                          stylesheet, stylesheet);
138             if (path)
139                 wrbuf_printf(wr_error, " with path '%s'", path);
140                 
141             return 0;
142         }
143         xsp_doc = xmlParseFile(fullpath);
144         if (!xsp_doc)
145         {
146             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
147                          " xml parse failed: %s", stylesheet, fullpath);
148             if (path)
149                 wrbuf_printf(wr_error, " with path '%s'", path);
150             return 0;
151         }
152         /* need to copy this before passing it to the processor. It will
153            be encapsulated in the xsp and destroyed by xsltFreeStylesheet */
154         xsp = xsltParseStylesheetDoc(xmlCopyDoc(xsp_doc, 1));
155         if (!xsp)
156         {
157             wrbuf_printf(wr_error, "Element: <xslt stylesheet=\"%s\"/>:"
158                          " xslt parse failed: %s", stylesheet, fullpath);
159             if (path)
160                 wrbuf_printf(wr_error, " with path '%s'", path);
161             wrbuf_printf(wr_error, " ("
162 #if YAZ_HAVE_EXSLT
163                          
164                          "EXSLT enabled"
165 #else
166                          "EXSLT not supported"
167 #endif
168                          ")");
169             xmlFreeDoc(xsp_doc);
170             return 0;
171         }
172         else
173         {
174             xsltFreeStylesheet(xsp);
175             return xsp_doc;
176         }
177     }
178     return 0;
179 }
180
181 static int convert_xslt(void *info, WRBUF record, WRBUF wr_error)
182 {
183     int ret = 0;
184     xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
185                                    wrbuf_len(record));
186     if (!doc)
187     {
188         wrbuf_printf(wr_error, "xmlParseMemory failed");
189         ret = -1;
190     }
191     else
192     {
193         xmlDocPtr xsp_doc = xmlCopyDoc((xmlDocPtr) info, 1);
194         xsltStylesheetPtr xsp = xsltParseStylesheetDoc(xsp_doc);
195         xmlDocPtr res = xsltApplyStylesheet(xsp, doc, 0);
196         if (res)
197         {
198             xmlChar *out_buf = 0;
199             int out_len;
200             
201 #if HAVE_XSLTSAVERESULTTOSTRING
202             xsltSaveResultToString(&out_buf, &out_len, res, xsp);
203 #else
204             xmlDocDumpFormatMemory (res, &out_buf, &out_len, 1);
205 #endif
206             if (!out_buf)
207             {
208                 wrbuf_printf(wr_error,
209                              "xsltSaveResultToString failed");
210                 ret = -1;
211             }
212             else
213             {
214                 wrbuf_rewind(record);
215                 wrbuf_write(record, (const char *) out_buf, out_len);
216                 
217                 xmlFree(out_buf);
218             }
219             xmlFreeDoc(res);
220         }
221         else
222         {
223             wrbuf_printf(wr_error, "xsltApplyStylesheet failed");
224             ret = -1;
225         }
226         xmlFreeDoc(doc);
227         xsltFreeStylesheet(xsp); /* frees xsp_doc too */
228     }
229     return ret;
230 }
231
232 static void destroy_xslt(void *info)
233 {
234     if (info)
235     {
236         xmlDocPtr xsp_doc = info;
237         xmlFreeDoc(xsp_doc);
238     }
239 }
240
241 /* YAZ_HAVE_XSLT */
242 #endif
243
244
245 static void *construct_marc(const xmlNode *ptr,
246                             const char *path, WRBUF wr_error)
247 {
248     NMEM nmem = nmem_create();
249     struct marc_info *info = nmem_malloc(nmem, sizeof(*info));
250     struct _xmlAttr *attr;
251     const char *input_format = 0;
252     const char *output_format = 0;
253
254     if (strcmp((const char *) ptr->name, "marc"))
255     {
256         nmem_destroy(nmem);
257         return 0;
258     }
259
260     info->nmem = nmem;
261     info->input_charset = 0;
262     info->output_charset = 0;
263     info->input_format_mode = 0;
264     info->output_format_mode = 0;
265     info->leader_spec = 0;
266
267     for (attr = ptr->properties; attr; attr = attr->next)
268     {
269         if (!xmlStrcmp(attr->name, BAD_CAST "inputcharset") &&
270             attr->children && attr->children->type == XML_TEXT_NODE)
271             info->input_charset = (const char *) attr->children->content;
272         else if (!xmlStrcmp(attr->name, BAD_CAST "outputcharset") &&
273             attr->children && attr->children->type == XML_TEXT_NODE)
274             info->output_charset = (const char *) attr->children->content;
275         else if (!xmlStrcmp(attr->name, BAD_CAST "inputformat") &&
276             attr->children && attr->children->type == XML_TEXT_NODE)
277             input_format = (const char *) attr->children->content;
278         else if (!xmlStrcmp(attr->name, BAD_CAST "outputformat") &&
279             attr->children && attr->children->type == XML_TEXT_NODE)
280             output_format = (const char *) attr->children->content;
281         else if (!xmlStrcmp(attr->name, BAD_CAST "leaderspec") &&
282                  attr->children && attr->children->type == XML_TEXT_NODE)
283             info->leader_spec =
284                 nmem_strdup(info->nmem,(const char *) attr->children->content);
285         else
286         {
287             wrbuf_printf(wr_error, "Element <marc>: expected attributes"
288                          "'inputformat', 'inputcharset', 'outputformat' or"
289                          " 'outputcharset', got attribute '%s'", 
290                          attr->name);
291             nmem_destroy(info->nmem);
292             return 0;
293         }
294     }
295     if (!input_format)
296     {
297         wrbuf_printf(wr_error, "Element <marc>: "
298                      "attribute 'inputformat' required");
299         nmem_destroy(info->nmem);
300         return 0;
301     }
302     else if (!strcmp(input_format, "marc"))
303     {
304         info->input_format_mode = YAZ_MARC_ISO2709;
305     }
306     else if (!strcmp(input_format, "xml"))
307     {
308         info->input_format_mode = YAZ_MARC_MARCXML;
309         /** Libxml2 generates UTF-8 encoding by default .
310             So we convert from UTF-8 to outputcharset (if defined) 
311         */
312         if (!info->input_charset && info->output_charset)
313             info->input_charset = "utf-8";
314     }
315     else
316     {
317         wrbuf_printf(wr_error, "Element <marc inputformat='%s'>: "
318                      " Unsupported input format"
319                      " defined by attribute value", 
320                      input_format);
321         nmem_destroy(info->nmem);
322         return 0;
323     }
324     
325     if (!output_format)
326     {
327         wrbuf_printf(wr_error, 
328                      "Element <marc>: attribute 'outputformat' required");
329         nmem_destroy(info->nmem);
330         return 0;
331     }
332     else if (!strcmp(output_format, "line"))
333     {
334         info->output_format_mode = YAZ_MARC_LINE;
335     }
336     else if (!strcmp(output_format, "marcxml"))
337     {
338         info->output_format_mode = YAZ_MARC_MARCXML;
339         if (info->input_charset && !info->output_charset)
340             info->output_charset = "utf-8";
341     }
342     else if (!strcmp(output_format, "turbomarc"))
343     {
344         info->output_format_mode = YAZ_MARC_TURBOMARC;
345         if (info->input_charset && !info->output_charset)
346             info->output_charset = "utf-8";
347     }
348     else if (!strcmp(output_format, "marc"))
349     {
350         info->output_format_mode = YAZ_MARC_ISO2709;
351     }
352     else if (!strcmp(output_format, "marcxchange"))
353     {
354         info->output_format_mode = YAZ_MARC_XCHANGE;
355         if (info->input_charset && !info->output_charset)
356             info->output_charset = "utf-8";
357     }
358     else
359     {
360         wrbuf_printf(wr_error, "Element <marc outputformat='%s'>: "
361                      " Unsupported output format"
362                      " defined by attribute value", 
363                      output_format);
364         nmem_destroy(info->nmem);
365         return 0;
366     }
367     if (info->input_charset && info->output_charset)
368     {
369         yaz_iconv_t cd = yaz_iconv_open(info->output_charset,
370                                         info->input_charset);
371         if (!cd)
372         {
373             wrbuf_printf(wr_error, 
374                          "Element <marc inputcharset='%s' outputcharset='%s'>:"
375                          " Unsupported character set mapping"
376                          " defined by attribute values",
377                          info->input_charset, info->output_charset);
378             nmem_destroy(info->nmem);
379             return 0;
380         }
381         yaz_iconv_close(cd);
382     }
383     else if (!info->output_charset)
384     {
385         wrbuf_printf(wr_error, "Element <marc>: "
386                      "attribute 'outputcharset' missing");
387         nmem_destroy(info->nmem);
388         return 0;
389     }
390     else if (!info->input_charset)
391     {
392         wrbuf_printf(wr_error, "Element <marc>: "
393                      "attribute 'inputcharset' missing");
394         nmem_destroy(info->nmem);
395         return 0;
396     }
397     info->input_charset = nmem_strdup(info->nmem, info->input_charset);
398     info->output_charset = nmem_strdup(info->nmem, info->output_charset);
399     return info;
400 }
401
402 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
403 {
404     struct marc_info *mi = info;
405     int ret = 0;
406     
407     yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
408     yaz_marc_t mt = yaz_marc_create();
409     
410     yaz_marc_xml(mt, mi->output_format_mode);
411     if (mi->leader_spec)
412         yaz_marc_leader_spec(mt, mi->leader_spec);
413         
414     if (cd)
415         yaz_marc_iconv(mt, cd);
416     if (mi->input_format_mode == YAZ_MARC_ISO2709)
417     {
418         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
419                                        wrbuf_len(record));
420         if (sz > 0)
421             ret = 0;
422         else
423             ret = -1;
424     }
425     else if (mi->input_format_mode == YAZ_MARC_MARCXML ||
426              mi->input_format_mode == YAZ_MARC_TURBOMARC)
427     {
428         xmlDocPtr doc = xmlParseMemory(wrbuf_buf(record),
429                                        wrbuf_len(record));
430         if (!doc)
431         {
432             wrbuf_printf(wr_error, "xmlParseMemory failed");
433             ret = -1;
434         }
435         else
436         {
437             ret = yaz_marc_read_xml(mt, xmlDocGetRootElement(doc));
438             if (ret)
439                 wrbuf_printf(wr_error, "yaz_marc_read_xml failed");
440         }
441         xmlFreeDoc(doc);
442     }
443     else
444     {
445         wrbuf_printf(wr_error, "unsupported input format");
446         ret = -1;
447     }
448     if (ret == 0)
449     {
450         wrbuf_rewind(record);
451         ret = yaz_marc_write_mode(mt, record);
452         if (ret)
453             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
454     }
455     if (cd)
456         yaz_iconv_close(cd);
457     yaz_marc_destroy(mt);
458     return ret;
459 }
460
461 static void destroy_marc(void *info)
462 {
463     struct marc_info *mi = info;
464     
465     nmem_destroy(mi->nmem);
466 }
467
468 int yaz_record_conv_configure_t(yaz_record_conv_t p, const xmlNode *ptr,
469                                 struct yaz_record_conv_type *types)
470 {
471     struct yaz_record_conv_type bt[2];
472     
473     /* register marc */
474     bt[0].construct = construct_marc;
475     bt[0].convert = convert_marc;
476     bt[0].destroy = destroy_marc;
477
478 #if YAZ_HAVE_XSLT
479     /* register xslt */
480     bt[0].next = &bt[1];
481     bt[1].next = types;
482     bt[1].construct = construct_xslt;
483     bt[1].convert = convert_xslt;
484     bt[1].destroy = destroy_xslt;
485 #else
486     bt[0].next = types;
487 #endif
488     
489     yaz_record_conv_reset(p);
490
491     /* parsing element children */
492     for (ptr = ptr->children; ptr; ptr = ptr->next)
493     {
494         struct yaz_record_conv_type *t;
495         struct yaz_record_conv_rule *r;
496         void *info = 0;
497         if (ptr->type != XML_ELEMENT_NODE)
498             continue;
499         for (t = &bt[0]; t; t = t->next)
500         {
501             wrbuf_rewind(p->wr_error);
502             info = t->construct(ptr, p->path, p->wr_error);
503
504             if (info || wrbuf_len(p->wr_error))
505                 break;
506             /* info== 0 and no error reported , ie not handled by it */
507         }
508         if (!info)
509         {
510             if (wrbuf_len(p->wr_error) == 0)
511                 wrbuf_printf(p->wr_error, "Element <backend>: expected "
512                              "<marc> or <xslt> element, got <%s>"
513                              , ptr->name);
514             return -1;
515         }
516         r = (struct yaz_record_conv_rule *) nmem_malloc(p->nmem, sizeof(*r));
517         r->next = 0;
518         r->info = info;
519         r->type = nmem_malloc(p->nmem, sizeof(*t));
520         memcpy(r->type, t, sizeof(*t));
521         *p->rules_p = r;
522         p->rules_p = &r->next;
523     }
524     return 0;
525 }
526
527 int yaz_record_conv_configure(yaz_record_conv_t p, const xmlNode *ptr)
528 {
529     return yaz_record_conv_configure_t(p, ptr, 0);
530 }
531
532 static int yaz_record_conv_record_rule(yaz_record_conv_t p,
533                                        struct yaz_record_conv_rule *r,
534                                        const char *input_record_buf,
535                                        size_t input_record_len,
536                                        WRBUF output_record)
537 {
538     int ret = 0;
539     WRBUF record = output_record; /* pointer transfer */
540     wrbuf_rewind(p->wr_error);
541     
542     wrbuf_write(record, input_record_buf, input_record_len);
543     for (; ret == 0 && r; r = r->next)
544         ret = r->type->convert(r->info, record, p->wr_error);
545     return ret;
546 }
547
548 int yaz_record_conv_opac_record(yaz_record_conv_t p,
549                                 Z_OPACRecord *input_record,
550                                 WRBUF output_record)
551 {
552     int ret = 0;
553     struct yaz_record_conv_rule *r = p->rules;
554     if (!r || r->type->construct != construct_marc)
555         ret = -1; /* no marc rule so we can't do OPAC */
556     else
557     {
558         struct marc_info *mi = r->info;
559
560         WRBUF res = wrbuf_alloc();
561         yaz_marc_t mt = yaz_marc_create();
562         yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
563                                         mi->input_charset);
564         
565         wrbuf_rewind(p->wr_error);
566         yaz_marc_xml(mt, mi->output_format_mode);
567         
568         yaz_marc_iconv(mt, cd);
569         
570         yaz_opac_decode_wrbuf(mt, input_record, res);
571         if (ret != -1)
572         {
573             ret = yaz_record_conv_record_rule(p, 
574                                               r->next,
575                                               wrbuf_buf(res), wrbuf_len(res),
576                                               output_record);
577         }
578         yaz_marc_destroy(mt);
579         if (cd)
580             yaz_iconv_close(cd);
581         wrbuf_destroy(res);
582     }
583     return ret;
584 }
585
586 int yaz_record_conv_record(yaz_record_conv_t p,
587                            const char *input_record_buf,
588                            size_t input_record_len,
589                            WRBUF output_record)
590 {
591     return yaz_record_conv_record_rule(p, p->rules,
592                                        input_record_buf,
593                                        input_record_len, output_record);
594 }
595
596 const char *yaz_record_conv_get_error(yaz_record_conv_t p)
597 {
598     return wrbuf_cstr(p->wr_error);
599 }
600
601 void yaz_record_conv_set_path(yaz_record_conv_t p, const char *path)
602 {
603     xfree(p->path);
604     p->path = 0;
605     if (path)
606         p->path = xstrdup(path);
607 }
608
609 yaz_record_conv_t yaz_record_conv_create()
610 {
611     yaz_record_conv_t p = (yaz_record_conv_t) xmalloc(sizeof(*p));
612     p->nmem = nmem_create();
613     p->wr_error = wrbuf_alloc();
614     p->rules = 0;
615     p->path = 0;
616 #if YAZ_HAVE_EXSLT
617     exsltRegisterAll(); 
618 #endif    
619     return p;
620 }
621
622 /* YAZ_HAVE_XML2 */
623 #endif
624
625 /*
626  * Local variables:
627  * c-basic-offset: 4
628  * c-file-style: "Stroustrup"
629  * indent-tabs-mode: nil
630  * End:
631  * vim: shiftwidth=4 tabstop=8 expandtab
632  */
633