document parameter (Doxygen warning)
[yaz-moved-to-github.git] / src / marc_read_xml.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file marc_read_xml.c
8  * \brief Implements reading of MARC as XML
9  */
10
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14
15 #ifdef WIN32
16 #include <windows.h>
17 #endif
18
19 #include <stdio.h>
20 #include <string.h>
21 #include <yaz/marcdisp.h>
22 #include <yaz/wrbuf.h>
23 #include <yaz/yaz-util.h>
24 #include <yaz/nmem_xml.h>
25
26 #if YAZ_HAVE_XML2
27 #include <libxml/tree.h>
28 #endif
29
30 #if YAZ_HAVE_XML2
31 int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
32 {
33     NMEM nmem = yaz_marc_get_nmem(mt);
34     for (; ptr; ptr = ptr->next)
35     {
36         if (ptr->type == XML_ELEMENT_NODE)
37         {
38             if (!strcmp((const char *) ptr->name, "subfield"))
39             {
40                 size_t ctrl_data_len = 0;
41                 char *ctrl_data_buf = 0;
42                 const xmlNode *p = 0, *ptr_code = 0;
43                 struct _xmlAttr *attr;
44                 for (attr = ptr->properties; attr; attr = attr->next)
45                     if (!strcmp((const char *)attr->name, "code"))
46                         ptr_code = attr->children;
47                     else
48                     {
49                         yaz_marc_cprintf(
50                             mt, "Bad attribute '%.80s' for 'subfield'",
51                             attr->name);
52                         return -1;
53                     }
54                 if (!ptr_code)
55                 {
56                     yaz_marc_cprintf(
57                         mt, "Missing attribute 'code' for 'subfield'" );
58                     return -1;
59                 }
60                 if (ptr_code->type == XML_TEXT_NODE)
61                 {
62                     ctrl_data_len =
63                         strlen((const char *)ptr_code->content);
64                 }
65                 else
66                 {
67                     yaz_marc_cprintf(
68                         mt, "Missing value for 'code' in 'subfield'" );
69                     return -1;
70                 }
71                 for (p = ptr->children; p ; p = p->next)
72                     if (p->type == XML_TEXT_NODE)
73                         ctrl_data_len += strlen((const char *)p->content);
74                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
75                 strcpy(ctrl_data_buf, (const char *)ptr_code->content);
76                 for (p = ptr->children; p ; p = p->next)
77                     if (p->type == XML_TEXT_NODE)
78                         strcat(ctrl_data_buf, (const char *)p->content);
79                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
80             }
81             else
82             {
83                 yaz_marc_cprintf(
84                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
85                 return -1;
86             }
87         }
88     }
89     return 0;
90 }
91
/**
 * \brief Copies a tag of exactly three bytes into a caller-supplied buffer.
 * \param name       zero-terminated string to examine
 * \param tag_buffer destination; written only when name is accepted
 * \returns tag_buffer when name is exactly three bytes long,
 *          a null pointer otherwise
 */
const char *tag_value_extract(const char *name, char tag_buffer[5])
{
    if (strlen(name) != 3)
        return 0;
    /* three bytes of tag plus the terminating zero */
    memcpy(tag_buffer, name, 4);
    return tag_buffer;
}
102
103 // Given a xmlNode ptr,  extract a value from either a element name or from a given attribute
104 char *element_attribute_value_extract(const xmlNode *ptr,
105                                       const char *attribute_name,
106                                       NMEM nmem)
107 {
108     const char *name = (const char *) ptr->name;
109     size_t length = strlen(name);
110     xmlAttr *attr;
111     if (length > 1 )
112         return nmem_strdup(nmem, name+1);
113     // TODO Extract from attribute where matches attribute_name
114     for (attr = ptr->properties; attr; attr = attr->next)
115         if (!strcmp((const char *)attr->name, attribute_name))
116             return nmem_text_node_cdata(attr->children, nmem);
117     return 0;
118 }
119
120
121 int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
122 {
123     for (; ptr; ptr = ptr->next)
124     {
125         if (ptr->type == XML_ELEMENT_NODE)
126         {
127             if (!strncmp((const char *) ptr->name, "s", 1))
128             {
129                 NMEM nmem = yaz_marc_get_nmem(mt);
130                 xmlNode *p;
131                 size_t ctrl_data_len = 0;
132                 char *ctrl_data_buf = 0;
133                 const char *tag_value = element_attribute_value_extract(ptr, "code", nmem);
134                 if (!tag_value)
135                 {
136                     yaz_marc_cprintf(
137                         mt, "Missing 'code' value for 'subfield'" );
138                     return -1;
139                 }
140
141                 ctrl_data_len = strlen((const char *) tag_value);
142                 // Extract (length) from CDATA
143                 for (p = ptr->children; p ; p = p->next)
144                     if (p->type == XML_TEXT_NODE)
145                         ctrl_data_len += strlen((const char *)p->content);
146                 // Allocate memory for code value (1 character (can be multi-byte) and data
147                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
148                 // Build a string with "<Code><data>"
149                 strcpy(ctrl_data_buf, (const char *) tag_value);
150                 for (p = ptr->children; p ; p = p->next)
151                     if (p->type == XML_TEXT_NODE)
152                         strcat(ctrl_data_buf, (const char *)p->content);
153                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
154             }
155             else
156             {
157                 yaz_marc_cprintf(
158                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
159                 return -1;
160             }
161         }
162     }
163     return 0;
164 }
165
166
167 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p,
168                                     int *indicator_length)
169 {
170     int identifier_length;
171     int base_address;
172     int length_data_entry;
173     int length_starting;
174     int length_implementation;
175     const char *leader = 0;
176     const xmlNode *ptr = *ptr_p;
177
178     for(; ptr; ptr = ptr->next)
179         if (ptr->type == XML_ELEMENT_NODE)
180         {
181             if ( !strcmp( (const char *) ptr->name, "leader") ||
182                  (!strncmp((const char *) ptr->name, "l", 1) ))
183             {
184                 xmlNode *p = ptr->children;
185                 for(; p; p = p->next)
186                     if (p->type == XML_TEXT_NODE)
187                         leader = (const char *) p->content;
188                 break;
189             }
190             else
191             {
192                 yaz_marc_cprintf(
193                     mt, "Expected element 'leader', got '%.80s'", ptr->name);
194             }
195         }
196     if (!leader)
197     {
198         yaz_marc_cprintf(mt, "Missing element 'leader'");
199         return -1;
200     }
201     if (strlen(leader) != 24)
202     {
203         yaz_marc_cprintf(mt, "Bad length %d of leader data."
204                          " Must have length of 24 characters", strlen(leader));
205         return -1;
206     }
207     yaz_marc_set_leader(mt, leader,
208                         indicator_length,
209                         &identifier_length,
210                         &base_address,
211                         &length_data_entry,
212                         &length_starting,
213                         &length_implementation);
214     *ptr_p = ptr;
215     return 0;
216 }
217
218 static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr,
219                                     int indicator_length)
220 {
221     for(; ptr; ptr = ptr->next)
222         if (ptr->type == XML_ELEMENT_NODE)
223         {
224             if (!strcmp( (const char *) ptr->name, "controlfield"))
225             {
226                 const xmlNode *ptr_tag = 0;
227                 struct _xmlAttr *attr;
228                 for (attr = ptr->properties; attr; attr = attr->next)
229                     if (!strcmp((const char *)attr->name, "tag"))
230                         ptr_tag = attr->children;
231                     else
232                     {
233                         yaz_marc_cprintf(
234                             mt, "Bad attribute '%.80s' for 'controlfield'",
235                             attr->name);
236                         return -1;
237                     }
238                 if (!ptr_tag)
239                 {
240                     yaz_marc_cprintf(
241                         mt, "Missing attribute 'tag' for 'controlfield'" );
242                     return -1;
243                 }
244                 yaz_marc_add_controlfield_xml(mt, ptr_tag, ptr->children);
245             }
246             else if (!strcmp((const char *) ptr->name, "datafield"))
247             {
248                 char indstr[11]; /* 0(unused), 1,....9, + zero term */
249                 const xmlNode *ptr_tag = 0;
250                 struct _xmlAttr *attr;
251                 int i;
252                 for (i = 0; i < indicator_length; i++)
253                     indstr[i] = ' ';
254                 indstr[i] = '\0';
255                 for (attr = ptr->properties; attr; attr = attr->next)
256                     if (!strcmp((const char *)attr->name, "tag"))
257                         ptr_tag = attr->children;
258                     else if (strlen((const char *)attr->name) == 4 &&
259                              !memcmp(attr->name, "ind", 3))
260                     {
261                         int no = atoi((const char *)attr->name + 3);
262                         if (attr->children &&
263                             attr->children->type == XML_TEXT_NODE &&
264                             no <= indicator_length && no > 0 &&
265                             attr->children->content[0])
266                         {
267                             indstr[no - 1] = attr->children->content[0];
268                         }
269                         else
270                         {
271                             yaz_marc_cprintf(
272                                 mt, "Bad attribute '%.80s' for 'datafield'",
273                                 attr->name);
274                         }
275                     }
276                     else
277                     {
278                         yaz_marc_cprintf(
279                             mt, "Bad attribute '%.80s' for 'datafield'",
280                             attr->name);
281                     }
282                 if (!ptr_tag)
283                 {
284                     yaz_marc_cprintf(
285                         mt, "Missing attribute 'tag' for 'datafield'" );
286                     return -1;
287                 }
288                 yaz_marc_add_datafield_xml(mt, ptr_tag,
289                                            indstr, indicator_length);
290                 if (yaz_marc_read_xml_subfields(mt, ptr->children))
291                     return -1;
292             }
293             else
294             {
295                 yaz_marc_cprintf(mt,
296                                  "Expected element controlfield or datafield,"
297                                  " got %.80s", ptr->name);
298                 return -1;
299             }
300         }
301     return 0;
302 }
303
304
305 static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr,
306                                           int indicator_length)
307 {
308     for(; ptr; ptr = ptr->next)
309         if (ptr->type == XML_ELEMENT_NODE)
310         {
311             if (!strncmp( (const char *) ptr->name, "c", 1))
312             {
313                 NMEM nmem = yaz_marc_get_nmem(mt);
314                 char *tag_value = element_attribute_value_extract(ptr, "tag", nmem);
315                 if (!tag_value)
316                 {
317                     yaz_marc_cprintf(
318                         mt, "Missing attribute 'tag' for 'controlfield'" );
319                     return -1;
320                 }
321                 yaz_marc_add_controlfield_xml2(mt, tag_value, ptr->children);
322             }
323             else if (!strncmp((const char *) ptr->name, "d",1))
324             {
325                 struct _xmlAttr *attr;
326                 NMEM nmem = yaz_marc_get_nmem(mt);
327                 char *tag_value;
328                 char *indstr = nmem_malloc(nmem, indicator_length + 1);
329                 int i = 0;
330                 for (i = 0; i < indicator_length; i++)
331                     indstr[i] = ' ';
332                 indstr[i] = '\0';
333                 tag_value = element_attribute_value_extract(ptr, "tag", nmem);
334                 if (!tag_value)
335                 {
336                     yaz_marc_cprintf(
337                         mt, "Missing attribute 'tag' for 'datafield'" );
338                     return -1;
339                 }
340                 for (attr = ptr->properties; attr; attr = attr->next)
341                     if (strlen((const char *)attr->name) == 2 &&
342                         attr->name[0] == 'i')
343                     {
344                         //extract indicator attribute from i#="Y" pattern
345                         int no = atoi((const char *)attr->name + 1);
346                         if (attr->children &&
347                             attr->children->type == XML_TEXT_NODE &&
348                             no <= indicator_length && no > 0 &&
349                             attr->children->content[0])
350                         {
351                             indstr[no - 1] = attr->children->content[0];
352                         }
353                         else
354                         {
355                             yaz_marc_cprintf(
356                                 mt, "Bad attribute '%.80s' for 'd'",attr->name);
357                         }
358                     }
359                     else
360                     {
361                         yaz_marc_cprintf(
362                             mt, "Bad attribute '%.80s' for 'd'", attr->name);
363                     }
364                 yaz_marc_add_datafield_xml2(mt, tag_value, indstr);
365                 if (yaz_marc_read_turbo_xml_subfields(mt, ptr->children /*, indstr */))
366                     return -1;
367             }
368             else
369             {
370                 yaz_marc_cprintf(mt,
371                                  "Expected element controlfield or datafield,"
372                                  " got %.80s", ptr->name);
373                 return -1;
374             }
375         }
376     return 0;
377 }
378
379
380 #endif
381
382 #if YAZ_HAVE_XML2
383 int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
384 {
385     int indicator_length = 0;
386     int format = 0;
387     yaz_marc_reset(mt);
388
389     for(; ptr; ptr = ptr->next)
390         if (ptr->type == XML_ELEMENT_NODE)
391         {
392             if (!strcmp((const char *) ptr->name, "record"))
393             {
394                 format = YAZ_MARC_MARCXML;
395                 break;
396             }
397             else if (!strcmp((const char *) ptr->name, "r"))
398             {
399                 format = YAZ_MARC_TURBOMARC;
400                 break;
401             }
402             else
403             {
404                 yaz_marc_cprintf(
405                     mt, "Unknown element '%.80s' in MARC XML reader",
406                     ptr->name);
407                 return -1;
408             }
409         }
410     if (!ptr)
411     {
412         yaz_marc_cprintf(mt, "Missing element 'record' in MARC XML record");
413         return -1;
414     }
415     /* ptr points to record node now */
416     ptr = ptr->children;
417     if (yaz_marc_read_xml_leader(mt, &ptr, &indicator_length))
418         return -1;
419
420     switch (format)
421     {
422     case YAZ_MARC_MARCXML:
423         return yaz_marc_read_xml_fields(mt, ptr->next, indicator_length);
424     case YAZ_MARC_TURBOMARC:
425         return yaz_marc_read_turbo_xml_fields(mt, ptr->next, indicator_length);
426     }
427     return -1;
428 }
429 #endif
430
431
432 /*
433  * Local variables:
434  * c-basic-offset: 4
435  * c-file-style: "Stroustrup"
436  * indent-tabs-mode: nil
437  * End:
438  * vim: shiftwidth=4 tabstop=8 expandtab
439  */
440