a81e8c487734bc55b84a1daf5709444d9dd19b60
[yaz-moved-to-github.git] / src / marc_read_xml.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2011 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file marc_read_xml.c
8  * \brief Implements reading of MARC as XML
9  */
10
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14
15 #ifdef WIN32
16 #include <windows.h>
17 #endif
18
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <yaz/marcdisp.h>
23 #include <yaz/wrbuf.h>
24 #include <yaz/yaz-util.h>
25 #include <yaz/nmem_xml.h>
26
27 #if YAZ_HAVE_XML2
28 #include <libxml/tree.h>
29 #endif
30
31 #if YAZ_HAVE_XML2
32 int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
33 {
34     NMEM nmem = yaz_marc_get_nmem(mt);
35     for (; ptr; ptr = ptr->next)
36     {
37         if (ptr->type == XML_ELEMENT_NODE)
38         {
39             if (!strcmp((const char *) ptr->name, "subfield"))
40             {
41                 size_t ctrl_data_len = 0;
42                 char *ctrl_data_buf = 0;
43                 const xmlNode *p = 0, *ptr_code = 0;
44                 struct _xmlAttr *attr;
45                 for (attr = ptr->properties; attr; attr = attr->next)
46                     if (!strcmp((const char *)attr->name, "code"))
47                         ptr_code = attr->children;
48                     else
49                     {
50                         yaz_marc_cprintf(
51                             mt, "Bad attribute '%.80s' for 'subfield'",
52                             attr->name);
53                         return -1;
54                     }
55                 if (!ptr_code)
56                 {
57                     yaz_marc_cprintf(
58                         mt, "Missing attribute 'code' for 'subfield'" );
59                     return -1;
60                 }
61                 if (ptr_code->type == XML_TEXT_NODE)
62                 {
63                     ctrl_data_len = 
64                         strlen((const char *)ptr_code->content);
65                 }
66                 else
67                 {
68                     yaz_marc_cprintf(
69                         mt, "Missing value for 'code' in 'subfield'" );
70                     return -1;
71                 }
72                 for (p = ptr->children; p ; p = p->next)
73                     if (p->type == XML_TEXT_NODE)
74                         ctrl_data_len += strlen((const char *)p->content);
75                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
76                 strcpy(ctrl_data_buf, (const char *)ptr_code->content);
77                 for (p = ptr->children; p ; p = p->next)
78                     if (p->type == XML_TEXT_NODE)
79                         strcat(ctrl_data_buf, (const char *)p->content);
80                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
81             }
82             else
83             {
84                 yaz_marc_cprintf(
85                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
86                 return -1;
87             }
88         }
89     }
90     return 0;
91 }
92
/**
 * \brief Copies a three-character name into a caller supplied buffer
 * \param name zero-terminated string to inspect
 * \param tag_buffer destination; must have room for at least 4 bytes
 * \returns tag_buffer when name is exactly three characters long (the
 *          buffer then holds a zero-terminated copy of name); 0 otherwise,
 *          in which case tag_buffer is left untouched
 */
const char *tag_value_extract(const char *name, char tag_buffer[5])
{
    if (strlen(name) != 3)
        return 0;
    /* three characters plus the terminating zero */
    memcpy(tag_buffer, name, 4);
    return tag_buffer;
}
103
104 // Given a xmlNode ptr,  extract a value from either a element name or from a given attribute
105 char *element_attribute_value_extract(const xmlNode *ptr,
106                                       const char *attribute_name,
107                                       NMEM nmem)
108 {
109     const char *name = (const char *) ptr->name;
110     size_t length = strlen(name);
111     xmlAttr *attr;
112     if (length > 1 )
113         return nmem_strdup(nmem, name+1);
114     // TODO Extract from attribute where matches attribute_name
115     for (attr = ptr->properties; attr; attr = attr->next)
116         if (!strcmp((const char *)attr->name, attribute_name))
117             return nmem_text_node_cdata(attr->children, nmem);
118     return 0;
119 }
120
121
122 int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
123 {
124     for (; ptr; ptr = ptr->next)
125     {
126         if (ptr->type == XML_ELEMENT_NODE)
127         {
128             if (!strncmp((const char *) ptr->name, "s", 1))
129             {
130                 NMEM nmem = yaz_marc_get_nmem(mt);
131                 xmlNode *p;
132                 size_t ctrl_data_len = 0;
133                 char *ctrl_data_buf = 0;
134                 const char *tag_value = element_attribute_value_extract(ptr, "code", nmem);
135                 if (!tag_value)
136                 {
137                     yaz_marc_cprintf(
138                         mt, "Missing 'code' value for 'subfield'" );
139                     return -1;
140                 }
141
142                 ctrl_data_len = strlen((const char *) tag_value);
143                 // Extract (length) from CDATA
144                 for (p = ptr->children; p ; p = p->next)
145                     if (p->type == XML_TEXT_NODE)
146                         ctrl_data_len += strlen((const char *)p->content);
147                 // Allocate memory for code value (1 character (can be multi-byte) and data
148                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
149                 // Build a string with "<Code><data>"
150                 strcpy(ctrl_data_buf, (const char *) tag_value);
151                 for (p = ptr->children; p ; p = p->next)
152                     if (p->type == XML_TEXT_NODE)
153                         strcat(ctrl_data_buf, (const char *)p->content);
154                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
155             }
156             else
157             {
158                 yaz_marc_cprintf(
159                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
160                 return -1;
161             }
162         }
163     }
164     return 0;
165 }
166
167
168 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
169 {
170     int indicator_length;
171     int identifier_length;
172     int base_address;
173     int length_data_entry;
174     int length_starting;
175     int length_implementation;
176     const char *leader = 0;
177     const xmlNode *ptr = *ptr_p;
178
179     for(; ptr; ptr = ptr->next)
180         if (ptr->type == XML_ELEMENT_NODE)
181         {
182             if ( !strcmp( (const char *) ptr->name, "leader") ||
183                  (!strncmp((const char *) ptr->name, "l", 1) ))
184             {
185                 xmlNode *p = ptr->children;
186                 for(; p; p = p->next)
187                     if (p->type == XML_TEXT_NODE)
188                         leader = (const char *) p->content;
189                 break;
190             }
191             else
192             {
193                 yaz_marc_cprintf(
194                     mt, "Expected element 'leader', got '%.80s'", ptr->name);
195             }
196         }
197     if (!leader)
198     {
199         yaz_marc_cprintf(mt, "Missing element 'leader'");
200         return -1;
201     }
202     if (strlen(leader) != 24)
203     {
204         yaz_marc_cprintf(mt, "Bad length %d of leader data."
205                          " Must have length of 24 characters", strlen(leader));
206         return -1;
207     }
208     yaz_marc_set_leader(mt, leader,
209                         &indicator_length,
210                         &identifier_length,
211                         &base_address,
212                         &length_data_entry,
213                         &length_starting,
214                         &length_implementation);
215     *ptr_p = ptr;
216     return 0;
217 }
218
219 static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
220 {
221     for(; ptr; ptr = ptr->next)
222         if (ptr->type == XML_ELEMENT_NODE)
223         {
224             if (!strcmp( (const char *) ptr->name, "controlfield"))
225             {
226                 const xmlNode *ptr_tag = 0;
227                 struct _xmlAttr *attr;
228                 for (attr = ptr->properties; attr; attr = attr->next)
229                     if (!strcmp((const char *)attr->name, "tag"))
230                         ptr_tag = attr->children;
231                     else
232                     {
233                         yaz_marc_cprintf(
234                             mt, "Bad attribute '%.80s' for 'controlfield'",
235                             attr->name);
236                         return -1;
237                     }
238                 if (!ptr_tag)
239                 {
240                     yaz_marc_cprintf(
241                         mt, "Missing attribute 'tag' for 'controlfield'" );
242                     return -1;
243                 }
244                 yaz_marc_add_controlfield_xml(mt, ptr_tag, ptr->children);
245             }
246             else if (!strcmp((const char *) ptr->name, "datafield"))
247             {
248                 char indstr[11]; /* 0(unused), 1,....9, + zero term */
249                 const xmlNode *ptr_tag = 0;
250                 struct _xmlAttr *attr;
251                 int i;
252                 for (i = 0; i<11; i++)
253                     indstr[i] = '\0';
254                 for (attr = ptr->properties; attr; attr = attr->next)
255                     if (!strcmp((const char *)attr->name, "tag"))
256                         ptr_tag = attr->children;
257                     else if (strlen((const char *)attr->name) == 4 &&
258                              !memcmp(attr->name, "ind", 3))
259                     {
260                         int no = atoi((const char *)attr->name+3);
261                         if (attr->children
262                             && attr->children->type == XML_TEXT_NODE)
263                             indstr[no] = attr->children->content[0];
264                     }
265                     else
266                     {
267                         yaz_marc_cprintf(
268                             mt, "Bad attribute '%.80s' for 'datafield'",
269                             attr->name);
270                     }
271                 if (!ptr_tag)
272                 {
273                     yaz_marc_cprintf(
274                         mt, "Missing attribute 'tag' for 'datafield'" );
275                     return -1;
276                 }
277                 /* note that indstr[0] is unused so we use indstr[1..] */
278                 yaz_marc_add_datafield_xml(mt, ptr_tag,
279                                            indstr+1, strlen(indstr+1));
280                 
281                 if (yaz_marc_read_xml_subfields(mt, ptr->children))
282                     return -1;
283             }
284             else
285             {
286                 yaz_marc_cprintf(mt,
287                                  "Expected element controlfield or datafield,"
288                                  " got %.80s", ptr->name);
289                 return -1;
290             }
291         }
292     return 0;
293 }
294
295
296 static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
297 {
298     for(; ptr; ptr = ptr->next)
299         if (ptr->type == XML_ELEMENT_NODE)
300         {
301             if (!strncmp( (const char *) ptr->name, "c", 1))
302             {
303                 NMEM nmem = yaz_marc_get_nmem(mt);
304                 char *tag_value = element_attribute_value_extract(ptr, "tag", nmem);
305                 if (!tag_value)
306                 {
307                     yaz_marc_cprintf(
308                         mt, "Missing attribute 'tag' for 'controlfield'" );
309                     return -1;
310                 }
311                 yaz_marc_add_controlfield_xml2(mt, tag_value, ptr->children);
312             }
313             else if (!strncmp((const char *) ptr->name, "d",1))
314             {
315                 struct _xmlAttr *attr;
316                 NMEM nmem = yaz_marc_get_nmem(mt);
317                 char *tag_value;
318                 char *indstr = nmem_malloc(nmem, 11);  /* 0(unused), 1,....9, + zero term */
319                 int index = 0;
320                 for (index = 0; index < 11; index++)
321                     indstr[index] = '\0';
322                 tag_value = element_attribute_value_extract(ptr, "tag", nmem);
323                 if (!tag_value)
324                 {
325                     yaz_marc_cprintf(
326                         mt, "Missing attribute 'tag' for 'datafield'" );
327                     return -1;
328                 }
329                 for (attr = ptr->properties; attr; attr = attr->next)
330                     if (strlen((const char *)attr->name) == 2 &&
331                         attr->name[0] == 'i')
332                     {
333                         //extract indicator attribute from i#="Y" pattern
334                         int no = atoi((const char *)attr->name+1);
335                         if (attr->children
336                             && attr->children->type == XML_TEXT_NODE)
337                             indstr[no] = attr->children->content[0];
338                     }
339                     else
340                     {
341                         yaz_marc_cprintf(
342                             mt, "Bad attribute '%.80s' for 'datafield'",
343                             attr->name);
344                     }
345                 /* note that indstr[0] is unused so we use indstr[1..] */
346                 yaz_marc_add_datafield_xml2(mt, tag_value, indstr+1);
347                 if (yaz_marc_read_turbo_xml_subfields(mt, ptr->children /*, indstr */))
348                     return -1;
349             }
350             else
351             {
352                 yaz_marc_cprintf(mt,
353                                  "Expected element controlfield or datafield,"
354                                  " got %.80s", ptr->name);
355                 return -1;
356             }
357         }
358     return 0;
359 }
360
361
362 #endif
363
364 #if YAZ_HAVE_XML2
365 int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
366 {
367     int format = 0;
368     yaz_marc_reset(mt);
369     
370     for(; ptr; ptr = ptr->next)
371         if (ptr->type == XML_ELEMENT_NODE)
372         {
373             if (!strcmp((const char *) ptr->name, "record"))
374             {
375                 format = YAZ_MARC_MARCXML;
376                 break;
377             }
378             else if (!strcmp((const char *) ptr->name, "r"))
379             {
380                 format = YAZ_MARC_TURBOMARC;
381                 break;
382             }
383             else
384             {
385                 yaz_marc_cprintf(
386                     mt, "Unknown element '%.80s' in MARC XML reader",
387                     ptr->name);
388                 return -1;
389             }
390         }
391     if (!ptr)
392     {
393         yaz_marc_cprintf(mt, "Missing element 'record' in MARC XML record");
394         return -1;
395     }
396     /* ptr points to record node now */
397     ptr = ptr->children;
398     if (yaz_marc_read_xml_leader(mt, &ptr))
399         return -1;
400     
401     switch (format)
402     {
403     case YAZ_MARC_MARCXML:
404         return yaz_marc_read_xml_fields(mt, ptr->next);
405     case YAZ_MARC_TURBOMARC:
406         return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
407     }
408     return -1;
409 }
410 #endif
411
412
413 /*
414  * Local variables:
415  * c-basic-offset: 4
416  * c-file-style: "Stroustrup"
417  * indent-tabs-mode: nil
418  * End:
419  * vim: shiftwidth=4 tabstop=8 expandtab
420  */
421