f448b5b1ad4719af331cee8bff843c35471504e5
[yaz-moved-to-github.git] / src / marc_read_xml.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file marc_read_xml.c
8  * \brief Implements reading of MARC as XML
9  */
10
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14
15 #ifdef WIN32
16 #include <windows.h>
17 #endif
18
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <yaz/marcdisp.h>
23 #include <yaz/wrbuf.h>
24 #include <yaz/yaz-util.h>
25 #include <yaz/nmem_xml.h>
26
27 #if YAZ_HAVE_XML2
28 #include <libxml/tree.h>
29 #endif
30
31 #if YAZ_HAVE_XML2
32 int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
33 {
34     NMEM nmem = yaz_marc_get_nmem(mt);
35     for (; ptr; ptr = ptr->next)
36     {
37         if (ptr->type == XML_ELEMENT_NODE)
38         {
39             if (!strcmp((const char *) ptr->name, "subfield"))
40             {
41                 size_t ctrl_data_len = 0;
42                 char *ctrl_data_buf = 0;
43                 const xmlNode *p = 0, *ptr_code = 0;
44                 struct _xmlAttr *attr;
45                 for (attr = ptr->properties; attr; attr = attr->next)
46                     if (!strcmp((const char *)attr->name, "code"))
47                         ptr_code = attr->children;
48                     else
49                     {
50                         yaz_marc_cprintf(
51                             mt, "Bad attribute '%.80s' for 'subfield'",
52                             attr->name);
53                         return -1;
54                     }
55                 if (!ptr_code)
56                 {
57                     yaz_marc_cprintf(
58                         mt, "Missing attribute 'code' for 'subfield'" );
59                     return -1;
60                 }
61                 if (ptr_code->type == XML_TEXT_NODE)
62                 {
63                     ctrl_data_len = 
64                         strlen((const char *)ptr_code->content);
65                 }
66                 else
67                 {
68                     yaz_marc_cprintf(
69                         mt, "Missing value for 'code' in 'subfield'" );
70                     return -1;
71                 }
72                 for (p = ptr->children; p ; p = p->next)
73                     if (p->type == XML_TEXT_NODE)
74                         ctrl_data_len += strlen((const char *)p->content);
75                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
76                 strcpy(ctrl_data_buf, (const char *)ptr_code->content);
77                 for (p = ptr->children; p ; p = p->next)
78                     if (p->type == XML_TEXT_NODE)
79                         strcat(ctrl_data_buf, (const char *)p->content);
80                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
81             }
82             else
83             {
84                 yaz_marc_cprintf(
85                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
86                 return -1;
87             }
88         }
89     }
90     return 0;
91 }
92
/**
 * Copies name into tag_buffer when name is exactly three characters long.
 *
 * Returns tag_buffer on success, or 0 (leaving tag_buffer untouched) when
 * name has any other length.
 */
const char *tag_value_extract(const char *name, char tag_buffer[5])
{
    if (strlen(name) != 3)
        return 0;

    /* three characters plus the terminating NUL */
    memcpy(tag_buffer, name, 4);
    return tag_buffer;
}
103
104 // Given a xmlNode ptr,  extract a value from either a element name or from a given attribute
105 const char *element_attribute_value_extract(const xmlNode *ptr,
106                                             const char *attribute_name,
107                                             NMEM nmem)
108 {
109
110     const char *name = ptr->name;
111     size_t length = strlen(name);
112     if (length > 1 )
113         return nmem_strdup(nmem, name+1);
114     // TODO Extract from attribute where matches attribute_name
115     xmlAttr *attr;
116     for (attr = ptr->properties; attr; attr = attr->next)
117         if (!strcmp((const char *)attr->name, attribute_name))
118             return nmem_text_node_cdata(attr->children, nmem);
119     return 0;
120 }
121
122
123 int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
124 {
125     NMEM nmem = yaz_marc_get_nmem(mt);
126     for (; ptr; ptr = ptr->next)
127     {
128         if (ptr->type == XML_ELEMENT_NODE)
129         {
130             xmlNode *p;
131             if (!strncmp((const char *) ptr->name, "s", 1))
132             {
133                 NMEM nmem = yaz_marc_get_nmem(mt);
134                 char *buffer = (char *) nmem_malloc(nmem, 5);
135                 const char *tag_value = element_attribute_value_extract(ptr, "code", nmem);
136                 if (!tag_value)
137                 {
138                     yaz_marc_cprintf(
139                         mt, "Missing 'code' value for 'subfield'" );
140                     return -1;
141                 }
142
143                 size_t ctrl_data_len = 0;
144                 char *ctrl_data_buf = 0;
145                 ctrl_data_len = strlen((const char *) tag_value);
146                 // Extract (length) from CDATA
147                 xmlNode *p;
148                 for (p = ptr->children; p ; p = p->next)
149                     if (p->type == XML_TEXT_NODE)
150                         ctrl_data_len += strlen((const char *)p->content);
151                 // Allocate memory for code value (1 character (can be multi-byte) and data
152                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
153                 // Build a string with "<Code><data>"
154                 strcpy(ctrl_data_buf, (const char *) tag_value);
155                 for (p = ptr->children; p ; p = p->next)
156                     if (p->type == XML_TEXT_NODE)
157                         strcat(ctrl_data_buf, (const char *)p->content);
158                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
159             }
160             else
161             {
162                 yaz_marc_cprintf(
163                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
164                 return -1;
165             }
166         }
167     }
168     return 0;
169 }
170
171
172 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
173 {
174     int indicator_length;
175     int identifier_length;
176     int base_address;
177     int length_data_entry;
178     int length_starting;
179     int length_implementation;
180     const char *leader = 0;
181     const xmlNode *ptr = *ptr_p;
182
183     for(; ptr; ptr = ptr->next)
184         if (ptr->type == XML_ELEMENT_NODE)
185         {
186             if ( !strcmp( (const char *) ptr->name, "leader") ||
187                  (!strncmp((const char *) ptr->name, "l", 1) ))
188             {
189                 xmlNode *p = ptr->children;
190                 for(; p; p = p->next)
191                     if (p->type == XML_TEXT_NODE)
192                         leader = (const char *) p->content;
193                 break;
194             }
195             else
196             {
197                 yaz_marc_cprintf(
198                     mt, "Expected element 'leader', got '%.80s'", ptr->name);
199             }
200         }
201     if (!leader)
202     {
203         yaz_marc_cprintf(mt, "Missing element 'leader'");
204         return -1;
205     }
206     if (strlen(leader) != 24)
207     {
208         yaz_marc_cprintf(mt, "Bad length %d of leader data."
209                          " Must have length of 24 characters", strlen(leader));
210         return -1;
211     }
212     yaz_marc_set_leader(mt, leader,
213                         &indicator_length,
214                         &identifier_length,
215                         &base_address,
216                         &length_data_entry,
217                         &length_starting,
218                         &length_implementation);
219     *ptr_p = ptr;
220     return 0;
221 }
222
223 static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
224 {
225     for(; ptr; ptr = ptr->next)
226         if (ptr->type == XML_ELEMENT_NODE)
227         {
228             if (!strcmp( (const char *) ptr->name, "controlfield"))
229             {
230                 const xmlNode *ptr_tag = 0;
231                 struct _xmlAttr *attr;
232                 for (attr = ptr->properties; attr; attr = attr->next)
233                     if (!strcmp((const char *)attr->name, "tag"))
234                         ptr_tag = attr->children;
235                     else
236                     {
237                         yaz_marc_cprintf(
238                             mt, "Bad attribute '%.80s' for 'controlfield'",
239                             attr->name);
240                         return -1;
241                     }
242                 if (!ptr_tag)
243                 {
244                     yaz_marc_cprintf(
245                         mt, "Missing attribute 'tag' for 'controlfield'" );
246                     return -1;
247                 }
248                 yaz_marc_add_controlfield_xml(mt, ptr_tag, ptr->children);
249             }
250             else if (!strcmp((const char *) ptr->name, "datafield"))
251             {
252                 char indstr[11]; /* 0(unused), 1,....9, + zero term */
253                 const xmlNode *ptr_tag = 0;
254                 struct _xmlAttr *attr;
255                 int i;
256                 for (i = 0; i<11; i++)
257                     indstr[i] = '\0';
258                 for (attr = ptr->properties; attr; attr = attr->next)
259                     if (!strcmp((const char *)attr->name, "tag"))
260                         ptr_tag = attr->children;
261                     else if (strlen((const char *)attr->name) == 4 &&
262                              !memcmp(attr->name, "ind", 3))
263                     {
264                         int no = atoi((const char *)attr->name+3);
265                         if (attr->children
266                             && attr->children->type == XML_TEXT_NODE)
267                             indstr[no] = attr->children->content[0];
268                     }
269                     else
270                     {
271                         yaz_marc_cprintf(
272                             mt, "Bad attribute '%.80s' for 'datafield'",
273                             attr->name);
274                     }
275                 if (!ptr_tag)
276                 {
277                     yaz_marc_cprintf(
278                         mt, "Missing attribute 'tag' for 'datafield'" );
279                     return -1;
280                 }
281                 /* note that indstr[0] is unused so we use indstr[1..] */
282                 yaz_marc_add_datafield_xml(mt, ptr_tag,
283                                            indstr+1, strlen(indstr+1));
284                 
285                 if (yaz_marc_read_xml_subfields(mt, ptr->children))
286                     return -1;
287             }
288             else
289             {
290                 yaz_marc_cprintf(mt,
291                                  "Expected element controlfield or datafield,"
292                                  " got %.80s", ptr->name);
293                 return -1;
294             }
295         }
296     return 0;
297 }
298
299 void yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value, char *indicators);
300
301 static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
302 {
303     for(; ptr; ptr = ptr->next)
304         if (ptr->type == XML_ELEMENT_NODE)
305         {
306             if (!strncmp( (const char *) ptr->name, "c", 1))
307             {
308                 NMEM nmem = yaz_marc_get_nmem(mt);
309                 const char *tag_value = element_attribute_value_extract(ptr, "tag", nmem);
310                 if (!tag_value)
311                 {
312                     yaz_marc_cprintf(
313                         mt, "Missing attribute 'tag' for 'controlfield'" );
314                     return -1;
315                 }
316                 yaz_marc_add_controlfield_turbo_xml(mt, tag_value, ptr->children);
317             }
318             else if (!strncmp((const char *) ptr->name, "d",1))
319             {
320                 struct _xmlAttr *attr;
321                 NMEM nmem = yaz_marc_get_nmem(mt);
322                 char *indstr = nmem_malloc(nmem, 11);  /* 0(unused), 1,....9, + zero term */
323                 int index = 0;
324                 for (index = 0; index < 11; index++)
325                     indstr[index] = '\0';
326                 const char *tag_value = element_attribute_value_extract(ptr, "tag", nmem);
327                 if (!tag_value)
328                 {
329                     yaz_marc_cprintf(
330                         mt, "Missing attribute 'tag' for 'datafield'" );
331                     return -1;
332                 }
333                 for (attr = ptr->properties; attr; attr = attr->next)
334                     if (strlen((const char *)attr->name) == 2 &&
335                         attr->name[0] == 'i')
336                     {
337                         //extract indicator attribute from i#="Y" pattern
338                         int no = atoi((const char *)attr->name+1);
339                         if (attr->children
340                             && attr->children->type == XML_TEXT_NODE)
341                             indstr[no] = attr->children->content[0];
342                     }
343                     else
344                     {
345                         yaz_marc_cprintf(
346                             mt, "Bad attribute '%.80s' for 'datafield'",
347                             attr->name);
348                     }
349                 /* note that indstr[0] is unused so we use indstr[1..] */
350                 yaz_marc_add_datafield_turbo_xml(mt, tag_value, indstr+1);
351                 int rc = yaz_marc_read_turbo_xml_subfields(mt, ptr->children /*, indstr */);
352                 if (rc)
353                     return -1;
354             }
355             else
356             {
357                 yaz_marc_cprintf(mt,
358                                  "Expected element controlfield or datafield,"
359                                  " got %.80s", ptr->name);
360                 return -1;
361             }
362         }
363     return 0;
364 }
365
366
367 #endif
368
369 #if YAZ_HAVE_XML2
370 int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
371 {
372     yaz_marc_reset(mt);
373
374     for(; ptr; ptr = ptr->next)
375         if (ptr->type == XML_ELEMENT_NODE)
376         {
377             //TODO Should actually look at the namespace but...
378             if (!strcmp((const char *) ptr->name, "record"))
379             {
380                 yaz_marc_set_read_format(mt, YAZ_MARC_MARCXML);
381                 break;
382             }
383             else if (!strcmp((const char *) ptr->name, "r"))
384             {
385                 yaz_marc_set_read_format(mt, YAZ_MARC_TMARCXML);
386                 break;
387             }
388             {
389                 yaz_marc_cprintf(
390                     mt, "Unknown element '%.80s' in MARC XML reader",
391                     ptr->name);
392                 return -1;
393             }
394         }
395     if (!ptr)
396     {
397         yaz_marc_cprintf(mt, "Missing element 'record' in MARC XML record");
398         return -1;
399     }
400     /* ptr points to record node now */
401     ptr = ptr->children;
402     if (yaz_marc_read_xml_leader(mt, &ptr))
403         return -1;
404
405     switch (yaz_marc_get_read_format(mt))
406     {
407     case YAZ_MARC_MARCXML:
408         return yaz_marc_read_xml_fields(mt, ptr->next);
409     case YAZ_MARC_TMARCXML:
410         return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
411     }
412     return -1;
413 }
414 #endif
415
416
417 /*
418  * Local variables:
419  * c-basic-offset: 4
420  * c-file-style: "Stroustrup"
421  * indent-tabs-mode: nil
422  * End:
423  * vim: shiftwidth=4 tabstop=8 expandtab
424  */
425