Found a new place to add turbo marcxml element name "r".
[yaz-moved-to-github.git] / src / marc_read_xml.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file marc_read_xml.c
8  * \brief Implements reading of MARC as XML
9  */
10
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14
15 #ifdef WIN32
16 #include <windows.h>
17 #endif
18
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <yaz/marcdisp.h>
23 #include <yaz/wrbuf.h>
24 #include <yaz/yaz-util.h>
25 #include <yaz/nmem_xml.h>
26
27 #if YAZ_HAVE_XML2
28 #include <libxml/tree.h>
29 #endif
30
31 #if YAZ_HAVE_XML2
32 int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
33 {
34     NMEM nmem = yaz_marc_get_nmem(mt);
35     for (; ptr; ptr = ptr->next)
36     {
37         if (ptr->type == XML_ELEMENT_NODE)
38         {
39             if (!strcmp((const char *) ptr->name, "subfield"))
40             {
41                 size_t ctrl_data_len = 0;
42                 char *ctrl_data_buf = 0;
43                 const xmlNode *p = 0, *ptr_code = 0;
44                 struct _xmlAttr *attr;
45                 for (attr = ptr->properties; attr; attr = attr->next)
46                     if (!strcmp((const char *)attr->name, "code"))
47                         ptr_code = attr->children;
48                     else
49                     {
50                         yaz_marc_cprintf(
51                             mt, "Bad attribute '%.80s' for 'subfield'",
52                             attr->name);
53                         return -1;
54                     }
55                 if (!ptr_code)
56                 {
57                     yaz_marc_cprintf(
58                         mt, "Missing attribute 'code' for 'subfield'" );
59                     return -1;
60                 }
61                 if (ptr_code->type == XML_TEXT_NODE)
62                 {
63                     ctrl_data_len = 
64                         strlen((const char *)ptr_code->content);
65                 }
66                 else
67                 {
68                     yaz_marc_cprintf(
69                         mt, "Missing value for 'code' in 'subfield'" );
70                     return -1;
71                 }
72                 for (p = ptr->children; p ; p = p->next)
73                     if (p->type == XML_TEXT_NODE)
74                         ctrl_data_len += strlen((const char *)p->content);
75                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
76                 strcpy(ctrl_data_buf, (const char *)ptr_code->content);
77                 for (p = ptr->children; p ; p = p->next)
78                     if (p->type == XML_TEXT_NODE)
79                         strcat(ctrl_data_buf, (const char *)p->content);
80                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
81             }
82             else
83             {
84                 yaz_marc_cprintf(
85                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
86                 return -1;
87             }
88         }
89     }
90     return 0;
91 }
92
/**
 * \brief Copies a three character tag into a caller supplied buffer
 * \param name zero-terminated candidate tag
 * \param tag_buffer destination; written only when name has length 3
 * \returns tag_buffer when name is exactly three characters long,
 *          0 otherwise
 */
const char *tag_value_extract(const char *name, char tag_buffer[5])
{
    if (strlen(name) != 3)
        return 0;
    /* three characters plus the terminating zero */
    memcpy(tag_buffer, name, 4);
    return tag_buffer;
}
101
/**
 * \brief Extracts a subfield code from the tail of a turbo element name
 * \param name zero-terminated text following the element prefix
 * \param tag_buffer scratch buffer that receives a decoded code
 * \returns the code string, or 0 if name does not match a known pattern
 *
 * Two patterns are accepted:
 *  - a single character: name itself is returned and tag_buffer is left
 *    untouched;
 *  - '-' followed by 2 to 4 hexadecimal digits ("-AB" or "-ABCD"): each
 *    complete pair of digits is decoded to one byte in tag_buffer, which is
 *    zero-terminated and returned. A trailing unpaired character is ignored.
 *
 * A non-hexadecimal character inside a pair makes the function return 0.
 */
const char *code_value_extract(const char *name, char tag_buffer[5])
{
    size_t length = strlen(name);
    size_t pairs;
    size_t i;

    if (length == 1)
        return name;
    if (length < 3 || length > 5 || name[0] != '-')
        return 0;

    /* digits after the '-' are consumed two at a time; pairs is 1 or 2 */
    pairs = (length - 1) / 2;
    for (i = 0; i < pairs; i++)
    {
        unsigned int value = 0;
        int k;

        for (k = 0; k < 2; k++)
        {
            int ch = (unsigned char) name[1 + 2 * i + k];
            unsigned int digit;

            if (ch >= '0' && ch <= '9')
                digit = (unsigned int) (ch - '0');
            else if (ch >= 'a' && ch <= 'f')
                digit = (unsigned int) (ch - 'a') + 10;
            else if (ch >= 'A' && ch <= 'F')
                digit = (unsigned int) (ch - 'A') + 10;
            else
                return 0;   /* not a hex digit: reject instead of guessing */
            value = value * 16 + digit;
        }
        tag_buffer[i] = (char) (unsigned char) value;
    }
    tag_buffer[pairs] = '\0';
    return tag_buffer;
}
128
129
130 int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
131 {
132     NMEM nmem = yaz_marc_get_nmem(mt);
133     for (; ptr; ptr = ptr->next)
134     {
135         if (ptr->type == XML_ELEMENT_NODE)
136         {
137                 xmlNode *p;
138             if (!strncmp((const char *) ptr->name, "s", 1))
139             {
140                         NMEM nmem = yaz_marc_get_nmem(mt);
141                         char *buffer = (char *) nmem_malloc(nmem, 5);
142                                 const char *tag_value = code_value_extract((ptr->name+1), buffer);
143                 if (!tag_value)
144                 {
145                     yaz_marc_cprintf(
146                         mt, "Missing 'code' value for 'subfield'" );
147                     return -1;
148                 }
149
150                 size_t ctrl_data_len = 0;
151                 char *ctrl_data_buf = 0;
152                                 ctrl_data_len = strlen((const char *) tag_value);
153                                 // Extract (length) from CDATA
154                                 xmlNode *p;
155                                 for (p = ptr->children; p ; p = p->next)
156                     if (p->type == XML_TEXT_NODE)
157                         ctrl_data_len += strlen((const char *)p->content);
158                                 // Allocate memory for code value (1 character (can be multi-byte) and data
159                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
160                 // Build a string with "<Code><data>"
161                 strcpy(ctrl_data_buf, (const char *) tag_value);
162                 for (p = ptr->children; p ; p = p->next)
163                     if (p->type == XML_TEXT_NODE)
164                         strcat(ctrl_data_buf, (const char *)p->content);
165                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
166             }
167             else
168             {
169                 yaz_marc_cprintf(
170                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
171                 return -1;
172             }
173         }
174     }
175     return 0;
176 }
177
178
179 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
180 {
181     int indicator_length;
182     int identifier_length;
183     int base_address;
184     int length_data_entry;
185     int length_starting;
186     int length_implementation;
187     const char *leader = 0;
188     const xmlNode *ptr = *ptr_p;
189
190     for(; ptr; ptr = ptr->next)
191         if (ptr->type == XML_ELEMENT_NODE)
192         {
193                 if ( !strcmp( (const char *) ptr->name, "leader") ||
194                         (!strncmp((const char *) ptr->name, "l", 1) ))
195             {
196                 xmlNode *p = ptr->children;
197                 for(; p; p = p->next)
198                     if (p->type == XML_TEXT_NODE)
199                         leader = (const char *) p->content;
200                 break;
201             }
202             else
203             {
204                 yaz_marc_cprintf(
205                     mt, "Expected element 'leader', got '%.80s'", ptr->name);
206             }
207         }
208     if (!leader)
209     {
210         yaz_marc_cprintf(mt, "Missing element 'leader'");
211         return -1;
212     }
213     if (strlen(leader) != 24)
214     {
215         yaz_marc_cprintf(mt, "Bad length %d of leader data."
216                          " Must have length of 24 characters", strlen(leader));
217         return -1;
218     }
219     yaz_marc_set_leader(mt, leader,
220                         &indicator_length,
221                         &identifier_length,
222                         &base_address,
223                         &length_data_entry,
224                         &length_starting,
225                         &length_implementation);
226     *ptr_p = ptr;
227     return 0;
228 }
229
230 static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
231 {
232     for(; ptr; ptr = ptr->next)
233         if (ptr->type == XML_ELEMENT_NODE)
234         {
235                 if (!strcmp( (const char *) ptr->name, "controlfield"))
236             {
237                 const xmlNode *ptr_tag = 0;
238                 struct _xmlAttr *attr;
239                 for (attr = ptr->properties; attr; attr = attr->next)
240                     if (!strcmp((const char *)attr->name, "tag"))
241                         ptr_tag = attr->children;
242                     else
243                     {
244                         yaz_marc_cprintf(
245                             mt, "Bad attribute '%.80s' for 'controlfield'",
246                             attr->name);
247                         return -1;
248                     }
249                 if (!ptr_tag)
250                 {
251                     yaz_marc_cprintf(
252                         mt, "Missing attribute 'tag' for 'controlfield'" );
253                     return -1;
254                 }
255                 yaz_marc_add_controlfield_xml(mt, ptr_tag, ptr->children);
256             }
257             else if (!strcmp((const char *) ptr->name, "datafield"))
258             {
259                 char indstr[11]; /* 0(unused), 1,....9, + zero term */
260                 const xmlNode *ptr_tag = 0;
261                 struct _xmlAttr *attr;
262                 int i;
263                 for (i = 0; i<11; i++)
264                     indstr[i] = '\0';
265                 for (attr = ptr->properties; attr; attr = attr->next)
266                     if (!strcmp((const char *)attr->name, "tag"))
267                         ptr_tag = attr->children;
268                     else if (strlen((const char *)attr->name) == 4 &&
269                              !memcmp(attr->name, "ind", 3))
270                     {
271                         int no = atoi((const char *)attr->name+3);
272                         if (attr->children
273                             && attr->children->type == XML_TEXT_NODE)
274                             indstr[no] = attr->children->content[0];
275                     }
276                     else
277                     {
278                         yaz_marc_cprintf(
279                             mt, "Bad attribute '%.80s' for 'datafield'",
280                             attr->name);
281                     }
282                 if (!ptr_tag)
283                 {
284                     yaz_marc_cprintf(
285                         mt, "Missing attribute 'tag' for 'datafield'" );
286                     return -1;
287                 }
288                 /* note that indstr[0] is unused so we use indstr[1..] */
289                 yaz_marc_add_datafield_xml(mt, ptr_tag,
290                                            indstr+1, strlen(indstr+1));
291                 
292                 if (yaz_marc_read_xml_subfields(mt, ptr->children))
293                     return -1;
294             }
295             else
296             {
297                 yaz_marc_cprintf(mt,
298                                  "Expected element controlfield or datafield,"
299                                  " got %.80s", ptr->name);
300                 return -1;
301             }
302         }
303     return 0;
304 }
305
306 void yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value, char *indicators);
307
308 static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
309 {
310     for(; ptr; ptr = ptr->next)
311         if (ptr->type == XML_ELEMENT_NODE)
312         {
313                 if (!strncmp( (const char *) ptr->name, "c", 1))
314             {
315                         NMEM nmem = yaz_marc_get_nmem(mt);
316                         char *buffer = (char *) nmem_malloc(nmem, 5);
317                         const char *tag_value = tag_value_extract((const char *)(ptr->name+1), buffer);
318                 if (!tag_value)
319                 {
320                     yaz_marc_cprintf(
321                         mt, "Missing attribute 'tag' for 'controlfield'" );
322                     return -1;
323                 }
324                 yaz_marc_add_controlfield_turbo_xml(mt, tag_value, ptr->children);
325             }
326             else if (!strncmp((const char *) ptr->name, "d",1))
327             {
328                 struct _xmlAttr *attr;
329                         NMEM nmem = yaz_marc_get_nmem(mt);
330                 char *indstr = nmem_malloc(nmem, 11);  /* 0(unused), 1,....9, + zero term */
331                 int index = 0;
332                 for (index = 0; index < 11; index++)
333                                         indstr[index] = '\0';
334                         char *buffer = (char *) nmem_malloc(nmem, 5);
335                                 char *tag_value = tag_value_extract(ptr->name+1, buffer);
336                 if (!tag_value)
337                                 {
338                     yaz_marc_cprintf(
339                         mt, "Missing attribute 'tag' for 'datafield'" );
340                     return -1;
341                 }
342                 for (attr = ptr->properties; attr; attr = attr->next)
343                     if (strlen((const char *)attr->name) == 2 &&
344                              attr->name[0] == 'i')
345                     {
346                         //extract indicator attribute from i#="Y" pattern
347                         int no = atoi((const char *)attr->name+1);
348                         if (attr->children
349                             && attr->children->type == XML_TEXT_NODE)
350                             indstr[no] = attr->children->content[0];
351                     }
352                     else
353                     {
354                         yaz_marc_cprintf(
355                             mt, "Bad attribute '%.80s' for 'datafield'",
356                             attr->name);
357                     }
358                 /* note that indstr[0] is unused so we use indstr[1..] */
359                 yaz_marc_add_datafield_turbo_xml(mt, tag_value, indstr+1);
360                 int rc = yaz_marc_read_turbo_xml_subfields(mt, ptr->children /*, indstr */);
361                 if (rc)
362                     return -1;
363             }
364             else
365             {
366                 yaz_marc_cprintf(mt,
367                                  "Expected element controlfield or datafield,"
368                                  " got %.80s", ptr->name);
369                 return -1;
370             }
371         }
372     return 0;
373 }
374
375
376 #endif
377
378 #if YAZ_HAVE_XML2
379 int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
380 {
381     yaz_marc_reset(mt);
382
383     for(; ptr; ptr = ptr->next)
384         if (ptr->type == XML_ELEMENT_NODE)
385         {
386                         //TODO Should actually look at the namespace but...
387             if (!strcmp((const char *) ptr->name, "record")) {
388                 yaz_marc_set_read_format(mt, YAZ_MARC_MARCXML);
389                 break;
390             }
391             else if (!strcmp((const char *) ptr->name, "r")) {
392                 yaz_marc_set_read_format(mt, YAZ_MARC_TMARCXML);
393                 break;
394             }
395             {
396                 yaz_marc_cprintf(
397                     mt, "Unknown element '%.80s' in MARC XML reader",
398                     ptr->name);
399                 return -1;
400             }
401         }
402     if (!ptr)
403     {
404         yaz_marc_cprintf(mt, "Missing element 'record' in MARC XML record");
405         return -1;
406     }
407     /* ptr points to record node now */
408     ptr = ptr->children;
409     if (yaz_marc_read_xml_leader(mt, &ptr))
410         return -1;
411
412     switch (yaz_marc_get_read_format(mt)) {
413                 case YAZ_MARC_MARCXML:
414                         return yaz_marc_read_xml_fields(mt, ptr->next);
415                 case YAZ_MARC_TMARCXML:
416                         return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
417     }
418         return -1;
419 }
420 #endif
421
422
423 /*
424  * Local variables:
425  * c-basic-offset: 4
426  * c-file-style: "Stroustrup"
427  * indent-tabs-mode: nil
428  * End:
429  * vim: shiftwidth=4 tabstop=8 expandtab
430  */
431