Added turbo marcxml read and write
[yaz-moved-to-github.git] / src / marc_read_xml.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file marc_read_xml.c
8  * \brief Implements reading of MARC as XML
9  */
10
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14
15 #ifdef WIN32
16 #include <windows.h>
17 #endif
18
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <yaz/marcdisp.h>
23 #include <yaz/wrbuf.h>
24 #include <yaz/yaz-util.h>
25 #include <yaz/nmem_xml.h>
26
27 #if YAZ_HAVE_XML2
28 #include <libxml/tree.h>
29 #endif
30
31 #if YAZ_HAVE_XML2
32 int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
33 {
34     NMEM nmem = yaz_marc_get_nmem(mt);
35     for (; ptr; ptr = ptr->next)
36     {
37         if (ptr->type == XML_ELEMENT_NODE)
38         {
39             if (!strcmp((const char *) ptr->name, "subfield"))
40             {
41                 size_t ctrl_data_len = 0;
42                 char *ctrl_data_buf = 0;
43                 const xmlNode *p = 0, *ptr_code = 0;
44                 struct _xmlAttr *attr;
45                 for (attr = ptr->properties; attr; attr = attr->next)
46                     if (!strcmp((const char *)attr->name, "code"))
47                         ptr_code = attr->children;
48                     else
49                     {
50                         yaz_marc_cprintf(
51                             mt, "Bad attribute '%.80s' for 'subfield'",
52                             attr->name);
53                         return -1;
54                     }
55                 if (!ptr_code)
56                 {
57                     yaz_marc_cprintf(
58                         mt, "Missing attribute 'code' for 'subfield'" );
59                     return -1;
60                 }
61                 if (ptr_code->type == XML_TEXT_NODE)
62                 {
63                     ctrl_data_len = 
64                         strlen((const char *)ptr_code->content);
65                 }
66                 else
67                 {
68                     yaz_marc_cprintf(
69                         mt, "Missing value for 'code' in 'subfield'" );
70                     return -1;
71                 }
72                 for (p = ptr->children; p ; p = p->next)
73                     if (p->type == XML_TEXT_NODE)
74                         ctrl_data_len += strlen((const char *)p->content);
75                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
76                 strcpy(ctrl_data_buf, (const char *)ptr_code->content);
77                 for (p = ptr->children; p ; p = p->next)
78                     if (p->type == XML_TEXT_NODE)
79                         strcat(ctrl_data_buf, (const char *)p->content);
80                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
81             }
82             else
83             {
84                 yaz_marc_cprintf(
85                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
86                 return -1;
87             }
88         }
89     }
90     return 0;
91 }
92
/*
 * Copies a tag of exactly three characters from name into tag_buffer.
 * Returns tag_buffer on success, or 0 (leaving tag_buffer untouched) when
 * name has any other length.
 */
const char *tag_value_extract(const char *name, char tag_buffer[5])
{
    if (strlen(name) != 3)
        return 0;
    /* three characters plus the terminating NUL */
    memcpy(tag_buffer, name, 4);
    return tag_buffer;
}
101
/*
 * Extracts a subfield code from the tail of an element name.
 *
 * Accepted forms of name:
 *   - exactly one character: returned as is (name itself is returned and
 *     tag_buffer is not written);
 *   - '-' followed by two hexadecimal digits, optionally followed by one
 *     further character which is ignored: the byte denoted by the two
 *     digits is stored in tag_buffer as a one-character string and
 *     tag_buffer is returned.
 *
 * Returns 0 for every other input, including non-hexadecimal digits and
 * an encoded NUL byte ("-00"), which cannot be represented as a code in a
 * C string.
 *
 * NOTE(review): the original comment described the pattern as "-AB[CD]",
 * but names of length 5 have always been rejected here, so only a single
 * encoded byte is ever decoded - confirm whether two bytes were intended.
 */
const char *code_value_extract(const char *name, char tag_buffer[5])
{
    size_t length = strlen(name);
    unsigned int value = 0;
    int i;

    if (length == 1)
        return name;
    if (length < 3 || length > 4 || name[0] != '-')
        return 0;

    /* decode the two hex digits by hand: this avoids scanning an
       unterminated scratch buffer and lets bad digits be rejected */
    for (i = 1; i <= 2; i++)
    {
        char c = name[i];
        unsigned int digit;

        if (c >= '0' && c <= '9')
            digit = (unsigned int) (c - '0');
        else if (c >= 'a' && c <= 'f')
            digit = (unsigned int) (c - 'a') + 10;
        else if (c >= 'A' && c <= 'F')
            digit = (unsigned int) (c - 'A') + 10;
        else
            return 0;
        value = value * 16 + digit;
    }
    if (value == 0)
        return 0;
    tag_buffer[0] = (char) (unsigned char) value;
    tag_buffer[1] = '\0';
    return tag_buffer;
}
128
129
130 int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr, char indicators[11])
131 {
132     NMEM nmem = yaz_marc_get_nmem(mt);
133     for (; ptr; ptr = ptr->next)
134     {
135         if (ptr->type == XML_ELEMENT_NODE)
136         {
137                 xmlNode *p;
138                 if (!strncmp((const char *) ptr->name, "i", 1)) {
139                 int length = strlen(ptr->name+1);
140                 if (length > 0) {
141                         int index = (int)strtol(ptr->name+1, (char **)NULL, 10);
142                                 for (p = ptr->children; p ; p = p->next)
143                         if (p->type == XML_TEXT_NODE) {
144                             indicators[index] = ((const char *)p->content)[0];
145                             break;
146                         }
147                 }
148             }
149             else if (!strncmp((const char *) ptr->name, "s", 1))
150             {
151                         NMEM nmem = yaz_marc_get_nmem(mt);
152                         char *buffer = (char *) nmem_malloc(nmem, 5);
153                                 const char *tag_value = code_value_extract((ptr->name+1), buffer);
154                 if (!tag_value)
155                 {
156                     yaz_marc_cprintf(
157                         mt, "Missing 'code' value for 'subfield'" );
158                     return -1;
159                 }
160
161                 size_t ctrl_data_len = 0;
162                 char *ctrl_data_buf = 0;
163                                 ctrl_data_len = strlen((const char *) tag_value);
164                                 // Extract (length) from CDATA
165                                 xmlNode *p;
166                                 for (p = ptr->children; p ; p = p->next)
167                     if (p->type == XML_TEXT_NODE)
168                         ctrl_data_len += strlen((const char *)p->content);
169                                 // Allocate memory for code value (1 character (can be multi-byte) and data
170                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
171                 // Build a string with "<Code><data>"
172                 strcpy(ctrl_data_buf, (const char *) tag_value);
173                 for (p = ptr->children; p ; p = p->next)
174                     if (p->type == XML_TEXT_NODE)
175                         strcat(ctrl_data_buf, (const char *)p->content);
176                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
177             }
178             else
179             {
180                 yaz_marc_cprintf(
181                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
182                 return -1;
183             }
184         }
185     }
186     return 0;
187 }
188
189
190 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
191 {
192     int indicator_length;
193     int identifier_length;
194     int base_address;
195     int length_data_entry;
196     int length_starting;
197     int length_implementation;
198     const char *leader = 0;
199     const xmlNode *ptr = *ptr_p;
200
201     for(; ptr; ptr = ptr->next)
202         if (ptr->type == XML_ELEMENT_NODE)
203         {
204                 if ( !strcmp( (const char *) ptr->name, "leader") ||
205                         (!strncmp((const char *) ptr->name, "l", 1) ))
206             {
207                 xmlNode *p = ptr->children;
208                 for(; p; p = p->next)
209                     if (p->type == XML_TEXT_NODE)
210                         leader = (const char *) p->content;
211                 break;
212             }
213             else
214             {
215                 yaz_marc_cprintf(
216                     mt, "Expected element 'leader', got '%.80s'", ptr->name);
217             }
218         }
219     if (!leader)
220     {
221         yaz_marc_cprintf(mt, "Missing element 'leader'");
222         return -1;
223     }
224     if (strlen(leader) != 24)
225     {
226         yaz_marc_cprintf(mt, "Bad length %d of leader data."
227                          " Must have length of 24 characters", strlen(leader));
228         return -1;
229     }
230     yaz_marc_set_leader(mt, leader,
231                         &indicator_length,
232                         &identifier_length,
233                         &base_address,
234                         &length_data_entry,
235                         &length_starting,
236                         &length_implementation);
237     *ptr_p = ptr;
238     return 0;
239 }
240
241 static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
242 {
243     for(; ptr; ptr = ptr->next)
244         if (ptr->type == XML_ELEMENT_NODE)
245         {
246                 if (!strcmp( (const char *) ptr->name, "controlfield"))
247             {
248                 const xmlNode *ptr_tag = 0;
249                 struct _xmlAttr *attr;
250                 for (attr = ptr->properties; attr; attr = attr->next)
251                     if (!strcmp((const char *)attr->name, "tag"))
252                         ptr_tag = attr->children;
253                     else
254                     {
255                         yaz_marc_cprintf(
256                             mt, "Bad attribute '%.80s' for 'controlfield'",
257                             attr->name);
258                         return -1;
259                     }
260                 if (!ptr_tag)
261                 {
262                     yaz_marc_cprintf(
263                         mt, "Missing attribute 'tag' for 'controlfield'" );
264                     return -1;
265                 }
266                 yaz_marc_add_controlfield_xml(mt, ptr_tag, ptr->children);
267             }
268             else if (!strcmp((const char *) ptr->name, "datafield"))
269             {
270                 char indstr[11]; /* 0(unused), 1,....9, + zero term */
271                 const xmlNode *ptr_tag = 0;
272                 struct _xmlAttr *attr;
273                 int i;
274                 for (i = 0; i<11; i++)
275                     indstr[i] = '\0';
276                 for (attr = ptr->properties; attr; attr = attr->next)
277                     if (!strcmp((const char *)attr->name, "tag"))
278                         ptr_tag = attr->children;
279                     else if (strlen((const char *)attr->name) == 4 &&
280                              !memcmp(attr->name, "ind", 3))
281                     {
282                         int no = atoi((const char *)attr->name+3);
283                         if (attr->children
284                             && attr->children->type == XML_TEXT_NODE)
285                             indstr[no] = attr->children->content[0];
286                     }
287                     else
288                     {
289                         yaz_marc_cprintf(
290                             mt, "Bad attribute '%.80s' for 'datafield'",
291                             attr->name);
292                     }
293                 if (!ptr_tag)
294                 {
295                     yaz_marc_cprintf(
296                         mt, "Missing attribute 'tag' for 'datafield'" );
297                     return -1;
298                 }
299                 /* note that indstr[0] is unused so we use indstr[1..] */
300                 yaz_marc_add_datafield_xml(mt, ptr_tag,
301                                            indstr+1, strlen(indstr+1));
302                 
303                 if (yaz_marc_read_xml_subfields(mt, ptr->children))
304                     return -1;
305             }
306             else
307             {
308                 yaz_marc_cprintf(mt,
309                                  "Expected element controlfield or datafield,"
310                                  " got %.80s", ptr->name);
311                 return -1;
312             }
313         }
314     return 0;
315 }
316
317 struct yaz_marc_node* yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, const char *tag_value);
318
319 static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
320 {
321     for(; ptr; ptr = ptr->next)
322         if (ptr->type == XML_ELEMENT_NODE)
323         {
324                 if (!strncmp( (const char *) ptr->name, "c", 1))
325             {
326                         NMEM nmem = yaz_marc_get_nmem(mt);
327                         char *buffer = (char *) nmem_malloc(nmem, 5);
328                         //Extract the tag value out of the rest of the element name
329                         const char *tag_value = tag_value_extract((const char *)(ptr->name+1), buffer);
330                 if (!tag_value)
331                 {
332                     yaz_marc_cprintf(
333                         mt, "Missing attribute 'tag' for 'controlfield'" );
334                     return -1;
335                 }
336                 yaz_marc_add_controlfield_turbo_xml(mt, tag_value, ptr->children);
337                 //wrbuf_destroy(tag_value);
338             }
339             else if (!strncmp((const char *) ptr->name, "d",1))
340             {
341                         NMEM nmem = yaz_marc_get_nmem(mt);
342                 char *indstr = nmem_malloc(nmem, 11);  /* 0(unused), 1,....9, + zero term */
343                         char *buffer = (char *) nmem_malloc(nmem, 5);
344                                 const char *tag_value = tag_value_extract(ptr->name+1, buffer);
345                 if (!tag_value)
346                                 {
347                     yaz_marc_cprintf(
348                         mt, "Missing attribute 'tag' for 'datafield'" );
349                     return -1;
350                 }
351                 /* note that indstr[0] is unused so we use indstr[1..] */
352                 struct yaz_marc_node *n = yaz_marc_add_datafield_turbo_xml(mt, tag_value);
353
354                 int rc = yaz_marc_read_turbo_xml_subfields(mt, ptr->children, indstr);
355                 yaz_marc_datafield_set_indicators(n, indstr+1, strlen(indstr+1));
356                 if (rc)
357                     return -1;
358             }
359             else
360             {
361                 yaz_marc_cprintf(mt,
362                                  "Expected element controlfield or datafield,"
363                                  " got %.80s", ptr->name);
364                 return -1;
365             }
366         }
367     return 0;
368 }
369
370
371 #endif
372
373 #if YAZ_HAVE_XML2
374 int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
375 {
376     yaz_marc_reset(mt);
377
378     for(; ptr; ptr = ptr->next)
379         if (ptr->type == XML_ELEMENT_NODE)
380         {
381             if (!strcmp((const char *) ptr->name, "record"))
382                 break;
383             else
384             {
385                 yaz_marc_cprintf(
386                     mt, "Unknown element '%.80s' in MARC XML reader",
387                     ptr->name);
388                 return -1;
389             }
390         }
391     if (!ptr)
392     {
393         yaz_marc_cprintf(mt, "Missing element 'record' in MARC XML record");
394         return -1;
395     }
396     /* ptr points to record node now */
397     ptr = ptr->children;
398     if (yaz_marc_read_xml_leader(mt, &ptr))
399         return -1;
400
401     switch (yaz_marc_get_read_format(mt)) {
402                 case YAZ_MARC_MARCXML:
403                         return yaz_marc_read_xml_fields(mt, ptr->next);
404                 case YAZ_MARC_TMARCXML:
405                         return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
406     }
407         return -1;
408 }
409 #endif
410
411
412 /*
413  * Local variables:
414  * c-basic-offset: 4
415  * c-file-style: "Stroustrup"
416  * indent-tabs-mode: nil
417  * End:
418  * vim: shiftwidth=4 tabstop=8 expandtab
419  */
420