3f3e05bf24dcf0293e5ccd3a8ce729fdcefc7c90
[yaz-moved-to-github.git] / src / marc_read_xml.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file marc_read_xml.c
8  * \brief Implements reading of MARC as XML
9  */
10
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14
15 #ifdef WIN32
16 #include <windows.h>
17 #endif
18
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <yaz/marcdisp.h>
23 #include <yaz/wrbuf.h>
24 #include <yaz/yaz-util.h>
25 #include <yaz/nmem_xml.h>
26
27 #if YAZ_HAVE_XML2
28 #include <libxml/tree.h>
29 #endif
30
31 #if YAZ_HAVE_XML2
32 int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
33 {
34     NMEM nmem = yaz_marc_get_nmem(mt);
35     for (; ptr; ptr = ptr->next)
36     {
37         if (ptr->type == XML_ELEMENT_NODE)
38         {
39             if (!strcmp((const char *) ptr->name, "subfield"))
40             {
41                 size_t ctrl_data_len = 0;
42                 char *ctrl_data_buf = 0;
43                 const xmlNode *p = 0, *ptr_code = 0;
44                 struct _xmlAttr *attr;
45                 for (attr = ptr->properties; attr; attr = attr->next)
46                     if (!strcmp((const char *)attr->name, "code"))
47                         ptr_code = attr->children;
48                     else
49                     {
50                         yaz_marc_cprintf(
51                             mt, "Bad attribute '%.80s' for 'subfield'",
52                             attr->name);
53                         return -1;
54                     }
55                 if (!ptr_code)
56                 {
57                     yaz_marc_cprintf(
58                         mt, "Missing attribute 'code' for 'subfield'" );
59                     return -1;
60                 }
61                 if (ptr_code->type == XML_TEXT_NODE)
62                 {
63                     ctrl_data_len = 
64                         strlen((const char *)ptr_code->content);
65                 }
66                 else
67                 {
68                     yaz_marc_cprintf(
69                         mt, "Missing value for 'code' in 'subfield'" );
70                     return -1;
71                 }
72                 for (p = ptr->children; p ; p = p->next)
73                     if (p->type == XML_TEXT_NODE)
74                         ctrl_data_len += strlen((const char *)p->content);
75                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
76                 strcpy(ctrl_data_buf, (const char *)ptr_code->content);
77                 for (p = ptr->children; p ; p = p->next)
78                     if (p->type == XML_TEXT_NODE)
79                         strcat(ctrl_data_buf, (const char *)p->content);
80                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
81             }
82             else
83             {
84                 yaz_marc_cprintf(
85                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
86                 return -1;
87             }
88         }
89     }
90     return 0;
91 }
92
/*
 * Copies name into tag_buffer when name is exactly three characters long.
 * Returns tag_buffer on success, or 0 (buffer untouched) for any other
 * length.
 */
const char *tag_value_extract(const char *name, char tag_buffer[5]) {
        if (strlen(name) != 3)
                return 0;
        strcpy(tag_buffer, name);
        return tag_buffer;
}
101
102 // Given a xmlNode ptr,  extract a value from either a element name or from a given attribute
103 const char *element_attribute_value_extract(const xmlNode *ptr, const char *attribute_name, NMEM nmem) {
104
105         const char *name = ptr->name;
106         size_t length = strlen(name);
107         if (length > 1 ) {
108                 return nmem_strdup(nmem, name+1);
109         }
110         // TODO Extract from attribute where matches attribute_name
111         xmlAttr *attr;
112     for (attr = ptr->properties; attr; attr = attr->next)
113         if (!strcmp((const char *)attr->name, attribute_name)) {
114                 return nmem_text_node_cdata(attr->children, nmem);
115         }
116         return 0;
117 }
118
119
120 int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
121 {
122     NMEM nmem = yaz_marc_get_nmem(mt);
123     for (; ptr; ptr = ptr->next)
124     {
125         if (ptr->type == XML_ELEMENT_NODE)
126         {
127                 xmlNode *p;
128             if (!strncmp((const char *) ptr->name, "s", 1))
129             {
130                         NMEM nmem = yaz_marc_get_nmem(mt);
131                         char *buffer = (char *) nmem_malloc(nmem, 5);
132                                 const char *tag_value = element_attribute_value_extract(ptr, "code", nmem);
133                 if (!tag_value)
134                 {
135                     yaz_marc_cprintf(
136                         mt, "Missing 'code' value for 'subfield'" );
137                     return -1;
138                 }
139
140                 size_t ctrl_data_len = 0;
141                 char *ctrl_data_buf = 0;
142                                 ctrl_data_len = strlen((const char *) tag_value);
143                                 // Extract (length) from CDATA
144                                 xmlNode *p;
145                                 for (p = ptr->children; p ; p = p->next)
146                     if (p->type == XML_TEXT_NODE)
147                         ctrl_data_len += strlen((const char *)p->content);
148                                 // Allocate memory for code value (1 character (can be multi-byte) and data
149                 ctrl_data_buf = (char *) nmem_malloc(nmem, ctrl_data_len+1);
150                 // Build a string with "<Code><data>"
151                 strcpy(ctrl_data_buf, (const char *) tag_value);
152                 for (p = ptr->children; p ; p = p->next)
153                     if (p->type == XML_TEXT_NODE)
154                         strcat(ctrl_data_buf, (const char *)p->content);
155                 yaz_marc_add_subfield(mt, ctrl_data_buf, ctrl_data_len);
156             }
157             else
158             {
159                 yaz_marc_cprintf(
160                     mt, "Expected element 'subfield', got '%.80s'", ptr->name);
161                 return -1;
162             }
163         }
164     }
165     return 0;
166 }
167
168
169 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
170 {
171     int indicator_length;
172     int identifier_length;
173     int base_address;
174     int length_data_entry;
175     int length_starting;
176     int length_implementation;
177     const char *leader = 0;
178     const xmlNode *ptr = *ptr_p;
179
180     for(; ptr; ptr = ptr->next)
181         if (ptr->type == XML_ELEMENT_NODE)
182         {
183                 if ( !strcmp( (const char *) ptr->name, "leader") ||
184                         (!strncmp((const char *) ptr->name, "l", 1) ))
185             {
186                 xmlNode *p = ptr->children;
187                 for(; p; p = p->next)
188                     if (p->type == XML_TEXT_NODE)
189                         leader = (const char *) p->content;
190                 break;
191             }
192             else
193             {
194                 yaz_marc_cprintf(
195                     mt, "Expected element 'leader', got '%.80s'", ptr->name);
196             }
197         }
198     if (!leader)
199     {
200         yaz_marc_cprintf(mt, "Missing element 'leader'");
201         return -1;
202     }
203     if (strlen(leader) != 24)
204     {
205         yaz_marc_cprintf(mt, "Bad length %d of leader data."
206                          " Must have length of 24 characters", strlen(leader));
207         return -1;
208     }
209     yaz_marc_set_leader(mt, leader,
210                         &indicator_length,
211                         &identifier_length,
212                         &base_address,
213                         &length_data_entry,
214                         &length_starting,
215                         &length_implementation);
216     *ptr_p = ptr;
217     return 0;
218 }
219
220 static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
221 {
222     for(; ptr; ptr = ptr->next)
223         if (ptr->type == XML_ELEMENT_NODE)
224         {
225                 if (!strcmp( (const char *) ptr->name, "controlfield"))
226             {
227                 const xmlNode *ptr_tag = 0;
228                 struct _xmlAttr *attr;
229                 for (attr = ptr->properties; attr; attr = attr->next)
230                     if (!strcmp((const char *)attr->name, "tag"))
231                         ptr_tag = attr->children;
232                     else
233                     {
234                         yaz_marc_cprintf(
235                             mt, "Bad attribute '%.80s' for 'controlfield'",
236                             attr->name);
237                         return -1;
238                     }
239                 if (!ptr_tag)
240                 {
241                     yaz_marc_cprintf(
242                         mt, "Missing attribute 'tag' for 'controlfield'" );
243                     return -1;
244                 }
245                 yaz_marc_add_controlfield_xml(mt, ptr_tag, ptr->children);
246             }
247             else if (!strcmp((const char *) ptr->name, "datafield"))
248             {
249                 char indstr[11]; /* 0(unused), 1,....9, + zero term */
250                 const xmlNode *ptr_tag = 0;
251                 struct _xmlAttr *attr;
252                 int i;
253                 for (i = 0; i<11; i++)
254                     indstr[i] = '\0';
255                 for (attr = ptr->properties; attr; attr = attr->next)
256                     if (!strcmp((const char *)attr->name, "tag"))
257                         ptr_tag = attr->children;
258                     else if (strlen((const char *)attr->name) == 4 &&
259                              !memcmp(attr->name, "ind", 3))
260                     {
261                         int no = atoi((const char *)attr->name+3);
262                         if (attr->children
263                             && attr->children->type == XML_TEXT_NODE)
264                             indstr[no] = attr->children->content[0];
265                     }
266                     else
267                     {
268                         yaz_marc_cprintf(
269                             mt, "Bad attribute '%.80s' for 'datafield'",
270                             attr->name);
271                     }
272                 if (!ptr_tag)
273                 {
274                     yaz_marc_cprintf(
275                         mt, "Missing attribute 'tag' for 'datafield'" );
276                     return -1;
277                 }
278                 /* note that indstr[0] is unused so we use indstr[1..] */
279                 yaz_marc_add_datafield_xml(mt, ptr_tag,
280                                            indstr+1, strlen(indstr+1));
281                 
282                 if (yaz_marc_read_xml_subfields(mt, ptr->children))
283                     return -1;
284             }
285             else
286             {
287                 yaz_marc_cprintf(mt,
288                                  "Expected element controlfield or datafield,"
289                                  " got %.80s", ptr->name);
290                 return -1;
291             }
292         }
293     return 0;
294 }
295
296 void yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value, char *indicators);
297
298 static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
299 {
300     for(; ptr; ptr = ptr->next)
301         if (ptr->type == XML_ELEMENT_NODE)
302         {
303                 if (!strncmp( (const char *) ptr->name, "c", 1))
304             {
305                         NMEM nmem = yaz_marc_get_nmem(mt);
306                         const char *tag_value = element_attribute_value_extract(ptr, "tag", nmem);
307                 if (!tag_value)
308                 {
309                     yaz_marc_cprintf(
310                         mt, "Missing attribute 'tag' for 'controlfield'" );
311                     return -1;
312                 }
313                 yaz_marc_add_controlfield_turbo_xml(mt, tag_value, ptr->children);
314             }
315             else if (!strncmp((const char *) ptr->name, "d",1))
316             {
317                 struct _xmlAttr *attr;
318                         NMEM nmem = yaz_marc_get_nmem(mt);
319                 char *indstr = nmem_malloc(nmem, 11);  /* 0(unused), 1,....9, + zero term */
320                 int index = 0;
321                 for (index = 0; index < 11; index++)
322                                         indstr[index] = '\0';
323                         const char *tag_value = element_attribute_value_extract(ptr, "tag", nmem);
324                 if (!tag_value)
325                                 {
326                     yaz_marc_cprintf(
327                         mt, "Missing attribute 'tag' for 'datafield'" );
328                     return -1;
329                 }
330                 for (attr = ptr->properties; attr; attr = attr->next)
331                     if (strlen((const char *)attr->name) == 2 &&
332                              attr->name[0] == 'i')
333                     {
334                         //extract indicator attribute from i#="Y" pattern
335                         int no = atoi((const char *)attr->name+1);
336                         if (attr->children
337                             && attr->children->type == XML_TEXT_NODE)
338                             indstr[no] = attr->children->content[0];
339                     }
340                     else
341                     {
342                         yaz_marc_cprintf(
343                             mt, "Bad attribute '%.80s' for 'datafield'",
344                             attr->name);
345                     }
346                 /* note that indstr[0] is unused so we use indstr[1..] */
347                 yaz_marc_add_datafield_turbo_xml(mt, tag_value, indstr+1);
348                 int rc = yaz_marc_read_turbo_xml_subfields(mt, ptr->children /*, indstr */);
349                 if (rc)
350                     return -1;
351             }
352             else
353             {
354                 yaz_marc_cprintf(mt,
355                                  "Expected element controlfield or datafield,"
356                                  " got %.80s", ptr->name);
357                 return -1;
358             }
359         }
360     return 0;
361 }
362
363
364 #endif
365
366 #if YAZ_HAVE_XML2
367 int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
368 {
369     yaz_marc_reset(mt);
370
371     for(; ptr; ptr = ptr->next)
372         if (ptr->type == XML_ELEMENT_NODE)
373         {
374                         //TODO Should actually look at the namespace but...
375             if (!strcmp((const char *) ptr->name, "record")) {
376                 yaz_marc_set_read_format(mt, YAZ_MARC_MARCXML);
377                 break;
378             }
379             else if (!strcmp((const char *) ptr->name, "r")) {
380                 yaz_marc_set_read_format(mt, YAZ_MARC_TMARCXML);
381                 break;
382             }
383             {
384                 yaz_marc_cprintf(
385                     mt, "Unknown element '%.80s' in MARC XML reader",
386                     ptr->name);
387                 return -1;
388             }
389         }
390     if (!ptr)
391     {
392         yaz_marc_cprintf(mt, "Missing element 'record' in MARC XML record");
393         return -1;
394     }
395     /* ptr points to record node now */
396     ptr = ptr->children;
397     if (yaz_marc_read_xml_leader(mt, &ptr))
398         return -1;
399
400     switch (yaz_marc_get_read_format(mt)) {
401                 case YAZ_MARC_MARCXML:
402                         return yaz_marc_read_xml_fields(mt, ptr->next);
403                 case YAZ_MARC_TMARCXML:
404                         return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
405     }
406         return -1;
407 }
408 #endif
409
410
411 /*
412  * Local variables:
413  * c-basic-offset: 4
414  * c-file-style: "Stroustrup"
415  * indent-tabs-mode: nil
416  * End:
417  * vim: shiftwidth=4 tabstop=8 expandtab
418  */
419