Fixed: indicators are now read as attributes rather than as sub-elements. Added XML format...
[yaz-moved-to-github.git] / src / marc_read_xml.c
index b755c2e..1db432c 100644 (file)
@@ -90,6 +90,92 @@ int yaz_marc_read_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
     return 0;
 }
 
+/* Copy a three-character tag out of an element-name suffix.
+ *
+ * name:       NUL-terminated text that follows the one-letter element prefix.
+ * tag_buffer: caller-supplied storage that receives the copy.
+ *
+ * Returns tag_buffer when name is exactly three characters long.  For any
+ * other length a null pointer is returned and tag_buffer is left untouched.
+ */
+const char *tag_value_extract(const char *name, char tag_buffer[5]) {
+    if (strlen(name) != 3)
+        return 0;
+    /* three tag characters plus the terminating NUL */
+    memcpy(tag_buffer, name, 4);
+    return tag_buffer;
+}
+
+/* Map one hexadecimal digit to its value, or -1 if c is not a hex digit. */
+static int hex_digit_value(char c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+    return -1;
+}
+
+/* Decode the code carried in an element-name suffix.
+ *
+ * Two spellings are accepted:
+ *   - a single character, which is the code itself;
+ *   - '-' followed by two or four hexadecimal digits ("-AB" or "-ABCD"),
+ *     where each pair of digits gives one byte of the code.
+ *
+ * name:       NUL-terminated suffix of the element name.
+ * tag_buffer: caller-supplied storage for a decoded hexadecimal code; at
+ *             most two bytes plus the terminating NUL are written.  Its
+ *             contents are unspecified when a null pointer is returned.
+ *
+ * Returns name itself for the one-character form, tag_buffer for the
+ * hexadecimal form, and a null pointer when name matches neither form
+ * (wrong length, missing '-', odd digit count or a non-hex character).
+ */
+const char *code_value_extract(const char *name, char tag_buffer[5]) {
+    size_t length = strlen(name);
+    size_t pairs;
+    size_t index;
+
+    if (length == 1)
+        return name;
+    /* Only "-XX" (3 characters) and "-XXXX" (5 characters) are well formed. */
+    if (name[0] != '-' || (length != 3 && length != 5))
+        return 0;
+    pairs = (length - 1) / 2;
+    for (index = 0; index < pairs; index++) {
+        int high = hex_digit_value(name[1 + 2 * index]);
+        int low = hex_digit_value(name[2 + 2 * index]);
+
+        if (high < 0 || low < 0)
+            return 0;
+        tag_buffer[index] = (char) (unsigned char) (high * 16 + low);
+    }
+    tag_buffer[pairs] = '\0';
+    return tag_buffer;
+}
+
+
+/* Read the subfield elements of one turbo XML datafield.
+ *
+ * mt:  MARC handle that receives the subfields and any diagnostic text.
+ * ptr: first node to examine; the walk continues along its siblings.
+ *
+ * Every element node must have a name that starts with 's'.  The rest of
+ * the name is decoded by code_value_extract() into the subfield code.
+ * That code and the element's text children are concatenated into one
+ * "<code><data>" string, which is handed to yaz_marc_add_subfield()
+ * together with its length.  Nodes that are not elements are skipped.
+ *
+ * Returns 0 on success, or -1 after recording a diagnostic when an element
+ * name does not start with 's' or does not carry a valid code.
+ */
+int yaz_marc_read_turbo_xml_subfields(yaz_marc_t mt, const xmlNode *ptr)
+{
+    NMEM nmem = yaz_marc_get_nmem(mt);
+    for (; ptr; ptr = ptr->next)
+    {
+        const xmlNode *p;
+        const char *code;
+        /* Scratch space only: the decoded code is copied into the
+           arena-allocated string below before this iteration ends. */
+        char code_buf[5];
+        size_t code_len;
+        size_t total_len;
+        char *field;
+        char *out;
+
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        if (ptr->name[0] != 's')
+        {
+            yaz_marc_cprintf(
+                mt, "Expected element 'subfield', got '%.80s'", ptr->name);
+            return -1;
+        }
+        code = code_value_extract((const char *) ptr->name + 1, code_buf);
+        if (!code)
+        {
+            yaz_marc_cprintf(
+                mt, "Missing 'code' value for 'subfield'" );
+            return -1;
+        }
+
+        /* First pass: size of the code plus all text children. */
+        code_len = strlen(code);
+        total_len = code_len;
+        for (p = ptr->children; p; p = p->next)
+            if (p->type == XML_TEXT_NODE && p->content)
+                total_len += strlen((const char *) p->content);
+
+        /* Second pass: build "<code><data>" with a moving write cursor so
+           the buffer is not rescanned for every text node. */
+        field = (char *) nmem_malloc(nmem, total_len + 1);
+        memcpy(field, code, code_len);
+        out = field + code_len;
+        for (p = ptr->children; p; p = p->next)
+            if (p->type == XML_TEXT_NODE && p->content)
+            {
+                size_t chunk_len = strlen((const char *) p->content);
+                memcpy(out, p->content, chunk_len);
+                out += chunk_len;
+            }
+        *out = '\0';
+        yaz_marc_add_subfield(mt, field, total_len);
+    }
+    return 0;
+}
+
+
 static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
 {
     int indicator_length;
@@ -104,7 +190,8 @@ static int yaz_marc_read_xml_leader(yaz_marc_t mt, const xmlNode **ptr_p)
     for(; ptr; ptr = ptr->next)
         if (ptr->type == XML_ELEMENT_NODE)
         {
-            if (!strcmp((const char *) ptr->name, "leader"))
+               if ( !strcmp( (const char *) ptr->name, "leader") ||
+                       (!strncmp((const char *) ptr->name, "l", 1) ))
             {
                 xmlNode *p = ptr->children;
                 for(; p; p = p->next)
@@ -145,7 +232,7 @@ static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
     for(; ptr; ptr = ptr->next)
         if (ptr->type == XML_ELEMENT_NODE)
         {
-            if (!strcmp((const char *) ptr->name, "controlfield"))
+               if (!strcmp( (const char *) ptr->name, "controlfield"))
             {
                 const xmlNode *ptr_tag = 0;
                 struct _xmlAttr *attr;
@@ -215,6 +302,77 @@ static int yaz_marc_read_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
         }
     return 0;
 }
+
+/* Forward declaration; the definition is not in this part of the file. */
+void yaz_marc_add_datafield_turbo_xml(yaz_marc_t mt, char *tag_value, char *indicators);
+
+/* Read the control and data fields of a turbo XML record.
+ *
+ * mt:  MARC handle that receives the fields and any diagnostic text.
+ * ptr: first node to examine; the walk continues along its siblings.
+ *
+ * Element names encode both kind and tag:
+ *   - 'c' + three-character tag: a control field, added together with
+ *     the element's children via yaz_marc_add_controlfield_turbo_xml();
+ *   - 'd' + three-character tag: a data field.  Attributes named i1..i9
+ *     supply the indicators (first character of the attribute value);
+ *     any other attribute is reported as a diagnostic but does not abort
+ *     the read.  The element's children are then read as subfields.
+ * Nodes that are not elements are skipped.
+ *
+ * Returns 0 on success, or -1 after recording a diagnostic when an element
+ * is neither kind, its tag is not three characters long, or reading its
+ * subfields fails.
+ */
+static int yaz_marc_read_turbo_xml_fields(yaz_marc_t mt, const xmlNode *ptr)
+{
+    /* slot 0 is unused, slots 1..9 hold indicators, last slot terminates */
+    enum { INDICATOR_SLOTS = 11, TAG_BUFFER_SIZE = 5 };
+
+    for (; ptr; ptr = ptr->next)
+    {
+        const char *name;
+
+        if (ptr->type != XML_ELEMENT_NODE)
+            continue;
+        name = (const char *) ptr->name;
+        if (name[0] == 'c')
+        {
+            NMEM nmem = yaz_marc_get_nmem(mt);
+            char *tag = (char *) nmem_malloc(nmem, TAG_BUFFER_SIZE);
+
+            if (!tag_value_extract(name + 1, tag))
+            {
+                yaz_marc_cprintf(
+                    mt, "Missing attribute 'tag' for 'controlfield'" );
+                return -1;
+            }
+            yaz_marc_add_controlfield_turbo_xml(mt, tag, ptr->children);
+        }
+        else if (name[0] == 'd')
+        {
+            struct _xmlAttr *attr;
+            NMEM nmem = yaz_marc_get_nmem(mt);
+            char *indstr = (char *) nmem_malloc(nmem, INDICATOR_SLOTS);
+            char *tag = (char *) nmem_malloc(nmem, TAG_BUFFER_SIZE);
+
+            memset(indstr, 0, INDICATOR_SLOTS);
+            /* On success the tag text is in 'tag', which is passed on
+               below as a writable pointer without discarding const. */
+            if (!tag_value_extract(name + 1, tag))
+            {
+                yaz_marc_cprintf(
+                    mt, "Missing attribute 'tag' for 'datafield'" );
+                return -1;
+            }
+            for (attr = ptr->properties; attr; attr = attr->next)
+            {
+                const char *attr_name = (const char *) attr->name;
+
+                /* indicator attributes follow the i#="Y" pattern, # in 1..9 */
+                if (strlen(attr_name) == 2 && attr_name[0] == 'i'
+                    && attr_name[1] >= '1' && attr_name[1] <= '9')
+                {
+                    if (attr->children
+                        && attr->children->type == XML_TEXT_NODE
+                        && attr->children->content)
+                        indstr[attr_name[1] - '0'] =
+                            attr->children->content[0];
+                }
+                else
+                {
+                    yaz_marc_cprintf(
+                        mt, "Bad attribute '%.80s' for 'datafield'",
+                        attr->name);
+                }
+            }
+            /* note that indstr[0] is unused so we use indstr[1..] */
+            yaz_marc_add_datafield_turbo_xml(mt, tag, indstr + 1);
+            if (yaz_marc_read_turbo_xml_subfields(mt, ptr->children))
+                return -1;
+        }
+        else
+        {
+            yaz_marc_cprintf(mt,
+                             "Expected element controlfield or datafield,"
+                             " got %.80s", ptr->name);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+
 #endif
 
 #if YAZ_HAVE_XML2
@@ -225,9 +383,15 @@ int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
     for(; ptr; ptr = ptr->next)
         if (ptr->type == XML_ELEMENT_NODE)
         {
-            if (!strcmp((const char *) ptr->name, "record"))
+                       //TODO Should actually look at the namespace but...
+            if (!strcmp((const char *) ptr->name, "record")) {
+               yaz_marc_set_read_format(mt, YAZ_MARC_MARCXML);
                 break;
-            else
+            }
+            else if (!strcmp((const char *) ptr->name, "r")) {
+               yaz_marc_set_read_format(mt, YAZ_MARC_TMARCXML);
+                break;
+            }
             {
                 yaz_marc_cprintf(
                     mt, "Unknown element '%.80s' in MARC XML reader",
@@ -244,7 +408,14 @@ int yaz_marc_read_xml(yaz_marc_t mt, const xmlNode *ptr)
     ptr = ptr->children;
     if (yaz_marc_read_xml_leader(mt, &ptr))
         return -1;
-    return yaz_marc_read_xml_fields(mt, ptr->next);
+
+    switch (yaz_marc_get_read_format(mt)) {
+               case YAZ_MARC_MARCXML:
+                       return yaz_marc_read_xml_fields(mt, ptr->next);
+               case YAZ_MARC_TMARCXML:
+                       return yaz_marc_read_turbo_xml_fields(mt, ptr->next);
+    }
+       return -1;
 }
 #endif