X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fmod_dom.c;h=d36c7e80a4576bc28fdaa088bc0c9a48f5cbd9ff;hb=85c5e09eacc5c89eda6e1ffa6f039fa1e8dd7221;hp=d42d80b0ecaad8fd5e1943a3cfbc838bb69e0172;hpb=97dc097858772a66c8e90e8b07f77c9c20450131;p=idzebra-moved-to-github.git

diff --git a/index/mod_dom.c b/index/mod_dom.c
index d42d80b..d36c7e8 100644
--- a/index/mod_dom.c
+++ b/index/mod_dom.c
@@ -1,4 +1,4 @@
-/* $Id: mod_dom.c,v 1.1 2007-02-07 12:08:54 adam Exp $
+/* $Id: mod_dom.c,v 1.7 2007-02-14 15:42:24 marc Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -101,6 +101,9 @@ struct filter_info {
 #define XML_STRCMP(a,b)   strcmp((char*)a, b)
 #define XML_STRLEN(a) strlen((char*)a)
 
+
+
+
 static void set_param_str(const char **params, const char *name,
 			  const char *value, ODR odr)
 {
@@ -614,6 +617,12 @@ static int ioclose_ex(void *context)
     return 0;
 }
 
+
+/* Alvis style indexing: namespace URI of the zebra indexing elements */
+#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
+static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
+
+/* Alvis style indexing */
 static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
 			xmlNodePtr ptr,	RecWord *recWord)
 {
@@ -628,11 +637,7 @@ static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
     }
 }
 
-#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
-
-
-static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
-
+/* Alvis style indexing */
 static void index_node(struct filter_info *tinfo,  struct recExtractCtrl *ctrl,
 		       xmlNodePtr ptr, RecWord *recWord)
 {
@@ -676,6 +681,7 @@ static void index_node(struct filter_info *tinfo,  struct recExtractCtrl *ctrl,
     }
 }
 
+/* Alvis style indexing */
 static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
 			 xmlNodePtr ptr, RecWord *recWord)
 {
@@ -717,14 +723,330 @@ static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
          yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", 
                  type_str);
 }
+
+
+/* Alvis style indexing: optionally echo the document, then index its root */
+static void extract_doc_alvis(struct filter_info *tinfo,
+                              struct recExtractCtrl *recctr, xmlDocPtr doc)
+{
+    RecWord recWord;
+    xmlNodePtr root_ptr;
+
+    if (!doc)
+        return;
+    (*recctr->init)(recctr, &recWord);
+    if (recctr->flagShowRecords)
+    {
+        /* serialize the whole document and write it to stdout */
+        xmlChar *buf_out;
+        int len_out;
+        xmlDocDumpMemory(doc, &buf_out, &len_out);
+        fwrite(buf_out, len_out, 1, stdout);
+        xmlFree(buf_out);
+    }
+    root_ptr = xmlDocGetRootElement(doc);
+    if (root_ptr)
+        index_record(tinfo, recctr, root_ptr, &recWord);
+    else
+        yaz_log(YLOG_WARN, "No root for index XML record");
+}
+
+
+/* DOM filter style indexing: fetch the text of attr if it is called name */
+static int attr_content_xml(struct _xmlAttr *attr, const char *name,
+                        xmlChar **dst_content)
+{
+    /* yields 1 and stores the content on a match; 0 and no store otherwise */
+    if (0 != XML_STRCMP(attr->name, name))
+        return 0;
+    if (!attr->children || attr->children->type != XML_TEXT_NODE)
+        return 0;
+    *dst_content = attr->children->content;
+    return 1;
+}
+
+/* DOM filter style indexing: namespace below is disabled, Alvis one is used */
+/* #define ZEBRA_XSLT_NS "http://indexdata.com/zebra-2.0" */
+/* static const char *zebra_xslt_ns = ZEBRA_XSLT_NS; */
+
+/* DOM filter style indexing: node name a PI must carry to be parsed */
+#define ZEBRA_PI_NAME "zebra-2.0"
+static const char *zebra_pi_name = ZEBRA_PI_NAME;
+
+
+/* DOM filter style indexing: print node text for each index name/type pair */
+void index_value_of(xmlNodePtr node, xmlChar * index_p){
+  /* index_p: blank separated list of "name" or "name:type" items */
+  xmlChar *text = xmlNodeGetContent(node);
+
+  xmlChar *look = index_p;
+  xmlChar *bval;
+  size_t len;
+
+  xmlChar index[256];
+  xmlChar type[256];
+
+  /* parsing all index name/type pairs - may not start with ' ' or ':' */
+  while (*look && ' ' != *look && ':' != *look){
+    /* setting name and type to zero */
+    *index = '\0';
+    *type = '\0';
+    
+    /* parsing one index name */
+    bval = look;
+    while (*look && ':' != *look && ' ' != *look){
+      look++;
+    }
+    /* names longer than the buffer are truncated, never written past it */
+    len = (size_t)(look - bval);
+    if (len >= sizeof(index))
+      len = sizeof(index) - 1;
+    memcpy(index, bval, len);
+    index[len] = '\0';
+    
+    /* parsing one index type, if existing */
+    if (':' == *look){
+      look++;
+      
+      bval = look;
+      while (*look && ' ' != *look){
+        look++;
+      }
+      /* same truncation rule as for the name */
+      len = (size_t)(look - bval);
+      if (len >= sizeof(type))
+        len = sizeof(type) - 1;
+      memcpy(type, bval, len);
+      type[len] = '\0';
+    }
+
+    /* xmlNodeGetContent() may return NULL, which "%s" must not receive */
+    printf("INDEX  '%s:%s' '%s'\n", index, type, text ? (char *)text : "");
+    
+    /* step over one separating blank unless it is the last character */
+    if (*look && ' ' == *look && *(look+1)){
+      look++;
+    } 
+  }
+
+  xmlFree(text);
+}
+
+
+/* DOM filter style indexing: print record info, "" for null pointers */
+void set_record_info(xmlChar * id_p, xmlChar * rank_p, xmlChar * action_p){
+  printf("RECORD id=%s rank=%s action=%s\n", id_p ? (char *)id_p : "",
+         rank_p ? (char *)rank_p : "", action_p ? (char *)action_p : "");
+}
+
+/* DOM filter style indexing: act on <index> and <record> zebra elements */
+void process_xml_element_zebra_node(xmlNodePtr node, xmlChar **record_p)
+{
+  /* anything but an element in the zebra namespace is left alone;
+     record_p is not used yet */
+  if (node->type == XML_ELEMENT_NODE 
+      && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_xslt_ns)){
     
-static int extract_doc(struct filter_info *tinfo, struct filter_input *input,
-                       struct recExtractCtrl *p, xmlDocPtr doc)
+    struct _xmlAttr *attr;
+    /* xmlGetNodePath() allocates the string: fetch once, free once below */
+    xmlChar *path = xmlGetNodePath(node);
+
+    if (0 == XML_STRCMP(node->name, "index")){
+      xmlChar *index_p = 0;
+
+      /* every @name attribute triggers indexing of the node content */
+      for (attr = node->properties; attr; attr = attr->next){
+        if (attr_content_xml(attr, "name", &index_p)){
+          index_value_of(node, index_p);        
+        }  
+        else
+          printf("dom filter: %s bad attribute @%s, expected @name\n",
+                  path, attr->name);
+      }
+    }
+    else if (0 == XML_STRCMP(node->name, "record")){
+      xmlChar *id_p = 0;
+      xmlChar *rank_p = 0;
+      xmlChar *action_p = 0;
+
+      for (attr = node->properties; attr; attr = attr->next){
+        if (attr_content_xml(attr, "id", &id_p))
+          ;
+        else if (attr_content_xml(attr, "rank", &rank_p))
+          ;
+        else if (attr_content_xml(attr, "action", &action_p)){
+          /* complain once, and on the action attribute itself */
+          if (action_p && 0 != strcmp("update", (const char *)action_p))
+            printf("dom filter: %s attribute @%s,"
+                   " only implemented '@action=\"update\"\n",
+                   path, attr->name);
+        }
+        else
+          printf("dom filter: %s bad attribute @%s,"
+                 " expected @id|@rank|@action\n",
+                 path, attr->name);
+      }
+
+      /* attributes that were not given are passed on as null pointers */
+      set_record_info(id_p, rank_p, action_p);
+    } else {
+      printf("dom filter: %s bad element <%s>,"
+             " expected <record>|<index> in namespace '%s'\n",
+             path, node->name, zebra_xslt_ns);
+    }
+
+    xmlFree(path);
+  }
+}
+
+
+/* DOM filter style indexing: parse a "record" or "index" instruction PI */
+void process_xml_pi_node(xmlNodePtr node, xmlChar **record_pp, 
+                        xmlChar **index_pp)
 {
-    RecWord recWord;
-    const char *params[10];
+  /* xmlGetNodePath() allocates the string; it is freed before returning */
+  xmlChar *path = xmlGetNodePath(node);
+
+  printf("PI     %s\n", path);
+  /* if right PI name, continue parsing PI; one without content is skipped */
+  if (0 == strcmp(zebra_pi_name, (const char *)node->name) && node->content){
+    xmlChar *pi_p =  node->content;
+    xmlChar *look = pi_p;
+    xmlChar *bval;
+    size_t len;
+
+    /* parsing PI record instructions */
+    if (0 == strncmp((const char *)look, "record", 6)){
+      xmlChar id[256];
+      xmlChar rank[256];
+      *id = '\0';
+      *rank = '\0';
+      look += 6;
+
+      /* eat whitespace */
+      while (*look && ' ' == *look && *(look+1))
+        look++;
+
+      /* parse possible id */
+      if (*look && 0 == strncmp((const char *)look, "id=", 3)){
+        look += 3;
+        bval = look;
+        while (*look && ' ' != *look)
+          look++;
+        /* truncate over-long values instead of overrunning the buffer */
+        len = (size_t)(look - bval);
+        if (len >= sizeof(id))
+          len = sizeof(id) - 1;
+        memcpy(id, bval, len);
+        id[len] = '\0';
+      }
+
+      /* eat whitespace */
+      while (*look && ' ' == *look && *(look+1))
+        look++;
+
+      /* parse possible rank */
+      if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
+        /* skip exactly the five characters of "rank=" */
+        look += 5;
+        bval = look;
+        while (*look && ' ' != *look)
+          look++;
+        len = (size_t)(look - bval);
+        if (len >= sizeof(rank))
+          len = sizeof(rank) - 1;
+        memcpy(rank, bval, len);
+        rank[len] = '\0';
+      }
+
+      /* eat whitespace */
+      while (*look && ' ' == *look && *(look+1))
+        look++;
+
+      /* anything left over at this point could not be parsed */
+      if ('\0' != *look){
+        printf ("ERROR %s: content '%s'; can not parse '%s'\n", 
+                path, pi_p, look);
+      } else {
+        /* an action is not parsed from the PI, so none is passed on */
+        set_record_info(id, rank, 0);
+      }
+    }
+    /* parsing index instruction */
+    else if (0 == strncmp((const char *)look, "index", 5)){
+      look += 5;
+      /* eat whitespace */
+      while (*look && ' ' == *look && *(look+1))
+        look++;
+
+      /* export index instructions to outside; points into node->content */
+      *index_pp = look;
+    } else {
+      /* neither record nor index */
+      printf ("ERROR %s: content '%s'; can not parse '%s'\n", 
+              path, pi_p, look);
+    }
+  }
+  xmlFree(path);
+}
+
+/* DOM filter style indexing: visit an element and, recursively, its kids */
+void process_xml_element_node(xmlNodePtr node, xmlChar **record_pp)
+{
+  /* remember indexing instruction from PI to next element node */
+  xmlChar *index_p = 0;
+  xmlNodePtr kid;
+  /* xmlGetNodePath() returns an allocated string the caller has to free */
+  xmlChar *path = xmlGetNodePath(node);
+
+  printf("ELEM   %s\n", path);
+  xmlFree(path);
+
+  /* check if we are an element node in the special zebra namespace 
+     and either set record data or index value-of node content*/
+  process_xml_element_zebra_node(node, record_pp);
+  /* loop through kid nodes; only PI and element nodes are of interest */
+  for (kid = node->children; kid; kid = kid->next){
+    /* check and set PI record and index index instructions */
+    if (kid->type == XML_PI_NODE){
+      process_xml_pi_node(kid, record_pp, &index_p);
+    }
+    else if (kid->type == XML_ELEMENT_NODE){
+      /* if there was a PI index instruction before this element node */
+      if (index_p){
+        index_value_of(kid, index_p);            
+        index_p = 0;
+      }
+      process_xml_element_node(kid, record_pp);
+    }
+  }
+}
+
+
+
+/* DOM filter style indexing: entry point, starts the walk at the document */
+void process_xml_doc_node(xmlDocPtr doc)
+{
+    xmlChar *record_pp = 0;  /* no record data yet; never hand on garbage */
+    xmlChar *path = xmlGetNodePath((xmlNodePtr)doc);  /* allocated string */
+    printf("DOC    %s\n", path);
+    xmlFree(path);
+    process_xml_element_node((xmlNodePtr)doc, &record_pp);
+}
+
+
+
+
+static int convert_extract_doc(struct filter_info *tinfo, 
+                               struct filter_input *input,
+                               struct recExtractCtrl *p, 
+                               xmlDocPtr doc)
+
+{
+    /* RecWord recWord; */
     xmlChar *buf_out;
     int len_out;
+    const char *params[10];
     xsltStylesheetPtr last_xsp = 0;
     xmlDocPtr store_doc = 0;
 
@@ -734,8 +1056,6 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input,
     /* input conversion */
     perform_convert(tinfo, input->convert, params, &doc, 0);
 
-    (*p->init)(p, &recWord);
-
     if (tinfo->store)
     {
         /* store conversion */
@@ -759,24 +1079,12 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input,
 
     /* extract conversion */
     perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
-    if (doc)
-    {
-        xmlNodePtr root_ptr;
-	if (p->flagShowRecords)
-	{
-	    xmlDocDumpMemory(doc, &buf_out, &len_out);
-	    fwrite(buf_out, len_out, 1, stdout);
-	    xmlFree(buf_out);
-	}
-	root_ptr = xmlDocGetRootElement(doc);
-	if (root_ptr)
-	    index_record(tinfo, p, root_ptr, &recWord);
-        else
-        {
-	    yaz_log(YLOG_WARN, "No root for index XML record");
-        }
+
+    if (doc){
+        extract_doc_alvis(tinfo, p, doc);
 	xmlFreeDoc(doc);
-    }    
+    }
+
     return RECCTRL_EXTRACT_OK;
 }
 
@@ -794,7 +1102,8 @@ static int extract_xml_split(struct filter_info *tinfo,
                                                    p /* I/O handler */,
                                                    0 /* URL */, 
                                                    0 /* encoding */,
-                                                   XML_PARSE_XINCLUDE);
+                                                   XML_PARSE_XINCLUDE|
+                                                   XML_PARSE_NOENT);
     }
     if (!input->u.xmlreader.reader)
 	return RECCTRL_EXTRACT_ERROR_GENERIC;
@@ -815,7 +1124,7 @@ static int extract_xml_split(struct filter_info *tinfo,
                 
                 xmlDocSetRootElement(doc, ptr2);
                 
-                return extract_doc(tinfo, input, p, doc);
+                return convert_extract_doc(tinfo, input, p, doc);
             }
             else
             {
@@ -840,12 +1149,12 @@ static int extract_xml_full(struct filter_info *tinfo,
         xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */,
                                   0 /* URL */,
                                   0 /* encoding */,
-                                  XML_PARSE_XINCLUDE);
+                                  XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
         if (!doc)
         {
             return RECCTRL_EXTRACT_ERROR_GENERIC;
         }
-        return extract_doc(tinfo, input, p, doc);
+        return convert_extract_doc(tinfo, input, p, doc);
     }
     else
         return RECCTRL_EXTRACT_EOF;
@@ -899,7 +1208,7 @@ static int extract_iso2709(struct filter_info *tinfo,
         yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
         rdoc = xmlNewDoc((const xmlChar*) "1.0");
         xmlDocSetRootElement(rdoc, root_ptr);
-        return extract_doc(tinfo, input, p, rdoc);        
+        return convert_extract_doc(tinfo, input, p, rdoc);        
     }
     return RECCTRL_EXTRACT_OK;
 }
@@ -995,7 +1304,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
     doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
 		    0 /* URL */,
 		    0 /* encoding */,
-		    XML_PARSE_XINCLUDE);
+		    XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
     if (!doc)
     {
 	p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;