continued hooking in tinfo and recctr, still need to do real indexing

[idzebra-moved-to-github.git] / index / mod_dom.c
diff --git a/index/mod_dom.c b/index/mod_dom.c

index d42d80b..0d7a99d 100644 (file)
--- a/index/mod_dom.c
+++ b/index/mod_dom.c
@@ -1,4 +1,4 @@
-/* $Id: mod_dom.c,v 1.1 2007-02-07 12:08:54 adam Exp $
+/* $Id: mod_dom.c,v 1.8 2007-02-14 16:16:15 marc Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -101,6 +101,9 @@ struct filter_info {
  #define XML_STRCMP(a,b)   strcmp((char*)a, b)
  #define XML_STRLEN(a) strlen((char*)a)
  
+
+
+
  static void set_param_str(const char **params, const char *name,
                           const char *value, ODR odr)
  {
@@ -614,6 +617,12 @@ static int ioclose_ex(void *context)
      return 0;
  }
  
+
+/* Alvis style indexing */
+/* Namespace URI of the zebra indexing vocabulary.  Elements are recognised
+   as indexing instructions by comparing their namespace href against
+   zebra_xslt_ns. */
+#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
+static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
+
+/* Alvis style indexing */
  static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
                         xmlNodePtr ptr, RecWord *recWord)
  {
@@ -628,11 +637,7 @@ static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
      }
  }
  
-#define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
-
-
-static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
-
+/* Alvis style indexing */
  static void index_node(struct filter_info *tinfo,  struct recExtractCtrl *ctrl,
                        xmlNodePtr ptr, RecWord *recWord)
  {
@@ -676,6 +681,7 @@ static void index_node(struct filter_info *tinfo,  struct recExtractCtrl *ctrl,
      }
  }
  
+/* Alvis style indexing */
  static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
                          xmlNodePtr ptr, RecWord *recWord)
  {
@@ -717,14 +723,348 @@ static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
           yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", 
                   type_str);
  }
+
+
+/* Alvis style indexing */
+/*
+ * Index a document through index_record().
+ *
+ * Does nothing when doc is NULL.  Otherwise a RecWord is initialised via
+ * recctr->init, the serialised document is echoed to stdout when
+ * recctr->flagShowRecords is set, and the root element is handed to
+ * index_record().  A document without root element is only logged.
+ * The document itself is not freed here.
+ */
+static void extract_doc_alvis(struct filter_info *tinfo, 
+                              struct recExtractCtrl *recctr, 
+                              xmlDocPtr doc)
+{
+    RecWord word;
+    xmlNodePtr root;
+
+    if (!doc)
+        return;
+
+    (*recctr->init)(recctr, &word);
+
+    if (recctr->flagShowRecords)
+    {
+        xmlChar *dump;
+        int dump_len;
+
+        xmlDocDumpMemory(doc, &dump, &dump_len);
+        fwrite(dump, dump_len, 1, stdout);
+        xmlFree(dump);
+    }
+
+    root = xmlDocGetRootElement(doc);
+    if (!root)
+    {
+        yaz_log(YLOG_WARN, "No root for index XML record");
+        return;
+    }
+    index_record(tinfo, recctr, root, &word);
+}
+
+
+/* DOM filter style indexing */
+/*
+ * Look up the text value of an attribute by name.
+ *
+ * Returns 1 and stores a pointer to the attribute's text content in
+ * *dst_content when attr is called `name` and its first child is a text
+ * node.  Returns 0 and leaves *dst_content untouched otherwise.  The
+ * stored pointer refers to the node's own content; nothing is copied.
+ */
+static int attr_content_xml(struct _xmlAttr *attr, const char *name,
+                        xmlChar **dst_content)
+{
+    xmlNodePtr value_node;
+
+    if (XML_STRCMP(attr->name, name) != 0)
+        return 0;
+
+    value_node = attr->children;
+    if (!value_node || value_node->type != XML_TEXT_NODE)
+        return 0;
+
+    *dst_content = value_node->content;
+    return 1;
+}
+
+/* DOM filter style indexing */
+/* #define ZEBRA_XSLT_NS "http://indexdata.com/zebra-2.0" */
+/* static const char *zebra_xslt_ns = ZEBRA_XSLT_NS; */
+
+/* DOM filter style indexing */
+/* Target name of the processing instructions that carry "record" and
+   "index" instructions; PI nodes with any other name are ignored. */
+#define ZEBRA_PI_NAME "zebra-2.0"
+static const char *zebra_pi_name = ZEBRA_PI_NAME;
+
+
+/* DOM filter style indexing */
+/*
+ * Report the text content of `node` once for every index named in the
+ * specification `index_p`.
+ *
+ * index_p is a list of "name" or "name:type" items separated by single
+ * blanks.  Parsing stops at the end of the string or at the first item
+ * that would start with ' ' or ':'.  Real indexing is not hooked up yet:
+ * each name/type/text triple is only printed to stdout.
+ *
+ * tinfo and recctr are not used yet.  A NULL index_p is ignored.  Names
+ * and types that do not fit the local buffers are truncated rather than
+ * written past the end of the buffer.
+ *
+ * TODO: fill a RecWord (term_buf, term_len, index_name, index_type) and
+ * pass it on to the extract control instead of printing.
+ */
+void index_value_of(struct filter_info *tinfo, 
+                    struct recExtractCtrl *recctr, 
+                    xmlNodePtr node, 
+                    xmlChar * index_p)
+{
+    xmlChar *text;
+    xmlChar *look = index_p;
+    xmlChar index[256];
+    xmlChar type[256];
+
+    if (!index_p)
+        return;
+
+    /* may be NULL; allocated by libxml2 and released below */
+    text = xmlNodeGetContent(node);
+
+    /* parsing all index name/type pairs - may not start with ' ' or ':' */
+    while (*look && ' ' != *look && ':' != *look){
+        xmlChar *bval;
+        size_t len;
+
+        /* parsing one index name */
+        bval = look;
+        while (*look && ':' != *look && ' ' != *look)
+            look++;
+        len = (size_t)(look - bval);
+        if (len >= sizeof(index))
+            len = sizeof(index) - 1;
+        memcpy(index, bval, len);
+        index[len] = '\0';
+
+        /* parsing one index type, if existing */
+        *type = '\0';
+        if (':' == *look){
+            look++;
+            bval = look;
+            while (*look && ' ' != *look)
+                look++;
+            len = (size_t)(look - bval);
+            if (len >= sizeof(type))
+                len = sizeof(type) - 1;
+            memcpy(type, bval, len);
+            type[len] = '\0';
+        }
+
+        printf("INDEX  '%s:%s' '%s'\n", index, type,
+               text ? (const char *) text : "(null)");
+
+        /* step over one separating blank, unless it ends the string */
+        if (' ' == *look && *(look+1))
+            look++;
+    }
+
+    if (text)
+        xmlFree(text);
+}
      
-static int extract_doc(struct filter_info *tinfo, struct filter_input *input,
-                       struct recExtractCtrl *p, xmlDocPtr doc)
+
+
+/* DOM filter style indexing */
+/*
+ * Receive the record level instructions (id, rank, action) found in a
+ * document.  For now the values are only printed to stdout; tinfo and
+ * recctr are not used yet.
+ *
+ * Any of id_p, rank_p and action_p may be NULL (the PI parser always
+ * passes a NULL action); a NULL value is printed as "(null)".
+ */
+void set_record_info(struct filter_info *tinfo, 
+                     struct recExtractCtrl *recctr, 
+                     xmlChar * id_p, 
+                     xmlChar * rank_p, 
+                     xmlChar * action_p)
  {
-    RecWord recWord;
-    const char *params[10];
+    /* handing a NULL pointer to %s is undefined behaviour */
+    printf("RECORD id=%s rank=%s action=%s\n",
+           id_p ? (const char *) id_p : "(null)",
+           rank_p ? (const char *) rank_p : "(null)",
+           action_p ? (const char *) action_p : "(null)");
+}
+
+
+/* DOM filter style indexing */
+/* Return a printable path of `node` for diagnostics.  The path is fetched
+   on first use and kept in *cache, which the caller must xmlFree(). */
+static const char *zebra_elem_path(xmlNodePtr node, xmlChar **cache)
+{
+    if (!*cache)
+        *cache = xmlGetNodePath(node);
+    return *cache ? (const char *) *cache : "(null)";
+}
+
+/*
+ * Handle an element node that belongs to the zebra namespace
+ * (zebra_xslt_ns).  Nodes of another type or namespace are ignored.
+ *
+ *   <index name="...">         the node is passed to index_value_of()
+ *                              together with the value of @name
+ *   <record id= rank= action=> the attribute values are collected and
+ *                              passed to set_record_info()
+ *
+ * Unexpected attributes, an action other than "update" and unknown
+ * element names in the namespace are reported on stdout.
+ */
+void process_xml_element_zebra_node(struct filter_info *tinfo, 
+                                    struct recExtractCtrl *recctr, 
+                                    xmlNodePtr node)
+{
+    struct _xmlAttr *attr;
+    xmlChar *path = 0;      /* node path for diagnostics, see zebra_elem_path */
+
+    if (node->type != XML_ELEMENT_NODE
+        || !node->ns || 0 != XML_STRCMP(node->ns->href, zebra_xslt_ns))
+        return;
+
+    if (0 == XML_STRCMP(node->name, "index")){
+        for (attr = node->properties; attr; attr = attr->next){
+            xmlChar *index_p = 0;
+
+            if (attr_content_xml(attr, "name", &index_p))
+                index_value_of(tinfo, recctr, node, index_p);
+            else
+                printf("dom filter: %s bad attribute @%s, expected @name\n",
+                       zebra_elem_path(node, &path), attr->name);
+        }
+    }
+    else if (0 == XML_STRCMP(node->name, "record")){
+        xmlChar *id_p = 0;
+        xmlChar *rank_p = 0;
+        xmlChar *action_p = 0;
+
+        for (attr = node->properties; attr; attr = attr->next){
+            if (attr_content_xml(attr, "id", &id_p)
+                || attr_content_xml(attr, "rank", &rank_p)
+                || attr_content_xml(attr, "action", &action_p))
+                continue;
+            printf("dom filter: %s bad attribute @%s,"
+                   " expected @id|@rank|@action\n",
+                   zebra_elem_path(node, &path), attr->name);
+        }
+
+        /* checked once after the loop, so the warning is neither repeated
+           nor attributed to whatever attribute happens to follow @action */
+        if (action_p && 0 != strcmp("update", (const char *)action_p))
+            printf("dom filter: %s attribute @action,"
+                   " only implemented '@action=\"update\"'\n",
+                   zebra_elem_path(node, &path));
+
+        set_record_info(tinfo, recctr, id_p, rank_p, action_p);
+    }
+    else {
+        printf("dom filter: %s bad element <%s>,"
+               " expected <record>|<index> in namespace '%s'\n",
+               zebra_elem_path(node, &path), node->name, zebra_xslt_ns);
+    }
+
+    if (path)
+        xmlFree(path);
+}
+
+
+/* DOM filter style indexing */
+/* Skip blanks.  A blank that is the very last character of the string is
+   deliberately left in place, as the PI parser has always done. */
+static xmlChar *zebra_pi_skip_blanks(xmlChar *look)
+{
+    while (' ' == *look && *(look+1))
+        look++;
+    return look;
+}
+
+/* Copy the token starting at look (it ends at a blank or at the end of
+   the string) into dst, which holds dst_size bytes.  dst is always NUL
+   terminated; a token that is too long is truncated.  Returns the
+   position right after the token. */
+static xmlChar *zebra_pi_copy_token(xmlChar *look, xmlChar *dst,
+                                    size_t dst_size)
+{
+    xmlChar *bval = look;
+    size_t len;
+
+    while (*look && ' ' != *look)
+        look++;
+    len = (size_t)(look - bval);
+    if (len >= dst_size)
+        len = dst_size - 1;
+    memcpy(dst, bval, len);
+    dst[len] = '\0';
+    return look;
+}
+
+/* Report PI content that could not be parsed; `rest` is the part the
+   parser stopped at.  The node path is released after printing. */
+static void zebra_pi_parse_error(xmlNodePtr node, const xmlChar *pi_p,
+                                 const xmlChar *rest)
+{
+    xmlChar *path = xmlGetNodePath(node);
+
+    printf ("ERROR %s: content '%s'; can not parse '%s'\n", 
+            path ? (const char *) path : "(null)",
+            (const char *) pi_p, (const char *) rest);
+    if (path)
+        xmlFree(path);
+}
+
+/*
+ * Interpret a processing instruction whose target is zebra_pi_name; PIs
+ * with another target are ignored.
+ *
+ *   record [id=ID] [rank=RANK]  id and rank are passed to
+ *                               set_record_info(); the action is
+ *                               always passed as NULL
+ *   index SPEC                  *index_pp is set to point at SPEC inside
+ *                               the PI's content (no copy is made)
+ *
+ * Content that is neither, or a record instruction with trailing text
+ * that cannot be parsed, is reported on stdout.  *index_pp is only
+ * written for an index instruction.
+ */
+void process_xml_pi_node(struct filter_info *tinfo, 
+                         struct recExtractCtrl *recctr, 
+                         xmlNodePtr node,
+                         xmlChar **index_pp)
+{
+    xmlChar *pi_p = node->content;
+    xmlChar *look = pi_p;
+
+    /* if right PI name, continue parsing PI */
+    if (0 != strcmp(zebra_pi_name, (const char *)node->name))
+        return;
+
+    /* a PI without any content carries no instruction at all */
+    if (!pi_p){
+        zebra_pi_parse_error(node, (const xmlChar *) "",
+                             (const xmlChar *) "");
+        return;
+    }
+
+    /* parsing PI record instructions */
+    if (0 == strncmp((const char *)look, "record", 6)){
+        xmlChar id[256];
+        xmlChar rank[256];
+
+        *id = '\0';
+        *rank = '\0';
+
+        look = zebra_pi_skip_blanks(look + 6);
+
+        /* parse possible id */
+        if (0 == strncmp((const char *)look, "id=", 3))
+            look = zebra_pi_copy_token(look + 3, id, sizeof(id));
+
+        look = zebra_pi_skip_blanks(look);
+
+        /* parse possible rank; "rank=" is 5 characters long */
+        if (0 == strncmp((const char *)look, "rank=", 5))
+            look = zebra_pi_copy_token(look + 5, rank, sizeof(rank));
+
+        look = zebra_pi_skip_blanks(look);
+
+        if ('\0' != *look)
+            zebra_pi_parse_error(node, pi_p, look);
+        else
+            set_record_info(tinfo, recctr, id, rank, 0);
+    }
+    /* parsing index instruction */
+    else if (0 == strncmp((const char *)look, "index", 5)){
+        /* export index instructions to outside */
+        *index_pp = zebra_pi_skip_blanks(look + 5);
+    }
+    /* neither record nor index */
+    else {
+        zebra_pi_parse_error(node, pi_p, look);
+    }
+}
+
+/* DOM filter style indexing */
+/*
+ * Walk the tree below `node` depth first.
+ *
+ * The node itself is first offered to process_xml_element_zebra_node().
+ * Then its children are visited in document order: processing
+ * instructions go to process_xml_pi_node(), element children are
+ * processed recursively, all other node types are skipped.  An index
+ * instruction delivered by a PI is applied, via index_value_of(), to the
+ * next element sibling and then forgotten.
+ */
+void process_xml_element_node(struct filter_info *tinfo, 
+                              struct recExtractCtrl *recctr, 
+                              xmlNodePtr node)
+{
+    xmlNodePtr child;
+    /* index instruction of the latest PI, waiting for the next element */
+    xmlChar *pending_index = 0;
+
+    /* set record data or index the node if it is a zebra element */
+    process_xml_element_zebra_node(tinfo, recctr, node);
+
+    for (child = node->children; child; child = child->next){
+        switch (child->type){
+        case XML_PI_NODE:
+            process_xml_pi_node(tinfo, recctr, child, &pending_index);
+            break;
+        case XML_ELEMENT_NODE:
+            if (pending_index){
+                index_value_of(tinfo, recctr, child, pending_index);
+                pending_index = 0;
+            }
+            process_xml_element_node(tinfo, recctr, child);
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+
+
+
+
+/* DOM filter style indexing */
+/*
+ * Entry point of the DOM style extraction: prints the path of the
+ * document node to stdout and walks the whole document with
+ * process_xml_element_node().  The document is not freed here.
+ */
+void extract_dom_doc_node(struct filter_info *tinfo, 
+                          struct recExtractCtrl *recctr, 
+                          xmlDocPtr doc)
+{
+    /* xmlGetNodePath() hands out allocated memory, or NULL on failure */
+    xmlChar *path = xmlGetNodePath((xmlNodePtr)doc);
+
+    printf("DOC    %s\n", path ? (const char *) path : "(null)");
+    if (path)
+        xmlFree(path);
+
+    process_xml_element_node(tinfo, recctr, (xmlNodePtr)doc);
+}
+
+
+
+
+static int convert_extract_doc(struct filter_info *tinfo, 
+                               struct filter_input *input,
+                               struct recExtractCtrl *p, 
+                               xmlDocPtr doc)
+
+{
+    /* RecWord recWord; */
      xmlChar *buf_out;
      int len_out;
+    const char *params[10];
      xsltStylesheetPtr last_xsp = 0;
      xmlDocPtr store_doc = 0;
  
@@ -734,8 +1074,6 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input,
      /* input conversion */
      perform_convert(tinfo, input->convert, params, &doc, 0);
  
-    (*p->init)(p, &recWord);
-
      if (tinfo->store)
      {
          /* store conversion */
@@ -759,24 +1097,14 @@ static int extract_doc(struct filter_info *tinfo, struct filter_input *input,
  
      /* extract conversion */
      perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
-    if (doc)
-    {
-        xmlNodePtr root_ptr;
-       if (p->flagShowRecords)
-       {
-           xmlDocDumpMemory(doc, &buf_out, &len_out);
-           fwrite(buf_out, len_out, 1, stdout);
-           xmlFree(buf_out);
-       }
-       root_ptr = xmlDocGetRootElement(doc);
-       if (root_ptr)
-           index_record(tinfo, p, root_ptr, &recWord);
-        else
-        {
-           yaz_log(YLOG_WARN, "No root for index XML record");
-        }
+
+    /* finally, do the indexing */
+    if (doc){
+        extract_dom_doc_node(tinfo, p, doc);
+        extract_doc_alvis(tinfo, p, doc);
         xmlFreeDoc(doc);
-    }    
+    }
+
      return RECCTRL_EXTRACT_OK;
  }
  
@@ -794,7 +1122,8 @@ static int extract_xml_split(struct filter_info *tinfo,
                                                     p /* I/O handler */,
                                                     0 /* URL */, 
                                                     0 /* encoding */,
-                                                   XML_PARSE_XINCLUDE);
+                                                   XML_PARSE_XINCLUDE|
+                                                   XML_PARSE_NOENT);
      }
      if (!input->u.xmlreader.reader)
         return RECCTRL_EXTRACT_ERROR_GENERIC;
@@ -815,7 +1144,7 @@ static int extract_xml_split(struct filter_info *tinfo,
                  
                  xmlDocSetRootElement(doc, ptr2);
                  
-                return extract_doc(tinfo, input, p, doc);
+                return convert_extract_doc(tinfo, input, p, doc);
              }
              else
              {
@@ -840,12 +1169,12 @@ static int extract_xml_full(struct filter_info *tinfo,
          xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex, p /* I/O handler */,
                                    0 /* URL */,
                                    0 /* encoding */,
-                                  XML_PARSE_XINCLUDE);
+                                  XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
          if (!doc)
          {
              return RECCTRL_EXTRACT_ERROR_GENERIC;
          }
-        return extract_doc(tinfo, input, p, doc);
+        return convert_extract_doc(tinfo, input, p, doc);
      }
      else
          return RECCTRL_EXTRACT_EOF;
@@ -899,7 +1228,7 @@ static int extract_iso2709(struct filter_info *tinfo,
          yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
          rdoc = xmlNewDoc((const xmlChar*) "1.0");
          xmlDocSetRootElement(rdoc, root_ptr);
-        return extract_doc(tinfo, input, p, rdoc);        
+        return convert_extract_doc(tinfo, input, p, rdoc);        
      }
      return RECCTRL_EXTRACT_OK;
  }
@@ -995,7 +1324,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
      doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
                     0 /* URL */,
                     0 /* encoding */,
-                   XML_PARSE_XINCLUDE);
+                   XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
      if (!doc)
      {
         p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;