Fixes for perform_convert: use xmlParseMemory instead of xmlParseMemory
[idzebra-moved-to-github.git] / index / mod_dom.c
index 8aeb416..fca04eb 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: mod_dom.c,v 1.20 2007-02-23 14:59:12 adam Exp $
+/* $Id: mod_dom.c,v 1.26 2007-03-03 21:39:10 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -108,6 +108,7 @@ struct filter_info {
     struct filter_retrieve *retrieve_list;
     struct filter_input *input_list;
     struct filter_store *store;
+    int record_info_invoked;
 };
 
 
@@ -183,6 +184,7 @@ static void *filter_init(Res res, RecType recType)
     tinfo->input_list = 0;
     tinfo->store = 0;
     tinfo->doc_config = 0;
+    tinfo->record_info_invoked = 0;
 
 #if YAZ_HAVE_EXSLT
     exsltRegisterAll(); 
@@ -331,6 +333,7 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
 }
 
 static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
+                                 struct recExtractCtrl *extctr,
                                  struct convert_s *convert,
                                  const char **params,
                                  xmlDocPtr *doc,
@@ -338,12 +341,34 @@ static ZEBRA_RES perform_convert(struct filter_info *tinfo,
 {
     for (; convert; convert = convert->next)
     {
+        xmlChar *buf_out = 0;
+        int len_out = 0;
         xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
                                                 *doc, params);
         if (last_xsp)
             *last_xsp = convert->stylesheet_xsp;
+        
+        if (!res_doc)
+            break;
+
+        /* now saving into buffer and re-reading into DOM to avoid annoying
+           XSLT problem with thrown-out indentation text nodes */
+        xsltSaveResultToString(&buf_out, &len_out, res_doc,
+                               convert->stylesheet_xsp); 
+        xmlFreeDoc(res_doc);
+
         xmlFreeDoc(*doc);
-        *doc = res_doc;
+
+        *doc = xmlParseMemory((const char *) buf_out, len_out);
+
+        /* writing debug info out; NOTE(review): extctr is 0 when called from filter_retrieve */
+        if (extctr->flagShowRecords)
+            yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s", 
+                    tinfo->fname ? tinfo->fname : "(none)", 
+                    convert->stylesheet,
+                    len_out, buf_out);
+        
+        xmlFree(buf_out);
     }
     return ZEBRA_OK;
 }
@@ -690,91 +715,114 @@ static void index_value_of(struct filter_info *tinfo,
                            xmlNodePtr node, 
                            xmlChar * index_p)
 {
-    xmlChar *text = xmlNodeGetContent(node);
-    size_t text_len = strlen((const char *)text);
-
-
-    /* if there is no text, we do not need to proceed */
-    if (text_len)
-    {            
-        xmlChar *look = index_p;
-        xmlChar *bval;
-        xmlChar *eval;
-
-        xmlChar index[256];
-        xmlChar type[256];
+    if (tinfo->record_info_invoked == 1)
+    {
+        xmlChar *text = xmlNodeGetContent(node);
+        size_t text_len = strlen((const char *)text);
+        
+        /* if there is no text, we do not need to proceed */
+        if (text_len)
+        {            
+            xmlChar *look = index_p;
+            xmlChar *bval;
+            xmlChar *eval;
+
+            xmlChar index[256];
+            xmlChar type[256];
 
-        /* assingning text to be indexed */
-        recword->term_buf = (const char *)text;
-        recword->term_len = text_len;
+            /* assigning text to be indexed */
+            recword->term_buf = (const char *)text;
+            recword->term_len = text_len;
 
-        /* parsing all index name/type pairs */
-        /* may not start with ' ' or ':' */
-        while (*look && ' ' != *look && ':' != *look)
-        {
-            /* setting name and type to zero */
-            *index = '\0';
-            *type = '\0';
-    
-            /* parsing one index name */
-            bval = look;
-            while (*look && ':' != *look && ' ' != *look)
+            /* parsing all index name/type pairs */
+            /* may not start with ' ' or ':' */
+            while (*look && ' ' != *look && ':' != *look)
             {
-                look++;
-            }
-            eval = look;
-            strncpy((char *)index, (const char *)bval, eval - bval);
-            index[eval - bval] = '\0';
-    
+                /* setting name and type to zero */
+                *index = '\0';
+                *type = '\0';
     
-            /* parsing one index type, if existing */
-            if (':' == *look)
-            {
-                look++;
-      
+                /* parsing one index name */
                 bval = look;
-                while (*look && ' ' != *look)
+                while (*look && ':' != *look && ' ' != *look)
                 {
                     look++;
                 }
                 eval = look;
-                strncpy((char *)type, (const char *)bval, eval - bval);
-                type[eval - bval] = '\0';
-            }
-
-            /* actually indexing the text given */
-            dom_log(YLOG_DEBUG, tinfo, 0, 
-                    "INDEX '%s:%s' '%s'", 
-                    index, type, text);
-
-            recword->index_name = (const char *)index;
-            if (type && *type)
-                recword->index_type = *type;
-            (extctr->tokenAdd)(recword);
+                strncpy((char *)index, (const char *)bval, eval - bval);
+                index[eval - bval] = '\0';
+    
+    
+                /* parsing one index type, if existing */
+                if (':' == *look)
+                {
+                    look++;
+      
+                    bval = look;
+                    while (*look && ' ' != *look)
+                    {
+                        look++;
+                    }
+                    eval = look;
+                    strncpy((char *)type, (const char *)bval, eval - bval);
+                    type[eval - bval] = '\0';
+                }
 
-            /* eat whitespaces */
-            if (*look && ' ' == *look && *(look+1))
-            {
-                look++;
-            } 
+                /* logging the index name/type pair and text at debug level */
+                dom_log(YLOG_DEBUG, tinfo, 0, 
+                        "INDEX '%s:%s' '%s'", 
+                        index ? (const char *) index : "null",
+                        type ? (const char *) type : "null", 
+                        text ? (const char *) text : "null");
+
+                recword->index_name = (const char *)index;
+                if (type && *type)
+                    recword->index_type = *type;
+
+                /* writing debug out */
+                if (extctr->flagShowRecords)
+                    dom_log(YLOG_LOG, tinfo, 0, 
+                            "INDEX '%s:%s' '%s'", 
+                            index ? (const char *) index : "null",
+                            type ? (const char *) type : "null", 
+                            text ? (const char *) text : "null");
+                
+                /* actually indexing the text given */
+                recword->index_name = (const char *)index;
+                if (type && *type)
+                    recword->index_type = *type;
+                (extctr->tokenAdd)(recword);
+
+                /* eat whitespaces */
+                if (*look && ' ' == *look && *(look+1))
+                {
+                    look++;
+                } 
+            }
         }
+        xmlFree(text); 
     }
-    
-    xmlFree(text); 
 }
 
 
 /* DOM filter style indexing */
 static void set_record_info(struct filter_info *tinfo, 
                             struct recExtractCtrl *extctr, 
+                            xmlNodePtr node, 
                             xmlChar * id_p, 
                             xmlChar * rank_p, 
                             xmlChar * type_p)
 {
-    dom_log(YLOG_DEBUG, tinfo, 0,
-            "RECORD id=%s rank=%s type=%s", 
-            id_p, rank_p, type_p);
+
+    /* writing debug info out */
+    if (extctr->flagShowRecords)
+        dom_log(YLOG_LOG, tinfo, 0,
+                "RECORD id=%s rank=%s type=%s", 
+                id_p ? (const char *) id_p : "(null)",
+                rank_p ? (const char *) rank_p : "(null)",
+                type_p ? (const char *) type_p : "(null)");
     
+
     if (id_p)
         sscanf((const char *)id_p, "%255s", extctr->match_criteria);
 
@@ -788,6 +836,12 @@ static void set_record_info(struct filter_info *tinfo,
     /*     else */
     /*         dom_log(YLOG_WARN, tinfo, ptr, "dom filter: unknown record type '%s'",  */
     /*                 type_str); */
+    if (tinfo->record_info_invoked == 1)
+    {
+        /* warn about multiple only once */
+        dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
+    }
+    tinfo->record_info_invoked++;
 
 }
 
@@ -849,7 +903,7 @@ static void process_xml_element_zebra_node(struct filter_info *tinfo,
                             attr->name);
                 }
             }
-            set_record_info(tinfo, extctr, id_p, rank_p, type_p);
+            set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
         } 
         else
         {
@@ -933,7 +987,7 @@ static void process_xml_pi_node(struct filter_info *tinfo,
                         pi_p, look);
             }
             else 
-                set_record_info(tinfo, extctr, id, rank, 0);
+                set_record_info(tinfo, extctr, node, id, rank, 0);
 
         } 
         /* parsing index instruction */
@@ -999,20 +1053,11 @@ static void extract_dom_doc_node(struct filter_info *tinfo,
                                  struct recExtractCtrl *extctr, 
                                  xmlDocPtr doc)
 {
-    xmlChar *buf_out;
-    int len_out;
-
     /* only need to do the initialization once, reuse recword for all terms */
     RecWord recword;
     (*extctr->init)(extctr, &recword);
 
-    if (extctr->flagShowRecords)
-    {
-        xmlDocDumpMemory(doc, &buf_out, &len_out);
-        fwrite(buf_out, len_out, 1, stdout);
-        xmlFree(buf_out);
-    }
-
+    tinfo->record_info_invoked = 0;
     process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
 }
 
@@ -1035,13 +1080,13 @@ static int convert_extract_doc(struct filter_info *tinfo,
     set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
 
     /* input conversion */
-    perform_convert(tinfo, input->convert, params, &doc, 0);
+    perform_convert(tinfo, p, input->convert, params, &doc, 0);
 
     if (tinfo->store)
     {
         /* store conversion */
         store_doc = xmlCopyDoc(doc, 1);
-        perform_convert(tinfo, tinfo->store->convert,
+        perform_convert(tinfo, p, tinfo->store->convert,
                         params, &store_doc, &last_xsp);
     }
     
@@ -1050,8 +1095,10 @@ static int convert_extract_doc(struct filter_info *tinfo,
                                store_doc ? store_doc : doc, last_xsp);
     else
         xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
-    if (p->flagShowRecords)
-       fwrite(buf_out, len_out, 1, stdout);
+  
+    /* if (p->flagShowRecords)
+       fwrite(buf_out, len_out, 1, stdout); */
+
     (*p->setStoreData)(p, buf_out, len_out);
     xmlFree(buf_out);
 
@@ -1059,16 +1106,17 @@ static int convert_extract_doc(struct filter_info *tinfo,
         xmlFreeDoc(store_doc);
 
     /* extract conversion */
-    perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
+    perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
 
     /* finally, do the indexing */
     if (doc)
-    {
         extract_dom_doc_node(tinfo, p, doc);
-        /* extract_doc_alvis(tinfo, p, doc); */
+
+    if (doc)
        xmlFreeDoc(doc);
-    }
 
+    if (tinfo->record_info_invoked == 0)
+        return RECCTRL_EXTRACT_SKIP;
     return RECCTRL_EXTRACT_OK;
 }
 
@@ -1109,6 +1157,18 @@ static int extract_xml_split(struct filter_info *tinfo,
                 
                 xmlDocSetRootElement(doc, ptr2);
                 
+                /* writing debug info out */
+                if (p->flagShowRecords)
+                {
+                    xmlChar *buf_out = 0;
+                    int len_out = 0;
+                    xmlDocDumpMemory(doc, &buf_out, &len_out);
+                    yaz_log(YLOG_LOG, "%s: XMLREADER depth: %i\n%.*s", 
+                            tinfo->fname ? tinfo->fname : "(none)",
+                            depth, len_out, buf_out); 
+                    xmlFree(buf_out);
+                }
+                
                 return convert_extract_doc(tinfo, input, p, doc);
             }
             else
@@ -1302,7 +1362,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
     }
 
     /* retrieve conversion */
-    perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
+    perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
     if (!doc)
     {
         p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;