ICU term lists are handled for search.
[idzebra-moved-to-github.git] / util / zebramap.c
index e87bb71..d2cd37f 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: zebramap.c,v 1.69 2007-11-08 09:30:05 adam Exp $
+/* $Id: zebramap.c,v 1.73 2007-12-07 14:09:10 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -47,6 +47,7 @@ struct zebra_map {
     int first_in_field;
     int type;
     int use_chain;
+    int debug;
     union {
         struct {
             int entry_size;
@@ -54,7 +55,6 @@ struct zebra_map {
     } u;
     chrmaptab maptab;
     const char *maptab_name;
-    const char *locale;
     zebra_maps_t zebra_maps;
 #if YAZ_HAVE_XML2
     xmlDocPtr doc;
@@ -62,7 +62,8 @@ struct zebra_map {
 #if YAZ_HAVE_ICU
     struct icu_chain *icu_chain;
 #endif
-    WRBUF simple_buf;
+    WRBUF input_str;
+    WRBUF print_str;
     size_t simple_off;
     struct zebra_map *next;
 };
@@ -93,7 +94,8 @@ void zebra_maps_close(zebra_maps_t zms)
 #if YAZ_HAVE_XML2
         xmlFreeDoc(zm->doc);
 #endif
-        wrbuf_destroy(zm->simple_buf);
+        wrbuf_destroy(zm->input_str);
+        wrbuf_destroy(zm->print_str);
        zm = zm->next;
     }
     wrbuf_destroy(zms->wrbuf_1);
@@ -110,7 +112,7 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type,
     zm->id = nmem_strdup(zms->nmem, index_type);
     zm->maptab_name = 0;
     zm->use_chain = 0;
-    zm->locale = 0;
+    zm->debug = 0;
     zm->maptab = 0;
     zm->type = map_type;
     zm->completeness = 0;
@@ -130,7 +132,8 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type,
 #if YAZ_HAVE_XML2
     zm->doc = 0;
 #endif
-    zm->simple_buf = wrbuf_alloc();
+    zm->input_str = wrbuf_alloc();
+    zm->print_str = wrbuf_alloc();
     return zm;
 }
 
@@ -218,10 +221,6 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
             return -1;
         }
     }
-    else if (!yaz_matchstr(argv[0], "locale"))
-    {
-        zm->locale = nmem_strdup(zms->nmem, argv[1]);
-    }
     else if (!yaz_matchstr(argv[0], "simplechain"))
     {
         zm->use_chain = 1;
@@ -232,12 +231,6 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
     else if (!yaz_matchstr(argv[0], "icuchain"))
     {
 #if YAZ_HAVE_XML2
-        if (!zm->locale)
-        {
-            yaz_log(YLOG_WARN, "%s:%d: locale required before icuchain", 
-                    fname, lineno);
-            return -1;
-        }
         zm->doc = xmlParseFile(argv[1]);
         if (!zm->doc)
         {
@@ -251,7 +244,7 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
             UErrorCode status;
             xmlNode *xml_node = xmlDocGetRootElement(zm->doc);
             zm->icu_chain = 
-                icu_chain_xml_config(xml_node, zm->locale, 
+                icu_chain_xml_config(xml_node,
 /* not sure about sort for this function yet.. */
 #if 1
                                      1,
@@ -277,6 +270,10 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
         return -1;
 #endif
     }
+    else if (!yaz_matchstr(argv[0], "debug") && argc == 2)
+    {
+        zm->debug = atoi(argv[1]);
+    }
     else
     {
         yaz_log(YLOG_WARN, "%s:%d: Unrecognized directive '%s'",  
@@ -617,8 +614,8 @@ WRBUF zebra_replace(zebra_map_t zm, const char *ex_list,
 static int tokenize_simple(zebra_map_t zm,
                            const char **result_buf, size_t *result_len)
 {
-    char *buf = wrbuf_buf(zm->simple_buf);
-    size_t len = wrbuf_len(zm->simple_buf);
+    char *buf = wrbuf_buf(zm->input_str);
+    size_t len = wrbuf_len(zm->input_str);
     size_t i = zm->simple_off;
     size_t start;
 
@@ -642,6 +639,75 @@ static int tokenize_simple(zebra_map_t zm,
     return 0;
  }
 
+/*
+ * zebra_map_tokenize_next: fetch the next token of the input most recently
+ * stored in zm->input_str.
+ *
+ * zm          map to read from; zm->use_chain must be non-zero (asserted).
+ * result_buf  receives a pointer to the token bytes.
+ * result_len  receives the token length in bytes.
+ *
+ * Returns 1 when a non-empty token was stored in the output parameters and
+ * 0 when the ICU chain has no more tokens.  When ICU support is not compiled
+ * in, or zm->icu_chain is null, the call is forwarded to tokenize_simple()
+ * and its return value is passed through unchanged.
+ *
+ * On the ICU path the output parameters are written for every token seen,
+ * including empty ones that are skipped, so they may have been modified
+ * even when 0 is returned.
+ *
+ * NOTE(review): 'status' is declared without an initializer.  ICU convention
+ * is to start from U_ZERO_ERROR; whether icu_chain_next_token() resets it
+ * on entry is not visible here - confirm.
+ */
+int zebra_map_tokenize_next(zebra_map_t zm,
+                            const char **result_buf, size_t *result_len)
+{
+    assert(zm->use_chain);
+
+#if YAZ_HAVE_ICU
+    if (!zm->icu_chain)
+        return tokenize_simple(zm, result_buf, result_len);
+    else
+    {
+        UErrorCode status;
+        while (icu_chain_next_token(zm->icu_chain, &status))
+        {
+            assert(U_SUCCESS(status));
+            *result_buf = icu_chain_token_sortkey(zm->icu_chain);
+            assert(*result_buf);
+
+            *result_len = strlen(*result_buf);
+            /* optional trace: log the escaped token when zm->debug is non-zero */
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_write_escaped(zm->print_str, *result_buf, *result_len);
+                yaz_log(YLOG_LOG, "output %s", wrbuf_cstr(zm->print_str));
+            }
+            /* empty tokens are skipped; only a non-empty one is handed back */
+            if (**result_buf != '\0')
+                return 1;
+        }
+        assert(U_SUCCESS(status));
+    }
+    return 0;
+#else
+    return tokenize_simple(zm, result_buf, result_len);
+#endif
+}
+/*
+ * zebra_map_tokenize_start: load a new input buffer into the map, to be
+ * consumed token by token afterwards.
+ *
+ * zm   map to prepare; zm->use_chain must be non-zero (asserted).
+ * buf  input bytes; 'len' bytes are copied into zm->input_str.
+ * len  number of bytes to copy from buf.
+ *
+ * The previous content of zm->input_str is discarded and zm->simple_off is
+ * reset to 0.  When ICU support is compiled in and zm->icu_chain is set,
+ * the copied input is also assigned to the ICU chain, and the escaped input
+ * is logged first if zm->debug is non-zero.
+ *
+ * Always returns 0; a failing ICU status trips the assert instead of being
+ * reported to the caller.
+ *
+ * NOTE(review): 'status' is declared without an initializer.  ICU convention
+ * is to start from U_ZERO_ERROR; whether icu_chain_assign_cstr() sets it on
+ * every path is not visible here - confirm.
+ * NOTE(review): the chain is handed the pointer returned by wrbuf_cstr();
+ * presumably zm->input_str must stay untouched while tokens are being
+ * read - verify against icu_chain_assign_cstr().
+ */
+int zebra_map_tokenize_start(zebra_map_t zm,
+                             const char *buf, size_t len)
+{
+    assert(zm->use_chain);
+
+    wrbuf_rewind(zm->input_str);
+    wrbuf_write(zm->input_str, buf, len);
+    zm->simple_off = 0; /* restart the offset that tokenize_simple() begins from */
+#if YAZ_HAVE_ICU
+    if (zm->icu_chain)
+    {
+        UErrorCode status;
+        if (zm->debug)
+        {
+            wrbuf_rewind(zm->print_str);
+            wrbuf_write_escaped(zm->print_str, wrbuf_buf(zm->input_str),
+                                wrbuf_len(zm->input_str));
+            
+            yaz_log(YLOG_LOG, "input %s", 
+                    wrbuf_cstr(zm->print_str)); 
+        }
+        icu_chain_assign_cstr(zm->icu_chain,
+                              wrbuf_cstr(zm->input_str),
+                              &status);
+        assert(U_SUCCESS(status));
+    }
+#endif
+    return 0;
+}
+
+#if 0
 int zebra_map_tokenize(zebra_map_t zm,
                        const char *buf, size_t len,
                        const char **result_buf, size_t *result_len)
@@ -650,8 +716,8 @@ int zebra_map_tokenize(zebra_map_t zm,
 
     if (buf)
     {
-        wrbuf_rewind(zm->simple_buf);
-        wrbuf_write(zm->simple_buf, buf, len);
+        wrbuf_rewind(zm->input_str);
+        wrbuf_write(zm->input_str, buf, len);
         zm->simple_off = 0;
     }
 
@@ -663,19 +729,35 @@ int zebra_map_tokenize(zebra_map_t zm,
         UErrorCode status;
         if (buf)
         {
-            yaz_log(YLOG_LOG, "assicn_cstr %s", wrbuf_cstr(zm->simple_buf)); 
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_write_escaped(zm->print_str, wrbuf_buf(zm->input_str),
+                                    wrbuf_len(zm->input_str));
+                
+                yaz_log(YLOG_LOG, "input %s", 
+                        wrbuf_cstr(zm->print_str)); 
+            }
             icu_chain_assign_cstr(zm->icu_chain,
-                                  wrbuf_cstr(zm->simple_buf),
+                                  wrbuf_cstr(zm->input_str),
                                   &status);
             assert(U_SUCCESS(status));
         }
         while (icu_chain_next_token(zm->icu_chain, &status))
         {
             assert(U_SUCCESS(status));
-            *result_buf = icu_chain_token_norm(zm->icu_chain);
+            *result_buf = icu_chain_token_sortkey(zm->icu_chain);
             assert(*result_buf);
-            yaz_log(YLOG_LOG, "got result %s", *result_buf);
+
             *result_len = strlen(*result_buf);
+
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_write_escaped(zm->print_str, *result_buf, *result_len);
+                yaz_log(YLOG_LOG, "output %s", wrbuf_cstr(zm->print_str));
+            }
+
             if (**result_buf != '\0')
                 return 1;
         }
@@ -686,6 +768,7 @@ int zebra_map_tokenize(zebra_map_t zm,
     return tokenize_simple(zm, result_buf, result_len);
 #endif
 }
+#endif
 
 int zebra_maps_is_icu(zebra_map_t zm)
 {