ICU term lists are handled for search.
[idzebra-moved-to-github.git] / util / zebramap.c
index d322645..d2cd37f 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: zebramap.c,v 1.64 2007-11-05 13:58:01 adam Exp $
+/* $Id: zebramap.c,v 1.73 2007-12-07 14:09:10 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -28,8 +28,8 @@
 #include <attrfind.h>
 #include <yaz/yaz-util.h>
 
-#if HAVE_ICU
-#include <yaz/icu_I18N.h>
+#if YAZ_HAVE_ICU
+#include <yaz/icu.h>
 #endif
 #include <zebramap.h>
 
@@ -46,6 +46,8 @@ struct zebra_map {
     int alwaysmatches;
     int first_in_field;
     int type;
+    int use_chain;
+    int debug;
     union {
         struct {
             int entry_size;
@@ -53,14 +55,16 @@ struct zebra_map {
     } u;
     chrmaptab maptab;
     const char *maptab_name;
-    const char *locale;
     zebra_maps_t zebra_maps;
 #if YAZ_HAVE_XML2
     xmlDocPtr doc;
 #endif
-#if HAVE_ICU
+#if YAZ_HAVE_ICU
     struct icu_chain *icu_chain;
 #endif
+    WRBUF input_str;
+    WRBUF print_str;
+    size_t simple_off;
     struct zebra_map *next;
 };
 
@@ -83,13 +87,15 @@ void zebra_maps_close(zebra_maps_t zms)
     {
        if (zm->maptab)
            chrmaptab_destroy(zm->maptab);
-#if HAVE_ICU
+#if YAZ_HAVE_ICU
         if (zm->icu_chain)
             icu_chain_destroy(zm->icu_chain);
 #endif
 #if YAZ_HAVE_XML2
         xmlFreeDoc(zm->doc);
 #endif
+        wrbuf_destroy(zm->input_str);
+        wrbuf_destroy(zm->print_str);
        zm = zm->next;
     }
     wrbuf_destroy(zms->wrbuf_1);
@@ -105,7 +111,8 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type,
     zm->zebra_maps = zms;
     zm->id = nmem_strdup(zms->nmem, index_type);
     zm->maptab_name = 0;
-    zm->locale = 0;
+    zm->use_chain = 0;
+    zm->debug = 0;
     zm->maptab = 0;
     zm->type = map_type;
     zm->completeness = 0;
@@ -119,12 +126,14 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type,
         zms->map_list = zm;
     zms->last_map = zm;
     zm->next = 0;
-#if HAVE_ICU
+#if YAZ_HAVE_ICU
     zm->icu_chain = 0;
 #endif
 #if YAZ_HAVE_XML2
     zm->doc = 0;
 #endif
+    zm->input_str = wrbuf_alloc();
+    zm->print_str = wrbuf_alloc();
     return zm;
 }
 
@@ -212,9 +221,12 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
             return -1;
         }
     }
-    else if (!yaz_matchstr(argv[0], "locale"))
+    else if (!yaz_matchstr(argv[0], "simplechain"))
     {
-        zm->locale = nmem_strdup(zms->nmem, argv[1]);
+        zm->use_chain = 1;
+#if YAZ_HAVE_ICU
+        zm->icu_chain = 0;
+#endif
     }
     else if (!yaz_matchstr(argv[0], "icuchain"))
     {
@@ -228,18 +240,24 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
         }
         else
         {
-#if HAVE_ICU
+#if YAZ_HAVE_ICU
             UErrorCode status;
             xmlNode *xml_node = xmlDocGetRootElement(zm->doc);
             zm->icu_chain = 
-                icu_chain_xml_config(xml_node, zm->locale, 
+                icu_chain_xml_config(xml_node,
+/* not sure about sort for this function yet.. */
+#if 1
+                                     1,
+#else
                                      zm->type == ZEBRA_MAP_TYPE_SORT,
+#endif                                    
                                      &status);
             if (!zm->icu_chain)
             {
                 yaz_log(YLOG_WARN, "%s:%d: Failed to load ICU chain %s",
                         fname, lineno, argv[1]);
             }
+            zm->use_chain = 1;
 #else
             yaz_log(YLOG_WARN, "%s:%d: ICU support unavailable",
                     fname, lineno);
@@ -252,6 +270,10 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
         return -1;
 #endif
     }
+    else if (!yaz_matchstr(argv[0], "debug") && argc == 2)
+    {
+        zm->debug = atoi(argv[1]);
+    }
     else
     {
         yaz_log(YLOG_WARN, "%s:%d: Unrecognized directive '%s'",  
@@ -587,6 +609,177 @@ WRBUF zebra_replace(zebra_map_t zm, const char *ex_list,
     return zm->zebra_maps->wrbuf_1;
 }
 
+#define SE_CHARS ";,.()-/?<> \r\n\t"
+
+/*
+ * Simple (non-ICU) tokenizer working on zm->input_str.
+ *
+ * Scanning resumes at zm->simple_off: any run of SE_CHARS separators is
+ * skipped, then bytes are consumed up to the next separator or the end of
+ * the buffer.  Printable ASCII bytes (33..126) of the token are lower-cased
+ * in place inside the input buffer.  zm->simple_off is advanced past the
+ * consumed bytes so the next call continues from there.
+ *
+ * On success *result_buf / *result_len describe the token as a
+ * pointer/length pair into the input buffer and 1 is returned.
+ * Returns 0 (outputs untouched) when no further token is found.
+ */
+static int tokenize_simple(zebra_map_t zm,
+                           const char **result_buf, size_t *result_len)
+{
+    char *data = wrbuf_buf(zm->input_str);
+    const size_t end = wrbuf_len(zm->input_str);
+    size_t pos = zm->simple_off;
+    size_t token_start;
+
+    /* skip leading separators */
+    for (; pos < end; pos++)
+    {
+        if (!strchr(SE_CHARS, data[pos]))
+            break;
+    }
+    token_start = pos;
+
+    /* consume the token, folding printable ASCII to lower case */
+    for (; pos < end; pos++)
+    {
+        if (strchr(SE_CHARS, data[pos]))
+            break;
+        if (data[pos] > 32 && data[pos] < 127)
+            data[pos] = tolower(data[pos]);
+    }
+
+    zm->simple_off = pos;
+    if (pos == token_start)
+        return 0;
+
+    *result_buf = data + token_start;
+    *result_len = pos - token_start;
+    return 1;
+}
+
+
+/*
+ * Fetch the next token for a chain-enabled map (zm->use_chain must be set).
+ *
+ * Without an ICU chain (or without ICU support compiled in) the work is
+ * delegated to tokenize_simple().  With an ICU chain, tokens are pulled
+ * from the chain until one with a non-empty sort key is found; empty sort
+ * keys are skipped.
+ *
+ * On success *result_buf is the token (for ICU: the value returned by
+ * icu_chain_token_sortkey(); presumably owned by the chain - confirm
+ * against the YAZ API), *result_len is its length, and 1 is returned.
+ * Returns 0 when no more tokens are available.
+ */
+int zebra_map_tokenize_next(zebra_map_t zm,
+                            const char **result_buf, size_t *result_len)
+{
+    assert(zm->use_chain);
+
+#if YAZ_HAVE_ICU
+    if (!zm->icu_chain)
+        return tokenize_simple(zm, result_buf, result_len);
+    else
+    {
+        /* Must start out cleared: it is handed to the chain API and is
+           also inspected after the loop, even if no token was produced. */
+        UErrorCode status = U_ZERO_ERROR;
+        while (icu_chain_next_token(zm->icu_chain, &status))
+        {
+            assert(U_SUCCESS(status));
+            *result_buf = icu_chain_token_sortkey(zm->icu_chain);
+            assert(*result_buf);
+
+            *result_len = strlen(*result_buf);
+
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_write_escaped(zm->print_str, *result_buf, *result_len);
+                yaz_log(YLOG_LOG, "output %s", wrbuf_cstr(zm->print_str));
+            }
+
+            /* skip tokens whose sort key is empty */
+            if (**result_buf != '\0')
+                return 1;
+        }
+        assert(U_SUCCESS(status));
+    }
+    return 0;
+#else
+    return tokenize_simple(zm, result_buf, result_len);
+#endif
+}
+
+/*
+ * Begin tokenizing a new input buffer for a chain-enabled map
+ * (zm->use_chain must be set).
+ *
+ * The len bytes at buf are copied into zm->input_str and the simple
+ * tokenizer offset is reset to 0.  When an ICU chain is configured the
+ * copied string is also assigned to the chain; with zm->debug set the
+ * escaped input is logged first.
+ *
+ * Always returns 0.  Tokens are subsequently retrieved with
+ * zebra_map_tokenize_next().
+ */
+int zebra_map_tokenize_start(zebra_map_t zm,
+                             const char *buf, size_t len)
+{
+    assert(zm->use_chain);
+
+    wrbuf_rewind(zm->input_str);
+    wrbuf_write(zm->input_str, buf, len);
+    zm->simple_off = 0;
+#if YAZ_HAVE_ICU
+    if (zm->icu_chain)
+    {
+        /* Cleared before use, as ICU-style error codes are expected to be
+           initialised by the caller. */
+        UErrorCode status = U_ZERO_ERROR;
+        if (zm->debug)
+        {
+            wrbuf_rewind(zm->print_str);
+            wrbuf_write_escaped(zm->print_str, wrbuf_buf(zm->input_str),
+                                wrbuf_len(zm->input_str));
+
+            yaz_log(YLOG_LOG, "input %s",
+                    wrbuf_cstr(zm->print_str));
+        }
+        icu_chain_assign_cstr(zm->icu_chain,
+                              wrbuf_cstr(zm->input_str),
+                              &status);
+        assert(U_SUCCESS(status));
+    }
+#endif
+    return 0;
+}
+
+#if 0
+/*
+ * DISABLED (compiled out by the surrounding #if 0).
+ *
+ * Combined start/next tokenizer: a non-NULL buf loads new input (copied
+ * into zm->input_str, offset reset, ICU chain re-assigned); a NULL buf
+ * continues with the current input.  Returns 1 with *result_buf /
+ * *result_len set when a token is available, 0 otherwise.
+ */
+int zebra_map_tokenize(zebra_map_t zm,
+                       const char *buf, size_t len,
+                       const char **result_buf, size_t *result_len)
+{
+    assert(zm->use_chain);
+
+    if (buf)
+    {
+        wrbuf_rewind(zm->input_str);
+        wrbuf_write(zm->input_str, buf, len);
+        zm->simple_off = 0;
+    }
+
+#if YAZ_HAVE_ICU
+    if (!zm->icu_chain)
+        return tokenize_simple(zm, result_buf, result_len);
+    else
+    {
+        /* Cleared up front: when buf is NULL nothing assigns it before
+           it is passed on and asserted upon. */
+        UErrorCode status = U_ZERO_ERROR;
+        if (buf)
+        {
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_write_escaped(zm->print_str, wrbuf_buf(zm->input_str),
+                                    wrbuf_len(zm->input_str));
+
+                yaz_log(YLOG_LOG, "input %s",
+                        wrbuf_cstr(zm->print_str));
+            }
+            icu_chain_assign_cstr(zm->icu_chain,
+                                  wrbuf_cstr(zm->input_str),
+                                  &status);
+            assert(U_SUCCESS(status));
+        }
+        while (icu_chain_next_token(zm->icu_chain, &status))
+        {
+            assert(U_SUCCESS(status));
+            *result_buf = icu_chain_token_sortkey(zm->icu_chain);
+            assert(*result_buf);
+
+            *result_len = strlen(*result_buf);
+
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_write_escaped(zm->print_str, *result_buf, *result_len);
+                yaz_log(YLOG_LOG, "output %s", wrbuf_cstr(zm->print_str));
+            }
+
+            /* skip tokens whose sort key is empty */
+            if (**result_buf != '\0')
+                return 1;
+        }
+        assert(U_SUCCESS(status));
+    }
+    return 0;
+#else
+    return tokenize_simple(zm, result_buf, result_len);
+#endif
+}
+#endif
+
+/*
+ * Report whether the map uses chain-based tokenization.
+ *
+ * With ICU support compiled in this is the value of zm->use_chain;
+ * without ICU support it is always 0.
+ * NOTE(review): use_chain is also set by the "simplechain" directive, so
+ * a non-zero result does not by itself imply that zm->icu_chain is set -
+ * confirm this is what callers expect.
+ */
+int zebra_maps_is_icu(zebra_map_t zm)
+{
+    int chain_enabled = 0;
+#if YAZ_HAVE_ICU
+    chain_enabled = zm->use_chain;
+#endif
+    return chain_enabled;
+}
+
+
 /*
  * Local variables:
  * c-basic-offset: 4