Test case for YAZ-834
[yaz-moved-to-github.git] / test / test_icu.c
index b1de907..5107693 100644 (file)
@@ -1,5 +1,5 @@
 /* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2013 Index Data
+ * Copyright (C) Index Data
  * See the file LICENSE for details.
  */
 
 #include <pthread.h>
 #endif
 
+#if YAZ_HAVE_XML2
+#include <libxml/xmlmemory.h>
+#endif
+
 #include <string.h>
 #include <stdlib.h>
 
@@ -364,6 +368,7 @@ static int test_icu_tokenizer(const char *locale, char action,
     struct icu_buf_utf16 *tkn16 = icu_buf_utf16_create(0);
     struct icu_buf_utf8 *tkn8 = icu_buf_utf8_create(0);
     struct icu_tokenizer *tokenizer = 0;
+    size_t org_start, org_len;
 
     /* transforming to UTF16 */
     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
@@ -379,7 +384,8 @@ static int test_icu_tokenizer(const char *locale, char action,
     icu_check_status(status);
 
     /* perform work on tokens */
-    while (icu_tokenizer_next_token(tokenizer, tkn16, &status))
+    while (icu_tokenizer_next_token(tokenizer, tkn16, &status,
+                                    &org_start, &org_len))
     {
         icu_check_status(status);
 
@@ -891,6 +897,68 @@ static void check_icu_iter4(void)
 }
 
 
+/* Run one input through the iterator (test case for YAZ-834): each token
+ * must yield a normalized string and bounded original-text offsets. */
+static void check_norm_input(yaz_icu_iter_t it, const char *input)
+{
+    icu_iter_first(it, input);
+    while (icu_iter_next(it))
+    {
+        const char *norm_str = icu_iter_get_norm(it);
+        size_t start, len;
+
+        YAZ_CHECK(norm_str);
+        if (norm_str)
+            yaz_log(YLOG_LOG, "norm_str len=%ld=%s",
+                    (long) strlen(norm_str), norm_str);
+        /* Inputs are a few bytes long, so 1000 is a generous sanity bound. */
+        icu_iter_get_org_info(it, &start, &len);
+        YAZ_CHECK(start <= 1000);
+        YAZ_CHECK(len <= 1000);
+    }
+}
+
+/* Build an ICU chain from XML and run inputs containing a non-BMP
+ * character, with and without a leading control character, through it. */
+static void check_norm(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    struct icu_chain *chain = 0;
+    xmlNode *xml_node;
+    yaz_icu_iter_t it = 0;
+
+    const char *xml_str =
+        "  <icu_chain id=\"relevance\" locale=\"en\">"
+        "    <transform rule=\"[:Control:] Any-Remove\"/>"
+        "    <tokenize rule=\"l\"/>"
+        "    <transform rule=\"[[:WhiteSpace:][:Punctuation:]`] Remove\"/>"
+        "    <casemap rule=\"l\"/>"
+        "  </icu_chain>";
+
+    xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
+    YAZ_CHECK(doc);
+    if (!doc)
+        return;
+    xml_node = xmlDocGetRootElement(doc);
+    YAZ_CHECK(xml_node);
+    if (xml_node)
+    {
+        chain = icu_chain_xml_config(xml_node, 1, &status);
+        YAZ_CHECK(chain);
+    }
+    /* No chain means nothing to iterate; fall through to cleanup. */
+    if (chain)
+        it = icu_iter_create(chain);
+    if (it)
+    {
+        check_norm_input(it, " y😄");
+        check_norm_input(it, "\n y😄");
+    }
+    /* Single cleanup path: doc is freed even when root/chain are missing. */
+    icu_iter_destroy(it);
+    icu_chain_destroy(chain);
+    xmlFreeDoc(doc);
+}
 #endif /* YAZ_HAVE_ICU */
 
 int main(int argc, char **argv)
@@ -913,8 +981,13 @@ int main(int argc, char **argv)
     check_icu_iter4();
 
     check_bug_1140();
+    check_norm();
 
     u_cleanup();
+#if YAZ_HAVE_XML2
+    xmlCleanupParser();
+#endif
+
 #else /* YAZ_HAVE_ICU */
 
     yaz_log(YLOG_LOG, "ICU unit tests omitted");