Updated ICU casemap wrappers to use dynamic buffers; all ICU tests now succeed

author Marc Cromme <marc@indexdata.dk>

Mon, 7 May 2007 12:18:34 +0000 (12:18 +0000)

committer Marc Cromme <marc@indexdata.dk>

Mon, 7 May 2007 12:18:34 +0000 (12:18 +0000)
author Marc Cromme <marc@indexdata.dk>
Mon, 7 May 2007 12:18:34 +0000 (12:18 +0000)
committer Marc Cromme <marc@indexdata.dk>
Mon, 7 May 2007 12:18:34 +0000 (12:18 +0000)
diff --git a/src/icu_I18N.c b/src/icu_I18N.c

index 6dd150e..846ad4b 100644 (file)
--- a/src/icu_I18N.c
+++ b/src/icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.c,v 1.5 2007-05-07 09:31:36 marc Exp $
+/* $Id: icu_I18N.c,v 1.6 2007-05-07 12:18:34 marc Exp $
     Copyright (c) 2006-2007, Index Data.
  
  This file is part of Pazpar2.
@@ -202,7 +202,8 @@ UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
                    (const char *) src8->utf8, src8->utf8_len, status);
    }
  
-  if (*status != U_BUFFER_OVERFLOW_ERROR
+    //if (*status != U_BUFFER_OVERFLOW_ERROR
+  if (U_SUCCESS(*status)  
        && utf16_len < dest16->utf16_cap)
      dest16->utf16_len = utf16_len;
    else {
@@ -239,7 +240,8 @@ UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
                    src8cstr, src8cstr_len, status);
    }
  
-  if (*status != U_BUFFER_OVERFLOW_ERROR
+  //  if (*status != U_BUFFER_OVERFLOW_ERROR
+  if (U_SUCCESS(*status)  
        && utf16_len < dest16->utf16_cap)
      dest16->utf16_len = utf16_len;
    else {
@@ -251,6 +253,45 @@ UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
  };
  
  
+
+/* Convert the UTF-16 text held in src16 to UTF-8 and store it in dest8.
+ *
+ * dest8:  destination buffer; resized with icu_buf_utf8_resize() when the
+ *         first conversion reports U_BUFFER_OVERFLOW_ERROR.
+ * src16:  source buffer; src16->utf16_len units of src16->utf16 are passed
+ *         to u_strToUTF8.
+ * status: ICU error code, handed to u_strToUTF8 and updated in place.
+ *
+ * Returns *status.  When the conversion does not succeed, or the result
+ * does not fit strictly below the capacity, dest8 is reset to an empty
+ * string of length 0.
+ */
+UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
+                               struct icu_buf_utf16 * src16,
+                               UErrorCode * status)
+{
+  int32_t utf8_len = 0;  // output length reported by u_strToUTF8
+  // first attempt, using whatever capacity dest8 currently has
+  u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
+                &utf8_len,
+                 src16->utf16, src16->utf16_len, status);
+  
+  // on overflow, resize dest8 to twice the reported length and retry once
+  if (*status == U_BUFFER_OVERFLOW_ERROR
+      // only the ICU status is tested; no separate length-vs-capacity check
+      ){
+    icu_buf_utf8_resize(dest8, utf8_len * 2);
+    *status = U_ZERO_ERROR;  // clear the overflow before retrying
+    u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
+                &utf8_len,
+                src16->utf16, src16->utf16_len, status);
+
+  }
+
+  // accept the result only on success with utf8_len strictly below capacity
+  if (U_SUCCESS(*status)  
+      && utf8_len < dest8->utf8_cap)
+      dest8->utf8_len = utf8_len;
+  else {  // NOTE(review): writes utf8[0]; assumes dest8->utf8 is non-NULL -- confirm
+      dest8->utf8[0] = (uint8_t) 0;
+      dest8->utf8_len = 0;
+  }
+  
+  return *status;
+};
+
+
+
+
  UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
                                     struct icu_buf_utf8 * dest8, 
                                     struct icu_buf_utf16 * src16,
@@ -269,9 +310,14 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
                                    dest8->utf8, dest8->utf8_cap);
    }
  
-  if (sortkey_len > 0)
+  if (U_SUCCESS(*status)
+      && sortkey_len > 0)
      dest8->utf8_len = sortkey_len;
- 
+  else {
+    dest8->utf8[0] = (UChar) 0;
+    dest8->utf8_len = 0;
+  }
+
    return *status;
  };
  
diff --git a/src/icu_I18N.h b/src/icu_I18N.h

index eb44204..b0a91d9 100644 (file)
--- a/src/icu_I18N.h
+++ b/src/icu_I18N.h
@@ -1,4 +1,4 @@
-/* $Id: icu_I18N.h,v 1.5 2007-05-07 09:31:36 marc Exp $
+/* $Id: icu_I18N.h,v 1.6 2007-05-07 12:18:34 marc Exp $
     Copyright (c) 2006-2007, Index Data.
  
  This file is part of Pazpar2.
@@ -76,6 +76,12 @@ UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
                                      const char * src8cstr,
                                      UErrorCode * status);
  
+
+UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
+                             struct icu_buf_utf16 * src16,
+                             UErrorCode * status);
+
+
  UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
                                     struct icu_buf_utf8 * dest8, 
                                     struct icu_buf_utf16 * src16,
diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c

index 065a8f0..93428ad 100644 (file)
--- a/src/test_icu_I18N.c
+++ b/src/test_icu_I18N.c
@@ -1,4 +1,4 @@
-/* $Id: test_icu_I18N.c,v 1.8 2007-05-07 09:31:36 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.9 2007-05-07 12:18:34 marc Exp $
     Copyright (c) 2006-2007, Index Data.
  
  This file is part of Pazpar2.
@@ -37,18 +37,11 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  
  #ifdef HAVE_ICU
  #include "icu_I18N.h"
+
  #include <string.h>
  #include <stdlib.h>
-#include <stdio.h>
-
-
-#include <unicode/ustring.h>  /* some more string fcns*/
-#include <unicode/uchar.h>    /* char names           */
-//#include <unicode/ustdio.h>
-//#include <unicode/utypes.h>   /* Basic ICU data types */
-#include <unicode/ucol.h> 
-
  
+#include <unicode/ustring.h>  
  // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  
  
@@ -56,22 +49,160 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  
  struct icu_termmap
  {
-  uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
-  char disp_term[MAX_KEY_SIZE];  // standard C utf-8 string
+    uint8_t sort_key[MAX_KEY_SIZE]; // sort key bytes, '\0' terminated; compared with strcmp() in icu_termmap_cmp
+    char disp_term[MAX_KEY_SIZE];  // display form of the term: standard C utf-8 string
  };
  
  
  
  int icu_termmap_cmp(const void *vp1, const void *vp2)
  {
-  struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
-  struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
+    struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
+    struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
+    /* Comparator over pointers to struct icu_termmap: vp1 and vp2 each
+     * address a struct icu_termmap pointer, dereferenced once above.
+     * (Presumably used as a qsort() callback -- confirm against callers.) */
+    int cmp = 0;
+    // order the two entries by strcmp() of their '\0'-terminated sort keys
+    cmp = strcmp((const char *)itmp1->sort_key, 
+                 (const char *)itmp2->sort_key);
+    return cmp;
+};
+
+
+/* Case-map the UTF-16 text in src16 into dest16.
+ *
+ * action selects the ICU call that is made:
+ *   'l'  u_strToLower
+ *   'u'  u_strToUpper
+ *   't'  u_strToTitle
+ *   'f'  u_strFoldCase with U_FOLD_CASE_DEFAULT (locale is not passed)
+ * Any other action returns U_UNSUPPORTED_ERROR immediately, without
+ * writing to *status or to dest16.
+ *
+ * When the first call leaves U_BUFFER_OVERFLOW_ERROR in *status, dest16 is
+ * resized to twice the returned length and the same call is made again.
+ *
+ * Returns *status.  When the mapping does not succeed, or the result does
+ * not fit strictly below the capacity, dest16 is reset to an empty string
+ * of length 0.
+ */
+int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
+                      struct icu_buf_utf16 * src16,
+                      const char *locale, char action,
+                      UErrorCode *status)
+{
+    int32_t dest16_len = 0;  // length returned by the ICU mapping call
+    // first attempt, using whatever capacity dest16 currently has
+    switch(action) {    
+    case 'l':    
+        dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
+                                  src16->utf16, src16->utf16_len, 
+                                  locale, status);
+        break;
+    case 'u':    
+        dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
+                                  src16->utf16, src16->utf16_len, 
+                                  locale, status);
+        break;
+    case 't':    
+        dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
+                                  src16->utf16, src16->utf16_len,
+                                  0, locale, status);
+        break;
+    case 'f':    
+        dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
+                                   src16->utf16, src16->utf16_len,
+                                   U_FOLD_CASE_DEFAULT, status);
+        break;
+        
+    default:
+        return U_UNSUPPORTED_ERROR;  // unknown action: *status is left as it was
+        break;
+    }
+
+    // check for buffer overflow, resize and retry
+    if (*status == U_BUFFER_OVERFLOW_ERROR
+        // only the ICU status is tested; no separate length-vs-capacity check
+        ){
+        icu_buf_utf16_resize(dest16, dest16_len * 2);
+        *status = U_ZERO_ERROR;  // clear the overflow before retrying
  
-  int cmp = 0;
     
-  cmp = strcmp((const char *)itmp1->sort_key, 
-               (const char *)itmp2->sort_key);
-  return cmp;
+        switch(action) {    
+        case 'l':    
+            dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
+                                      src16->utf16, src16->utf16_len, 
+                                      locale, status);
+            break;
+        case 'u':    
+            dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
+                                      src16->utf16, src16->utf16_len, 
+                                      locale, status);
+            break;
+        case 't':    
+            dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
+                                      src16->utf16, src16->utf16_len,
+                                      0, locale, status);
+            break;
+        case 'f':    
+        dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
+                                   src16->utf16, src16->utf16_len,
+                                   U_FOLD_CASE_DEFAULT, status);
+        break;
+        
+        default:
+            return U_UNSUPPORTED_ERROR;
+            break;
+        }
+    }
+    // accept the result only on success with dest16_len strictly below capacity
+    if (U_SUCCESS(*status)
+        && dest16_len < dest16->utf16_cap)
+        dest16->utf16_len = dest16_len;
+    else {  // NOTE(review): writes utf16[0]; assumes dest16->utf16 is non-NULL -- confirm
+        dest16->utf16[0] = (UChar) 0;
+        dest16->utf16_len = 0;
+    }
+  
+    return *status;
+};
+
+
+/* Test helper: case-map src8cstr and compare the result with chk8cstr.
+ *
+ * locale, action: passed unchanged to icu_utf16_casemap().
+ * src8cstr:       input C string, converted to UTF-16 before mapping.
+ * chk8cstr:       expected C string after mapping and conversion back
+ *                 to UTF-8.
+ *
+ * Returns 1 when the converted result has the same length as chk8cstr and
+ * strcmp() reports the two equal, 0 otherwise.  On a mismatch the input,
+ * the actual result and the expected string are printed with printf().
+ */
+int test_icu_casemap(const char * locale, char action,
+                     const char * src8cstr, const char * chk8cstr)
+{
+    int success = 0;
+    UErrorCode status = U_ZERO_ERROR;
+
+    struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);  // NOTE(review): only created and destroyed, never read
+    struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
+    struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
+    struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
+
+    // lengths are used only in the failure report below
+    int src8cstr_len = strlen(src8cstr);
+    int chk8cstr_len = strlen(chk8cstr);
+
+    // converting to UTF16
+    icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
+
+    // perform case mapping
+    icu_utf16_casemap(dest16, src16, locale, action, &status);
+  
+    // converting to UTF8
+    icu_utf16_to_utf8(dest8, dest16, &status);
+      
+    // status itself is not inspected; success is judged from dest8 alone
+  
+    // determine success
+    if (dest8->utf8 
+        && (dest8->utf8_len == strlen(chk8cstr))
+        && !strcmp(chk8cstr, (const char *) dest8->utf8))
+        success = 1;
+    else
+        success = 0;
+    // NOTE(review): printf needs <stdio.h>, whose #include this commit removes from this file -- confirm another header supplies it
+    // report failures
+    if (!success){
+        printf("\nERROR\n");
+        printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
+        printf("icu_casemap '%s:%c' '%s' (%d)\n", 
+               locale, action, dest8->utf8, dest8->utf8_len);
+        printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
+    }
+  
+    // clean the buffers  
+    icu_buf_utf8_destroy(src8);
+    icu_buf_utf8_destroy(dest8);
+    icu_buf_utf16_destroy(src16);
+    icu_buf_utf16_destroy(dest16);
+  
+  
+    return success;
  }
  
  
@@ -109,6 +240,9 @@ int test_icu_casemap(const char * locale, char action,
      return sucess;
  }
  
+#endif
+
+
  // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  
  void test_icu_I18N_casemap(int argc, char **argv)
@@ -129,11 +263,10 @@ void test_icu_I18N_casemap(int argc, char **argv)
                                 "A ReD fOx hunTS sQUirriLs", 
                                 "a red fox hunts squirrils"));
      
-    // this one fails and needs more investigation ..
-    YAZ_CHECK(0 == test_icu_casemap("en", 't',
+    YAZ_CHECK(test_icu_casemap("en", 't',
                                 "A ReD fOx hunTS sQUirriLs", 
                                 "A Red Fox Hunts Squirrils"));
-
+    
  
      // Locale 'da'
  
@@ -176,6 +309,8 @@ void test_icu_I18N_casemap(int argc, char **argv)
  }
  
  
+#if 0
+
  // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  
  void test_icu_I18N_casemap_failures(int argc, char **argv)
@@ -228,6 +363,8 @@ void test_icu_I18N_casemap_failures(int argc, char **argv)
      nmem_destroy(nmem);
  }
  
+
+
  #endif
  
  // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
@@ -436,7 +573,7 @@ int main(int argc, char **argv)
  #ifdef HAVE_ICU
  
      //test_icu_I18N_casemap_failures(argc, argv);
-    //test_icu_I18N_casemap(argc, argv);
+    test_icu_I18N_casemap(argc, argv);
      test_icu_I18N_sortmap(argc, argv);
   
  #else
author	Marc Cromme <marc@indexdata.dk>
	Mon, 7 May 2007 12:18:34 +0000 (12:18 +0000)
committer	Marc Cromme <marc@indexdata.dk>
	Mon, 7 May 2007 12:18:34 +0000 (12:18 +0000)
src/icu_I18N.c		patch \| blob \| history
src/icu_I18N.h		patch \| blob \| history
src/test_icu_I18N.c		patch \| blob \| history