src/test_icu_I18N.c

   1 /* $Id: test_icu_I18N.c,v 1.9 2007-05-07 12:18:34 marc Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4 This file is part of Pazpar2.
   5
   6 Pazpar2 is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 2, or (at your option) any later
   9 version.
  10
  11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Pazpar2; see the file LICENSE.  If not, write to the
  18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19 02111-1307, USA.
  20  */
  21
  22 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  23
  24
  25 #if HAVE_CONFIG_H
  26 #include "cconfig.h"
  27 #endif
  28
  29 #define USE_TIMING 0
  30 #if USE_TIMING
  31 #include <yaz/timing.h>
  32 #endif
  33
  34 #include <yaz/test.h>
  35
  36
  37
  38 #ifdef HAVE_ICU
  39 #include "icu_I18N.h"
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 #include <unicode/ustring.h>
  45 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  46
  47
  48 #define MAX_KEY_SIZE 256
  49
  50 struct icu_termmap
  51 {
  52     uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
  53     char disp_term[MAX_KEY_SIZE];  // standard C utf-8 string
  54 };
  55
  56
  57
  58 int icu_termmap_cmp(const void *vp1, const void *vp2)
  59 {
  60     struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
  61     struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
  62
  63     int cmp = 0;
  64
  65     cmp = strcmp((const char *)itmp1->sort_key,
  66                  (const char *)itmp2->sort_key);
  67     return cmp;
  68 };
  69
  70
  71
  72 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
  73                       struct icu_buf_utf16 * src16,
  74                       const char *locale, char action,
  75                       UErrorCode *status)
  76 {
  77     int32_t dest16_len = 0;
  78
  79     switch(action) {
  80     case 'l':
  81         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
  82                                   src16->utf16, src16->utf16_len,
  83                                   locale, status);
  84         break;
  85     case 'u':
  86         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
  87                                   src16->utf16, src16->utf16_len,
  88                                   locale, status);
  89         break;
  90     case 't':
  91         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
  92                                   src16->utf16, src16->utf16_len,
  93                                   0, locale, status);
  94         break;
  95     case 'f':
  96         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
  97                                    src16->utf16, src16->utf16_len,
  98                                    U_FOLD_CASE_DEFAULT, status);
  99         break;
 100
 101     default:
 102         return U_UNSUPPORTED_ERROR;
 103         break;
 104     }
 105
 106     // check for buffer overflow, resize and retry
 107     if (*status == U_BUFFER_OVERFLOW_ERROR
 108         //|| dest16_len > dest16->utf16_cap
 109         ){
 110         icu_buf_utf16_resize(dest16, dest16_len * 2);
 111         *status = U_ZERO_ERROR;
 112
 113
 114         switch(action) {
 115         case 'l':
 116             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 117                                       src16->utf16, src16->utf16_len,
 118                                       locale, status);
 119             break;
 120         case 'u':
 121             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 122                                       src16->utf16, src16->utf16_len,
 123                                       locale, status);
 124             break;
 125         case 't':
 126             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 127                                       src16->utf16, src16->utf16_len,
 128                                       0, locale, status);
 129             break;
 130         case 'f':
 131         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 132                                    src16->utf16, src16->utf16_len,
 133                                    U_FOLD_CASE_DEFAULT, status);
 134         break;
 135
 136         default:
 137             return U_UNSUPPORTED_ERROR;
 138             break;
 139         }
 140     }
 141
 142     if (U_SUCCESS(*status)
 143         && dest16_len < dest16->utf16_cap)
 144         dest16->utf16_len = dest16_len;
 145     else {
 146         dest16->utf16[0] = (UChar) 0;
 147         dest16->utf16_len = 0;
 148     }
 149
 150     return *status;
 151 };
 152
 153
 154
 155 int test_icu_casemap(const char * locale, char action,
 156                      const char * src8cstr, const char * chk8cstr)
 157 {
 158     int success = 0;
 159     UErrorCode status = U_ZERO_ERROR;
 160
 161     struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
 162     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
 163     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 164     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
 165
 166
 167     int src8cstr_len = strlen(src8cstr);
 168     int chk8cstr_len = strlen(chk8cstr);
 169
 170     // converting to UTF16
 171     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 172
 173     // perform case mapping
 174     icu_utf16_casemap(dest16, src16, locale, action, &status);
 175
 176     // converting to UTF8
 177     icu_utf16_to_utf8(dest8, dest16, &status);
 178
 179
 180
 181     // determine success
 182     if (dest8->utf8
 183         && (dest8->utf8_len == strlen(chk8cstr))
 184         && !strcmp(chk8cstr, (const char *) dest8->utf8))
 185         success = 1;
 186     else
 187         success = 0;
 188
 189     // report failures
 190     if (!success){
 191         printf("\nERROR\n");
 192         printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
 193         printf("icu_casemap '%s:%c' '%s' (%d)\n",
 194                locale, action, dest8->utf8, dest8->utf8_len);
 195         printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
 196     }
 197
 198     // clean the buffers
 199     icu_buf_utf8_destroy(src8);
 200     icu_buf_utf8_destroy(dest8);
 201     icu_buf_utf16_destroy(src16);
 202     icu_buf_utf16_destroy(dest16);
 203
 204
 205     return success;
 206 }
 207
 208
 209
 210 #if 0
 211
 212 int test_icu_casemap(const char * locale, char action,
 213                      const char * src8, const char * check8)
 214 {
 215     NMEM nmem = nmem_create();
 216     size_t buf_cap = 128;
 217     char buf[buf_cap];
 218     const char * dest8 = 0;
 219     size_t dest8_len = 0;
 220     //size_t src8_len = strlen(src8);
 221     int sucess = 0;
 222
 223     //printf("original string:   '%s' (%d)\n", src8, (int) src8_len);
 224
 225     //these shall succeed
 226     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 227                         src8, locale, action);
 228
 229
 230     //printf("icu_casemap '%s:%c' '%s' (%d)\n",
 231     //       locale, action, dest8, (int) dest8_len);
 232
 233     if (dest8
 234         && (dest8_len == strlen(check8))
 235         && !strcmp(check8, dest8))
 236         sucess = dest8_len;
 237
 238     nmem_destroy(nmem);
 239
 240     return sucess;
 241 }
 242
 243 #endif
 244
 245
 246 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 247
 248 void test_icu_I18N_casemap(int argc, char **argv)
 249 {
 250
 251     // Locale 'en'
 252
 253     // sucessful tests
 254     YAZ_CHECK(test_icu_casemap("en", 'l',
 255                                "A ReD fOx hunTS sQUirriLs",
 256                                "a red fox hunts squirrils"));
 257
 258     YAZ_CHECK(test_icu_casemap("en", 'u',
 259                                "A ReD fOx hunTS sQUirriLs",
 260                                "A RED FOX HUNTS SQUIRRILS"));
 261
 262     YAZ_CHECK(test_icu_casemap("en", 'f',
 263                                "A ReD fOx hunTS sQUirriLs",
 264                                "a red fox hunts squirrils"));
 265
 266     YAZ_CHECK(test_icu_casemap("en", 't',
 267                                "A ReD fOx hunTS sQUirriLs",
 268                                "A Red Fox Hunts Squirrils"));
 269
 270
 271     // Locale 'da'
 272
 273     // sucess expected
 274     YAZ_CHECK(test_icu_casemap("da", 'l',
 275                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 276                                "åh æble, øs fløde i åen efter blåbærgrøden"));
 277
 278     YAZ_CHECK(test_icu_casemap("da", 'u',
 279                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 280                                "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
 281
 282     YAZ_CHECK(test_icu_casemap("da", 'f',
 283                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 284                                "åh æble, øs fløde i åen efter blåbærgrøden"));
 285
 286     YAZ_CHECK(test_icu_casemap("da", 't',
 287                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 288                                "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
 289
 290     // Locale 'de'
 291
 292     // sucess expected
 293     YAZ_CHECK(test_icu_casemap("de", 'l',
 294                           "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 295                           "zwölf ärgerliche würste rollen über die straße"));
 296
 297     YAZ_CHECK(test_icu_casemap("de", 'u',
 298                            "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 299                            "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
 300
 301     YAZ_CHECK(test_icu_casemap("de", 'f',
 302                            "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 303                            "zwölf ärgerliche würste rollen über die strasse"));
 304
 305     YAZ_CHECK(test_icu_casemap("de", 't',
 306                            "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 307                            "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
 308
 309 }
 310
 311
 312 #if 0
 313
 314 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 315
 316 void test_icu_I18N_casemap_failures(int argc, char **argv)
 317 {
 318
 319     size_t buf_cap = 128;
 320     char buf[buf_cap];
 321     size_t dest8_len = 0;
 322     NMEM nmem = nmem_create();
 323     char * dest8 = 0;
 324
 325     const char * src8 =  "A ReD fOx hunTS sQUirriLs";
 326     //size_t src8_len = strlen(src8);
 327
 328     //printf("original string:   '%s' (%d)\n", src8, (int) src8_len);
 329
 330     // some calling error needs investigation
 331     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 332                         src8, "en", 't');
 333     YAZ_CHECK(0 == dest8_len);
 334     //printf("icu_casemap 'en:t' '%s' (%d)\n", dest8, (int) dest8_len);
 335
 336
 337     // attention: does not fail even if no locale 'xy_zz' defined
 338     // it seems to default to english locale
 339     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 340                         src8, "zz_abc", 'l');
 341     YAZ_CHECK(dest8_len);
 342     //printf("icu_casemap 'zz:l' '%s' (%d)\n", dest8, (int) dest8_len);
 343
 344
 345     // shall fail - no buf buffer defined
 346     dest8 = icu_casemap(nmem, 0, buf_cap, &dest8_len,
 347                         src8, "en", 'l');
 348     YAZ_CHECK(0 == dest8_len);
 349     //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);
 350
 351     // shall fail - no buf_cap  defined
 352     dest8 = icu_casemap(nmem, buf, 0, &dest8_len,
 353                         src8, "en", 'l');
 354     YAZ_CHECK(0 == dest8_len);
 355     //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);
 356
 357     // shall fail - no action 'x' defined
 358     dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
 359                         src8, "en", 'x');
 360     YAZ_CHECK(0 == dest8_len);
 361     //printf("icu_casemap 'en:x' '%s' (%d)\n", dest8, (int) dest8_len);
 362
 363     nmem_destroy(nmem);
 364 }
 365
 366
 367
 368 #endif
 369
 370 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 371
 372 #if 0
 373 int test_icu_sortmap(const char * locale, size_t list_len,
 374                      const char ** src8_list, const char ** check8_list)
 375 {
 376     int sucess = 1;
 377
 378     size_t i = 0;
 379
 380
 381     NMEM nmem = nmem_create();
 382     size_t buf_cap = 128;
 383     char buf[buf_cap];
 384     struct icu_termmap ** dest8_list
 385         = nmem_malloc(nmem, sizeof(struct icu_termmap *) * list_len);
 386     //size_t dest8_len = 0;
 387     //size_t src8_len = strlen(src8);
 388
 389     // initializing icu_termmap
 390     for (i = 0; i < list_len; i++){
 391         dest8_list[i] = icu_termmap_create(nmem);
 392         dest8_list[i]->norm_term = nmem_strdup(nmem, src8_list[i]);
 393         dest8_list[i]->disp_term = nmem_strdup(nmem, src8_list[i]);
 394         //dest8_list[i]->sort_key =  nmem_strdup(nmem, src8_list[i]);
 395         //dest8_list[i]->sort_len =  strlen(src8_list[i]);
 396         dest8_list[i]->sort_key
 397             = icu_sortmap(nmem, buf, buf_cap, 0, src8_list[i], locale);
 398         // = icu_sortmap(nmem, buf, buf_cap, &(dest8_list[i]->sort_len),
 399         //                  src8_list[i], locale);
 400     }
 401
 402     // do the sorting
 403     qsort(dest8_list, list_len,
 404           sizeof(struct icu_termmap *), icu_termmap_cmp);
 405
 406     // checking correct sorting
 407     for (i = 0; i < list_len; i++){
 408         if (0 != strcmp(dest8_list[i]->disp_term, check8_list[i])){
 409             sucess = 0;
 410         }
 411     }
 412
 413     if (1 || !sucess){
 414         printf("\n");
 415         printf("Input    '%s':", locale);
 416         for (i = 0; i < list_len; i++)
 417             printf(" '%s'", src8_list[i]);
 418         printf("\n");
 419         printf("ICU sort '%s':", locale);
 420         for (i = 0; i < list_len; i++)
 421             printf(" '%s'", dest8_list[i]->disp_term);
 422         if (sucess)
 423             printf(" OK");
 424         else
 425             printf(" ERROR ??");
 426         printf("\n");
 427         printf("Expected '%s':", locale);
 428         for (i = 0; i < list_len; i++)
 429             printf(" '%s'", check8_list[i]);
 430         printf("\n");
 431     }
 432
 433     nmem_destroy(nmem);
 434
 435     return sucess;
 436 }
 437
 438 #else
 439
 440 int test_icu_sortmap(const char * locale, int src_list_len,
 441                      const char ** src_list, const char ** chk_list)
 442 {
 443     int success = 1;
 444
 445     UErrorCode status = U_ZERO_ERROR;
 446
 447   struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
 448   struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
 449
 450   int i;
 451
 452   struct icu_termmap * list[src_list_len];
 453
 454   UCollator *coll = ucol_open(locale, &status);
 455   icu_check_status(status);
 456
 457   if(!U_SUCCESS(status))
 458     return 0;
 459
 460   // assigning display terms and sort keys using buf 8 and buf16
 461   for( i = 0; i < src_list_len; i++)
 462     {
 463
 464       list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
 465
 466       // copy display term
 467       strcpy(list[i]->disp_term, src_list[i]);
 468
 469       // transforming to UTF16
 470       icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
 471       icu_check_status(status);
 472
 473       // computing sortkeys
 474       icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
 475       icu_check_status(status);
 476
 477       // assigning sortkeys
 478       memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 479       //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 480       //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
 481     }
 482
 483
 484   // do the sorting
 485   qsort(list, src_list_len,
 486         sizeof(struct icu_termmap *), icu_termmap_cmp);
 487
 488   // checking correct sorting
 489   for (i = 0; i < src_list_len; i++){
 490     if (0 != strcmp(list[i]->disp_term, chk_list[i])){
 491       success = 0;
 492     }
 493   }
 494
 495   if(!success){
 496   printf("\nERROR\n");
 497   printf("Input str: '%s' : ", locale);
 498   for (i = 0; i < src_list_len; i++) {
 499     printf(" '%s'", list[i]->disp_term);
 500   }
 501   printf("\n");
 502   printf("ICU sort:  '%s' : ", locale);
 503   for (i = 0; i < src_list_len; i++) {
 504     printf(" '%s'", list[i]->disp_term);
 505     //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
 506   }
 507   printf("\n");
 508   printf("Expected:  '%s' : ", locale);
 509   for (i = 0; i < src_list_len; i++) {
 510     printf(" '%s'", chk_list[i]);
 511   }
 512   printf("\n");
 513   }
 514
 515
 516   ucol_close(coll);
 517
 518   icu_buf_utf8_destroy(buf8);
 519   icu_buf_utf16_destroy(buf16);
 520
 521
 522
 523     return success;
 524 }
 525
 526
 527 #endif
 528
 529
 530 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 531
 532 void test_icu_I18N_sortmap(int argc, char **argv)
 533 {
 534
 535     // sucessful tests
 536     size_t en_1_len = 6;
 537     const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
 538     const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
 539     YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
 540     YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
 541     YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
 542     YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
 543     YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
 544
 545     // sucessful tests
 546     size_t da_1_len = 6;
 547     const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
 548     const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
 549     YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
 550     YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
 551
 552     // sucessful tests
 553     size_t de_1_len = 9;
 554     const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
 555     const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
 556     YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
 557     YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
 558     YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
 559
 560 }
 561
 562
 563 #endif
 564
 565 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 566
 567 int main(int argc, char **argv)
 568 {
 569
 570     YAZ_CHECK_INIT(argc, argv);
 571     YAZ_CHECK_LOG();
 572
 573 #ifdef HAVE_ICU
 574
 575     //test_icu_I18N_casemap_failures(argc, argv);
 576     test_icu_I18N_casemap(argc, argv);
 577     test_icu_I18N_sortmap(argc, argv);
 578
 579 #else
 580
 581     printf("ICU unit tests omitted.\n"
 582            "Please install libicu36-dev and icu-doc or similar\n");
 583     YAZ_CHECK(0 == 0);
 584
 585 #endif
 586
 587     YAZ_CHECK_TERM;
 588 }
 589
 590
 591 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 592
 593 /*
 594  * Local variables:
 595  * c-basic-offset: 4
 596  * indent-tabs-mode: nil
 597  * End:
 598  * vim: shiftwidth=4 tabstop=8 expandtab
 599  */