src/test_icu_I18N.c

   1 /* $Id: test_icu_I18N.c,v 1.11 2007-05-09 14:01:21 marc Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  23
  24
  25 #if HAVE_CONFIG_H
  26 #include "cconfig.h"
  27 #endif
  28
  29 #define USE_TIMING 0
  30 #if USE_TIMING
  31 #include <yaz/timing.h>
  32 #endif
  33
  34 #include <yaz/test.h>
  35
  36
  37
  38 #ifdef HAVE_ICU
  39 #include "icu_I18N.h"
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 //#include <unicode/ustring.h>
  45 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  46
  47
  48 #define MAX_KEY_SIZE 256
  49 struct icu_termmap
  50 {
  51     uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
  52     char disp_term[MAX_KEY_SIZE];  // standard C utf-8 string
  53 };
  54
  55
  56
  57 int icu_termmap_cmp(const void *vp1, const void *vp2)
  58 {
  59     struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
  60     struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
  61
  62     int cmp = 0;
  63
  64     cmp = strcmp((const char *)itmp1->sort_key,
  65                  (const char *)itmp2->sort_key);
  66     return cmp;
  67 };
  68
  69
  70
  71
  72 int test_icu_casemap(const char * locale, char action,
  73                      const char * src8cstr, const char * chk8cstr)
  74 {
  75     int success = 0;
  76     UErrorCode status = U_ZERO_ERROR;
  77
  78     struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
  79     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
  80     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
  81     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
  82
  83
  84     int src8cstr_len = strlen(src8cstr);
  85     int chk8cstr_len = strlen(chk8cstr);
  86
  87     // converting to UTF16
  88     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
  89
  90     // perform case mapping
  91     icu_utf16_casemap(dest16, src16, locale, action, &status);
  92
  93     // converting to UTF8
  94     icu_utf16_to_utf8(dest8, dest16, &status);
  95
  96
  97
  98     // determine success
  99     if (dest8->utf8
 100         && (dest8->utf8_len == strlen(chk8cstr))
 101         && !strcmp(chk8cstr, (const char *) dest8->utf8))
 102         success = 1;
 103     else
 104         success = 0;
 105
 106     // report failures
 107     if (!success){
 108         printf("\nERROR\n");
 109         printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
 110         printf("icu_casemap '%s:%c' '%s' (%d)\n",
 111                locale, action, dest8->utf8, dest8->utf8_len);
 112         printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
 113     }
 114
 115     // clean the buffers
 116     icu_buf_utf8_destroy(src8);
 117     icu_buf_utf8_destroy(dest8);
 118     icu_buf_utf16_destroy(src16);
 119     icu_buf_utf16_destroy(dest16);
 120
 121
 122     return success;
 123 }
 124
 125
 126
 127 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 128
// Case mapping checks for the locales 'en', 'da' and 'de'.
//
// Every locale is exercised with the four actions 'l', 'u', 'f' and 't';
// judging by the expected strings these are lower case, upper case,
// case folding and title case. Each YAZ_CHECK wraps one call to
// test_icu_casemap(). argc and argv are not used.
void test_icu_I18N_casemap(int argc, char **argv)
{

    // Locale 'en'

    // successful tests
    YAZ_CHECK(test_icu_casemap("en", 'l',
                               "A ReD fOx hunTS sQUirriLs",
                               "a red fox hunts squirrils"));

    YAZ_CHECK(test_icu_casemap("en", 'u',
                               "A ReD fOx hunTS sQUirriLs",
                               "A RED FOX HUNTS SQUIRRILS"));

    YAZ_CHECK(test_icu_casemap("en", 'f',
                               "A ReD fOx hunTS sQUirriLs",
                               "a red fox hunts squirrils"));

    YAZ_CHECK(test_icu_casemap("en", 't',
                               "A ReD fOx hunTS sQUirriLs",
                               "A Red Fox Hunts Squirrils"));


    // Locale 'da'

    // success expected
    YAZ_CHECK(test_icu_casemap("da", 'l',
                               "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                               "åh æble, øs fløde i åen efter blåbærgrøden"));

    YAZ_CHECK(test_icu_casemap("da", 'u',
                               "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                               "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));

    YAZ_CHECK(test_icu_casemap("da", 'f',
                               "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                               "åh æble, øs fløde i åen efter blåbærgrøden"));

    YAZ_CHECK(test_icu_casemap("da", 't',
                               "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                               "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));

    // Locale 'de'

    // success expected
    YAZ_CHECK(test_icu_casemap("de", 'l',
                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                               "zwölf ärgerliche würste rollen über die straße"));

    // upper casing expands the sharp s to "SS"
    YAZ_CHECK(test_icu_casemap("de", 'u',
                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                               "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));

    // case folding expands the sharp s to "ss"
    YAZ_CHECK(test_icu_casemap("de", 'f',
                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                               "zwölf ärgerliche würste rollen über die strasse"));

    YAZ_CHECK(test_icu_casemap("de", 't',
                               "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                               "Zwölf Ärgerliche Würste Rollen Über Die Straße"));

}
 191
 192
 193 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 194
 195 int test_icu_sortmap(const char * locale, int src_list_len,
 196                      const char ** src_list, const char ** chk_list)
 197 {
 198     int success = 1;
 199
 200     UErrorCode status = U_ZERO_ERROR;
 201
 202     struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
 203     struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
 204
 205     int i;
 206
 207     struct icu_termmap * list[src_list_len];
 208
 209     UCollator *coll = ucol_open(locale, &status);
 210     icu_check_status(status);
 211
 212     if(U_FAILURE(status))
 213         return 0;
 214
 215     // assigning display terms and sort keys using buf 8 and buf16
 216     for( i = 0; i < src_list_len; i++)
 217         {
 218
 219             list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
 220
 221             // copy display term
 222             strcpy(list[i]->disp_term, src_list[i]);
 223
 224             // transforming to UTF16
 225             icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
 226             icu_check_status(status);
 227
 228             // computing sortkeys
 229             icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
 230             icu_check_status(status);
 231
 232             // assigning sortkeys
 233             memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 234             //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 235             //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
 236         }
 237
 238
 239     // do the sorting
 240     qsort(list, src_list_len,
 241           sizeof(struct icu_termmap *), icu_termmap_cmp);
 242
 243     // checking correct sorting
 244     for (i = 0; i < src_list_len; i++){
 245         if (0 != strcmp(list[i]->disp_term, chk_list[i])){
 246             success = 0;
 247         }
 248     }
 249
 250     if(!success){
 251         printf("\nERROR\n");
 252         printf("Input str: '%s' : ", locale);
 253         for (i = 0; i < src_list_len; i++) {
 254             printf(" '%s'", list[i]->disp_term);
 255         }
 256         printf("\n");
 257         printf("ICU sort:  '%s' : ", locale);
 258         for (i = 0; i < src_list_len; i++) {
 259             printf(" '%s'", list[i]->disp_term);
 260             //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
 261         }
 262         printf("\n");
 263         printf("Expected:  '%s' : ", locale);
 264         for (i = 0; i < src_list_len; i++) {
 265             printf(" '%s'", chk_list[i]);
 266         }
 267         printf("\n");
 268     }
 269
 270
 271     ucol_close(coll);
 272
 273     icu_buf_utf8_destroy(buf8);
 274     icu_buf_utf16_destroy(buf16);
 275
 276
 277
 278     return success;
 279 }
 280
 281
 282 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 283
// Collation checks for English, Danish and German locales.
//
// Each group defines an unsorted term list (*_src) and the order that is
// expected after sorting (*_cck), then runs test_icu_sortmap() for the
// language and for several regional variants of it. argc and argv are
// not used.
void test_icu_I18N_sortmap(int argc, char **argv)
{

    // successful tests
    // note: the size_t lengths are converted to the int parameter of
    // test_icu_sortmap()
    size_t en_1_len = 6;
    const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
    const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
    YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
    YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
    YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
    YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
    YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));

    // successful tests
    // expected Danish order puts æ, ø, å after z
    size_t da_1_len = 6;
    const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
    const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
    YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
    YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));

    // successful tests
    // expected German order puts umlauts right after their base letter
    size_t de_1_len = 9;
    const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
    const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
    YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
    YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
    YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));

}
 313
 314
 315 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 316
 317 void test_icu_I18N_normmap(int argc, char **argv)
 318 {
 319
 320
 321 }
 322
 323
 324 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 325
 326 void test_icu_I18N_tokenizer(int argc, char **argv)
 327 {
 328
 329     const char * src8cstr
 330         = "Though I am not naturally honest, I am so sometimes by chance.";
 331
 332     UErrorCode status = U_ZERO_ERROR;
 333     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 334     struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
 335     struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
 336
 337     printf("Input:  '%s'\n", src8cstr);
 338
 339     // transforming to UTF16
 340     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 341     icu_check_status(status);
 342
 343     // set up tokenizer
 344     struct icu_tokenizer * tokenizer
 345         = icu_tokenizer_create("en", 's', &status);
 346     icu_check_status(status);
 347     YAZ_CHECK(tokenizer);
 348
 349     // attach text buffer to tokenizer
 350     icu_tokenizer_attach(tokenizer, src16, &status);
 351     icu_check_status(status);
 352     YAZ_CHECK(tokenizer->bi);
 353
 354     // perform work on tokens
 355     printf("Tokens: ");
 356     while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
 357         icu_check_status(status);
 358
 359         // converting to UTF8
 360         icu_utf16_to_utf8(tkn8, tkn16, &status);
 361
 362         printf("'%s' ", tkn8->utf8);
 363
 364         //printf("token %d %d %d %d '%s'\n",
 365         //       icu_tokenizer_token_id(tokenizer),
 366         //       icu_tokenizer_token_start(tokenizer),
 367         //       icu_tokenizer_token_end(tokenizer),
 368         //       icu_tokenizer_token_length(tokenizer),
 369         //       tkn8->utf8);
 370     }
 371     printf(" (%d)(%d)\n", icu_tokenizer_token_id(tokenizer),
 372            icu_tokenizer_token_count(tokenizer));
 373
 374     icu_tokenizer_destroy(tokenizer);
 375     icu_buf_utf16_destroy(src16);
 376     icu_buf_utf16_destroy(tkn16);
 377     icu_buf_utf8_destroy(tkn8);
 378 }
 379
 380
 381
 382
 383
 384 #endif // HAVE_ICU
 385
 386 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 387
// Test driver.
//
// With HAVE_ICU defined it runs the case mapping, sorting, normalization
// and tokenizer tests in that order. Without ICU it prints a hint and
// records a single trivially true check.
// NOTE(review): there is no explicit return statement; presumably
// YAZ_CHECK_TERM from <yaz/test.h> provides it - confirm.
int main(int argc, char **argv)
{

    YAZ_CHECK_INIT(argc, argv);
    YAZ_CHECK_LOG();

#ifdef HAVE_ICU

    // disabled together with its definition, which sits in an #if 0
    // section of this file
    //test_icu_I18N_casemap_failures(argc, argv);
    test_icu_I18N_casemap(argc, argv);
    test_icu_I18N_sortmap(argc, argv);
    test_icu_I18N_normmap(argc, argv);
    test_icu_I18N_tokenizer(argc, argv);

#else // HAVE_ICU

    printf("ICU unit tests omitted.\n"
           "Please install libicu36-dev and icu-doc or similar\n");
    // keeps one passing check in the run when ICU is not available
    YAZ_CHECK(0 == 0);

#endif // HAVE_ICU

    YAZ_CHECK_TERM;
}
 412
 413
 414 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 415
 416
 417
// Disabled legacy tests follow (excluded from the build by #if 0)
 419 #if 0
 420
 421 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 422
// Disabled test (the enclosing #if 0 removes it from the build and the
// call in main() is commented out).
//
// Exercises expected failure modes of an icu_casemap() function that
// works on a caller supplied buffer and an NMEM handle. Neither is
// declared in the visible part of this file.
// NOTE(review): presumably an older API that no longer exists - confirm
// before re-enabling.
void test_icu_I18N_casemap_failures(int argc, char **argv)
{

    size_t buf_cap = 128;
    char buf[buf_cap];
    size_t dest8_len = 0;
    NMEM nmem = nmem_create();
    char * dest8 = 0;

    const char * src8 =  "A ReD fOx hunTS sQUirriLs";
    //size_t src8_len = strlen(src8);

    //printf("original string:   '%s' (%d)\n", src8, (int) src8_len);

    // some calling error needs investigation
    // (the check expects an empty result for action 't')
    dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
                        src8, "en", 't');
    YAZ_CHECK(0 == dest8_len);
    //printf("icu_casemap 'en:t' '%s' (%d)\n", dest8, (int) dest8_len);


    // attention: does not fail even if no locale 'xy_zz' defined
    // it seems to default to english locale
    dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
                        src8, "zz_abc", 'l');
    YAZ_CHECK(dest8_len);
    //printf("icu_casemap 'zz:l' '%s' (%d)\n", dest8, (int) dest8_len);


    // shall fail - no buf buffer defined
    dest8 = icu_casemap(nmem, 0, buf_cap, &dest8_len,
                        src8, "en", 'l');
    YAZ_CHECK(0 == dest8_len);
    //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);

    // shall fail - no buf_cap  defined
    dest8 = icu_casemap(nmem, buf, 0, &dest8_len,
                        src8, "en", 'l');
    YAZ_CHECK(0 == dest8_len);
    //printf("icu_casemap 'en:l' '%s' (%d)\n", dest8, (int) dest8_len);

    // shall fail - no action 'x' defined
    dest8 = icu_casemap(nmem, buf, buf_cap, &dest8_len,
                        src8, "en", 'x');
    YAZ_CHECK(0 == dest8_len);
    //printf("icu_casemap 'en:x' '%s' (%d)\n", dest8, (int) dest8_len);

    nmem_destroy(nmem);
}
 472
 473
 474
 475 #endif
 476
 477
 478
 479 /*
 480  * Local variables:
 481  * c-basic-offset: 4
 482  * indent-tabs-mode: nil
 483  * End:
 484  * vim: shiftwidth=4 tabstop=8 expandtab
 485  */