test/tst_icu_I18N.c

   1 /* $Id: tst_icu_I18N.c,v 1.1 2007-10-22 12:21:39 adam Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  23
  24
  25 #if HAVE_CONFIG_H
  26 #include "cconfig.h"
  27 #endif
  28
  29 #define USE_TIMING 0
  30 #if USE_TIMING
  31 #include <yaz/timing.h>
  32 #endif
  33
  34 #include <yaz/test.h>
  35
  36
  37
  38 #ifdef HAVE_ICU
  39 #include <yaz/icu_I18N.h>
  40
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 //#include <unicode/ustring.h>
  45 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  46
  47
  #define MAX_KEY_SIZE 256

  /*
   * One term of the sort test: the text as displayed plus the sort key
   * that is stored for it.
   */
  struct icu_termmap
  {
      uint8_t sort_key[MAX_KEY_SIZE]; // compared with strcmp(), so it must be '\0' terminated
      char disp_term[MAX_KEY_SIZE];   // standard C utf-8 string
  };

  /*
   * qsort() comparator for an array of `struct icu_termmap *`.
   *
   * vp1, vp2  pointers to two array elements, i.e. pointers to pointers
   *           to struct icu_termmap
   *
   * Returns the strcmp() result of the two sort keys (negative, zero or
   * positive), so the array ends up ordered by sort key.
   */
  int icu_termmap_cmp(const void *vp1, const void *vp2)
  {
      // qsort hands over const element addresses; keep the const all the way
      const struct icu_termmap *lhs = *(const struct icu_termmap * const *) vp1;
      const struct icu_termmap *rhs = *(const struct icu_termmap * const *) vp2;

      return strcmp((const char *) lhs->sort_key,
                    (const char *) rhs->sort_key);
  }
  68
  69
  70
  71
  72 int test_icu_casemap(const char * locale, char action,
  73                      const char * src8cstr, const char * chk8cstr)
  74 {
  75     int success = 0;
  76     UErrorCode status = U_ZERO_ERROR;
  77
  78     struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
  79     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
  80     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
  81     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
  82
  83
  84     int src8cstr_len = strlen(src8cstr);
  85     int chk8cstr_len = strlen(chk8cstr);
  86
  87     // converting to UTF16
  88     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
  89
  90     // perform case mapping
  91     icu_utf16_casemap(dest16, src16, locale, action, &status);
  92
  93     // converting to UTF8
  94     icu_utf16_to_utf8(dest8, dest16, &status);
  95
  96
  97
  98     // determine success
  99     if (dest8->utf8
 100         && (dest8->utf8_len == strlen(chk8cstr))
 101         && !strcmp(chk8cstr, (const char *) dest8->utf8))
 102         success = 1;
 103     else
 104         success = 0;
 105
 106     // report failures
 107     if (!success){
 108         printf("\nERROR\n");
 109         printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
 110         printf("icu_casemap '%s:%c' '%s' (%d)\n",
 111                locale, action, dest8->utf8, dest8->utf8_len);
 112         printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
 113     }
 114
 115     // clean the buffers
 116     icu_buf_utf8_destroy(src8);
 117     icu_buf_utf8_destroy(dest8);
 118     icu_buf_utf16_destroy(src16);
 119     icu_buf_utf16_destroy(dest16);
 120
 121
 122     return success;
 123 }
 124
 125
 126
 127 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 128
 /*
  * Case mapping checks for the locales 'en', 'da' and 'de'.  Every locale
  * is run through the four actions 'l', 'u', 'f' and 't' with one mixed-case
  * input sentence.  argc and argv are not used.
  */
 void test_icu_I18N_casemap(int argc, char **argv)
 {
     (void) argc;
     (void) argv;

     /* ---- locale 'en': all four checks are expected to succeed ---- */
     YAZ_CHECK(test_icu_casemap("en", 'l', "A ReD fOx hunTS sQUirriLs",
                                "a red fox hunts squirrils"));
     YAZ_CHECK(test_icu_casemap("en", 'u', "A ReD fOx hunTS sQUirriLs",
                                "A RED FOX HUNTS SQUIRRILS"));
     YAZ_CHECK(test_icu_casemap("en", 'f', "A ReD fOx hunTS sQUirriLs",
                                "a red fox hunts squirrils"));
     YAZ_CHECK(test_icu_casemap("en", 't', "A ReD fOx hunTS sQUirriLs",
                                "A Red Fox Hunts Squirrils"));

     /* ---- locale 'da': success expected ---- */
     YAZ_CHECK(test_icu_casemap("da", 'l',
                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                                "åh æble, øs fløde i åen efter blåbærgrøden"));
     YAZ_CHECK(test_icu_casemap("da", 'u',
                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                                "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
     YAZ_CHECK(test_icu_casemap("da", 'f',
                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                                "åh æble, øs fløde i åen efter blåbærgrøden"));
     YAZ_CHECK(test_icu_casemap("da", 't',
                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
                                "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));

     /* ---- locale 'de': success expected ---- */
     YAZ_CHECK(test_icu_casemap("de", 'l',
                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                                "zwölf ärgerliche würste rollen über die straße"));
     YAZ_CHECK(test_icu_casemap("de", 'u',
                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                                "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
     YAZ_CHECK(test_icu_casemap("de", 'f',
                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                                "zwölf ärgerliche würste rollen über die strasse"));
     YAZ_CHECK(test_icu_casemap("de", 't',
                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
                                "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
 }
 191
 192
 193 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 194
 195 int test_icu_sortmap(const char * locale, int src_list_len,
 196                      const char ** src_list, const char ** chk_list)
 197 {
 198     int success = 1;
 199
 200     UErrorCode status = U_ZERO_ERROR;
 201
 202     struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
 203     struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
 204
 205     int i;
 206
 207     struct icu_termmap * list[src_list_len];
 208
 209     UCollator *coll = ucol_open(locale, &status);
 210     icu_check_status(status);
 211
 212     if(U_FAILURE(status))
 213         return 0;
 214
 215     // assigning display terms and sort keys using buf 8 and buf16
 216     for( i = 0; i < src_list_len; i++)
 217         {
 218
 219             list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
 220
 221             // copy display term
 222             strcpy(list[i]->disp_term, src_list[i]);
 223
 224             // transforming to UTF16
 225             icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
 226             icu_check_status(status);
 227
 228             // computing sortkeys
 229             icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
 230             icu_check_status(status);
 231
 232             // assigning sortkeys
 233             memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 234             //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 235             //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
 236         }
 237
 238
 239     // do the sorting
 240     qsort(list, src_list_len,
 241           sizeof(struct icu_termmap *), icu_termmap_cmp);
 242
 243     // checking correct sorting
 244     for (i = 0; i < src_list_len; i++){
 245         if (0 != strcmp(list[i]->disp_term, chk_list[i])){
 246             success = 0;
 247         }
 248     }
 249
 250     if(!success){
 251         printf("\nERROR\n");
 252         printf("Input str: '%s' : ", locale);
 253         for (i = 0; i < src_list_len; i++) {
 254             printf(" '%s'", list[i]->disp_term);
 255         }
 256         printf("\n");
 257         printf("ICU sort:  '%s' : ", locale);
 258         for (i = 0; i < src_list_len; i++) {
 259             printf(" '%s'", list[i]->disp_term);
 260             //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
 261         }
 262         printf("\n");
 263         printf("Expected:  '%s' : ", locale);
 264         for (i = 0; i < src_list_len; i++) {
 265             printf(" '%s'", chk_list[i]);
 266         }
 267         printf("\n");
 268     }
 269
 270
 271
 272     for( i = 0; i < src_list_len; i++)
 273         free(list[i]);
 274
 275
 276     ucol_close(coll);
 277
 278     icu_buf_utf8_destroy(buf8);
 279     icu_buf_utf16_destroy(buf16);
 280
 281     return success;
 282 }
 283
 284
 285 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 286
 /*
  * Sort order checks for English, Danish and German locales.  Each group
  * consists of an unsorted list and the order expected after sorting.
  * argc and argv are not used.
  *
  * The list lengths are derived from the source arrays and typed int, which
  * is what test_icu_sortmap() takes.  The expected-order arrays are sized
  * from the source arrays so the two cannot silently drift apart.
  */
 void test_icu_I18N_sortmap(int argc, char **argv)
 {
     // English: lower case before upper case within one letter
     const char * en_1_src[] = {"z", "K", "a", "A", "Z", "k"};
     const char * en_1_cck[sizeof(en_1_src) / sizeof(en_1_src[0])]
         = {"a", "A", "k", "K", "z", "Z"};
     const int en_1_len = (int) (sizeof(en_1_src) / sizeof(en_1_src[0]));

     // Danish: æ, ø, å come after z
     const char * da_1_src[] = {"z", "å", "o", "æ", "a", "ø"};
     const char * da_1_cck[sizeof(da_1_src) / sizeof(da_1_src[0])]
         = {"a", "o", "z", "æ", "ø", "å"};
     const int da_1_len = (int) (sizeof(da_1_src) / sizeof(da_1_src[0]));

     // German: umlauts follow their base letter, ß follows s
     const char * de_1_src[] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
     const char * de_1_cck[sizeof(de_1_src) / sizeof(de_1_src[0])]
         = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
     const int de_1_len = (int) (sizeof(de_1_src) / sizeof(de_1_src[0]));

     (void) argc;
     (void) argv;

     // successful tests
     YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
     YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
     YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
     YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
     YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));

     // successful tests
     YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
     YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));

     // successful tests
     YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
     YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
     YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
 }
 316
 317
 318 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 319
 320
 321
 322
 323 int test_icu_normalizer(const char * rules8cstr,
 324                             const char * src8cstr,
 325                             const char * chk8cstr)
 326 {
 327     int success = 0;
 328
 329     UErrorCode status = U_ZERO_ERROR;
 330
 331     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 332     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
 333     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
 334     struct icu_normalizer * normalizer
 335         = icu_normalizer_create(rules8cstr, 'f', &status);
 336     icu_check_status(status);
 337
 338     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 339     icu_check_status(status);
 340
 341     icu_normalizer_normalize(normalizer, dest16, src16, &status);
 342     icu_check_status(status);
 343
 344     icu_utf16_to_utf8(dest8, dest16, &status);
 345     icu_check_status(status);
 346
 347
 348     if(!strcmp((const char *) dest8->utf8,
 349                (const char *) chk8cstr))
 350         success = 1;
 351     else {
 352         success = 0;
 353         printf("Normalization\n");
 354         printf("Rules:      '%s'\n", rules8cstr);
 355         printf("Input:      '%s'\n", src8cstr);
 356         printf("Normalized: '%s'\n", dest8->utf8);
 357         printf("Expected:   '%s'\n", chk8cstr);
 358     }
 359
 360
 361     icu_normalizer_destroy(normalizer);
 362     icu_buf_utf16_destroy(src16);
 363     icu_buf_utf16_destroy(dest16);
 364     icu_buf_utf8_destroy(dest8);
 365
 366     return success;
 367 };
 368
 369
 370 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 371
 /*
  * Normalizer checks: each call pairs a rule string with an input and the
  * text expected after normalization.  argc and argv are not used.
  */
 void test_icu_I18N_normalizer(int argc, char **argv)
 {
     (void) argc;
     (void) argv;

     /* removal of single character classes */
     YAZ_CHECK(test_icu_normalizer("[:Punctuation:] Any-Remove", "Don't shoot!",
                                   "Dont shoot"));
     YAZ_CHECK(test_icu_normalizer("[:Control:] Any-Remove", "Don't\n shoot!",
                                   "Don't shoot!"));
     YAZ_CHECK(test_icu_normalizer("[:Decimal_Number:] Any-Remove",
                                   "This is 4 you!", "This is  you!"));

     /* negated classes, partly combined with lower casing */
     YAZ_CHECK(test_icu_normalizer("Lower; [:^Letter:] Remove", "Don't shoot!",
                                   "dontshoot"));
     YAZ_CHECK(test_icu_normalizer("[:^Number:] Remove", "Monday 15th of April",
                                   "15"));

     /* lower casing plus removal of white space and punctuation */
     YAZ_CHECK(test_icu_normalizer("Lower;"
                                   "[[:WhiteSpace:][:Punctuation:]] Remove",
                                   " word4you? ", "word4you"));

     /* decompose, drop the nonspacing marks, compose again */
     YAZ_CHECK(test_icu_normalizer("NFD; [:Nonspacing Mark:] Remove; NFC",
                                   "à côté de l'alcôve ovoïde",
                                   "a cote de l'alcove ovoide"));
 }
 406
 407
 408 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 409
 410 int test_icu_tokenizer(const char * locale, char action,
 411                      const char * src8cstr, int count)
 412 {
 413     int success = 1;
 414
 415     UErrorCode status = U_ZERO_ERROR;
 416     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 417     struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
 418     struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
 419
 420     //printf("Input:  '%s'\n", src8cstr);
 421
 422     // transforming to UTF16
 423     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 424     icu_check_status(status);
 425
 426     // set up tokenizer
 427     struct icu_tokenizer * tokenizer
 428         = icu_tokenizer_create(locale, action, &status);
 429     icu_check_status(status);
 430     YAZ_CHECK(tokenizer);
 431
 432     // attach text buffer to tokenizer
 433     icu_tokenizer_attach(tokenizer, src16, &status);
 434     icu_check_status(status);
 435     YAZ_CHECK(tokenizer->bi);
 436
 437     // perform work on tokens
 438     //printf("Tokens: ");
 439     while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
 440         icu_check_status(status);
 441
 442         // converting to UTF8
 443         icu_utf16_to_utf8(tkn8, tkn16, &status);
 444
 445         //printf("token %d %d %d %d '%s'\n",
 446         //
 447         //       icu_tokenizer_token_start(tokenizer),
 448         //       icu_tokenizer_token_end(tokenizer),
 449         //       icu_tokenizer_token_length(tokenizer),
 450         //       tkn8->utf8);
 451     }
 452
 453     if (count != icu_tokenizer_token_count(tokenizer)){
 454         success = 0;
 455         printf("\nTokenizer '%s:%c' Error: \n", locale, action);
 456         printf("Input:  '%s'\n", src8cstr);
 457         printf("Tokens: %d", icu_tokenizer_token_count(tokenizer));
 458         printf(", expected: %d\n", count);
 459     }
 460
 461     icu_tokenizer_destroy(tokenizer);
 462     icu_buf_utf16_destroy(src16);
 463     icu_buf_utf16_destroy(tkn16);
 464     icu_buf_utf8_destroy(tkn8);
 465
 466     return success;
 467 }
 468
 469
 470 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 471
 /*
  * Token count checks for one English and one Danish text, each run with
  * the tokenizer actions 's', 'l', 'w' and 'c'.  argc and argv are not used.
  */
 void test_icu_I18N_tokenizer(int argc, char **argv)
 {
     const char * en_str
         = "O Romeo, Romeo! wherefore art thou Romeo?";

     const char * da_str
         = "Blåbærtærte. Denne kage stammer fra Finland. "
         "Den er med blåbær, men alle sommerens forskellige bær kan bruges.";

     (void) argc;
     (void) argv;

     YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2));
     YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7));
     YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16));
     YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41));

     YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3));
     // locale was spelled "dar" here, unlike the three sibling checks;
     // NOTE(review): assumed to be a typo for "da" - confirm
     YAZ_CHECK(test_icu_tokenizer("da", 'l', da_str, 17));
     YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37));
     YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110));
 }
 496
 497
 498 void test_icu_I18N_chain(int argc, char **argv)
 499 {
 500     const char * en_str
 501         = "O Romeo, Romeo! wherefore art thou\t Romeo?";
 502
 503     printf("ICU chain:\ninput: '%s'\n", en_str);
 504
 505     UErrorCode status = U_ZERO_ERROR;
 506     //struct icu_chain_step * step = 0;
 507     struct icu_chain * chain = 0;
 508
 509
 510     const char * xml_str = "<icu_chain id=\"en:word\" locale=\"en\">"
 511         "<normalize rule=\"[:Control:] Any-Remove\"/>"
 512         "<tokenize rule=\"l\"/>"
 513         "<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
 514         "<display/>"
 515         "<casemap rule=\"l\"/>"
 516         "<index/>"
 517         "<sortkey/>"
 518         "</icu_chain>";
 519
 520
 521     xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
 522     xmlNode *xml_node = xmlDocGetRootElement(doc);
 523     YAZ_CHECK(xml_node);
 524
 525
 526     chain = icu_chain_xml_config(xml_node, &status);
 527
 528 #if 0
 529     chain  = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en");
 530     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 531                                  (const uint8_t *) "[:Control:] Any-Remove",
 532                                  &status);
 533     step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 534                                  (const uint8_t *) "s",
 535                                  &status);
 536     step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 537                                  (const uint8_t *) "l",
 538                                  &status);
 539     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 540                                  (const uint8_t *)
 541                                  "[[:WhiteSpace:][:Punctuation:]] Any-Remove",
 542                                  &status);
 543     step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 544                                  (const uint8_t *)"",
 545                                  &status);
 546 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */
 547 /*                                  (const uint8_t *) "Lower", */
 548 /*                                  &status); */
 549     step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 550                                  (const uint8_t *) "l",
 551                                  &status);
 552     step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
 553                                  (const uint8_t *)"",
 554                                  &status);
 555 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, */
 556 /*                                  (const uint8_t *)"", */
 557 /*                                  &status); */
 558
 559 #endif
 560
 561     xmlFreeDoc(doc);
 562     YAZ_CHECK(chain);
 563
 564     YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));
 565
 566     while (icu_chain_next_token(chain, &status)){
 567         printf("%d '%s' '%s'\n",
 568                icu_chain_get_token_count(chain),
 569                icu_chain_get_norm(chain),
 570                icu_chain_get_display(chain));
 571     }
 572
 573     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7);
 574
 575
 576     YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
 577
 578     while (icu_chain_next_token(chain, &status)){
 579         printf("%d '%s' '%s'\n",
 580                icu_chain_get_token_count(chain),
 581                icu_chain_get_norm(chain),
 582                icu_chain_get_display(chain));
 583     }
 584
 585
 586     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3);
 587
 588     icu_chain_destroy(chain);
 589 }
 590
 591
 592 void test_bug_1140(void)
 593 {
 594     const char * en_str
 595         = "O Romeo, Romeo! wherefore art thou\t Romeo?";
 596
 597     printf("ICU chain:\ninput: '%s'\n", en_str);
 598
 599     UErrorCode status = U_ZERO_ERROR;
 600     //struct icu_chain_step * step = 0;
 601     struct icu_chain * chain = 0;
 602
 603     const char * xml_str = "<icu_chain id=\"en:word\" locale=\"en\">"
 604
 605         /* if the first rule is normalize instead. Then it works */
 606 #if 0
 607         "<normalize rule=\"[:Control:] Any-Remove\"/>"
 608 #endif
 609         "<tokenize rule=\"l\"/>"
 610         "<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
 611         "<display/>"
 612         "<casemap rule=\"l\"/>"
 613         "<index/>"
 614         "<sortkey/>"
 615         "</icu_chain>";
 616
 617
 618     xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
 619     xmlNode *xml_node = xmlDocGetRootElement(doc);
 620     YAZ_CHECK(xml_node);
 621
 622     chain = icu_chain_xml_config(xml_node, &status);
 623
 624     xmlFreeDoc(doc);
 625     YAZ_CHECK(chain);
 626
 627     YAZ_CHECK(icu_chain_assign_cstr(
 628                   chain,  "O Romeo, Romeo! wherefore art thou\t Romeo?",
 629                   &status));
 630
 631     while (icu_chain_next_token(chain, &status))
 632         ;
 633
 634     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7);
 635
 636     YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
 637
 638     while (icu_chain_next_token(chain, &status)){
 639         printf("%d '%s' '%s'\n",
 640                icu_chain_get_token_count(chain),
 641                icu_chain_get_norm(chain),
 642                icu_chain_get_display(chain));
 643     }
 644
 645     /* we expect 'what' 'is' 'this', i.e. 3 tokens */
 646     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3);
 647
 648     icu_chain_destroy(chain);
 649 }
 650
 651 #endif // HAVE_ICU
 652
 653 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 654
 655 int main(int argc, char **argv)
 656 {
 657
 658     YAZ_CHECK_INIT(argc, argv);
 659     YAZ_CHECK_LOG();
 660
 661 #ifdef HAVE_ICU
 662
 663     //test_icu_I18N_casemap_failures(argc, argv);
 664     test_icu_I18N_casemap(argc, argv);
 665     test_icu_I18N_sortmap(argc, argv);
 666     test_icu_I18N_normalizer(argc, argv);
 667     test_icu_I18N_tokenizer(argc, argv);
 668     test_icu_I18N_chain(argc, argv);
 669     test_bug_1140();
 670
 671 #else // HAVE_ICU
 672
 673     printf("ICU unit tests omitted.\n"
 674            "Please install libicu36-dev and icu-doc or similar\n");
 675     YAZ_CHECK(0 == 0);
 676
 677 #endif // HAVE_ICU
 678
 679     YAZ_CHECK_TERM;
 680 }
 681
 682
 683 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 684
 685
 686
 687 /*
 688  * Local variables:
 689  * c-basic-offset: 4
 690  * indent-tabs-mode: nil
 691  * End:
 692  * vim: shiftwidth=4 tabstop=8 expandtab
 693  */