src/test_icu_I18N.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2008 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  21
  22
  23 #if HAVE_CONFIG_H
  24 #include "cconfig.h"
  25 #endif
  26
  27 #define USE_TIMING 0
  28 #if USE_TIMING
  29 #include <yaz/timing.h>
  30 #endif
  31
  32 #include <yaz/test.h>
  33
  34
  35
  36 #ifdef HAVE_ICU
  37 #include "icu_I18N.h"
  38
  39 #include <string.h>
  40 #include <stdlib.h>
  41
  42 //#include <unicode/ustring.h>
  43 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
  44
  45
  46 #define MAX_KEY_SIZE 256
  47 struct icu_termmap
  48 {
  49     uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated
  50     char disp_term[MAX_KEY_SIZE];  // standard C utf-8 string
  51 };
  52
  53
  54
  55 int icu_termmap_cmp(const void *vp1, const void *vp2)
  56 {
  57     struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1;
  58     struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2;
  59
  60     int cmp = 0;
  61
  62     cmp = strcmp((const char *)itmp1->sort_key,
  63                  (const char *)itmp2->sort_key);
  64     return cmp;
  65 };
  66
  67
  68
  69
  70 int test_icu_casemap(const char * locale, char action,
  71                      const char * src8cstr, const char * chk8cstr)
  72 {
  73     int success = 0;
  74     UErrorCode status = U_ZERO_ERROR;
  75
  76     struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0);
  77     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
  78     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
  79     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
  80
  81
  82     int src8cstr_len = strlen(src8cstr);
  83     int chk8cstr_len = strlen(chk8cstr);
  84
  85     // converting to UTF16
  86     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
  87
  88     // perform case mapping
  89     icu_utf16_casemap(dest16, src16, locale, action, &status);
  90
  91     // converting to UTF8
  92     icu_utf16_to_utf8(dest8, dest16, &status);
  93
  94
  95
  96     // determine success
  97     if (dest8->utf8
  98         && (dest8->utf8_len == strlen(chk8cstr))
  99         && !strcmp(chk8cstr, (const char *) dest8->utf8))
 100         success = 1;
 101     else
 102         success = 0;
 103
 104     // report failures
 105     if (!success){
 106         printf("\nERROR\n");
 107         printf("original string:   '%s' (%d)\n", src8cstr, src8cstr_len);
 108         printf("icu_casemap '%s:%c' '%s' (%d)\n",
 109                locale, action, dest8->utf8, dest8->utf8_len);
 110         printf("expected string:   '%s' (%d)\n", chk8cstr, chk8cstr_len);
 111     }
 112
 113     // clean the buffers
 114     icu_buf_utf8_destroy(src8);
 115     icu_buf_utf8_destroy(dest8);
 116     icu_buf_utf16_destroy(src16);
 117     icu_buf_utf16_destroy(dest16);
 118
 119
 120     return success;
 121 }
 122
 123
 124
 125 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 126
 127 void test_icu_I18N_casemap(int argc, char **argv)
 128 {
 129
 130     // Locale 'en'
 131
 132     // sucessful tests
 133     YAZ_CHECK(test_icu_casemap("en", 'l',
 134                                "A ReD fOx hunTS sQUirriLs",
 135                                "a red fox hunts squirrils"));
 136
 137     YAZ_CHECK(test_icu_casemap("en", 'u',
 138                                "A ReD fOx hunTS sQUirriLs",
 139                                "A RED FOX HUNTS SQUIRRILS"));
 140
 141     YAZ_CHECK(test_icu_casemap("en", 'f',
 142                                "A ReD fOx hunTS sQUirriLs",
 143                                "a red fox hunts squirrils"));
 144
 145     YAZ_CHECK(test_icu_casemap("en", 't',
 146                                "A ReD fOx hunTS sQUirriLs",
 147                                "A Red Fox Hunts Squirrils"));
 148
 149
 150     // Locale 'da'
 151
 152     // sucess expected
 153     YAZ_CHECK(test_icu_casemap("da", 'l',
 154                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 155                                "åh æble, øs fløde i åen efter blåbærgrøden"));
 156
 157     YAZ_CHECK(test_icu_casemap("da", 'u',
 158                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 159                                "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN"));
 160
 161     YAZ_CHECK(test_icu_casemap("da", 'f',
 162                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 163                                "åh æble, øs fløde i åen efter blåbærgrøden"));
 164
 165     YAZ_CHECK(test_icu_casemap("da", 't',
 166                                "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN",
 167                                "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden"));
 168
 169     // Locale 'de'
 170
 171     // sucess expected
 172     YAZ_CHECK(test_icu_casemap("de", 'l',
 173                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 174                                "zwölf ärgerliche würste rollen über die straße"));
 175
 176     YAZ_CHECK(test_icu_casemap("de", 'u',
 177                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 178                                "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE"));
 179
 180     YAZ_CHECK(test_icu_casemap("de", 'f',
 181                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 182                                "zwölf ärgerliche würste rollen über die strasse"));
 183
 184     YAZ_CHECK(test_icu_casemap("de", 't',
 185                                "zWÖlf ärgerliche Würste rollen ÜBer die StRAße",
 186                                "Zwölf Ärgerliche Würste Rollen Über Die Straße"));
 187
 188 }
 189
 190
 191 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 192
 193 int test_icu_sortmap(const char * locale, int src_list_len,
 194                      const char ** src_list, const char ** chk_list)
 195 {
 196     int success = 1;
 197
 198     UErrorCode status = U_ZERO_ERROR;
 199
 200     struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0);
 201     struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0);
 202
 203     int i;
 204
 205     struct icu_termmap * list[src_list_len];
 206
 207     UCollator *coll = ucol_open(locale, &status);
 208     icu_check_status(status);
 209
 210     if(U_FAILURE(status))
 211         return 0;
 212
 213     // assigning display terms and sort keys using buf 8 and buf16
 214     for( i = 0; i < src_list_len; i++)
 215         {
 216
 217             list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap));
 218
 219             // copy display term
 220             strcpy(list[i]->disp_term, src_list[i]);
 221
 222             // transforming to UTF16
 223             icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status);
 224             icu_check_status(status);
 225
 226             // computing sortkeys
 227             icu_sortkey8_from_utf16(coll, buf8, buf16, &status);
 228             icu_check_status(status);
 229
 230             // assigning sortkeys
 231             memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 232             //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len);
 233             //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8);
 234         }
 235
 236
 237     // do the sorting
 238     qsort(list, src_list_len,
 239           sizeof(struct icu_termmap *), icu_termmap_cmp);
 240
 241     // checking correct sorting
 242     for (i = 0; i < src_list_len; i++){
 243         if (0 != strcmp(list[i]->disp_term, chk_list[i])){
 244             success = 0;
 245         }
 246     }
 247
 248     if(!success){
 249         printf("\nERROR\n");
 250         printf("Input str: '%s' : ", locale);
 251         for (i = 0; i < src_list_len; i++) {
 252             printf(" '%s'", list[i]->disp_term);
 253         }
 254         printf("\n");
 255         printf("ICU sort:  '%s' : ", locale);
 256         for (i = 0; i < src_list_len; i++) {
 257             printf(" '%s'", list[i]->disp_term);
 258             //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]);
 259         }
 260         printf("\n");
 261         printf("Expected:  '%s' : ", locale);
 262         for (i = 0; i < src_list_len; i++) {
 263             printf(" '%s'", chk_list[i]);
 264         }
 265         printf("\n");
 266     }
 267
 268
 269
 270     for( i = 0; i < src_list_len; i++)
 271         free(list[i]);
 272
 273
 274     ucol_close(coll);
 275
 276     icu_buf_utf8_destroy(buf8);
 277     icu_buf_utf16_destroy(buf16);
 278
 279     return success;
 280 }
 281
 282
 283 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 284
 285 void test_icu_I18N_sortmap(int argc, char **argv)
 286 {
 287
 288     // sucessful tests
 289     size_t en_1_len = 6;
 290     const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"};
 291     const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"};
 292     YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck));
 293     YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck));
 294     YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck));
 295     YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck));
 296     YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck));
 297
 298     // sucessful tests
 299     size_t da_1_len = 6;
 300     const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"};
 301     const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"};
 302     YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck));
 303     YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck));
 304
 305     // sucessful tests
 306     size_t de_1_len = 9;
 307     const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"};
 308     const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"};
 309     YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck));
 310     YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck));
 311     YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck));
 312
 313 }
 314
 315
 316 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 317
 318
 319
 320
 321 int test_icu_normalizer(const char * rules8cstr,
 322                             const char * src8cstr,
 323                             const char * chk8cstr)
 324 {
 325     int success = 0;
 326
 327     UErrorCode status = U_ZERO_ERROR;
 328
 329     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 330     struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0);
 331     struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0);
 332     struct icu_normalizer * normalizer
 333         = icu_normalizer_create(rules8cstr, 'f', &status);
 334     icu_check_status(status);
 335
 336     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 337     icu_check_status(status);
 338
 339     icu_normalizer_normalize(normalizer, dest16, src16, &status);
 340     icu_check_status(status);
 341
 342     icu_utf16_to_utf8(dest8, dest16, &status);
 343     icu_check_status(status);
 344
 345
 346     if(!strcmp((const char *) dest8->utf8,
 347                (const char *) chk8cstr))
 348         success = 1;
 349     else {
 350         success = 0;
 351         printf("Normalization\n");
 352         printf("Rules:      '%s'\n", rules8cstr);
 353         printf("Input:      '%s'\n", src8cstr);
 354         printf("Normalized: '%s'\n", dest8->utf8);
 355         printf("Expected:   '%s'\n", chk8cstr);
 356     }
 357
 358
 359     icu_normalizer_destroy(normalizer);
 360     icu_buf_utf16_destroy(src16);
 361     icu_buf_utf16_destroy(dest16);
 362     icu_buf_utf8_destroy(dest8);
 363
 364     return success;
 365 };
 366
 367
 368 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 369
 370 void test_icu_I18N_normalizer(int argc, char **argv)
 371 {
 372
 373     YAZ_CHECK(test_icu_normalizer("[:Punctuation:] Any-Remove",
 374                                   "Don't shoot!",
 375                                   "Dont shoot"));
 376
 377     YAZ_CHECK(test_icu_normalizer("[:Control:] Any-Remove",
 378                                   "Don't\n shoot!",
 379                                   "Don't shoot!"));
 380
 381     YAZ_CHECK(test_icu_normalizer("[:Decimal_Number:] Any-Remove",
 382                                   "This is 4 you!",
 383                                   "This is  you!"));
 384
 385     YAZ_CHECK(test_icu_normalizer("Lower; [:^Letter:] Remove",
 386                                   "Don't shoot!",
 387                                   "dontshoot"));
 388
 389     YAZ_CHECK(test_icu_normalizer("[:^Number:] Remove",
 390                                   "Monday 15th of April",
 391                                   "15"));
 392
 393     YAZ_CHECK(test_icu_normalizer("Lower;"
 394                                   "[[:WhiteSpace:][:Punctuation:]] Remove",
 395                                   " word4you? ",
 396                                   "word4you"));
 397
 398
 399     YAZ_CHECK(test_icu_normalizer("NFD; [:Nonspacing Mark:] Remove; NFC",
 400                                   "à côté de l'alcôve ovoïde",
 401                                   "a cote de l'alcove ovoide"));
 402
 403 }
 404
 405
 406 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 407
 408 int test_icu_tokenizer(const char * locale, char action,
 409                      const char * src8cstr, int count)
 410 {
 411     int success = 1;
 412
 413     UErrorCode status = U_ZERO_ERROR;
 414     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
 415     struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0);
 416     struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0);
 417
 418     //printf("Input:  '%s'\n", src8cstr);
 419
 420     // transforming to UTF16
 421     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
 422     icu_check_status(status);
 423
 424     // set up tokenizer
 425     struct icu_tokenizer * tokenizer
 426         = icu_tokenizer_create(locale, action, &status);
 427     icu_check_status(status);
 428     YAZ_CHECK(tokenizer);
 429
 430     // attach text buffer to tokenizer
 431     icu_tokenizer_attach(tokenizer, src16, &status);
 432     icu_check_status(status);
 433     YAZ_CHECK(tokenizer->bi);
 434
 435     // perform work on tokens
 436     //printf("Tokens: ");
 437     while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){
 438         icu_check_status(status);
 439
 440         // converting to UTF8
 441         icu_utf16_to_utf8(tkn8, tkn16, &status);
 442
 443         //printf("token %d %d %d %d '%s'\n",
 444         //
 445         //       icu_tokenizer_token_start(tokenizer),
 446         //       icu_tokenizer_token_end(tokenizer),
 447         //       icu_tokenizer_token_length(tokenizer),
 448         //       tkn8->utf8);
 449     }
 450
 451     if (count != icu_tokenizer_token_count(tokenizer)){
 452         success = 0;
 453         printf("\nTokenizer '%s:%c' Error: \n", locale, action);
 454         printf("Input:  '%s'\n", src8cstr);
 455         printf("Tokens: %d", icu_tokenizer_token_count(tokenizer));
 456         printf(", expected: %d\n", count);
 457     }
 458
 459     icu_tokenizer_destroy(tokenizer);
 460     icu_buf_utf16_destroy(src16);
 461     icu_buf_utf16_destroy(tkn16);
 462     icu_buf_utf8_destroy(tkn8);
 463
 464     return success;
 465 }
 466
 467
 468 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 469
 470 void test_icu_I18N_tokenizer(int argc, char **argv)
 471 {
 472
 473
 474     const char * en_str
 475         = "O Romeo, Romeo! wherefore art thou Romeo?";
 476
 477     YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2));
 478     YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7));
 479     YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16));
 480     YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41));
 481
 482
 483
 484     const char * da_str
 485         = "Blåbærtærte. Denne kage stammer fra Finland. "
 486         "Den er med blåbær, men alle sommerens forskellige bær kan bruges.";
 487
 488     YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3));
 489     YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17));
 490     YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37));
 491     YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110));
 492
 493 }
 494
 495
 496 void test_icu_I18N_chain(int argc, char **argv)
 497 {
 498     const char * en_str
 499         = "O Romeo, Romeo! wherefore art thou\t Romeo?";
 500
 501     printf("ICU chain:\ninput: '%s'\n", en_str);
 502
 503     UErrorCode status = U_ZERO_ERROR;
 504     //struct icu_chain_step * step = 0;
 505     struct icu_chain * chain = 0;
 506
 507
 508     const char * xml_str = "<icu_chain id=\"en:word\" locale=\"en\">"
 509         "<normalize rule=\"[:Control:] Any-Remove\"/>"
 510         "<tokenize rule=\"l\"/>"
 511         "<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
 512         "<display/>"
 513         "<casemap rule=\"l\"/>"
 514         "<index/>"
 515         "<sortkey/>"
 516         "</icu_chain>";
 517
 518
 519     xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
 520     xmlNode *xml_node = xmlDocGetRootElement(doc);
 521     YAZ_CHECK(xml_node);
 522
 523
 524     chain = icu_chain_xml_config(xml_node, &status);
 525
 526 #if 0
 527     chain  = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en");
 528     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 529                                  (const uint8_t *) "[:Control:] Any-Remove",
 530                                  &status);
 531     step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 532                                  (const uint8_t *) "s",
 533                                  &status);
 534     step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 535                                  (const uint8_t *) "l",
 536                                  &status);
 537     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 538                                  (const uint8_t *)
 539                                  "[[:WhiteSpace:][:Punctuation:]] Any-Remove",
 540                                  &status);
 541     step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 542                                  (const uint8_t *)"",
 543                                  &status);
 544 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */
 545 /*                                  (const uint8_t *) "Lower", */
 546 /*                                  &status); */
 547     step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 548                                  (const uint8_t *) "l",
 549                                  &status);
 550     step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
 551                                  (const uint8_t *)"",
 552                                  &status);
 553 /*     step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, */
 554 /*                                  (const uint8_t *)"", */
 555 /*                                  &status); */
 556
 557 #endif
 558
 559     xmlFreeDoc(doc);
 560     YAZ_CHECK(chain);
 561
 562     YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status));
 563
 564     while (icu_chain_next_token(chain, &status)){
 565         printf("%d '%s' '%s'\n",
 566                icu_chain_get_token_count(chain),
 567                icu_chain_get_norm(chain),
 568                icu_chain_get_display(chain));
 569     }
 570
 571     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7);
 572
 573
 574     YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
 575
 576     while (icu_chain_next_token(chain, &status)){
 577         printf("%d '%s' '%s'\n",
 578                icu_chain_get_token_count(chain),
 579                icu_chain_get_norm(chain),
 580                icu_chain_get_display(chain));
 581     }
 582
 583
 584     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3);
 585
 586     icu_chain_destroy(chain);
 587 }
 588
 589
 590 void test_bug_1140(void)
 591 {
 592     const char * en_str
 593         = "O Romeo, Romeo! wherefore art thou\t Romeo?";
 594
 595     printf("ICU chain:\ninput: '%s'\n", en_str);
 596
 597     UErrorCode status = U_ZERO_ERROR;
 598     //struct icu_chain_step * step = 0;
 599     struct icu_chain * chain = 0;
 600
 601     const char * xml_str = "<icu_chain id=\"en:word\" locale=\"en\">"
 602
 603         /* if the first rule is normalize instead. Then it works */
 604 #if 0
 605         "<normalize rule=\"[:Control:] Any-Remove\"/>"
 606 #endif
 607         "<tokenize rule=\"l\"/>"
 608         "<normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>"
 609         "<display/>"
 610         "<casemap rule=\"l\"/>"
 611         "<index/>"
 612         "<sortkey/>"
 613         "</icu_chain>";
 614
 615
 616     xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str));
 617     xmlNode *xml_node = xmlDocGetRootElement(doc);
 618     YAZ_CHECK(xml_node);
 619
 620     chain = icu_chain_xml_config(xml_node, &status);
 621
 622     xmlFreeDoc(doc);
 623     YAZ_CHECK(chain);
 624
 625     YAZ_CHECK(icu_chain_assign_cstr(
 626                   chain,  "O Romeo, Romeo! wherefore art thou\t Romeo?",
 627                   &status));
 628
 629     while (icu_chain_next_token(chain, &status))
 630         ;
 631
 632     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7);
 633
 634     YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status));
 635
 636     while (icu_chain_next_token(chain, &status)){
 637         printf("%d '%s' '%s'\n",
 638                icu_chain_get_token_count(chain),
 639                icu_chain_get_norm(chain),
 640                icu_chain_get_display(chain));
 641     }
 642
 643     /* we expect 'what' 'is' 'this', i.e. 3 tokens */
 644     YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3);
 645
 646     icu_chain_destroy(chain);
 647 }
 648
 649 #endif // HAVE_ICU
 650
 651 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 652
 653 int main(int argc, char **argv)
 654 {
 655
 656     YAZ_CHECK_INIT(argc, argv);
 657     YAZ_CHECK_LOG();
 658
 659 #ifdef HAVE_ICU
 660
 661     //test_icu_I18N_casemap_failures(argc, argv);
 662     test_icu_I18N_casemap(argc, argv);
 663     test_icu_I18N_sortmap(argc, argv);
 664     test_icu_I18N_normalizer(argc, argv);
 665     test_icu_I18N_tokenizer(argc, argv);
 666     test_icu_I18N_chain(argc, argv);
 667     test_bug_1140();
 668
 669 #else // HAVE_ICU
 670
 671     printf("ICU unit tests omitted.\n"
 672            "Please install libicu36-dev and icu-doc or similar\n");
 673     YAZ_CHECK(0 == 0);
 674
 675 #endif // HAVE_ICU
 676
 677     YAZ_CHECK_TERM;
 678 }
 679
 680
 681 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 682
 683
 684
 685 /*
 686  * Local variables:
 687  * c-basic-offset: 4
 688  * indent-tabs-mode: nil
 689  * End:
 690  * vim: shiftwidth=4 tabstop=8 expandtab
 691  */