src/icu_I18N.c

   1 /*
   2  * Copyright (C) 1995-2007, Index Data ApS
   3  * See the file LICENSE for details.
   4  *
   5  * $Id: icu_I18N.c,v 1.18 2007-11-08 17:22:49 adam Exp $
   6  */
   7
   8 #if HAVE_CONFIG_H
   9 #include "config.h"
  10 #endif
  11
  12 #define USE_TIMING 0
  13 #if USE_TIMING
  14 #include <yaz/timing.h>
  15 #endif
  16
  17 #include <yaz/xmalloc.h>
  18
  19 #if YAZ_HAVE_ICU
  20 #include <yaz/icu_I18N.h>
  21
  22 #include <yaz/log.h>
  23
  24 #include <string.h>
  25 #include <stdlib.h>
  26 #include <stdio.h>
  27
  28 #include <unicode/ustring.h>  /* some more string fcns*/
  29 #include <unicode/uchar.h>    /* char names           */
  30
  31
  32 #include <unicode/ucol.h>
  33
  34
  35 int icu_check_status (UErrorCode status)
  36 {
  37     if(U_FAILURE(status)){
  38         yaz_log(YLOG_WARN,
  39                 "ICU: %d %s\n", status, u_errorName(status));
  40         return 0;
  41     }
  42     return 1;
  43
  44 }
  45
  46
  47
  48 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  49 {
  50     struct icu_buf_utf16 * buf16
  51         = (struct icu_buf_utf16 *) xmalloc(sizeof(struct icu_buf_utf16));
  52
  53     buf16->utf16 = 0;
  54     buf16->utf16_len = 0;
  55     buf16->utf16_cap = 0;
  56
  57     if (capacity > 0){
  58         buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity);
  59         buf16->utf16[0] = (UChar) 0;
  60         buf16->utf16_cap = capacity;
  61     }
  62     return buf16;
  63 }
  64
  65 struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16)
  66 {
  67     if (buf16){
  68         if (buf16->utf16)
  69             buf16->utf16[0] = (UChar) 0;
  70         buf16->utf16_len = 0;
  71     }
  72     return buf16;
  73 }
  74
  75 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  76                                             size_t capacity)
  77 {
  78     if (!buf16)
  79         return 0;
  80
  81     if (capacity >  0){
  82         if (0 == buf16->utf16)
  83             buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity);
  84         else
  85             buf16->utf16
  86                 = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity);
  87
  88         icu_buf_utf16_clear(buf16);
  89         buf16->utf16_cap = capacity;
  90     }
  91     else {
  92         xfree(buf16->utf16);
  93         buf16->utf16 = 0;
  94         buf16->utf16_len = 0;
  95         buf16->utf16_cap = 0;
  96     }
  97
  98     return buf16;
  99 }
 100
 101
 102 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
 103                                           struct icu_buf_utf16 * src16)
 104 {
 105     if(!dest16 || !src16
 106        || dest16 == src16)
 107         return 0;
 108
 109     if (dest16->utf16_cap < src16->utf16_len)
 110         icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
 111
 112     u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
 113     dest16->utf16_len = src16->utf16_len;
 114
 115     return dest16;
 116 }
 117
 118
 119 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 120 {
 121     if (buf16)
 122         xfree(buf16->utf16);
 123     xfree(buf16);
 124 }
 125
 126
 127
 128 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 129 {
 130     struct icu_buf_utf8 * buf8
 131         = (struct icu_buf_utf8 *) xmalloc(sizeof(struct icu_buf_utf8));
 132
 133     buf8->utf8 = 0;
 134     buf8->utf8_len = 0;
 135     buf8->utf8_cap = 0;
 136
 137     if (capacity > 0){
 138         buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity);
 139         buf8->utf8[0] = (uint8_t) 0;
 140         buf8->utf8_cap = capacity;
 141     }
 142     return buf8;
 143 }
 144
 145
 146 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8)
 147 {
 148     if (buf8){
 149         if (buf8->utf8)
 150             buf8->utf8[0] = (uint8_t) 0;
 151         buf8->utf8_len = 0;
 152     }
 153     return buf8;
 154 }
 155
 156
 157 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 158                                           size_t capacity)
 159 {
 160     if (!buf8)
 161         return 0;
 162
 163     if (capacity >  0){
 164         if (0 == buf8->utf8)
 165             buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity);
 166         else
 167             buf8->utf8
 168                 = (uint8_t *) xrealloc(buf8->utf8, sizeof(uint8_t) * capacity);
 169
 170         icu_buf_utf8_clear(buf8);
 171         buf8->utf8_cap = capacity;
 172     }
 173     else {
 174         xfree(buf8->utf8);
 175         buf8->utf8 = 0;
 176         buf8->utf8_len = 0;
 177         buf8->utf8_cap = 0;
 178     }
 179
 180     return buf8;
 181 }
 182
 183
 184 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
 185                                           struct icu_buf_utf8 * src8)
 186 {
 187     if(!dest8 || !src8
 188        || dest8 == src8)
 189         return 0;
 190
 191
 192     if (dest8->utf8_cap < src8->utf8_len)
 193         icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
 194
 195     strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
 196
 197     return dest8;
 198 }
 199
 200
 201 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
 202 {
 203     if (!src8 || src8->utf8_len == 0)
 204         return "";
 205
 206     if (src8->utf8_len == src8->utf8_cap)
 207         src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);
 208
 209     src8->utf8[src8->utf8_len] = '\0';
 210
 211     return (const char *) src8->utf8;
 212 }
 213
 214
 215 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 216 {
 217     if (buf8)
 218         xfree(buf8->utf8);
 219     xfree(buf8);
 220 }
 221
 222
 223
 224 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 225                                struct icu_buf_utf8 * src8,
 226                                UErrorCode * status)
 227 {
 228     int32_t utf16_len = 0;
 229
 230     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 231                   &utf16_len,
 232                   (const char *) src8->utf8, src8->utf8_len, status);
 233
 234     /* check for buffer overflow, resize and retry */
 235     if (*status == U_BUFFER_OVERFLOW_ERROR)
 236     {
 237         icu_buf_utf16_resize(dest16, utf16_len * 2);
 238         *status = U_ZERO_ERROR;
 239         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 240                       &utf16_len,
 241                       (const char *) src8->utf8, src8->utf8_len, status);
 242     }
 243
 244     if (U_SUCCESS(*status)
 245         && utf16_len <= dest16->utf16_cap)
 246         dest16->utf16_len = utf16_len;
 247     else
 248         icu_buf_utf16_clear(dest16);
 249
 250     return *status;
 251 }
 252
 253
 254
 255 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 256                                     const char * src8cstr,
 257                                     UErrorCode * status)
 258 {
 259     size_t src8cstr_len = 0;
 260     int32_t utf16_len = 0;
 261
 262     *status = U_ZERO_ERROR;
 263     src8cstr_len = strlen(src8cstr);
 264
 265     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 266                   &utf16_len,
 267                   src8cstr, src8cstr_len, status);
 268
 269     /* check for buffer overflow, resize and retry */
 270     if (*status == U_BUFFER_OVERFLOW_ERROR)
 271     {
 272         icu_buf_utf16_resize(dest16, utf16_len * 2);
 273         *status = U_ZERO_ERROR;
 274         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 275                       &utf16_len,
 276                       src8cstr, src8cstr_len, status);
 277     }
 278
 279     if (U_SUCCESS(*status)
 280         && utf16_len <= dest16->utf16_cap)
 281         dest16->utf16_len = utf16_len;
 282     else
 283         icu_buf_utf16_clear(dest16);
 284
 285     return *status;
 286 }
 287
 288
 289
 290
 291 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 292                              struct icu_buf_utf16 * src16,
 293                              UErrorCode * status)
 294 {
 295     int32_t utf8_len = 0;
 296
 297     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 298                 &utf8_len,
 299                 src16->utf16, src16->utf16_len, status);
 300
 301     /* check for buffer overflow, resize and retry */
 302     if (*status == U_BUFFER_OVERFLOW_ERROR)
 303     {
 304         icu_buf_utf8_resize(dest8, utf8_len * 2);
 305         *status = U_ZERO_ERROR;
 306         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 307                     &utf8_len,
 308                     src16->utf16, src16->utf16_len, status);
 309
 310     }
 311
 312     if (U_SUCCESS(*status)
 313         && utf8_len <= dest8->utf8_cap)
 314         dest8->utf8_len = utf8_len;
 315     else
 316         icu_buf_utf8_clear(dest8);
 317
 318     return *status;
 319 }
 320
 321
 322
 323 struct icu_casemap * icu_casemap_create(char action, UErrorCode *status)
 324 {
 325     struct icu_casemap * casemap
 326         = (struct icu_casemap *) xmalloc(sizeof(struct icu_casemap));
 327     casemap->action = action;
 328
 329     switch(casemap->action) {
 330     case 'l':
 331     case 'L':
 332     case 'u':
 333     case 'U':
 334     case 't':
 335     case 'T':
 336     case 'f':
 337     case 'F':
 338         break;
 339     default:
 340         icu_casemap_destroy(casemap);
 341         return 0;
 342     }
 343
 344     return casemap;
 345 }
 346
 347 void icu_casemap_destroy(struct icu_casemap * casemap)
 348 {
 349     xfree(casemap);
 350 }
 351
 352
 353 int icu_casemap_casemap(struct icu_casemap * casemap,
 354                         struct icu_buf_utf16 * dest16,
 355                         struct icu_buf_utf16 * src16,
 356                         UErrorCode *status,
 357                         const char *locale)
 358 {
 359     if(!casemap)
 360         return 0;
 361
 362     return icu_utf16_casemap(dest16, src16, locale,
 363                              casemap->action, status);
 364 }
 365
 366
 367 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 368                       struct icu_buf_utf16 * src16,
 369                       const char *locale, char action,
 370                       UErrorCode *status)
 371 {
 372     int32_t dest16_len = 0;
 373
 374
 375     if (!src16->utf16_len){           //guarding for empty source string
 376         if (dest16->utf16)
 377             dest16->utf16[0] = (UChar) 0;
 378         dest16->utf16_len = 0;
 379         return U_ZERO_ERROR;
 380     }
 381
 382
 383     switch(action) {
 384     case 'l':
 385     case 'L':
 386         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 387                                   src16->utf16, src16->utf16_len,
 388                                   locale, status);
 389         break;
 390     case 'u':
 391     case 'U':
 392         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 393                                   src16->utf16, src16->utf16_len,
 394                                   locale, status);
 395         break;
 396     case 't':
 397     case 'T':
 398         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 399                                   src16->utf16, src16->utf16_len,
 400                                   0, locale, status);
 401         break;
 402     case 'f':
 403     case 'F':
 404         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 405                                    src16->utf16, src16->utf16_len,
 406                                    U_FOLD_CASE_DEFAULT, status);
 407         break;
 408
 409     default:
 410         return U_UNSUPPORTED_ERROR;
 411         break;
 412     }
 413
 414     /* check for buffer overflow, resize and retry */
 415     if (*status == U_BUFFER_OVERFLOW_ERROR
 416         && dest16 != src16        /* do not resize if in-place conversion */
 417         ){
 418         icu_buf_utf16_resize(dest16, dest16_len * 2);
 419         *status = U_ZERO_ERROR;
 420
 421
 422         switch(action) {
 423         case 'l':
 424         case 'L':
 425             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 426                                       src16->utf16, src16->utf16_len,
 427                                       locale, status);
 428             break;
 429         case 'u':
 430         case 'U':
 431             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 432                                       src16->utf16, src16->utf16_len,
 433                                       locale, status);
 434             break;
 435         case 't':
 436         case 'T':
 437             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 438                                       src16->utf16, src16->utf16_len,
 439                                       0, locale, status);
 440             break;
 441         case 'f':
 442         case 'F':
 443             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 444                                        src16->utf16, src16->utf16_len,
 445                                        U_FOLD_CASE_DEFAULT, status);
 446             break;
 447
 448         default:
 449             return U_UNSUPPORTED_ERROR;
 450             break;
 451         }
 452     }
 453
 454     if (U_SUCCESS(*status)
 455         && dest16_len <= dest16->utf16_cap)
 456         dest16->utf16_len = dest16_len;
 457     else {
 458         if (dest16->utf16)
 459             dest16->utf16[0] = (UChar) 0;
 460         dest16->utf16_len = 0;
 461     }
 462
 463     return *status;
 464 }
 465
 466
 467
 468 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
 469                                    struct icu_buf_utf8 * dest8,
 470                                    struct icu_buf_utf16 * src16,
 471                                    UErrorCode * status)
 472 {
 473
 474     int32_t sortkey_len = 0;
 475
 476     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 477                                   dest8->utf8, dest8->utf8_cap);
 478
 479     /* check for buffer overflow, resize and retry */
 480     if (sortkey_len > dest8->utf8_cap) {
 481         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 482         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 483                                       dest8->utf8, dest8->utf8_cap);
 484     }
 485
 486     if (U_SUCCESS(*status)
 487         && sortkey_len > 0)
 488         dest8->utf8_len = sortkey_len;
 489     else
 490         icu_buf_utf8_clear(dest8);
 491
 492     return sortkey_len;
 493 }
 494
 495
 496
 497 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 498                                             UErrorCode *status)
 499 {
 500     struct icu_tokenizer * tokenizer
 501         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
 502
 503     tokenizer->action = action;
 504     tokenizer->bi = 0;
 505     tokenizer->buf16 = 0;
 506     tokenizer->token_count = 0;
 507     tokenizer->token_id = 0;
 508     tokenizer->token_start = 0;
 509     tokenizer->token_end = 0;
 510
 511
 512     switch(tokenizer->action) {
 513     case 'l':
 514     case 'L':
 515         tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
 516         break;
 517     case 's':
 518     case 'S':
 519         tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
 520         break;
 521     case 'w':
 522     case 'W':
 523         tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
 524         break;
 525     case 'c':
 526     case 'C':
 527         tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
 528         break;
 529     case 't':
 530     case 'T':
 531         tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
 532         break;
 533     default:
 534         *status = U_UNSUPPORTED_ERROR;
 535         return 0;
 536         break;
 537     }
 538
 539     /* ICU error stuff is a very  funny business */
 540     if (U_SUCCESS(*status))
 541         return tokenizer;
 542
 543     /* freeing if failed */
 544     icu_tokenizer_destroy(tokenizer);
 545     return 0;
 546 }
 547
 548 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 549 {
 550     if (tokenizer) {
 551         if (tokenizer->bi)
 552             ubrk_close(tokenizer->bi);
 553         xfree(tokenizer);
 554     }
 555 }
 556
 557 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 558                          struct icu_buf_utf16 * src16,
 559                          UErrorCode *status)
 560 {
 561     if (!tokenizer || !tokenizer->bi || !src16)
 562         return 0;
 563
 564
 565     tokenizer->buf16 = src16;
 566     tokenizer->token_count = 0;
 567     tokenizer->token_id = 0;
 568     tokenizer->token_start = 0;
 569     tokenizer->token_end = 0;
 570
 571     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 572
 573
 574     if (U_FAILURE(*status))
 575         return 0;
 576
 577     return 1;
 578 };
 579
 580 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 581                          struct icu_buf_utf16 * tkn16,
 582                          UErrorCode *status)
 583 {
 584     int32_t tkn_start = 0;
 585     int32_t tkn_end = 0;
 586     int32_t tkn_len = 0;
 587
 588
 589     if (!tokenizer || !tokenizer->bi
 590         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 591         return 0;
 592
 593     /*
 594     never change tokenizer->buf16 and keep always invariant
 595     0 <= tokenizer->token_start
 596        <= tokenizer->token_end
 597        <= tokenizer->buf16->utf16_len
 598     returns length of token
 599     */
 600
 601     if (0 == tokenizer->token_end) /* first call */
 602         tkn_start = ubrk_first(tokenizer->bi);
 603     else /* successive calls */
 604         tkn_start = tokenizer->token_end;
 605
 606     /* get next position */
 607     tkn_end = ubrk_next(tokenizer->bi);
 608
 609     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 610     if (UBRK_DONE == tkn_end)
 611         tkn_end = tokenizer->buf16->utf16_len;
 612
 613     /* copy out if everything is well */
 614     if(U_FAILURE(*status))
 615         return 0;
 616
 617     /* everything OK, now update internal state */
 618     tkn_len = tkn_end - tkn_start;
 619
 620     if (0 < tkn_len){
 621         tokenizer->token_count++;
 622         tokenizer->token_id++;
 623     } else {
 624         tokenizer->token_id = 0;
 625     }
 626     tokenizer->token_start = tkn_start;
 627     tokenizer->token_end = tkn_end;
 628
 629
 630     /* copying into token buffer if it exists */
 631     if (tkn16){
 632         if (tkn16->utf16_cap < tkn_len)
 633             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 634
 635         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 636                   tkn_len);
 637
 638         tkn16->utf16_len = tkn_len;
 639     }
 640
 641     return tkn_len;
 642 }
 643
 644
 645 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 646 {
 647     return tokenizer->token_id;
 648 }
 649
 650 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 651 {
 652     return tokenizer->token_start;
 653 }
 654
 655 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 656 {
 657     return tokenizer->token_end;
 658 }
 659
 660 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 661 {
 662     return (tokenizer->token_end - tokenizer->token_start);
 663 }
 664
 665 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 666 {
 667     return tokenizer->token_count;
 668 }
 669
 670
 671
 672 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
 673                                               UErrorCode *status)
 674 {
 675
 676     struct icu_normalizer * normalizer
 677         = (struct icu_normalizer *) xmalloc(sizeof(struct icu_normalizer));
 678
 679     normalizer->action = action;
 680     normalizer->trans = 0;
 681     normalizer->rules16 =  icu_buf_utf16_create(0);
 682     icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
 683
 684     switch(normalizer->action) {
 685     case 'f':
 686     case 'F':
 687         normalizer->trans
 688             = utrans_openU(normalizer->rules16->utf16,
 689                            normalizer->rules16->utf16_len,
 690                            UTRANS_FORWARD,
 691                            0, 0,
 692                            normalizer->parse_error, status);
 693         break;
 694     case 'r':
 695     case 'R':
 696         normalizer->trans
 697             = utrans_openU(normalizer->rules16->utf16,
 698                            normalizer->rules16->utf16_len,
 699                            UTRANS_REVERSE ,
 700                            0, 0,
 701                            normalizer->parse_error, status);
 702         break;
 703     default:
 704         *status = U_UNSUPPORTED_ERROR;
 705         return 0;
 706         break;
 707     }
 708
 709     if (U_SUCCESS(*status))
 710         return normalizer;
 711
 712     /* freeing if failed */
 713     icu_normalizer_destroy(normalizer);
 714     return 0;
 715 }
 716
 717
 718 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
 719     if (normalizer) {
 720         if (normalizer->rules16)
 721             icu_buf_utf16_destroy(normalizer->rules16);
 722         if (normalizer->trans)
 723             utrans_close(normalizer->trans);
 724         xfree(normalizer);
 725     }
 726 }
 727
 728
 729
 730 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
 731                              struct icu_buf_utf16 * dest16,
 732                              struct icu_buf_utf16 * src16,
 733                              UErrorCode *status)
 734 {
 735     if (!normalizer || !normalizer->trans
 736         || !src16
 737         || !dest16)
 738         return 0;
 739
 740     if (!src16->utf16_len){           //guarding for empty source string
 741         icu_buf_utf16_clear(dest16);
 742         return 0;
 743     }
 744
 745     if (!icu_buf_utf16_copy(dest16, src16))
 746         return 0;
 747
 748
 749     utrans_transUChars (normalizer->trans,
 750                         dest16->utf16, &(dest16->utf16_len),
 751                         dest16->utf16_cap,
 752                         0, &(src16->utf16_len), status);
 753
 754     if (U_FAILURE(*status))
 755         icu_buf_utf16_clear(dest16);
 756
 757     return dest16->utf16_len;
 758 }
 759
 760
 761
 762
 763 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
 764                                               enum icu_chain_step_type type,
 765                                               const uint8_t * rule,
 766                                               struct icu_buf_utf16 * buf16,
 767                                               UErrorCode *status)
 768 {
 769     struct icu_chain_step * step = 0;
 770
 771     if(!chain || !type || !rule)
 772         return 0;
 773
 774     step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
 775
 776     step->type = type;
 777
 778     step->buf16 = buf16;
 779
 780     /* create auxilary objects */
 781     switch(step->type) {
 782     case ICU_chain_step_type_display:
 783         break;
 784     case ICU_chain_step_type_casemap:
 785         step->u.casemap = icu_casemap_create(rule[0], status);
 786         break;
 787     case ICU_chain_step_type_normalize:
 788         step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
 789         break;
 790     case ICU_chain_step_type_tokenize:
 791         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
 792                                                  (char) rule[0], status);
 793         break;
 794     default:
 795         break;
 796     }
 797
 798     return step;
 799 }
 800
 801
 802 void icu_chain_step_destroy(struct icu_chain_step * step){
 803
 804     if (!step)
 805         return;
 806
 807     icu_chain_step_destroy(step->previous);
 808
 809     switch(step->type) {
 810     case ICU_chain_step_type_display:
 811         break;
 812     case ICU_chain_step_type_casemap:
 813         icu_casemap_destroy(step->u.casemap);
 814         icu_buf_utf16_destroy(step->buf16);
 815         break;
 816     case ICU_chain_step_type_normalize:
 817         icu_normalizer_destroy(step->u.normalizer);
 818         icu_buf_utf16_destroy(step->buf16);
 819         break;
 820     case ICU_chain_step_type_tokenize:
 821         icu_tokenizer_destroy(step->u.tokenizer);
 822         icu_buf_utf16_destroy(step->buf16);
 823         break;
 824     default:
 825         break;
 826     }
 827     xfree(step);
 828 }
 829
 830
 831
 832 struct icu_chain * icu_chain_create(const char *locale,
 833                                     int sort,
 834                                     UErrorCode * status)
 835 {
 836
 837     struct icu_chain * chain
 838         = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
 839
 840     *status = U_ZERO_ERROR;
 841
 842     chain->locale = xstrdup(locale);
 843
 844     chain->sort = sort;
 845
 846     chain->coll = ucol_open((const char *) chain->locale, status);
 847
 848     if (U_FAILURE(*status))
 849         return 0;
 850
 851
 852     chain->token_count = 0;
 853
 854     chain->src8cstr = 0;
 855
 856     chain->display8 = icu_buf_utf8_create(0);
 857     chain->norm8 = icu_buf_utf8_create(0);
 858     chain->sort8 = icu_buf_utf8_create(0);
 859
 860     chain->src16 = icu_buf_utf16_create(0);
 861
 862     chain->steps = 0;
 863
 864     return chain;
 865 }
 866
 867
 868 void icu_chain_destroy(struct icu_chain * chain)
 869 {
 870     if (chain){
 871
 872         if (chain->coll)
 873             ucol_close(chain->coll);
 874
 875         icu_buf_utf8_destroy(chain->display8);
 876         icu_buf_utf8_destroy(chain->norm8);
 877         icu_buf_utf8_destroy(chain->sort8);
 878
 879         icu_buf_utf16_destroy(chain->src16);
 880
 881         icu_chain_step_destroy(chain->steps);
 882         xfree(chain->locale);
 883         xfree(chain);
 884     }
 885 }
 886
 887
 888
 889 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node,
 890                                         int sort,
 891                                         UErrorCode * status)
 892 {
 893     xmlNode *node = 0;
 894     struct icu_chain * chain = 0;
 895
 896     *status = U_ZERO_ERROR;
 897
 898     if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
 899         return 0;
 900
 901     {
 902         xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node,
 903                                           (xmlChar *) "locale");
 904
 905         if (xml_locale)
 906         {
 907             chain = icu_chain_create((const char *) xml_locale, sort, status);
 908             xmlFree(xml_locale);
 909         }
 910
 911     }
 912     if (!chain)
 913         return 0;
 914
 915     for (node = xml_node->children; node; node = node->next)
 916     {
 917         xmlChar *xml_rule;
 918         struct icu_chain_step * step = 0;
 919
 920         if (node->type != XML_ELEMENT_NODE)
 921             continue;
 922
 923         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
 924
 925         if (!strcmp((const char *) node->name,
 926                     (const char *) "casemap")){
 927             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 928                                          (const uint8_t *) xml_rule, status);
 929         }
 930         else if (!strcmp((const char *) node->name,
 931                          (const char *) "normalize")){
 932             step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 933                                          (const uint8_t *) xml_rule, status);
 934         }
 935         else if (!strcmp((const char *) node->name,
 936                          (const char *) "tokenize")){
 937             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 938                                          (const uint8_t *) xml_rule, status);
 939         }
 940         else if (!strcmp((const char *) node->name,
 941                          (const char *) "display")){
 942             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 943                                          (const uint8_t *) "", status);
 944         }
 945         xmlFree(xml_rule);
 946         if (!step || U_FAILURE(*status)){
 947             icu_chain_destroy(chain);
 948             return 0;
 949         }
 950
 951
 952     }
 953
 954     return chain;
 955 }
 956
 957
 958
 959 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
 960                                               enum icu_chain_step_type type,
 961                                               const uint8_t * rule,
 962                                               UErrorCode *status)
 963 {
 964     struct icu_chain_step * step = 0;
 965     struct icu_buf_utf16 * src16 = 0;
 966     struct icu_buf_utf16 * buf16 = 0;
 967
 968     if (!chain || !type || !rule)
 969         return 0;
 970
 971     /* assign utf16 src buffers as needed */
 972     if (chain->steps && chain->steps->buf16)
 973         src16 = chain->steps->buf16;
 974     else if (chain->src16)
 975         src16 = chain->src16;
 976     else
 977         return 0;
 978
 979
 980     /* create utf16 destination buffers as needed, or */
 981     switch(type) {
 982     case ICU_chain_step_type_display:
 983         buf16 = src16;
 984         break;
 985     case ICU_chain_step_type_casemap:
 986         buf16 = icu_buf_utf16_create(0);
 987         break;
 988     case ICU_chain_step_type_normalize:
 989         buf16 = icu_buf_utf16_create(0);
 990         break;
 991     case ICU_chain_step_type_tokenize:
 992         buf16 = icu_buf_utf16_create(0);
 993         break;
 994     default:
 995         break;
 996     }
 997
 998     /* create actual chain step with this buffer */
 999     step = icu_chain_step_create(chain, type, rule, buf16, status);
1000
1001     step->previous = chain->steps;
1002     chain->steps = step;
1003
1004     return step;
1005 }
1006
1007
/*
 * Advance one chain step to its next token.
 *
 * Input is taken from the previous step's buf16 or, when the step has no
 * previous step, from chain->src16.  The step's own operation (display,
 * casemap, normalize or tokenize) is then applied to that input.
 *
 * chain  - chain owning the source and UTF-8 output buffers
 * step   - step to advance; its more_tokens / need_new_token flags are
 *          updated as a side effect
 * status - ICU error code handed to every helper and tested afterwards
 *
 * Returns 0 when chain, chain->src16 or step is missing, when the step
 * has no more tokens, when no input could be obtained, when the step
 * type is unknown, or when *status indicates failure.  Returns 1 when
 * the step's work was performed.  An exhausted tokenize step returns the
 * result of its recursive call instead (see the note in that branch).
 */
int icu_chain_step_next_token(struct icu_chain * chain,
                              struct icu_chain_step * step,
                              UErrorCode *status)
{
    struct icu_buf_utf16 * src16 = 0;
    int got_new_token = 0;

    /* nothing to do without chain, source buffer, step, or pending tokens */
    if (!chain || !chain->src16 || !step || !step->more_tokens)
        return 0;

    /* assign utf16 src buffers as needed, advance in previous steps
       tokens until non-zero token met, and setting stop condition */

    if (step->previous){
        src16 = step->previous->buf16;
        /* tokens might be killed in previous steps, therefore looping */

        /* only pull from the previous step when this step has used up its
           current input (need_new_token) and the previous one can deliver */
        while (step->need_new_token
               && step->previous->more_tokens
               && !got_new_token)
            got_new_token
                = icu_chain_step_next_token(chain, step->previous, status);
    }
    else { /* first step can only work once on chain->src16 input buffer */
        src16 = chain->src16;
        step->more_tokens = 0;
        got_new_token = 1;
    }

    /* the previous step may have no buffer attached */
    if (!src16)
        return 0;

    /* stop if nothing to process */
    if (step->need_new_token && !got_new_token){
        step->more_tokens = 0;
        return 0;
    }

    /* either an old token not finished yet, or a new token, thus
       perform the work, eventually put this steps output in
       step->buf16 or the chains UTF8 output buffers  */

    switch(step->type) {
    case ICU_chain_step_type_display:
        /* display output goes to the chain's display8 buffer,
           not to step->buf16 */
        icu_utf16_to_utf8(chain->display8, src16, status);
        break;
    case ICU_chain_step_type_casemap:
        icu_casemap_casemap(step->u.casemap,
                            step->buf16, src16, status,
                            chain->locale);
        break;
    case ICU_chain_step_type_normalize:
        icu_normalizer_normalize(step->u.normalizer,
                                 step->buf16, src16, status);
        break;
    case ICU_chain_step_type_tokenize:
        /* attach to new src16 token only first time during splitting */
        if (step->need_new_token){
            icu_tokenizer_attach(step->u.tokenizer, src16, status);
            step->need_new_token = 0;
        }


        /* splitting one src16 token into multiple buf16 tokens */
        step->more_tokens
            = icu_tokenizer_next_token(step->u.tokenizer,
                                       step->buf16, status);

        /* make sure to get new previous token if this one had been used up
           by recursive call to _same_ step */

        /* NOTE(review): step->more_tokens is 0 when the call below is made,
           so the guard at the top of this function returns 0 at once and
           need_new_token is never set back to 1 here; confirm whether
           re-fetching from the previous step was intended */
        if (!step->more_tokens){
            step->more_tokens = icu_chain_step_next_token(chain, step, status);
            return step->more_tokens;  /* avoid one token count too much! */
        }

        break;
    default:
        /* unknown step type: report no token */
        return 0;
        break;
    }

    if (U_FAILURE(*status))
        return 0;

    /* if token disappeared into thin air, tell caller */
    /* if (!step->buf16->utf16_len && !step->more_tokens) */
    /*    return 0; */

    return 1;
}
1099
1100
1101 int icu_chain_assign_cstr(struct icu_chain * chain,
1102                           const char * src8cstr,
1103                           UErrorCode *status)
1104 {
1105     struct icu_chain_step * stp = 0;
1106
1107     if (!chain || !src8cstr)
1108         return 0;
1109
1110     chain->src8cstr = src8cstr;
1111
1112     stp = chain->steps;
1113
1114     /* clear token count */
1115     chain->token_count = 0;
1116
1117     /* clear all steps stop states */
1118     while (stp){
1119         stp->more_tokens = 1;
1120         stp->need_new_token = 1;
1121         stp = stp->previous;
1122     }
1123
1124     /* finally convert UTF8 to UTF16 string if needed */
1125     if (chain->steps || chain->sort)
1126         icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
1127
1128     if (U_FAILURE(*status))
1129         return 0;
1130
1131     return 1;
1132 }
1133
1134
1135
1136 int icu_chain_next_token(struct icu_chain * chain,
1137                          UErrorCode *status)
1138 {
1139     int got_token = 0;
1140
1141     *status = U_ZERO_ERROR;
1142
1143     if (!chain)
1144         return 0;
1145
1146     /* special case with no steps - same as index type binary */
1147     if (!chain->steps){
1148         if (chain->token_count)
1149             return 0;
1150         else {
1151             chain->token_count++;
1152
1153             if (chain->sort)
1154                 icu_sortkey8_from_utf16(chain->coll,
1155                                         chain->sort8, chain->steps->buf16,
1156                                         status);
1157             return chain->token_count;
1158         }
1159     }
1160     /* usual case, one or more icu chain steps existing */
1161     else {
1162
1163         while(!got_token && chain->steps && chain->steps->more_tokens)
1164             got_token = icu_chain_step_next_token(chain, chain->steps, status);
1165
1166         if (got_token){
1167             chain->token_count++;
1168
1169             icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
1170
1171             if (chain->sort)
1172                 icu_sortkey8_from_utf16(chain->coll,
1173                                         chain->sort8, chain->steps->buf16,
1174                                         status);
1175
1176             return chain->token_count;
1177         }
1178     }
1179
1180     return 0;
1181 }
1182
1183 int icu_chain_token_number(struct icu_chain * chain)
1184 {
1185     if (!chain)
1186         return 0;
1187
1188     return chain->token_count;
1189 }
1190
1191
1192 const char * icu_chain_token_display(struct icu_chain * chain)
1193 {
1194     if (chain->display8)
1195         return icu_buf_utf8_to_cstr(chain->display8);
1196
1197     return 0;
1198 }
1199
1200 const char * icu_chain_token_norm(struct icu_chain * chain)
1201 {
1202     if (!chain->steps)
1203         return chain->src8cstr;
1204
1205     if (chain->norm8)
1206         return icu_buf_utf8_to_cstr(chain->norm8);
1207
1208     return 0;
1209 }
1210
1211 const char * icu_chain_token_sortkey(struct icu_chain * chain)
1212 {
1213     if (chain->sort8)
1214         return icu_buf_utf8_to_cstr(chain->sort8);
1215
1216     return 0;
1217 }
1218
1219 const UCollator * icu_chain_get_coll(struct icu_chain * chain)
1220 {
1221     return chain->coll;
1222 }
1223
1224
1225 #endif /* YAZ_HAVE_ICU */
1226
1227
1228
1229
1230 /*
1231  * Local variables:
1232  * c-basic-offset: 4
1233  * indent-tabs-mode: nil
1234  * End:
1235  * vim: shiftwidth=4 tabstop=8 expandtab
1236  */