src/icu_I18N.c

   1 /* This file is part of the YAZ toolkit.
   2  * Copyright (C) 1995-2009 Index Data
   3  * See the file LICENSE for details.
   4  */
   5
   6 /**
   7  * \file icu_I18N.c
   8  * \brief ICU utilities
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #include "config.h"
  13 #endif
  14
  15 #define USE_TIMING 0
  16 #if USE_TIMING
  17 #include <yaz/timing.h>
  18 #endif
  19
  20 #if YAZ_HAVE_ICU
  21 #include <yaz/xmalloc.h>
  22
  23 #include <yaz/icu_I18N.h>
  24
  25 #include <yaz/log.h>
  26
  27 #include <string.h>
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30
  31 #include <unicode/ustring.h>  /* some more string fcns*/
  32 #include <unicode/uchar.h>    /* char names           */
  33
  34
  35 #include <unicode/ucol.h>
  36
  37
  38 int icu_check_status (UErrorCode status)
  39 {
  40     if (U_FAILURE(status))
  41     {
  42         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
  43         return 0;
  44     }
  45     return 1;
  46
  47 }
  48
  49
  50
  51 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  52 {
  53     struct icu_buf_utf16 * buf16
  54         = (struct icu_buf_utf16 *) xmalloc(sizeof(struct icu_buf_utf16));
  55
  56     buf16->utf16 = 0;
  57     buf16->utf16_len = 0;
  58     buf16->utf16_cap = 0;
  59
  60     if (capacity > 0){
  61         buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity);
  62         buf16->utf16[0] = (UChar) 0;
  63         buf16->utf16_cap = capacity;
  64     }
  65     return buf16;
  66 }
  67
  68 struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16)
  69 {
  70     if (buf16){
  71         if (buf16->utf16)
  72             buf16->utf16[0] = (UChar) 0;
  73         buf16->utf16_len = 0;
  74     }
  75     return buf16;
  76 }
  77
  78 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  79                                             size_t capacity)
  80 {
  81     if (!buf16)
  82         return 0;
  83
  84     if (capacity >  0){
  85         if (0 == buf16->utf16)
  86             buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity);
  87         else
  88             buf16->utf16
  89                 = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity);
  90
  91         icu_buf_utf16_clear(buf16);
  92         buf16->utf16_cap = capacity;
  93     }
  94     else {
  95         xfree(buf16->utf16);
  96         buf16->utf16 = 0;
  97         buf16->utf16_len = 0;
  98         buf16->utf16_cap = 0;
  99     }
 100
 101     return buf16;
 102 }
 103
 104
 105 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
 106                                           struct icu_buf_utf16 * src16)
 107 {
 108     if(!dest16 || !src16
 109        || dest16 == src16)
 110         return 0;
 111
 112     if (dest16->utf16_cap < src16->utf16_len)
 113         icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
 114
 115     u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
 116     dest16->utf16_len = src16->utf16_len;
 117
 118     return dest16;
 119 }
 120
 121
 122 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 123 {
 124     if (buf16)
 125         xfree(buf16->utf16);
 126     xfree(buf16);
 127 }
 128
 129
 130
 131 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 132 {
 133     struct icu_buf_utf8 * buf8
 134         = (struct icu_buf_utf8 *) xmalloc(sizeof(struct icu_buf_utf8));
 135
 136     buf8->utf8 = 0;
 137     buf8->utf8_len = 0;
 138     buf8->utf8_cap = 0;
 139
 140     if (capacity > 0){
 141         buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity);
 142         buf8->utf8[0] = (uint8_t) 0;
 143         buf8->utf8_cap = capacity;
 144     }
 145     return buf8;
 146 }
 147
 148
 149 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8)
 150 {
 151     if (buf8){
 152         if (buf8->utf8)
 153             buf8->utf8[0] = (uint8_t) 0;
 154         buf8->utf8_len = 0;
 155     }
 156     return buf8;
 157 }
 158
 159
 160 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 161                                           size_t capacity)
 162 {
 163     if (!buf8)
 164         return 0;
 165
 166     if (capacity >  0){
 167         if (0 == buf8->utf8)
 168             buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity);
 169         else
 170             buf8->utf8
 171                 = (uint8_t *) xrealloc(buf8->utf8, sizeof(uint8_t) * capacity);
 172
 173         buf8->utf8_cap = capacity;
 174     }
 175     else {
 176         xfree(buf8->utf8);
 177         buf8->utf8 = 0;
 178         buf8->utf8_len = 0;
 179         buf8->utf8_cap = 0;
 180     }
 181
 182     return buf8;
 183 }
 184
 185
 186 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
 187 {
 188     if (!src8 || src8->utf8_len == 0)
 189         return "";
 190
 191     if (src8->utf8_len == src8->utf8_cap)
 192         src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);
 193
 194     src8->utf8[src8->utf8_len] = '\0';
 195
 196     return (const char *) src8->utf8;
 197 }
 198
 199
 200 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 201 {
 202     if (buf8)
 203         xfree(buf8->utf8);
 204     xfree(buf8);
 205 }
 206
 207
 208
 209 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 210                                struct icu_buf_utf8 * src8,
 211                                UErrorCode * status)
 212 {
 213     int32_t utf16_len = 0;
 214
 215     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 216                   &utf16_len,
 217                   (const char *) src8->utf8, src8->utf8_len, status);
 218
 219     /* check for buffer overflow, resize and retry */
 220     if (*status == U_BUFFER_OVERFLOW_ERROR)
 221     {
 222         icu_buf_utf16_resize(dest16, utf16_len * 2);
 223         *status = U_ZERO_ERROR;
 224         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 225                       &utf16_len,
 226                       (const char *) src8->utf8, src8->utf8_len, status);
 227     }
 228
 229     if (U_SUCCESS(*status)
 230         && utf16_len <= dest16->utf16_cap)
 231         dest16->utf16_len = utf16_len;
 232     else
 233         icu_buf_utf16_clear(dest16);
 234
 235     return *status;
 236 }
 237
 238
 239
 240 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 241                                     const char * src8cstr,
 242                                     UErrorCode * status)
 243 {
 244     size_t src8cstr_len = 0;
 245     int32_t utf16_len = 0;
 246
 247     *status = U_ZERO_ERROR;
 248     src8cstr_len = strlen(src8cstr);
 249
 250     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 251                   &utf16_len,
 252                   src8cstr, src8cstr_len, status);
 253
 254     /* check for buffer overflow, resize and retry */
 255     if (*status == U_BUFFER_OVERFLOW_ERROR)
 256     {
 257         icu_buf_utf16_resize(dest16, utf16_len * 2);
 258         *status = U_ZERO_ERROR;
 259         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 260                       &utf16_len,
 261                       src8cstr, src8cstr_len, status);
 262     }
 263
 264     if (U_SUCCESS(*status)
 265         && utf16_len <= dest16->utf16_cap)
 266         dest16->utf16_len = utf16_len;
 267     else
 268         icu_buf_utf16_clear(dest16);
 269
 270     return *status;
 271 }
 272
 273
 274
 275
 276 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 277                              struct icu_buf_utf16 * src16,
 278                              UErrorCode * status)
 279 {
 280     int32_t utf8_len = 0;
 281
 282     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 283                 &utf8_len,
 284                 src16->utf16, src16->utf16_len, status);
 285
 286     /* check for buffer overflow, resize and retry */
 287     if (*status == U_BUFFER_OVERFLOW_ERROR)
 288     {
 289         icu_buf_utf8_resize(dest8, utf8_len * 2);
 290         *status = U_ZERO_ERROR;
 291         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 292                     &utf8_len,
 293                     src16->utf16, src16->utf16_len, status);
 294
 295     }
 296
 297     if (U_SUCCESS(*status)
 298         && utf8_len <= dest8->utf8_cap)
 299         dest8->utf8_len = utf8_len;
 300     else
 301         icu_buf_utf8_clear(dest8);
 302
 303     return *status;
 304 }
 305
 306
 307
 308 struct icu_casemap * icu_casemap_create(char action, UErrorCode *status)
 309 {
 310     struct icu_casemap * casemap
 311         = (struct icu_casemap *) xmalloc(sizeof(struct icu_casemap));
 312     casemap->action = action;
 313
 314     switch(casemap->action) {
 315     case 'l':
 316     case 'L':
 317     case 'u':
 318     case 'U':
 319     case 't':
 320     case 'T':
 321     case 'f':
 322     case 'F':
 323         break;
 324     default:
 325         icu_casemap_destroy(casemap);
 326         return 0;
 327     }
 328
 329     return casemap;
 330 }
 331
 332 void icu_casemap_destroy(struct icu_casemap * casemap)
 333 {
 334     xfree(casemap);
 335 }
 336
 337
 338 int icu_casemap_casemap(struct icu_casemap * casemap,
 339                         struct icu_buf_utf16 * dest16,
 340                         struct icu_buf_utf16 * src16,
 341                         UErrorCode *status,
 342                         const char *locale)
 343 {
 344     if(!casemap)
 345         return 0;
 346
 347     return icu_utf16_casemap(dest16, src16, locale,
 348                              casemap->action, status);
 349 }
 350
 351
 352 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 353                       struct icu_buf_utf16 * src16,
 354                       const char *locale, char action,
 355                       UErrorCode *status)
 356 {
 357     int32_t dest16_len = 0;
 358
 359
 360     if (!src16->utf16_len){           /* guarding for empty source string */
 361         if (dest16->utf16)
 362             dest16->utf16[0] = (UChar) 0;
 363         dest16->utf16_len = 0;
 364         return U_ZERO_ERROR;
 365     }
 366
 367
 368     switch(action) {
 369     case 'l':
 370     case 'L':
 371         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 372                                   src16->utf16, src16->utf16_len,
 373                                   locale, status);
 374         break;
 375     case 'u':
 376     case 'U':
 377         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 378                                   src16->utf16, src16->utf16_len,
 379                                   locale, status);
 380         break;
 381     case 't':
 382     case 'T':
 383         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 384                                   src16->utf16, src16->utf16_len,
 385                                   0, locale, status);
 386         break;
 387     case 'f':
 388     case 'F':
 389         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 390                                    src16->utf16, src16->utf16_len,
 391                                    U_FOLD_CASE_DEFAULT, status);
 392         break;
 393
 394     default:
 395         return U_UNSUPPORTED_ERROR;
 396         break;
 397     }
 398
 399     /* check for buffer overflow, resize and retry */
 400     if (*status == U_BUFFER_OVERFLOW_ERROR
 401         && dest16 != src16        /* do not resize if in-place conversion */
 402         ){
 403         icu_buf_utf16_resize(dest16, dest16_len * 2);
 404         *status = U_ZERO_ERROR;
 405
 406
 407         switch(action) {
 408         case 'l':
 409         case 'L':
 410             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 411                                       src16->utf16, src16->utf16_len,
 412                                       locale, status);
 413             break;
 414         case 'u':
 415         case 'U':
 416             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 417                                       src16->utf16, src16->utf16_len,
 418                                       locale, status);
 419             break;
 420         case 't':
 421         case 'T':
 422             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 423                                       src16->utf16, src16->utf16_len,
 424                                       0, locale, status);
 425             break;
 426         case 'f':
 427         case 'F':
 428             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 429                                        src16->utf16, src16->utf16_len,
 430                                        U_FOLD_CASE_DEFAULT, status);
 431             break;
 432
 433         default:
 434             return U_UNSUPPORTED_ERROR;
 435             break;
 436         }
 437     }
 438
 439     if (U_SUCCESS(*status)
 440         && dest16_len <= dest16->utf16_cap)
 441         dest16->utf16_len = dest16_len;
 442     else {
 443         if (dest16->utf16)
 444             dest16->utf16[0] = (UChar) 0;
 445         dest16->utf16_len = 0;
 446     }
 447
 448     return *status;
 449 }
 450
 451
 452
 453 void icu_sortkey8_from_utf16(UCollator *coll,
 454                              struct icu_buf_utf8 * dest8,
 455                              struct icu_buf_utf16 * src16,
 456                              UErrorCode * status)
 457 {
 458
 459     int32_t sortkey_len = 0;
 460
 461     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 462                                   dest8->utf8, dest8->utf8_cap);
 463
 464     /* check for buffer overflow, resize and retry */
 465     if (sortkey_len > dest8->utf8_cap) {
 466         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 467         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 468                                       dest8->utf8, dest8->utf8_cap);
 469     }
 470
 471     if (U_SUCCESS(*status)
 472         && sortkey_len > 0)
 473         dest8->utf8_len = sortkey_len;
 474     else
 475         icu_buf_utf8_clear(dest8);
 476 }
 477
 478
 479
 480 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 481                                             UErrorCode *status)
 482 {
 483     struct icu_tokenizer * tokenizer
 484         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
 485
 486     tokenizer->action = action;
 487     tokenizer->bi = 0;
 488     tokenizer->buf16 = 0;
 489     tokenizer->token_count = 0;
 490     tokenizer->token_id = 0;
 491     tokenizer->token_start = 0;
 492     tokenizer->token_end = 0;
 493
 494
 495     switch(tokenizer->action) {
 496     case 'l':
 497     case 'L':
 498         tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
 499         break;
 500     case 's':
 501     case 'S':
 502         tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
 503         break;
 504     case 'w':
 505     case 'W':
 506         tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
 507         break;
 508     case 'c':
 509     case 'C':
 510         tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
 511         break;
 512     case 't':
 513     case 'T':
 514         tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
 515         break;
 516     default:
 517         *status = U_UNSUPPORTED_ERROR;
 518         return 0;
 519         break;
 520     }
 521
 522     /* ICU error stuff is a very  funny business */
 523     if (U_SUCCESS(*status))
 524         return tokenizer;
 525
 526     /* freeing if failed */
 527     icu_tokenizer_destroy(tokenizer);
 528     return 0;
 529 }
 530
 531 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 532 {
 533     if (tokenizer) {
 534         if (tokenizer->bi)
 535             ubrk_close(tokenizer->bi);
 536         xfree(tokenizer);
 537     }
 538 }
 539
 540 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 541                          struct icu_buf_utf16 * src16,
 542                          UErrorCode *status)
 543 {
 544     if (!tokenizer || !tokenizer->bi || !src16)
 545         return 0;
 546
 547
 548     tokenizer->buf16 = src16;
 549     tokenizer->token_count = 0;
 550     tokenizer->token_id = 0;
 551     tokenizer->token_start = 0;
 552     tokenizer->token_end = 0;
 553
 554     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 555
 556
 557     if (U_FAILURE(*status))
 558         return 0;
 559
 560     return 1;
 561 };
 562
 563 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 564                          struct icu_buf_utf16 * tkn16,
 565                          UErrorCode *status)
 566 {
 567     int32_t tkn_start = 0;
 568     int32_t tkn_end = 0;
 569     int32_t tkn_len = 0;
 570
 571
 572     if (!tokenizer || !tokenizer->bi
 573         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 574         return 0;
 575
 576     /*
 577     never change tokenizer->buf16 and keep always invariant
 578     0 <= tokenizer->token_start
 579        <= tokenizer->token_end
 580        <= tokenizer->buf16->utf16_len
 581     returns length of token
 582     */
 583
 584     if (0 == tokenizer->token_end) /* first call */
 585         tkn_start = ubrk_first(tokenizer->bi);
 586     else /* successive calls */
 587         tkn_start = tokenizer->token_end;
 588
 589     /* get next position */
 590     tkn_end = ubrk_next(tokenizer->bi);
 591
 592     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 593     if (UBRK_DONE == tkn_end)
 594         tkn_end = tokenizer->buf16->utf16_len;
 595
 596     /* copy out if everything is well */
 597     if(U_FAILURE(*status))
 598         return 0;
 599
 600     /* everything OK, now update internal state */
 601     tkn_len = tkn_end - tkn_start;
 602
 603     if (0 < tkn_len){
 604         tokenizer->token_count++;
 605         tokenizer->token_id++;
 606     } else {
 607         tokenizer->token_id = 0;
 608     }
 609     tokenizer->token_start = tkn_start;
 610     tokenizer->token_end = tkn_end;
 611
 612
 613     /* copying into token buffer if it exists */
 614     if (tkn16){
 615         if (tkn16->utf16_cap < tkn_len)
 616             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 617
 618         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 619                   tkn_len);
 620
 621         tkn16->utf16_len = tkn_len;
 622     }
 623
 624     return tkn_len;
 625 }
 626
 627
 628 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 629 {
 630     return tokenizer->token_id;
 631 }
 632
 633 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 634 {
 635     return tokenizer->token_start;
 636 }
 637
 638 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 639 {
 640     return tokenizer->token_end;
 641 }
 642
 643 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 644 {
 645     return (tokenizer->token_end - tokenizer->token_start);
 646 }
 647
 648 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 649 {
 650     return tokenizer->token_count;
 651 }
 652
 653
 654
 655 struct icu_transform * icu_transform_create(const char *id, char action,
 656                                             const char *rules,
 657                                             UErrorCode *status)
 658 {
 659     struct icu_buf_utf16 *id16 = icu_buf_utf16_create(0);
 660     struct icu_buf_utf16 *rules16 = icu_buf_utf16_create(0);
 661
 662     struct icu_transform * transform
 663         = (struct icu_transform *) xmalloc(sizeof(struct icu_transform));
 664
 665     transform->action = action;
 666     transform->trans = 0;
 667
 668     if (id)
 669         icu_utf16_from_utf8_cstr(id16, id, status);
 670     if (rules)
 671         icu_utf16_from_utf8_cstr(rules16, rules, status);
 672
 673     switch(transform->action)
 674     {
 675     case 'f':
 676     case 'F':
 677         transform->trans
 678             = utrans_openU(id16->utf16,
 679                            id16->utf16_len,
 680                            UTRANS_FORWARD,
 681                            rules16->utf16,
 682                            rules16->utf16_len,
 683                            &transform->parse_error, status);
 684         break;
 685     case 'r':
 686     case 'R':
 687         transform->trans
 688             = utrans_openU(id16->utf16,
 689                            id16->utf16_len,
 690                            UTRANS_REVERSE ,
 691                            rules16->utf16,
 692                            rules16->utf16_len,
 693                            &transform->parse_error, status);
 694         break;
 695     default:
 696         *status = U_UNSUPPORTED_ERROR;
 697         break;
 698     }
 699     icu_buf_utf16_destroy(rules16);
 700     icu_buf_utf16_destroy(id16);
 701
 702     if (U_SUCCESS(*status))
 703         return transform;
 704
 705     /* freeing if failed */
 706     icu_transform_destroy(transform);
 707     return 0;
 708 }
 709
 710
 711 void icu_transform_destroy(struct icu_transform * transform){
 712     if (transform) {
 713         if (transform->trans)
 714             utrans_close(transform->trans);
 715         xfree(transform);
 716     }
 717 }
 718
 719
 720
 721 int icu_transform_trans(struct icu_transform * transform,
 722                         struct icu_buf_utf16 * dest16,
 723                         struct icu_buf_utf16 * src16,
 724                         UErrorCode *status)
 725 {
 726     if (!transform || !transform->trans
 727         || !src16
 728         || !dest16)
 729         return 0;
 730
 731     if (!src16->utf16_len){           /* guarding for empty source string */
 732         icu_buf_utf16_clear(dest16);
 733         return 0;
 734     }
 735
 736     if (!icu_buf_utf16_copy(dest16, src16))
 737         return 0;
 738
 739
 740     utrans_transUChars (transform->trans,
 741                         dest16->utf16, &(dest16->utf16_len),
 742                         dest16->utf16_cap,
 743                         0, &(src16->utf16_len), status);
 744
 745     if (U_FAILURE(*status))
 746         icu_buf_utf16_clear(dest16);
 747
 748     return dest16->utf16_len;
 749 }
 750
 751
 752
 753
 754 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
 755                                               enum icu_chain_step_type type,
 756                                               const uint8_t * rule,
 757                                               struct icu_buf_utf16 * buf16,
 758                                               UErrorCode *status)
 759 {
 760     struct icu_chain_step * step = 0;
 761
 762     if(!chain || !type || !rule)
 763         return 0;
 764
 765     step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
 766
 767     step->type = type;
 768
 769     step->buf16 = buf16;
 770
 771     /* create auxilary objects */
 772     switch(step->type) {
 773     case ICU_chain_step_type_display:
 774         break;
 775     case ICU_chain_step_type_casemap:
 776         step->u.casemap = icu_casemap_create(rule[0], status);
 777         break;
 778     case ICU_chain_step_type_transform:
 779         /* rule omitted. Only ID used */
 780         step->u.transform = icu_transform_create((const char *) rule, 'f',
 781                                                  0, status);
 782         break;
 783     case ICU_chain_step_type_tokenize:
 784         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
 785                                                  (char) rule[0], status);
 786         break;
 787     case ICU_chain_step_type_transliterate:
 788         /* we pass a dummy ID to utrans_openU.. */
 789         step->u.transform = icu_transform_create("custom", 'f',
 790                                                  (const char *) rule, status);
 791         break;
 792     default:
 793         break;
 794     }
 795
 796     return step;
 797 }
 798
 799
 800 void icu_chain_step_destroy(struct icu_chain_step * step){
 801
 802     if (!step)
 803         return;
 804
 805     icu_chain_step_destroy(step->previous);
 806
 807     switch(step->type) {
 808     case ICU_chain_step_type_display:
 809         break;
 810     case ICU_chain_step_type_casemap:
 811         icu_casemap_destroy(step->u.casemap);
 812         icu_buf_utf16_destroy(step->buf16);
 813         break;
 814     case ICU_chain_step_type_transform:
 815     case ICU_chain_step_type_transliterate:
 816         icu_transform_destroy(step->u.transform);
 817         icu_buf_utf16_destroy(step->buf16);
 818         break;
 819     case ICU_chain_step_type_tokenize:
 820         icu_tokenizer_destroy(step->u.tokenizer);
 821         icu_buf_utf16_destroy(step->buf16);
 822         break;
 823     default:
 824         break;
 825     }
 826     xfree(step);
 827 }
 828
 829
 830
 831 struct icu_chain * icu_chain_create(const char *locale,  int sort,
 832                                     UErrorCode * status)
 833 {
 834     struct icu_chain * chain
 835         = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
 836
 837     *status = U_ZERO_ERROR;
 838
 839     chain->locale = xstrdup(locale);
 840
 841     chain->sort = sort;
 842
 843     chain->coll = ucol_open((const char *) chain->locale, status);
 844
 845     if (U_FAILURE(*status))
 846         return 0;
 847
 848     chain->token_count = 0;
 849
 850     chain->src8cstr = 0;
 851
 852     chain->display8 = icu_buf_utf8_create(0);
 853     chain->norm8 = icu_buf_utf8_create(0);
 854     chain->sort8 = icu_buf_utf8_create(0);
 855
 856     chain->src16 = icu_buf_utf16_create(0);
 857
 858     chain->steps = 0;
 859
 860     return chain;
 861 }
 862
 863
 864 void icu_chain_destroy(struct icu_chain * chain)
 865 {
 866     if (chain)
 867     {
 868         if (chain->coll)
 869             ucol_close(chain->coll);
 870
 871         icu_buf_utf8_destroy(chain->display8);
 872         icu_buf_utf8_destroy(chain->norm8);
 873         icu_buf_utf8_destroy(chain->sort8);
 874
 875         icu_buf_utf16_destroy(chain->src16);
 876
 877         icu_chain_step_destroy(chain->steps);
 878         xfree(chain->locale);
 879         xfree(chain);
 880     }
 881 }
 882
 883
 884
 885 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node,
 886                                         int sort,
 887                                         UErrorCode * status)
 888 {
 889     xmlNode *node = 0;
 890     struct icu_chain * chain = 0;
 891
 892     *status = U_ZERO_ERROR;
 893
 894     if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
 895         return 0;
 896
 897     {
 898         xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node,
 899                                           (xmlChar *) "locale");
 900
 901         if (xml_locale)
 902         {
 903             chain = icu_chain_create((const char *) xml_locale, sort, status);
 904             xmlFree(xml_locale);
 905         }
 906
 907     }
 908     if (!chain)
 909         return 0;
 910
 911     for (node = xml_node->children; node; node = node->next)
 912     {
 913         xmlChar *xml_rule;
 914         struct icu_chain_step * step = 0;
 915
 916         if (node->type != XML_ELEMENT_NODE)
 917             continue;
 918
 919         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
 920
 921         if (!strcmp((const char *) node->name, "casemap"))
 922             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 923                                          (const uint8_t *) xml_rule, status);
 924         else if (!strcmp((const char *) node->name, "transform"))
 925             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
 926                                          (const uint8_t *) xml_rule, status);
 927         else if (!strcmp((const char *) node->name, "transliterate"))
 928             step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
 929                                          (const uint8_t *) xml_rule, status);
 930         else if (!strcmp((const char *) node->name, "tokenize"))
 931             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 932                                          (const uint8_t *) xml_rule, status);
 933         else if (!strcmp((const char *) node->name, "display"))
 934             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 935                                          (const uint8_t *) "", status);
 936         else if (!strcmp((const char *) node->name, "normalize"))
 937         {
 938             yaz_log(YLOG_WARN, "Element %s is deprecated. "
 939                     "Use transform instead", node->name);
 940             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
 941                                          (const uint8_t *) xml_rule, status);
 942         }
 943         else if (!strcmp((const char *) node->name, "index")
 944                  || !strcmp((const char *) node->name, "sortkey"))
 945         {
 946             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
 947                     "Remove it from the configuration", node->name);
 948         }
 949         else
 950         {
 951             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
 952             icu_chain_destroy(chain);
 953             return 0;
 954         }
 955         xmlFree(xml_rule);
 956         if (step && U_FAILURE(*status))
 957         {
 958             icu_chain_destroy(chain);
 959             return 0;
 960         }
 961     }
 962     return chain;
 963 }
 964
 965
 966
 967 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
 968                                               enum icu_chain_step_type type,
 969                                               const uint8_t * rule,
 970                                               UErrorCode *status)
 971 {
 972     struct icu_chain_step * step = 0;
 973     struct icu_buf_utf16 * src16 = 0;
 974     struct icu_buf_utf16 * buf16 = 0;
 975
 976     if (!chain || !type || !rule)
 977         return 0;
 978
 979     /* assign utf16 src buffers as needed */
 980     if (chain->steps && chain->steps->buf16)
 981         src16 = chain->steps->buf16;
 982     else if (chain->src16)
 983         src16 = chain->src16;
 984     else
 985         return 0;
 986
 987
 988     /* create utf16 destination buffers as needed, or */
 989     switch(type)
 990     {
 991     case ICU_chain_step_type_display:
 992         buf16 = src16;
 993         break;
 994     case ICU_chain_step_type_casemap:
 995         buf16 = icu_buf_utf16_create(0);
 996         break;
 997     case ICU_chain_step_type_transform:
 998     case ICU_chain_step_type_transliterate:
 999         buf16 = icu_buf_utf16_create(0);
1000         break;
1001     case ICU_chain_step_type_tokenize:
1002         buf16 = icu_buf_utf16_create(0);
1003         break;
1004         break;
1005     default:
1006         break;
1007     }
1008
1009     /* create actual chain step with this buffer */
1010     step = icu_chain_step_create(chain, type, rule, buf16, status);
1011
1012     step->previous = chain->steps;
1013     chain->steps = step;
1014
1015     return step;
1016 }
1017
1018
1019 int icu_chain_step_next_token(struct icu_chain * chain,
1020                               struct icu_chain_step * step,
1021                               UErrorCode *status)
1022 {
1023     struct icu_buf_utf16 * src16 = 0;
1024     int got_new_token = 0;
1025
1026     if (!chain || !chain->src16 || !step || !step->more_tokens)
1027         return 0;
1028
1029     /* assign utf16 src buffers as neeed, advance in previous steps
1030        tokens until non-zero token met, and setting stop condition */
1031
1032     if (step->previous)
1033     {
1034         src16 = step->previous->buf16;
1035         /* tokens might be killed in previous steps, therefore looping */
1036
1037         while (step->need_new_token
1038                && step->previous->more_tokens
1039                && !got_new_token)
1040             got_new_token
1041                 = icu_chain_step_next_token(chain, step->previous, status);
1042     }
1043     else
1044     { /* first step can only work once on chain->src16 input buffer */
1045         src16 = chain->src16;
1046         step->more_tokens = 0;
1047         got_new_token = 1;
1048     }
1049
1050     if (!src16)
1051         return 0;
1052
1053     /* stop if nothing to process */
1054     if (step->need_new_token && !got_new_token)
1055     {
1056         step->more_tokens = 0;
1057         return 0;
1058     }
1059
1060     /* either an old token not finished yet, or a new token, thus
1061        perform the work, eventually put this steps output in
1062        step->buf16 or the chains UTF8 output buffers  */
1063
1064     switch(step->type)
1065     {
1066     case ICU_chain_step_type_display:
1067         icu_utf16_to_utf8(chain->display8, src16, status);
1068         break;
1069     case ICU_chain_step_type_casemap:
1070         icu_casemap_casemap(step->u.casemap,
1071                             step->buf16, src16, status,
1072                             chain->locale);
1073         break;
1074     case ICU_chain_step_type_transform:
1075     case ICU_chain_step_type_transliterate:
1076         icu_transform_trans(step->u.transform,
1077                             step->buf16, src16, status);
1078         break;
1079     case ICU_chain_step_type_tokenize:
1080         /* attach to new src16 token only first time during splitting */
1081         if (step->need_new_token)
1082         {
1083             icu_tokenizer_attach(step->u.tokenizer, src16, status);
1084             step->need_new_token = 0;
1085         }
1086
1087         /* splitting one src16 token into multiple buf16 tokens */
1088         step->more_tokens
1089             = icu_tokenizer_next_token(step->u.tokenizer,
1090                                        step->buf16, status);
1091
1092         /* make sure to get new previous token if this one had been used up
1093            by recursive call to _same_ step */
1094
1095         if (!step->more_tokens)
1096         {
1097             step->more_tokens = icu_chain_step_next_token(chain, step, status);
1098             return step->more_tokens;  /* avoid one token count too much! */
1099         }
1100         break;
1101     default:
1102         return 0;
1103         break;
1104     }
1105
1106     if (U_FAILURE(*status))
1107         return 0;
1108
1109     /* if token disappered into thin air, tell caller */
1110     /* if (!step->buf16->utf16_len && !step->more_tokens) */
1111     /*    return 0; */
1112
1113     return 1;
1114 }
1115
1116
1117 int icu_chain_assign_cstr(struct icu_chain * chain,
1118                           const char * src8cstr,
1119                           UErrorCode *status)
1120 {
1121     struct icu_chain_step * stp = 0;
1122
1123     if (!chain || !src8cstr)
1124         return 0;
1125
1126     chain->src8cstr = src8cstr;
1127
1128     stp = chain->steps;
1129
1130     /* clear token count */
1131     chain->token_count = 0;
1132
1133     /* clear all steps stop states */
1134     while (stp)
1135     {
1136         stp->more_tokens = 1;
1137         stp->need_new_token = 1;
1138         stp = stp->previous;
1139     }
1140
1141     /* finally convert UTF8 to UTF16 string if needed */
1142     if (chain->steps || chain->sort)
1143         icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
1144
1145     if (U_FAILURE(*status))
1146         return 0;
1147
1148     return 1;
1149 }
1150
1151
1152
1153 int icu_chain_next_token(struct icu_chain * chain,
1154                          UErrorCode *status)
1155 {
1156     int got_token = 0;
1157
1158     *status = U_ZERO_ERROR;
1159
1160     if (!chain)
1161         return 0;
1162
1163     /* special case with no steps - same as index type binary */
1164     if (!chain->steps)
1165     {
1166         if (chain->token_count)
1167             return 0;
1168         else
1169         {
1170             chain->token_count++;
1171
1172             if (chain->sort)
1173                 icu_sortkey8_from_utf16(chain->coll,
1174                                         chain->sort8, chain->steps->buf16,
1175                                         status);
1176             return chain->token_count;
1177         }
1178     }
1179     /* usual case, one or more icu chain steps existing */
1180     else
1181     {
1182         while(!got_token && chain->steps && chain->steps->more_tokens)
1183             got_token = icu_chain_step_next_token(chain, chain->steps, status);
1184
1185         if (got_token)
1186         {
1187             chain->token_count++;
1188
1189             icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
1190
1191             if (chain->sort)
1192                 icu_sortkey8_from_utf16(chain->coll,
1193                                         chain->sort8, chain->steps->buf16,
1194                                         status);
1195
1196             return chain->token_count;
1197         }
1198     }
1199
1200     return 0;
1201 }
1202
1203 int icu_chain_token_number(struct icu_chain * chain)
1204 {
1205     if (!chain)
1206         return 0;
1207
1208     return chain->token_count;
1209 }
1210
1211
1212 const char * icu_chain_token_display(struct icu_chain * chain)
1213 {
1214     if (chain->display8)
1215         return icu_buf_utf8_to_cstr(chain->display8);
1216
1217     return 0;
1218 }
1219
1220 const char * icu_chain_token_norm(struct icu_chain * chain)
1221 {
1222     if (!chain->steps)
1223         return chain->src8cstr;
1224
1225     if (chain->norm8)
1226         return icu_buf_utf8_to_cstr(chain->norm8);
1227
1228     return 0;
1229 }
1230
1231 const char * icu_chain_token_sortkey(struct icu_chain * chain)
1232 {
1233     if (chain->sort8)
1234         return icu_buf_utf8_to_cstr(chain->sort8);
1235
1236     return 0;
1237 }
1238
1239 const UCollator * icu_chain_get_coll(struct icu_chain * chain)
1240 {
1241     return chain->coll;
1242 }
1243
1244 #endif /* YAZ_HAVE_ICU */
1245
1246 /*
1247  * Local variables:
1248  * c-basic-offset: 4
1249  * c-file-style: "Stroustrup"
1250  * indent-tabs-mode: nil
1251  * End:
1252  * vim: shiftwidth=4 tabstop=8 expandtab
1253  */
1254