src/icu_I18N.c

   1 /*
   2  * Copyright (C) 1995-2007, Index Data ApS
   3  * See the file LICENSE for details.
   4  *
   5  * $Id: icu_I18N.c,v 1.2 2007-10-22 17:32:07 adam Exp $
   6  */
   7
   8 #if HAVE_CONFIG_H
   9 #include "config.h"
  10 #endif
  11
  12 #define USE_TIMING 0
  13 #if USE_TIMING
  14 #include <yaz/timing.h>
  15 #endif
  16
  17
  18 #if HAVE_ICU
  19 #include <yaz/icu_I18N.h>
  20
  21 #include <yaz/log.h>
  22
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26
  27 #include <unicode/ustring.h>  /* some more string fcns*/
  28 #include <unicode/uchar.h>    /* char names           */
  29
  30
  31 #include <unicode/ucol.h>
  32
  33
  34 int icu_check_status (UErrorCode status)
  35 {
  36     if(U_FAILURE(status)){
  37         yaz_log(YLOG_WARN,
  38                 "ICU: %d %s\n", status, u_errorName(status));
  39         return 0;
  40     }
  41     return 1;
  42
  43 }
  44
  45
  46
  47 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  48 {
  49     struct icu_buf_utf16 * buf16
  50         = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
  51
  52     buf16->utf16 = 0;
  53     buf16->utf16_len = 0;
  54     buf16->utf16_cap = 0;
  55
  56     if (capacity > 0){
  57         buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  58         buf16->utf16[0] = (UChar) 0;
  59         buf16->utf16_cap = capacity;
  60     }
  61     return buf16;
  62 }
  63
  64 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  65                                             size_t capacity)
  66 {
  67     if (buf16){
  68         if (capacity >  0){
  69             if (0 == buf16->utf16)
  70                 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  71             else
  72                 buf16->utf16
  73                     = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
  74             buf16->utf16[0] = (UChar) 0;
  75             buf16->utf16_len = 0;
  76             buf16->utf16_cap = capacity;
  77         }
  78         else {
  79             if (buf16->utf16)
  80                 free(buf16->utf16);
  81             buf16->utf16 = 0;
  82             buf16->utf16_len = 0;
  83             buf16->utf16_cap = 0;
  84         }
  85     }
  86
  87     return buf16;
  88 }
  89
  90
  91 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
  92                                           struct icu_buf_utf16 * src16)
  93 {
  94     if(!dest16 || !src16
  95        || dest16 == src16)
  96         return 0;
  97
  98     if (dest16->utf16_cap < src16->utf16_len)
  99         icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
 100
 101     u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
 102     dest16->utf16_len = src16->utf16_len;
 103
 104     return dest16;
 105 }
 106
 107
 108 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 109 {
 110     if (buf16){
 111         if (buf16->utf16)
 112             free(buf16->utf16);
 113         free(buf16);
 114     }
 115 }
 116
 117
 118
 119
 120
 121
 122 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 123 {
 124     struct icu_buf_utf8 * buf8
 125         = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
 126
 127     buf8->utf8 = 0;
 128     buf8->utf8_len = 0;
 129     buf8->utf8_cap = 0;
 130
 131     if (capacity > 0){
 132         buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 133         buf8->utf8[0] = (uint8_t) 0;
 134         buf8->utf8_cap = capacity;
 135     }
 136     return buf8;
 137 }
 138
 139
 140
 141 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 142                                           size_t capacity)
 143 {
 144     if (buf8){
 145         if (capacity >  0){
 146             if (0 == buf8->utf8)
 147                 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 148             else
 149                 buf8->utf8
 150                     = (uint8_t *) realloc(buf8->utf8,
 151                                           sizeof(uint8_t) * capacity);
 152             buf8->utf8_cap = capacity;
 153         }
 154         else {
 155             if (buf8->utf8)
 156                 free(buf8->utf8);
 157             buf8->utf8 = 0;
 158             buf8->utf8_len = 0;
 159             buf8->utf8_cap = 0;
 160         }
 161     }
 162
 163     return buf8;
 164 }
 165
 166
 167 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
 168                                           struct icu_buf_utf8 * src8)
 169 {
 170     if(!dest8 || !src8
 171        || dest8 == src8)
 172         return 0;
 173
 174
 175     if (dest8->utf8_cap < src8->utf8_len)
 176         icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
 177
 178     strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
 179
 180     return dest8;
 181 }
 182
 183
 184 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
 185 {
 186     if (!src8 || src8->utf8_len == 0)
 187         return "";
 188     if (src8->utf8_len == src8->utf8_cap)
 189         src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1);
 190     src8->utf8[src8->utf8_len] = '\0';
 191     return (const char *) src8->utf8;
 192 }
 193
 194
 195 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 196 {
 197     if (buf8){
 198         if (buf8->utf8)
 199             free(buf8->utf8);
 200         free(buf8);
 201     }
 202 }
 203
 204
 205
 206 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 207                                struct icu_buf_utf8 * src8,
 208                                UErrorCode * status)
 209 {
 210     int32_t utf16_len = 0;
 211
 212     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 213                   &utf16_len,
 214                   (const char *) src8->utf8, src8->utf8_len, status);
 215
 216     /* check for buffer overflow, resize and retry */
 217     if (*status == U_BUFFER_OVERFLOW_ERROR)
 218     {
 219         icu_buf_utf16_resize(dest16, utf16_len * 2);
 220         *status = U_ZERO_ERROR;
 221         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 222                       &utf16_len,
 223                       (const char *) src8->utf8, src8->utf8_len, status);
 224     }
 225
 226     if (U_SUCCESS(*status)
 227         && utf16_len <= dest16->utf16_cap)
 228         dest16->utf16_len = utf16_len;
 229     else {
 230         dest16->utf16[0] = (UChar) 0;
 231         dest16->utf16_len = 0;
 232     }
 233
 234     return *status;
 235 }
 236
 237
 238
 239 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 240                                     const char * src8cstr,
 241                                     UErrorCode * status)
 242 {
 243     size_t src8cstr_len = 0;
 244     int32_t utf16_len = 0;
 245
 246     src8cstr_len = strlen(src8cstr);
 247
 248     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 249                   &utf16_len,
 250                   src8cstr, src8cstr_len, status);
 251
 252     /* check for buffer overflow, resize and retry */
 253     if (*status == U_BUFFER_OVERFLOW_ERROR)
 254     {
 255         icu_buf_utf16_resize(dest16, utf16_len * 2);
 256         *status = U_ZERO_ERROR;
 257         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 258                       &utf16_len,
 259                       src8cstr, src8cstr_len, status);
 260     }
 261
 262     if (U_SUCCESS(*status)
 263         && utf16_len <= dest16->utf16_cap)
 264         dest16->utf16_len = utf16_len;
 265     else {
 266         dest16->utf16[0] = (UChar) 0;
 267         dest16->utf16_len = 0;
 268     }
 269
 270     return *status;
 271 }
 272
 273
 274
 275
 276 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 277                              struct icu_buf_utf16 * src16,
 278                              UErrorCode * status)
 279 {
 280     int32_t utf8_len = 0;
 281
 282     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 283                 &utf8_len,
 284                 src16->utf16, src16->utf16_len, status);
 285
 286     /* check for buffer overflow, resize and retry */
 287     if (*status == U_BUFFER_OVERFLOW_ERROR)
 288     {
 289         icu_buf_utf8_resize(dest8, utf8_len * 2);
 290         *status = U_ZERO_ERROR;
 291         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 292                     &utf8_len,
 293                     src16->utf16, src16->utf16_len, status);
 294
 295     }
 296
 297     if (U_SUCCESS(*status)
 298         && utf8_len <= dest8->utf8_cap)
 299         dest8->utf8_len = utf8_len;
 300     else {
 301         dest8->utf8[0] = (uint8_t) 0;
 302         dest8->utf8_len = 0;
 303     }
 304
 305     return *status;
 306 }
 307
 308
 309
 310 struct icu_casemap * icu_casemap_create(const char *locale, char action,
 311                                         UErrorCode *status)
 312 {
 313     struct icu_casemap * casemap
 314         = (struct icu_casemap *) malloc(sizeof(struct icu_casemap));
 315     strcpy(casemap->locale, locale);
 316     casemap->action = action;
 317
 318     switch(casemap->action) {
 319     case 'l':
 320         break;
 321     case 'u':
 322         break;
 323     case 't':
 324         break;
 325     case 'f':
 326         break;
 327     default:
 328         icu_casemap_destroy(casemap);
 329         return 0;
 330     }
 331
 332     return casemap;
 333 }
 334
 335 void icu_casemap_destroy(struct icu_casemap * casemap)
 336 {
 337     if (casemap)
 338         free(casemap);
 339 }
 340
 341
 342 int icu_casemap_casemap(struct icu_casemap * casemap,
 343                         struct icu_buf_utf16 * dest16,
 344                         struct icu_buf_utf16 * src16,
 345                         UErrorCode *status)
 346 {
 347     if(!casemap)
 348         return 0;
 349
 350     return icu_utf16_casemap(dest16, src16,
 351                              casemap->locale, casemap->action, status);
 352 }
 353
 354
 355 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 356                       struct icu_buf_utf16 * src16,
 357                       const char *locale, char action,
 358                       UErrorCode *status)
 359 {
 360     int32_t dest16_len = 0;
 361
 362     switch(action) {
 363     case 'l':
 364         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 365                                   src16->utf16, src16->utf16_len,
 366                                   locale, status);
 367         break;
 368     case 'u':
 369         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 370                                   src16->utf16, src16->utf16_len,
 371                                   locale, status);
 372         break;
 373     case 't':
 374         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 375                                   src16->utf16, src16->utf16_len,
 376                                   0, locale, status);
 377         break;
 378     case 'f':
 379         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 380                                    src16->utf16, src16->utf16_len,
 381                                    U_FOLD_CASE_DEFAULT, status);
 382         break;
 383
 384     default:
 385         return U_UNSUPPORTED_ERROR;
 386         break;
 387     }
 388
 389     /* check for buffer overflow, resize and retry */
 390     if (*status == U_BUFFER_OVERFLOW_ERROR
 391         && dest16 != src16        /* do not resize if in-place conversion */
 392         ){
 393         icu_buf_utf16_resize(dest16, dest16_len * 2);
 394         *status = U_ZERO_ERROR;
 395
 396
 397         switch(action) {
 398         case 'l':
 399             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 400                                       src16->utf16, src16->utf16_len,
 401                                       locale, status);
 402             break;
 403         case 'u':
 404             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 405                                       src16->utf16, src16->utf16_len,
 406                                       locale, status);
 407             break;
 408         case 't':
 409             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 410                                       src16->utf16, src16->utf16_len,
 411                                       0, locale, status);
 412             break;
 413         case 'f':
 414             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 415                                        src16->utf16, src16->utf16_len,
 416                                        U_FOLD_CASE_DEFAULT, status);
 417             break;
 418
 419         default:
 420             return U_UNSUPPORTED_ERROR;
 421             break;
 422         }
 423     }
 424
 425     if (U_SUCCESS(*status)
 426         && dest16_len <= dest16->utf16_cap)
 427         dest16->utf16_len = dest16_len;
 428     else {
 429         dest16->utf16[0] = (UChar) 0;
 430         dest16->utf16_len = 0;
 431     }
 432
 433     return *status;
 434 }
 435
 436
 437
 438 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
 439                                    struct icu_buf_utf8 * dest8,
 440                                    struct icu_buf_utf16 * src16,
 441                                    UErrorCode * status)
 442 {
 443
 444     int32_t sortkey_len = 0;
 445
 446     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 447                                   dest8->utf8, dest8->utf8_cap);
 448
 449     /* check for buffer overflow, resize and retry */
 450     if (sortkey_len > dest8->utf8_cap) {
 451         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 452         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 453                                       dest8->utf8, dest8->utf8_cap);
 454     }
 455
 456     if (U_SUCCESS(*status)
 457         && sortkey_len > 0)
 458         dest8->utf8_len = sortkey_len;
 459     else {
 460         dest8->utf8[0] = (UChar) 0;
 461         dest8->utf8_len = 0;
 462     }
 463
 464     return sortkey_len;
 465 }
 466
 467
 468
 469 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 470                                             UErrorCode *status)
 471 {
 472     struct icu_tokenizer * tokenizer
 473         = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
 474
 475     strcpy(tokenizer->locale, locale);
 476     tokenizer->action = action;
 477     tokenizer->bi = 0;
 478     tokenizer->buf16 = 0;
 479     tokenizer->token_count = 0;
 480     tokenizer->token_id = 0;
 481     tokenizer->token_start = 0;
 482     tokenizer->token_end = 0;
 483
 484
 485     switch(tokenizer->action) {
 486     case 'l':
 487         tokenizer->bi
 488             = ubrk_open(UBRK_LINE, tokenizer->locale,
 489                         0, 0, status);
 490         break;
 491     case 's':
 492         tokenizer->bi
 493             = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
 494                         0, 0, status);
 495         break;
 496     case 'w':
 497         tokenizer->bi
 498             = ubrk_open(UBRK_WORD, tokenizer->locale,
 499                         0, 0, status);
 500         break;
 501     case 'c':
 502         tokenizer->bi
 503             = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
 504                         0, 0, status);
 505         break;
 506     case 't':
 507         tokenizer->bi
 508             = ubrk_open(UBRK_TITLE, tokenizer->locale,
 509                         0, 0, status);
 510         break;
 511     default:
 512         *status = U_UNSUPPORTED_ERROR;
 513         return 0;
 514         break;
 515     }
 516
 517     /* ICU error stuff is a very  funny business */
 518     if (U_SUCCESS(*status))
 519         return tokenizer;
 520
 521     /* freeing if failed */
 522     icu_tokenizer_destroy(tokenizer);
 523     return 0;
 524 }
 525
 526 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 527 {
 528     if (tokenizer) {
 529         if (tokenizer->bi)
 530             ubrk_close(tokenizer->bi);
 531         free(tokenizer);
 532     }
 533 }
 534
 535 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 536                          struct icu_buf_utf16 * src16,
 537                          UErrorCode *status)
 538 {
 539     if (!tokenizer || !tokenizer->bi || !src16)
 540         return 0;
 541
 542
 543     tokenizer->buf16 = src16;
 544     tokenizer->token_count = 0;
 545     tokenizer->token_id = 0;
 546     tokenizer->token_start = 0;
 547     tokenizer->token_end = 0;
 548
 549     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 550
 551
 552     if (U_FAILURE(*status))
 553         return 0;
 554
 555     return 1;
 556 };
 557
 558 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 559                          struct icu_buf_utf16 * tkn16,
 560                          UErrorCode *status)
 561 {
 562     int32_t tkn_start = 0;
 563     int32_t tkn_end = 0;
 564     int32_t tkn_len = 0;
 565
 566
 567     if (!tokenizer || !tokenizer->bi
 568         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 569         return 0;
 570
 571     /*
 572     never change tokenizer->buf16 and keep always invariant
 573     0 <= tokenizer->token_start
 574        <= tokenizer->token_end
 575        <= tokenizer->buf16->utf16_len
 576     returns length of token
 577     */
 578
 579     if (0 == tokenizer->token_end) /* first call */
 580         tkn_start = ubrk_first(tokenizer->bi);
 581     else /* successive calls */
 582         tkn_start = tokenizer->token_end;
 583
 584     /* get next position */
 585     tkn_end = ubrk_next(tokenizer->bi);
 586
 587     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 588     if (UBRK_DONE == tkn_end)
 589         tkn_end = tokenizer->buf16->utf16_len;
 590
 591     /* copy out if everything is well */
 592     if(U_FAILURE(*status))
 593         return 0;
 594
 595     /* everything OK, now update internal state */
 596     tkn_len = tkn_end - tkn_start;
 597
 598     if (0 < tkn_len){
 599         tokenizer->token_count++;
 600         tokenizer->token_id++;
 601     } else {
 602         tokenizer->token_id = 0;
 603     }
 604     tokenizer->token_start = tkn_start;
 605     tokenizer->token_end = tkn_end;
 606
 607
 608     /* copying into token buffer if it exists */
 609     if (tkn16){
 610         if (tkn16->utf16_cap < tkn_len)
 611             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 612
 613         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 614                   tkn_len);
 615
 616         tkn16->utf16_len = tkn_len;
 617     }
 618
 619     return tkn_len;
 620 }
 621
 622
 623 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 624 {
 625     return tokenizer->token_id;
 626 }
 627
 628 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 629 {
 630     return tokenizer->token_start;
 631 }
 632
 633 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 634 {
 635     return tokenizer->token_end;
 636 }
 637
 638 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 639 {
 640     return (tokenizer->token_end - tokenizer->token_start);
 641 }
 642
 643 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 644 {
 645     return tokenizer->token_count;
 646 }
 647
 648
 649
 650 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
 651                                               UErrorCode *status)
 652 {
 653
 654     struct icu_normalizer * normalizer
 655         = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer));
 656
 657     normalizer->action = action;
 658     normalizer->trans = 0;
 659     normalizer->rules16 =  icu_buf_utf16_create(0);
 660     icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
 661
 662     switch(normalizer->action) {
 663     case 'f':
 664         normalizer->trans
 665             = utrans_openU(normalizer->rules16->utf16,
 666                            normalizer->rules16->utf16_len,
 667                            UTRANS_FORWARD,
 668                            0, 0,
 669                            normalizer->parse_error, status);
 670         break;
 671     case 'r':
 672         normalizer->trans
 673             = utrans_openU(normalizer->rules16->utf16,
 674                            normalizer->rules16->utf16_len,
 675                            UTRANS_REVERSE ,
 676                            0, 0,
 677                            normalizer->parse_error, status);
 678         break;
 679     default:
 680         *status = U_UNSUPPORTED_ERROR;
 681         return 0;
 682         break;
 683     }
 684
 685     if (U_SUCCESS(*status))
 686         return normalizer;
 687
 688     /* freeing if failed */
 689     icu_normalizer_destroy(normalizer);
 690     return 0;
 691 }
 692
 693
 694 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
 695     if (normalizer) {
 696         if (normalizer->rules16)
 697             icu_buf_utf16_destroy(normalizer->rules16);
 698         if (normalizer->trans)
 699         {
 700             utrans_close(normalizer->trans);
 701         }
 702         free(normalizer);
 703     }
 704 }
 705
 706
 707
 708 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
 709                              struct icu_buf_utf16 * dest16,
 710                              struct icu_buf_utf16 * src16,
 711                              UErrorCode *status)
 712 {
 713     if (!normalizer || !normalizer->trans || !src16 || !dest16)
 714         return 0;
 715
 716     if (!icu_buf_utf16_copy(dest16, src16))
 717         return 0;
 718
 719     utrans_transUChars (normalizer->trans,
 720                         dest16->utf16, &(dest16->utf16_len),
 721                         dest16->utf16_cap,
 722                         0, &(src16->utf16_len), status);
 723
 724     if (U_FAILURE(*status)){
 725         dest16->utf16[0] = (UChar) 0;
 726         dest16->utf16_len = 0;
 727     }
 728
 729     return dest16->utf16_len;
 730 }
 731
 732
 733
 734
 735 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
 736                                               enum icu_chain_step_type type,
 737                                               const uint8_t * rule,
 738                                               struct icu_buf_utf16 * buf16,
 739                                               UErrorCode *status)
 740 {
 741     struct icu_chain_step * step = 0;
 742
 743     if(!chain || !type || !rule)
 744         return 0;
 745
 746     step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
 747
 748     step->type = type;
 749
 750     step->buf16 = buf16;
 751
 752     /* create auxilary objects */
 753     switch(step->type) {
 754     case ICU_chain_step_type_display:
 755         break;
 756     case ICU_chain_step_type_index:
 757         break;
 758     case ICU_chain_step_type_sortkey:
 759         break;
 760     case ICU_chain_step_type_casemap:
 761         step->u.casemap = icu_casemap_create((char *) chain->locale,
 762                                              (char) rule[0], status);
 763         break;
 764     case ICU_chain_step_type_normalize:
 765         step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
 766         break;
 767     case ICU_chain_step_type_tokenize:
 768         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
 769                                                  (char) rule[0], status);
 770         break;
 771     default:
 772         break;
 773     }
 774
 775     return step;
 776 }
 777
 778
 779 void icu_chain_step_destroy(struct icu_chain_step * step){
 780
 781     if (!step)
 782         return;
 783
 784     icu_chain_step_destroy(step->previous);
 785
 786     switch(step->type) {
 787     case ICU_chain_step_type_display:
 788         break;
 789     case ICU_chain_step_type_index:
 790         break;
 791     case ICU_chain_step_type_sortkey:
 792         break;
 793     case ICU_chain_step_type_casemap:
 794         icu_casemap_destroy(step->u.casemap);
 795         icu_buf_utf16_destroy(step->buf16);
 796         break;
 797     case ICU_chain_step_type_normalize:
 798         icu_normalizer_destroy(step->u.normalizer);
 799         icu_buf_utf16_destroy(step->buf16);
 800         break;
 801     case ICU_chain_step_type_tokenize:
 802         icu_tokenizer_destroy(step->u.tokenizer);
 803         icu_buf_utf16_destroy(step->buf16);
 804         break;
 805     default:
 806         break;
 807     }
 808     free(step);
 809 }
 810
 811
 812
 813 struct icu_chain * icu_chain_create(const uint8_t * identifier,
 814                                     const uint8_t * locale)
 815 {
 816
 817     struct icu_chain * chain
 818         = (struct icu_chain *) malloc(sizeof(struct icu_chain));
 819
 820     strncpy((char *) chain->identifier, (const char *) identifier, 128);
 821     chain->identifier[128 - 1] = '\0';
 822     strncpy((char *) chain->locale, (const char *) locale, 16);
 823     chain->locale[16 - 1] = '\0';
 824
 825     chain->token_count = 0;
 826
 827     chain->display8 = icu_buf_utf8_create(0);
 828     chain->norm8 = icu_buf_utf8_create(0);
 829     chain->sort8 = icu_buf_utf8_create(0);
 830
 831     chain->src16 = icu_buf_utf16_create(0);
 832
 833     chain->steps = 0;
 834
 835     return chain;
 836 }
 837
 838
 839 void icu_chain_destroy(struct icu_chain * chain)
 840 {
 841     if (chain){
 842         icu_buf_utf8_destroy(chain->display8);
 843         icu_buf_utf8_destroy(chain->norm8);
 844         icu_buf_utf8_destroy(chain->sort8);
 845
 846         icu_buf_utf16_destroy(chain->src16);
 847
 848         icu_chain_step_destroy(chain->steps);
 849         free(chain);
 850     }
 851 }
 852
 853
 854
 855 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
 856                                         UErrorCode * status){
 857
 858     xmlNode *node = 0;
 859     struct icu_chain * chain = 0;
 860
 861     if (!xml_node
 862         ||xml_node->type != XML_ELEMENT_NODE
 863         || strcmp((const char *) xml_node->name, "icu_chain"))
 864
 865         return 0;
 866
 867     {
 868         xmlChar *xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
 869         xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
 870
 871         if (!xml_id || !strlen((const char *) xml_id)
 872             || !xml_locale || !strlen((const char *) xml_locale))
 873             return 0;
 874
 875         chain = icu_chain_create((const uint8_t *) xml_id,
 876                                  (const uint8_t *) xml_locale);
 877
 878         xmlFree(xml_id);
 879         xmlFree(xml_locale);
 880     }
 881     if (!chain)
 882         return 0;
 883
 884     for (node = xml_node->children; node; node = node->next)
 885     {
 886         xmlChar *xml_rule;
 887         struct icu_chain_step * step = 0;
 888
 889         if (node->type != XML_ELEMENT_NODE)
 890             continue;
 891
 892         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
 893
 894         if (!strcmp((const char *) node->name,
 895                     (const char *) "casemap")){
 896             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 897                                          (const uint8_t *) xml_rule, status);
 898         }
 899         else if (!strcmp((const char *) node->name,
 900                          (const char *) "normalize")){
 901             step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 902                                          (const uint8_t *) xml_rule, status);
 903         }
 904         else if (!strcmp((const char *) node->name,
 905                          (const char *) "tokenize")){
 906             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 907                                          (const uint8_t *) xml_rule, status);
 908         }
 909         else if (!strcmp((const char *) node->name,
 910                          (const char *) "display")){
 911             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 912                                          (const uint8_t *) "", status);
 913         }
 914         else if (!strcmp((const char *) node->name,
 915                          (const char *) "index")){
 916             step = icu_chain_insert_step(chain, ICU_chain_step_type_index,
 917                                          (const uint8_t *) "", status);
 918         }
 919         else if (!strcmp((const char *) node->name,
 920                          (const char *) "sortkey")){
 921             step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey,
 922                                          (const uint8_t *) "", status);
 923         }
 924
 925         xmlFree(xml_rule);
 926         if (!step || U_FAILURE(*status)){
 927             icu_chain_destroy(chain);
 928             return 0;
 929         }
 930
 931
 932     }
 933
 934     return chain;
 935 }
 936
 937
 938
 939 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
 940                                               enum icu_chain_step_type type,
 941                                               const uint8_t * rule,
 942                                               UErrorCode *status)
 943 {
 944     struct icu_chain_step * step = 0;
 945     struct icu_buf_utf16 * src16 = 0;
 946     struct icu_buf_utf16 * buf16 = 0;
 947
 948     if (!chain || !type || !rule)
 949         return 0;
 950
 951     /* assign utf16 src buffers as needed */
 952     if (chain->steps && chain->steps->buf16)
 953         src16 = chain->steps->buf16;
 954     else if (chain->src16)
 955         src16 = chain->src16;
 956     else
 957         return 0;
 958
 959
 960     /* create utf16 destination buffers as needed, or */
 961     switch(type) {
 962     case ICU_chain_step_type_display:
 963         buf16 = src16;
 964         break;
 965     case ICU_chain_step_type_index:
 966         buf16 = src16;
 967         break;
 968     case ICU_chain_step_type_sortkey:
 969         buf16 = src16;
 970         break;
 971     case ICU_chain_step_type_casemap:
 972         buf16 = icu_buf_utf16_create(0);
 973         break;
 974     case ICU_chain_step_type_normalize:
 975         buf16 = icu_buf_utf16_create(0);
 976         break;
 977     case ICU_chain_step_type_tokenize:
 978         buf16 = icu_buf_utf16_create(0);
 979         break;
 980     default:
 981         break;
 982     }
 983
 984     /* create actual chain step with this buffer */
 985     step = icu_chain_step_create(chain, type, rule, buf16, status);
 986
 987     step->previous = chain->steps;
 988     chain->steps = step;
 989
 990     return step;
 991 }
 992
 993
 994 int icu_chain_step_next_token(struct icu_chain * chain,
 995                               struct icu_chain_step * step,
 996                               UErrorCode *status)
 997 {
 998     struct icu_buf_utf16 * src16 = 0;
 999
1000     if (!chain || !chain->src16 || !step || !step->more_tokens)
1001         return 0;
1002
1003     /* assign utf16 src buffers as neeed, advance in previous steps
1004        tokens until non-zero token met, and setting stop condition
1005     */
1006     if (step->previous){
1007         src16 = step->previous->buf16;
1008         if (step->need_new_token)
1009             step->more_tokens
1010                 = icu_chain_step_next_token(chain, step->previous, status);
1011     }
1012     else { /* first step can only work once on chain->src16 input buffer */
1013         src16 = chain->src16;
1014         step->more_tokens = 1;
1015     }
1016
1017     /* stop if nothing to process
1018        i.e new token source was not properly assigned
1019     */
1020     if (!step->more_tokens || !src16)
1021         return 0;
1022
1023     /* perform the work, eventually put this steps output in
1024        step->buf16 or the chains UTF8 output buffers  */
1025     switch(step->type) {
1026     case ICU_chain_step_type_display:
1027         icu_utf16_to_utf8(chain->display8, src16, status);
1028         break;
1029     case ICU_chain_step_type_index:
1030         icu_utf16_to_utf8(chain->norm8, src16, status);
1031         break;
1032     case ICU_chain_step_type_sortkey:
1033         icu_utf16_to_utf8(chain->sort8, src16, status);
1034         break;
1035     case ICU_chain_step_type_casemap:
1036         icu_casemap_casemap(step->u.casemap,
1037                             step->buf16, src16, status);
1038         break;
1039     case ICU_chain_step_type_normalize:
1040         icu_normalizer_normalize(step->u.normalizer,
1041                                  step->buf16, src16, status);
1042         break;
1043     case ICU_chain_step_type_tokenize:
1044         /* attach to new src16 token only first time during splitting */
1045         if (step->need_new_token){
1046             icu_tokenizer_attach(step->u.tokenizer, src16, status);
1047             step->need_new_token = 0;
1048         }
1049         /* splitting one src16 token into multiple buf16 tokens */
1050         step->more_tokens
1051             = icu_tokenizer_next_token(step->u.tokenizer,
1052                                        step->buf16, status);
1053         /* make sure to get new previous token if this one had been used up */
1054         if (step->previous && !step->more_tokens){
1055             if (icu_chain_step_next_token(chain, step->previous, status)){
1056                 icu_tokenizer_attach(step->u.tokenizer, src16, status);
1057                 step->need_new_token = 0;
1058                 step->more_tokens
1059                     = icu_tokenizer_next_token(step->u.tokenizer,
1060                                                step->buf16, status);
1061             }
1062         }
1063         if (0 == step->more_tokens)
1064             return 0;
1065         break;
1066     default:
1067         return 0;
1068         break;
1069     }
1070
1071
1072
1073     /* stop further token processing if last step and
1074        new tokens are needed from previous (non-existing) step
1075     */
1076     if (!step->previous && step->need_new_token)
1077         step->more_tokens = 0;
1078
1079     if (U_FAILURE(*status))
1080         return 0;
1081
1082     return 1;
1083 }
1084
1085
1086
1087 int icu_chain_assign_cstr(struct icu_chain * chain,
1088                           const char * src8cstr,
1089                           UErrorCode *status)
1090 {
1091     struct icu_chain_step * stp = 0;
1092
1093     if (!chain || !src8cstr)
1094         return 0;
1095
1096     stp = chain->steps;
1097
1098     /* clear token count */
1099     chain->token_count = 0;
1100
1101     /* clear all steps stop states */
1102     while (stp){
1103         stp->more_tokens = 1;
1104         stp->need_new_token = 1;
1105         stp = stp->previous;
1106     }
1107
1108     /* finally convert UTF8 to UTF16 string */
1109     icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
1110
1111     if (U_FAILURE(*status))
1112         return 0;
1113
1114     return 1;
1115 }
1116
1117
1118
1119 int icu_chain_next_token(struct icu_chain * chain,
1120                          UErrorCode *status)
1121 {
1122     int success = 0;
1123
1124     if (!chain || !chain->steps)
1125         return 0;
1126
1127     success = icu_chain_step_next_token(chain, chain->steps, status);
1128
1129     if (success){
1130         chain->token_count++;
1131         return chain->token_count;
1132     }
1133
1134     return 0;
1135 }
1136
1137 int icu_chain_get_token_count(struct icu_chain * chain)
1138 {
1139     if (!chain)
1140         return 0;
1141
1142     return chain->token_count;
1143 }
1144
1145
1146
1147 const char * icu_chain_get_display(struct icu_chain * chain)
1148 {
1149     if (chain->display8)
1150         return icu_buf_utf8_to_cstr(chain->display8);
1151
1152     return 0;
1153 }
1154
1155 const char * icu_chain_get_norm(struct icu_chain * chain)
1156 {
1157     if (chain->norm8)
1158         return icu_buf_utf8_to_cstr(chain->norm8);
1159
1160     return 0;
1161 }
1162
1163 const char * icu_chain_get_sort(struct icu_chain * chain)
1164 {
1165     if (chain->sort8)
1166         return icu_buf_utf8_to_cstr(chain->sort8);
1167
1168     return 0;
1169 }
1170
1171
1172 #endif /* HAVE_ICU */
1173
1174
1175
1176
1177 /*
1178  * Local variables:
1179  * c-basic-offset: 4
1180  * indent-tabs-mode: nil
1181  * End:
1182  * vim: shiftwidth=4 tabstop=8 expandtab
1183  */