src/icu_I18N.c

   1 /* $Id: icu_I18N.c,v 1.19 2007-05-22 07:51:45 adam Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 #if HAVE_CONFIG_H
  23 #include "cconfig.h"
  24 #endif
  25
  26 #define USE_TIMING 0
  27 #if USE_TIMING
  28 #include <yaz/timing.h>
  29 #endif
  30
  31
  32 #ifdef HAVE_ICU
  33 #include "icu_I18N.h"
  34
  35 #include <yaz/log.h>
  36
  37 #include <string.h>
  38 #include <stdlib.h>
  39 #include <stdio.h>
  40
  41 #include <unicode/ustring.h>  /* some more string fcns*/
  42 #include <unicode/uchar.h>    /* char names           */
  43
  44
  45 //#include <unicode/ustdio.h>
  46 //#include <unicode/utypes.h>   /* Basic ICU data types */
  47 #include <unicode/ucol.h>
  48 //#include <unicode/ucnv.h>     /* C   Converter API    */
  49 //#include <unicode/uloc.h>
  50 //#include <unicode/ubrk.h>
  51 /* #include <unicode/unistr.h> */
  52
  53
  54
  55
  56 int icu_check_status (UErrorCode status)
  57 {
  58     if(U_FAILURE(status)){
  59         yaz_log(YLOG_WARN,
  60                 "ICU: %d %s\n", status, u_errorName(status));
  61         return 0;
  62     }
  63     return 1;
  64
  65 }
  66
  67
  68
  69 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  70 {
  71     struct icu_buf_utf16 * buf16
  72         = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
  73
  74     buf16->utf16 = 0;
  75     buf16->utf16_len = 0;
  76     buf16->utf16_cap = 0;
  77
  78     if (capacity > 0){
  79         buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  80         buf16->utf16[0] = (UChar) 0;
  81         buf16->utf16_cap = capacity;
  82     }
  83     return buf16;
  84 };
  85
  86
  87 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  88                                             size_t capacity)
  89 {
  90     if (buf16){
  91         if (capacity >  0){
  92             if (0 == buf16->utf16)
  93                 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  94             else
  95                 buf16->utf16
  96                     = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
  97             buf16->utf16[0] = (UChar) 0;
  98             buf16->utf16_len = 0;
  99             buf16->utf16_cap = capacity;
 100         }
 101         else {
 102             if (buf16->utf16)
 103                 free(buf16->utf16);
 104             buf16->utf16 = 0;
 105             buf16->utf16_len = 0;
 106             buf16->utf16_cap = 0;
 107         }
 108     }
 109
 110     return buf16;
 111 };
 112
 113
 114 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
 115                                           struct icu_buf_utf16 * src16)
 116 {
 117     if(!dest16 || !src16
 118        || dest16 == src16)
 119         return 0;
 120
 121     if (dest16->utf16_cap < src16->utf16_len)
 122         icu_buf_utf16_resize(dest16, src16->utf16_len * 2);
 123
 124     u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len);
 125     dest16->utf16_len = src16->utf16_len;
 126
 127     return dest16;
 128 };
 129
 130
 131 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 132 {
 133     if (buf16){
 134         if (buf16->utf16)
 135             free(buf16->utf16);
 136         free(buf16);
 137     }
 138 };
 139
 140
 141
 142
 143
 144
 145 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 146 {
 147     struct icu_buf_utf8 * buf8
 148         = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
 149
 150     buf8->utf8 = 0;
 151     buf8->utf8_len = 0;
 152     buf8->utf8_cap = 0;
 153
 154     if (capacity > 0){
 155         buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 156         buf8->utf8[0] = (uint8_t) 0;
 157         buf8->utf8_cap = capacity;
 158     }
 159     return buf8;
 160 };
 161
 162
 163
 164 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 165                                           size_t capacity)
 166 {
 167     if (buf8){
 168         if (capacity >  0){
 169             if (0 == buf8->utf8)
 170                 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 171             else
 172                 buf8->utf8
 173                     = (uint8_t *) realloc(buf8->utf8,
 174                                           sizeof(uint8_t) * capacity);
 175             buf8->utf8[0] = (uint8_t) 0;
 176             buf8->utf8_len = 0;
 177             buf8->utf8_cap = capacity;
 178         }
 179         else {
 180             if (buf8->utf8)
 181                 free(buf8->utf8);
 182             buf8->utf8 = 0;
 183             buf8->utf8_len = 0;
 184             buf8->utf8_cap = 0;
 185         }
 186     }
 187
 188     return buf8;
 189 };
 190
 191
 192 struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8,
 193                                           struct icu_buf_utf8 * src8)
 194 {
 195     if(!dest8 || !src8
 196        || dest8 == src8)
 197         return 0;
 198
 199
 200     if (dest8->utf8_cap < src8->utf8_len)
 201         icu_buf_utf8_resize(dest8, src8->utf8_len * 2);
 202
 203     strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len);
 204
 205     return dest8;
 206 };
 207
 208
 209
 210 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 211 {
 212     if (buf8){
 213         if (buf8->utf8)
 214             free(buf8->utf8);
 215         free(buf8);
 216     }
 217 };
 218
 219
 220
 221 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 222                                struct icu_buf_utf8 * src8,
 223                                UErrorCode * status)
 224 {
 225     int32_t utf16_len = 0;
 226
 227     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 228                   &utf16_len,
 229                   (const char *) src8->utf8, src8->utf8_len, status);
 230
 231     // check for buffer overflow, resize and retry
 232     if (*status == U_BUFFER_OVERFLOW_ERROR
 233         //|| dest16->utf16_len > dest16->utf16_cap
 234         ){
 235         icu_buf_utf16_resize(dest16, utf16_len * 2);
 236         *status = U_ZERO_ERROR;
 237         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 238                       &utf16_len,
 239                       (const char *) src8->utf8, src8->utf8_len, status);
 240     }
 241
 242     //if (*status != U_BUFFER_OVERFLOW_ERROR
 243     if (U_SUCCESS(*status)
 244         && utf16_len < dest16->utf16_cap)
 245         dest16->utf16_len = utf16_len;
 246     else {
 247         dest16->utf16[0] = (UChar) 0;
 248         dest16->utf16_len = 0;
 249     }
 250
 251     return *status;
 252 };
 253
 254
 255
 256 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 257                                     const char * src8cstr,
 258                                     UErrorCode * status)
 259 {
 260     size_t src8cstr_len = 0;
 261     int32_t utf16_len = 0;
 262
 263     src8cstr_len = strlen(src8cstr);
 264
 265     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 266                   &utf16_len,
 267                   src8cstr, src8cstr_len, status);
 268
 269     // check for buffer overflow, resize and retry
 270     if (*status == U_BUFFER_OVERFLOW_ERROR
 271         //|| dest16->utf16_len > dest16->utf16_cap
 272         ){
 273         icu_buf_utf16_resize(dest16, utf16_len * 2);
 274         *status = U_ZERO_ERROR;
 275         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 276                       &utf16_len,
 277                       src8cstr, src8cstr_len, status);
 278     }
 279
 280     //  if (*status != U_BUFFER_OVERFLOW_ERROR
 281     if (U_SUCCESS(*status)
 282         && utf16_len < dest16->utf16_cap)
 283         dest16->utf16_len = utf16_len;
 284     else {
 285         dest16->utf16[0] = (UChar) 0;
 286         dest16->utf16_len = 0;
 287     }
 288
 289     return *status;
 290 };
 291
 292
 293
 294
 295 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 296                              struct icu_buf_utf16 * src16,
 297                              UErrorCode * status)
 298 {
 299     int32_t utf8_len = 0;
 300
 301     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 302                 &utf8_len,
 303                 src16->utf16, src16->utf16_len, status);
 304
 305     // check for buffer overflow, resize and retry
 306     if (*status == U_BUFFER_OVERFLOW_ERROR
 307         //|| dest8->utf8_len > dest8->utf8_cap
 308         ){
 309         icu_buf_utf8_resize(dest8, utf8_len * 2);
 310         *status = U_ZERO_ERROR;
 311         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 312                     &utf8_len,
 313                     src16->utf16, src16->utf16_len, status);
 314
 315     }
 316
 317     //if (*status != U_BUFFER_OVERFLOW_ERROR
 318     if (U_SUCCESS(*status)
 319         && utf8_len < dest8->utf8_cap)
 320         dest8->utf8_len = utf8_len;
 321     else {
 322         dest8->utf8[0] = (uint8_t) 0;
 323         dest8->utf8_len = 0;
 324     }
 325
 326     return *status;
 327 };
 328
 329
 330
 331 struct icu_casemap * icu_casemap_create(const char *locale, char action,
 332                                         UErrorCode *status)
 333 {
 334     struct icu_casemap * casemap
 335         = (struct icu_casemap *) malloc(sizeof(struct icu_casemap));
 336     strcpy(casemap->locale, locale);
 337     casemap->action = action;
 338
 339     switch(casemap->action) {
 340     case 'l':
 341         break;
 342     case 'u':
 343         break;
 344     case 't':
 345         break;
 346     case 'f':
 347         break;
 348     default:
 349         icu_casemap_destroy(casemap);
 350         return 0;
 351     }
 352
 353     return casemap;
 354 };
 355
 356 void icu_casemap_destroy(struct icu_casemap * casemap)
 357 {
 358     if (casemap)
 359         free(casemap);
 360 };
 361
 362
 363 int icu_casemap_casemap(struct icu_casemap * casemap,
 364                         struct icu_buf_utf16 * dest16,
 365                         struct icu_buf_utf16 * src16,
 366                         UErrorCode *status)
 367 {
 368     if(!casemap)
 369         return 0;
 370
 371     return icu_utf16_casemap(dest16, src16,
 372                              casemap->locale, casemap->action, status);
 373 };
 374
 375
 376 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 377                       struct icu_buf_utf16 * src16,
 378                       const char *locale, char action,
 379                       UErrorCode *status)
 380 {
 381     int32_t dest16_len = 0;
 382
 383     switch(action) {
 384     case 'l':
 385         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 386                                   src16->utf16, src16->utf16_len,
 387                                   locale, status);
 388         break;
 389     case 'u':
 390         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 391                                   src16->utf16, src16->utf16_len,
 392                                   locale, status);
 393         break;
 394     case 't':
 395         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 396                                   src16->utf16, src16->utf16_len,
 397                                   0, locale, status);
 398         break;
 399     case 'f':
 400         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 401                                    src16->utf16, src16->utf16_len,
 402                                    U_FOLD_CASE_DEFAULT, status);
 403         break;
 404
 405     default:
 406         return U_UNSUPPORTED_ERROR;
 407         break;
 408     }
 409
 410     // check for buffer overflow, resize and retry
 411     if (*status == U_BUFFER_OVERFLOW_ERROR
 412         && dest16 != src16        // do not resize if in-place conversion
 413         //|| dest16_len > dest16->utf16_cap
 414         ){
 415         icu_buf_utf16_resize(dest16, dest16_len * 2);
 416         *status = U_ZERO_ERROR;
 417
 418
 419         switch(action) {
 420         case 'l':
 421             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 422                                       src16->utf16, src16->utf16_len,
 423                                       locale, status);
 424             break;
 425         case 'u':
 426             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 427                                       src16->utf16, src16->utf16_len,
 428                                       locale, status);
 429             break;
 430         case 't':
 431             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 432                                       src16->utf16, src16->utf16_len,
 433                                       0, locale, status);
 434             break;
 435         case 'f':
 436             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 437                                        src16->utf16, src16->utf16_len,
 438                                        U_FOLD_CASE_DEFAULT, status);
 439             break;
 440
 441         default:
 442             return U_UNSUPPORTED_ERROR;
 443             break;
 444         }
 445     }
 446
 447     if (U_SUCCESS(*status)
 448         && dest16_len < dest16->utf16_cap)
 449         dest16->utf16_len = dest16_len;
 450     else {
 451         dest16->utf16[0] = (UChar) 0;
 452         dest16->utf16_len = 0;
 453     }
 454
 455     return *status;
 456 };
 457
 458
 459
 460 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
 461                                    struct icu_buf_utf8 * dest8,
 462                                    struct icu_buf_utf16 * src16,
 463                                    UErrorCode * status)
 464 {
 465
 466     int32_t sortkey_len = 0;
 467
 468     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 469                                   dest8->utf8, dest8->utf8_cap);
 470
 471     // check for buffer overflow, resize and retry
 472     if (sortkey_len > dest8->utf8_cap) {
 473         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 474         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 475                                       dest8->utf8, dest8->utf8_cap);
 476     }
 477
 478     if (U_SUCCESS(*status)
 479         && sortkey_len > 0)
 480         dest8->utf8_len = sortkey_len;
 481     else {
 482         dest8->utf8[0] = (UChar) 0;
 483         dest8->utf8_len = 0;
 484     }
 485
 486     return sortkey_len;
 487 };
 488
 489
 490
 491 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 492                                             UErrorCode *status)
 493 {
 494     struct icu_tokenizer * tokenizer
 495         = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
 496
 497     strcpy(tokenizer->locale, locale);
 498     tokenizer->action = action;
 499     tokenizer->bi = 0;
 500     tokenizer->buf16 = 0;
 501     tokenizer->token_count = 0;
 502     tokenizer->token_id = 0;
 503     tokenizer->token_start = 0;
 504     tokenizer->token_end = 0;
 505
 506
 507     switch(tokenizer->action) {
 508     case 'l':
 509         tokenizer->bi
 510             = ubrk_open(UBRK_LINE, tokenizer->locale,
 511                         0, 0, status);
 512         break;
 513     case 's':
 514         tokenizer->bi
 515             = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
 516                         0, 0, status);
 517         break;
 518     case 'w':
 519         tokenizer->bi
 520             = ubrk_open(UBRK_WORD, tokenizer->locale,
 521                         0, 0, status);
 522         break;
 523     case 'c':
 524         tokenizer->bi
 525             = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
 526                         0, 0, status);
 527         break;
 528     case 't':
 529         tokenizer->bi
 530             = ubrk_open(UBRK_TITLE, tokenizer->locale,
 531                         0, 0, status);
 532         break;
 533     default:
 534         *status = U_UNSUPPORTED_ERROR;
 535         return 0;
 536         break;
 537     }
 538
 539     // ICU error stuff is a very  funny business
 540     if (U_SUCCESS(*status))
 541         return tokenizer;
 542
 543     // freeing if failed
 544     icu_tokenizer_destroy(tokenizer);
 545     return 0;
 546 };
 547
 548 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 549 {
 550     if (tokenizer) {
 551         if (tokenizer->bi)
 552             ubrk_close(tokenizer->bi);
 553         free(tokenizer);
 554     }
 555 };
 556
 557 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 558                          struct icu_buf_utf16 * src16,
 559                          UErrorCode *status)
 560 {
 561     if (!tokenizer || !tokenizer->bi || !src16)
 562         return 0;
 563
 564
 565     tokenizer->buf16 = src16;
 566     tokenizer->token_count = 0;
 567     tokenizer->token_id = 0;
 568     tokenizer->token_start = 0;
 569     tokenizer->token_end = 0;
 570
 571     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 572
 573
 574     if (U_FAILURE(*status))
 575         return 0;
 576
 577     return 1;
 578 };
 579
 580 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 581                          struct icu_buf_utf16 * tkn16,
 582                          UErrorCode *status)
 583 {
 584     int32_t tkn_start = 0;
 585     int32_t tkn_end = 0;
 586     int32_t tkn_len = 0;
 587
 588
 589     if (!tokenizer || !tokenizer->bi
 590         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 591         return 0;
 592
 593     // never change tokenizer->buf16 and keep always invariant
 594     // 0 <= tokenizer->token_start
 595     //   <= tokenizer->token_end
 596     //   <= tokenizer->buf16->utf16_len
 597     // returns length of token
 598
 599     if (0 == tokenizer->token_end) // first call
 600         tkn_start = ubrk_first(tokenizer->bi);
 601     else //successive calls
 602         tkn_start = tokenizer->token_end;
 603
 604     // get next position
 605     tkn_end = ubrk_next(tokenizer->bi);
 606
 607     // repairing invariant at end of ubrk, which is UBRK_DONE = -1
 608     if (UBRK_DONE == tkn_end)
 609         tkn_end = tokenizer->buf16->utf16_len;
 610
 611     // copy out if everything is well
 612     if(U_FAILURE(*status))
 613         return 0;
 614
 615     // everything OK, now update internal state
 616     tkn_len = tkn_end - tkn_start;
 617
 618     if (0 < tkn_len){
 619         tokenizer->token_count++;
 620         tokenizer->token_id++;
 621     } else {
 622         tokenizer->token_id = 0;
 623     }
 624     tokenizer->token_start = tkn_start;
 625     tokenizer->token_end = tkn_end;
 626
 627
 628     // copying into token buffer if it exists
 629     if (tkn16){
 630         if (tkn16->utf16_cap < tkn_len)
 631             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 632
 633         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 634                   tkn_len);
 635
 636         tkn16->utf16_len = tkn_len;
 637     }
 638
 639     return tkn_len;
 640 }
 641
 642
 643 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 644 {
 645     return tokenizer->token_id;
 646 };
 647
 648 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 649 {
 650     return tokenizer->token_start;
 651 };
 652
 653 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 654 {
 655     return tokenizer->token_end;
 656 };
 657
 658 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 659 {
 660     return (tokenizer->token_end - tokenizer->token_start);
 661 };
 662
 663 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 664 {
 665     return tokenizer->token_count;
 666 };
 667
 668
 669
 670 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
 671                                               UErrorCode *status)
 672 {
 673
 674     struct icu_normalizer * normalizer
 675         = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer));
 676
 677     normalizer->action = action;
 678     normalizer->trans = 0;
 679     normalizer->rules16 =  icu_buf_utf16_create(0);
 680     icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status);
 681
 682     switch(normalizer->action) {
 683     case 'f':
 684         normalizer->trans
 685             = utrans_openU(normalizer->rules16->utf16,
 686                            normalizer->rules16->utf16_len,
 687                            UTRANS_FORWARD,
 688                            0, 0,
 689                            normalizer->parse_error, status);
 690         // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
 691         break;
 692     case 'r':
 693         normalizer->trans
 694             = utrans_openU(normalizer->rules16->utf16,
 695                            normalizer->rules16->utf16_len,
 696                            UTRANS_REVERSE ,
 697                            0, 0,
 698                            normalizer->parse_error, status);
 699         // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans);
 700         break;
 701     default:
 702         *status = U_UNSUPPORTED_ERROR;
 703         return 0;
 704         break;
 705     }
 706
 707     if (U_SUCCESS(*status))
 708         return normalizer;
 709
 710     // freeing if failed
 711     icu_normalizer_destroy(normalizer);
 712     return 0;
 713 };
 714
 715
 716 void icu_normalizer_destroy(struct icu_normalizer * normalizer){
 717     if (normalizer) {
 718         if (normalizer->rules16)
 719             icu_buf_utf16_destroy(normalizer->rules16);
 720         if (normalizer->trans)
 721         {
 722             // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans);
 723             utrans_close(normalizer->trans);
 724         }
 725         free(normalizer);
 726     }
 727 };
 728
 729
 730
 731 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
 732                              struct icu_buf_utf16 * dest16,
 733                              struct icu_buf_utf16 * src16,
 734                              UErrorCode *status)
 735 {
 736     if (!normalizer || !normalizer->trans || !src16 || !dest16)
 737         return 0;
 738
 739     if (!icu_buf_utf16_copy(dest16, src16))
 740         return 0;
 741
 742     utrans_transUChars (normalizer->trans,
 743                         dest16->utf16, &(dest16->utf16_len),
 744                         dest16->utf16_cap,
 745                         0, &(src16->utf16_len), status);
 746
 747     if (U_FAILURE(*status)){
 748         dest16->utf16[0] = (UChar) 0;
 749         dest16->utf16_len = 0;
 750     }
 751
 752     return dest16->utf16_len;
 753 }
 754
 755
 756
 757
 758 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
 759                                               enum icu_chain_step_type type,
 760                                               const uint8_t * rule,
 761                                               struct icu_buf_utf16 * buf16,
 762                                               UErrorCode *status)
 763 {
 764     struct icu_chain_step * step = 0;
 765
 766     if(!chain || !type || !rule)
 767         return 0;
 768
 769     step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step));
 770
 771     step->type = type;
 772     step->more_tokens = 0;
 773     step->need_new_token = 1;
 774
 775     if (buf16)
 776         step->buf16 = buf16;
 777     else
 778         step->buf16 = 0;
 779
 780     // create auxilary objects
 781     switch(step->type) {
 782     case ICU_chain_step_type_display:
 783         break;
 784     case ICU_chain_step_type_norm:
 785         break;
 786     case ICU_chain_step_type_sort:
 787         break;
 788     case ICU_chain_step_type_casemap:
 789         step->u.casemap = icu_casemap_create((char *) chain->locale,
 790                                              (char) rule[0], status);
 791         break;
 792     case ICU_chain_step_type_normalize:
 793         step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status);
 794         break;
 795     case ICU_chain_step_type_tokenize:
 796         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
 797                                                  (char) rule[0], status);
 798         break;
 799     default:
 800         break;
 801     }
 802
 803     return step;
 804 };
 805
 806
 807 void icu_chain_step_destroy(struct icu_chain_step * step){
 808
 809     if (!step)
 810         return;
 811
 812     icu_chain_step_destroy(step->previous);
 813
 814     switch(step->type) {
 815     case ICU_chain_step_type_display:
 816         break;
 817     case ICU_chain_step_type_norm:
 818         break;
 819     case ICU_chain_step_type_sort:
 820         break;
 821     case ICU_chain_step_type_casemap:
 822         icu_casemap_destroy(step->u.casemap);
 823         icu_buf_utf16_destroy(step->buf16);
 824         break;
 825     case ICU_chain_step_type_normalize:
 826         icu_normalizer_destroy(step->u.normalizer);
 827         icu_buf_utf16_destroy(step->buf16);
 828         break;
 829     case ICU_chain_step_type_tokenize:
 830         icu_tokenizer_destroy(step->u.tokenizer);
 831         icu_buf_utf16_destroy(step->buf16);
 832         break;
 833     default:
 834         break;
 835     }
 836     free(step);
 837 };
 838
 839
 840
 841 struct icu_chain * icu_chain_create(const uint8_t * identifier,
 842                                     const uint8_t * locale)
 843 {
 844
 845     struct icu_chain * chain
 846         = (struct icu_chain *) malloc(sizeof(struct icu_chain));
 847
 848     strncpy((char *) chain->identifier, (const char *) identifier, 128);
 849     chain->identifier[128 - 1] = '\0';
 850     strncpy((char *) chain->locale, (const char *) locale, 16);
 851     chain->locale[16 - 1] = '\0';
 852
 853     chain->token_count = 0;
 854
 855     chain->display8 = icu_buf_utf8_create(0);
 856     chain->norm8 = icu_buf_utf8_create(0);
 857     chain->sort8 = icu_buf_utf8_create(0);
 858
 859     chain->src16 = icu_buf_utf16_create(0);
 860
 861     chain->steps = 0;
 862
 863     return chain;
 864 };
 865
 866
 867 void icu_chain_destroy(struct icu_chain * chain)
 868 {
 869     if (chain){
 870         icu_buf_utf8_destroy(chain->display8);
 871         icu_buf_utf8_destroy(chain->norm8);
 872         icu_buf_utf8_destroy(chain->sort8);
 873
 874         icu_buf_utf16_destroy(chain->src16);
 875
 876         icu_chain_step_destroy(chain->steps);
 877         free(chain);
 878     }
 879 };
 880
 881
 882
 883 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
 884                                         UErrorCode * status){
 885
 886     xmlNode *node = 0;
 887     struct icu_chain * chain = 0;
 888
 889     if (!xml_node
 890         ||xml_node->type != XML_ELEMENT_NODE
 891         || strcmp((const char *) xml_node->name, "icu_chain"))
 892
 893         return 0;
 894
 895     xmlChar *xml_id = xmlGetProp(xml_node, (xmlChar *) "id");
 896     xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
 897
 898     if (!xml_id || !strlen((const char *) xml_id)
 899         || !xml_locale || !strlen((const char *) xml_locale))
 900         return 0;
 901
 902     chain = icu_chain_create((const uint8_t *) xml_id,
 903                              (const uint8_t *) xml_locale);
 904
 905     xmlFree(xml_id);
 906     xmlFree(xml_locale);
 907     if (!chain)
 908         return 0;
 909
 910     for (node = xml_node->children; node; node = node->next)
 911     {
 912         if (node->type != XML_ELEMENT_NODE)
 913             continue;
 914
 915         xmlChar *xml_rule = xmlGetProp(node, (xmlChar *) "rule");
 916         struct icu_chain_step * step = 0;
 917
 918         if (!strcmp((const char *) node->name,
 919                     (const char *) "casemap")){
 920             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
 921                                          (const uint8_t *) xml_rule, status);
 922         }
 923         else if (!strcmp((const char *) node->name,
 924                          (const char *) "normalize")){
 925             step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize,
 926                                          (const uint8_t *) xml_rule, status);
 927         }
 928         else if (!strcmp((const char *) node->name,
 929                          (const char *) "tokenize")){
 930             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
 931                                          (const uint8_t *) xml_rule, status);
 932         }
 933         else if (!strcmp((const char *) node->name,
 934                          (const char *) "display")){
 935             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
 936                                          (const uint8_t *) "", status);
 937         }
 938         else if (!strcmp((const char *) node->name,
 939                          (const char *) "normal")){
 940             step = icu_chain_insert_step(chain, ICU_chain_step_type_norm,
 941                                          (const uint8_t *) "", status);
 942         }
 943         else if (!strcmp((const char *) node->name,
 944                          (const char *) "sort")){
 945             step = icu_chain_insert_step(chain, ICU_chain_step_type_sort,
 946                                          (const uint8_t *) "", status);
 947         }
 948
 949         xmlFree(xml_rule);
 950         if (!step || U_FAILURE(*status)){
 951             icu_chain_destroy(chain);
 952             return 0;
 953         }
 954
 955
 956     }
 957
 958     return chain;
 959 };
 960
 961
 962
 963 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
 964                                               enum icu_chain_step_type type,
 965                                               const uint8_t * rule,
 966                                               UErrorCode *status)
 967 {
 968     struct icu_chain_step * step = 0;
 969     struct icu_buf_utf16 * src16 = 0;
 970     struct icu_buf_utf16 * buf16 = 0;
 971
 972     if (!chain || !type || !rule)
 973         return 0;
 974
 975     // assign utf16 src buffers as needed
 976     if (chain->steps && chain->steps->buf16)
 977         src16 = chain->steps->buf16;
 978     else if (chain->src16)
 979         src16 = chain->src16;
 980     else
 981         return 0;
 982
 983
 984     // create utf16 destination buffers as needed, or
 985     switch(type) {
 986     case ICU_chain_step_type_display:
 987         buf16 = src16;
 988         break;
 989     case ICU_chain_step_type_norm:
 990         buf16 = src16;
 991         break;
 992     case ICU_chain_step_type_sort:
 993         buf16 = src16;
 994         break;
 995     case ICU_chain_step_type_casemap:
 996         buf16 = icu_buf_utf16_create(0);
 997         break;
 998     case ICU_chain_step_type_normalize:
 999         buf16 = icu_buf_utf16_create(0);
1000         break;
1001     case ICU_chain_step_type_tokenize:
1002         buf16 = icu_buf_utf16_create(0);
1003         break;
1004     default:
1005         break;
1006     }
1007
1008     // create actual chain step with this buffer
1009     step = icu_chain_step_create(chain, type, rule, buf16, status);
1010
1011     step->previous = chain->steps;
1012     chain->steps = step;
1013
1014     return step;
1015 };
1016
1017
/*
 * Produce the next token for one step of the chain.
 *
 * The step reads from the previous step's buf16 (advancing that step
 * recursively when a new input token is needed) or, when there is no
 * previous step, from chain->src16. Depending on step->type the result is
 * written to one of the chain's UTF-8 buffers (display8 / norm8 / sort8)
 * or to step->buf16 (casemap / normalize / tokenize).
 *
 * chain  - chain owning the step; must have a src16 buffer
 * step   - step to advance
 * status - ICU error code, handed to every helper called here
 *
 * Returns 1 when a token was produced, 0 when an argument is missing,
 * the step has no more tokens, the step type is unknown, or *status
 * holds a failure.
 */
int icu_chain_step_next_token(struct icu_chain * chain,
                              struct icu_chain_step * step,
                              UErrorCode *status)
{
    struct icu_buf_utf16 * src16 = 0;

    //printf("icu_chain_step_next_token %d\n", (int) step);

    // bail out on missing arguments or when this step is already exhausted
    if (!chain || !chain->src16 || !step || !step->more_tokens)
        return 0;

    // assign utf16 src buffers as needed, advance in previous steps
    // tokens until non-zero token met, and setting stop condition
    if (step->previous){
        src16 = step->previous->buf16;
        // pull a fresh token from the previous step; its result decides
        // whether this step has anything left to work on
        if (step->need_new_token)
            //while (step->more_tokens &&  !src16->utf16_len)
                step->more_tokens
                    = icu_chain_step_next_token(chain, step->previous, status);
    }
    else { // first step can only work once on chain->src16 input buffer
        src16 = chain->src16;
        step->more_tokens = 1;
    }

    // stop if nothing to process
    // i.e new token source was not properly assigned
    if (!step->more_tokens || !src16) // || !src16->utf16_len
        return 0;

    //printf("icu_chain_step_next_token %d working\n", (int) step);


    // perform the work, eventually put this steps output in
    // step->buf16 or the chains UTF8 output buffers
    switch(step->type) {
    case ICU_chain_step_type_display:
        icu_utf16_to_utf8(chain->display8, src16, status);
        break;
    case ICU_chain_step_type_norm:
        icu_utf16_to_utf8(chain->norm8, src16, status);
        break;
    case ICU_chain_step_type_sort:
        icu_utf16_to_utf8(chain->sort8, src16, status);
        break;
    case ICU_chain_step_type_casemap:
        icu_casemap_casemap(step->u.casemap,
                            step->buf16, src16, status);
        break;
    case ICU_chain_step_type_normalize:
        icu_normalizer_normalize(step->u.normalizer,
                                 step->buf16, src16, status);
        break;
    case ICU_chain_step_type_tokenize:
        // attach to new src16 token only first time during splitting
        if (step->need_new_token){
            icu_tokenizer_attach(step->u.tokenizer, src16, status);
            step->need_new_token = 0;
        }
        // splitting one src16 token into multiple buf16 tokens
        step->more_tokens
            = icu_tokenizer_next_token(step->u.tokenizer,
                                       step->buf16, status);
        // make sure to get new previous token if this one had been used up
        if (step->previous && !step->more_tokens){
            // the previous step delivered another token: re-attach the
            // tokenizer to it and try once more
            if (icu_chain_step_next_token(chain, step->previous, status)){
                icu_tokenizer_attach(step->u.tokenizer, src16, status);
                step->need_new_token = 0;
                step->more_tokens
                    = icu_tokenizer_next_token(step->u.tokenizer,
                                               step->buf16, status);
            }
        }
        // neither the current nor a fresh input token yielded anything
        if (0 == step->more_tokens)
            return 0;
        break;
    default:
        // unknown step type: nothing can be produced
        return 0;
        break;
    }



    // a step without a previous step that still asks for new input
    // (need_new_token set) has consumed chain->src16 once; mark it
    // exhausted so the next call returns 0
    if (!step->previous && step->need_new_token)
        step->more_tokens = 0;

    //printf("%d %d %d\n",
    //       step->more_tokens, src16->utf16_len, step->buf16->utf16_len);


    // any ICU failure recorded by the helpers above invalidates the token
    if (U_FAILURE(*status))
        return 0;

    return 1;
};
1115
1116
1117
1118 int icu_chain_assign_cstr(struct icu_chain * chain,
1119                           const char * src8cstr,
1120                           UErrorCode *status)
1121 {
1122     struct icu_chain_step * stp = 0;
1123
1124     if (!chain || !src8cstr)
1125         return 0;
1126
1127     stp = chain->steps;
1128
1129     // clear token count
1130     chain->token_count = 0;
1131
1132     // clear all steps stop states
1133
1134     while (stp){
1135         stp->more_tokens = 1;
1136         stp = stp->previous;
1137     }
1138
1139     // finally convert UTF8 to UTF16 string
1140     icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status);
1141
1142     if (U_FAILURE(*status))
1143         return 0;
1144
1145     return 1;
1146 };
1147
1148
1149
1150 int icu_chain_next_token(struct icu_chain * chain,
1151                          UErrorCode *status)
1152 {
1153     int success = 0;
1154
1155     if (!chain || !chain->steps)
1156         return 0;
1157
1158     success = icu_chain_step_next_token(chain, chain->steps, status);
1159
1160     if (success){
1161         chain->token_count++;
1162         return chain->token_count;
1163     }
1164
1165     return 0;
1166 };
1167
1168 int icu_chain_get_token_count(struct icu_chain * chain)
1169 {
1170     if (!chain)
1171         return 0;
1172
1173     return chain->token_count;
1174 };
1175
1176
1177
1178 const char * icu_chain_get_display(struct icu_chain * chain)
1179 {
1180     if (chain->display8)
1181         return (const char *) chain->display8->utf8;
1182
1183     return 0;
1184 };
1185
1186 const char * icu_chain_get_norm(struct icu_chain * chain)
1187 {
1188     if (chain->norm8)
1189         return (const char *) chain->norm8->utf8;
1190
1191     return 0;
1192 };
1193
1194 const char * icu_chain_get_sort(struct icu_chain * chain)
1195 {
1196     if (chain->sort8)
1197         return (const char *) chain->sort8->utf8;
1198
1199     return 0;
1200 };
1201
1202
1203
1204
1205 #endif // HAVE_ICU
1206
1207
1208
1209
1210 /*
1211  * Local variables:
1212  * c-basic-offset: 4
1213  * indent-tabs-mode: nil
1214  * End:
1215  * vim: shiftwidth=4 tabstop=8 expandtab
1216  */