src/icu_I18N.c

   1 /* $Id: icu_I18N.c,v 1.9 2007-05-10 11:53:47 marc Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4    This file is part of Pazpar2.
   5
   6    Pazpar2 is free software; you can redistribute it and/or modify it under
   7    the terms of the GNU General Public License as published by the Free
   8    Software Foundation; either version 2, or (at your option) any later
   9    version.
  10
  11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14    for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with Pazpar2; see the file LICENSE.  If not, write to the
  18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19    02111-1307, USA.
  20 */
  21
  22 #if HAVE_CONFIG_H
  23 #include "cconfig.h"
  24 #endif
  25
  26 #define USE_TIMING 0
  27 #if USE_TIMING
  28 #include <yaz/timing.h>
  29 #endif
  30
  31
  32 #ifdef HAVE_ICU
  33 #include "icu_I18N.h"
  34
  35 #include <yaz/log.h>
  36
  37 #include <string.h>
  38 #include <stdlib.h>
  39 #include <stdio.h>
  40
  41 #include <unicode/ustring.h>  /* some more string fcns*/
  42 #include <unicode/uchar.h>    /* char names           */
  43
  44
  45 //#include <unicode/ustdio.h>
  46 //#include <unicode/utypes.h>   /* Basic ICU data types */
  47 #include <unicode/ucol.h>
  48 //#include <unicode/ucnv.h>     /* C   Converter API    */
  49 //#include <unicode/uloc.h>
  50 //#include <unicode/ubrk.h>
  51 /* #include <unicode/unistr.h> */
  52
  53
  54
  55
  56 int icu_check_status (UErrorCode status)
  57 {
  58     if(U_FAILURE(status)){
  59         yaz_log(YLOG_WARN,
  60                 "ICU: %d %s\n", status, u_errorName(status));
  61         return 0;
  62     }
  63     return 1;
  64
  65 }
  66
  67
  68
  69 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
  70 {
  71     struct icu_buf_utf16 * buf16
  72         = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16));
  73
  74     buf16->utf16 = 0;
  75     buf16->utf16_len = 0;
  76     buf16->utf16_cap = 0;
  77
  78     if (capacity > 0){
  79         buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  80         buf16->utf16[0] = (UChar) 0;
  81         buf16->utf16_cap = capacity;
  82     }
  83     return buf16;
  84 };
  85
  86
  87 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
  88                                             size_t capacity)
  89 {
  90     if (buf16){
  91         if (capacity >  0){
  92             if (0 == buf16->utf16)
  93                 buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity);
  94             else
  95                 buf16->utf16
  96                     = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity);
  97             buf16->utf16[0] = (UChar) 0;
  98             buf16->utf16_len = 0;
  99             buf16->utf16_cap = capacity;
 100         }
 101         else {
 102             if (buf16->utf16)
 103                 free(buf16->utf16);
 104             buf16->utf16 = 0;
 105             buf16->utf16_len = 0;
 106             buf16->utf16_cap = 0;
 107         }
 108     }
 109
 110     return buf16;
 111 };
 112
 113
 114 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16)
 115 {
 116     if (buf16){
 117         if (buf16->utf16)
 118             free(buf16->utf16);
 119         free(buf16);
 120     }
 121 };
 122
 123
 124
 125
 126
 127
 128 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
 129 {
 130     struct icu_buf_utf8 * buf8
 131         = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8));
 132
 133     buf8->utf8 = 0;
 134     buf8->utf8_len = 0;
 135     buf8->utf8_cap = 0;
 136
 137     if (capacity > 0){
 138         buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 139         buf8->utf8[0] = (uint8_t) 0;
 140         buf8->utf8_cap = capacity;
 141     }
 142     return buf8;
 143 };
 144
 145
 146
 147 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
 148                                           size_t capacity)
 149 {
 150     if (buf8){
 151         if (capacity >  0){
 152             if (0 == buf8->utf8)
 153                 buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity);
 154             else
 155                 buf8->utf8
 156                     = (uint8_t *) realloc(buf8->utf8,
 157                                           sizeof(uint8_t) * capacity);
 158             buf8->utf8[0] = (uint8_t) 0;
 159             buf8->utf8_len = 0;
 160             buf8->utf8_cap = capacity;
 161         }
 162         else {
 163             if (buf8->utf8)
 164                 free(buf8->utf8);
 165             buf8->utf8 = 0;
 166             buf8->utf8_len = 0;
 167             buf8->utf8_cap = 0;
 168         }
 169     }
 170
 171     return buf8;
 172 };
 173
 174
 175
 176 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8)
 177 {
 178     if (buf8){
 179         if (buf8->utf8)
 180             free(buf8->utf8);
 181         free(buf8);
 182     }
 183 };
 184
 185
 186
 187 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
 188                                struct icu_buf_utf8 * src8,
 189                                UErrorCode * status)
 190 {
 191     int32_t utf16_len = 0;
 192
 193     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 194                   &utf16_len,
 195                   (const char *) src8->utf8, src8->utf8_len, status);
 196
 197     // check for buffer overflow, resize and retry
 198     if (*status == U_BUFFER_OVERFLOW_ERROR
 199         //|| dest16->utf16_len > dest16->utf16_cap
 200         ){
 201         icu_buf_utf16_resize(dest16, utf16_len * 2);
 202         *status = U_ZERO_ERROR;
 203         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 204                       &utf16_len,
 205                       (const char *) src8->utf8, src8->utf8_len, status);
 206     }
 207
 208     //if (*status != U_BUFFER_OVERFLOW_ERROR
 209     if (U_SUCCESS(*status)
 210         && utf16_len < dest16->utf16_cap)
 211         dest16->utf16_len = utf16_len;
 212     else {
 213         dest16->utf16[0] = (UChar) 0;
 214         dest16->utf16_len = 0;
 215     }
 216
 217     return *status;
 218 };
 219
 220
 221
 222 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
 223                                     const char * src8cstr,
 224                                     UErrorCode * status)
 225 {
 226     size_t src8cstr_len = 0;
 227     int32_t utf16_len = 0;
 228
 229     src8cstr_len = strlen(src8cstr);
 230
 231     u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 232                   &utf16_len,
 233                   src8cstr, src8cstr_len, status);
 234
 235     // check for buffer overflow, resize and retry
 236     if (*status == U_BUFFER_OVERFLOW_ERROR
 237         //|| dest16->utf16_len > dest16->utf16_cap
 238         ){
 239         icu_buf_utf16_resize(dest16, utf16_len * 2);
 240         *status = U_ZERO_ERROR;
 241         u_strFromUTF8(dest16->utf16, dest16->utf16_cap,
 242                       &utf16_len,
 243                       src8cstr, src8cstr_len, status);
 244     }
 245
 246     //  if (*status != U_BUFFER_OVERFLOW_ERROR
 247     if (U_SUCCESS(*status)
 248         && utf16_len < dest16->utf16_cap)
 249         dest16->utf16_len = utf16_len;
 250     else {
 251         dest16->utf16[0] = (UChar) 0;
 252         dest16->utf16_len = 0;
 253     }
 254
 255     return *status;
 256 };
 257
 258
 259
 260
 261 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
 262                              struct icu_buf_utf16 * src16,
 263                              UErrorCode * status)
 264 {
 265     int32_t utf8_len = 0;
 266
 267     u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 268                 &utf8_len,
 269                 src16->utf16, src16->utf16_len, status);
 270
 271     // check for buffer overflow, resize and retry
 272     if (*status == U_BUFFER_OVERFLOW_ERROR
 273         //|| dest8->utf8_len > dest8->utf8_cap
 274         ){
 275         icu_buf_utf8_resize(dest8, utf8_len * 2);
 276         *status = U_ZERO_ERROR;
 277         u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap,
 278                     &utf8_len,
 279                     src16->utf16, src16->utf16_len, status);
 280
 281     }
 282
 283     //if (*status != U_BUFFER_OVERFLOW_ERROR
 284     if (U_SUCCESS(*status)
 285         && utf8_len < dest8->utf8_cap)
 286         dest8->utf8_len = utf8_len;
 287     else {
 288         dest8->utf8[0] = (uint8_t) 0;
 289         dest8->utf8_len = 0;
 290     }
 291
 292     return *status;
 293 };
 294
 295
 296
 297 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
 298                       struct icu_buf_utf16 * src16,
 299                       const char *locale, char action,
 300                       UErrorCode *status)
 301 {
 302     int32_t dest16_len = 0;
 303
 304     switch(action) {
 305     case 'l':
 306         dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 307                                   src16->utf16, src16->utf16_len,
 308                                   locale, status);
 309         break;
 310     case 'u':
 311         dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 312                                   src16->utf16, src16->utf16_len,
 313                                   locale, status);
 314         break;
 315     case 't':
 316         dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 317                                   src16->utf16, src16->utf16_len,
 318                                   0, locale, status);
 319         break;
 320     case 'f':
 321         dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 322                                    src16->utf16, src16->utf16_len,
 323                                    U_FOLD_CASE_DEFAULT, status);
 324         break;
 325
 326     default:
 327         return U_UNSUPPORTED_ERROR;
 328         break;
 329     }
 330
 331     // check for buffer overflow, resize and retry
 332     if (*status == U_BUFFER_OVERFLOW_ERROR
 333         //|| dest16_len > dest16->utf16_cap
 334         ){
 335         icu_buf_utf16_resize(dest16, dest16_len * 2);
 336         *status = U_ZERO_ERROR;
 337
 338
 339         switch(action) {
 340         case 'l':
 341             dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap,
 342                                       src16->utf16, src16->utf16_len,
 343                                       locale, status);
 344             break;
 345         case 'u':
 346             dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap,
 347                                       src16->utf16, src16->utf16_len,
 348                                       locale, status);
 349             break;
 350         case 't':
 351             dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap,
 352                                       src16->utf16, src16->utf16_len,
 353                                       0, locale, status);
 354             break;
 355         case 'f':
 356             dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap,
 357                                        src16->utf16, src16->utf16_len,
 358                                        U_FOLD_CASE_DEFAULT, status);
 359             break;
 360
 361         default:
 362             return U_UNSUPPORTED_ERROR;
 363             break;
 364         }
 365     }
 366
 367     if (U_SUCCESS(*status)
 368         && dest16_len < dest16->utf16_cap)
 369         dest16->utf16_len = dest16_len;
 370     else {
 371         dest16->utf16[0] = (UChar) 0;
 372         dest16->utf16_len = 0;
 373     }
 374
 375     return *status;
 376 };
 377
 378
 379
 380 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
 381                                    struct icu_buf_utf8 * dest8,
 382                                    struct icu_buf_utf16 * src16,
 383                                    UErrorCode * status)
 384 {
 385
 386     int32_t sortkey_len = 0;
 387
 388     sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 389                                   dest8->utf8, dest8->utf8_cap);
 390
 391     // check for buffer overflow, resize and retry
 392     if (sortkey_len > dest8->utf8_cap) {
 393         icu_buf_utf8_resize(dest8, sortkey_len * 2);
 394         sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len,
 395                                       dest8->utf8, dest8->utf8_cap);
 396     }
 397
 398     if (U_SUCCESS(*status)
 399         && sortkey_len > 0)
 400         dest8->utf8_len = sortkey_len;
 401     else {
 402         dest8->utf8[0] = (UChar) 0;
 403         dest8->utf8_len = 0;
 404     }
 405
 406     return *status;
 407 };
 408
 409
 410
 411 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
 412                                             UErrorCode *status)
 413 {
 414     struct icu_tokenizer * tokenizer
 415         = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer));
 416
 417     strcpy(tokenizer->locale, locale);
 418     tokenizer->action = action;
 419     tokenizer->bi = 0;
 420     tokenizer->buf16 = 0;
 421     tokenizer->token_count = 0;
 422     tokenizer->token_id = 0;
 423     tokenizer->token_start = 0;
 424     tokenizer->token_end = 0;
 425
 426
 427     switch(tokenizer->action) {
 428     case 'l':
 429         tokenizer->bi
 430             = ubrk_open(UBRK_LINE, tokenizer->locale,
 431                         0, 0, status);
 432         break;
 433     case 's':
 434         tokenizer->bi
 435             = ubrk_open(UBRK_SENTENCE, tokenizer->locale,
 436                         0, 0, status);
 437         break;
 438     case 'w':
 439         tokenizer->bi
 440             = ubrk_open(UBRK_WORD, tokenizer->locale,
 441                         0, 0, status);
 442         break;
 443     case 'c':
 444         tokenizer->bi
 445             = ubrk_open(UBRK_CHARACTER, tokenizer->locale,
 446                         0, 0, status);
 447         break;
 448     case 't':
 449         tokenizer->bi
 450             = ubrk_open(UBRK_TITLE, tokenizer->locale,
 451                         0, 0, status);
 452         break;
 453     default:
 454         *status = U_UNSUPPORTED_ERROR;
 455         return 0;
 456         break;
 457     }
 458
 459     // ICU error stuff is a very  funny business
 460     if (U_SUCCESS(*status))
 461         return tokenizer;
 462
 463     // reestablishing zero error state
 464     //if (*status == U_USING_DEFAULT_WARNING)
 465     //    *status = U_ZERO_ERROR;
 466
 467
 468     // freeing if failed
 469     free(tokenizer);
 470     return 0;
 471 };
 472
 473 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 474 {
 475
 476     if (tokenizer) {
 477         if (tokenizer->bi)
 478             ubrk_close(tokenizer->bi);
 479         free(tokenizer);
 480     }
 481 };
 482
 483 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 484                          struct icu_buf_utf16 * src16,
 485                          UErrorCode *status)
 486 {
 487     if (!tokenizer || !tokenizer->bi || !src16)
 488         return 0;
 489
 490
 491     tokenizer->buf16 = src16;
 492     tokenizer->token_count = 0;
 493     tokenizer->token_id = 0;
 494     tokenizer->token_start = 0;
 495     tokenizer->token_end = 0;
 496
 497     ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
 498
 499
 500     if (U_FAILURE(*status))
 501         return 0;
 502
 503     return 1;
 504 };
 505
 506 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 507                          struct icu_buf_utf16 * tkn16,
 508                          UErrorCode *status)
 509 {
 510     int32_t tkn_start = 0;
 511     int32_t tkn_end = 0;
 512     int32_t tkn_len = 0;
 513
 514
 515     if (!tokenizer || !tokenizer->bi
 516         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 517         return 0;
 518
 519     // never change tokenizer->buf16 and keep always invariant
 520     // 0 <= tokenizer->token_start
 521     //   <= tokenizer->token_end
 522     //   <= tokenizer->buf16->utf16_len
 523     // returns length of token
 524
 525     if (0 == tokenizer->token_end) // first call
 526         tkn_start = ubrk_first(tokenizer->bi);
 527     else //successive calls
 528         tkn_start = tokenizer->token_end;
 529
 530     // get next position
 531     tkn_end = ubrk_next(tokenizer->bi);
 532
 533     // repairing invariant at end of ubrk, which is UBRK_DONE = -1
 534     if (UBRK_DONE == tkn_end)
 535         tkn_end = tokenizer->buf16->utf16_len;
 536
 537     // copy out if everything is well
 538     if(U_FAILURE(*status))
 539         return 0;
 540
 541     // everything OK, now update internal state
 542     tkn_len = tkn_end - tkn_start;
 543
 544     if (0 < tkn_len){
 545         tokenizer->token_count++;
 546         tokenizer->token_id++;
 547     } else {
 548         tokenizer->token_id = 0;
 549     }
 550     tokenizer->token_start = tkn_start;
 551     tokenizer->token_end = tkn_end;
 552
 553
 554     // copying into token buffer if it exists
 555     if (tkn16){
 556         if (tkn16->utf16_cap < tkn_len)
 557             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 558
 559         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 560                   tkn_len);
 561
 562         tkn16->utf16_len = tkn_len;
 563     }
 564
 565     return tkn_len;
 566 }
 567
 568
 569 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer)
 570 {
 571     return tokenizer->token_id;
 572 };
 573
 574 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer)
 575 {
 576     return tokenizer->token_start;
 577 };
 578
 579 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer)
 580 {
 581     return tokenizer->token_end;
 582 };
 583
 584 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer)
 585 {
 586     return (tokenizer->token_end - tokenizer->token_start);
 587 };
 588
 589 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 590 {
 591     return tokenizer->token_count;
 592 };
 593
 594
 595
 596
 597 #endif // HAVE_ICU
 598
 599
 600
 601
 602 /*
 603  * Local variables:
 604  * c-basic-offset: 4
 605  * indent-tabs-mode: nil
 606  * End:
 607  * vim: shiftwidth=4 tabstop=8 expandtab
 608  */