src/icu_tokenizer.c

   1 /* This file is part of the YAZ toolkit.
   2  * Copyright (C) Index Data
   3  * See the file LICENSE for details.
   4  */
   5
   6 /**
   7  * \file
   8  * \brief ICU tokenization - using ubrk_-functions from ICU
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #include "config.h"
  13 #endif
  14
  15 #if YAZ_HAVE_ICU
  16 #include <yaz/xmalloc.h>
  17
  18 #include <yaz/icu_I18N.h>
  19
  20 #include <yaz/log.h>
  21
  22 #include <assert.h>
  23 #include <string.h>
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26
  27 #include <unicode/ustring.h>  /* some more string fcns*/
  28 #include <unicode/uchar.h>    /* char names           */
  29
  30 struct icu_tokenizer
  31 {
  32     char action;
  33     UBreakIterator* bi;
  34     struct icu_buf_utf16 * buf16;
  35     int32_t token_count;
  36     int32_t token_id;
  37     int32_t token_start;
  38     int32_t token_end;
  39 /*
  40   keep always invariant
  41   0 <= token_start
  42   <= token_end
  43   <= buf16->utf16_len
  44   and invariant
  45   0 <= token_id <= token_count
  46 */
  47 };
  48
  49 static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer,
  50                                 char action)
  51 {
  52     tokenizer->action = action;
  53     tokenizer->bi = 0;
  54     tokenizer->buf16 = icu_buf_utf16_create(0);
  55     tokenizer->token_count = 0;
  56     tokenizer->token_id = 0;
  57     tokenizer->token_start = 0;
  58     tokenizer->token_end = 0;
  59     tokenizer->bi = 0;
  60 }
  61
  62 struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
  63 {
  64     int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
  65     UErrorCode status = U_ZERO_ERROR;
  66     struct icu_tokenizer * tokenizer
  67         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  68
  69     assert(old);
  70     icu_tokenizer_reset(tokenizer, old->action);
  71     assert(old->bi);
  72     tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status);
  73     if (U_SUCCESS(status))
  74         return tokenizer;
  75     return tokenizer;
  76 }
  77
  78 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
  79                                            UErrorCode *status)
  80 {
  81     struct icu_tokenizer *tokenizer
  82         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  83
  84     icu_tokenizer_reset(tokenizer, action);
  85     switch (tokenizer->action)
  86     {
  87     case 'l':
  88     case 'L':
  89         tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
  90         break;
  91     case 's':
  92     case 'S':
  93         tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
  94         break;
  95     case 'w':
  96     case 'W':
  97         tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
  98         break;
  99     case 'c':
 100     case 'C':
 101         tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
 102         break;
 103     case 't':
 104     case 'T':
 105         tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
 106         break;
 107     default:
 108         *status = U_UNSUPPORTED_ERROR;
 109         return 0;
 110         break;
 111     }
 112
 113     /* ICU error stuff is a very  funny business */
 114     if (U_SUCCESS(*status))
 115         return tokenizer;
 116
 117     /* freeing if failed */
 118     icu_tokenizer_destroy(tokenizer);
 119     return 0;
 120 }
 121
 122 void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
 123 {
 124     if (tokenizer)
 125     {
 126         icu_buf_utf16_destroy(tokenizer->buf16);
 127         if (tokenizer->bi)
 128             ubrk_close(tokenizer->bi);
 129         xfree(tokenizer);
 130     }
 131 }
 132
 133 int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
 134                          struct icu_buf_utf16 *src16,
 135                          UErrorCode *status)
 136 {
 137     if (!tokenizer || !tokenizer->bi || !src16)
 138         return 0;
 139
 140     icu_buf_utf16_copy(tokenizer->buf16, src16);
 141
 142     tokenizer->token_count = 0;
 143     tokenizer->token_id = 0;
 144     tokenizer->token_start = 0;
 145     tokenizer->token_end = 0;
 146
 147     ubrk_setText(tokenizer->bi,
 148                  tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
 149
 150     if (U_FAILURE(*status))
 151         return 0;
 152
 153     return 1;
 154 }
 155
 156 int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
 157                                  struct icu_buf_utf16 *tkn16,
 158                                  UErrorCode *status,
 159                                  size_t *start, size_t *len)
 160 {
 161     int32_t tkn_start = 0;
 162     int32_t tkn_end = 0;
 163     int32_t tkn_len = 0;
 164
 165     if (!tokenizer || !tokenizer->bi
 166         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 167         return 0;
 168     /*
 169     never change tokenizer->buf16 and keep always invariant
 170     0 <= tokenizer->token_start
 171        <= tokenizer->token_end
 172        <= tokenizer->buf16->utf16_len
 173     returns length of token
 174     */
 175
 176     if (0 == tokenizer->token_end) /* first call */
 177         tkn_start = ubrk_first(tokenizer->bi);
 178     else /* successive calls */
 179         tkn_start = tokenizer->token_end;
 180
 181     /* get next position */
 182     tkn_end = ubrk_next(tokenizer->bi);
 183
 184     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 185     if (UBRK_DONE == tkn_end)
 186         tkn_end = tokenizer->buf16->utf16_len;
 187
 188     /* copy out if everything is well */
 189     if (U_FAILURE(*status))
 190         return 0;
 191
 192     /* everything OK, now update internal state */
 193     tkn_len = tkn_end - tkn_start;
 194
 195     if (0 < tkn_len)
 196     {
 197         tokenizer->token_count++;
 198         tokenizer->token_id++;
 199     }
 200     else
 201         tokenizer->token_id = 0;
 202
 203     tokenizer->token_start = tkn_start;
 204     tokenizer->token_end = tkn_end;
 205
 206     *start = tkn_start;
 207     *len = tkn_end - tkn_start;
 208
 209     /* copying into token buffer if it exists */
 210     if (tkn16)
 211     {
 212         if (tkn16->utf16_cap < tkn_len)
 213             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 214
 215         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 216                   tkn_len);
 217
 218         tkn16->utf16_len = tkn_len;
 219     }
 220
 221     return tkn_len;
 222 }
 223
 224 int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
 225 {
 226     return tokenizer->token_count;
 227 }
 228
 229 #endif /* YAZ_HAVE_ICU */
 230
 231 /*
 232  * Local variables:
 233  * c-basic-offset: 4
 234  * c-file-style: "Stroustrup"
 235  * indent-tabs-mode: nil
 236  * End:
 237  * vim: shiftwidth=4 tabstop=8 expandtab
 238  */
 239