src/icu_tokenizer.c

   1 /* This file is part of the YAZ toolkit.
   2  * Copyright (C) 1995-2010 Index Data
   3  * See the file LICENSE for details.
   4  */
   5
   6 /**
   7  * \file
   8  * \brief ICU tokenization - using ubrk_-functions from ICU
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #include "config.h"
  13 #endif
  14
  15 #if YAZ_HAVE_ICU
  16 #include <yaz/xmalloc.h>
  17
  18 #include <yaz/icu_I18N.h>
  19
  20 #include <yaz/log.h>
  21
  22 #include <string.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25
  26 #include <unicode/ustring.h>  /* some more string fcns*/
  27 #include <unicode/uchar.h>    /* char names           */
  28
  29 struct icu_tokenizer
  30 {
  31     char action;
  32     UBreakIterator* bi;
  33     struct icu_buf_utf16 * buf16;
  34     int32_t token_count;
  35     int32_t token_id;
  36     int32_t token_start;
  37     int32_t token_end;
  38 /*
  39   keep always invariant
  40   0 <= token_start
  41   <= token_end
  42   <= buf16->utf16_len
  43   and invariant
  44   0 <= token_id <= token_count
  45 */
  46 };
  47
  48 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
  49                                            UErrorCode *status)
  50 {
  51     struct icu_tokenizer * tokenizer
  52         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  53
  54     tokenizer->action = action;
  55     tokenizer->bi = 0;
  56     tokenizer->buf16 = icu_buf_utf16_create(0);
  57     tokenizer->token_count = 0;
  58     tokenizer->token_id = 0;
  59     tokenizer->token_start = 0;
  60     tokenizer->token_end = 0;
  61
  62     switch (tokenizer->action)
  63     {
  64     case 'l':
  65     case 'L':
  66         tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
  67         break;
  68     case 's':
  69     case 'S':
  70         tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
  71         break;
  72     case 'w':
  73     case 'W':
  74         tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
  75         break;
  76     case 'c':
  77     case 'C':
  78         tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
  79         break;
  80     case 't':
  81     case 'T':
  82         tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
  83         break;
  84     default:
  85         *status = U_UNSUPPORTED_ERROR;
  86         return 0;
  87         break;
  88     }
  89
  90     /* ICU error stuff is a very  funny business */
  91     if (U_SUCCESS(*status))
  92         return tokenizer;
  93
  94     /* freeing if failed */
  95     icu_tokenizer_destroy(tokenizer);
  96     return 0;
  97 }
  98
  99 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
 100 {
 101     if (tokenizer)
 102     {
 103         icu_buf_utf16_destroy(tokenizer->buf16);
 104         if (tokenizer->bi)
 105             ubrk_close(tokenizer->bi);
 106         xfree(tokenizer);
 107     }
 108 }
 109
 110 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 111                          struct icu_buf_utf16 * src16,
 112                          UErrorCode *status)
 113 {
 114     if (!tokenizer || !tokenizer->bi || !src16)
 115         return 0;
 116
 117     icu_buf_utf16_copy(tokenizer->buf16, src16);
 118
 119     tokenizer->token_count = 0;
 120     tokenizer->token_id = 0;
 121     tokenizer->token_start = 0;
 122     tokenizer->token_end = 0;
 123
 124     ubrk_setText(tokenizer->bi,
 125                  tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
 126
 127     if (U_FAILURE(*status))
 128         return 0;
 129
 130     return 1;
 131 }
 132
 133 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 134                                  struct icu_buf_utf16 * tkn16,
 135                                  UErrorCode *status)
 136 {
 137     int32_t tkn_start = 0;
 138     int32_t tkn_end = 0;
 139     int32_t tkn_len = 0;
 140
 141     if (!tokenizer || !tokenizer->bi
 142         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 143         return 0;
 144     /*
 145     never change tokenizer->buf16 and keep always invariant
 146     0 <= tokenizer->token_start
 147        <= tokenizer->token_end
 148        <= tokenizer->buf16->utf16_len
 149     returns length of token
 150     */
 151
 152     if (0 == tokenizer->token_end) /* first call */
 153         tkn_start = ubrk_first(tokenizer->bi);
 154     else /* successive calls */
 155         tkn_start = tokenizer->token_end;
 156
 157     /* get next position */
 158     tkn_end = ubrk_next(tokenizer->bi);
 159
 160     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 161     if (UBRK_DONE == tkn_end)
 162         tkn_end = tokenizer->buf16->utf16_len;
 163
 164     /* copy out if everything is well */
 165     if (U_FAILURE(*status))
 166         return 0;
 167
 168     /* everything OK, now update internal state */
 169     tkn_len = tkn_end - tkn_start;
 170
 171     if (0 < tkn_len)
 172     {
 173         tokenizer->token_count++;
 174         tokenizer->token_id++;
 175     } else {
 176         tokenizer->token_id = 0;
 177     }
 178     tokenizer->token_start = tkn_start;
 179     tokenizer->token_end = tkn_end;
 180
 181     /* copying into token buffer if it exists */
 182     if (tkn16){
 183         if (tkn16->utf16_cap < tkn_len)
 184             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 185
 186         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 187                   tkn_len);
 188
 189         tkn16->utf16_len = tkn_len;
 190     }
 191
 192     return tkn_len;
 193 }
 194
 195 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 196 {
 197     return tokenizer->token_count;
 198 }
 199
 200 #endif /* YAZ_HAVE_ICU */
 201
 202 /*
 203  * Local variables:
 204  * c-basic-offset: 4
 205  * c-file-style: "Stroustrup"
 206  * indent-tabs-mode: nil
 207  * End:
 208  * vim: shiftwidth=4 tabstop=8 expandtab
 209  */
 210