src/icu_tokenizer.c

   1 /* This file is part of the YAZ toolkit.
   2  * Copyright (C) 1995-2010 Index Data
   3  * See the file LICENSE for details.
   4  */
   5
   6 /**
   7  * \file
   8  * \brief ICU tokenization - using ubrk_-functions from ICU
   9  */
  10
  11 #if HAVE_CONFIG_H
  12 #include "config.h"
  13 #endif
  14
  15 #if YAZ_HAVE_ICU
  16 #include <yaz/xmalloc.h>
  17
  18 #include <yaz/icu_I18N.h>
  19
  20 #include <yaz/log.h>
  21
  22 #include <string.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25
  26 #include <unicode/ustring.h>  /* some more string fcns*/
  27 #include <unicode/uchar.h>    /* char names           */
  28
  29 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
  30                                            UErrorCode *status)
  31 {
  32     struct icu_tokenizer * tokenizer
  33         = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
  34
  35     tokenizer->action = action;
  36     tokenizer->bi = 0;
  37     tokenizer->buf16 = icu_buf_utf16_create(0);
  38     tokenizer->token_count = 0;
  39     tokenizer->token_id = 0;
  40     tokenizer->token_start = 0;
  41     tokenizer->token_end = 0;
  42
  43     switch (tokenizer->action)
  44     {
  45     case 'l':
  46     case 'L':
  47         tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
  48         break;
  49     case 's':
  50     case 'S':
  51         tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
  52         break;
  53     case 'w':
  54     case 'W':
  55         tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
  56         break;
  57     case 'c':
  58     case 'C':
  59         tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
  60         break;
  61     case 't':
  62     case 'T':
  63         tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
  64         break;
  65     default:
  66         *status = U_UNSUPPORTED_ERROR;
  67         return 0;
  68         break;
  69     }
  70
  71     /* ICU error stuff is a very  funny business */
  72     if (U_SUCCESS(*status))
  73         return tokenizer;
  74
  75     /* freeing if failed */
  76     icu_tokenizer_destroy(tokenizer);
  77     return 0;
  78 }
  79
  80 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
  81 {
  82     if (tokenizer)
  83     {
  84         icu_buf_utf16_destroy(tokenizer->buf16);
  85         if (tokenizer->bi)
  86             ubrk_close(tokenizer->bi);
  87         xfree(tokenizer);
  88     }
  89 }
  90
  91 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
  92                          struct icu_buf_utf16 * src16,
  93                          UErrorCode *status)
  94 {
  95     if (!tokenizer || !tokenizer->bi || !src16)
  96         return 0;
  97
  98     icu_buf_utf16_copy(tokenizer->buf16, src16);
  99
 100     tokenizer->token_count = 0;
 101     tokenizer->token_id = 0;
 102     tokenizer->token_start = 0;
 103     tokenizer->token_end = 0;
 104
 105     ubrk_setText(tokenizer->bi,
 106                  tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
 107
 108     if (U_FAILURE(*status))
 109         return 0;
 110
 111     return 1;
 112 }
 113
 114 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
 115                                  struct icu_buf_utf16 * tkn16,
 116                                  UErrorCode *status)
 117 {
 118     int32_t tkn_start = 0;
 119     int32_t tkn_end = 0;
 120     int32_t tkn_len = 0;
 121
 122     if (!tokenizer || !tokenizer->bi
 123         || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
 124         return 0;
 125     /*
 126     never change tokenizer->buf16 and keep always invariant
 127     0 <= tokenizer->token_start
 128        <= tokenizer->token_end
 129        <= tokenizer->buf16->utf16_len
 130     returns length of token
 131     */
 132
 133     if (0 == tokenizer->token_end) /* first call */
 134         tkn_start = ubrk_first(tokenizer->bi);
 135     else /* successive calls */
 136         tkn_start = tokenizer->token_end;
 137
 138     /* get next position */
 139     tkn_end = ubrk_next(tokenizer->bi);
 140
 141     /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
 142     if (UBRK_DONE == tkn_end)
 143         tkn_end = tokenizer->buf16->utf16_len;
 144
 145     /* copy out if everything is well */
 146     if (U_FAILURE(*status))
 147         return 0;
 148
 149     /* everything OK, now update internal state */
 150     tkn_len = tkn_end - tkn_start;
 151
 152     if (0 < tkn_len)
 153     {
 154         tokenizer->token_count++;
 155         tokenizer->token_id++;
 156     } else {
 157         tokenizer->token_id = 0;
 158     }
 159     tokenizer->token_start = tkn_start;
 160     tokenizer->token_end = tkn_end;
 161
 162     /* copying into token buffer if it exists */
 163     if (tkn16){
 164         if (tkn16->utf16_cap < tkn_len)
 165             icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
 166
 167         u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
 168                   tkn_len);
 169
 170         tkn16->utf16_len = tkn_len;
 171     }
 172
 173     return tkn_len;
 174 }
 175
 176 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
 177 {
 178     return tokenizer->token_count;
 179 }
 180
 181 #endif /* YAZ_HAVE_ICU */
 182
 183 /*
 184  * Local variables:
 185  * c-basic-offset: 4
 186  * c-file-style: "Stroustrup"
 187  * indent-tabs-mode: nil
 188  * End:
 189  * vim: shiftwidth=4 tabstop=8 expandtab
 190  */
 191