X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=src%2Ficu_tokenizer.c;h=7e2fc3f0656b275fa57e814fb3b20a031b9795a1;hp=f9b4926c8ae410aacac32ab99fa519b3887bdd10;hb=94b1547e5951e1e01bf5180159e74095cd0527f4;hpb=ded9dadedab82a379e0e52d78a61fe9632ac870c diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c index f9b4926..7e2fc3f 100644 --- a/src/icu_tokenizer.c +++ b/src/icu_tokenizer.c @@ -1,5 +1,5 @@ /* This file is part of the YAZ toolkit. - * Copyright (C) 1995-2010 Index Data + * Copyright (C) 1995-2013 Index Data * See the file LICENSE for details. */ @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -37,20 +38,17 @@ struct icu_tokenizer int32_t token_end; /* keep always invariant - 0 <= token_start - <= token_end + 0 <= token_start + <= token_end <= buf16->utf16_len and invariant 0 <= token_id <= token_count */ }; -struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, - UErrorCode *status) +static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer, + char action) { - struct icu_tokenizer * tokenizer - = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); - tokenizer->action = action; tokenizer->bi = 0; tokenizer->buf16 = icu_buf_utf16_create(0); @@ -58,9 +56,34 @@ struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, tokenizer->token_id = 0; tokenizer->token_start = 0; tokenizer->token_end = 0; + tokenizer->bi = 0; +} + +struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old) +{ + int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE; + UErrorCode status = U_ZERO_ERROR; + struct icu_tokenizer * tokenizer + = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); + + assert(old); + icu_tokenizer_reset(tokenizer, old->action); + assert(old->bi); + tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status); + if (U_SUCCESS(status)) + return tokenizer; + return tokenizer; +} +struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, + UErrorCode *status) +{ + struct icu_tokenizer *tokenizer + = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); + + icu_tokenizer_reset(tokenizer, action); switch (tokenizer->action) - { + { case 'l': case 'L': tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status); @@ -86,7 +109,7 @@ struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, return 0; break; } - + /* ICU error stuff is a very funny business */ if (U_SUCCESS(*status)) return tokenizer; @@ -96,7 +119,7 @@ struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, return 0; } -void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) +void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer) { if (tokenizer) { @@ -107,8 +130,8 @@ void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) } } -int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * src16, +int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, + struct icu_buf_utf16 *src16, UErrorCode *status) { if (!tokenizer || !tokenizer->bi || !src16) @@ -123,16 +146,17 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, ubrk_setText(tokenizer->bi, tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status); - + if (U_FAILURE(*status)) return 0; return 1; } -int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * tkn16, - UErrorCode *status) +int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, + struct icu_buf_utf16 *tkn16, + UErrorCode *status, + size_t *start, size_t *len) { int32_t tkn_start = 0; int32_t tkn_end = 0; @@ -143,8 +167,8 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, return 0; /* never change tokenizer->buf16 and keep always invariant - 0 <= tokenizer->token_start - <= tokenizer->token_end + 0 <= tokenizer->token_start + <= tokenizer->token_end <= tokenizer->buf16->utf16_len returns length of token */ @@ -163,8 +187,8 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, /* copy out if everything is well */ if (U_FAILURE(*status)) - return 0; - + return 0; + /* everything OK, now update internal state */ tkn_len = tkn_end - tkn_start; @@ -172,18 +196,23 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, { tokenizer->token_count++; tokenizer->token_id++; - } else { - tokenizer->token_id = 0; } + else + tokenizer->token_id = 0; + tokenizer->token_start = tkn_start; - tokenizer->token_end = tkn_end; + tokenizer->token_end = tkn_end; + + *start = tkn_start; + *len = tkn_end - tkn_start; /* copying into token buffer if it exists */ - if (tkn16){ + if (tkn16) + { if (tkn16->utf16_cap < tkn_len) icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2); - u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], + u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], tkn_len); tkn16->utf16_len = tkn_len; @@ -192,7 +221,7 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, return tkn_len; } -int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer) +int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer) { return tokenizer->token_count; }