X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=src%2Ficu_tokenizer.c;h=25b674c08b8e5890de88de5c6cd01efc100fbd11;hp=c7bba7fb63ec53f3fbd60b4315b4fef1bcf23306;hb=689388c889a644a40ca1f447cc862da009049836;hpb=54bd147d353f7f340a48a9da5c6a568446223371 diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c index c7bba7f..25b674c 100644 --- a/src/icu_tokenizer.c +++ b/src/icu_tokenizer.c @@ -1,5 +1,5 @@ /* This file is part of the YAZ toolkit. - * Copyright (C) 1995-2009 Index Data + * Copyright (C) 1995-2011 Index Data * See the file LICENSE for details. */ @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -26,20 +27,61 @@ #include /* some more string fcns*/ #include /* char names */ -struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, - UErrorCode *status) +struct icu_tokenizer +{ + char action; + UBreakIterator* bi; + struct icu_buf_utf16 * buf16; + int32_t token_count; + int32_t token_id; + int32_t token_start; + int32_t token_end; +/* + keep always invariant + 0 <= token_start + <= token_end + <= buf16->utf16_len + and invariant + 0 <= token_id <= token_count +*/ +}; + +static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer, + char action) { - struct icu_tokenizer * tokenizer - = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); - tokenizer->action = action; tokenizer->bi = 0; - tokenizer->buf16 = 0; + tokenizer->buf16 = icu_buf_utf16_create(0); tokenizer->token_count = 0; tokenizer->token_id = 0; tokenizer->token_start = 0; tokenizer->token_end = 0; + tokenizer->bi = 0; +} +struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old) +{ + int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE; + UErrorCode status = U_ZERO_ERROR; + struct icu_tokenizer * tokenizer + = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); + + assert(old); + icu_tokenizer_reset(tokenizer, old->action); + assert(old->bi); + tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status); + if (U_SUCCESS(status)) + return tokenizer; + return tokenizer; +} + +struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, + UErrorCode *status) +{ + struct icu_tokenizer * tokenizer + = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); + + icu_tokenizer_reset(tokenizer, action); switch (tokenizer->action) { case 'l': @@ -81,6 +123,7 @@ void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) { if (tokenizer) { + icu_buf_utf16_destroy(tokenizer->buf16); if (tokenizer->bi) ubrk_close(tokenizer->bi); xfree(tokenizer); @@ -94,13 +137,15 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, if (!tokenizer || !tokenizer->bi || !src16) return 0; - tokenizer->buf16 = src16; + icu_buf_utf16_copy(tokenizer->buf16, src16); + tokenizer->token_count = 0; tokenizer->token_id = 0; tokenizer->token_start = 0; tokenizer->token_end = 0; - ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status); + ubrk_setText(tokenizer->bi, + tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status); if (U_FAILURE(*status)) return 0; @@ -150,14 +195,16 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, { tokenizer->token_count++; tokenizer->token_id++; - } else { - tokenizer->token_id = 0; } + else + tokenizer->token_id = 0; + tokenizer->token_start = tkn_start; tokenizer->token_end = tkn_end; /* copying into token buffer if it exists */ - if (tkn16){ + if (tkn16) + { if (tkn16->utf16_cap < tkn_len) icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);