/* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2009 Index Data
+ * Copyright (C) 1995-2010 Index Data
* See the file LICENSE for details.
*/
#include <unicode/ustring.h> /* some more string fcns*/
#include <unicode/uchar.h> /* char names */
+struct icu_tokenizer /* state for one ICU break-iterator based tokenizer */
+{
+ char action; /* stored from the `action` argument of icu_tokenizer_create() */
+ UBreakIterator* bi; /* ICU break iterator; released with ubrk_close() when non-null */
+ struct icu_buf_utf16 * buf16; /* buffer held by the tokenizer: created with icu_buf_utf16_create(0), filled by icu_buf_utf16_copy() from the source text, then handed to ubrk_setText() */
+ int32_t token_count; /* reset to 0 whenever new text is attached */
+ int32_t token_id; /* reset to 0 whenever new text is attached */
+ int32_t token_start; /* reset to 0 whenever new text is attached */
+ int32_t token_end; /* reset to 0 whenever new text is attached */
+/*
+ Invariants that must hold at all times:
+ 0 <= token_start
+ <= token_end
+ <= buf16->utf16_len
+ and
+ 0 <= token_id <= token_count
+
+ NOTE(review): token_start/token_end are presumably offsets of the
+ current token within buf16->utf16 - confirm against the token
+ iteration code, which is not visible here.
+*/
+};
+
struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
UErrorCode *status)
{
tokenizer->action = action;
tokenizer->bi = 0;
- tokenizer->buf16 = 0;
+ tokenizer->buf16 = icu_buf_utf16_create(0);
tokenizer->token_count = 0;
tokenizer->token_id = 0;
tokenizer->token_start = 0;
{
if (tokenizer)
{
+ icu_buf_utf16_destroy(tokenizer->buf16);
if (tokenizer->bi)
ubrk_close(tokenizer->bi);
xfree(tokenizer);
if (!tokenizer || !tokenizer->bi || !src16)
return 0;
- tokenizer->buf16 = src16;
+ icu_buf_utf16_copy(tokenizer->buf16, src16);
+
tokenizer->token_count = 0;
tokenizer->token_id = 0;
tokenizer->token_start = 0;
tokenizer->token_end = 0;
- ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
+ ubrk_setText(tokenizer->bi,
+ tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
if (U_FAILURE(*status))
return 0;