/* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2009 Index Data
+ * Copyright (C) 1995-2010 Index Data
* See the file LICENSE for details.
*/
#include <yaz/log.h>
+#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <unicode/ustring.h> /* some more string fcns*/
#include <unicode/uchar.h> /* char names */
-struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
- UErrorCode *status)
+struct icu_tokenizer /* State of one tokenizer: a break iterator, a text buffer and the current token position. */
+{
+ char action; /* mode character stored by icu_tokenizer_reset; icu_tokenizer_create switches on it */
+ UBreakIterator* bi; /* ICU break iterator; 0 until one has been opened or cloned */
+ struct icu_buf_utf16 * buf16; /* UTF-16 buffer allocated in icu_tokenizer_reset; the iterator's text is set from it */
+ int32_t token_count; /* bounded below by token_id (see invariant); presumably tokens produced so far - confirm */
+ int32_t token_id; /* see invariant below; presumably index of the current token - confirm */
+ int32_t token_start; /* offset into buf16 (see invariant); presumably start of the current token - confirm */
+ int32_t token_end; /* offset into buf16 (see invariant); presumably end of the current token - confirm */
+/*
+ Invariants that must hold at all times:
+ 0 <= token_start
+ <= token_end
+ <= buf16->utf16_len
+ and
+ 0 <= token_id <= token_count
+*/
+};
+
+static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer,
+ char action) /* Initialise every field of an already allocated tokenizer: stores action, clears bi and the token counters, and allocates a new buf16. Old bi/buf16 values are overwritten without being released, so this is meant for freshly allocated structs (both callers in this file call it right after xmalloc). */
{
- struct icu_tokenizer * tokenizer
- = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
-
tokenizer->action = action;
tokenizer->bi = 0;
- tokenizer->buf16 = 0;
+ tokenizer->buf16 = icu_buf_utf16_create(0); /* new buffer for this tokenizer; the 0 argument is presumably the initial capacity - confirm */
tokenizer->token_count = 0;
tokenizer->token_id = 0;
tokenizer->token_start = 0;
tokenizer->token_end = 0;
+ tokenizer->bi = 0; /* NOTE(review): repeats the bi = 0 assignment made earlier in this function; redundant */
+}
+struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old) /* Allocates a new tokenizer with old's action and a clone of old's break iterator; buf16 and the token counters are newly initialised, not copied. old and old->bi must be non-null (asserted). */
+{
+ int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
+ UErrorCode status = U_ZERO_ERROR;
+ struct icu_tokenizer * tokenizer
+ = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
+
+ assert(old);
+ icu_tokenizer_reset(tokenizer, old->action); /* sets bi to 0 and allocates a new buf16 */
+ assert(old->bi);
+ tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status); /* NULL stack buffer: presumably ICU allocates the clone itself - confirm against ICU docs */
+ if (U_SUCCESS(status))
+ return tokenizer;
+ return tokenizer; /* NOTE(review): same return as the success branch, so a clone failure is not reported; the caller then gets a tokenizer whose bi may be unusable */
+}
+
+struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
+ UErrorCode *status)
+{
+ struct icu_tokenizer * tokenizer
+ = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
+
+ icu_tokenizer_reset(tokenizer, action);
switch (tokenizer->action)
{
case 'l':
{
if (tokenizer)
{
+ icu_buf_utf16_destroy(tokenizer->buf16);
if (tokenizer->bi)
ubrk_close(tokenizer->bi);
xfree(tokenizer);
if (!tokenizer || !tokenizer->bi || !src16)
return 0;
- tokenizer->buf16 = src16;
+ icu_buf_utf16_copy(tokenizer->buf16, src16);
+
tokenizer->token_count = 0;
tokenizer->token_id = 0;
tokenizer->token_start = 0;
tokenizer->token_end = 0;
- ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status);
+ ubrk_setText(tokenizer->bi,
+ tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
if (U_FAILURE(*status))
return 0;