/* This file is part of the YAZ toolkit.
- * Copyright (C) 1995-2010 Index Data
+ * Copyright (C) Index Data
* See the file LICENSE for details.
*/
int32_t token_end;
/*
keep always invariant
- 0 <= token_start
- <= token_end
+ 0 <= token_start
+ <= token_end
<= buf16->utf16_len
and invariant
0 <= token_id <= token_count
struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
{
- uint32_t bufferSize = 10000;
- UErrorCode status = 0;
+ int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
+ UErrorCode status = U_ZERO_ERROR;
struct icu_tokenizer * tokenizer
= (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
UErrorCode *status)
{
- struct icu_tokenizer * tokenizer
+ struct icu_tokenizer *tokenizer
= (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
icu_tokenizer_reset(tokenizer, action);
switch (tokenizer->action)
- {
+ {
case 'l':
case 'L':
tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
return 0;
break;
}
-
+
/* ICU error stuff is a very funny business */
if (U_SUCCESS(*status))
return tokenizer;
return 0;
}
-void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer)
+void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
{
if (tokenizer)
{
}
}
-int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
- struct icu_buf_utf16 * src16,
+int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
+ struct icu_buf_utf16 *src16,
UErrorCode *status)
{
if (!tokenizer || !tokenizer->bi || !src16)
ubrk_setText(tokenizer->bi,
tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
-
+
if (U_FAILURE(*status))
return 0;
return 1;
}
-int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
- struct icu_buf_utf16 * tkn16,
- UErrorCode *status)
+int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
+ struct icu_buf_utf16 *tkn16,
+ UErrorCode *status,
+ size_t *start, size_t *len)
{
int32_t tkn_start = 0;
int32_t tkn_end = 0;
return 0;
/*
never change tokenizer->buf16 and keep always invariant
- 0 <= tokenizer->token_start
- <= tokenizer->token_end
+ 0 <= tokenizer->token_start
+ <= tokenizer->token_end
<= tokenizer->buf16->utf16_len
returns length of token
*/
/* copy out if everything is well */
if (U_FAILURE(*status))
- return 0;
-
+ return 0;
+
/* everything OK, now update internal state */
tkn_len = tkn_end - tkn_start;
{
tokenizer->token_count++;
tokenizer->token_id++;
- } else {
- tokenizer->token_id = 0;
}
+ else
+ tokenizer->token_id = 0;
+
tokenizer->token_start = tkn_start;
- tokenizer->token_end = tkn_end;
+ tokenizer->token_end = tkn_end;
+
+ *start = tkn_start;
+ *len = tkn_end - tkn_start;
/* copying into token buffer if it exists */
- if (tkn16){
+ if (tkn16)
+ {
if (tkn16->utf16_cap < tkn_len)
icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
- u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
+ u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
tkn_len);
tkn16->utf16_len = tkn_len;
return tkn_len;
}
-int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer)
+int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
{
return tokenizer->token_count;
}