From: Adam Dickmeiss Date: Mon, 30 Nov 2009 13:21:24 +0000 (+0100) Subject: Merge branch 'icu_refactor' into yaz4 X-Git-Tag: v4.0.0~104 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=4adcf3fecf9b8d2880ee0b671c9f9d20da18ca49;hp=691a433ec15d5a3e113f2712956d2a14347a5bd2 Merge branch 'icu_refactor' into yaz4 --- diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h index 457f767..303fb67 100644 --- a/include/yaz/icu_I18N.h +++ b/include/yaz/icu_I18N.h @@ -40,9 +40,8 @@ #include /* Basic ICU data types */ #include /* char names */ -#include +#include #include -#include #include @@ -69,7 +68,7 @@ struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16); - +struct icu_buf_utf8; struct icu_buf_utf8 { @@ -88,23 +87,18 @@ struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8); -UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, - struct icu_buf_utf8 * src8, - UErrorCode * status); - UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, const char * src8cstr, UErrorCode * status); +const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8); + UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, struct icu_buf_utf16 * src16, UErrorCode * status); -struct icu_casemap -{ - char action; -}; +struct icu_casemap; struct icu_casemap * icu_casemap_create(char action, UErrorCode *status); @@ -157,20 +151,9 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, struct icu_buf_utf16 * tkn16, UErrorCode *status); -int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer); int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); - - 
-struct icu_transform -{ - char action; - UParseError parse_error; - UTransliterator * trans; -}; +struct icu_transform; struct icu_transform * icu_transform_create(const char *id, char action, const char *rules, @@ -183,83 +166,10 @@ int icu_transform_trans(struct icu_transform * transform, struct icu_buf_utf16 * src16, UErrorCode *status); -enum icu_chain_step_type { - ICU_chain_step_type_none, - ICU_chain_step_type_display, /* convert to utf8 display format */ - ICU_chain_step_type_casemap, /* apply utf16 charmap */ - ICU_chain_step_type_transform, /* apply utf16 transform */ - ICU_chain_step_type_tokenize, /* apply utf16 tokenization */ - ICU_chain_step_type_transliterate /* apply utf16 tokenization */ -}; - - - -struct icu_chain_step -{ - /* type and action object */ - enum icu_chain_step_type type; - union { - struct icu_casemap * casemap; - struct icu_transform * transform; - struct icu_tokenizer * tokenizer; - } u; - /* temprary post-action utf16 buffer */ - struct icu_buf_utf16 * buf16; - struct icu_chain_step * previous; - int more_tokens; - int need_new_token; -}; - - -struct icu_chain; - -struct icu_chain_step * icu_chain_step_create(yaz_icu_chain_t chain, - enum icu_chain_step_type type, - const uint8_t * rule, - struct icu_buf_utf16 * buf16, - UErrorCode *status); - - -void icu_chain_step_destroy(struct icu_chain_step * step); - - -struct icu_chain -{ - char *locale; - int sort; - - const char * src8cstr; - - UCollator * coll; - - /* number of tokens returned so far */ - int32_t token_count; - - /* utf8 output buffers */ - struct icu_buf_utf8 * display8; - struct icu_buf_utf8 * norm8; - struct icu_buf_utf8 * sort8; - - /* utf16 source buffer */ - struct icu_buf_utf16 * src16; - - /* linked list of chain steps */ - struct icu_chain_step * steps; -}; - -struct icu_chain_step * icu_chain_insert_step(yaz_icu_chain_t chain, - enum icu_chain_step_type type, - const uint8_t * rule, - UErrorCode *status); - -int icu_chain_step_next_token(yaz_icu_chain_t 
chain, - struct icu_chain_step * step, - UErrorCode *status); +struct icu_chain_step; int icu_chain_token_number(yaz_icu_chain_t chain); -const UCollator * icu_chain_get_coll(yaz_icu_chain_t chain); - yaz_icu_chain_t icu_chain_create(const char * locale, int sort, UErrorCode * status); diff --git a/src/Makefile.am b/src/Makefile.am index 9f8c018..032da2e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -111,7 +111,8 @@ libyaz_server_la_SOURCES = statserv.c seshigh.c eventl.c \ libyaz_server_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) -libyaz_icu_la_SOURCES = icu_I18N.c +libyaz_icu_la_SOURCES = icu_chain.c icu_utf16.c icu_utf8.c \ + icu_transform.c icu_casemap.c icu_tokenizer.c icu_sortkey.c libyaz_icu_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) # Rules for Z39.50 V3 diff --git a/src/icu_I18N.c b/src/icu_I18N.c deleted file mode 100644 index 9f1d13c..0000000 --- a/src/icu_I18N.c +++ /dev/null @@ -1,1254 +0,0 @@ -/* This file is part of the YAZ toolkit. - * Copyright (C) 1995-2009 Index Data - * See the file LICENSE for details. 
- */ - -/** - * \file icu_I18N.c - * \brief ICU utilities - */ - -#if HAVE_CONFIG_H -#include "config.h" -#endif - -#define USE_TIMING 0 -#if USE_TIMING -#include -#endif - -#if YAZ_HAVE_ICU -#include - -#include - -#include - -#include -#include -#include - -#include /* some more string fcns*/ -#include /* char names */ - - -#include - - -int icu_check_status (UErrorCode status) -{ - if (U_FAILURE(status)) - { - yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status)); - return 0; - } - return 1; - -} - - - -struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity) -{ - struct icu_buf_utf16 * buf16 - = (struct icu_buf_utf16 *) xmalloc(sizeof(struct icu_buf_utf16)); - - buf16->utf16 = 0; - buf16->utf16_len = 0; - buf16->utf16_cap = 0; - - if (capacity > 0){ - buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity); - buf16->utf16[0] = (UChar) 0; - buf16->utf16_cap = capacity; - } - return buf16; -} - -struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16) -{ - if (buf16){ - if (buf16->utf16) - buf16->utf16[0] = (UChar) 0; - buf16->utf16_len = 0; - } - return buf16; -} - -struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, - size_t capacity) -{ - if (!buf16) - return 0; - - if (capacity > 0){ - if (0 == buf16->utf16) - buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity); - else - buf16->utf16 - = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity); - - icu_buf_utf16_clear(buf16); - buf16->utf16_cap = capacity; - } - else { - xfree(buf16->utf16); - buf16->utf16 = 0; - buf16->utf16_len = 0; - buf16->utf16_cap = 0; - } - - return buf16; -} - - -struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16) -{ - if(!dest16 || !src16 - || dest16 == src16) - return 0; - - if (dest16->utf16_cap < src16->utf16_len) - icu_buf_utf16_resize(dest16, src16->utf16_len * 2); - - u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len); - dest16->utf16_len = 
src16->utf16_len; - - return dest16; -} - - -void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16) -{ - if (buf16) - xfree(buf16->utf16); - xfree(buf16); -} - - - -struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity) -{ - struct icu_buf_utf8 * buf8 - = (struct icu_buf_utf8 *) xmalloc(sizeof(struct icu_buf_utf8)); - - buf8->utf8 = 0; - buf8->utf8_len = 0; - buf8->utf8_cap = 0; - - if (capacity > 0){ - buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity); - buf8->utf8[0] = (uint8_t) 0; - buf8->utf8_cap = capacity; - } - return buf8; -} - - -struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8) -{ - if (buf8){ - if (buf8->utf8) - buf8->utf8[0] = (uint8_t) 0; - buf8->utf8_len = 0; - } - return buf8; -} - - -struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, - size_t capacity) -{ - if (!buf8) - return 0; - - if (capacity > 0){ - if (0 == buf8->utf8) - buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity); - else - buf8->utf8 - = (uint8_t *) xrealloc(buf8->utf8, sizeof(uint8_t) * capacity); - - buf8->utf8_cap = capacity; - } - else { - xfree(buf8->utf8); - buf8->utf8 = 0; - buf8->utf8_len = 0; - buf8->utf8_cap = 0; - } - - return buf8; -} - - -const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8) -{ - if (!src8 || src8->utf8_len == 0) - return ""; - - if (src8->utf8_len == src8->utf8_cap) - src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1); - - src8->utf8[src8->utf8_len] = '\0'; - - return (const char *) src8->utf8; -} - - -void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8) -{ - if (buf8) - xfree(buf8->utf8); - xfree(buf8); -} - - - -UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, - struct icu_buf_utf8 * src8, - UErrorCode * status) -{ - int32_t utf16_len = 0; - - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - (const char *) src8->utf8, src8->utf8_len, status); - - /* check for buffer overflow, resize and retry */ - if (*status == U_BUFFER_OVERFLOW_ERROR) - { - 
icu_buf_utf16_resize(dest16, utf16_len * 2); - *status = U_ZERO_ERROR; - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - (const char *) src8->utf8, src8->utf8_len, status); - } - - if (U_SUCCESS(*status) - && utf16_len <= dest16->utf16_cap) - dest16->utf16_len = utf16_len; - else - icu_buf_utf16_clear(dest16); - - return *status; -} - - - -UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, - const char * src8cstr, - UErrorCode * status) -{ - size_t src8cstr_len = 0; - int32_t utf16_len = 0; - - *status = U_ZERO_ERROR; - src8cstr_len = strlen(src8cstr); - - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - src8cstr, src8cstr_len, status); - - /* check for buffer overflow, resize and retry */ - if (*status == U_BUFFER_OVERFLOW_ERROR) - { - icu_buf_utf16_resize(dest16, utf16_len * 2); - *status = U_ZERO_ERROR; - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - src8cstr, src8cstr_len, status); - } - - if (U_SUCCESS(*status) - && utf16_len <= dest16->utf16_cap) - dest16->utf16_len = utf16_len; - else - icu_buf_utf16_clear(dest16); - - return *status; -} - - - - -UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, - struct icu_buf_utf16 * src16, - UErrorCode * status) -{ - int32_t utf8_len = 0; - - u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, - &utf8_len, - src16->utf16, src16->utf16_len, status); - - /* check for buffer overflow, resize and retry */ - if (*status == U_BUFFER_OVERFLOW_ERROR) - { - icu_buf_utf8_resize(dest8, utf8_len * 2); - *status = U_ZERO_ERROR; - u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, - &utf8_len, - src16->utf16, src16->utf16_len, status); - - } - - if (U_SUCCESS(*status) - && utf8_len <= dest8->utf8_cap) - dest8->utf8_len = utf8_len; - else - icu_buf_utf8_clear(dest8); - - return *status; -} - - - -struct icu_casemap * icu_casemap_create(char action, UErrorCode *status) -{ - struct icu_casemap * casemap - = (struct icu_casemap *) xmalloc(sizeof(struct icu_casemap)); 
- casemap->action = action; - - switch(casemap->action) { - case 'l': - case 'L': - case 'u': - case 'U': - case 't': - case 'T': - case 'f': - case 'F': - break; - default: - icu_casemap_destroy(casemap); - return 0; - } - - return casemap; -} - -void icu_casemap_destroy(struct icu_casemap * casemap) -{ - xfree(casemap); -} - - -int icu_casemap_casemap(struct icu_casemap * casemap, - struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - UErrorCode *status, - const char *locale) -{ - if(!casemap) - return 0; - - return icu_utf16_casemap(dest16, src16, locale, - casemap->action, status); -} - - -int icu_utf16_casemap(struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - const char *locale, char action, - UErrorCode *status) -{ - int32_t dest16_len = 0; - - - if (!src16->utf16_len){ /* guarding for empty source string */ - if (dest16->utf16) - dest16->utf16[0] = (UChar) 0; - dest16->utf16_len = 0; - return U_ZERO_ERROR; - } - - - switch(action) { - case 'l': - case 'L': - dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 'u': - case 'U': - dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 't': - case 'T': - dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - 0, locale, status); - break; - case 'f': - case 'F': - dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - U_FOLD_CASE_DEFAULT, status); - break; - - default: - return U_UNSUPPORTED_ERROR; - break; - } - - /* check for buffer overflow, resize and retry */ - if (*status == U_BUFFER_OVERFLOW_ERROR - && dest16 != src16 /* do not resize if in-place conversion */ - ){ - icu_buf_utf16_resize(dest16, dest16_len * 2); - *status = U_ZERO_ERROR; - - - switch(action) { - case 'l': - case 'L': - dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, - 
src16->utf16, src16->utf16_len, - locale, status); - break; - case 'u': - case 'U': - dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 't': - case 'T': - dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - 0, locale, status); - break; - case 'f': - case 'F': - dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - U_FOLD_CASE_DEFAULT, status); - break; - - default: - return U_UNSUPPORTED_ERROR; - break; - } - } - - if (U_SUCCESS(*status) - && dest16_len <= dest16->utf16_cap) - dest16->utf16_len = dest16_len; - else { - if (dest16->utf16) - dest16->utf16[0] = (UChar) 0; - dest16->utf16_len = 0; - } - - return *status; -} - - - -void icu_sortkey8_from_utf16(UCollator *coll, - struct icu_buf_utf8 * dest8, - struct icu_buf_utf16 * src16, - UErrorCode * status) -{ - - int32_t sortkey_len = 0; - - sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, - dest8->utf8, dest8->utf8_cap); - - /* check for buffer overflow, resize and retry */ - if (sortkey_len > dest8->utf8_cap) { - icu_buf_utf8_resize(dest8, sortkey_len * 2); - sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, - dest8->utf8, dest8->utf8_cap); - } - - if (U_SUCCESS(*status) - && sortkey_len > 0) - dest8->utf8_len = sortkey_len; - else - icu_buf_utf8_clear(dest8); -} - - - -struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, - UErrorCode *status) -{ - struct icu_tokenizer * tokenizer - = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); - - tokenizer->action = action; - tokenizer->bi = 0; - tokenizer->buf16 = 0; - tokenizer->token_count = 0; - tokenizer->token_id = 0; - tokenizer->token_start = 0; - tokenizer->token_end = 0; - - - switch(tokenizer->action) { - case 'l': - case 'L': - tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status); - break; - case 's': - case 'S': - 
tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status); - break; - case 'w': - case 'W': - tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status); - break; - case 'c': - case 'C': - tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status); - break; - case 't': - case 'T': - tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status); - break; - default: - *status = U_UNSUPPORTED_ERROR; - return 0; - break; - } - - /* ICU error stuff is a very funny business */ - if (U_SUCCESS(*status)) - return tokenizer; - - /* freeing if failed */ - icu_tokenizer_destroy(tokenizer); - return 0; -} - -void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) -{ - if (tokenizer) { - if (tokenizer->bi) - ubrk_close(tokenizer->bi); - xfree(tokenizer); - } -} - -int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * src16, - UErrorCode *status) -{ - if (!tokenizer || !tokenizer->bi || !src16) - return 0; - - - tokenizer->buf16 = src16; - tokenizer->token_count = 0; - tokenizer->token_id = 0; - tokenizer->token_start = 0; - tokenizer->token_end = 0; - - ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status); - - - if (U_FAILURE(*status)) - return 0; - - return 1; -}; - -int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * tkn16, - UErrorCode *status) -{ - int32_t tkn_start = 0; - int32_t tkn_end = 0; - int32_t tkn_len = 0; - - - if (!tokenizer || !tokenizer->bi - || !tokenizer->buf16 || !tokenizer->buf16->utf16_len) - return 0; - - /* - never change tokenizer->buf16 and keep always invariant - 0 <= tokenizer->token_start - <= tokenizer->token_end - <= tokenizer->buf16->utf16_len - returns length of token - */ - - if (0 == tokenizer->token_end) /* first call */ - tkn_start = ubrk_first(tokenizer->bi); - else /* successive calls */ - tkn_start = tokenizer->token_end; - - /* get next position */ - tkn_end = ubrk_next(tokenizer->bi); - - /* repairing invariant at end of ubrk, which is 
UBRK_DONE = -1 */ - if (UBRK_DONE == tkn_end) - tkn_end = tokenizer->buf16->utf16_len; - - /* copy out if everything is well */ - if(U_FAILURE(*status)) - return 0; - - /* everything OK, now update internal state */ - tkn_len = tkn_end - tkn_start; - - if (0 < tkn_len){ - tokenizer->token_count++; - tokenizer->token_id++; - } else { - tokenizer->token_id = 0; - } - tokenizer->token_start = tkn_start; - tokenizer->token_end = tkn_end; - - - /* copying into token buffer if it exists */ - if (tkn16){ - if (tkn16->utf16_cap < tkn_len) - icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2); - - u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], - tkn_len); - - tkn16->utf16_len = tkn_len; - } - - return tkn_len; -} - - -int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_id; -} - -int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_start; -} - -int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_end; -} - -int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer) -{ - return (tokenizer->token_end - tokenizer->token_start); -} - -int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_count; -} - - - -struct icu_transform * icu_transform_create(const char *id, char action, - const char *rules, - UErrorCode *status) -{ - struct icu_buf_utf16 *id16 = icu_buf_utf16_create(0); - struct icu_buf_utf16 *rules16 = icu_buf_utf16_create(0); - - struct icu_transform * transform - = (struct icu_transform *) xmalloc(sizeof(struct icu_transform)); - - transform->action = action; - transform->trans = 0; - - if (id) - icu_utf16_from_utf8_cstr(id16, id, status); - if (rules) - icu_utf16_from_utf8_cstr(rules16, rules, status); - - switch(transform->action) - { - case 'f': - case 'F': - transform->trans - = utrans_openU(id16->utf16, - id16->utf16_len, - UTRANS_FORWARD, - rules16->utf16, - 
rules16->utf16_len, - &transform->parse_error, status); - break; - case 'r': - case 'R': - transform->trans - = utrans_openU(id16->utf16, - id16->utf16_len, - UTRANS_REVERSE , - rules16->utf16, - rules16->utf16_len, - &transform->parse_error, status); - break; - default: - *status = U_UNSUPPORTED_ERROR; - break; - } - icu_buf_utf16_destroy(rules16); - icu_buf_utf16_destroy(id16); - - if (U_SUCCESS(*status)) - return transform; - - /* freeing if failed */ - icu_transform_destroy(transform); - return 0; -} - - -void icu_transform_destroy(struct icu_transform * transform){ - if (transform) { - if (transform->trans) - utrans_close(transform->trans); - xfree(transform); - } -} - - - -int icu_transform_trans(struct icu_transform * transform, - struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - UErrorCode *status) -{ - if (!transform || !transform->trans - || !src16 - || !dest16) - return 0; - - if (!src16->utf16_len){ /* guarding for empty source string */ - icu_buf_utf16_clear(dest16); - return 0; - } - - if (!icu_buf_utf16_copy(dest16, src16)) - return 0; - - - utrans_transUChars (transform->trans, - dest16->utf16, &(dest16->utf16_len), - dest16->utf16_cap, - 0, &(src16->utf16_len), status); - - if (U_FAILURE(*status)) - icu_buf_utf16_clear(dest16); - - return dest16->utf16_len; -} - - - - -struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, - enum icu_chain_step_type type, - const uint8_t * rule, - struct icu_buf_utf16 * buf16, - UErrorCode *status) -{ - struct icu_chain_step * step = 0; - - if(!chain || !type || !rule) - return 0; - - step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step)); - - step->type = type; - - step->buf16 = buf16; - - /* create auxilary objects */ - switch(step->type) { - case ICU_chain_step_type_display: - break; - case ICU_chain_step_type_casemap: - step->u.casemap = icu_casemap_create(rule[0], status); - break; - case ICU_chain_step_type_transform: - /* rule omitted. 
Only ID used */ - step->u.transform = icu_transform_create((const char *) rule, 'f', - 0, status); - break; - case ICU_chain_step_type_tokenize: - step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, - (char) rule[0], status); - break; - case ICU_chain_step_type_transliterate: - /* we pass a dummy ID to utrans_openU.. */ - step->u.transform = icu_transform_create("custom", 'f', - (const char *) rule, status); - break; - default: - break; - } - - return step; -} - - -void icu_chain_step_destroy(struct icu_chain_step * step){ - - if (!step) - return; - - icu_chain_step_destroy(step->previous); - - switch(step->type) { - case ICU_chain_step_type_display: - break; - case ICU_chain_step_type_casemap: - icu_casemap_destroy(step->u.casemap); - icu_buf_utf16_destroy(step->buf16); - break; - case ICU_chain_step_type_transform: - case ICU_chain_step_type_transliterate: - icu_transform_destroy(step->u.transform); - icu_buf_utf16_destroy(step->buf16); - break; - case ICU_chain_step_type_tokenize: - icu_tokenizer_destroy(step->u.tokenizer); - icu_buf_utf16_destroy(step->buf16); - break; - default: - break; - } - xfree(step); -} - - - -struct icu_chain * icu_chain_create(const char *locale, int sort, - UErrorCode * status) -{ - struct icu_chain * chain - = (struct icu_chain *) xmalloc(sizeof(struct icu_chain)); - - *status = U_ZERO_ERROR; - - chain->locale = xstrdup(locale); - - chain->sort = sort; - - chain->coll = ucol_open((const char *) chain->locale, status); - - if (U_FAILURE(*status)) - return 0; - - chain->token_count = 0; - - chain->src8cstr = 0; - - chain->display8 = icu_buf_utf8_create(0); - chain->norm8 = icu_buf_utf8_create(0); - chain->sort8 = icu_buf_utf8_create(0); - - chain->src16 = icu_buf_utf16_create(0); - - chain->steps = 0; - - return chain; -} - - -void icu_chain_destroy(struct icu_chain * chain) -{ - if (chain) - { - if (chain->coll) - ucol_close(chain->coll); - - icu_buf_utf8_destroy(chain->display8); - icu_buf_utf8_destroy(chain->norm8); - 
icu_buf_utf8_destroy(chain->sort8); - - icu_buf_utf16_destroy(chain->src16); - - icu_chain_step_destroy(chain->steps); - xfree(chain->locale); - xfree(chain); - } -} - - - -struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node, - int sort, - UErrorCode * status) -{ - xmlNode *node = 0; - struct icu_chain * chain = 0; - - *status = U_ZERO_ERROR; - - if (!xml_node ||xml_node->type != XML_ELEMENT_NODE) - return 0; - - { - xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node, - (xmlChar *) "locale"); - - if (xml_locale) - { - chain = icu_chain_create((const char *) xml_locale, sort, status); - xmlFree(xml_locale); - } - - } - if (!chain) - return 0; - - for (node = xml_node->children; node; node = node->next) - { - xmlChar *xml_rule; - struct icu_chain_step * step = 0; - - if (node->type != XML_ELEMENT_NODE) - continue; - - xml_rule = xmlGetProp(node, (xmlChar *) "rule"); - - if (!strcmp((const char *) node->name, "casemap")) - step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, - (const uint8_t *) xml_rule, status); - else if (!strcmp((const char *) node->name, "transform")) - step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, - (const uint8_t *) xml_rule, status); - else if (!strcmp((const char *) node->name, "transliterate")) - step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate, - (const uint8_t *) xml_rule, status); - else if (!strcmp((const char *) node->name, "tokenize")) - step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, - (const uint8_t *) xml_rule, status); - else if (!strcmp((const char *) node->name, "display")) - step = icu_chain_insert_step(chain, ICU_chain_step_type_display, - (const uint8_t *) "", status); - else if (!strcmp((const char *) node->name, "normalize")) - { - yaz_log(YLOG_WARN, "Element %s is deprecated. 
" - "Use transform instead", node->name); - step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, - (const uint8_t *) xml_rule, status); - } - else if (!strcmp((const char *) node->name, "index") - || !strcmp((const char *) node->name, "sortkey")) - { - yaz_log(YLOG_WARN, "Element %s is no longer needed. " - "Remove it from the configuration", node->name); - } - else - { - yaz_log(YLOG_WARN, "Unknown element %s", node->name); - icu_chain_destroy(chain); - return 0; - } - xmlFree(xml_rule); - if (step && U_FAILURE(*status)) - { - icu_chain_destroy(chain); - return 0; - } - } - return chain; -} - - - -struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, - enum icu_chain_step_type type, - const uint8_t * rule, - UErrorCode *status) -{ - struct icu_chain_step * step = 0; - struct icu_buf_utf16 * src16 = 0; - struct icu_buf_utf16 * buf16 = 0; - - if (!chain || !type || !rule) - return 0; - - /* assign utf16 src buffers as needed */ - if (chain->steps && chain->steps->buf16) - src16 = chain->steps->buf16; - else if (chain->src16) - src16 = chain->src16; - else - return 0; - - - /* create utf16 destination buffers as needed, or */ - switch(type) - { - case ICU_chain_step_type_display: - buf16 = src16; - break; - case ICU_chain_step_type_casemap: - buf16 = icu_buf_utf16_create(0); - break; - case ICU_chain_step_type_transform: - case ICU_chain_step_type_transliterate: - buf16 = icu_buf_utf16_create(0); - break; - case ICU_chain_step_type_tokenize: - buf16 = icu_buf_utf16_create(0); - break; - break; - default: - break; - } - - /* create actual chain step with this buffer */ - step = icu_chain_step_create(chain, type, rule, buf16, status); - - step->previous = chain->steps; - chain->steps = step; - - return step; -} - - -int icu_chain_step_next_token(struct icu_chain * chain, - struct icu_chain_step * step, - UErrorCode *status) -{ - struct icu_buf_utf16 * src16 = 0; - int got_new_token = 0; - - if (!chain || !chain->src16 || !step || 
!step->more_tokens) - return 0; - - /* assign utf16 src buffers as neeed, advance in previous steps - tokens until non-zero token met, and setting stop condition */ - - if (step->previous) - { - src16 = step->previous->buf16; - /* tokens might be killed in previous steps, therefore looping */ - - while (step->need_new_token - && step->previous->more_tokens - && !got_new_token) - got_new_token - = icu_chain_step_next_token(chain, step->previous, status); - } - else - { /* first step can only work once on chain->src16 input buffer */ - src16 = chain->src16; - step->more_tokens = 0; - got_new_token = 1; - } - - if (!src16) - return 0; - - /* stop if nothing to process */ - if (step->need_new_token && !got_new_token) - { - step->more_tokens = 0; - return 0; - } - - /* either an old token not finished yet, or a new token, thus - perform the work, eventually put this steps output in - step->buf16 or the chains UTF8 output buffers */ - - switch(step->type) - { - case ICU_chain_step_type_display: - icu_utf16_to_utf8(chain->display8, src16, status); - break; - case ICU_chain_step_type_casemap: - icu_casemap_casemap(step->u.casemap, - step->buf16, src16, status, - chain->locale); - break; - case ICU_chain_step_type_transform: - case ICU_chain_step_type_transliterate: - icu_transform_trans(step->u.transform, - step->buf16, src16, status); - break; - case ICU_chain_step_type_tokenize: - /* attach to new src16 token only first time during splitting */ - if (step->need_new_token) - { - icu_tokenizer_attach(step->u.tokenizer, src16, status); - step->need_new_token = 0; - } - - /* splitting one src16 token into multiple buf16 tokens */ - step->more_tokens - = icu_tokenizer_next_token(step->u.tokenizer, - step->buf16, status); - - /* make sure to get new previous token if this one had been used up - by recursive call to _same_ step */ - - if (!step->more_tokens) - { - step->more_tokens = icu_chain_step_next_token(chain, step, status); - return step->more_tokens; /* avoid one token 
count too much! */ - } - break; - default: - return 0; - break; - } - - if (U_FAILURE(*status)) - return 0; - - /* if token disappered into thin air, tell caller */ - /* if (!step->buf16->utf16_len && !step->more_tokens) */ - /* return 0; */ - - return 1; -} - - -int icu_chain_assign_cstr(struct icu_chain * chain, - const char * src8cstr, - UErrorCode *status) -{ - struct icu_chain_step * stp = 0; - - if (!chain || !src8cstr) - return 0; - - chain->src8cstr = src8cstr; - - stp = chain->steps; - - /* clear token count */ - chain->token_count = 0; - - /* clear all steps stop states */ - while (stp) - { - stp->more_tokens = 1; - stp->need_new_token = 1; - stp = stp->previous; - } - - /* finally convert UTF8 to UTF16 string if needed */ - if (chain->steps || chain->sort) - icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status); - - if (U_FAILURE(*status)) - return 0; - - return 1; -} - - - -int icu_chain_next_token(struct icu_chain * chain, - UErrorCode *status) -{ - int got_token = 0; - - *status = U_ZERO_ERROR; - - if (!chain) - return 0; - - /* special case with no steps - same as index type binary */ - if (!chain->steps) - { - if (chain->token_count) - return 0; - else - { - chain->token_count++; - - if (chain->sort) - icu_sortkey8_from_utf16(chain->coll, - chain->sort8, chain->steps->buf16, - status); - return chain->token_count; - } - } - /* usual case, one or more icu chain steps existing */ - else - { - while(!got_token && chain->steps && chain->steps->more_tokens) - got_token = icu_chain_step_next_token(chain, chain->steps, status); - - if (got_token) - { - chain->token_count++; - - icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status); - - if (chain->sort) - icu_sortkey8_from_utf16(chain->coll, - chain->sort8, chain->steps->buf16, - status); - - return chain->token_count; - } - } - - return 0; -} - -int icu_chain_token_number(struct icu_chain * chain) -{ - if (!chain) - return 0; - - return chain->token_count; -} - - -const char * 
icu_chain_token_display(struct icu_chain * chain) -{ - if (chain->display8) - return icu_buf_utf8_to_cstr(chain->display8); - - return 0; -} - -const char * icu_chain_token_norm(struct icu_chain * chain) -{ - if (!chain->steps) - return chain->src8cstr; - - if (chain->norm8) - return icu_buf_utf8_to_cstr(chain->norm8); - - return 0; -} - -const char * icu_chain_token_sortkey(struct icu_chain * chain) -{ - if (chain->sort8) - return icu_buf_utf8_to_cstr(chain->sort8); - - return 0; -} - -const UCollator * icu_chain_get_coll(struct icu_chain * chain) -{ - return chain->coll; -} - -#endif /* YAZ_HAVE_ICU */ - -/* - * Local variables: - * c-basic-offset: 4 - * c-file-style: "Stroustrup" - * indent-tabs-mode: nil - * End: - * vim: shiftwidth=4 tabstop=8 expandtab - */ - diff --git a/src/icu_casemap.c b/src/icu_casemap.c new file mode 100644 index 0000000..4c50f69 --- /dev/null +++ b/src/icu_casemap.c @@ -0,0 +1,184 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. 
+ */ + +/** + * \file + * \brief ICU character case (u_strToUpper, etc) + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +struct icu_casemap +{ + char action; +}; + +struct icu_casemap * icu_casemap_create(char action, UErrorCode *status) +{ + struct icu_casemap * casemap + = (struct icu_casemap *) xmalloc(sizeof(struct icu_casemap)); + casemap->action = action; + + switch (casemap->action) + { + case 'l': + case 'L': + case 'u': + case 'U': + case 't': + case 'T': + case 'f': + case 'F': + break; + default: + icu_casemap_destroy(casemap); + return 0; + } + return casemap; +} + +void icu_casemap_destroy(struct icu_casemap * casemap) +{ + xfree(casemap); +} + +int icu_casemap_casemap(struct icu_casemap * casemap, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + UErrorCode *status, + const char *locale) +{ + if(!casemap) + return 0; + + return icu_utf16_casemap(dest16, src16, locale, + casemap->action, status); +} + +int icu_utf16_casemap(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + const char *locale, char action, + UErrorCode *status) +{ + int32_t dest16_len = 0; + + if (!src16->utf16_len) + { /* guarding for empty source string */ + if (dest16->utf16) + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + return U_ZERO_ERROR; + } + + switch (action) + { + case 'l': + case 'L': + dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 'u': + case 'U': + dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 't': + case 'T': + dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + 0, locale, status); + break; + case 'f': + case 'F': + dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + 
U_FOLD_CASE_DEFAULT, status); + break; + + default: + return U_UNSUPPORTED_ERROR; + break; + } + + /* check for buffer overflow, resize and retry */ + if (*status == U_BUFFER_OVERFLOW_ERROR + && dest16 != src16 /* do not resize if in-place conversion */ + ) + { + icu_buf_utf16_resize(dest16, dest16_len * 2); + *status = U_ZERO_ERROR; + + switch (action) { + case 'l': + case 'L': + dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 'u': + case 'U': + dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + locale, status); + break; + case 't': + case 'T': + dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + 0, locale, status); + break; + case 'f': + case 'F': + dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, + src16->utf16, src16->utf16_len, + U_FOLD_CASE_DEFAULT, status); + break; + + default: + return U_UNSUPPORTED_ERROR; + break; + } + } + + if (U_SUCCESS(*status) + && dest16_len <= dest16->utf16_cap) + dest16->utf16_len = dest16_len; + else + { + if (dest16->utf16) + dest16->utf16[0] = (UChar) 0; + dest16->utf16_len = 0; + } + + return *status; +} + + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/icu_chain.c b/src/icu_chain.c new file mode 100644 index 0000000..73a7674 --- /dev/null +++ b/src/icu_chain.c @@ -0,0 +1,569 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. 
+ */ + +/** + * \file + * \brief ICU chain + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + +enum icu_chain_step_type { + ICU_chain_step_type_none, + ICU_chain_step_type_display, /* convert to utf8 display format */ + ICU_chain_step_type_casemap, /* apply utf16 charmap */ + ICU_chain_step_type_transform, /* apply utf16 transform */ + ICU_chain_step_type_tokenize, /* apply utf16 tokenization */ + ICU_chain_step_type_transliterate /* apply utf16 tokenization */ +}; + +struct icu_chain_step +{ + /* type and action object */ + enum icu_chain_step_type type; + union { + struct icu_casemap * casemap; + struct icu_transform * transform; + struct icu_tokenizer * tokenizer; + } u; + /* temprary post-action utf16 buffer */ + struct icu_buf_utf16 * buf16; + struct icu_chain_step * previous; + int more_tokens; + int need_new_token; +}; + +struct icu_chain +{ + char *locale; + int sort; + + const char * src8cstr; + + UCollator * coll; + + /* number of tokens returned so far */ + int32_t token_count; + + /* utf8 output buffers */ + struct icu_buf_utf8 * display8; + struct icu_buf_utf8 * norm8; + struct icu_buf_utf8 * sort8; + + /* utf16 source buffer */ + struct icu_buf_utf16 * src16; + + /* linked list of chain steps */ + struct icu_chain_step * steps; +}; + +int icu_check_status(UErrorCode status) +{ + if (U_FAILURE(status)) + { + yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status)); + return 0; + } + return 1; +} + +static struct icu_chain_step *icu_chain_step_create( + struct icu_chain * chain, enum icu_chain_step_type type, + const uint8_t * rule, struct icu_buf_utf16 * buf16, + UErrorCode *status) +{ + struct icu_chain_step * step = 0; + + if(!chain || !type || !rule) + return 0; + + step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step)); + + step->type = type; + + step->buf16 = buf16; + 
+ /* create auxilary objects */ + switch (step->type) + { + case ICU_chain_step_type_display: + break; + case ICU_chain_step_type_casemap: + step->u.casemap = icu_casemap_create(rule[0], status); + break; + case ICU_chain_step_type_transform: + /* rule omitted. Only ID used */ + step->u.transform = icu_transform_create((const char *) rule, 'f', + 0, status); + break; + case ICU_chain_step_type_tokenize: + step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, + (char) rule[0], status); + break; + case ICU_chain_step_type_transliterate: + /* we pass a dummy ID to utrans_openU.. */ + step->u.transform = icu_transform_create("custom", 'f', + (const char *) rule, status); + break; + default: + break; + } + return step; +} + + +static void icu_chain_step_destroy(struct icu_chain_step * step) +{ + if (!step) + return; + + icu_chain_step_destroy(step->previous); + + switch (step->type) + { + case ICU_chain_step_type_display: + break; + case ICU_chain_step_type_casemap: + icu_casemap_destroy(step->u.casemap); + icu_buf_utf16_destroy(step->buf16); + break; + case ICU_chain_step_type_transform: + case ICU_chain_step_type_transliterate: + icu_transform_destroy(step->u.transform); + icu_buf_utf16_destroy(step->buf16); + break; + case ICU_chain_step_type_tokenize: + icu_tokenizer_destroy(step->u.tokenizer); + icu_buf_utf16_destroy(step->buf16); + break; + default: + break; + } + xfree(step); +} + +/** + * Allocates an ICU chain for \a locale and opens its collator. + * *status is reset to U_ZERO_ERROR first. Returns 0 with *status set + * when ucol_open() fails; the partially built chain is released on + * that path. + */ +struct icu_chain *icu_chain_create(const char *locale, int sort, + UErrorCode * status) +{ + struct icu_chain * chain + = (struct icu_chain *) xmalloc(sizeof(struct icu_chain)); + + *status = U_ZERO_ERROR; + + chain->locale = xstrdup(locale); + + chain->sort = sort; + + chain->coll = ucol_open((const char *) chain->locale, status); + + if (U_FAILURE(*status)) + { + /* only the locale copy and the chain itself exist at this point: + free them here instead of leaking them */ + xfree(chain->locale); + xfree(chain); + return 0; + } + + chain->token_count = 0; + + chain->src8cstr = 0; + + chain->display8 = icu_buf_utf8_create(0); + chain->norm8 = icu_buf_utf8_create(0); + chain->sort8 = icu_buf_utf8_create(0); + + chain->src16 = 
icu_buf_utf16_create(0); + + chain->steps = 0; + + return chain; +} + +void icu_chain_destroy(struct icu_chain * chain) +{ + if (chain) + { + if (chain->coll) + ucol_close(chain->coll); + + icu_buf_utf8_destroy(chain->display8); + icu_buf_utf8_destroy(chain->norm8); + icu_buf_utf8_destroy(chain->sort8); + + icu_buf_utf16_destroy(chain->src16); + + icu_chain_step_destroy(chain->steps); + xfree(chain->locale); + xfree(chain); + } +} + +static struct icu_chain_step *icu_chain_insert_step( + struct icu_chain * chain, enum icu_chain_step_type type, + const uint8_t * rule, UErrorCode *status); + +struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node, + int sort, + UErrorCode * status) +{ + xmlNode *node = 0; + struct icu_chain * chain = 0; + + *status = U_ZERO_ERROR; + + if (!xml_node ||xml_node->type != XML_ELEMENT_NODE) + return 0; + + { + xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node, + (xmlChar *) "locale"); + + if (xml_locale) + { + chain = icu_chain_create((const char *) xml_locale, sort, status); + xmlFree(xml_locale); + } + + } + if (!chain) + return 0; + + for (node = xml_node->children; node; node = node->next) + { + xmlChar *xml_rule; + struct icu_chain_step * step = 0; + + if (node->type != XML_ELEMENT_NODE) + continue; + + xml_rule = xmlGetProp(node, (xmlChar *) "rule"); + + if (!strcmp((const char *) node->name, "casemap")) + step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, + (const uint8_t *) xml_rule, status); + else if (!strcmp((const char *) node->name, "transform")) + step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, + (const uint8_t *) xml_rule, status); + else if (!strcmp((const char *) node->name, "transliterate")) + step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate, + (const uint8_t *) xml_rule, status); + else if (!strcmp((const char *) node->name, "tokenize")) + step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, + (const uint8_t *) xml_rule, status); + else if 
(!strcmp((const char *) node->name, "display")) + step = icu_chain_insert_step(chain, ICU_chain_step_type_display, + (const uint8_t *) "", status); + else if (!strcmp((const char *) node->name, "normalize")) + { + yaz_log(YLOG_WARN, "Element %s is deprecated. " + "Use transform instead", node->name); + step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, + (const uint8_t *) xml_rule, status); + } + else if (!strcmp((const char *) node->name, "index") + || !strcmp((const char *) node->name, "sortkey")) + { + yaz_log(YLOG_WARN, "Element %s is no longer needed. " + "Remove it from the configuration", node->name); + } + else + { + yaz_log(YLOG_WARN, "Unknown element %s", node->name); + /* xml_rule is otherwise only freed after this if/else chain, + so release it before bailing out */ + xmlFree(xml_rule); + icu_chain_destroy(chain); + return 0; + } + xmlFree(xml_rule); + if (step && U_FAILURE(*status)) + { + icu_chain_destroy(chain); + return 0; + } + } + return chain; +} + +static struct icu_chain_step *icu_chain_insert_step( + struct icu_chain * chain, enum icu_chain_step_type type, + const uint8_t * rule, UErrorCode *status) +{ + struct icu_chain_step * step = 0; + struct icu_buf_utf16 * src16 = 0; + struct icu_buf_utf16 * buf16 = 0; + + if (!chain || !type || !rule) + return 0; + + /* assign utf16 src buffers as needed */ + if (chain->steps && chain->steps->buf16) + src16 = chain->steps->buf16; + else if (chain->src16) + src16 = chain->src16; + else + return 0; + + /* create utf16 destination buffers as needed, or */ + switch (type) + { + case ICU_chain_step_type_display: + buf16 = src16; + break; + case ICU_chain_step_type_casemap: + buf16 = icu_buf_utf16_create(0); + break; + case ICU_chain_step_type_transform: + case ICU_chain_step_type_transliterate: + buf16 = icu_buf_utf16_create(0); + break; + case ICU_chain_step_type_tokenize: + buf16 = icu_buf_utf16_create(0); + break; + break; + default: + break; + } + /* create actual chain step with this buffer */ + step = icu_chain_step_create(chain, type, rule, buf16, status); + + step->previous = chain->steps; + chain->steps 
= step; + + return step; +} + +static int icu_chain_step_next_token(struct icu_chain * chain, + struct icu_chain_step * step, + UErrorCode *status) +{ + struct icu_buf_utf16 * src16 = 0; + int got_new_token = 0; + + if (!chain || !chain->src16 || !step || !step->more_tokens) + return 0; + + /* assign utf16 src buffers as needed, advance in previous steps + tokens until non-zero token met, and setting stop condition */ + + if (step->previous) + { + src16 = step->previous->buf16; + /* tokens might be killed in previous steps, therefore looping */ + + while (step->need_new_token + && step->previous->more_tokens + && !got_new_token) + got_new_token + = icu_chain_step_next_token(chain, step->previous, status); + } + else + { /* first step can only work once on chain->src16 input buffer */ + src16 = chain->src16; + step->more_tokens = 0; + got_new_token = 1; + } + + if (!src16) + return 0; + + /* stop if nothing to process */ + if (step->need_new_token && !got_new_token) + { + step->more_tokens = 0; + return 0; + } + + /* either an old token not finished yet, or a new token, thus + perform the work, eventually put this steps output in + step->buf16 or the chains UTF8 output buffers */ + + switch (step->type) + { + case ICU_chain_step_type_display: + icu_utf16_to_utf8(chain->display8, src16, status); + break; + case ICU_chain_step_type_casemap: + icu_casemap_casemap(step->u.casemap, + step->buf16, src16, status, + chain->locale); + break; + case ICU_chain_step_type_transform: + case ICU_chain_step_type_transliterate: + icu_transform_trans(step->u.transform, + step->buf16, src16, status); + break; + case ICU_chain_step_type_tokenize: + /* attach to new src16 token only first time during splitting */ + if (step->need_new_token) + { + icu_tokenizer_attach(step->u.tokenizer, src16, status); + step->need_new_token = 0; + } + + /* splitting one src16 token into multiple buf16 tokens */ + step->more_tokens + = icu_tokenizer_next_token(step->u.tokenizer, + step->buf16, status); + 
+ /* make sure to get new previous token if this one had been used up + by recursive call to _same_ step */ + + if (!step->more_tokens) + { + step->more_tokens = icu_chain_step_next_token(chain, step, status); + return step->more_tokens; /* avoid one token count too much! */ + } + break; + default: + return 0; + break; + } + + if (U_FAILURE(*status)) + return 0; + + /* if token disappered into thin air, tell caller */ + /* if (!step->buf16->utf16_len && !step->more_tokens) */ + /* return 0; */ + + return 1; +} + +int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr, + UErrorCode *status) +{ + struct icu_chain_step * stp = 0; + + if (!chain || !src8cstr) + return 0; + + chain->src8cstr = src8cstr; + + stp = chain->steps; + + /* clear token count */ + chain->token_count = 0; + + /* clear all steps stop states */ + while (stp) + { + stp->more_tokens = 1; + stp->need_new_token = 1; + stp = stp->previous; + } + + /* finally convert UTF8 to UTF16 string if needed */ + if (chain->steps || chain->sort) + icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status); + + if (U_FAILURE(*status)) + return 0; + + return 1; +} + +/** + * Advances the chain to its next token. + * Resets *status, then returns the updated token count when a token + * was produced and 0 when \a chain is null or no token is left. + */ +int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status) +{ + int got_token = 0; + + *status = U_ZERO_ERROR; + + if (!chain) + return 0; + + /* special case with no steps - same as index type binary */ + if (!chain->steps) + { + if (chain->token_count) + return 0; + else + { + chain->token_count++; + + /* chain->steps is null in this branch, so the sort key must be + built from the chain's own utf16 source buffer */ + if (chain->sort) + icu_sortkey8_from_utf16(chain->coll, + chain->sort8, chain->src16, + status); + return chain->token_count; + } + } + /* usual case, one or more icu chain steps existing */ + else + { + while (!got_token && chain->steps && chain->steps->more_tokens) + got_token = icu_chain_step_next_token(chain, chain->steps, status); + + if (got_token) + { + chain->token_count++; + + icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status); + + if (chain->sort) + icu_sortkey8_from_utf16(chain->coll, + 
chain->sort8, chain->steps->buf16, + status); + return chain->token_count; + } + } + + return 0; +} + +int icu_chain_token_number(struct icu_chain * chain) +{ + if (!chain) + return 0; + + return chain->token_count; +} + +const char * icu_chain_token_display(struct icu_chain * chain) +{ + if (chain->display8) + return icu_buf_utf8_to_cstr(chain->display8); + + return 0; +} + +const char * icu_chain_token_norm(struct icu_chain * chain) +{ + if (!chain->steps) + return chain->src8cstr; + + if (chain->norm8) + return icu_buf_utf8_to_cstr(chain->norm8); + + return 0; +} + +const char * icu_chain_token_sortkey(struct icu_chain * chain) +{ + if (chain->sort8) + return icu_buf_utf8_to_cstr(chain->sort8); + + return 0; +} + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/icu_sortkey.c b/src/icu_sortkey.c new file mode 100644 index 0000000..a00c473 --- /dev/null +++ b/src/icu_sortkey.c @@ -0,0 +1,65 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. 
+ */ + +/** + * \file + * \brief sortkey utility based on ICU Collator + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + +void icu_sortkey8_from_utf16(UCollator *coll, + struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status) +{ + int32_t sortkey_len = 0; + + sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, + dest8->utf8, dest8->utf8_cap); + + /* check for buffer overflow, resize and retry */ + if (sortkey_len > dest8->utf8_cap) + { + icu_buf_utf8_resize(dest8, sortkey_len * 2); + sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, + dest8->utf8, dest8->utf8_cap); + } + + if (U_SUCCESS(*status) + && sortkey_len > 0) + dest8->utf8_len = sortkey_len; + else + icu_buf_utf8_clear(dest8); +} + + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c new file mode 100644 index 0000000..c7bba7f --- /dev/null +++ b/src/icu_tokenizer.c @@ -0,0 +1,188 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. 
+ */ + +/** + * \file + * \brief ICU tokenization - using ubrk_-functions from ICU + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + +/** + * Allocates a tokenizer and opens the ICU break iterator selected by + * \a action (l, s, w, c or t, either case). + * Returns 0 when the action is unknown (*status = U_UNSUPPORTED_ERROR) + * or when *status does not signal success; the tokenizer is freed on + * both paths. + */ +struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action, + UErrorCode *status) +{ + struct icu_tokenizer * tokenizer + = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer)); + + tokenizer->action = action; + tokenizer->bi = 0; + tokenizer->buf16 = 0; + tokenizer->token_count = 0; + tokenizer->token_id = 0; + tokenizer->token_start = 0; + tokenizer->token_end = 0; + + switch (tokenizer->action) + { + case 'l': + case 'L': + tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status); + break; + case 's': + case 'S': + tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status); + break; + case 'w': + case 'W': + tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status); + break; + case 'c': + case 'C': + tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status); + break; + case 't': + case 'T': + tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status); + break; + default: + /* leave through the shared failure path below so the tokenizer + allocated above is released */ + *status = U_UNSUPPORTED_ERROR; + break; + } + + /* ICU error stuff is a very funny business */ + if (U_SUCCESS(*status)) + return tokenizer; + + /* freeing if failed */ + icu_tokenizer_destroy(tokenizer); + return 0; +} + +void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) +{ + if (tokenizer) + { + if (tokenizer->bi) + ubrk_close(tokenizer->bi); + xfree(tokenizer); + } +} + +int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * src16, + UErrorCode *status) +{ + if (!tokenizer || !tokenizer->bi || !src16) + return 0; + + tokenizer->buf16 = src16; + tokenizer->token_count = 0; + tokenizer->token_id = 0; + tokenizer->token_start = 0; + tokenizer->token_end = 0; + + ubrk_setText(tokenizer->bi, src16->utf16, 
src16->utf16_len, status); + + if (U_FAILURE(*status)) + return 0; + + return 1; +} + +int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * tkn16, + UErrorCode *status) +{ + int32_t tkn_start = 0; + int32_t tkn_end = 0; + int32_t tkn_len = 0; + + if (!tokenizer || !tokenizer->bi + || !tokenizer->buf16 || !tokenizer->buf16->utf16_len) + return 0; + /* + never change tokenizer->buf16 and keep always invariant + 0 <= tokenizer->token_start + <= tokenizer->token_end + <= tokenizer->buf16->utf16_len + returns length of token + */ + + if (0 == tokenizer->token_end) /* first call */ + tkn_start = ubrk_first(tokenizer->bi); + else /* successive calls */ + tkn_start = tokenizer->token_end; + + /* get next position */ + tkn_end = ubrk_next(tokenizer->bi); + + /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */ + if (UBRK_DONE == tkn_end) + tkn_end = tokenizer->buf16->utf16_len; + + /* copy out if everything is well */ + if (U_FAILURE(*status)) + return 0; + + /* everything OK, now update internal state */ + tkn_len = tkn_end - tkn_start; + + if (0 < tkn_len) + { + tokenizer->token_count++; + tokenizer->token_id++; + } else { + tokenizer->token_id = 0; + } + tokenizer->token_start = tkn_start; + tokenizer->token_end = tkn_end; + + /* copying into token buffer if it exists */ + if (tkn16){ + if (tkn16->utf16_cap < tkn_len) + icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2); + + u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], + tkn_len); + + tkn16->utf16_len = tkn_len; + } + + return tkn_len; +} + +int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_count; +} + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/icu_transform.c b/src/icu_transform.c new file mode 100644 index 0000000..10ace94 --- /dev/null +++ 
b/src/icu_transform.c @@ -0,0 +1,140 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. + */ + +/** + * \file + * \brief ICU transforms - using utrans_-functions from ICU + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +#include + +struct icu_transform +{ + char action; + UParseError parse_error; + UTransliterator * trans; +}; + +struct icu_transform * icu_transform_create(const char *id, char action, + const char *rules, + UErrorCode *status) +{ + struct icu_buf_utf16 *id16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 *rules16 = icu_buf_utf16_create(0); + + struct icu_transform *transform + = (struct icu_transform *) xmalloc(sizeof(struct icu_transform)); + + transform->action = action; + transform->trans = 0; + + if (id) + icu_utf16_from_utf8_cstr(id16, id, status); + if (rules) + icu_utf16_from_utf8_cstr(rules16, rules, status); + + switch (transform->action) + { + case 'f': + case 'F': + transform->trans + = utrans_openU(id16->utf16, + id16->utf16_len, + UTRANS_FORWARD, + rules16->utf16, + rules16->utf16_len, + &transform->parse_error, status); + break; + case 'r': + case 'R': + transform->trans + = utrans_openU(id16->utf16, + id16->utf16_len, + UTRANS_REVERSE , + rules16->utf16, + rules16->utf16_len, + &transform->parse_error, status); + break; + default: + *status = U_UNSUPPORTED_ERROR; + break; + } + icu_buf_utf16_destroy(rules16); + icu_buf_utf16_destroy(id16); + + if (U_SUCCESS(*status)) + return transform; + + /* freeing if failed */ + icu_transform_destroy(transform); + return 0; +} + +void icu_transform_destroy(struct icu_transform * transform) +{ + if (transform) + { + if (transform->trans) + utrans_close(transform->trans); + xfree(transform); + } +} + +int icu_transform_trans(struct icu_transform * transform, + struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16, + 
UErrorCode *status) +{ + /* utrans_transUChars() updates its limit argument in place, so it + gets a local copy instead of the source buffer's length field */ + int32_t limit = 0; + + if (!transform || !transform->trans + || !src16 || !dest16) + return 0; + + if (!src16->utf16_len) + { /* guarding for empty source string */ + icu_buf_utf16_clear(dest16); + return 0; + } + + if (!icu_buf_utf16_copy(dest16, src16)) + return 0; + + /* transliterate the whole copied text: [0, source length) */ + limit = src16->utf16_len; + + utrans_transUChars (transform->trans, + dest16->utf16, &(dest16->utf16_len), + dest16->utf16_cap, + 0, &limit, status); + + if (U_FAILURE(*status)) + icu_buf_utf16_clear(dest16); + + return dest16->utf16_len; +} + + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/icu_utf16.c b/src/icu_utf16.c new file mode 100644 index 0000000..55766a2 --- /dev/null +++ b/src/icu_utf16.c @@ -0,0 +1,120 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. + */ + +/** + * \file + * \brief UTF-16 string utilities for ICU + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + +struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity) +{ + struct icu_buf_utf16 * buf16 + = (struct icu_buf_utf16 *) xmalloc(sizeof(struct icu_buf_utf16)); + + buf16->utf16 = 0; + buf16->utf16_len = 0; + buf16->utf16_cap = 0; + + if (capacity > 0) + { + buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity); + buf16->utf16[0] = (UChar) 0; + buf16->utf16_cap = capacity; + } + return buf16; +} + +struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16) +{ + if (buf16) + { + if (buf16->utf16) + buf16->utf16[0] = (UChar) 0; + buf16->utf16_len = 0; + } + return buf16; +} + +struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, + size_t capacity) +{ + if (!buf16) + return 0; + + if (capacity > 0) + { + if (0 == 
buf16->utf16) + buf16->utf16 = (UChar *) xmalloc(sizeof(UChar) * capacity); + else + buf16->utf16 + = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity); + + icu_buf_utf16_clear(buf16); + buf16->utf16_cap = capacity; + } + else + { + xfree(buf16->utf16); + buf16->utf16 = 0; + buf16->utf16_len = 0; + buf16->utf16_cap = 0; + } + + return buf16; +} + + +struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, + struct icu_buf_utf16 * src16) +{ + if (!dest16 || !src16 || dest16 == src16) + return 0; + + if (dest16->utf16_cap < src16->utf16_len) + icu_buf_utf16_resize(dest16, src16->utf16_len * 2); + + u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len); + dest16->utf16_len = src16->utf16_len; + + return dest16; +} + +void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16) +{ + if (buf16) + xfree(buf16->utf16); + xfree(buf16); +} + + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/src/icu_utf8.c b/src/icu_utf8.c new file mode 100644 index 0000000..1a4ce74 --- /dev/null +++ b/src/icu_utf8.c @@ -0,0 +1,175 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2009 Index Data + * See the file LICENSE for details. 
+ */ + +/** + * \file + * \brief UTF-8 string utilities for ICU + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#if YAZ_HAVE_ICU +#include + +#include + +#include + +#include +#include +#include + +#include /* some more string fcns*/ +#include /* char names */ + +struct icu_buf_utf8 *icu_buf_utf8_create(size_t capacity) +{ + struct icu_buf_utf8 * buf8 + = (struct icu_buf_utf8 *) xmalloc(sizeof(struct icu_buf_utf8)); + + buf8->utf8 = 0; + buf8->utf8_len = 0; + buf8->utf8_cap = 0; + + if (capacity > 0) + { + buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity); + buf8->utf8[0] = (uint8_t) 0; + buf8->utf8_cap = capacity; + } + return buf8; +} + +struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8) +{ + if (buf8) + { + if (buf8->utf8) + buf8->utf8[0] = (uint8_t) 0; + buf8->utf8_len = 0; + } + return buf8; +} + +struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, + size_t capacity) +{ + if (!buf8) + return 0; + + if (capacity > 0){ + if (0 == buf8->utf8) + buf8->utf8 = (uint8_t *) xmalloc(sizeof(uint8_t) * capacity); + else + buf8->utf8 + = (uint8_t *) xrealloc(buf8->utf8, sizeof(uint8_t) * capacity); + + buf8->utf8_cap = capacity; + } + else { + xfree(buf8->utf8); + buf8->utf8 = 0; + buf8->utf8_len = 0; + buf8->utf8_cap = 0; + } + + return buf8; +} + +const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8) +{ + if (!src8 || src8->utf8_len == 0) + return ""; + + if (src8->utf8_len == src8->utf8_cap) + src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1); + + src8->utf8[src8->utf8_len] = '\0'; + + return (const char *) src8->utf8; +} + +void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8) +{ + if (buf8) + xfree(buf8->utf8); + xfree(buf8); +} + +UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, + const char * src8cstr, + UErrorCode * status) +{ + size_t src8cstr_len = 0; + int32_t utf16_len = 0; + + *status = U_ZERO_ERROR; + src8cstr_len = strlen(src8cstr); + + u_strFromUTF8(dest16->utf16, 
dest16->utf16_cap, + &utf16_len, + src8cstr, src8cstr_len, status); + + /* check for buffer overflow, resize and retry */ + if (*status == U_BUFFER_OVERFLOW_ERROR) + { + icu_buf_utf16_resize(dest16, utf16_len * 2); + *status = U_ZERO_ERROR; + u_strFromUTF8(dest16->utf16, dest16->utf16_cap, + &utf16_len, + src8cstr, src8cstr_len, status); + } + + if (U_SUCCESS(*status) + && utf16_len <= dest16->utf16_cap) + dest16->utf16_len = utf16_len; + else + icu_buf_utf16_clear(dest16); + + return *status; +} + +UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, + struct icu_buf_utf16 * src16, + UErrorCode * status) +{ + int32_t utf8_len = 0; + + u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, + &utf8_len, + src16->utf16, src16->utf16_len, status); + + /* check for buffer overflow, resize and retry */ + if (*status == U_BUFFER_OVERFLOW_ERROR) + { + icu_buf_utf8_resize(dest8, utf8_len * 2); + *status = U_ZERO_ERROR; + u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, + &utf8_len, + src16->utf16, src16->utf16_len, status); + } + + if (U_SUCCESS(*status) + && utf8_len <= dest8->utf8_cap) + dest8->utf8_len = utf8_len; + else + icu_buf_utf8_clear(dest8); + + return *status; +} + +#endif /* YAZ_HAVE_ICU */ + +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ + diff --git a/test/tst_icu_I18N.c b/test/tst_icu_I18N.c index ef3cdc5..bbdc993 100644 --- a/test/tst_icu_I18N.c +++ b/test/tst_icu_I18N.c @@ -23,9 +23,6 @@ #include #include -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - - #define MAX_KEY_SIZE 256 struct icu_termmap { @@ -34,7 +31,6 @@ struct icu_termmap }; - int icu_termmap_cmp(const void *vp1, const void *vp2) { struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1; @@ -45,9 +41,7 @@ int icu_termmap_cmp(const void *vp1, const void *vp2) cmp = strcmp((const char *)itmp1->sort_key, (const char *)itmp2->sort_key); return cmp; -}; - - +} 
int test_icu_casemap(const char * locale, char action, @@ -73,9 +67,7 @@ int test_icu_casemap(const char * locale, char action, /* converting to UTF8 */ icu_utf16_to_utf8(dest8, dest16, &status); - - /* determine success */ if (dest8->utf8 && (dest8->utf8_len == strlen(chk8cstr)) @@ -98,18 +90,12 @@ int test_icu_casemap(const char * locale, char action, icu_buf_utf8_destroy(dest8); icu_buf_utf16_destroy(src16); icu_buf_utf16_destroy(dest16); - - + return success; } - - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - void test_icu_I18N_casemap(int argc, char **argv) { - /* Locale 'en' */ /* successful tests */ @@ -129,7 +115,6 @@ void test_icu_I18N_casemap(int argc, char **argv) "A ReD fOx hunTS sQUirriLs", "A Red Fox Hunts Squirrils")); - /* Locale 'da' */ /* success expected */ @@ -170,9 +155,6 @@ void test_icu_I18N_casemap(int argc, char **argv) } - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - int test_icu_sortmap(const char * locale, int src_list_len, const char ** src_list, const char ** chk_list) { @@ -194,39 +176,39 @@ int test_icu_sortmap(const char * locale, int src_list_len, return 0; /* assigning display terms and sort keys using buf 8 and buf16 */ - for( i = 0; i < src_list_len; i++) - { + for (i = 0; i < src_list_len; i++) + { - list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap)); + list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap)); - /* copy display term */ - strcpy(list[i]->disp_term, src_list[i]); + /* copy display term */ + strcpy(list[i]->disp_term, src_list[i]); - /* transforming to UTF16 */ - icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status); - icu_check_status(status); + /* transforming to UTF16 */ + icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status); + icu_check_status(status); - /* computing sortkeys */ - icu_sortkey8_from_utf16(coll, buf8, buf16, &status); - icu_check_status(status); + /* computing sortkeys */ + icu_sortkey8_from_utf16(coll, 
buf8, buf16, &status); + icu_check_status(status); - /* assigning sortkeys */ - memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len); - } - + /* assigning sortkeys */ + memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len); + } /* do the sorting */ - qsort(list, src_list_len, - sizeof(struct icu_termmap *), icu_termmap_cmp); + qsort(list, src_list_len, sizeof(struct icu_termmap *), icu_termmap_cmp); /* checking correct sorting */ - for (i = 0; i < src_list_len; i++){ + for (i = 0; i < src_list_len; i++) + { if (0 != strcmp(list[i]->disp_term, chk_list[i])){ success = 0; } } - if(!success){ + if (!success) + { printf("\nERROR\n"); printf("Input str: '%s' : ", locale); for (i = 0; i < src_list_len; i++) { @@ -245,12 +227,9 @@ int test_icu_sortmap(const char * locale, int src_list_len, printf("\n"); } - - - for( i = 0; i < src_list_len; i++) + for (i = 0; i < src_list_len; i++) free(list[i]); - - + ucol_close(coll); icu_buf_utf8_destroy(buf8); @@ -259,12 +238,8 @@ int test_icu_sortmap(const char * locale, int src_list_len, return success; } - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - void test_icu_I18N_sortmap(int argc, char **argv) { - /* successful tests */ size_t en_1_len = 6; const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"}; @@ -295,15 +270,9 @@ void test_icu_I18N_sortmap(int argc, char **argv) } - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - - - - int test_icu_normalizer(const char * rules8cstr, - const char * src8cstr, - const char * chk8cstr) + const char * src8cstr, + const char * chk8cstr) { int success = 0; @@ -329,7 +298,8 @@ int test_icu_normalizer(const char * rules8cstr, if(!strcmp((const char *) dest8->utf8, (const char *) chk8cstr)) success = 1; - else { + else + { success = 0; printf("Normalization\n"); printf("Rules: '%s'\n", rules8cstr); @@ -337,7 +307,6 @@ int test_icu_normalizer(const char * rules8cstr, printf("Normalized: '%s'\n", dest8->utf8); printf("Expected: '%s'\n", chk8cstr); } - 
icu_transform_destroy(transform); icu_buf_utf16_destroy(src16); @@ -345,14 +314,10 @@ int test_icu_normalizer(const char * rules8cstr, icu_buf_utf8_destroy(dest8); return success; -}; - - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ +} void test_icu_I18N_normalizer(int argc, char **argv) { - YAZ_CHECK(test_icu_normalizer("[:Punctuation:] Any-Remove", "Don't shoot!", "Dont shoot")); @@ -378,18 +343,13 @@ void test_icu_I18N_normalizer(int argc, char **argv) " word4you? ", "word4you")); - YAZ_CHECK(test_icu_normalizer("NFD; [:Nonspacing Mark:] Remove; NFC", "à côté de l'alcôve ovoïde", "a cote de l'alcove ovoide")); - } - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - int test_icu_tokenizer(const char * locale, char action, - const char * src8cstr, int count) + const char * src8cstr, int count) { int success = 1; @@ -437,13 +397,8 @@ int test_icu_tokenizer(const char * locale, char action, return success; } - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - void test_icu_I18N_tokenizer(int argc, char **argv) { - - const char * en_str = "O Romeo, Romeo! wherefore art thou Romeo?"; @@ -452,8 +407,6 @@ void test_icu_I18N_tokenizer(int argc, char **argv) YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16)); YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41)); - - { const char * da_str = "Blåbærtærte. Denne kage stammer fra Finland. 
" @@ -464,10 +417,8 @@ void test_icu_I18N_tokenizer(int argc, char **argv) YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37)); YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110)); } - } - void test_icu_I18N_chain(int argc, char **argv) { const char * en_str @@ -476,7 +427,6 @@ void test_icu_I18N_chain(int argc, char **argv) UErrorCode status = U_ZERO_ERROR; struct icu_chain * chain = 0; - const char * xml_str = "" "" "" @@ -497,12 +447,13 @@ void test_icu_I18N_chain(int argc, char **argv) YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status)); - while (icu_chain_next_token(chain, &status)){ + while (icu_chain_next_token(chain, &status)) + { ; /* printf("%d '%s' '%s'\n", - icu_chain_token_number(chain), - icu_chain_token_norm(chain), - icu_chain_token_display(chain)); */ + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } YAZ_CHECK_EQ(icu_chain_token_number(chain), 7); @@ -510,12 +461,13 @@ void test_icu_I18N_chain(int argc, char **argv) YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); - while (icu_chain_next_token(chain, &status)){ + while (icu_chain_next_token(chain, &status)) + { ; /* printf("%d '%s' '%s'\n", - icu_chain_token_number(chain), - icu_chain_token_norm(chain), - icu_chain_token_display(chain)); */ + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } @@ -556,13 +508,13 @@ void test_bug_1140(void) chain, "O Romeo, Romeo! 
wherefore art thou\t Romeo?", &status)); - while (icu_chain_next_token(chain, &status)){ + while (icu_chain_next_token(chain, &status)) + { ; /* printf("%d '%s' '%s'\n", - icu_chain_token_number(chain), - icu_chain_token_norm(chain), - icu_chain_token_display(chain)); */ - + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } @@ -570,12 +522,13 @@ void test_bug_1140(void) YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); - while (icu_chain_next_token(chain, &status)){ - ; - /* printf("%d '%s' '%s'\n", - icu_chain_token_number(chain), - icu_chain_token_norm(chain), - icu_chain_token_display(chain)); */ + while (icu_chain_next_token(chain, &status)) + { + ; + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } /* we expect 'what' 'is' 'this', i.e. 3 tokens */ @@ -585,7 +538,6 @@ void test_bug_1140(void) } - void test_chain_empty_token(void) { UErrorCode status = U_ZERO_ERROR; @@ -609,12 +561,13 @@ void test_chain_empty_token(void) chain, "a string with 15 tokenss and 8 displays", &status)); - while (icu_chain_next_token(chain, &status)){ + while (icu_chain_next_token(chain, &status)) + { ; /* printf("%d '%s' '%s'\n", - icu_chain_token_number(chain), - icu_chain_token_norm(chain), - icu_chain_token_display(chain)); */ + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } YAZ_CHECK_EQ(icu_chain_token_number(chain), 15); @@ -646,12 +599,13 @@ void test_chain_empty_chain(void) chain, src8, &status)); - while (icu_chain_next_token(chain, &status)){ + while (icu_chain_next_token(chain, &status)) + { ; /* printf("%d '%s' '%s'\n", - icu_chain_token_number(chain), - icu_chain_token_norm(chain), - icu_chain_token_display(chain)); */ + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } YAZ_CHECK_EQ(icu_chain_token_number(chain), 1); 
@@ -659,17 +613,13 @@ void test_chain_empty_chain(void) dest8 = (char *) icu_chain_token_norm(chain); YAZ_CHECK_EQ(strcmp(src8, dest8), 0); - icu_chain_destroy(chain); } #endif /* YAZ_HAVE_ICU */ -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - int main(int argc, char **argv) { - YAZ_CHECK_INIT(argc, argv); YAZ_CHECK_LOG(); @@ -695,11 +645,6 @@ int main(int argc, char **argv) YAZ_CHECK_TERM; } - -/* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ - - - /* * Local variables: * c-basic-offset: 4