From 674986e09b1e2c5ea9670355aef21c813323d186 Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Thu, 25 Oct 2007 10:04:32 +0000 Subject: [PATCH] added special case with an empty ICU chain for 'raw' index processing without calling transliterators, case maps or tokenizers. It's optimized such that the following happens: if (!chain->sort), no utf8 to utf16 translation occurs, and the original cstring is returned for the icu_chain_get_norm() call. if (chain->sort), utf8 to utf16 translation is made, the original cstring is returned for the icu_chain_get_norm() call, and the utf16->sortkey conversion is run before calling icu_chain-get_sort8(). --- include/yaz/icu_I18N.h | 2 ++ src/icu_I18N.c | 60 ++++++++++++++++++++++++++++++++++-------------- test/tst_icu_I18N.c | 44 ++++++++++++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 18 deletions(-) diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h index 3abe6bb..f69714c 100644 --- a/include/yaz/icu_I18N.h +++ b/include/yaz/icu_I18N.h @@ -225,6 +225,8 @@ struct icu_chain uint8_t locale[16]; int sort; + const char * src8cstr; + UCollator * coll; /* number of tokens returned so far */ diff --git a/src/icu_I18N.c b/src/icu_I18N.c index 879fcd8..efc3cfc 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. 
* - * $Id: icu_I18N.c,v 1.7 2007-10-25 08:40:06 marc Exp $ + * $Id: icu_I18N.c,v 1.8 2007-10-25 10:04:32 marc Exp $ */ #if HAVE_CONFIG_H @@ -871,6 +871,8 @@ struct icu_chain * icu_chain_create(const uint8_t * locale, chain->token_count = 0; + chain->src8cstr = 0; + chain->display8 = icu_buf_utf8_create(0); chain->norm8 = icu_buf_utf8_create(0); chain->sort8 = icu_buf_utf8_create(0); @@ -879,7 +881,6 @@ struct icu_chain * icu_chain_create(const uint8_t * locale, chain->steps = 0; - return chain; } @@ -1119,6 +1120,8 @@ int icu_chain_assign_cstr(struct icu_chain * chain, if (!chain || !src8cstr) return 0; + chain->src8cstr = src8cstr; + stp = chain->steps; /* clear token count */ @@ -1131,8 +1134,9 @@ int icu_chain_assign_cstr(struct icu_chain * chain, stp = stp->previous; } - /* finally convert UTF8 to UTF16 string */ - icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status); + /* finally convert UTF8 to UTF16 string if needed */ + if (chain->steps || chain->sort) + icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status); if (U_FAILURE(*status)) return 0; @@ -1147,23 +1151,43 @@ int icu_chain_next_token(struct icu_chain * chain, { int got_token = 0; - if (!chain || !chain->steps) + if (!chain) return 0; - while(!got_token && chain->steps->more_tokens) - got_token = icu_chain_step_next_token(chain, chain->steps, status); - - if (got_token){ - chain->token_count++; - - icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status); + /* special case with no steps - same as index type binary */ + if (!chain->steps){ + if (chain->token_count) + return 0; + else { + chain->token_count++; + /* chain->steps is NULL here, so there is no step buffer: build the sort key from src16, which icu_chain_assign_cstr() filled because chain->sort is set */ + if (chain->sort) + icu_sortkey8_from_utf16(chain->coll, + chain->sort8, chain->src16, + status); + return chain->token_count; + } + } + /* usual case, one or more icu chain steps existing */ + else { - icu_sortkey8_from_utf16(chain->coll, - chain->sort8, chain->steps->buf16, status); + while(!got_token && chain->steps && chain->steps->more_tokens) + got_token = 
icu_chain_step_next_token(chain, chain->steps, status); + + if (got_token){ + chain->token_count++; - return chain->token_count; + icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status); + + if (chain->sort) + icu_sortkey8_from_utf16(chain->coll, + chain->sort8, chain->steps->buf16, + status); + + return chain->token_count; + } } - + return 0; } @@ -1176,7 +1200,6 @@ int icu_chain_get_token_count(struct icu_chain * chain) } - const char * icu_chain_get_display(struct icu_chain * chain) { if (chain->display8) @@ -1187,6 +1210,9 @@ const char * icu_chain_get_display(struct icu_chain * chain) const char * icu_chain_get_norm(struct icu_chain * chain) { + if (!chain->steps) + return chain->src8cstr; + if (chain->norm8) return icu_buf_utf8_to_cstr(chain->norm8); diff --git a/test/tst_icu_I18N.c b/test/tst_icu_I18N.c index cbef9e8..a00e15c 100644 --- a/test/tst_icu_I18N.c +++ b/test/tst_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: tst_icu_I18N.c,v 1.8 2007-10-25 08:42:21 marc Exp $ +/* $Id: tst_icu_I18N.c,v 1.9 2007-10-25 10:04:33 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. 
@@ -641,6 +641,47 @@ void test_chain_empty_token(void) icu_chain_destroy(chain); } +void test_chain_empty_chain(void) +{ + UErrorCode status = U_ZERO_ERROR; + struct icu_chain * chain = 0; + + const char * xml_str = "" + ""; + + const char * src8 = "some 5487 weired !¤%&(/& sTuFf"; + const char * dest8 = 0; + + xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); + xmlNode *xml_node = xmlDocGetRootElement(doc); + YAZ_CHECK(xml_node); + + chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", 0, &status); + + xmlFreeDoc(doc); + YAZ_CHECK(chain); + + YAZ_CHECK(icu_chain_assign_cstr( + chain, src8, + &status)); + + while (icu_chain_next_token(chain, &status)){ + ; + //printf("%d '%s' '%s'\n", + // icu_chain_get_token_count(chain), + // icu_chain_get_norm(chain), + // icu_chain_get_display(chain)); + } + + YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 1); + /* the single token of an empty chain is expected to be the unmodified input */ + dest8 = icu_chain_get_norm(chain); + YAZ_CHECK_EQ(strcmp(src8, dest8), 0); + + + icu_chain_destroy(chain); +} + #endif // HAVE_ICU /* DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 */ @@ -659,6 +700,7 @@ int main(int argc, char **argv) test_icu_I18N_tokenizer(argc, argv); test_icu_I18N_chain(argc, argv); test_chain_empty_token(); + test_chain_empty_chain(); test_bug_1140(); #else /* HAVE_ICU */ -- 1.7.10.4