From c5408040f72a5a23dc23d6c8d30ee61ea8b6effc Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Mon, 29 Oct 2007 13:50:57 +0000 Subject: [PATCH] Changed ICU chain to return all tokens, including those that became empty under normalization. Such tokens may still have display terms attached to them, which are needed by the snippet display and the scan term display. --- include/yaz/icu_I18N.h | 4 ---- src/icu_I18N.c | 16 ++++++------- test/tst_icu_I18N.c | 59 ++++++++++++++++++++++++------------------------ 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h index 61dbf70..c2c5042 100644 --- a/include/yaz/icu_I18N.h +++ b/include/yaz/icu_I18N.h @@ -273,10 +273,6 @@ int icu_chain_next_token(struct icu_chain * chain, int icu_chain_token_number(struct icu_chain * chain); -/*int icu_chain_token_start(struct icu_chain * chain); */ - -/*int icu_chain_token_end(struct icu_chain * chain); */ - const char * icu_chain_token_display(struct icu_chain * chain); const char * icu_chain_token_norm(struct icu_chain * chain); diff --git a/src/icu_I18N.c b/src/icu_I18N.c index a4aa244..932c1b2 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. 
* - * $Id: icu_I18N.c,v 1.9 2007-10-29 10:22:23 marc Exp $ + * $Id: icu_I18N.c,v 1.10 2007-10-29 13:50:57 marc Exp $ */ #if HAVE_CONFIG_H @@ -1035,6 +1035,7 @@ int icu_chain_step_next_token(struct icu_chain * chain, if (step->previous){ src16 = step->previous->buf16; /* tokens might be killed in previous steps, therefore looping */ + while (step->need_new_token && step->previous->more_tokens && !got_new_token) @@ -1088,8 +1089,10 @@ int icu_chain_step_next_token(struct icu_chain * chain, /* make sure to get new previous token if this one had been used up by recursive call to _same_ step */ - if (!step->more_tokens) + if (!step->more_tokens){ step->more_tokens = icu_chain_step_next_token(chain, step, status); + return step->more_tokens; // avoid one token count too much! + } break; default: @@ -1101,11 +1104,8 @@ int icu_chain_step_next_token(struct icu_chain * chain, return 0; /* if token disappered into thin air, tell caller */ - if (!step->buf16->utf16_len) - return 0; - - if (U_FAILURE(*status)) - return 0; + /* if (!step->buf16->utf16_len && !step->more_tokens) */ + /* return 0; */ return 1; } @@ -1173,7 +1173,7 @@ int icu_chain_next_token(struct icu_chain * chain, while(!got_token && chain->steps && chain->steps->more_tokens) got_token = icu_chain_step_next_token(chain, chain->steps, status); - + if (got_token){ chain->token_count++; diff --git a/test/tst_icu_I18N.c b/test/tst_icu_I18N.c index 07aab52..1d6e205 100644 --- a/test/tst_icu_I18N.c +++ b/test/tst_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: tst_icu_I18N.c,v 1.10 2007-10-29 10:22:23 marc Exp $ +/* $Id: tst_icu_I18N.c,v 1.11 2007-10-29 13:50:57 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. 
@@ -506,8 +506,6 @@ void test_icu_I18N_chain(int argc, char **argv) xmlNode *xml_node = xmlDocGetRootElement(doc); YAZ_CHECK(xml_node); - // printf("ICU chain:\ninput: '%s'\n", en_str); - chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", 0, &status); @@ -516,12 +514,13 @@ void test_icu_I18N_chain(int argc, char **argv) YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status)); + //printf("ICU chain:\ninput: '%s'\n", en_str); while (icu_chain_next_token(chain, &status)){ ; - // printf("%d '%s' '%s'\n", - // icu_chain_token_number(chain), - // icu_chain_token_norm(chain), - // icu_chain_token_display(chain)); + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } YAZ_CHECK_EQ(icu_chain_token_number(chain), 7); @@ -531,10 +530,10 @@ void test_icu_I18N_chain(int argc, char **argv) while (icu_chain_next_token(chain, &status)){ ; - //printf("%d '%s' '%s'\n", - // icu_chain_token_number(chain), - // icu_chain_token_norm(chain), - // icu_chain_token_display(chain)); + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } @@ -577,10 +576,10 @@ void test_bug_1140(void) while (icu_chain_next_token(chain, &status)){ ; - //printf("%d '%s' '%s'\n", - // icu_chain_token_number(chain), - // icu_chain_token_norm(chain), - // icu_chain_token_display(chain)); + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } @@ -591,10 +590,10 @@ void test_bug_1140(void) while (icu_chain_next_token(chain, &status)){ ; - //printf("%d '%s' '%s'\n", - // icu_chain_token_number(chain), - // icu_chain_token_norm(chain), - // icu_chain_token_display(chain)); + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } /* we expect 'what' 'is' 'this', i.e. 
3 tokens */ @@ -625,18 +624,18 @@ void test_chain_empty_token(void) YAZ_CHECK(chain); YAZ_CHECK(icu_chain_assign_cstr( - chain, "a string with 15 wordbreaks and 8 tokens", + chain, "a string with 15 tokenss and 8 displays", &status)); while (icu_chain_next_token(chain, &status)){ ; - //printf("%d '%s' '%s'\n", - // icu_chain_token_number(chain), - // icu_chain_token_norm(chain), - // icu_chain_token_display(chain)); + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } - YAZ_CHECK_EQ(icu_chain_token_number(chain), 8); + YAZ_CHECK_EQ(icu_chain_token_number(chain), 15); icu_chain_destroy(chain); } @@ -667,15 +666,15 @@ void test_chain_empty_chain(void) while (icu_chain_next_token(chain, &status)){ ; - //printf("%d '%s' '%s'\n", - // icu_chain_token_number(chain), - // icu_chain_token_norm(chain), - // icu_chain_token_display(chain)); + /* printf("%d '%s' '%s'\n", + icu_chain_token_number(chain), + icu_chain_token_norm(chain), + icu_chain_token_display(chain)); */ } YAZ_CHECK_EQ(icu_chain_token_number(chain), 1); - dest8 = icu_chain_token_norm(chain); + dest8 = (char *) icu_chain_token_norm(chain); YAZ_CHECK_EQ(strcmp(src8, dest8), 0); -- 1.7.10.4