From 2de4cab9b87f848767078447142668fc3c30e5c9 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 12 Sep 2011 18:29:48 +0200 Subject: [PATCH] ICU chain "facet" honors YAZ ICU element <display>. Display term and normalized term are now saved per facet. --- src/charsets.c | 25 +++++++++++++++++++++++++ src/charsets.h | 1 + src/http_command.c | 4 ++-- src/session.c | 18 +++++++++++++++--- src/termlists.c | 13 ++++++++----- src/termlists.h | 6 ++++-- 6 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/charsets.c b/src/charsets.c index d70c11d..44ef5fc 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -44,6 +44,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA struct pp2_charset_s { const char *(*token_next_handler)(pp2_relevance_token_t prt); const char *(*get_sort_handler)(pp2_relevance_token_t prt); + const char *(*get_display_handler)(pp2_relevance_token_t prt); int ref_count; #if YAZ_HAVE_ICU struct icu_chain * icu_chn; @@ -54,10 +55,12 @@ struct pp2_charset_s { static const char *pp2_relevance_token_null(pp2_relevance_token_t prt); static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt); +static const char *pp2_get_display_ascii(pp2_relevance_token_t prt); #if YAZ_HAVE_ICU static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); static const char *pp2_get_sort_icu(pp2_relevance_token_t prt); +static const char *pp2_get_display_icu(pp2_relevance_token_t prt); #endif /* tokenzier handle */ @@ -123,6 +126,7 @@ pp2_charset_t pp2_charset_create(struct icu_chain *icu_chn) pct->token_next_handler = pp2_relevance_token_null; pct->get_sort_handler = pp2_get_sort_ascii; + pct->get_display_handler = pp2_get_display_ascii; pct->ref_count = 1; #if YAZ_HAVE_ICU pct->icu_chn = 0; @@ -132,6 +136,7 @@ pp2_charset_t pp2_charset_create(struct icu_chain *icu_chn) pct->icu_sts = U_ZERO_ERROR; pct->token_next_handler = pp2_relevance_token_icu; 
pct->get_sort_handler = pp2_get_sort_icu; + pct->get_display_handler = pp2_get_display_icu; } #endif // YAZ_HAVE_ICU return pct; @@ -230,6 +235,11 @@ const char *pp2_get_sort(pp2_relevance_token_t prt) return prt->pct->get_sort_handler(prt); } +const char *pp2_get_display(pp2_relevance_token_t prt) +{ + return prt->pct->get_display_handler(prt); +} + #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) : -1) /* original tokenizer with our tokenize interface, but we add +1 to ensure no '\0' are in our string (except for EOF) @@ -278,6 +288,16 @@ static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt) } } +static const char *pp2_get_display_ascii(pp2_relevance_token_t prt) +{ + if (prt->last_cp == 0) + return 0; + else + { + return wrbuf_cstr(prt->norm_str); + } +} + static const char *pp2_relevance_token_null(pp2_relevance_token_t prt) { const char *cp = prt->cp; @@ -304,6 +324,11 @@ static const char *pp2_get_sort_icu(pp2_relevance_token_t prt) return icu_iter_get_sortkey(prt->iter); } +static const char *pp2_get_display_icu(pp2_relevance_token_t prt) +{ + return icu_iter_get_display(prt->iter); +} + #endif // YAZ_HAVE_ICU diff --git a/src/charsets.h b/src/charsets.h index 2d688eb..d03abb9 100644 --- a/src/charsets.h +++ b/src/charsets.h @@ -47,6 +47,7 @@ void pp2_relevance_first(pp2_relevance_token_t prt, void pp2_relevance_token_destroy(pp2_relevance_token_t prt); const char *pp2_relevance_token_next(pp2_relevance_token_t prt); const char *pp2_get_sort(pp2_relevance_token_t prt); +const char *pp2_get_display(pp2_relevance_token_t prt); #if 0 typedef int pp2_charset_normalize_t(pp2_charset_t pct, diff --git a/src/http_command.c b/src/http_command.c index 4258b29..b46ba7b 100644 --- a/src/http_command.c +++ b/src/http_command.c @@ -597,12 +597,12 @@ static void cmd_termlist(struct http_channel *c) for (i = 0; i < len && i < num; i++) { // prevnt sending empty term elements - if (!p[i]->term || !p[i]->term[0]) + if (!p[i]->display_term || 
!p[i]->display_term[0]) continue; wrbuf_puts(c->wrbuf, ""); wrbuf_puts(c->wrbuf, ""); - wrbuf_xmlputs(c->wrbuf, p[i]->term); + wrbuf_xmlputs(c->wrbuf, p[i]->display_term); wrbuf_puts(c->wrbuf, ""); wrbuf_printf(c->wrbuf, diff --git a/src/session.c b/src/session.c index 2adb1eb..ee39c7b 100644 --- a/src/session.c +++ b/src/session.c @@ -191,20 +191,31 @@ void add_facet(struct session *s, const char *type, const char *value, int count pp2_relevance_token_t prt; const char *facet_component; WRBUF facet_wrbuf = wrbuf_alloc(); + WRBUF display_wrbuf = wrbuf_alloc(); prt = pp2_relevance_tokenize(service->facet_pct); pp2_relevance_first(prt, value, 0); while ((facet_component = pp2_relevance_token_next(prt))) { + const char *display_component; if (*facet_component) { if (wrbuf_len(facet_wrbuf)) wrbuf_puts(facet_wrbuf, " "); wrbuf_puts(facet_wrbuf, facet_component); } + display_component = pp2_get_display(prt); + if (display_component) + { + if (wrbuf_len(display_wrbuf)) + wrbuf_puts(display_wrbuf, " "); + wrbuf_puts(display_wrbuf, display_component); + } } pp2_relevance_token_destroy(prt); - + + yaz_log(YLOG_LOG, "facet norm=%s", wrbuf_cstr(facet_wrbuf)); + yaz_log(YLOG_LOG, "facet display=%s", wrbuf_cstr(display_wrbuf)); if (wrbuf_len(facet_wrbuf)) { int i; @@ -229,10 +240,11 @@ void add_facet(struct session *s, const char *type, const char *value, int count #if 0 session_log(s, YLOG_DEBUG, "Facets for %s: %s norm:%s (%d)", type, value, wrbuf_cstr(facet_wrbuf), count); #endif - termlist_insert(s->termlists[i].termlist, wrbuf_cstr(facet_wrbuf), - count); + termlist_insert(s->termlists[i].termlist, wrbuf_cstr(display_wrbuf), + wrbuf_cstr(facet_wrbuf), count); } wrbuf_destroy(facet_wrbuf); + wrbuf_destroy(display_wrbuf); } static xmlDoc *record_to_xml(struct session *se, diff --git a/src/termlists.c b/src/termlists.c index c0a7c13..9741eb2 100644 --- a/src/termlists.c +++ b/src/termlists.c @@ -107,15 +107,16 @@ static void update_highscore(struct termlist *tl, struct 
termlist_score *t) } } -void termlist_insert(struct termlist *tl, const char *term, int freq) +void termlist_insert(struct termlist *tl, const char *display_term, + const char *norm_term, int freq) { unsigned int bucket; struct termlist_bucket **p; char buf[256], *cp; - if (strlen(term) > 255) + if (strlen(norm_term) > 255) return; - strcpy(buf, term); + strcpy(buf, norm_term); /* chop right */ for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--) cp[-1] = '\0'; @@ -123,7 +124,7 @@ void termlist_insert(struct termlist *tl, const char *term, int freq) bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size; for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next) { - if (!strcmp(buf, (*p)->term.term)) + if (!strcmp(buf, (*p)->term.norm_term)) { (*p)->term.frequency += freq; update_highscore(tl, &((*p)->term)); @@ -134,7 +135,9 @@ void termlist_insert(struct termlist *tl, const char *term, int freq) { struct termlist_bucket *new = nmem_malloc(tl->nmem, sizeof(struct termlist_bucket)); - new->term.term = nmem_strdup(tl->nmem, buf); + new->term.norm_term = nmem_strdup(tl->nmem, buf); + new->term.display_term = *display_term ? + nmem_strdup(tl->nmem, display_term) : new->term.norm_term; new->term.frequency = freq; new->next = 0; *p = new; diff --git a/src/termlists.h b/src/termlists.h index 2278f2d..e586564 100644 --- a/src/termlists.h +++ b/src/termlists.h @@ -24,14 +24,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA struct termlist_score { - char *term; + char *norm_term; + char *display_term; int frequency; }; struct termlist; struct termlist *termlist_create(NMEM nmem, int highscore_size); -void termlist_insert(struct termlist *tl, const char *term, int freq); +void termlist_insert(struct termlist *tl, const char *display_term, + const char *norm_term, int freq); struct termlist_score **termlist_highscore(struct termlist *tl, int *len); #endif -- 1.7.10.4