From 5bb55be401f739bf7405a6cb04528e3bc9f93b5f Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 8 Nov 2010 11:13:53 +0100 Subject: [PATCH] Implement ICU normalization of facets, bug #3812 As with relevance, mergekey and sort, this is defined in an XML fragment of server/service. --- doc/pazpar2_conf.xml | 17 +++++++++++--- src/pazpar2_config.c | 37 +++++++++++++++++++++++++++++- src/pazpar2_config.h | 3 +++ src/session.c | 61 +++++++++++++++++++++++++++++++++----------------- test/test_icu.cfg | 14 +++++++++--- test/test_icu_8.res | 21 +++++++++++++++++ test/test_icu_urls | 1 + 7 files changed, 127 insertions(+), 27 deletions(-) create mode 100644 test/test_icu_8.res diff --git a/doc/pazpar2_conf.xml b/doc/pazpar2_conf.xml index 1eb6fde..1f7501e 100644 --- a/doc/pazpar2_conf.xml +++ b/doc/pazpar2_conf.xml @@ -121,11 +121,11 @@ - relevance / sort / mergekey + relevance / sort / mergekey / facet - Specifies character set normalization for relevancy / sorting - and the mergekey - for the server. These definitions serves as + Specifies character set normalization for relevancy / sorting / + mergekey and facets - for the server. These definitions serve as default for services that don't have these given. For the meaning of these settings refer to the "relevance" element inside service. @@ -415,6 +415,17 @@ + facet + + + Specifies ICU tokenization and transformation rules + for tokens that are used in Pazpar2's facets. The content + is similar to that of relevance. + + + + + settings diff --git a/src/pazpar2_config.c b/src/pazpar2_config.c index ec87caf..b659848 100644 --- a/src/pazpar2_config.c +++ b/src/pazpar2_config.c @@ -131,6 +131,7 @@ static struct conf_service *service_init(struct conf_server *server, service->relevance_pct = 0; service->sort_pct = 0; service->mergekey_pct = 0; + service->facet_pct = 0; service->id = service_id ? 
nmem_strdup(nmem, service_id) : 0; service->num_metadata = num_metadata; @@ -249,6 +250,7 @@ void service_destroy(struct conf_service *service) pp2_charset_destroy(service->relevance_pct); pp2_charset_destroy(service->sort_pct); pp2_charset_destroy(service->mergekey_pct); + pp2_charset_destroy(service->facet_pct); yaz_mutex_destroy(&service->mutex); nmem_destroy(service->nmem); } @@ -566,6 +568,20 @@ static struct conf_service *service_create_static(struct conf_server *server, return 0; } } + else if (!strcmp((const char *) n->name, "facet")) + { + if (service->facet_pct) /* facet rules were already set for this service: reject the repeat */ + { + yaz_log(YLOG_LOG, "facet may not repeat in service"); + return 0; + } + else + { + service->facet_pct = pp2_charset_create_xml(n); + if (!service->facet_pct) /* give up when no charset object was returned for this node */ + return 0; + } + } else if (!strcmp((const char *) n->name, (const char *) "metadata")) { if (parse_metadata(service, n, &md_node, &sk_node)) @@ -652,7 +668,7 @@ static void inherit_server_settings(struct conf_service *s) } } - /* use relevance/sort/mergekey from server if not defined + /* use relevance/sort/mergekey/facet from server if not defined for this service.. 
*/ if (!s->relevance_pct) { @@ -686,6 +702,17 @@ static void inherit_server_settings(struct conf_service *s) else s->mergekey_pct = pp2_charset_create(0); } + + if (!s->facet_pct) /* service carries no facet_pct of its own */ + { + if (server->facet_pct) /* reuse the server's object and take a reference on it */ + { + s->facet_pct = server->facet_pct; + pp2_charset_incref(s->facet_pct); + } + else /* server has none either: create one via pp2_charset_create(0) */ + s->facet_pct = pp2_charset_create(0); + } } struct conf_service *service_create(struct conf_server *server, @@ -721,6 +748,7 @@ static struct conf_server *server_create(struct conf_config *config, server->relevance_pct = 0; server->sort_pct = 0; server->mergekey_pct = 0; + server->facet_pct = 0; server->server_settings = 0; server->http_server = 0; server->iochan_man = 0; @@ -791,6 +819,12 @@ static struct conf_server *server_create(struct conf_config *config, if (!server->mergekey_pct) return 0; } + else if (!strcmp((const char *) n->name, "facet")) /* node named "facet" directly under the server */ + { + server->facet_pct = pp2_charset_create_xml(n); + if (!server->facet_pct) /* abort server creation when no charset object came back */ + return 0; + } else if (!strcmp((const char *) n->name, "service")) { char *service_id = (char *) @@ -1015,6 +1049,7 @@ void server_destroy(struct conf_server *server) pp2_charset_destroy(server->relevance_pct); pp2_charset_destroy(server->sort_pct); pp2_charset_destroy(server->mergekey_pct); + pp2_charset_destroy(server->facet_pct); yaz_log(YLOG_LOG, "server_destroy server=%p", server); http_server_destroy(server->http_server); } diff --git a/src/pazpar2_config.h b/src/pazpar2_config.h index 9e8f13d..234b70c 100644 --- a/src/pazpar2_config.h +++ b/src/pazpar2_config.h @@ -119,6 +119,7 @@ struct conf_service pp2_charset_t relevance_pct; pp2_charset_t sort_pct; pp2_charset_t mergekey_pct; + pp2_charset_t facet_pct; /* charset object add_facet() tokenizes facet values with */ struct database *databases; struct conf_targetprofiles *targetprofiles; @@ -142,6 +143,8 @@ struct conf_server pp2_charset_t relevance_pct; pp2_charset_t sort_pct; pp2_charset_t mergekey_pct; + pp2_charset_t facet_pct; /* server-level value, picked up by services that set none */ + struct conf_service *service; struct conf_server *next; struct conf_config *config; diff --git a/src/session.c 
b/src/session.c index bcd7353..710bc2e 100644 --- a/src/session.c +++ b/src/session.c @@ -148,30 +148,51 @@ void pull_terms(NMEM nmem, struct ccl_rpn_node *n, char **termlist, int *num) void add_facet(struct session *s, const char *type, const char *value, int count) { - int i; - - if (!*value) - return; - for (i = 0; i < s->num_termlists; i++) - if (!strcmp(s->termlists[i].name, type)) - break; - if (i == s->num_termlists) + struct conf_service *service = s->service; + pp2_relevance_token_t prt; + const char *facet_component; + WRBUF facet_wrbuf = wrbuf_alloc(); /* collects the normalized facet text */ + prt = pp2_relevance_tokenize(service->facet_pct); /* tokenizer built from the service's facet_pct */ + + pp2_relevance_first(prt, value, 0); /* hand the raw facet value to the tokenizer */ + while ((facet_component = pp2_relevance_token_next(prt))) { - if (i == SESSION_MAX_TERMLISTS) + if (*facet_component) /* skip empty tokens */ { - session_log(s, YLOG_FATAL, "Too many termlists"); - return; + if (wrbuf_len(facet_wrbuf)) /* not the first token: separate with one blank */ + wrbuf_puts(facet_wrbuf, " "); + wrbuf_puts(facet_wrbuf, facet_component); } - - s->termlists[i].name = nmem_strdup(s->nmem, type); - s->termlists[i].termlist - = termlist_create(s->nmem, TERMLIST_HIGH_SCORE); - s->num_termlists = i + 1; } - session_log(s, YLOG_DEBUG, "Session: facets for %s: %s (%d)", - type, value, count); - - termlist_insert(s->termlists[i].termlist, value, count); + pp2_relevance_token_destroy(prt); + + if (wrbuf_len(facet_wrbuf)) /* nothing is recorded when no normalized text remains */ + { + int i; + for (i = 0; i < s->num_termlists; i++) + if (!strcmp(s->termlists[i].name, type)) + break; + if (i == s->num_termlists) + { + if (i == SESSION_MAX_TERMLISTS) + { + session_log(s, YLOG_FATAL, "Too many termlists"); + wrbuf_destroy(facet_wrbuf); /* release the buffer on this early exit too */ + return; + } + + s->termlists[i].name = nmem_strdup(s->nmem, type); + s->termlists[i].termlist + = termlist_create(s->nmem, TERMLIST_HIGH_SCORE); + s->num_termlists = i + 1; + } + + session_log(s, YLOG_DEBUG, "Session: facets for %s: %s norm:%s (%d)", + type, value, wrbuf_cstr(facet_wrbuf), count); + termlist_insert(s->termlists[i].termlist, wrbuf_cstr(facet_wrbuf), + count); + } + 
wrbuf_destroy(facet_wrbuf); /* normal-path release of the buffer */ } static xmlDoc *record_to_xml(struct session *se, diff --git a/test/test_icu.cfg b/test/test_icu.cfg index f3bd2e5..7146916 100644 --- a/test/test_icu.cfg +++ b/test/test_icu.cfg @@ -7,7 +7,7 @@ - + @@ -16,20 +16,28 @@ - + - + + + + + + + + + diff --git a/test/test_icu_8.res b/test/test_icu_8.res new file mode 100644 index 0000000..ad8231f --- /dev/null +++ b/test/test_icu_8.res @@ -0,0 +1,21 @@ + +0 + +jack collins2 +mairs john w1 +wood helen m1 +englund carl r1 + + +radioisotope scanning1 +scintillation cameras1 +imaging systems in medicine1 +cartography1 +tomography1 +optical pattern recognition1 +computers1 +railroads1 +universities and colleges1 +community colleges1 + + diff --git a/test/test_icu_urls b/test/test_icu_urls index 57bd419..8589951 100644 --- a/test/test_icu_urls +++ b/test/test_icu_urls @@ -5,3 +5,4 @@ http://localhost:9763/search.pz2?session=1&command=search&query=computer http://localhost:9763/search.pz2?session=1&command=show&start=0&number=1&sort=title:1 http://localhost:9763/search.pz2?session=1&command=show&start=0&number=1&sort=date:0 http://localhost:9763/search.pz2?session=1&command=show&start=0&number=1&sort=date:1 +http://localhost:9763/search.pz2?session=1&command=termlist&name=author%2Csubject -- 1.7.10.4