Implement ICU normalization of facets, bug #3812
author Adam Dickmeiss <adam@indexdata.dk>
Mon, 8 Nov 2010 10:13:53 +0000 (11:13 +0100)
committer Adam Dickmeiss <adam@indexdata.dk>
Mon, 8 Nov 2010 10:13:53 +0000 (11:13 +0100)
Like relevance, sort and mergekey, facet normalization is defined in an
XML fragment of the server or service configuration.

doc/pazpar2_conf.xml
src/pazpar2_config.c
src/pazpar2_config.h
src/session.c
test/test_icu.cfg
test/test_icu_8.res [new file with mode: 0644]
test/test_icu_urls

index 1eb6fde..1f7501e 100644 (file)
     </varlistentry>
 
     <varlistentry>
-     <term>relevance / sort / mergekey</term>
+     <term>relevance / sort / mergekey / facet</term>
      <listitem>
       <para>
-       Specifies character set normalization for relevancy / sorting 
-       and the mergekey - for the server. These definitions serves as
+       Specifies character set normalization for relevancy / sorting /
+       mergekey and facets - for the server. These definitions serve as
        default for services that don't have these given. For the meaning
        of these settings refer to the "relevance" element inside service.
       </para>
        </varlistentry>
 
        <varlistentry>
+       <term>facet</term>
+       <listitem>
+        <para>
+         Specifies ICU tokenization and transformation rules
+         for tokens that are used in Pazpar2's facets. The content
+         is similar to that of <literal>relevance</literal>.
+        </para>
+       </listitem>
+       </varlistentry>
+
+       <varlistentry>
        <term>settings</term>
        <listitem>
         <para>
index ec87caf..b659848 100644 (file)
@@ -131,6 +131,7 @@ static struct conf_service *service_init(struct conf_server *server,
     service->relevance_pct = 0;
     service->sort_pct = 0;
     service->mergekey_pct = 0;
+    service->facet_pct = 0;
 
     service->id = service_id ? nmem_strdup(nmem, service_id) : 0;
     service->num_metadata = num_metadata;
@@ -249,6 +250,7 @@ void service_destroy(struct conf_service *service)
             pp2_charset_destroy(service->relevance_pct);
             pp2_charset_destroy(service->sort_pct);
             pp2_charset_destroy(service->mergekey_pct);
+            pp2_charset_destroy(service->facet_pct);
             yaz_mutex_destroy(&service->mutex);
             nmem_destroy(service->nmem);
         }
@@ -566,6 +568,20 @@ static struct conf_service *service_create_static(struct conf_server *server,
                     return 0;
             }
         }
+        else if (!strcmp((const char *) n->name, "facet"))
+        {   /* <facet>: ICU rules for tokens used in facets */
+            if (service->facet_pct) /* set by an earlier <facet> element */
+            {
+                yaz_log(YLOG_LOG, "facet may not repeat in service");
+                return 0;
+            }
+            else
+            {
+                service->facet_pct = pp2_charset_create_xml(n);
+                if (!service->facet_pct) /* nothing created from this node */
+                    return 0;
+            }
+        }
         else if (!strcmp((const char *) n->name, (const char *) "metadata"))
         {
             if (parse_metadata(service, n, &md_node, &sk_node))
@@ -652,7 +668,7 @@ static void inherit_server_settings(struct conf_service *s)
         }
     }
     
-    /* use relevance/sort/mergekey from server if not defined
+    /* use relevance/sort/mergekey/facet from server if not defined
        for this service.. */
     if (!s->relevance_pct)
     {
@@ -686,6 +702,17 @@ static void inherit_server_settings(struct conf_service *s)
         else
             s->mergekey_pct = pp2_charset_create(0);
     }
+
+    if (!s->facet_pct)
+    {
+        if (server->facet_pct)
+        {
+            s->facet_pct = server->facet_pct;
+            pp2_charset_incref(s->facet_pct);
+        }
+        else
+            s->facet_pct = pp2_charset_create(0);
+    }
 }
 
 struct conf_service *service_create(struct conf_server *server,
@@ -721,6 +748,7 @@ static struct conf_server *server_create(struct conf_config *config,
     server->relevance_pct = 0;
     server->sort_pct = 0;
     server->mergekey_pct = 0;
+    server->facet_pct = 0;
     server->server_settings = 0;
     server->http_server = 0;
     server->iochan_man = 0;
@@ -791,6 +819,12 @@ static struct conf_server *server_create(struct conf_config *config,
             if (!server->mergekey_pct)
                 return 0;
         }
+        else if (!strcmp((const char *) n->name, "facet"))
+        {
+            server->facet_pct = pp2_charset_create_xml(n);
+            if (!server->facet_pct)
+                return 0;
+        }
         else if (!strcmp((const char *) n->name, "service"))
         {
             char *service_id = (char *)
@@ -1015,6 +1049,7 @@ void server_destroy(struct conf_server *server)
     pp2_charset_destroy(server->relevance_pct);
     pp2_charset_destroy(server->sort_pct);
     pp2_charset_destroy(server->mergekey_pct);
+    pp2_charset_destroy(server->facet_pct);
     yaz_log(YLOG_LOG, "server_destroy server=%p", server);
     http_server_destroy(server->http_server);
 }
index 9e8f13d..234b70c 100644 (file)
@@ -119,6 +119,7 @@ struct conf_service
     pp2_charset_t relevance_pct;
     pp2_charset_t sort_pct;
     pp2_charset_t mergekey_pct;
+    pp2_charset_t facet_pct;
 
     struct database *databases;
     struct conf_targetprofiles *targetprofiles;
@@ -142,6 +143,8 @@ struct conf_server
     pp2_charset_t relevance_pct;
     pp2_charset_t sort_pct;
     pp2_charset_t mergekey_pct;
+    pp2_charset_t facet_pct;
+
     struct conf_service *service;
     struct conf_server *next;
     struct conf_config *config;
index bcd7353..710bc2e 100644 (file)
@@ -148,30 +148,51 @@ void pull_terms(NMEM nmem, struct ccl_rpn_node *n, char **termlist, int *num)
 
 void add_facet(struct session *s, const char *type, const char *value, int count)
 {
-    int i;
-
-    if (!*value)
-        return;
-    for (i = 0; i < s->num_termlists; i++)
-        if (!strcmp(s->termlists[i].name, type))
-            break;
-    if (i == s->num_termlists)
+    struct conf_service *service = s->service;
+    pp2_relevance_token_t prt;
+    const char *facet_component;
+    WRBUF facet_wrbuf = wrbuf_alloc();
+    prt = pp2_relevance_tokenize(service->facet_pct);
+    
+    pp2_relevance_first(prt, value, 0);
+    while ((facet_component = pp2_relevance_token_next(prt)))
     {
-        if (i == SESSION_MAX_TERMLISTS)
+        if (*facet_component)
         {
-            session_log(s, YLOG_FATAL, "Too many termlists");
-            return;
+            if (wrbuf_len(facet_wrbuf))
+                wrbuf_puts(facet_wrbuf, " ");
+            wrbuf_puts(facet_wrbuf, facet_component);
         }
-
-        s->termlists[i].name = nmem_strdup(s->nmem, type);
-        s->termlists[i].termlist 
-            = termlist_create(s->nmem, TERMLIST_HIGH_SCORE);
-        s->num_termlists = i + 1;
     }
-    session_log(s, YLOG_DEBUG, "Session: facets for %s: %s (%d)",
-                type, value, count);
-
-    termlist_insert(s->termlists[i].termlist, value, count);
+    pp2_relevance_token_destroy(prt);
+    
+    if (wrbuf_len(facet_wrbuf))
+    {
+        int i;
+        for (i = 0; i < s->num_termlists; i++)
+            if (!strcmp(s->termlists[i].name, type))
+                break;
+        if (i == s->num_termlists)
+        {
+            if (i == SESSION_MAX_TERMLISTS)
+            {
+                session_log(s, YLOG_FATAL, "Too many termlists");
+                wrbuf_destroy(facet_wrbuf);
+                return;
+            }
+            
+            s->termlists[i].name = nmem_strdup(s->nmem, type);
+            s->termlists[i].termlist 
+                = termlist_create(s->nmem, TERMLIST_HIGH_SCORE);
+            s->num_termlists = i + 1;
+        }
+        
+        session_log(s, YLOG_DEBUG, "Session: facets for %s: %s norm:%s (%d)",
+                    type, value, wrbuf_cstr(facet_wrbuf), count);
+        termlist_insert(s->termlists[i].termlist, wrbuf_cstr(facet_wrbuf),
+                        count);
+    }
+    wrbuf_destroy(facet_wrbuf);
 }
 
 static xmlDoc *record_to_xml(struct session *se,
index f3bd2e5..7146916 100644 (file)
@@ -7,7 +7,7 @@
     <settings src="z3950_indexdata_com_marc.xml"/>
     
     <relevance>
-      <icu_chain id="relevance" locale="el">
+      <icu_chain locale="en">
        <transform rule="[:Control:] Any-Remove"/>
        <tokenize rule="l"/>
        <transform rule="[[:WhiteSpace:][:Punctuation:]] Remove"/>
     </relevance>
 
     <sort>
-      <icu_chain id="sort" locale="el">
+      <icu_chain locale="en">
        <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]] Remove"/>
        <casemap rule="l"/>
       </icu_chain>
     </sort>
     
     <mergekey>
-      <icu_chain id="mergekey" locale="el">
+      <icu_chain locale="en">
        <tokenize rule="l"/>
        <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]] Remove"/>
        <casemap rule="l"/>
       </icu_chain>
     </mergekey>
     
+    <facet>
+      <icu_chain locale="en">
+       <tokenize rule="l"/>
+       <transform rule="[[:Control:][:WhiteSpace:][:Punctuation:]] Remove"/>
+       <casemap rule="l"/>
+      </icu_chain>
+    </facet>
+    
     <service>
 
       <metadata name="url" merge="unique"/>
diff --git a/test/test_icu_8.res b/test/test_icu_8.res
new file mode 100644 (file)
index 0000000..ad8231f
--- /dev/null
@@ -0,0 +1,21 @@
+<termlist>
+<activeclients>0</activeclients>
+<list name="author">
+<term><name>jack collins</name><frequency>2</frequency></term>
+<term><name>mairs john w</name><frequency>1</frequency></term>
+<term><name>wood helen m</name><frequency>1</frequency></term>
+<term><name>englund carl r</name><frequency>1</frequency></term>
+</list>
+<list name="subject">
+<term><name>radioisotope scanning</name><frequency>1</frequency></term>
+<term><name>scintillation cameras</name><frequency>1</frequency></term>
+<term><name>imaging systems in medicine</name><frequency>1</frequency></term>
+<term><name>cartography</name><frequency>1</frequency></term>
+<term><name>tomography</name><frequency>1</frequency></term>
+<term><name>optical pattern recognition</name><frequency>1</frequency></term>
+<term><name>computers</name><frequency>1</frequency></term>
+<term><name>railroads</name><frequency>1</frequency></term>
+<term><name>universities and colleges</name><frequency>1</frequency></term>
+<term><name>community colleges</name><frequency>1</frequency></term>
+</list>
+</termlist>
index 57bd419..8589951 100644 (file)
@@ -5,3 +5,4 @@ http://localhost:9763/search.pz2?session=1&command=search&query=computer
 http://localhost:9763/search.pz2?session=1&command=show&start=0&number=1&sort=title:1
 http://localhost:9763/search.pz2?session=1&command=show&start=0&number=1&sort=date:0
 http://localhost:9763/search.pz2?session=1&command=show&start=0&number=1&sort=date:1
+http://localhost:9763/search.pz2?session=1&command=termlist&name=author%2Csubject