New metadata facility "icurule" for normalizing metadata text PAZ-1002
author Adam Dickmeiss <adam@indexdata.dk>
Tue, 12 May 2015 11:11:30 +0000 (13:11 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Tue, 12 May 2015 11:11:30 +0000 (13:11 +0200)
This allows normalization of text before in-cluster merging takes place.

doc/pazpar2_conf.xml
src/pazpar2_config.c
src/pazpar2_config.h
src/record.h
src/session.c
test/test_icu.cfg

index 8d64acc..86dbdc8 100644 (file)
          </varlistentry>
 
          <varlistentry>
          </varlistentry>
 
          <varlistentry>
+          <term id="icurule">icurule</term>
+          <listitem>
+           <para>
+            Specifies the ICU rule set to be used for normalizing
+            metadata text. The "display" part of the rule is kept
+            in the returned metadata record (record+show commands); the
+            end result - normalized text - is used for performing
+            within-cluster merge (unique, longest, etc.). If the icurule is
+            omitted, type generic (text) is converted as follows:
+            any of the characters "<literal> ,/.:([</literal>" are
+            chopped off the prefix and suffix of the text content
+            <emphasis>unless</emphasis> it includes the
+            characters "<literal>://</literal>" (URL).
+           </para>
+           <note>
+            <para>
+             Requires Pazpar2 1.9.0 or later.
+            </para>
+           </note>
+          </listitem>
+         </varlistentry>
+
+         <varlistentry>
           <term>setting</term>
           <listitem>
            <para>
           <term>setting</term>
           <listitem>
            <para>
index dbfd850..e23c740 100644 (file)
@@ -170,7 +170,8 @@ static struct conf_metadata* conf_service_add_metadata(
     enum conf_metadata_mergekey mt,
     const char *facetrule,
     const char *limitmap,
     enum conf_metadata_mergekey mt,
     const char *facetrule,
     const char *limitmap,
-    const char *limitcluster
+    const char *limitcluster,
+    const char *icurule
     )
 {
     struct conf_metadata * md = 0;
     )
 {
     struct conf_metadata * md = 0;
@@ -202,6 +203,7 @@ static struct conf_metadata* conf_service_add_metadata(
     md->facetrule = nmem_strdup_null(nmem, facetrule);
     md->limitmap = nmem_strdup_null(nmem, limitmap);
     md->limitcluster = nmem_strdup_null(nmem, limitcluster);
     md->facetrule = nmem_strdup_null(nmem, facetrule);
     md->limitmap = nmem_strdup_null(nmem, limitmap);
     md->limitcluster = nmem_strdup_null(nmem, limitcluster);
+    md->icurule = nmem_strdup_null(nmem, icurule);
     return md;
 }
 
     return md;
 }
 
@@ -315,6 +317,7 @@ static int parse_metadata(struct conf_service *service, xmlNode *n,
     xmlChar *xml_limitmap = 0;
     xmlChar *xml_limitcluster = 0;
     xmlChar *xml_icu_chain = 0;
     xmlChar *xml_limitmap = 0;
     xmlChar *xml_limitcluster = 0;
     xmlChar *xml_icu_chain = 0;
+    xmlChar *xml_icurule = 0;
 
     struct _xmlAttr *attr;
 
 
     struct _xmlAttr *attr;
 
@@ -358,6 +361,9 @@ static int parse_metadata(struct conf_service *service, xmlNode *n,
         else if (!xmlStrcmp(attr->name, BAD_CAST "limitcluster") &&
                  attr->children && attr->children->type == XML_TEXT_NODE)
             xml_limitcluster = attr->children->content;
         else if (!xmlStrcmp(attr->name, BAD_CAST "limitcluster") &&
                  attr->children && attr->children->type == XML_TEXT_NODE)
             xml_limitcluster = attr->children->content;
+        else if (!xmlStrcmp(attr->name, BAD_CAST "icurule") &&
+                 attr->children && attr->children->type == XML_TEXT_NODE)
+            xml_icurule = attr->children->content;
         else
         {
             yaz_log(YLOG_FATAL, "Unknown metadata attribute '%s'", attr->name);
         else
         {
             yaz_log(YLOG_FATAL, "Unknown metadata attribute '%s'", attr->name);
@@ -515,7 +521,9 @@ static int parse_metadata(struct conf_service *service, xmlNode *n,
                               mergekey_type,
                               (const char *) xml_icu_chain,
                               (const char *) xml_limitmap,
                               mergekey_type,
                               (const char *) xml_icu_chain,
                               (const char *) xml_limitmap,
-                              (const char *) xml_limitcluster);
+                              (const char *) xml_limitcluster,
+                              (const char *) xml_icurule
+        );
     (*md_node)++;
     return 0;
 }
     (*md_node)++;
     return 0;
 }
index 1343790..83d7314 100644 (file)
@@ -82,6 +82,7 @@ struct conf_metadata
 
     char *limitmap;  // Should be expanded into service-wide default e.g. pz:limitmap:<name>=value setting
     char *limitcluster;
 
     char *limitmap;  // Should be expanded into service-wide default e.g. pz:limitmap:<name>=value setting
     char *limitcluster;
+    char *icurule;
 };
 
 
 };
 
 
index cb8df1e..0f1f249 100644 (file)
@@ -27,6 +27,7 @@ struct conf_service;
 union data_types {
     struct {
         const char *disp;
 union data_types {
     struct {
         const char *disp;
+        const char *norm;
         const char *sort;
         const char *snippet;
     } text;
         const char *sort;
         const char *snippet;
     } text;
index 9e238a6..3dcf40f 100644 (file)
@@ -1478,7 +1478,8 @@ void statistics(struct session *se, struct statistics *stat)
 }
 
 static struct record_metadata *record_metadata_init(
 }
 
 static struct record_metadata *record_metadata_init(
-    NMEM nmem, const char *value, enum conf_metadata_type type,
+    NMEM nmem, const char *value, const char *norm,
+    enum conf_metadata_type type,
     struct _xmlAttr *attr)
 {
     struct record_metadata *rec_md = record_metadata_create(nmem);
     struct _xmlAttr *attr)
 {
     struct record_metadata *rec_md = record_metadata_create(nmem);
@@ -1508,11 +1509,20 @@ static struct record_metadata *record_metadata_init(
     {
     case Metadata_type_generic:
     case Metadata_type_skiparticle:
     {
     case Metadata_type_generic:
     case Metadata_type_skiparticle:
-        if (strstr(value, "://")) /* looks like a URL */
+        if (norm)
+        {
             rec_md->data.text.disp = nmem_strdup(nmem, value);
             rec_md->data.text.disp = nmem_strdup(nmem, value);
+            rec_md->data.text.norm = nmem_strdup(nmem, norm);
+        }
         else
         else
-            rec_md->data.text.disp =
-                normalize7bit_generic(nmem_strdup(nmem, value), " ,/.:([");
+        {
+            if (strstr(value, "://")) /* looks like a URL */
+                rec_md->data.text.disp = nmem_strdup(nmem, value);
+            else
+                rec_md->data.text.disp =
+                    normalize7bit_generic(nmem_strdup(nmem, value), " ,/.:([");
+            rec_md->data.text.norm = rec_md->data.text.disp;
+        }
         rec_md->data.text.sort = 0;
         rec_md->data.text.snippet = 0;
         break;
         rec_md->data.text.sort = 0;
         rec_md->data.text.snippet = 0;
         break;
@@ -2068,6 +2078,20 @@ static int ingest_to_cluster(struct client *cl,
 
             if (!type)
                 continue;
 
             if (!type)
                 continue;
+
+            md_field_id
+                = conf_service_metadata_field_id(service, (const char *) type);
+            if (md_field_id < 0)
+            {
+                if (se->number_of_warnings_unknown_metadata == 0)
+                {
+                    session_log(se, YLOG_WARN,
+                            "Ignoring unknown metadata element: %s", type);
+                }
+                se->number_of_warnings_unknown_metadata++;
+                continue;
+            }
+
             wrbuf_rewind(wrbuf_disp);
             value0 = xmlNodeListGetString(xdoc, n->children, 1);
             if (!value0 || !*value0)
             wrbuf_rewind(wrbuf_disp);
             value0 = xmlNodeListGetString(xdoc, n->children, 1);
             if (!value0 || !*value0)
@@ -2083,23 +2107,10 @@ static int ingest_to_cluster(struct client *cl,
             }
             if (value0)
                 xmlFree(value0);
             }
             if (value0)
                 xmlFree(value0);
-            md_field_id
-                = conf_service_metadata_field_id(service, (const char *) type);
-            if (md_field_id < 0)
-            {
-                if (se->number_of_warnings_unknown_metadata == 0)
-                {
-                    session_log(se, YLOG_WARN,
-                            "Ignoring unknown metadata element: %s", type);
-                }
-                se->number_of_warnings_unknown_metadata++;
-                continue;
-            }
-
             ser_md = &service->metadata[md_field_id];
 
             // non-merged metadata
             ser_md = &service->metadata[md_field_id];
 
             // non-merged metadata
-            rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp),
+            rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp), 0,
                                           ser_md->type, n->properties);
             if (!rec_md)
             {
                                           ser_md->type, n->properties);
             if (!rec_md)
             {
@@ -2186,7 +2197,6 @@ static int ingest_to_cluster(struct client *cl,
             const char *type = 0;
             xmlChar *value0;
 
             const char *type = 0;
             xmlChar *value0;
 
-            wrbuf_rewind(wrbuf_disp);
             type = yaz_xml_get_prop(n, "type");
             if (!type)
                 continue;
             type = yaz_xml_get_prop(n, "type");
             if (!type)
                 continue;
@@ -2204,6 +2214,9 @@ static int ingest_to_cluster(struct client *cl,
                 ser_sk = &service->sortkeys[sk_field_id];
             }
 
                 ser_sk = &service->sortkeys[sk_field_id];
             }
 
+            wrbuf_rewind(wrbuf_disp);
+            wrbuf_rewind(wrbuf_norm);
+
             value0 = xmlNodeListGetString(xdoc, n->children, 1);
             if (!value0 || !*value0)
             {
             value0 = xmlNodeListGetString(xdoc, n->children, 1);
             if (!value0 || !*value0)
             {
@@ -2211,16 +2224,29 @@ static int ingest_to_cluster(struct client *cl,
                     xmlFree(value0);
                 continue;
             }
                     xmlFree(value0);
                 continue;
             }
-            wrbuf_puts(wrbuf_disp, (const char *) value0);
-            xmlFree(value0);
 
 
+            if (ser_md->icurule)
+            {
+                run_icu(se, ser_md->icurule, (const char *) value0,
+                        wrbuf_norm, wrbuf_disp);
+                yaz_log(YLOG_LOG, "run_icu input=%s norm=%s disp=%s",
+                        (const char *) value0,
+                        wrbuf_cstr(wrbuf_norm), wrbuf_cstr(wrbuf_disp));
+                rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp),
+                                              wrbuf_cstr(wrbuf_norm),
+                                              ser_md->type, 0);
+            }
+            else
+            {
+                wrbuf_puts(wrbuf_disp, (const char *) value0);
+                rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp),
+                                              0,
+                                              ser_md->type, 0);
+            }
 
 
-            // merged metadata
-            rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp),
-                                          ser_md->type, 0);
+            xmlFree(value0);
 
             // see if the field was not in cluster already (from beginning)
 
             // see if the field was not in cluster already (from beginning)
-
             if (!rec_md)
                 continue;
 
             if (!rec_md)
                 continue;
 
@@ -2262,8 +2288,8 @@ static int ingest_to_cluster(struct client *cl,
             {
                 while (*wheretoput)
                 {
             {
                 while (*wheretoput)
                 {
-                    if (!strcmp((const char *) (*wheretoput)->data.text.disp,
-                                rec_md->data.text.disp))
+                    if (!strcmp((const char *) (*wheretoput)->data.text.norm,
+                                rec_md->data.text.norm))
                         break;
                     wheretoput = &(*wheretoput)->next;
                 }
                         break;
                     wheretoput = &(*wheretoput)->next;
                 }
@@ -2273,8 +2299,8 @@ static int ingest_to_cluster(struct client *cl,
             else if (ser_md->merge == Metadata_merge_longest)
             {
                 if (!*wheretoput
             else if (ser_md->merge == Metadata_merge_longest)
             {
                 if (!*wheretoput
-                    || strlen(rec_md->data.text.disp)
-                    > strlen((*wheretoput)->data.text.disp))
+                    || strlen(rec_md->data.text.norm)
+                    > strlen((*wheretoput)->data.text.norm))
                 {
                     *wheretoput = rec_md;
                     if (ser_sk)
                 {
                     *wheretoput = rec_md;
                     if (ser_sk)
index e3c1e89..1d8cc9c 100644 (file)
@@ -32,8 +32,9 @@
     </icu_chain>
 
     <icu_chain id="mychain" locale="en">
     </icu_chain>
 
     <icu_chain id="mychain" locale="en">
+      <transliterate>[[:WhiteSpace:][,.!;]]* } [$] > ;</transliterate>
       <display/>
       <display/>
-      <transform rule="[[:WhiteSpace:][:Punctuation:]] Remove"/>
+      <casemap rule="l"/>
     </icu_chain>
 
     <icu_chain id="facet-author" locale="en">
     </icu_chain>
 
     <icu_chain id="facet-author" locale="en">
@@ -56,7 +57,7 @@
       <metadata name="subject" merge="unique" termlist="yes" rank="3"/>
       <metadata name="id"/>
       <metadata name="lccn" merge="unique"/>
       <metadata name="subject" merge="unique" termlist="yes" rank="3"/>
       <metadata name="id"/>
       <metadata name="lccn" merge="unique"/>
-      <metadata name="description" brief="yes" merge="longest" rank="3"/>
+      <metadata name="description" brief="yes" merge="longest" rank="3" icurule="mychain"/>
       
       <metadata name="test-usersetting" brief="yes" setting="postproc"/>
       <metadata name="test" setting="parameter"/>
       
       <metadata name="test-usersetting" brief="yes" setting="postproc"/>
       <metadata name="test" setting="parameter"/>