From: Adam Dickmeiss Date: Tue, 12 May 2015 11:11:30 +0000 (+0200) Subject: New metadata facility "icurule" for normalizing metadata text PAZ-1002 X-Git-Tag: v1.9.0~1^2 X-Git-Url: http://git.indexdata.com/?p=pazpar2-moved-to-github.git;a=commitdiff_plain;h=85b1f355522cc620452552d76fd517f089c98ab2 New metadata facility "icurule" for normalizing metadata text PAZ-1002 This allows normalization of text before in-cluster merging takes place. --- diff --git a/doc/pazpar2_conf.xml b/doc/pazpar2_conf.xml index 8d64acc..86dbdc8 100644 --- a/doc/pazpar2_conf.xml +++ b/doc/pazpar2_conf.xml @@ -421,6 +421,29 @@ + icurule + + + Specifies the ICU rule set to be used for normalizing + metadata text. The "display" part of the rule is kept + in the returned metadata record (record+show commands); the + end result - normalized text - is used for performing + within-cluster merge (unique, longest, etc). If the icurule is + omitted, type generic (text) is converted as follows: + any of the characters " ,/.:([" are + chopped off the prefix and suffix of text content + unless it includes the + characters "://" (URL). + + + + Requires Pazpar2 1.9.0 or later. 
+ + + + + + setting diff --git a/src/pazpar2_config.c b/src/pazpar2_config.c index dbfd850..e23c740 100644 --- a/src/pazpar2_config.c +++ b/src/pazpar2_config.c @@ -170,7 +170,8 @@ static struct conf_metadata* conf_service_add_metadata( enum conf_metadata_mergekey mt, const char *facetrule, const char *limitmap, - const char *limitcluster + const char *limitcluster, + const char *icurule ) { struct conf_metadata * md = 0; @@ -202,6 +203,7 @@ static struct conf_metadata* conf_service_add_metadata( md->facetrule = nmem_strdup_null(nmem, facetrule); md->limitmap = nmem_strdup_null(nmem, limitmap); md->limitcluster = nmem_strdup_null(nmem, limitcluster); + md->icurule = nmem_strdup_null(nmem, icurule); return md; } @@ -315,6 +317,7 @@ static int parse_metadata(struct conf_service *service, xmlNode *n, xmlChar *xml_limitmap = 0; xmlChar *xml_limitcluster = 0; xmlChar *xml_icu_chain = 0; + xmlChar *xml_icurule = 0; struct _xmlAttr *attr; @@ -358,6 +361,9 @@ static int parse_metadata(struct conf_service *service, xmlNode *n, else if (!xmlStrcmp(attr->name, BAD_CAST "limitcluster") && attr->children && attr->children->type == XML_TEXT_NODE) xml_limitcluster = attr->children->content; + else if (!xmlStrcmp(attr->name, BAD_CAST "icurule") && + attr->children && attr->children->type == XML_TEXT_NODE) + xml_icurule = attr->children->content; else { yaz_log(YLOG_FATAL, "Unknown metadata attribute '%s'", attr->name); @@ -515,7 +521,9 @@ static int parse_metadata(struct conf_service *service, xmlNode *n, mergekey_type, (const char *) xml_icu_chain, (const char *) xml_limitmap, - (const char *) xml_limitcluster); + (const char *) xml_limitcluster, + (const char *) xml_icurule + ); (*md_node)++; return 0; } diff --git a/src/pazpar2_config.h b/src/pazpar2_config.h index 1343790..83d7314 100644 --- a/src/pazpar2_config.h +++ b/src/pazpar2_config.h @@ -82,6 +82,7 @@ struct conf_metadata char *limitmap; // Should be expanded into service-wide default e.g. 
pz:limitmap:=value setting char *limitcluster; + char *icurule; }; diff --git a/src/record.h b/src/record.h index cb8df1e..0f1f249 100644 --- a/src/record.h +++ b/src/record.h @@ -27,6 +27,7 @@ struct conf_service; union data_types { struct { const char *disp; + const char *norm; const char *sort; const char *snippet; } text; diff --git a/src/session.c b/src/session.c index 9e238a6..3dcf40f 100644 --- a/src/session.c +++ b/src/session.c @@ -1478,7 +1478,8 @@ void statistics(struct session *se, struct statistics *stat) } static struct record_metadata *record_metadata_init( - NMEM nmem, const char *value, enum conf_metadata_type type, + NMEM nmem, const char *value, const char *norm, + enum conf_metadata_type type, struct _xmlAttr *attr) { struct record_metadata *rec_md = record_metadata_create(nmem); @@ -1508,11 +1509,20 @@ static struct record_metadata *record_metadata_init( { case Metadata_type_generic: case Metadata_type_skiparticle: - if (strstr(value, "://")) /* looks like a URL */ + if (norm) + { rec_md->data.text.disp = nmem_strdup(nmem, value); + rec_md->data.text.norm = nmem_strdup(nmem, norm); + } else - rec_md->data.text.disp = - normalize7bit_generic(nmem_strdup(nmem, value), " ,/.:(["); + { + if (strstr(value, "://")) /* looks like a URL */ + rec_md->data.text.disp = nmem_strdup(nmem, value); + else + rec_md->data.text.disp = + normalize7bit_generic(nmem_strdup(nmem, value), " ,/.:(["); + rec_md->data.text.norm = rec_md->data.text.disp; + } rec_md->data.text.sort = 0; rec_md->data.text.snippet = 0; break; @@ -2068,6 +2078,20 @@ static int ingest_to_cluster(struct client *cl, if (!type) continue; + + md_field_id + = conf_service_metadata_field_id(service, (const char *) type); + if (md_field_id < 0) + { + if (se->number_of_warnings_unknown_metadata == 0) + { + session_log(se, YLOG_WARN, + "Ignoring unknown metadata element: %s", type); + } + se->number_of_warnings_unknown_metadata++; + continue; + } + wrbuf_rewind(wrbuf_disp); value0 = 
xmlNodeListGetString(xdoc, n->children, 1); if (!value0 || !*value0) @@ -2083,23 +2107,10 @@ static int ingest_to_cluster(struct client *cl, } if (value0) xmlFree(value0); - md_field_id - = conf_service_metadata_field_id(service, (const char *) type); - if (md_field_id < 0) - { - if (se->number_of_warnings_unknown_metadata == 0) - { - session_log(se, YLOG_WARN, - "Ignoring unknown metadata element: %s", type); - } - se->number_of_warnings_unknown_metadata++; - continue; - } - ser_md = &service->metadata[md_field_id]; // non-merged metadata - rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp), + rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp), 0, ser_md->type, n->properties); if (!rec_md) { @@ -2186,7 +2197,6 @@ static int ingest_to_cluster(struct client *cl, const char *type = 0; xmlChar *value0; - wrbuf_rewind(wrbuf_disp); type = yaz_xml_get_prop(n, "type"); if (!type) continue; @@ -2204,6 +2214,9 @@ static int ingest_to_cluster(struct client *cl, ser_sk = &service->sortkeys[sk_field_id]; } + wrbuf_rewind(wrbuf_disp); + wrbuf_rewind(wrbuf_norm); + value0 = xmlNodeListGetString(xdoc, n->children, 1); if (!value0 || !*value0) { @@ -2211,16 +2224,29 @@ static int ingest_to_cluster(struct client *cl, xmlFree(value0); continue; } - wrbuf_puts(wrbuf_disp, (const char *) value0); - xmlFree(value0); + if (ser_md->icurule) + { + run_icu(se, ser_md->icurule, (const char *) value0, + wrbuf_norm, wrbuf_disp); + yaz_log(YLOG_LOG, "run_icu input=%s norm=%s disp=%s", + (const char *) value0, + wrbuf_cstr(wrbuf_norm), wrbuf_cstr(wrbuf_disp)); + rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp), + wrbuf_cstr(wrbuf_norm), + ser_md->type, 0); + } + else + { + wrbuf_puts(wrbuf_disp, (const char *) value0); + rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp), + 0, + ser_md->type, 0); + } - // merged metadata - rec_md = record_metadata_init(se->nmem, wrbuf_cstr(wrbuf_disp), - ser_md->type, 0); + xmlFree(value0); // see if the field 
was not in cluster already (from beginning) - if (!rec_md) continue; @@ -2262,8 +2288,8 @@ static int ingest_to_cluster(struct client *cl, { while (*wheretoput) { - if (!strcmp((const char *) (*wheretoput)->data.text.disp, - rec_md->data.text.disp)) + if (!strcmp((const char *) (*wheretoput)->data.text.norm, + rec_md->data.text.norm)) break; wheretoput = &(*wheretoput)->next; } @@ -2273,8 +2299,8 @@ static int ingest_to_cluster(struct client *cl, else if (ser_md->merge == Metadata_merge_longest) { if (!*wheretoput - || strlen(rec_md->data.text.disp) - > strlen((*wheretoput)->data.text.disp)) + || strlen(rec_md->data.text.norm) + > strlen((*wheretoput)->data.text.norm)) { *wheretoput = rec_md; if (ser_sk) diff --git a/test/test_icu.cfg b/test/test_icu.cfg index e3c1e89..1d8cc9c 100644 --- a/test/test_icu.cfg +++ b/test/test_icu.cfg @@ -32,8 +32,9 @@ + [[:WhiteSpace:][,.!;]]* } [$] > ; - + @@ -56,7 +57,7 @@ - +