Conf pz:limitmap may perform local filtering
authorAdam Dickmeiss <adam@indexdata.dk>
Tue, 8 Nov 2011 11:07:05 +0000 (12:07 +0100)
committerAdam Dickmeiss <adam@indexdata.dk>
Tue, 8 Nov 2011 11:08:13 +0000 (12:08 +0100)
If pz:limitmap's leading string is 'local:', Pazpar2 will perform
local filtering for the search commands' limit parameter. Add tests
for local filtering on subject and date.

doc/pazpar2_conf.xml
src/client.c
src/client.h
src/facet_limit.c
src/facet_limit.h
src/session.c
test/test_http.urls
test/test_http_73.res [new file with mode: 0644]
test/test_http_74.res [new file with mode: 0644]
test/test_http_75.res [new file with mode: 0644]
test/test_http_76.res [new file with mode: 0644]

index dcc1fdf..f41070c 100644 (file)
      <listitem>
       <para>
        Specifies attributes for limiting a search to a field - using
-       the limit parameter for search. In some cases the mapping of 
+       the limit parameter for search. It can be used to filter locally
+       or remotely (search in a target). In some cases the mapping of 
        a field to a value is identical to an existing cclmap field; in
        other cases the field must be specified in a different way - for
        example to match a complete field (rather than parts of a subfield).
       </para>
       <para>
-       The value of limitmap may have one of two forms: referral to
-       an exisiting CCL field or a raw PQF string. Leading string
-       determines type; either <literal>ccl:</literal> for CCL field or
-       <literal>rpn:</literal> for PQF/RPN.
+       The value of limitmap may have one of three forms: referral to
+       an existing CCL field, a raw PQF string or a local limit. Leading string
+       determines type; either <literal>ccl:</literal> for CCL field, 
+       <literal>rpn:</literal> for PQF/RPN, or <literal>local:</literal>
+       for filtering in Pazpar2.
       </para>
       <note>
        <para>
        The limitmap facility is supported for Pazpar2 version 1.6.0.
+       Local filtering is supported in Pazpar2 1.6.6.
        </para>
       </note>
      </listitem>
index 0ae3641..9e0f903 100644 (file)
@@ -121,6 +121,7 @@ struct client {
     YAZ_MUTEX mutex;
     int ref_count;
     char *id;
+    facet_limits_t facet_limits;
 };
 
 struct suggestions {
@@ -827,6 +828,7 @@ struct client *client_create(const char *id)
     pazpar2_mutex_create(&cl->mutex, "client");
     cl->preferred = 0;
     cl->ref_count = 1;
+    cl->facet_limits = 0;
     assert(id);
     cl->id = xstrdup(id);
     client_use(1);
@@ -865,6 +867,7 @@ int client_destroy(struct client *c)
             c->cqlquery = 0;
             xfree(c->id);
             assert(!c->connection);
+            facet_limits_destroy(c->facet_limits);
 
             if (c->resultset)
             {
@@ -988,6 +991,33 @@ static char *make_solrquery(struct client *cl)
     return r;
 }
 
+const char *client_get_facet_limit_local(struct client *cl,
+                                         struct session_database *sdb,
+                                         int *l,
+                                         NMEM nmem, int *num, char ***values)
+{
+    const char *name = 0;
+    const char *value = 0;
+    for (; (name = facet_limits_get(cl->facet_limits, *l, &value)); (*l)++)
+    {
+        struct setting *s = 0;
+        
+        for (s = sdb->settings[PZ_LIMITMAP]; s; s = s->next)
+        {
+            const char *p = strchr(s->name + 3, ':');
+            if (p && !strcmp(p + 1, name) && s->value &&
+                !strncmp(s->value, "local:", 6))
+            {
+                nmem_strsplit_escape2(nmem, "|", value, values,
+                                      num, 1, '\\', 1);
+                (*l)++;
+                return name;
+            }
+        }
+    }
+    return 0;
+}
+
 static int apply_limit(struct session_database *sdb,
                        facet_limits_t facet_limits,
                        WRBUF w_pqf, WRBUF w_ccl)
@@ -1112,6 +1142,9 @@ int client_parse_query(struct client *cl, const char *query,
     if (apply_limit(sdb, facet_limits, w_pqf, w_ccl))
         return -2;
 
+    facet_limits_destroy(cl->facet_limits);
+    cl->facet_limits = facet_limits_dup(facet_limits);
+
     yaz_log(YLOG_LOG, "CCL query: %s", wrbuf_cstr(w_ccl));
     cn = ccl_find_str(ccl_map, wrbuf_cstr(w_ccl), &cerror, &cpos);
     ccl_qual_rm(&ccl_map);
index 97e8b68..0b1d413 100644 (file)
@@ -103,8 +103,10 @@ void client_unlock(struct client *c);
 int client_has_facet(struct client *cl, const char *name);
 void client_check_preferred_watch(struct client *cl);
 void client_reingest(struct client *cl);
-
-
+const char *client_get_facet_limit_local(struct client *cl,
+                                         struct session_database *sdb,
+                                         int *l,
+                                         NMEM nmem, int *num, char ***values);
 #endif
 
 /*
index 44bcff0..92d7eaf 100644 (file)
@@ -39,6 +39,28 @@ struct facet_limits {
     char **darray;
 };
 
+facet_limits_t facet_limits_dup(facet_limits_t fl)
+{
+    int i;
+    NMEM nmem = nmem_create();
+    facet_limits_t fn = nmem_malloc(nmem, sizeof(*fn));
+    fn->nmem = nmem;
+    fn->num = fl->num;
+    fn->darray = 0;
+    if (fl->num)
+    {
+        fn->darray = nmem_malloc(nmem, fn->num * sizeof(*fn->darray));
+        for (i = 0; i < fn->num; i++)
+        {
+            const char *src = fl->darray[i];
+            size_t sz = strlen(src) + 2 + strlen(src + strlen(src) + 1);
+            fn->darray[i] = nmem_malloc(nmem, sz);
+            memcpy(fn->darray[i], src, sz);
+        }
+    }
+    return fn;
+}
+
 facet_limits_t facet_limits_create(const char *param)
 {
     int i;
index 60b8e9a..60ce8ce 100644 (file)
@@ -32,6 +32,8 @@ const char *facet_limits_get(facet_limits_t fl, int idx, const char **value);
 
 void facet_limits_destroy(facet_limits_t fl);
 
+facet_limits_t facet_limits_dup(facet_limits_t fl);
+
 #endif
 
 /*
index ac41b36..5d34aa4 100644 (file)
@@ -165,13 +165,14 @@ static void session_leave(struct session *s)
     yaz_mutex_leave(s->session_mutex);
 }
 
-void add_facet(struct session *s, const char *type, const char *value, int count)
+static void session_normalize_facet(struct session *s, const char *type,
+                                    const char *value,
+                                    WRBUF display_wrbuf,
+                                    WRBUF facet_wrbuf)
 {
     struct conf_service *service = s->service;
     pp2_charset_token_t prt;
     const char *facet_component;
-    WRBUF facet_wrbuf = wrbuf_alloc();
-    WRBUF display_wrbuf = wrbuf_alloc();
     int i;
     const char *icu_chain_id = 0;
 
@@ -208,6 +209,14 @@ void add_facet(struct session *s, const char *type, const char *value, int count
         }
     }
     pp2_charset_token_destroy(prt);
+}
+
+void add_facet(struct session *s, const char *type, const char *value, int count)
+{
+    WRBUF facet_wrbuf = wrbuf_alloc();
+    WRBUF display_wrbuf = wrbuf_alloc();
+
+    session_normalize_facet(s, type, value, display_wrbuf, facet_wrbuf);
  
     if (wrbuf_len(facet_wrbuf))
     {
@@ -1564,13 +1573,83 @@ int ingest_record(struct client *cl, const char *rec,
     }
     session_enter(se);
     if (client_get_session(cl) == se)
-        ingest_to_cluster(cl, xdoc, root, record_no, mergekey_norm);
+        ret = ingest_to_cluster(cl, xdoc, root, record_no, mergekey_norm);
     session_leave(se);
     
     xmlFreeDoc(xdoc);
     return ret;
 }
 
+static int check_limit_local(struct client *cl,
+                             struct record *record)
+{
+    int skip_record = 0;
+    struct session *se = client_get_session(cl);
+    struct conf_service *service = se->service;
+    NMEM nmem_tmp = nmem_create();
+    struct session_database *sdb = client_get_database(cl);
+    int l = 0;
+    while (!skip_record)
+    {
+        struct conf_metadata *ser_md = 0;
+        struct record_metadata *rec_md = 0;
+        int md_field_id;
+        char **values = 0;
+        int i, num_v = 0;
+        
+        const char *name =
+            client_get_facet_limit_local(cl, sdb, &l, nmem_tmp, &num_v,
+                                         &values);
+        if (!name)
+            break;
+        
+        md_field_id = conf_service_metadata_field_id(service, name);
+        if (md_field_id < 0)
+        {
+            skip_record = 1;
+            break;
+        }
+        ser_md = &service->metadata[md_field_id];
+        rec_md = record->metadata[md_field_id];
+        for (i = 0; i < num_v; )
+        {
+            if (rec_md)
+            {
+                if (ser_md->type == Metadata_type_year 
+                    || ser_md->type == Metadata_type_date)
+                {
+                    int y = atoi(values[i]);
+                    if (y >= rec_md->data.number.min 
+                        && y <= rec_md->data.number.max)
+                        break;
+                }
+                else
+                {
+                    yaz_log(YLOG_LOG, "cmp: '%s' '%s'",
+                            rec_md->data.text.disp, values[i]);
+                    if (!strcmp(rec_md->data.text.disp, values[i]))
+                    {
+                        break;
+                    }
+                }
+                rec_md = rec_md->next;
+            }
+            else
+            {
+                rec_md = record->metadata[md_field_id];
+                i++;
+            }
+        }
+        if (i == num_v)
+        {
+            skip_record = 1;
+            break;
+        }
+    }
+    nmem_destroy(nmem_tmp);
+    return skip_record;
+}
+                             
 static int ingest_to_cluster(struct client *cl,
                              xmlDoc *xdoc,
                              xmlNode *root,
@@ -1644,6 +1723,16 @@ static int ingest_to_cluster(struct client *cl,
         }
     }
 
+    if (check_limit_local(cl, record))
+    {
+        session_log(se, YLOG_LOG, "Facet filtered out record no %d from %s",
+                    record_no, sdb->database->id);
+        if (type)
+            xmlFree(type);
+        if (value)
+            xmlFree(value);
+        return -2;
+    }
     cluster = reclist_insert(se->reclist, service, record,
                              mergekey_norm, &se->total_merged);
     if (!cluster)
index ef1dbf0..de45220 100644 (file)
@@ -55,18 +55,22 @@ http://localhost:9763/search.pz2?session=8&command=search&query=xyzzyz
 2 http://localhost:9763/search.pz2?session=8&command=show&block=1
 http://localhost:9763/search.pz2?session=8&command=search&query=a+and
 1 http://localhost:9763/search.pz2?session=8&command=show&block=1
-http://localhost:9763/search.pz2?command=init&pz:limitmap:author%5Bz3950.indexdata.com%2Fmarc%5D=ccl:author_phrase
-1 http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3dadam\,+james%7Cother_author
+http://localhost:9763/search.pz2?command=init&pz:limitmap:author%5Bz3950.indexdata.com%2Fmarc%5D=ccl:author_phrase&pz:limitmap:subject%5Bz3950.indexdata.com%2fmarc%5D=local:&pz:limitmap:date%5Bz3950.indexdata.com%2fmarc%5D=local:
+1 http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3Dadam\,+james%7Cother_author
 1 http://localhost:9763/search.pz2?session=9&command=show&block=1
-http://localhost:9763/search.pz2?session=9&command=settings&pz:limitmap:author%5Bz3950.indexdata.com%2Fmarc%5D=rpn:%40attr+1%3d1003+%40attr+6%3d3
-1 http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3dadam\,+james%7Cother_author
+http://localhost:9763/search.pz2?session=9&command=settings&pz:limitmap:author%5Bz3950.indexdata.com%2Fmarc%5D=rpn:%40attr+1%3D1003+%40attr+6%3D3
+1 http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3Dadam\,+james%7Cother_author
 1 http://localhost:9763/search.pz2?session=9&command=show&block=1
-http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3dadam\,+james%7Cother_author&filter=pz%3Aid%3Dz3950.indexdata.com%2Fmarc
+http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3Dadam\,+james%7Cother_author&filter=pz%3Aid%3Dz3950.indexdata.com%2Fmarc
 http://localhost:9763/search.pz2?session=9&command=bytarget
 http://localhost:9763/search.pz2?session=9&command=show
-http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3dadam\,+james%7Cother_author&filter=pz%3Aid%3Dnone
+http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3Dadam\,+james%7Cother_author&filter=pz%3Aid%3Dnone
 http://localhost:9763/search.pz2?session=9&command=bytarget
 http://localhost:9763/search.pz2?session=9&command=show
-http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3dadam\,+james%7Cother_author
+http://localhost:9763/search.pz2?session=9&command=search&query=greece&limit=author%3Dadam\,+james%7Cother_author
 http://localhost:9763/search.pz2?session=9&command=bytarget
 http://localhost:9763/search.pz2?session=9&command=show
+http://localhost:9763/search.pz2?session=9&command=search&query=computer&limit=subject%3DRailroads
+http://localhost:9763/search.pz2?session=9&command=show&block=1
+http://localhost:9763/search.pz2?session=9&command=search&query=computer&limit=date%3D1977
+http://localhost:9763/search.pz2?session=9&command=show&block=1
diff --git a/test/test_http_73.res b/test/test_http_73.res
new file mode 100644 (file)
index 0000000..ab63fe6
--- /dev/null
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<search><status>OK</status></search>
\ No newline at end of file
diff --git a/test/test_http_74.res b/test/test_http_74.res
new file mode 100644 (file)
index 0000000..64e8673
--- /dev/null
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<show><status>OK</status>
+<activeclients>0</activeclients>
+<merged>1</merged>
+<total>10</total>
+<start>0</start>
+<num>1</num>
+<hit>
+
+<md-title>Washington metropolitan area rail computer feasibility study;</md-title>
+<md-title-remainder>final report</md-title-remainder>
+<md-date>1971</md-date>
+<md-author>Englund, Carl R</md-author>
+<md-subject>Railroads</md-subject>
+<md-description>&quot;Contract DOT-UT-10003.&quot;</md-description><location id="z3950.indexdata.com/marc" name="Index Data MARC test server">
+<md-title>Washington metropolitan area rail computer feasibility study;</md-title>
+<md-title-remainder>final report</md-title-remainder>
+<md-date>1971</md-date>
+<md-author>Englund, Carl R</md-author>
+<md-subject>Railroads</md-subject>
+<md-description tag="500">&quot;Contract DOT-UT-10003.&quot;</md-description>
+<md-test-usersetting>XXXXXXXXXX</md-test-usersetting>
+<md-test-usersetting-2>test-usersetting-2 data: 
+        YYYYYYYYY</md-test-usersetting-2></location>
+<relevance>85714</relevance>
+<recid>content: title washington metropolitan area rail computer feasibility study author englund carl r medium book</recid>
+</hit>
+</show>
\ No newline at end of file
diff --git a/test/test_http_75.res b/test/test_http_75.res
new file mode 100644 (file)
index 0000000..ab63fe6
--- /dev/null
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<search><status>OK</status></search>
\ No newline at end of file
diff --git a/test/test_http_76.res b/test/test_http_76.res
new file mode 100644 (file)
index 0000000..ba0f848
--- /dev/null
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<show><status>OK</status>
+<activeclients>0</activeclients>
+<merged>4</merged>
+<total>10</total>
+<start>0</start>
+<num>4</num>
+<hit>
+
+<md-title>Computer science &amp; technology</md-title>
+<md-title-remainder>proceedings of a workshop held at the National Bureau of Standards, Gaithersburg, MD, June 3-4, 1976</md-title-remainder>
+<md-date>1977</md-date>
+<md-subject>Optical pattern recognition</md-subject><location id="z3950.indexdata.com/marc" name="Index Data MARC test server">
+<md-title>Computer science &amp; technology</md-title>
+<md-title-remainder>proceedings of a workshop held at the National Bureau of Standards, Gaithersburg, MD, June 3-4, 1976</md-title-remainder>
+<md-date>1977</md-date>
+<md-subject>Optical pattern recognition</md-subject>
+<md-test-usersetting>XXXXXXXXXX</md-test-usersetting>
+<md-test-usersetting-2>test-usersetting-2 data: 
+        YYYYYYYYY</md-test-usersetting-2></location>
+<relevance>57536</relevance>
+<recid>content: title computer science technology author medium book</recid>
+</hit>
+<hit>
+
+<md-title>The Computer Bible</md-title>
+<md-date>1973-1980</md-date>
+<md-subject>Bible. O.T</md-subject>
+<md-subject>Bible</md-subject>
+<md-description>Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates</md-description><location id="z3950.indexdata.com/marc" name="Index Data MARC test server">
+<md-title>The Computer Bible</md-title>
+<md-date>1973-1980</md-date>
+<md-subject>Bible. O.T</md-subject>
+<md-subject>Bible</md-subject>
+<md-description tag="500">Hebrew and Greek; introductions in English</md-description>
+<md-description tag="500">Vols. 2, 8: Missoula, Mont. : Published by Scholars Press for Biblical Research Associates</md-description>
+<md-test-usersetting>XXXXXXXXXX</md-test-usersetting>
+<md-test-usersetting-2>test-usersetting-2 data: 
+        YYYYYYYYY</md-test-usersetting-2></location>
+<relevance>57536</relevance>
+<recid>content: title the computer bible author medium book</recid>
+</hit>
+<hit>
+
+<md-title>The use of passwords for controlled access to computer resources</md-title>
+<md-date>1977</md-date>
+<md-author>Wood, Helen M</md-author>
+<md-subject>Computers</md-subject><location id="z3950.indexdata.com/marc" name="Index Data MARC test server">
+<md-title>The use of passwords for controlled access to computer resources</md-title>
+<md-date>1977</md-date>
+<md-author>Wood, Helen M</md-author>
+<md-subject>Computers</md-subject>
+<md-test-usersetting>XXXXXXXXXX</md-test-usersetting>
+<md-test-usersetting-2>test-usersetting-2 data: 
+        YYYYYYYYY</md-test-usersetting-2></location>
+<relevance>17260</relevance>
+<recid>content: title the use of passwords for controlled access to computer resources author wood helen m medium book</recid>
+</hit>
+<hit>
+
+<md-title>Reconstruction tomography in diagnostic radiology and nuclear medicine</md-title>
+<md-title-remainder>proceedings of the workshop</md-title-remainder>
+<md-date>1977</md-date>
+<md-subject>Tomography</md-subject>
+<md-description>Includes bibliographical references and index</md-description><location id="z3950.indexdata.com/marc" name="Index Data MARC test server">
+<md-title>Reconstruction tomography in diagnostic radiology and nuclear medicine</md-title>
+<md-title-remainder>proceedings of the workshop</md-title-remainder>
+<md-date>1977</md-date>
+<md-subject>Tomography</md-subject>
+<md-description tag="504">Includes bibliographical references and index</md-description>
+<md-test-usersetting>XXXXXXXXXX</md-test-usersetting>
+<md-test-usersetting-2>test-usersetting-2 data: 
+        YYYYYYYYY</md-test-usersetting-2></location>
+<relevance>0</relevance>
+<recid>content: title reconstruction tomography in diagnostic radiology and nuclear medicine author medium book</recid>
+</hit>
+</show>
\ No newline at end of file