More work on sorting
author Adam Dickmeiss <adam@indexdata.dk>
Tue, 11 Oct 2011 13:25:40 +0000 (15:25 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Tue, 11 Oct 2011 13:25:40 +0000 (15:25 +0200)
A session now searches only once for each sort criterion. Each incoming
record is compared against all records already in its cluster, to avoid duplicates.

src/pazpar2_config.h
src/reclists.c
src/record.c
src/record.h
src/session.c
src/session.h
test/test_solr.urls
test/test_url.urls
test/test_url_8.res [new file with mode: 0644]

index cc072be..f3c346b 100644 (file)
@@ -30,7 +30,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
 enum conf_metadata_type {
     Metadata_type_generic,    // Generic text field
-    Metadata_type_number,     // A number
     Metadata_type_year,        // A number
     Metadata_type_date        // A number
 };
index 99c9c87..d063c17 100644 (file)
@@ -314,6 +314,16 @@ struct record_cluster *reclist_insert(struct reclist *l,
         if (!strcmp(merge_key, (*p)->record->merge_key))
         {
             struct record_cluster *existing = (*p)->record;
+            struct record *re = existing->records;
+
+            for (; re; re = re->next)
+            {
+                if (record_compare(record, re, service))
+                { 
+                    yaz_mutex_leave(l->mutex);
+                    return 0;
+                }
+            }
             record->next = existing->records;
             existing->records = record;
             cluster = existing;
index c43af42..874a0a2 100644 (file)
@@ -90,98 +90,41 @@ struct record_metadata * record_metadata_create(NMEM nmem)
 }
 
 
-struct record_metadata * record_metadata_insert(NMEM nmem, 
-                                                struct record_metadata ** rmd,
-                                                union data_types data)
+int record_compare(struct record *r1, struct record *r2,
+                   struct conf_service *service)
 {
-    struct record_metadata * tmp_rmd = 0;
-    // assert(nmem);
-
-    if(!rmd)
-        return 0;
-
-    // construct new record_metadata
-    tmp_rmd  = nmem_malloc(nmem, sizeof(struct record_metadata));
-    tmp_rmd->data = data;
-
-
-    // insert in *rmd's place, moving *rmd one down the list
-    tmp_rmd->next = *rmd;
-    *rmd = tmp_rmd;
-
-    return *rmd;
-}
-
-struct record_metadata * record_add_metadata_field_id(NMEM nmem, 
-                                                     struct record * record,
-                                                     int field_id, 
-                                                     union data_types data)
-{
-    if (field_id < 0 || !record || !record->metadata)
-        return 0;
-
-    return record_metadata_insert(nmem, &(record->metadata[field_id]), data);
-}
-
-
-struct record_metadata * record_add_metadata(NMEM nmem, 
-                                             struct record * record,
-                                             struct conf_service * service,
-                                             const char * name,
-                                             union data_types data)
-{
-    int field_id = 0;
-
-    if (!record || !record->metadata || !service || !name)  
-        return 0;
-    
-    field_id = conf_service_metadata_field_id(service, name);
-
-    if (-1 == field_id)
-        return 0;
-    
-    return record_metadata_insert(nmem, &(record->metadata[field_id]), data);
-}
-
-
-
-
-
-
-union data_types * record_assign_sortkey_field_id(NMEM nmem, 
-                                               struct record * record,
-                                               int field_id, 
-                                               union data_types data)
-{
-    if (field_id < 0 || !record || !record->sortkeys)
-        return 0;
-
-    return data_types_assign(nmem, &(record->sortkeys[field_id]), data);
-}
-
-
-
-union data_types * record_assign_sortkey(NMEM nmem, 
-                                      struct record * record,
-                                      struct conf_service * service,
-                                      const char * name,
-                                      union data_types data)
-{
-    int field_id = 0;
-
-    if (!record || !service || !name)  
-        return 0;
-    
-    field_id = conf_service_sortkey_field_id(service, name);
-
-    if (!(-1 < field_id) || !(field_id < service->num_sortkeys))
-        return 0;
-
-    return record_assign_sortkey_field_id(nmem, record, field_id, data);
+    int i;
+    for (i = 0; i < service->num_metadata; i++)
+    {
+        struct conf_metadata *ser_md = &service->metadata[i];
+        enum conf_metadata_type type = ser_md->type;
+            
+        struct record_metadata *m1 = r1->metadata[i];
+        struct record_metadata *m2 = r2->metadata[i];
+        while (m1 && m2)
+        {
+            switch (type)
+            {
+            case Metadata_type_generic:
+                if (strcmp(m1->data.text.disp, m2->data.text.disp))
+                    return 0;
+                break;
+            case Metadata_type_date:
+            case Metadata_type_year:
+                if (m1->data.number.min != m2->data.number.min ||
+                    m1->data.number.max != m2->data.number.max)
+                    return 0;
+                break;
+            }
+            m1 = m1->next;
+            m2 = m2->next;
+        }
+        if (m1 || m2)
+            return 0;
+    }
+    return 1;
 }
 
-
-
 /*
  * Local variables:
  * c-basic-offset: 4
index 3960f72..b7efc2e 100644 (file)
@@ -72,38 +72,7 @@ struct record * record_create(NMEM nmem, int num_metadata, int num_sortkeys,
 
 struct record_metadata * record_metadata_create(NMEM nmem);
 
-struct record_metadata * record_metadata_insert(NMEM nmem, 
-                                                struct record_metadata ** rmd,
-                                                union data_types data);
-
-
-struct record_metadata * record_add_metadata_field_id(NMEM nmem, 
-                                                      struct record * record,
-                                                      int field_id, 
-                                                      union data_types data);
-
-
-struct record_metadata * record_add_metadata(NMEM nmem, 
-                                             struct record * record,
-                                             struct conf_service * service,
-                                             const char * name,
-                                             union data_types data);
-
-
-union data_types * record_assign_sortkey_field_id(NMEM nmem, 
-                                               struct record * record,
-                                               int field_id, 
-                                               union data_types data);
-
-
-union data_types * record_assign_sortkey(NMEM nmem, 
-                                      struct record * record,
-                                      struct conf_service * service,
-                                      const char * name,
-                                      union data_types data);
-
-
-
+int record_compare(struct record *r1, struct record *r2, struct conf_service *service);
 
 struct record_cluster
 {
@@ -120,9 +89,6 @@ struct record_cluster
     struct record *records;
 };
 
-
-
-
 #endif // RECORD_H
 
 /*
index d18d6aa..c9628e4 100644 (file)
@@ -95,6 +95,12 @@ struct client_list {
     struct client_list *next;
 };
 
+struct session_sorted_results {
+    const char *field;
+    int increasing;
+    struct session_sorted_results *next;
+};
+
 /* session counting (1) , disable client counting (0) */
 static YAZ_MUTEX g_session_mutex = 0;
 static int no_sessions = 0;
@@ -593,10 +599,32 @@ int session_is_preferred_clients_ready(struct session *s)
 
 void search_sort(struct session *se, const char *field, int increasing)
 {
+    struct session_sorted_results *sr;
     struct client_list *l;
-    struct timeval tval;
 
     session_enter(se);
+
+    /* see if we have already sorted by this criterion */
+    for (sr = se->sorted_results; sr; sr = sr->next)
+    {
+        if (!strcmp(field, sr->field) && increasing == sr->increasing)
+            break;
+    }
+    if (sr)
+    {
+        yaz_log(YLOG_LOG, "search_sort: field=%s increasing=%d already fetched",
+                field, increasing);
+        session_leave(se);
+        return;
+    }
+    yaz_log(YLOG_LOG, "search_sort: field=%s increasing=%d must fetch",
+            field, increasing);
+    sr = nmem_malloc(se->nmem, sizeof(*sr));
+    sr->field = nmem_strdup(se->nmem, field);
+    sr->increasing = increasing;
+    sr->next = se->sorted_results;
+    se->sorted_results = sr;
+    
     for (l = se->clients; l; l = l->next)
     {
         struct client *cl = l->client;
@@ -619,9 +647,10 @@ void search_sort(struct session *se, const char *field, int increasing)
                 break;
             }
         }
-
+        
         if (strategy_plus_sort)
         {
+            struct timeval tval;
             if (client_prep_connection(cl, se->service->z3950_operation_timeout,
                                        se->service->z3950_session_timeout,
                                        se->service->server->iochan_man,
@@ -630,7 +659,7 @@ void search_sort(struct session *se, const char *field, int increasing)
                 char **array;
                 int num;
                 nmem_strsplit(se->nmem, ":", strategy_plus_sort, &array, &num);
-
+                
                 if (num == 2)
                 {
                     const char *sort_spec = array[1];
@@ -671,6 +700,13 @@ enum pazpar2_error_code search(struct session *se,
     nmem_reset(se->nmem);
     se->total_records = se->total_merged = 0;
     se->num_termlists = 0;
+
+    /* reset the list of sorted results so it holds only the relevance sort */
+    se->sorted_results = nmem_malloc(se->nmem, sizeof(*se->sorted_results));
+    se->sorted_results->field = nmem_strdup(se->nmem, "relevance");
+    se->sorted_results->increasing = 0;
+    se->sorted_results->next = 0;
+    
     live_channels = select_targets(se, filter);
     if (!live_channels)
     {
@@ -1481,7 +1517,7 @@ int ingest_record(struct client *cl, const char *rec,
     }
     session_enter(se);
     if (client_get_session(cl) == se)
-        ret = ingest_to_cluster(cl, xdoc, root, record_no, mergekey_norm);
+        ingest_to_cluster(cl, xdoc, root, record_no, mergekey_norm);
     session_leave(se);
     
     xmlFreeDoc(xdoc);
@@ -1497,25 +1533,81 @@ static int ingest_to_cluster(struct client *cl,
     xmlNode *n;
     xmlChar *type = 0;
     xmlChar *value = 0;
-    struct session_database *sdb = client_get_database(cl);
     struct session *se = client_get_session(cl);
     struct conf_service *service = se->service;
     struct record *record = record_create(se->nmem, 
                                           service->num_metadata,
                                           service->num_sortkeys, cl,
                                           record_no);
+
+    for (n = root->children; n; n = n->next)
+    {
+        if (type)
+            xmlFree(type);
+        if (value)
+            xmlFree(value);
+        type = value = 0;
+        
+        if (n->type != XML_ELEMENT_NODE)
+            continue;
+        if (!strcmp((const char *) n->name, "metadata"))
+        {
+            struct conf_metadata *ser_md = 0;
+            struct record_metadata **wheretoput = 0;
+            struct record_metadata *rec_md = 0;
+            int md_field_id = -1;
+            
+            type = xmlGetProp(n, (xmlChar *) "type");
+            value = xmlNodeListGetString(xdoc, n->children, 1);
+            
+            if (!type || !value || !*value)
+                continue;
+            
+            md_field_id 
+                = conf_service_metadata_field_id(service, (const char *) type);
+            if (md_field_id < 0)
+            {
+                if (se->number_of_warnings_unknown_metadata == 0)
+                {
+                    session_log(se, YLOG_WARN, 
+                            "Ignoring unknown metadata element: %s", type);
+                }
+                se->number_of_warnings_unknown_metadata++;
+                continue;
+            }
+           
+            ser_md = &service->metadata[md_field_id];
+
+            // non-merged metadata
+            rec_md = record_metadata_init(se->nmem, (const char *) value,
+                                          ser_md->type, n->properties);
+            if (!rec_md)
+            {
+                session_log(se, YLOG_WARN, "bad metadata data '%s' "
+                            "for element '%s'", value, type);
+                continue;
+            }
+            wheretoput = &record->metadata[md_field_id];
+            while (*wheretoput)
+                wheretoput = &(*wheretoput)->next;
+            *wheretoput = rec_md;
+        }
+    }
+
     struct record_cluster *cluster = reclist_insert(se->reclist,
                                                     service, 
                                                     record,
                                                     mergekey_norm,
                                                     &se->total_merged);
+    if (!cluster)
+        return -1;
 
-    const char *use_term_factor_str = session_setting_oneval(sdb, PZ_TERMLIST_TERM_FACTOR);
-    int use_term_factor = 0;
-    int term_factor = 1; 
-    if (use_term_factor_str && use_term_factor_str[0] != 0)
-       use_term_factor =  atoi(use_term_factor_str);
-    if (use_term_factor) {
+    struct session_database *sdb = client_get_database(cl);
+    int term_factor = 1;
+    const char *use_term_factor_str =
+        session_setting_oneval(sdb, PZ_TERMLIST_TERM_FACTOR);
+    if (use_term_factor_str && use_term_factor_str[0] == '1')
+    {
         int maxrecs = client_get_maxrecs(cl);
         int hits = (int) client_get_hits(cl);
         term_factor = MAX(hits, maxrecs) /  MAX(1, maxrecs);
@@ -1523,11 +1615,11 @@ static int ingest_to_cluster(struct client *cl,
         yaz_log(YLOG_DEBUG, "Using term factor: %d (%d / %d)", term_factor, MAX(hits, maxrecs), MAX(1, maxrecs));
     }
 
-    if (!cluster)
-        return -1;
     if (global_parameters.dump_records)
         session_log(se, YLOG_LOG, "Cluster id %s from %s (#%d)", cluster->recid,
                     sdb->database->id, record_no);
+
+
     relevance_newrec(se->relevance, cluster);
     
     // now parsing XML record and adding data to cluster or record metadata
@@ -1560,37 +1652,16 @@ static int ingest_to_cluster(struct client *cl,
             md_field_id 
                 = conf_service_metadata_field_id(service, (const char *) type);
             if (md_field_id < 0)
-            {
-                if (se->number_of_warnings_unknown_metadata == 0)
-                {
-                    session_log(se, YLOG_WARN, 
-                            "Ignoring unknown metadata element: %s", type);
-                }
-                se->number_of_warnings_unknown_metadata++;
                 continue;
-            }
             
             ser_md = &service->metadata[md_field_id];
             
-            if (ser_md->sortkey_offset >= 0){
+            if (ser_md->sortkey_offset >= 0)
+            {
                 sk_field_id = ser_md->sortkey_offset;
                 ser_sk = &service->sortkeys[sk_field_id];
             }
 
-            // non-merged metadata
-            rec_md = record_metadata_init(se->nmem, (const char *) value,
-                                          ser_md->type, n->properties);
-            if (!rec_md)
-            {
-                session_log(se, YLOG_WARN, "bad metadata data '%s' "
-                            "for element '%s'", value, type);
-                continue;
-            }
-            wheretoput = &record->metadata[md_field_id];
-            while (*wheretoput)
-                wheretoput = &(*wheretoput)->next;
-            *wheretoput = rec_md;
-
             // merged metadata
             rec_md = record_metadata_init(se->nmem, (const char *) value,
                                           ser_md->type, 0);
index d958eaf..b839ef2 100644 (file)
@@ -114,6 +114,7 @@ struct session {
     normalize_cache_t normalize_cache;
     YAZ_MUTEX session_mutex;
     unsigned session_id;
+    struct session_sorted_results *sorted_results;
 };
 
 struct statistics {
index 3daef61..4518018 100644 (file)
@@ -1,5 +1,5 @@
 http://localhost:9763/search.pz2?command=init&clear=1
 http://localhost:9763/search.pz2?session=1&command=settings&pz%3Atermlist_term_count%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=10&use_url_proxy%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=0&pz%3Apiggyback%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1&pz%3Apreferred%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1&pz%3Acclmap%3Asu%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dsubject&pz%3Asru%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=solr&use_thumbnails%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=0&pz%3Acclmap%3Adate%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Ddate&medium%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=web&pz%3Aname%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=ocs_test&pz%3Acclmap%3Aissn%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=u%3D8&pz%3Acclmap%3Ati%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dtitle&pz%3Acclmap%3Aau%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dauthor&pz%3Axslt%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=solr-pz2.xsl&pz%3Acclmap%3Aterm%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Dtext+s%3Dal&pz%3Acclmap%3Aisbn%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=1%3Disbn&pz%3Aqueryencoding%5Bocs-test.indexdata.com%2Fsolr%2Fselect%5D=UTF-8
 http://localhost:9763/search.pz2?session=1&command=search&query=water
-1 http://localhost:9763/search.pz2?session=1&command=show&block=preferred
+2 http://localhost:9763/search.pz2?session=1&command=show&block=preferred
 http://localhost:9763/search.pz2?session=1&command=termlist&name=xtargets%2Csubject%2Cauthor%2Cdate%2Cmedium
index eab827f..b497907 100644 (file)
@@ -5,3 +5,4 @@ http://localhost:9763/search.pz2?session=1&command=settings&pz:url%5Bmy%5D=z3950
 http://localhost:9763/search.pz2?session=1&command=search&query=computer
 2 http://localhost:9763/search.pz2?session=1&command=show&block=1
 2 http://localhost:9763/search.pz2?session=1&command=show&block=1&sort=title:1
+1 http://localhost:9763/search.pz2?session=1&command=show&block=1&sort=title:0
diff --git a/test/test_url_8.res b/test/test_url_8.res
new file mode 100644 (file)
index 0000000..11e41c6
--- /dev/null
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<show><status>OK</status>
+<activeclients>0</activeclients>
+<merged>3</merged>
+<total>3</total>
+<start>0</start>
+<num>3</num>
+<hit>
+
+<md-title>OIL/GAS DRILLING</md-title>
+<md-description>This database contains information on oil and gas drilling such as well name, operator, driller, location, depth, copies of logs run, permits, samples (cuttings, core), completion records</md-description><location id="my" name="marcserver">
+<md-title>OIL/GAS DRILLING</md-title>
+<md-description tag="520">This database contains information on oil and gas drilling such as well name, operator, driller, location, depth, copies of logs run, permits, samples (cuttings, core), completion records</md-description>
+<md-description tag="513">1907-PRESENT</md-description></location>
+<recid>title oil gas drilling author medium book</recid>
+</hit>
+<hit>
+
+<md-title>GROUNDWATER RESOURCE MAPS - COUNTY SERIES</md-title>
+<md-description>A series of 1:250,000 scale maps showing well yield, well depth, and depth to bedrock for a large number of bedrock wells inventoried by the Maine Geological Survey in the mid-to late 1970&apos;s comprises this data set.  Some series also show bedrock topography and potentiometric surface.  Geographic coverage is restricted to Southern Maine</md-description><location id="my" name="marcserver">
+<md-title>GROUNDWATER RESOURCE MAPS - COUNTY SERIES</md-title>
+<md-description tag="520">A series of 1:250,000 scale maps showing well yield, well depth, and depth to bedrock for a large number of bedrock wells inventoried by the Maine Geological Survey in the mid-to late 1970&apos;s comprises this data set.  Some series also show bedrock topography and potentiometric surface.  Geographic coverage is restricted to Southern Maine</md-description>
+<md-description tag="513">1972-1978</md-description></location>
+<recid>title groundwater resource maps county series author medium book</recid>
+</hit>
+<hit>
+
+<md-title>BIBLIOGRAPHY OF MAINE GEOLOGY</md-title>
+<md-description>This data base is a computer based bibliography of marine geology.  It allows searching by topic and geographic location, similar to GEOREF.  It is currently under development to replace the printed Bibliography of Marine Geology</md-description><location id="my" name="marcserver">
+<md-title>BIBLIOGRAPHY OF MAINE GEOLOGY</md-title>
+<md-description tag="520">This data base is a computer based bibliography of marine geology.  It allows searching by topic and geographic location, similar to GEOREF.  It is currently under development to replace the printed Bibliography of Marine Geology</md-description>
+<md-description tag="513">1692-PRESENT</md-description></location>
+<recid>title bibliography of maine geology author medium book</recid>
+</hit>
+</show>
\ No newline at end of file