Merge remote-tracking branch 'origin/master' into ranking-h
[pazpar2-moved-to-github.git] / src / relevance.c
1 /* This file is part of Pazpar2.
2    Copyright (C) 2006-2013 Index Data
3
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <math.h>
26 #include <stdlib.h>
27
28 #include "pazpar2_config.h"
29 #include "relevance.h"
30 #include "session.h"
31 #include "client.h"
32
33 #ifdef WIN32
34 #define log2(x) (log(x)/log(2))
35 #endif
36
37 struct relevance
38 {
39     int *doc_frequency_vec;
40     int *term_frequency_vec_tmp;
41     int *term_pos;
42     int vec_len;
43     struct word_entry *entries;
44     pp2_charset_token_t prt;
45     int rank_cluster;
46     double follow_factor;
47     double lead_decay;
48     int length_divide;
49     NMEM nmem;
50     struct normalizing *norm;
51 };
52
53 // Structure to keep data for normalizing scores from one client
54 struct normalizing
55 {
56     int num;
57     float sum;
58     float max;
59     int count;
60     struct client *client;
61     struct normalizing *next;
62 };
63
64 struct word_entry {
65     const char *norm_str;
66     const char *display_str;
67     int termno;
68     char *ccl_field;
69     struct word_entry *next;
70 };
71
72 // Find the normalizing entry for this client, or create one if not there
73 struct normalizing *findnorm( struct relevance *rel, struct client* client)
74 {
75     struct normalizing *n = rel->norm;
76     while (n) {
77         if (n->client == client )
78             return n;
79         n = n->next;
80     }
81     n = nmem_malloc(rel->nmem, sizeof(struct normalizing) );
82     if ( rel->norm )
83         n->num = rel->norm->num +1;
84     else
85         n->num = 1;
86     n->sum = 0.0;
87     n->count = 0;
88     n->max = 0.0;
89     n->client = client;
90     n->next = rel->norm;
91     rel->norm = n;
92     return n;
93 }
94
95 static struct word_entry *word_entry_match(struct relevance *r,
96                                            const char *norm_str,
97                                            const char *rank, int *weight)
98 {
99     int i = 1;
100     struct word_entry *entries = r->entries;
101     for (; entries; entries = entries->next, i++)
102     {
103         if (*norm_str && !strcmp(norm_str, entries->norm_str))
104         {
105             const char *cp = 0;
106             int no_read = 0;
107             sscanf(rank, "%d%n", weight, &no_read);
108             rank += no_read;
109             while (*rank == ' ')
110                 rank++;
111             if (no_read > 0 && (cp = strchr(rank, ' ')))
112             {
113                 if ((cp - rank) == strlen(entries->ccl_field) &&
114                     memcmp(entries->ccl_field, rank, cp - rank) == 0)
115                     *weight = atoi(cp + 1);
116             }
117             return entries;
118         }
119     }
120     return 0;
121 }
122
123 int relevance_snippet(struct relevance *r,
124                       const char *words, const char *name,
125                       WRBUF w_snippet)
126 {
127     int no = 0;
128     const char *norm_str;
129     int highlight = 0;
130
131     pp2_charset_token_first(r->prt, words, 0);
132     while ((norm_str = pp2_charset_token_next(r->prt)))
133     {
134         size_t org_start, org_len;
135         struct word_entry *entries = r->entries;
136         int i;
137
138         pp2_get_org(r->prt, &org_start, &org_len);
139         for (; entries; entries = entries->next, i++)
140         {
141             if (*norm_str && !strcmp(norm_str, entries->norm_str))
142                 break;
143         }
144         if (entries)
145         {
146             if (!highlight)
147             {
148                 highlight = 1;
149                 wrbuf_puts(w_snippet, "<match>");
150                 no++;
151             }
152         }
153         else
154         {
155             if (highlight)
156             {
157                 highlight = 0;
158                 wrbuf_puts(w_snippet, "</match>");
159             }
160         }
161         wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
162     }
163     if (highlight)
164         wrbuf_puts(w_snippet, "</match>");
165     if (no)
166     {
167         yaz_log(YLOG_DEBUG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
168     }
169     return no;
170 }
171
172 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
173                           const char *words, const char *rank,
174                           const char *name)
175 {
176     int *w = r->term_frequency_vec_tmp;
177     const char *norm_str;
178     int i, length = 0;
179     double lead_decay = r->lead_decay;
180     struct word_entry *e;
181     WRBUF wr = cluster->relevance_explain1;
182     int printed_about_field = 0;
183
184     pp2_charset_token_first(r->prt, words, 0);
185     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
186     {
187         w[i] = 0;
188         r->term_pos[i] = 0;
189     }
190
191     assert(rank);
192     while ((norm_str = pp2_charset_token_next(r->prt)))
193     {
194         int local_weight = 0;
195         e = word_entry_match(r, norm_str, rank, &local_weight);
196         if (e)
197         {
198             int res = e->termno;
199             int j;
200
201             if (!printed_about_field)
202             {
203                 printed_about_field = 1;
204                 wrbuf_printf(wr, "field=%s content=", name);
205                 if (strlen(words) > 50)
206                 {
207                     wrbuf_xmlputs_n(wr, words, 49);
208                     wrbuf_puts(wr, " ...");
209                 }
210                 else
211                     wrbuf_xmlputs(wr, words);
212                 wrbuf_puts(wr, ";\n");
213             }
214             assert(res < r->vec_len);
215             w[res] += local_weight / (1 + log2(1 + lead_decay * length));
216             wrbuf_printf(wr, "%s: w[%d] += w(%d) / "
217                          "(1+log2(1+lead_decay(%f) * length(%d)));\n",
218                          e->display_str, res, local_weight, lead_decay, length);
219             j = res - 1;
220             if (j > 0 && r->term_pos[j])
221             {
222                 int d = length + 1 - r->term_pos[j];
223                 wrbuf_printf(wr, "%s: w[%d] += w[%d](%d) * follow(%f) / "
224                              "(1+log2(d(%d));\n",
225                              e->display_str, res, res, w[res],
226                              r->follow_factor, d);
227                 w[res] += w[res] * r->follow_factor / (1 + log2(d));
228             }
229             for (j = 0; j < r->vec_len; j++)
230                 r->term_pos[j] = j < res ? 0 : length + 1;
231         }
232         length++;
233     }
234
235     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
236     {
237         if (length == 0 || w[i] == 0)
238             continue;
239         wrbuf_printf(wr, "%s: tf[%d] += w[%d](%d)", e->display_str, i, i, w[i]);
240         switch (r->length_divide)
241         {
242         case 0:
243             cluster->term_frequency_vecf[i] += (double) w[i];
244             break;
245         case 1:
246             wrbuf_printf(wr, " / log2(1+length(%d))", length);
247             cluster->term_frequency_vecf[i] +=
248                 (double) w[i] / log2(1 + length);
249             break;
250         case 2:
251             wrbuf_printf(wr, " / length(%d)", length);
252             cluster->term_frequency_vecf[i] += (double) w[i] / length;
253         }
254         cluster->term_frequency_vec[i] += w[i];
255         wrbuf_printf(wr, " (%f);\n", cluster->term_frequency_vecf[i]);
256     }
257
258     cluster->term_frequency_vec[0] += length;
259 }
260
261 static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
262 {
263     char **words;
264     int numwords;
265     char *ccl_field;
266     int i;
267
268     switch (n->kind)
269     {
270     case CCL_RPN_AND:
271     case CCL_RPN_OR:
272     case CCL_RPN_NOT:
273     case CCL_RPN_PROX:
274         pull_terms(res, n->u.p[0]);
275         pull_terms(res, n->u.p[1]);
276         break;
277     case CCL_RPN_TERM:
278         nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords);
279         for (i = 0; i < numwords; i++)
280         {
281             const char *norm_str;
282
283             ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual);
284
285             pp2_charset_token_first(res->prt, words[i], 0);
286             while ((norm_str = pp2_charset_token_next(res->prt)))
287             {
288                 struct word_entry **e = &res->entries;
289                 while (*e)
290                     e = &(*e)->next;
291                 *e = nmem_malloc(res->nmem, sizeof(**e));
292                 (*e)->norm_str = nmem_strdup(res->nmem, norm_str);
293                 (*e)->ccl_field = ccl_field;
294                 (*e)->termno = res->vec_len++;
295                 (*e)->display_str = nmem_strdup(res->nmem, words[i]);
296                 (*e)->next = 0;
297             }
298         }
299         break;
300     default:
301         break;
302     }
303 }
304 void relevance_clear(struct relevance *r)
305 {
306     if (r)
307     {
308         int i;
309         for (i = 0; i < r->vec_len; i++)
310             r->doc_frequency_vec[i] = 0;
311     }
312 }
313
314 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
315                                        struct ccl_rpn_node *query,
316                                        int rank_cluster,
317                                        double follow_factor, double lead_decay,
318                                        int length_divide)
319 {
320     NMEM nmem = nmem_create();
321     struct relevance *res = nmem_malloc(nmem, sizeof(*res));
322
323     res->nmem = nmem;
324     res->entries = 0;
325     res->vec_len = 1;
326     res->rank_cluster = rank_cluster;
327     res->follow_factor = follow_factor;
328     res->lead_decay = lead_decay;
329     res->length_divide = length_divide;
330     res->prt = pp2_charset_token_create(pft, "relevance");
331
332     pull_terms(res, query);
333
334     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
335
336     // worker array
337     res->term_frequency_vec_tmp =
338         nmem_malloc(res->nmem,
339                     res->vec_len * sizeof(*res->term_frequency_vec_tmp));
340
341     res->term_pos =
342         nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
343
344     relevance_clear(res);
345
346     res->norm = 0; 
347     return res;
348 }
349
350 void relevance_destroy(struct relevance **rp)
351 {
352     if (*rp)
353     {
354         pp2_charset_token_destroy((*rp)->prt);
355         nmem_destroy((*rp)->nmem);
356         *rp = 0;
357     }
358 }
359
360 void relevance_mergerec(struct relevance *r, struct record_cluster *dst,
361                         const struct record_cluster *src)
362 {
363     int i;
364
365     for (i = 0; i < r->vec_len; i++)
366         dst->term_frequency_vec[i] += src->term_frequency_vec[i];
367
368     for (i = 0; i < r->vec_len; i++)
369         dst->term_frequency_vecf[i] += src->term_frequency_vecf[i];
370 }
371
372 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
373 {
374     int i;
375
376     // term frequency [1,..] . [0] is total length of all fields
377     rec->term_frequency_vec =
378         nmem_malloc(r->nmem,
379                     r->vec_len * sizeof(*rec->term_frequency_vec));
380     for (i = 0; i < r->vec_len; i++)
381         rec->term_frequency_vec[i] = 0;
382
383     // term frequency divided by length of field [1,...]
384     rec->term_frequency_vecf =
385         nmem_malloc(r->nmem,
386                     r->vec_len * sizeof(*rec->term_frequency_vecf));
387     for (i = 0; i < r->vec_len; i++)
388         rec->term_frequency_vecf[i] = 0.0;
389 }
390
391 static const char *getfield(struct record *bestrecord, const char *tag)
392 {
393     struct session *se = client_get_session(bestrecord->client);
394     int md_field_id = conf_service_metadata_field_id(se->service, tag);
395     struct record_metadata *md = 0;
396     if (md_field_id <0)
397         return "";
398     md = bestrecord->metadata[md_field_id];
399     if ( md)
400         return md->data.text.disp;
401     return "";
402 }
403
404 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
405 {
406     int i;
407     
408     // Find the best record in a cluster - the one with lowest position
409     // (in this proto. Later, find a better one)
410     struct record *bestrecord = 0;
411     struct record *record;
412     struct normalizing *n;
413     float score;
414     for (record = cluster->records; record; record = record->next) 
415         if ( bestrecord == 0 || bestrecord->position < record->position )
416             bestrecord = record;
417     n = findnorm(r,bestrecord->client);
418     n->count ++;
419     score = atof( getfield(bestrecord,"score") );
420     n->sum += score;
421     if ( n->max < score )
422         n->max = score;
423
424     for (i = 1; i < r->vec_len; i++)
425         if (cluster->term_frequency_vec[i] > 0)
426             r->doc_frequency_vec[i]++;
427
428     r->doc_frequency_vec[0]++;
429 }
430
431
432 // Helper to compare floats, for qsort
433 static int sort_float(const void *x, const void *y)
434 {
435     const float *fx = x;
436     const float *fy = y;
437     //yaz_log(YLOG_LOG,"sorting %f and %f", *fx, *fy);  // ###
438     if ( *fx < *fy )
439         return 1;
440     if ( *fx > *fy )
441         return -1;
442     return 0;   // do not return *fx-*fy, it is often too close to zero.
443 }
444
445 // Prepare for a relevance-sorted read
446 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
447                             enum conf_sortkey_type type)
448 {
449     int i;
450     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
451     int n_clients = clients_count();
452     int clusternumber = 0;
453     yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
454
455     reclist_enter(reclist);
456     // Calculate document frequency vector for each term.
457     for (i = 1; i < rel->vec_len; i++)
458     {
459         if (!rel->doc_frequency_vec[i])
460             idfvec[i] = 0;
461         else
462         {
463             /* add one to nominator idf(t,D) to ensure a value > 0 */
464             idfvec[i] = log((float) (1 + rel->doc_frequency_vec[0]) /
465                             rel->doc_frequency_vec[i]);
466         }
467     }
468     // Calculate relevance for each document
469     while (1)
470     {
471         int relevance = 0;
472         WRBUF w;
473         struct word_entry *e = rel->entries;
474         struct record_cluster *rec = reclist_read_record(reclist);
475         if (!rec)
476             break;
477         clusternumber++;
478         w = rec->relevance_explain2;
479         wrbuf_rewind(w);
480         wrbuf_puts(w, "relevance = 0;\n");
481         for (i = 1; i < rel->vec_len; i++)
482         {
483             float termfreq = (float) rec->term_frequency_vecf[i];
484             int add = 100000 * termfreq * idfvec[i];
485
486             wrbuf_printf(w, "idf[%d] = log(((1 + total(%d))/termoccur(%d));\n",
487                          i, rel->doc_frequency_vec[0],
488                          rel->doc_frequency_vec[i]);
489             wrbuf_printf(w, "%s: relevance += 100000 * tf[%d](%f) * "
490                          "idf[%d](%f) (%d);\n",
491                          e->display_str, i, termfreq, i, idfvec[i], add);
492             relevance += add;
493             e = e->next;
494         }
495         if (!rel->rank_cluster)
496         {
497             struct record *record;
498             int cluster_size = 0;
499
500             for (record = rec->records; record; record = record->next)
501                 cluster_size++;
502
503             wrbuf_printf(w, "score = relevance(%d)/cluster_size(%d);\n",
504                          relevance, cluster_size);
505             relevance /= cluster_size;
506         }
507         else
508         {
509             wrbuf_printf(w, "score = relevance(%d);\n", relevance);
510         }
511         // Experimental round-robin
512         // Overwrites the score calculated above, but I keep it there to
513         // get the log entries
514         if (type == Metadata_sortkey_relevance_h) {
515             struct record *record;
516             struct normalizing *norm;
517             struct record *bestrecord = 0;
518             int nclust = 0;
519             int tfrel = relevance; // keep the old tf/idf score
520             int robinscore = 0;
521             int solrscore = 0;
522             int normscore = 0;
523             const char *score;
524             const char *id;
525             const char *title;
526             char idbuf[64];
527             int mergescore = 0;
528             // Find the best record in a cluster - the one with lowest position
529             for (record = rec->records; record; record = record->next) {
530                 if ( bestrecord == 0 || bestrecord->position < record->position )
531                     bestrecord = record;
532                 nclust++; // and count them all, for logging
533             }
534             norm = findnorm(rel, bestrecord->client);
535             // Calculate a round-robin score
536             robinscore = -(bestrecord->position * n_clients + norm->num) ;
537             wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
538                          bestrecord->position, norm->num, nclust, tfrel, robinscore );
539             yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
540                          bestrecord->position, norm->num, nclust, relevance );
541
542             // Check if the record has a score field
543             score = getfield(bestrecord,"score");
544             id = getfield(bestrecord, "id");
545             title = getfield(bestrecord, "title");
546             solrscore = 10000.0 * atof(score);
547             // clear the id, we only want the first numerical part
548             i=0;
549             while( id[i] >= '0' && id[i] <= '9' ) {
550                 idbuf[i] = id[i];
551                 i++;
552             }
553             idbuf[i] = '\0';
554             if ( norm->count && *score )
555             {
556                 //float avg = norm->sum / norm->count;
557                 normscore = 10000.0 * (  atof(score) / norm->max );
558                 wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
559                         score, norm->max, normscore);
560             } else
561                 yaz_log(YLOG_LOG, "normscore: no count, can not normalize score '%s' ", score );
562
563             // If we have a score in the best record, we probably have in them all
564             // and we can try to merge scores
565             if ( *score ) {
566                 float scores[nclust];
567                 float s = 0.0;
568                 float sum = 0.0;
569                 int i=0;
570                 if ( rec->records && rec->records->next ) 
571                 { // have more than one record
572                     for (record = rec->records; record; record = record->next, i++)
573                     {
574                         const char *scorefld = getfield(record,"score");
575                         scores[i] = atof( scorefld );
576                         yaz_log(YLOG_LOG,"mergescore %d: %s", i, scorefld );
577                         wrbuf_printf(w,"mergeplot %d  %f x\n", clusternumber, 10000*scores[i] );
578                     }
579                     qsort(scores, nclust, sizeof(float), sort_float );
580                     for (i = 0; i<nclust; i++)
581                     {
582                         yaz_log(YLOG_LOG,"Sorted mergescore %d: %f + %f/%d = %f", i, s,scores[i],i+1, s+scores[i] / (i+1) );
583                         wrbuf_printf(w,"Sorted mergescore %d: %f + %f/%d = %f\n",  i, s,scores[i],i+1, s+scores[i] / (i+1));
584                         s += scores[i] / (i+1);
585                         sum += scores[i];
586                     }
587                     mergescore = s * 10000;
588                     wrbuf_printf(w,"mergeplot %d  x %d %f %f %d\n", clusternumber, mergescore,
589                         10000.0*sum, 10000.0*sum/nclust, nclust );
590                     yaz_log(YLOG_LOG,"mergeplot %d  x %d %f %f %d", clusternumber, mergescore,
591                         10000.0*sum, 10000.0*sum/nclust, nclust );
592                 }
593                 else
594                 { // only one record, take the easy way out of merging (and don't bother plotting)
595                     mergescore = atof( score ) * 10000;
596                 }
597             } // merge score
598             id = getfield(bestrecord, "id");
599             // clear the id, we only want the first numerical part
600             i=0;
601             while( id[i] >= '0' && id[i] <= '9' ) {
602                 idbuf[i] = id[i];
603                 i++;
604             }
605             idbuf[i] = '\0';
606             
607             title = getfield(bestrecord, "title");
608             wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n",
609                             norm->num, bestrecord->position,
610                             tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title );
611             relevance = mergescore;
612         }
613         rec->relevance_score = relevance;
614     }
615     reclist_leave(reclist);
616     xfree(idfvec);
617 }
618
619 /*
620  * Local variables:
621  * c-basic-offset: 4
622  * c-file-style: "Stroustrup"
623  * indent-tabs-mode: nil
624  * End:
625  * vim: shiftwidth=4 tabstop=8 expandtab
626  */
627