1 /* This file is part of Pazpar2.
2 Copyright (C) 2006-2013 Index Data
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 #include "pazpar2_config.h"
29 #include "relevance.h"
// Base-2 logarithm computed from the natural logarithm: log(x)/log(2).
// NOTE(review): presumably a fallback for platforms without a native log2();
// any guarding preprocessor condition is not visible in this listing.
34 #define log2(x) (log(x)/log(2))
// Members reached through struct relevance pointers elsewhere in this file
// (the struct header and its remaining members are not visible in this listing).
// Per-term counters: relevance_donerecord bumps slot i for every cluster whose
// term frequency for term i is positive, and slot 0 once per cluster.
39 int *doc_frequency_vec;
// Scratch weights, one slot per term, used by relevance_countwords.
40 int *term_frequency_vec_tmp;
// Head of the list of query terms built by pull_terms.
43 struct word_entry *entries;
// Tokenizer handle created in relevance_create_ccl; pull_terms,
// relevance_countwords and relevance_snippet all tokenize through it.
44 pp2_charset_token_t prt;
// Head of the list of per-client normalizing entries searched by findnorm.
50 struct normalizing *norm;
53 // Structure to keep data for normalizing scores from one client
// Only the members visible in this listing are annotated here; num, max and
// count are also read or written elsewhere in this file.
// Client this entry belongs to; findnorm matches entries on this pointer.
60 struct client *client;
// Next entry in the list headed by the norm member of struct relevance.
61 struct normalizing *next;
// Members of struct word_entry visible in this listing; pull_terms creates
// one entry per normalized query token.
// Copy of the query word the token came from (set by pull_terms); printed in
// the relevance explain output.
66 const char *display_str;
// Next term in the list headed by the entries member of struct relevance.
69 struct word_entry *next;
72 // Find the normalizing entry for this client, or create one if not there
// rel:    relevance object holding the list head (rel->norm) and the NMEM
//         that new entries are allocated from.
// client: client whose entry is wanted; entries are matched by pointer identity.
// NOTE(review): the list-walking loop and the return statements are on lines
// missing from this listing; presumably the found or new entry is returned.
73 struct normalizing *findnorm( struct relevance *rel, struct client* client)
// Start the search at the head of the list.
75 struct normalizing *n = rel->norm;
77 if (n->client == client )
// Allocate a new entry from the relevance object's NMEM (per the header
// comment, this is the "not there" case).
81 n = nmem_malloc(rel->nmem, sizeof(struct normalizing) );
// Number the new entry one above the entry currently at the head of the list.
// NOTE(review): this reads rel->norm->num; the guard for an empty list is not
// visible in this listing - confirm one exists.
83 n->num = rel->norm->num +1;
// Look up the query term whose normalized form equals a token, and derive the
// weight that applies to it from a rank specification string.
// r:      relevance object whose term list (r->entries) is searched.
// rank:   rank specification text; the visible code reads a leading integer
//         from it and, after a space, compares text with the term's CCL field.
//         Its exact grammar depends on lines not shown here.
// weight: output; receives the parsed weight.
// NOTE(review): the norm_str parameter, the locals i, no_read and cp, and the
// return statements are on lines missing from this listing.
95 static struct word_entry *word_entry_match(struct relevance *r,
97 const char *rank, int *weight)
100 struct word_entry *entries = r->entries;
101 for (; entries; entries = entries->next, i++)
// Empty tokens never match; otherwise compare with the term's normalized form.
103 if (*norm_str && !strcmp(norm_str, entries->norm_str))
// Read an integer into *weight; %n stores how many characters were consumed.
107 sscanf(rank, "%d%n", weight, &no_read);
// Only when an integer was consumed, look for a space in rank.
111 if (no_read > 0 && (cp = strchr(rank, ' ')))
// The cp - rank bytes before that space must be exactly the term's CCL field.
113 if ((cp - rank) == strlen(entries->ccl_field) &&
114 memcmp(entries->ccl_field, rank, cp - rank) == 0)
// In that case the integer following the space replaces the weight.
115 *weight = atoi(cp + 1);
// Tokenize a text and write it to a snippet buffer (w_snippet), emitting
// <match> and </match> markers around parts of it.
// r:     relevance object supplying the tokenizer and the query term list.
// words: text to tokenize; slices of it are copied XML-escaped into w_snippet.
// name:  not used on any visible line.
// NOTE(review): the remaining parameters (including w_snippet), the conditions
// that decide when the markers are written, and the return value are on lines
// missing from this listing. Presumably the markers wrap tokens that equal a
// query term - confirm.
123 int relevance_snippet(struct relevance *r,
124 const char *words, const char *name,
128 const char *norm_str;
// Restart the tokenizer on the supplied text and visit every token.
131 pp2_charset_token_first(r->prt, words, 0);
132 while ((norm_str = pp2_charset_token_next(r->prt)))
134 size_t org_start, org_len;
135 struct word_entry *entries = r->entries;
// Obtain the current token's offset and length within words (they are used
// as words + org_start, org_len further down).
138 pp2_get_org(r->prt, &org_start, &org_len);
// Compare the token with every query term; empty tokens never match.
139 for (; entries; entries = entries->next, i++)
141 if (*norm_str && !strcmp(norm_str, entries->norm_str))
149 wrbuf_puts(w_snippet, "<match>");
158 wrbuf_puts(w_snippet, "</match>");
// Copy the original (un-normalized) slice of the text, XML-escaped.
161 wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
164 wrbuf_puts(w_snippet, "</match>");
167 yaz_log(YLOG_DEBUG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
// Tokenize the text of one field and credit the query terms found in it to
// the cluster's term-frequency vectors. A human-readable trace of each step
// is appended to cluster->relevance_explain1.
// r:       relevance object (term list, tokenizer, scratch vectors, tuning
//          values lead_decay, follow_factor and length_divide).
// cluster: record cluster whose term_frequency_vec and term_frequency_vecf
//          are updated.
// words:   field text to tokenize.
// rank:    rank specification handed on to word_entry_match.
// NOTE(review): the remaining parameter(s) (including `name`, printed below),
// the locals i, j, res and length, and the statements that maintain them are
// on lines missing from this listing.
172 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
173 const char *words, const char *rank,
// Per-term scratch weights for this field.
176 int *w = r->term_frequency_vec_tmp;
177 const char *norm_str;
179 double lead_decay = r->lead_decay;
180 struct word_entry *e;
181 WRBUF wr = cluster->relevance_explain1;
182 int printed_about_field = 0;
184 pp2_charset_token_first(r->prt, words, 0);
// Walk the terms together with their 1-based vector index (loop body not shown).
185 for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
// Visit every token of the field.
192 while ((norm_str = pp2_charset_token_next(r->prt)))
194 int local_weight = 0;
// Find the query term matching this token and the weight to apply.
195 e = word_entry_match(r, norm_str, rank, &local_weight);
// Write the field header into the explanation only once per call.
201 if (!printed_about_field)
203 printed_about_field = 1;
204 wrbuf_printf(wr, "field=%s content=", name);
// Content longer than 50 bytes is cut to its first 49 bytes plus " ...".
205 if (strlen(words) > 50)
207 wrbuf_xmlputs_n(wr, words, 49);
208 wrbuf_puts(wr, " ...");
// Whole content (presumably the else branch; the `else` line is not shown).
211 wrbuf_xmlputs(wr, words);
212 wrbuf_puts(wr, ";\n");
214 assert(res < r->vec_len);
// Credit the weight, damped by lead_decay times length. The right-hand side
// is a double and w[] holds ints, so the stored sum is truncated.
215 w[res] += local_weight / (1 + log2(1 + lead_decay * length));
216 wrbuf_printf(wr, "%s: w[%d] += w(%d) / "
217 "(1+log2(1+lead_decay(%f) * length(%d)));\n",
218 e->display_str, res, local_weight, lead_decay, length);
// When a position is recorded for term j, add a bonus scaled by
// follow_factor and damped by the distance d to that position.
220 if (j > 0 && r->term_pos[j])
222 int d = length + 1 - r->term_pos[j];
223 wrbuf_printf(wr, "%s: w[%d] += w[%d](%d) * follow(%f) / "
225 e->display_str, res, res, w[res],
226 r->follow_factor, d);
227 w[res] += w[res] * r->follow_factor / (1 + log2(d));
// Reset the recorded positions: slots below res are cleared, slot res and
// all above it are set to length + 1.
229 for (j = 0; j < r->vec_len; j++)
230 r->term_pos[j] = j < res ? 0 : length + 1;
// Second pass: fold the per-field weights into the cluster's vectors.
235 for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
// Nothing to add for an empty field or a term with no weight
// (the statement taken in that case is not shown).
237 if (length == 0 || w[i] == 0)
239 wrbuf_printf(wr, "%s: tf[%d] += w[%d](%d)", e->display_str, i, i, w[i]);
// length_divide selects the normalization; the case labels are not shown.
// Visible variants: no division, division by log2(1 + length), division by length.
240 switch (r->length_divide)
243 cluster->term_frequency_vecf[i] += (double) w[i];
246 wrbuf_printf(wr, " / log2(1+length(%d))", length);
247 cluster->term_frequency_vecf[i] +=
248 (double) w[i] / log2(1 + length);
251 wrbuf_printf(wr, " / length(%d)", length);
252 cluster->term_frequency_vecf[i] += (double) w[i] / length;
// The integer vector always receives the unnormalized weight.
254 cluster->term_frequency_vec[i] += w[i];
255 wrbuf_printf(wr, " (%f);\n", cluster->term_frequency_vecf[i]);
// Slot 0 accumulates length over all fields counted for this cluster.
258 cluster->term_frequency_vec[0] += length;
// Collect the search terms of a CCL RPN query tree into res->entries.
// res: relevance object receiving the terms; res->vec_len is incremented for
//      every entry created.
// n:   node of the query tree to process.
// NOTE(review): the switch on the node kind, the locals words, numwords, i and
// ccl_field, and the statements that position `e` in the list are on lines
// missing from this listing.
261 static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
// Nodes with two operands: recurse into both of them.
274 pull_terms(res, n->u.p[0]);
275 pull_terms(res, n->u.p[1]);
// Term nodes: split the term text on spaces into separate words.
278 nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords);
279 for (i = 0; i < numwords; i++)
281 const char *norm_str;
// Keep a copy of the node's qualifier as the CCL field of the new entries.
283 ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual);
// Normalize the word; each token it yields becomes its own entry.
285 pp2_charset_token_first(res->prt, words[i], 0);
286 while ((norm_str = pp2_charset_token_next(res->prt)))
288 struct word_entry **e = &res->entries;
// All strings and the entry itself are allocated from res->nmem.
291 *e = nmem_malloc(res->nmem, sizeof(**e));
292 (*e)->norm_str = nmem_strdup(res->nmem, norm_str);
293 (*e)->ccl_field = ccl_field;
// The term number is the current vector length, which then grows by one.
294 (*e)->termno = res->vec_len++;
// Displayed as the whole split word, not the individual token.
295 (*e)->display_str = nmem_strdup(res->nmem, words[i]);
// Reset the document-frequency counters of a relevance object.
// r: relevance object; every slot of r->doc_frequency_vec in [0, r->vec_len)
//    is set to zero by the loop below.
304 void relevance_clear(struct relevance *r)
309 for (i = 0; i < r->vec_len; i++)
310 r->doc_frequency_vec[i] = 0;
// Create a relevance object for a CCL query.
// pft:           character set factory used to create the tokenizer.
// query:         CCL RPN tree whose terms are collected by pull_terms.
// follow_factor: stored in the object; used by relevance_countwords.
// lead_decay:    stored in the object; used by relevance_countwords.
// NOTE(review): further parameters (rank_cluster and length_divide are used
// below), the initialisation of the remaining members and the return
// statement are on lines missing from this listing.
314 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
315 struct ccl_rpn_node *query,
317 double follow_factor, double lead_decay,
// The object is allocated from the NMEM created here, the same handle that
// relevance_destroy later passes to nmem_destroy.
320 NMEM nmem = nmem_create();
321 struct relevance *res = nmem_malloc(nmem, sizeof(*res));
326 res->rank_cluster = rank_cluster;
327 res->follow_factor = follow_factor;
328 res->lead_decay = lead_decay;
329 res->length_divide = length_divide;
330 res->prt = pp2_charset_token_create(pft, "relevance");
// Collect the query terms; this raises res->vec_len once per term.
332 pull_terms(res, query);
// The per-term vectors are sized from vec_len, so they are allocated only
// after pull_terms has run.
334 res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
337 res->term_frequency_vec_tmp =
338 nmem_malloc(res->nmem,
339 res->vec_len * sizeof(*res->term_frequency_vec_tmp));
// Allocation sized for res->term_pos (the assignment target is on a line not shown).
342 nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
// Zero the document-frequency counters.
344 relevance_clear(res);
// Release a relevance object.
// rp: address of the pointer to the object to destroy.
// NOTE(review): any null check and any reset of *rp are on lines missing
// from this listing.
350 void relevance_destroy(struct relevance **rp)
// Destroy the tokenizer first; it was created separately from the NMEM.
354 pp2_charset_token_destroy((*rp)->prt);
// Destroy the NMEM that relevance_create_ccl used for the struct itself and
// for its vectors.
355 nmem_destroy((*rp)->nmem);
// Add the term-frequency vectors of one cluster into another.
// r:   relevance object; r->vec_len gives the number of slots to add.
// dst: cluster whose vectors are increased.
// src: cluster whose vectors are added; it is only read.
360 void relevance_mergerec(struct relevance *r, struct record_cluster *dst,
361 const struct record_cluster *src)
// Integer term frequencies, slot by slot (slot 0 included).
365 for (i = 0; i < r->vec_len; i++)
366 dst->term_frequency_vec[i] += src->term_frequency_vec[i];
// Floating-point term frequencies, slot by slot.
368 for (i = 0; i < r->vec_len; i++)
369 dst->term_frequency_vecf[i] += src->term_frequency_vecf[i];
// Give a record cluster fresh, zeroed term-frequency vectors.
// r:   relevance object; r->vec_len is the number of slots in each vector.
// rec: cluster whose term_frequency_vec and term_frequency_vecf are set.
// NOTE(review): the allocator calls themselves are on lines missing from this
// listing; only their size arguments are visible.
372 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
376 // term frequency [1,..] . [0] is total length of all fields
377 rec->term_frequency_vec =
379 r->vec_len * sizeof(*rec->term_frequency_vec));
// Clear every slot, including slot 0.
380 for (i = 0; i < r->vec_len; i++)
381 rec->term_frequency_vec[i] = 0;
383 // term frequency divided by length of field [1,...]
384 rec->term_frequency_vecf =
386 r->vec_len * sizeof(*rec->term_frequency_vecf));
387 for (i = 0; i < r->vec_len; i++)
388 rec->term_frequency_vecf[i] = 0.0;
// Fetch the display text of one metadata field of a record.
// bestrecord: record to read from; its client supplies the session whose
//             service configuration maps tag to a field id.
// tag:        metadata field name; callers in this file pass "score", "id"
//             and "title".
// NOTE(review): the checks between these statements (for example on
// md_field_id and md) and any fallback return value are on lines missing from
// this listing. Callers pass the result straight to atof() and dereference
// it, so presumably a non-null string is always returned - confirm.
391 static const char *getfield(struct record *bestrecord, const char *tag)
393 struct session *se = client_get_session(bestrecord->client);
// Translate the field name into an index into the record's metadata array.
394 int md_field_id = conf_service_metadata_field_id(se->service, tag);
395 struct record_metadata *md = 0;
398 md = bestrecord->metadata[md_field_id];
400 return md->data.text.disp;
// Account for one finished cluster: update the per-client score statistics
// and the document-frequency counters.
// r:       relevance object whose doc_frequency_vec and normalizing list are updated.
// cluster: cluster that has just been completed.
// NOTE(review): the locals i and score and several statement bodies are on
// lines missing from this listing.
404 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
408 // Find the best record in a cluster - the one with lowest position
409 // (in this proto. Later, find a better one)
410 struct record *bestrecord = 0;
411 struct record *record;
412 struct normalizing *n;
414 for (record = cluster->records; record; record = record->next)
// NOTE(review): this condition holds when the candidate's position is larger
// than the current best, which would pick the highest position rather than
// the lowest that the comment above describes (the assignment is on a line
// not shown) - confirm which one is intended.
415 if ( bestrecord == 0 || bestrecord->position < record->position )
// Per-client statistics entry for the client of the chosen record.
417 n = findnorm(r,bestrecord->client);
// The record's own "score" metadata field, parsed as a number.
419 score = atof( getfield(bestrecord,"score") );
// Track the largest score seen for this client (the update is not shown);
// relevance_prepare_read divides by norm->max.
421 if ( n->max < score )
// A term counts once for the cluster if it occurred at all.
424 for (i = 1; i < r->vec_len; i++)
425 if (cluster->term_frequency_vec[i] > 0)
426 r->doc_frequency_vec[i]++;
// Slot 0 counts the clusters processed.
428 r->doc_frequency_vec[0]++;
432 // Helper to compare floats, for qsort
// x, y: pointers to the two float elements being compared (the commented-out
//       log line below refers to them as fx and fy).
// NOTE(review): the branches that return non-zero results are on lines
// missing from this listing.
433 static int sort_float(const void *x, const void *y)
437 //yaz_log(YLOG_LOG,"sorting %f and %f", *fx, *fy); // ###
// The function returns int, so a small float difference would be truncated
// to 0; hence no subtraction is returned.
442 return 0; // do not return *fx-*fy, it is often too close to zero.
445 // Prepare for a relevance-sorted read
// Computes a relevance score for the clusters read from reclist and stores it
// in each cluster's relevance_score, appending an explanation of the
// calculation to the cluster's relevance_explain2 buffer.
// rel:     relevance object with the document-frequency counts and term list.
// reclist: list of record clusters to score.
// type:    sort key type; Metadata_sortkey_relevance_h additionally runs the
//          experimental round-robin / score-merging code below, whose result
//          replaces the tf/idf score.
// NOTE(review): many declarations (i, w, relevance, nclust, robinscore,
// solrscore, normscore, mergescore, score, id, title, idbuf, s, sum), loop
// headers and branch keywords are on lines missing from this listing.
446 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
447 enum conf_sortkey_type type)
// One inverse-document-frequency value per term slot.
// NOTE(review): the release of this xmalloc'ed array is not visible here.
450 float *idfvec = xmalloc(rel->vec_len * sizeof(float));
451 int n_clients = clients_count();
452 int clusternumber = 0;
453 yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
// NOTE(review): reclist_enter/reclist_leave bracket the whole pass;
// presumably they lock the list - confirm.
455 reclist_enter(reclist);
456 // Calculate document frequency vector for each term.
457 for (i = 1; i < rel->vec_len; i++)
// A term found in no cluster is handled separately (that branch is not
// shown); presumably this also keeps the division below away from zero.
459 if (!rel->doc_frequency_vec[i])
463 /* add one to numerator idf(t,D) to ensure a value > 0 */
464 idfvec[i] = log((float) (1 + rel->doc_frequency_vec[0]) /
465 rel->doc_frequency_vec[i]);
468 // Calculate relevance for each document
// Term list cursor, used for the display strings in the explanation.
473 struct word_entry *e = rel->entries;
474 struct record_cluster *rec = reclist_read_record(reclist);
478 w = rec->relevance_explain2;
480 wrbuf_puts(w, "relevance = 0;\n");
481 for (i = 1; i < rel->vec_len; i++)
483 float termfreq = (float) rec->term_frequency_vecf[i];
// Contribution of term i: tf * idf scaled by 100000, truncated to int.
484 int add = 100000 * termfreq * idfvec[i];
486 wrbuf_printf(w, "idf[%d] = log(((1 + total(%d))/termoccur(%d));\n",
487 i, rel->doc_frequency_vec[0],
488 rel->doc_frequency_vec[i]);
489 wrbuf_printf(w, "%s: relevance += 100000 * tf[%d](%f) * "
490 "idf[%d](%f) (%d);\n",
491 e->display_str, i, termfreq, i, idfvec[i], add);
// When rank_cluster is off, the score is divided by cluster_size, counted
// over rec->records (the increment is on a line not shown).
495 if (!rel->rank_cluster)
497 struct record *record;
498 int cluster_size = 0;
500 for (record = rec->records; record; record = record->next)
503 wrbuf_printf(w, "score = relevance(%d)/cluster_size(%d);\n",
504 relevance, cluster_size);
// NOTE(review): assumes the cluster holds at least one record; otherwise
// this would divide by zero.
505 relevance /= cluster_size;
509 wrbuf_printf(w, "score = relevance(%d);\n", relevance);
511 // Experimental round-robin
512 // Overwrites the score calculated above, but I keep it there to
513 // get the log entries
514 if (type == Metadata_sortkey_relevance_h) {
515 struct record *record;
516 struct normalizing *norm;
517 struct record *bestrecord = 0;
519 int tfrel = relevance; // keep the old tf/idf score
528 // Find the best record in a cluster - the one with lowest position
529 for (record = rec->records; record; record = record->next) {
// NOTE(review): as in relevance_donerecord, this condition favours the
// larger position, contrary to the "lowest position" comment - confirm.
530 if ( bestrecord == 0 || bestrecord->position < record->position )
532 nclust++; // and count them all, for logging
534 norm = findnorm(rel, bestrecord->client);
535 // Calculate a round-robin score
// Negated so that smaller positions and lower client numbers score higher.
536 robinscore = -(bestrecord->position * n_clients + norm->num) ;
537 wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
538 bestrecord->position, norm->num, nclust, tfrel, robinscore );
// NOTE(review): this log line prints `relevance` under the score= label,
// while the explain line above prints robinscore there.
539 yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
540 bestrecord->position, norm->num, nclust, relevance );
542 // Check if the record has a score field
543 score = getfield(bestrecord,"score");
544 id = getfield(bestrecord, "id");
545 title = getfield(bestrecord, "title");
// The record's own score, scaled by 10000.
546 solrscore = 10000.0 * atof(score);
547 // clear the id, we only want the first numerical part
// Scan the leading decimal digits of the id (loop body not shown).
549 while( id[i] >= '0' && id[i] <= '9' ) {
// Normalizing needs per-client statistics and a non-empty score string.
554 if ( norm->count && *score )
556 //float avg = norm->sum / norm->count;
// Score relative to the largest score seen for this client, scaled by 10000.
557 normscore = 10000.0 * ( atof(score) / norm->max );
558 wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
559 score, norm->max, normscore);
561 yaz_log(YLOG_LOG, "normscore: no count, can not normalize score '%s' ", score );
563 // If we have a score in the best record, we probably have in them all
564 // and we can try to merge scores
// Variable-length array with one slot per record counted in nclust above.
566 float scores[nclust];
570 if ( rec->records && rec->records->next )
571 { // have more than one record
// Collect the score of every record in the cluster.
572 for (record = rec->records; record; record = record->next, i++)
574 const char *scorefld = getfield(record,"score");
575 scores[i] = atof( scorefld );
576 yaz_log(YLOG_LOG,"mergescore %d: %s", i, scorefld );
577 wrbuf_printf(w,"mergeplot %d %f x\n", clusternumber, 10000*scores[i] );
// Sort with sort_float, then sum the scores with the score at index i
// divided by i + 1.
579 qsort(scores, nclust, sizeof(float), sort_float );
580 for (i = 0; i<nclust; i++)
582 yaz_log(YLOG_LOG,"Sorted mergescore %d: %f + %f/%d = %f", i, s,scores[i],i+1, s+scores[i] / (i+1) );
583 wrbuf_printf(w,"Sorted mergescore %d: %f + %f/%d = %f\n", i, s,scores[i],i+1, s+scores[i] / (i+1));
584 s += scores[i] / (i+1);
587 mergescore = s * 10000;
588 wrbuf_printf(w,"mergeplot %d x %d %f %f %d\n", clusternumber, mergescore,
589 10000.0*sum, 10000.0*sum/nclust, nclust );
590 yaz_log(YLOG_LOG,"mergeplot %d x %d %f %f %d", clusternumber, mergescore,
591 10000.0*sum, 10000.0*sum/nclust, nclust );
594 { // only one record, take the easy way out of merging (and don't bother plotting)
595 mergescore = atof( score ) * 10000;
// NOTE(review): id and title were already fetched above with the same
// arguments; these lookups look redundant - confirm.
598 id = getfield(bestrecord, "id");
599 // clear the id, we only want the first numerical part
601 while( id[i] >= '0' && id[i] <= '9' ) {
607 title = getfield(bestrecord, "title");
// One line with all the candidate scores for this cluster.
608 wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n",
609 norm->num, bestrecord->position,
610 tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title );
// The merged score replaces the tf/idf score in this mode.
611 relevance = mergescore;
// Publish the final score on the cluster.
613 rec->relevance_score = relevance;
615 reclist_leave(reclist);
622 * c-file-style: "Stroustrup"
623 * indent-tabs-mode: nil
625 * vim: shiftwidth=4 tabstop=8 expandtab