src/relevance.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2013 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 #if HAVE_CONFIG_H
  21 #include <config.h>
  22 #endif
  23
  24 #include <assert.h>
  25 #include <math.h>
  26 #include <stdlib.h>
  27
  28 #include "pazpar2_config.h"
  29 #include "relevance.h"
  30 #include "session.h"
  31 #include "client.h"
  32
  33 #ifdef WIN32
  34 #define log2(x) (log(x)/log(2))
  35 #endif
  36
  37 struct relevance
  38 {
  39     int *doc_frequency_vec;
  40     int *term_frequency_vec_tmp;
  41     int *term_pos;
  42     int vec_len;
  43     struct word_entry *entries;
  44     pp2_charset_token_t prt;
  45     int rank_cluster;
  46     double follow_factor;
  47     double lead_decay;
  48     int length_divide;
  49     NMEM nmem;
  50     struct normalizing *norm;
  51 };
  52
  53 // Structure to keep data for normalizing scores from one client
  54 struct normalizing
  55 {
  56     int num;
  57     float sum;
  58     float max;
  59     int count;
  60     struct client *client;
  61     struct normalizing *next;
  62 };
  63
  64 struct word_entry {
  65     const char *norm_str;
  66     const char *display_str;
  67     int termno;
  68     char *ccl_field;
  69     struct word_entry *next;
  70 };
  71
  72 // Find the normalizing entry for this client, or create one if not there
  73 struct normalizing *findnorm( struct relevance *rel, struct client* client)
  74 {
  75     struct normalizing *n = rel->norm;
  76     while (n) {
  77         if (n->client == client )
  78             return n;
  79         n = n->next;
  80     }
  81     n = nmem_malloc(rel->nmem, sizeof(struct normalizing) );
  82     if ( rel->norm )
  83         n->num = rel->norm->num +1;
  84     else
  85         n->num = 1;
  86     n->sum = 0.0;
  87     n->count = 0;
  88     n->max = 0.0;
  89     n->client = client;
  90     n->next = rel->norm;
  91     rel->norm = n;
  92     return n;
  93 }
  94
  95 static struct word_entry *word_entry_match(struct relevance *r,
  96                                            const char *norm_str,
  97                                            const char *rank, int *weight)
  98 {
  99     int i = 1;
 100     struct word_entry *entries = r->entries;
 101     for (; entries; entries = entries->next, i++)
 102     {
 103         if (*norm_str && !strcmp(norm_str, entries->norm_str))
 104         {
 105             const char *cp = 0;
 106             int no_read = 0;
 107             sscanf(rank, "%d%n", weight, &no_read);
 108             rank += no_read;
 109             while (*rank == ' ')
 110                 rank++;
 111             if (no_read > 0 && (cp = strchr(rank, ' ')))
 112             {
 113                 if ((cp - rank) == strlen(entries->ccl_field) &&
 114                     memcmp(entries->ccl_field, rank, cp - rank) == 0)
 115                     *weight = atoi(cp + 1);
 116             }
 117             return entries;
 118         }
 119     }
 120     return 0;
 121 }
 122
 123 int relevance_snippet(struct relevance *r,
 124                       const char *words, const char *name,
 125                       WRBUF w_snippet)
 126 {
 127     int no = 0;
 128     const char *norm_str;
 129     int highlight = 0;
 130
 131     pp2_charset_token_first(r->prt, words, 0);
 132     while ((norm_str = pp2_charset_token_next(r->prt)))
 133     {
 134         size_t org_start, org_len;
 135         struct word_entry *entries = r->entries;
 136         int i;
 137
 138         pp2_get_org(r->prt, &org_start, &org_len);
 139         for (; entries; entries = entries->next, i++)
 140         {
 141             if (*norm_str && !strcmp(norm_str, entries->norm_str))
 142                 break;
 143         }
 144         if (entries)
 145         {
 146             if (!highlight)
 147             {
 148                 highlight = 1;
 149                 wrbuf_puts(w_snippet, "<match>");
 150                 no++;
 151             }
 152         }
 153         else
 154         {
 155             if (highlight)
 156             {
 157                 highlight = 0;
 158                 wrbuf_puts(w_snippet, "</match>");
 159             }
 160         }
 161         wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
 162     }
 163     if (highlight)
 164         wrbuf_puts(w_snippet, "</match>");
 165     if (no)
 166     {
 167         yaz_log(YLOG_DEBUG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
 168     }
 169     return no;
 170 }
 171
 172 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 173                           const char *words, const char *rank,
 174                           const char *name)
 175 {
 176     int *w = r->term_frequency_vec_tmp;
 177     const char *norm_str;
 178     int i, length = 0;
 179     double lead_decay = r->lead_decay;
 180     struct word_entry *e;
 181     WRBUF wr = cluster->relevance_explain1;
 182     int printed_about_field = 0;
 183
 184     pp2_charset_token_first(r->prt, words, 0);
 185     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
 186     {
 187         w[i] = 0;
 188         r->term_pos[i] = 0;
 189     }
 190
 191     assert(rank);
 192     while ((norm_str = pp2_charset_token_next(r->prt)))
 193     {
 194         int local_weight = 0;
 195         e = word_entry_match(r, norm_str, rank, &local_weight);
 196         if (e)
 197         {
 198             int res = e->termno;
 199             int j;
 200
 201             if (!printed_about_field)
 202             {
 203                 printed_about_field = 1;
 204                 wrbuf_printf(wr, "field=%s content=", name);
 205                 if (strlen(words) > 50)
 206                 {
 207                     wrbuf_xmlputs_n(wr, words, 49);
 208                     wrbuf_puts(wr, " ...");
 209                 }
 210                 else
 211                     wrbuf_xmlputs(wr, words);
 212                 wrbuf_puts(wr, ";\n");
 213             }
 214             assert(res < r->vec_len);
 215             w[res] += local_weight / (1 + log2(1 + lead_decay * length));
 216             wrbuf_printf(wr, "%s: w[%d] += w(%d) / "
 217                          "(1+log2(1+lead_decay(%f) * length(%d)));\n",
 218                          e->display_str, res, local_weight, lead_decay, length);
 219             j = res - 1;
 220             if (j > 0 && r->term_pos[j])
 221             {
 222                 int d = length + 1 - r->term_pos[j];
 223                 wrbuf_printf(wr, "%s: w[%d] += w[%d](%d) * follow(%f) / "
 224                              "(1+log2(d(%d));\n",
 225                              e->display_str, res, res, w[res],
 226                              r->follow_factor, d);
 227                 w[res] += w[res] * r->follow_factor / (1 + log2(d));
 228             }
 229             for (j = 0; j < r->vec_len; j++)
 230                 r->term_pos[j] = j < res ? 0 : length + 1;
 231         }
 232         length++;
 233     }
 234
 235     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
 236     {
 237         if (length == 0 || w[i] == 0)
 238             continue;
 239         wrbuf_printf(wr, "%s: tf[%d] += w[%d](%d)", e->display_str, i, i, w[i]);
 240         switch (r->length_divide)
 241         {
 242         case 0:
 243             cluster->term_frequency_vecf[i] += (double) w[i];
 244             break;
 245         case 1:
 246             wrbuf_printf(wr, " / log2(1+length(%d))", length);
 247             cluster->term_frequency_vecf[i] +=
 248                 (double) w[i] / log2(1 + length);
 249             break;
 250         case 2:
 251             wrbuf_printf(wr, " / length(%d)", length);
 252             cluster->term_frequency_vecf[i] += (double) w[i] / length;
 253         }
 254         cluster->term_frequency_vec[i] += w[i];
 255         wrbuf_printf(wr, " (%f);\n", cluster->term_frequency_vecf[i]);
 256     }
 257
 258     cluster->term_frequency_vec[0] += length;
 259 }
 260
 261 static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
 262 {
 263     char **words;
 264     int numwords;
 265     char *ccl_field;
 266     int i;
 267
 268     switch (n->kind)
 269     {
 270     case CCL_RPN_AND:
 271     case CCL_RPN_OR:
 272     case CCL_RPN_NOT:
 273     case CCL_RPN_PROX:
 274         pull_terms(res, n->u.p[0]);
 275         pull_terms(res, n->u.p[1]);
 276         break;
 277     case CCL_RPN_TERM:
 278         nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords);
 279         for (i = 0; i < numwords; i++)
 280         {
 281             const char *norm_str;
 282
 283             ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual);
 284
 285             pp2_charset_token_first(res->prt, words[i], 0);
 286             while ((norm_str = pp2_charset_token_next(res->prt)))
 287             {
 288                 struct word_entry **e = &res->entries;
 289                 while (*e)
 290                     e = &(*e)->next;
 291                 *e = nmem_malloc(res->nmem, sizeof(**e));
 292                 (*e)->norm_str = nmem_strdup(res->nmem, norm_str);
 293                 (*e)->ccl_field = ccl_field;
 294                 (*e)->termno = res->vec_len++;
 295                 (*e)->display_str = nmem_strdup(res->nmem, words[i]);
 296                 (*e)->next = 0;
 297             }
 298         }
 299         break;
 300     default:
 301         break;
 302     }
 303 }
 304 void relevance_clear(struct relevance *r)
 305 {
 306     if (r)
 307     {
 308         int i;
 309         for (i = 0; i < r->vec_len; i++)
 310             r->doc_frequency_vec[i] = 0;
 311     }
 312 }
 313
 314 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
 315                                        struct ccl_rpn_node *query,
 316                                        int rank_cluster,
 317                                        double follow_factor, double lead_decay,
 318                                        int length_divide)
 319 {
 320     NMEM nmem = nmem_create();
 321     struct relevance *res = nmem_malloc(nmem, sizeof(*res));
 322
 323     res->nmem = nmem;
 324     res->entries = 0;
 325     res->vec_len = 1;
 326     res->rank_cluster = rank_cluster;
 327     res->follow_factor = follow_factor;
 328     res->lead_decay = lead_decay;
 329     res->length_divide = length_divide;
 330     res->prt = pp2_charset_token_create(pft, "relevance");
 331
 332     pull_terms(res, query);
 333
 334     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 335
 336     // worker array
 337     res->term_frequency_vec_tmp =
 338         nmem_malloc(res->nmem,
 339                     res->vec_len * sizeof(*res->term_frequency_vec_tmp));
 340
 341     res->term_pos =
 342         nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
 343
 344     relevance_clear(res);
 345
 346     res->norm = 0;
 347     return res;
 348 }
 349
 350 void relevance_destroy(struct relevance **rp)
 351 {
 352     if (*rp)
 353     {
 354         pp2_charset_token_destroy((*rp)->prt);
 355         nmem_destroy((*rp)->nmem);
 356         *rp = 0;
 357     }
 358 }
 359
 360 void relevance_mergerec(struct relevance *r, struct record_cluster *dst,
 361                         const struct record_cluster *src)
 362 {
 363     int i;
 364
 365     for (i = 0; i < r->vec_len; i++)
 366         dst->term_frequency_vec[i] += src->term_frequency_vec[i];
 367
 368     for (i = 0; i < r->vec_len; i++)
 369         dst->term_frequency_vecf[i] += src->term_frequency_vecf[i];
 370 }
 371
 372 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 373 {
 374     int i;
 375
 376     // term frequency [1,..] . [0] is total length of all fields
 377     rec->term_frequency_vec =
 378         nmem_malloc(r->nmem,
 379                     r->vec_len * sizeof(*rec->term_frequency_vec));
 380     for (i = 0; i < r->vec_len; i++)
 381         rec->term_frequency_vec[i] = 0;
 382
 383     // term frequency divided by length of field [1,...]
 384     rec->term_frequency_vecf =
 385         nmem_malloc(r->nmem,
 386                     r->vec_len * sizeof(*rec->term_frequency_vecf));
 387     for (i = 0; i < r->vec_len; i++)
 388         rec->term_frequency_vecf[i] = 0.0;
 389 }
 390
 391 static const char *getfield(struct record *bestrecord, const char *tag)
 392 {
 393     struct session *se = client_get_session(bestrecord->client);
 394     int md_field_id = conf_service_metadata_field_id(se->service, tag);
 395     struct record_metadata *md = 0;
 396     if (md_field_id <0)
 397         return "";
 398     md = bestrecord->metadata[md_field_id];
 399     if ( md)
 400         return md->data.text.disp;
 401     return "";
 402 }
 403
 404 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 405 {
 406     int i;
 407
 408     // Find the best record in a cluster - the one with lowest position
 409     // (in this proto. Later, find a better one)
 410     struct record *bestrecord = 0;
 411     struct record *record;
 412     struct normalizing *n;
 413     float score;
 414     for (record = cluster->records; record; record = record->next)
 415         if ( bestrecord == 0 || bestrecord->position < record->position )
 416             bestrecord = record;
 417     n = findnorm(r,bestrecord->client);
 418     n->count ++;
 419     score = atof( getfield(bestrecord,"score") );
 420     n->sum += score;
 421     if ( n->max < score )
 422         n->max = score;
 423
 424     for (i = 1; i < r->vec_len; i++)
 425         if (cluster->term_frequency_vec[i] > 0)
 426             r->doc_frequency_vec[i]++;
 427
 428     r->doc_frequency_vec[0]++;
 429 }
 430
 431
 432 // Helper to compare floats, for qsort
 433 static int sort_float(const void *x, const void *y)
 434 {
 435     const float *fx = x;
 436     const float *fy = y;
 437     //yaz_log(YLOG_LOG,"sorting %f and %f", *fx, *fy);  // ###
 438     if ( *fx < *fy )
 439         return 1;
 440     if ( *fx > *fy )
 441         return -1;
 442     return 0;   // do not return *fx-*fy, it is often too close to zero.
 443 }
 444
 445 // Prepare for a relevance-sorted read
 446 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
 447                             enum conf_sortkey_type type)
 448 {
 449     int i;
 450     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 451     int n_clients = clients_count();
 452     int clusternumber = 0;
 453     yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients);
 454
 455     reclist_enter(reclist);
 456     // Calculate document frequency vector for each term.
 457     for (i = 1; i < rel->vec_len; i++)
 458     {
 459         if (!rel->doc_frequency_vec[i])
 460             idfvec[i] = 0;
 461         else
 462         {
 463             /* add one to nominator idf(t,D) to ensure a value > 0 */
 464             idfvec[i] = log((float) (1 + rel->doc_frequency_vec[0]) /
 465                             rel->doc_frequency_vec[i]);
 466         }
 467     }
 468     // Calculate relevance for each document
 469     while (1)
 470     {
 471         int relevance = 0;
 472         WRBUF w;
 473         struct word_entry *e = rel->entries;
 474         struct record_cluster *rec = reclist_read_record(reclist);
 475         if (!rec)
 476             break;
 477         clusternumber++;
 478         w = rec->relevance_explain2;
 479         wrbuf_rewind(w);
 480         wrbuf_puts(w, "relevance = 0;\n");
 481         for (i = 1; i < rel->vec_len; i++)
 482         {
 483             float termfreq = (float) rec->term_frequency_vecf[i];
 484             int add = 100000 * termfreq * idfvec[i];
 485
 486             wrbuf_printf(w, "idf[%d] = log(((1 + total(%d))/termoccur(%d));\n",
 487                          i, rel->doc_frequency_vec[0],
 488                          rel->doc_frequency_vec[i]);
 489             wrbuf_printf(w, "%s: relevance += 100000 * tf[%d](%f) * "
 490                          "idf[%d](%f) (%d);\n",
 491                          e->display_str, i, termfreq, i, idfvec[i], add);
 492             relevance += add;
 493             e = e->next;
 494         }
 495         if (!rel->rank_cluster)
 496         {
 497             struct record *record;
 498             int cluster_size = 0;
 499
 500             for (record = rec->records; record; record = record->next)
 501                 cluster_size++;
 502
 503             wrbuf_printf(w, "score = relevance(%d)/cluster_size(%d);\n",
 504                          relevance, cluster_size);
 505             relevance /= cluster_size;
 506         }
 507         else
 508         {
 509             wrbuf_printf(w, "score = relevance(%d);\n", relevance);
 510         }
 511         // Experimental round-robin
 512         // Overwrites the score calculated above, but I keep it there to
 513         // get the log entries
 514         if (type == Metadata_sortkey_relevance_h) {
 515             struct record *record;
 516             struct normalizing *norm;
 517             struct record *bestrecord = 0;
 518             int nclust = 0;
 519             int tfrel = relevance; // keep the old tf/idf score
 520             int robinscore = 0;
 521             int solrscore = 0;
 522             int normscore = 0;
 523             const char *score;
 524             const char *id;
 525             const char *title;
 526             char idbuf[64];
 527             int mergescore = 0;
 528             // Find the best record in a cluster - the one with lowest position
 529             for (record = rec->records; record; record = record->next) {
 530                 if ( bestrecord == 0 || bestrecord->position < record->position )
 531                     bestrecord = record;
 532                 nclust++; // and count them all, for logging
 533             }
 534             norm = findnorm(rel, bestrecord->client);
 535             // Calculate a round-robin score
 536             robinscore = -(bestrecord->position * n_clients + norm->num) ;
 537             wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
 538                          bestrecord->position, norm->num, nclust, tfrel, robinscore );
 539             yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
 540                          bestrecord->position, norm->num, nclust, relevance );
 541
 542             // Check if the record has a score field
 543             score = getfield(bestrecord,"score");
 544             id = getfield(bestrecord, "id");
 545             title = getfield(bestrecord, "title");
 546             solrscore = 10000.0 * atof(score);
 547             // clear the id, we only want the first numerical part
 548             i=0;
 549             while( id[i] >= '0' && id[i] <= '9' ) {
 550                 idbuf[i] = id[i];
 551                 i++;
 552             }
 553             idbuf[i] = '\0';
 554             if ( norm->count && *score )
 555             {
 556                 //float avg = norm->sum / norm->count;
 557                 normscore = 10000.0 * (  atof(score) / norm->max );
 558                 wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n",
 559                         score, norm->max, normscore);
 560             } else
 561                 yaz_log(YLOG_LOG, "normscore: no count, can not normalize score '%s' ", score );
 562
 563             // If we have a score in the best record, we probably have in them all
 564             // and we can try to merge scores
 565             if ( *score ) {
 566                 float scores[nclust];
 567                 float s = 0.0;
 568                 float sum = 0.0;
 569                 int i=0;
 570                 if ( rec->records && rec->records->next )
 571                 { // have more than one record
 572                     for (record = rec->records; record; record = record->next, i++)
 573                     {
 574                         const char *scorefld = getfield(record,"score");
 575                         scores[i] = atof( scorefld );
 576                         yaz_log(YLOG_LOG,"mergescore %d: %s", i, scorefld );
 577                         wrbuf_printf(w,"mergeplot %d  %f x\n", clusternumber, 10000*scores[i] );
 578                     }
 579                     qsort(scores, nclust, sizeof(float), sort_float );
 580                     for (i = 0; i<nclust; i++)
 581                     {
 582                         yaz_log(YLOG_LOG,"Sorted mergescore %d: %f + %f/%d = %f", i, s,scores[i],i+1, s+scores[i] / (i+1) );
 583                         wrbuf_printf(w,"Sorted mergescore %d: %f + %f/%d = %f\n",  i, s,scores[i],i+1, s+scores[i] / (i+1));
 584                         s += scores[i] / (i+1);
 585                         sum += scores[i];
 586                     }
 587                     mergescore = s * 10000;
 588                     wrbuf_printf(w,"mergeplot %d  x %d %f %f %d\n", clusternumber, mergescore,
 589                         10000.0*sum, 10000.0*sum/nclust, nclust );
 590                     yaz_log(YLOG_LOG,"mergeplot %d  x %d %f %f %d", clusternumber, mergescore,
 591                         10000.0*sum, 10000.0*sum/nclust, nclust );
 592                 }
 593                 else
 594                 { // only one record, take the easy way out of merging (and don't bother plotting)
 595                     mergescore = atof( score ) * 10000;
 596                 }
 597             } // merge score
 598             id = getfield(bestrecord, "id");
 599             // clear the id, we only want the first numerical part
 600             i=0;
 601             while( id[i] >= '0' && id[i] <= '9' ) {
 602                 idbuf[i] = id[i];
 603                 i++;
 604             }
 605             idbuf[i] = '\0';
 606
 607             title = getfield(bestrecord, "title");
 608             wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n",
 609                             norm->num, bestrecord->position,
 610                             tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title );
 611             relevance = mergescore;
 612         }
 613         rec->relevance_score = relevance;
 614     }
 615     reclist_leave(reclist);
 616     xfree(idfvec);
 617 }
 618
 619 /*
 620  * Local variables:
 621  * c-basic-offset: 4
 622  * c-file-style: "Stroustrup"
 623  * indent-tabs-mode: nil
 624  * End:
 625  * vim: shiftwidth=4 tabstop=8 expandtab
 626  */
 627