src/relevance.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2012 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 #if HAVE_CONFIG_H
  21 #include <config.h>
  22 #endif
  23
  24 #include <assert.h>
  25 #include <math.h>
  26 #include <stdlib.h>
  27
  28 #include "relevance.h"
  29 #include "session.h"
  30
  31 struct relevance
  32 {
  33     int *doc_frequency_vec;
  34     int vec_len;
  35     struct word_entry *entries;
  36     pp2_charset_token_t prt;
  37     NMEM nmem;
  38 };
  39
  40 struct word_entry {
  41     const char *norm_str;
  42     int termno;
  43     char *ccl_field;
  44     struct word_entry *next;
  45 };
  46
  47 static int word_entry_match(struct word_entry *entries, const char *norm_str,
  48                             const char *frank, int *local_mult)
  49 {
  50     for (; entries; entries = entries->next)
  51     {
  52         if (!strcmp(norm_str, entries->norm_str))
  53         {
  54             const char *cp = 0;
  55             if (frank && (cp = strchr(frank, ' ')))
  56             {
  57                 if ((cp - frank) == strlen(entries->ccl_field) &&
  58                     memcmp(entries->ccl_field, frank, cp - frank) == 0)
  59                     *local_mult = atoi(cp + 1);
  60             }
  61             return entries->termno;
  62         }
  63     }
  64     return 0;
  65 }
  66
  67 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
  68                           const char *words, int multiplier, const char *name,
  69                           const char *frank)
  70 {
  71     int *mult = cluster->term_frequency_vec_tmp;
  72     const char *norm_str;
  73     int i, length = 0;
  74     pp2_charset_token_first(r->prt, words, 0);
  75     for (i = 1; i < r->vec_len; i++)
  76         mult[i] = 0;
  77
  78     while ((norm_str = pp2_charset_token_next(r->prt)))
  79     {
  80         int local_mult = multiplier;
  81         int res = word_entry_match(r->entries, norm_str, frank, &local_mult);
  82         if (res)
  83         {
  84             assert(res < r->vec_len);
  85             mult[res] += local_mult;
  86         }
  87         length++;
  88     }
  89
  90     for (i = 1; i < r->vec_len; i++)
  91     {
  92         if (length > 0) /* only add if non-empty */
  93             cluster->term_frequency_vecf[i] += (double) mult[i] / length;
  94         cluster->term_frequency_vec[i] += mult[i];
  95     }
  96
  97     cluster->term_frequency_vec[0] += length;
  98 }
  99
 100 static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
 101 {
 102     char **words;
 103     int numwords;
 104     char *ccl_field;
 105     int i;
 106
 107     switch (n->kind)
 108     {
 109     case CCL_RPN_AND:
 110     case CCL_RPN_OR:
 111     case CCL_RPN_NOT:
 112     case CCL_RPN_PROX:
 113         pull_terms(res, n->u.p[0]);
 114         pull_terms(res, n->u.p[1]);
 115         break;
 116     case CCL_RPN_TERM:
 117         nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords);
 118         for (i = 0; i < numwords; i++)
 119         {
 120             const char *norm_str;
 121
 122             ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual);
 123
 124             pp2_charset_token_first(res->prt, words[i], 0);
 125             while ((norm_str = pp2_charset_token_next(res->prt)))
 126             {
 127                 struct word_entry **e = &res->entries;
 128                 while (*e)
 129                     e = &(*e)->next;
 130                 *e = nmem_malloc(res->nmem, sizeof(**e));
 131                 (*e)->norm_str = nmem_strdup(res->nmem, norm_str);
 132                 (*e)->ccl_field = ccl_field;
 133                 (*e)->termno = res->vec_len++;
 134                 (*e)->next = 0;
 135             }
 136         }
 137         break;
 138     default:
 139         break;
 140     }
 141 }
 142
 143 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
 144                                        struct ccl_rpn_node *query)
 145 {
 146     NMEM nmem = nmem_create();
 147     struct relevance *res = nmem_malloc(nmem, sizeof(*res));
 148     int i;
 149
 150     res->nmem = nmem;
 151     res->entries = 0;
 152     res->vec_len = 1;
 153     res->prt = pp2_charset_token_create(pft, "relevance");
 154
 155     pull_terms(res, query);
 156
 157     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 158     for (i = 0; i < res->vec_len; i++)
 159         res->doc_frequency_vec[i] = 0;
 160     return res;
 161 }
 162
 163 void relevance_destroy(struct relevance **rp)
 164 {
 165     if (*rp)
 166     {
 167         pp2_charset_token_destroy((*rp)->prt);
 168         nmem_destroy((*rp)->nmem);
 169         *rp = 0;
 170     }
 171 }
 172
 173 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 174 {
 175     if (!rec->term_frequency_vec)
 176     {
 177         int i;
 178
 179         // term frequency [1,..] . [0] is total length of all fields
 180         rec->term_frequency_vec =
 181             nmem_malloc(r->nmem,
 182                         r->vec_len * sizeof(*rec->term_frequency_vec));
 183         for (i = 0; i < r->vec_len; i++)
 184             rec->term_frequency_vec[i] = 0;
 185
 186         // term frequency divided by length of field [1,...]
 187         rec->term_frequency_vecf =
 188             nmem_malloc(r->nmem,
 189                         r->vec_len * sizeof(*rec->term_frequency_vecf));
 190         for (i = 0; i < r->vec_len; i++)
 191             rec->term_frequency_vecf[i] = 0.0;
 192
 193         // for relevance_countwords (so we don't have to xmalloc/xfree)
 194         rec->term_frequency_vec_tmp =
 195             nmem_malloc(r->nmem,
 196                         r->vec_len * sizeof(*rec->term_frequency_vec_tmp));
 197     }
 198 }
 199
 200 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 201 {
 202     int i;
 203
 204     for (i = 1; i < r->vec_len; i++)
 205         if (cluster->term_frequency_vec[i] > 0)
 206             r->doc_frequency_vec[i]++;
 207
 208     r->doc_frequency_vec[0]++;
 209 }
 210
 211 // Prepare for a relevance-sorted read
 212 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 213 {
 214     int i;
 215     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 216
 217     reclist_enter(reclist);
 218     // Calculate document frequency vector for each term.
 219     for (i = 1; i < rel->vec_len; i++)
 220     {
 221         if (!rel->doc_frequency_vec[i])
 222             idfvec[i] = 0;
 223         else
 224         {
 225             // This conditional may be terribly wrong
 226             // It was there to address the situation where vec[0] == vec[i]
 227             // which leads to idfvec[i] == 0... not sure about this
 228             // Traditional TF-IDF may assume that a word that occurs in every
 229             // record is irrelevant, but this is actually something we will
 230             // see a lot
 231             if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
 232                             rel->doc_frequency_vec[i])) < 0.0000001)
 233                 idfvec[i] = 1;
 234         }
 235     }
 236     // Calculate relevance for each document
 237     while (1)
 238     {
 239         int t;
 240         int relevance = 0;
 241         struct record_cluster *rec = reclist_read_record(reclist);
 242         if (!rec)
 243             break;
 244         for (t = 1; t < rel->vec_len; t++)
 245         {
 246             float termfreq = (float) rec->term_frequency_vecf[t];
 247             relevance += 100000 * (termfreq * idfvec[t] + 0.0000005);
 248         }
 249         rec->relevance_score = relevance;
 250     }
 251     reclist_leave(reclist);
 252     xfree(idfvec);
 253 }
 254
 255 /*
 256  * Local variables:
 257  * c-basic-offset: 4
 258  * c-file-style: "Stroustrup"
 259  * indent-tabs-mode: nil
 260  * End:
 261  * vim: shiftwidth=4 tabstop=8 expandtab
 262  */
 263