src/relevance.c

   1 /* This file is part of Pazpar2.
   2    Copyright (C) 2006-2008 Index Data
   3
   4 Pazpar2 is free software; you can redistribute it and/or modify it under
   5 the terms of the GNU General Public License as published by the Free
   6 Software Foundation; either version 2, or (at your option) any later
   7 version.
   8
   9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 for more details.
  13
  14 You should have received a copy of the GNU General Public License
  15 along with this program; if not, write to the Free Software
  16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 */
  19
  20 #if HAVE_CONFIG_H
  21 #include <config.h>
  22 #endif
  23
#include <ctype.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
  27
  28 #include "relevance.h"
  29 #include "pazpar2.h"
  30
  31 #define USE_TRIE 0
  32
  33 struct relevance
  34 {
  35     int *doc_frequency_vec;
  36     int vec_len;
  37 #if USE_TRIE
  38     struct word_trie *wt;
  39 #else
  40     struct word_entry *entries;
  41     pp2_charset_t pct;
  42 #endif
  43     NMEM nmem;
  44 };
  45
  46 #if USE_TRIE
  47 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
  48
  49
  50 // We use this data structure to recognize terms in input records,
  51 // and map them to record term vectors for counting.
  52 struct word_trie
  53 {
  54     struct
  55     {
  56         struct word_trie *child;
  57         int termno;
  58     } list[26];
  59 };
  60
  61 static struct word_trie *create_word_trie_node(NMEM nmem)
  62 {
  63     struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
  64     int i;
  65     for (i = 0; i < 26; i++)
  66     {
  67         res->list[i].child = 0;
  68         res->list[i].termno = -1;
  69     }
  70     return res;
  71 }
  72
  73 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
  74 {
  75
  76     while (*term) {
  77         int c = tolower(*term);
  78         if (c < 'a' || c > 'z')
  79             term++;
  80         else
  81         {
  82             c -= 'a';
  83             if (!*(++term))
  84                 n->list[c].termno = num;
  85             else
  86             {
  87                 if (!n->list[c].child)
  88                 {
  89                     struct word_trie *new = create_word_trie_node(nmem);
  90                     n->list[c].child = new;
  91                 }
  92                 word_trie_addterm(nmem, n->list[c].child, term, num);
  93             }
  94             break;
  95         }
  96     }
  97 }
  98
  99 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
 100 {
 101     int c = raw_char(tolower(*word));
 102
 103     if (!*word)
 104         return 0;
 105
 106     word++;
 107     (*skipped)++;
 108     if (!*word || raw_char(*word) < 0)
 109     {
 110         if (t->list[c].termno > 0)
 111             return t->list[c].termno;
 112         else
 113             return 0;
 114     }
 115     else
 116     {
 117         if (t->list[c].child)
 118         {
 119             return word_trie_match(t->list[c].child, word, skipped);
 120         }
 121         else
 122             return 0;
 123     }
 124
 125 }
 126
 127
 128 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
 129 {
 130     struct word_trie *res = create_word_trie_node(nmem);
 131     const char **p;
 132     int i;
 133
 134     for (i = 1, p = terms; *p; p++, i++)
 135         word_trie_addterm(nmem, res, *p, i);
 136     return res;
 137 }
 138
 139
 140 // FIXME. The definition of a word is crude here.. should support
 141 // some form of localization mechanism?
 142 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 143                           const char *words, int multiplier)
 144 {
 145     while (*words)
 146     {
 147         char c;
 148         int res;
 149         int skipped = 0;
 150         while (*words && (c = raw_char(tolower(*words))) < 0)
 151             words++;
 152         if (!*words)
 153             break;
 154         res = word_trie_match(r->wt, words, &skipped);
 155         if (res)
 156         {
 157             words += skipped;
 158             cluster->term_frequency_vec[res] += multiplier;
 159         }
 160         else
 161         {
 162             while (*words && (c = raw_char(tolower(*words))) >= 0)
 163                 words++;
 164         }
 165         cluster->term_frequency_vec[0]++;
 166     }
 167 }
 168
 169 #else
 170
 171 struct word_entry {
 172     const char *norm_str;
 173     int termno;
 174     struct word_entry *next;
 175 };
 176
 177 static void add_word_entry(NMEM nmem,
 178                            struct word_entry **entries,
 179                            const char *norm_str,
 180                            int term_no)
 181 {
 182     struct word_entry *ne = nmem_malloc(nmem, sizeof(*ne));
 183     ne->norm_str = nmem_strdup(nmem, norm_str);
 184     ne->termno = term_no;
 185
 186     ne->next = *entries;
 187     *entries = ne;
 188 }
 189
 190
 191 int word_entry_match(struct word_entry *entries, const char *norm_str)
 192 {
 193     for (; entries; entries = entries->next)
 194     {
 195         if (!strcmp(norm_str, entries->norm_str))
 196             return entries->termno;
 197     }
 198     return 0;
 199 }
 200
 201 static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem,
 202                                              const char **terms)
 203 {
 204     int termno = 1; /* >0 signals THERE is an entry */
 205     struct word_entry *entries = 0;
 206     const char **p = terms;
 207
 208     for (; *p; p++)
 209     {
 210         pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p);
 211         const char *norm_str;
 212
 213         while ((norm_str = pp2_relevance_token_next(prt)))
 214             add_word_entry(nmem, &entries, norm_str, termno);
 215
 216         pp2_relevance_token_destroy(prt);
 217
 218         termno++;
 219     }
 220     return entries;
 221 }
 222
 223 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 224         const char *words, int multiplier)
 225 {
 226     pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words);
 227
 228     const char *norm_str;
 229
 230     while ((norm_str = pp2_relevance_token_next(prt)))
 231     {
 232         int res = word_entry_match(r->entries, norm_str);
 233         if (res)
 234             cluster->term_frequency_vec[res] += multiplier;
 235         cluster->term_frequency_vec[0]++;
 236     }
 237     pp2_relevance_token_destroy(prt);
 238 }
 239
 240 #endif
 241
 242
 243
 244 struct relevance *relevance_create(pp2_charset_t pct,
 245                                    NMEM nmem, const char **terms, int numrecs)
 246 {
 247     struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
 248     const char **p;
 249     int i;
 250
 251     for (p = terms, i = 0; *p; p++, i++)
 252         ;
 253     res->vec_len = ++i;
 254     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 255     memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
 256     res->nmem = nmem;
 257 #if USE_TRIE
 258     res->wt = build_word_trie(nmem, terms);
 259 #else
 260     res->entries = build_word_entries(pct, nmem, terms);
 261     res->pct = pct;
 262 #endif
 263     return res;
 264 }
 265
 266 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 267 {
 268     if (!rec->term_frequency_vec)
 269     {
 270         rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
 271         memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
 272     }
 273 }
 274
 275
 276 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 277 {
 278     int i;
 279
 280     for (i = 1; i < r->vec_len; i++)
 281         if (cluster->term_frequency_vec[i] > 0)
 282             r->doc_frequency_vec[i]++;
 283
 284     r->doc_frequency_vec[0]++;
 285 }
 286
 287 // Prepare for a relevance-sorted read
 288 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 289 {
 290     int i;
 291     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 292
 293     // Calculate document frequency vector for each term.
 294     for (i = 1; i < rel->vec_len; i++)
 295     {
 296         if (!rel->doc_frequency_vec[i])
 297             idfvec[i] = 0;
 298         else
 299         {
 300             // This conditional may be terribly wrong
 301             // It was there to address the situation where vec[0] == vec[i]
 302             // which leads to idfvec[i] == 0... not sure about this
 303             // Traditional TF-IDF may assume that a word that occurs in every
 304             // record is irrelevant, but this is actually something we will
 305             // see a lot
 306             if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
 307                             rel->doc_frequency_vec[i])) < 0.0000001)
 308                 idfvec[i] = 1;
 309         }
 310     }
 311     // Calculate relevance for each document
 312     for (i = 0; i < reclist->num_records; i++)
 313     {
 314         int t;
 315         struct record_cluster *rec = reclist->flatlist[i];
 316         float relevance;
 317         relevance = 0;
 318         for (t = 1; t < rel->vec_len; t++)
 319         {
 320             float termfreq;
 321             if (!rec->term_frequency_vec[0])
 322                 break;
 323             termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
 324             relevance += termfreq * idfvec[t];
 325         }
 326         rec->relevance = (int) (relevance * 100000);
 327     }
 328     reclist->pointer = 0;
 329     xfree(idfvec);
 330 }
 331
 332 /*
 333  * Local variables:
 334  * c-basic-offset: 4
 335  * indent-tabs-mode: nil
 336  * End:
 337  * vim: shiftwidth=4 tabstop=8 expandtab
 338  */