src/relevance.c

   1 /* $Id: relevance.c,v 1.12 2007-05-10 09:26:19 adam Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4 This file is part of Pazpar2.
   5
   6 Pazpar2 is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 2, or (at your option) any later
   9 version.
  10
  11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Pazpar2; see the file LICENSE.  If not, write to the
  18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19 02111-1307, USA.
  20  */
  21
  22 #include <ctype.h>
  23 #include <math.h>
  24 #include <stdlib.h>
  25
  26 #if HAVE_CONFIG_H
  27 #include <cconfig.h>
  28 #endif
  29
  30 #include "relevance.h"
  31 #include "pazpar2.h"
  32
  33 #define USE_TRIE 0
  34
  35 struct relevance
  36 {
  37     int *doc_frequency_vec;
  38     int vec_len;
  39 #if USE_TRIE
  40     struct word_trie *wt;
  41 #else
  42     struct word_entry *entries;
  43 #endif
  44     NMEM nmem;
  45 };
  46
  47 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
  48
  49
  50 #if USE_TRIE
  51 // We use this data structure to recognize terms in input records,
  52 // and map them to record term vectors for counting.
  53 struct word_trie
  54 {
  55     struct
  56     {
  57         struct word_trie *child;
  58         int termno;
  59     } list[26];
  60 };
  61
  62 static struct word_trie *create_word_trie_node(NMEM nmem)
  63 {
  64     struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
  65     int i;
  66     for (i = 0; i < 26; i++)
  67     {
  68         res->list[i].child = 0;
  69         res->list[i].termno = -1;
  70     }
  71     return res;
  72 }
  73
  74 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
  75 {
  76
  77     while (*term) {
  78         int c = tolower(*term);
  79         if (c < 'a' || c > 'z')
  80             term++;
  81         else
  82         {
  83             c -= 'a';
  84             if (!*(++term))
  85                 n->list[c].termno = num;
  86             else
  87             {
  88                 if (!n->list[c].child)
  89                 {
  90                     struct word_trie *new = create_word_trie_node(nmem);
  91                     n->list[c].child = new;
  92                 }
  93                 word_trie_addterm(nmem, n->list[c].child, term, num);
  94             }
  95             break;
  96         }
  97     }
  98 }
  99
 100 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
 101 {
 102     int c = raw_char(tolower(*word));
 103
 104     if (!*word)
 105         return 0;
 106
 107     word++;
 108     (*skipped)++;
 109     if (!*word || raw_char(*word) < 0)
 110     {
 111         if (t->list[c].termno > 0)
 112             return t->list[c].termno;
 113         else
 114             return 0;
 115     }
 116     else
 117     {
 118         if (t->list[c].child)
 119         {
 120             return word_trie_match(t->list[c].child, word, skipped);
 121         }
 122         else
 123             return 0;
 124     }
 125
 126 }
 127
 128
 129 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
 130 {
 131     struct word_trie *res = create_word_trie_node(nmem);
 132     const char **p;
 133     int i;
 134
 135     for (i = 1, p = terms; *p; p++, i++)
 136         word_trie_addterm(nmem, res, *p, i);
 137     return res;
 138 }
 139
 140 #else
 141
 142 struct word_entry {
 143     const char *norm_str;
 144     int termno;
 145     struct word_entry *next;
 146 };
 147
 148 static void add_word_entry(NMEM nmem,
 149                            struct word_entry **entries,
 150                            const char *norm_str,
 151                            int term_no)
 152 {
 153     struct word_entry *ne = nmem_malloc(nmem, sizeof(*ne));
 154     ne->norm_str = nmem_strdup(nmem, norm_str);
 155     ne->termno = term_no;
 156
 157     ne->next = *entries;
 158     *entries = ne;
 159 }
 160
 161
 162 int word_entry_match(struct word_entry *entries, const char *norm_str)
 163 {
 164     for (; entries; entries = entries->next)
 165     {
 166         if (!strcmp(norm_str, entries->norm_str))
 167             return entries->termno;
 168     }
 169     return 0;
 170 }
 171
 172 static struct word_entry *build_word_entries(NMEM nmem,
 173                                              const char **terms)
 174 {
 175     int termno = 1; /* >0 signals THERE is an entry */
 176     struct word_entry *entries = 0;
 177     const char **p = terms;
 178     WRBUF norm_str = wrbuf_alloc();
 179
 180     for (; *p; p++)
 181     {
 182         const char *cp = *p;
 183         for (; *cp; cp++)
 184         {
 185             int c = raw_char(*cp);
 186             if (c >= 0)
 187                 wrbuf_putc(norm_str, c);
 188             else
 189             {
 190                 if (wrbuf_len(norm_str))
 191                     add_word_entry(nmem, &entries, wrbuf_cstr(norm_str),
 192                                    termno);
 193                 wrbuf_rewind(norm_str);
 194             }
 195         }
 196         if (wrbuf_len(norm_str))
 197             add_word_entry(nmem, &entries, wrbuf_cstr(norm_str), termno);
 198         wrbuf_rewind(norm_str);
 199         termno++;
 200     }
 201     wrbuf_destroy(norm_str);
 202     return entries;
 203 }
 204
 205
 206
 207 #endif
 208
 209
 210
 211 struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
 212 {
 213     struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
 214     const char **p;
 215     int i;
 216
 217     for (p = terms, i = 0; *p; p++, i++)
 218         ;
 219     res->vec_len = ++i;
 220     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 221     memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
 222     res->nmem = nmem;
 223 #if USE_TRIE
 224     res->wt = build_word_trie(nmem, terms);
 225 #else
 226     res->entries = build_word_entries(nmem, terms);
 227 #endif
 228     return res;
 229 }
 230
 231 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 232 {
 233     if (!rec->term_frequency_vec)
 234     {
 235         rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
 236         memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
 237     }
 238 }
 239
 240
 241 // FIXME. The definition of a word is crude here.. should support
 242 // some form of localization mechanism?
 243 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 244         const char *words, int multiplier)
 245 {
 246 #if !USE_TRIE
 247     WRBUF norm_str = wrbuf_alloc();
 248 #endif
 249     while (*words)
 250     {
 251         char c;
 252         int res;
 253 #if USE_TRIE
 254         int skipped = 0;
 255 #endif
 256         while (*words && (c = raw_char(tolower(*words))) < 0)
 257             words++;
 258         if (!*words)
 259             return;
 260 #if USE_TRIE
 261         res = word_trie_match(r->wt, words, &skipped);
 262         if (res)
 263         {
 264             words += skipped;
 265             cluster->term_frequency_vec[res] += multiplier;
 266         }
 267         else
 268         {
 269             while (*words && (c = raw_char(tolower(*words))) >= 0)
 270                 words++;
 271         }
 272 #else
 273         while (*words && (c = raw_char(tolower(*words))) >= 0)
 274         {
 275             wrbuf_putc(norm_str, c);
 276             words++;
 277         }
 278         res = word_entry_match(r->entries, wrbuf_cstr(norm_str));
 279         if (res)
 280             cluster->term_frequency_vec[res] += multiplier;
 281         wrbuf_rewind(norm_str);
 282 #endif
 283         cluster->term_frequency_vec[0]++;
 284     }
 285 #if !USE_TRIE
 286     wrbuf_destroy(norm_str);
 287 #endif
 288 }
 289
 290 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 291 {
 292     int i;
 293
 294     for (i = 1; i < r->vec_len; i++)
 295         if (cluster->term_frequency_vec[i] > 0)
 296             r->doc_frequency_vec[i]++;
 297
 298     r->doc_frequency_vec[0]++;
 299 }
 300
 301 #ifdef GAGA
 302 #ifdef FLOAT_REL
 303 static int comp(const void *p1, const void *p2)
 304 {
 305     float res;
 306     struct record **r1 = (struct record **) p1;
 307     struct record **r2 = (struct record **) p2;
 308     res = (*r2)->relevance - (*r1)->relevance;
 309     if (res > 0)
 310         return 1;
 311     else if (res < 0)
 312         return -1;
 313     else
 314         return 0;
 315 }
 316 #else
 317 static int comp(const void *p1, const void *p2)
 318 {
 319     struct record_cluster **r1 = (struct record_cluster **) p1;
 320     struct record_cluster **r2 = (struct record_cluster **) p2;
 321     return (*r2)->relevance - (*r1)->relevance;
 322 }
 323 #endif
 324 #endif
 325
 326 // Prepare for a relevance-sorted read
 327 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 328 {
 329     int i;
 330     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 331
 332     // Calculate document frequency vector for each term.
 333     for (i = 1; i < rel->vec_len; i++)
 334     {
 335         if (!rel->doc_frequency_vec[i])
 336             idfvec[i] = 0;
 337         else
 338         {
 339             // This conditional may be terribly wrong
 340             // It was there to address the situation where vec[0] == vec[i]
 341             // which leads to idfvec[i] == 0... not sure about this
 342             // Traditional TF-IDF may assume that a word that occurs in every
 343             // record is irrelevant, but this is actually something we will
 344             // see a lot
 345             if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
 346                             rel->doc_frequency_vec[i])) < 0.0000001)
 347                 idfvec[i] = 1;
 348         }
 349     }
 350     // Calculate relevance for each document
 351     for (i = 0; i < reclist->num_records; i++)
 352     {
 353         int t;
 354         struct record_cluster *rec = reclist->flatlist[i];
 355         float relevance;
 356         relevance = 0;
 357         for (t = 1; t < rel->vec_len; t++)
 358         {
 359             float termfreq;
 360             if (!rec->term_frequency_vec[0])
 361                 break;
 362             termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
 363             relevance += termfreq * idfvec[t];
 364         }
 365         rec->relevance = (int) (relevance * 100000);
 366     }
 367 #ifdef GAGA
 368     qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
 369 #endif
 370     reclist->pointer = 0;
 371     xfree(idfvec);
 372 }
 373
 374 /*
 375  * Local variables:
 376  * c-basic-offset: 4
 377  * indent-tabs-mode: nil
 378  * End:
 379  * vim: shiftwidth=4 tabstop=8 expandtab
 380  */