src/relevance.c

   1 /* $Id: relevance.c,v 1.13 2007-05-10 11:46:09 adam Exp $
   2    Copyright (c) 2006-2007, Index Data.
   3
   4 This file is part of Pazpar2.
   5
   6 Pazpar2 is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 2, or (at your option) any later
   9 version.
  10
  11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with Pazpar2; see the file LICENSE.  If not, write to the
  18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  19 02111-1307, USA.
  20  */
  21
  22 #include <ctype.h>
  23 #include <math.h>
  24 #include <stdlib.h>
  25
  26 #if HAVE_CONFIG_H
  27 #include <cconfig.h>
  28 #endif
  29
  30 #include "relevance.h"
  31 #include "pazpar2.h"
  32
  33 #define USE_TRIE 0
  34
  35 struct relevance
  36 {
  37     int *doc_frequency_vec;
  38     int vec_len;
  39 #if USE_TRIE
  40     struct word_trie *wt;
  41 #else
  42     struct word_entry *entries;
  43     pp2_charset_t pct;
  44 #endif
  45     NMEM nmem;
  46 };
  47
  48 #if USE_TRIE
  49 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
  50
  51
  52 // We use this data structure to recognize terms in input records,
  53 // and map them to record term vectors for counting.
  54 struct word_trie
  55 {
  56     struct
  57     {
  58         struct word_trie *child;
  59         int termno;
  60     } list[26];
  61 };
  62
  63 static struct word_trie *create_word_trie_node(NMEM nmem)
  64 {
  65     struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
  66     int i;
  67     for (i = 0; i < 26; i++)
  68     {
  69         res->list[i].child = 0;
  70         res->list[i].termno = -1;
  71     }
  72     return res;
  73 }
  74
  75 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
  76 {
  77
  78     while (*term) {
  79         int c = tolower(*term);
  80         if (c < 'a' || c > 'z')
  81             term++;
  82         else
  83         {
  84             c -= 'a';
  85             if (!*(++term))
  86                 n->list[c].termno = num;
  87             else
  88             {
  89                 if (!n->list[c].child)
  90                 {
  91                     struct word_trie *new = create_word_trie_node(nmem);
  92                     n->list[c].child = new;
  93                 }
  94                 word_trie_addterm(nmem, n->list[c].child, term, num);
  95             }
  96             break;
  97         }
  98     }
  99 }
 100
 101 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
 102 {
 103     int c = raw_char(tolower(*word));
 104
 105     if (!*word)
 106         return 0;
 107
 108     word++;
 109     (*skipped)++;
 110     if (!*word || raw_char(*word) < 0)
 111     {
 112         if (t->list[c].termno > 0)
 113             return t->list[c].termno;
 114         else
 115             return 0;
 116     }
 117     else
 118     {
 119         if (t->list[c].child)
 120         {
 121             return word_trie_match(t->list[c].child, word, skipped);
 122         }
 123         else
 124             return 0;
 125     }
 126
 127 }
 128
 129
 130 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
 131 {
 132     struct word_trie *res = create_word_trie_node(nmem);
 133     const char **p;
 134     int i;
 135
 136     for (i = 1, p = terms; *p; p++, i++)
 137         word_trie_addterm(nmem, res, *p, i);
 138     return res;
 139 }
 140
 141
 142 // FIXME. The definition of a word is crude here.. should support
 143 // some form of localization mechanism?
 144 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 145                           const char *words, int multiplier)
 146 {
 147     while (*words)
 148     {
 149         char c;
 150         int res;
 151         int skipped = 0;
 152         while (*words && (c = raw_char(tolower(*words))) < 0)
 153             words++;
 154         if (!*words)
 155             break;
 156         res = word_trie_match(r->wt, words, &skipped);
 157         if (res)
 158         {
 159             words += skipped;
 160             cluster->term_frequency_vec[res] += multiplier;
 161         }
 162         else
 163         {
 164             while (*words && (c = raw_char(tolower(*words))) >= 0)
 165                 words++;
 166         }
 167         cluster->term_frequency_vec[0]++;
 168     }
 169 }
 170
 171 #else
 172
 173 struct word_entry {
 174     const char *norm_str;
 175     int termno;
 176     struct word_entry *next;
 177 };
 178
 179 static void add_word_entry(NMEM nmem,
 180                            struct word_entry **entries,
 181                            const char *norm_str,
 182                            int term_no)
 183 {
 184     struct word_entry *ne = nmem_malloc(nmem, sizeof(*ne));
 185     ne->norm_str = nmem_strdup(nmem, norm_str);
 186     ne->termno = term_no;
 187
 188     ne->next = *entries;
 189     *entries = ne;
 190 }
 191
 192
 193 int word_entry_match(struct word_entry *entries, const char *norm_str)
 194 {
 195     for (; entries; entries = entries->next)
 196     {
 197         if (!strcmp(norm_str, entries->norm_str))
 198             return entries->termno;
 199     }
 200     return 0;
 201 }
 202
 203 static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem,
 204                                              const char **terms)
 205 {
 206     int termno = 1; /* >0 signals THERE is an entry */
 207     struct word_entry *entries = 0;
 208     const char **p = terms;
 209
 210     for (; *p; p++)
 211     {
 212         pp2_relevance_token_t prt = pp2_relevance_tokenize(pct, *p);
 213         const char *norm_str;
 214
 215         while ((norm_str = pp2_relevance_token_next(prt)))
 216             add_word_entry(nmem, &entries, norm_str, termno);
 217
 218         pp2_relevance_token_destroy(prt);
 219
 220         termno++;
 221     }
 222     return entries;
 223 }
 224
 225 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
 226         const char *words, int multiplier)
 227 {
 228     pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words);
 229
 230     const char *norm_str;
 231
 232     while ((norm_str = pp2_relevance_token_next(prt)))
 233     {
 234         int res = word_entry_match(r->entries, norm_str);
 235         if (res)
 236             cluster->term_frequency_vec[res] += multiplier;
 237         cluster->term_frequency_vec[0]++;
 238     }
 239     pp2_relevance_token_destroy(prt);
 240 }
 241
 242 #endif
 243
 244
 245
 246 struct relevance *relevance_create(pp2_charset_t pct,
 247                                    NMEM nmem, const char **terms, int numrecs)
 248 {
 249     struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
 250     const char **p;
 251     int i;
 252
 253     for (p = terms, i = 0; *p; p++, i++)
 254         ;
 255     res->vec_len = ++i;
 256     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
 257     memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
 258     res->nmem = nmem;
 259 #if USE_TRIE
 260     res->wt = build_word_trie(nmem, terms);
 261 #else
 262     res->entries = build_word_entries(pct, nmem, terms);
 263     res->pct = pct;
 264 #endif
 265     return res;
 266 }
 267
 268 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
 269 {
 270     if (!rec->term_frequency_vec)
 271     {
 272         rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
 273         memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
 274     }
 275 }
 276
 277
 278 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
 279 {
 280     int i;
 281
 282     for (i = 1; i < r->vec_len; i++)
 283         if (cluster->term_frequency_vec[i] > 0)
 284             r->doc_frequency_vec[i]++;
 285
 286     r->doc_frequency_vec[0]++;
 287 }
 288
 289 #ifdef GAGA
 290 #ifdef FLOAT_REL
 291 static int comp(const void *p1, const void *p2)
 292 {
 293     float res;
 294     struct record **r1 = (struct record **) p1;
 295     struct record **r2 = (struct record **) p2;
 296     res = (*r2)->relevance - (*r1)->relevance;
 297     if (res > 0)
 298         return 1;
 299     else if (res < 0)
 300         return -1;
 301     else
 302         return 0;
 303 }
 304 #else
 305 static int comp(const void *p1, const void *p2)
 306 {
 307     struct record_cluster **r1 = (struct record_cluster **) p1;
 308     struct record_cluster **r2 = (struct record_cluster **) p2;
 309     return (*r2)->relevance - (*r1)->relevance;
 310 }
 311 #endif
 312 #endif
 313
 314 // Prepare for a relevance-sorted read
 315 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
 316 {
 317     int i;
 318     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
 319
 320     // Calculate document frequency vector for each term.
 321     for (i = 1; i < rel->vec_len; i++)
 322     {
 323         if (!rel->doc_frequency_vec[i])
 324             idfvec[i] = 0;
 325         else
 326         {
 327             // This conditional may be terribly wrong
 328             // It was there to address the situation where vec[0] == vec[i]
 329             // which leads to idfvec[i] == 0... not sure about this
 330             // Traditional TF-IDF may assume that a word that occurs in every
 331             // record is irrelevant, but this is actually something we will
 332             // see a lot
 333             if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
 334                             rel->doc_frequency_vec[i])) < 0.0000001)
 335                 idfvec[i] = 1;
 336         }
 337     }
 338     // Calculate relevance for each document
 339     for (i = 0; i < reclist->num_records; i++)
 340     {
 341         int t;
 342         struct record_cluster *rec = reclist->flatlist[i];
 343         float relevance;
 344         relevance = 0;
 345         for (t = 1; t < rel->vec_len; t++)
 346         {
 347             float termfreq;
 348             if (!rec->term_frequency_vec[0])
 349                 break;
 350             termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
 351             relevance += termfreq * idfvec[t];
 352         }
 353         rec->relevance = (int) (relevance * 100000);
 354     }
 355 #ifdef GAGA
 356     qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
 357 #endif
 358     reclist->pointer = 0;
 359     xfree(idfvec);
 360 }
 361
 362 /*
 363  * Local variables:
 364  * c-basic-offset: 4
 365  * indent-tabs-mode: nil
 366  * End:
 367  * vim: shiftwidth=4 tabstop=8 expandtab
 368  */