1 /* $Id: relevance.c,v 1.9 2007-04-10 08:48:56 adam Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
30 #include "relevance.h"
35 int *doc_frequency_vec;
41 // We use this data structure to recognize terms in input records,
42 // and map them to record term vectors for counting.
47 struct word_trie *child;
52 static struct word_trie *create_word_trie_node(NMEM nmem)
54 struct word_trie *res = nmem_malloc(nmem, sizeof(struct word_trie));
56 for (i = 0; i < 26; i++)
58 res->list[i].child = 0;
59 res->list[i].termno = -1;
64 static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, int num)
67 int c = tolower(*term);
68 if (c < 'a' || c > 'z')
74 n->list[c].termno = num;
77 if (!n->list[c].child)
79 struct word_trie *new = create_word_trie_node(nmem);
80 n->list[c].child = new;
82 word_trie_addterm(nmem, n->list[c].child, term, num);
89 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
91 static int word_trie_match(struct word_trie *t, const char *word, int *skipped)
93 int c = raw_char(tolower(*word));
100 if (!*word || raw_char(*word) < 0)
102 if (t->list[c].termno > 0)
103 return t->list[c].termno;
109 if (t->list[c].child)
111 return word_trie_match(t->list[c].child, word, skipped);
120 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
122 struct word_trie *res = create_word_trie_node(nmem);
126 for (i = 1, p = terms; *p; p++, i++)
127 word_trie_addterm(nmem, res, *p, i);
131 struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
133 struct relevance *res = nmem_malloc(nmem, sizeof(struct relevance));
137 for (p = terms, i = 0; *p; p++, i++)
140 res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
141 memset(res->doc_frequency_vec, 0, res->vec_len * sizeof(int));
143 res->wt = build_word_trie(nmem, terms);
147 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
149 if (!rec->term_frequency_vec)
151 rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
152 memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int));
157 // FIXME. The definition of a word is crude here.. should support
158 // some form of localization mechanism?
159 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
160 const char *words, int multiplier)
167 while (*words && (c = raw_char(tolower(*words))) < 0)
172 if ((res = word_trie_match(r->wt, words, &skipped)))
175 cluster->term_frequency_vec[res] += multiplier;
179 while (*words && (c = raw_char(tolower(*words))) >= 0)
182 cluster->term_frequency_vec[0]++;
186 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
190 for (i = 1; i < r->vec_len; i++)
191 if (cluster->term_frequency_vec[i] > 0)
192 r->doc_frequency_vec[i]++;
194 r->doc_frequency_vec[0]++;
199 static int comp(const void *p1, const void *p2)
202 struct record **r1 = (struct record **) p1;
203 struct record **r2 = (struct record **) p2;
204 res = (*r2)->relevance - (*r1)->relevance;
213 static int comp(const void *p1, const void *p2)
215 struct record_cluster **r1 = (struct record_cluster **) p1;
216 struct record_cluster **r2 = (struct record_cluster **) p2;
217 return (*r2)->relevance - (*r1)->relevance;
222 // Prepare for a relevance-sorted read
223 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
226 float *idfvec = xmalloc(rel->vec_len * sizeof(float));
228 // Calculate document frequency vector for each term.
229 for (i = 1; i < rel->vec_len; i++)
231 if (!rel->doc_frequency_vec[i])
234 idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]);
236 // Calculate relevance for each document
237 for (i = 0; i < reclist->num_records; i++)
240 struct record_cluster *rec = reclist->flatlist[i];
243 for (t = 1; t < rel->vec_len; t++)
246 if (!rec->term_frequency_vec[0])
248 termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0];
249 relevance += termfreq * idfvec[t];
251 rec->relevance = (int) (relevance * 100000);
254 qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp);
256 reclist->pointer = 0;
263 * indent-tabs-mode: nil
265 * vim: shiftwidth=4 tabstop=8 expandtab