Change rank debug display
[pazpar2-moved-to-github.git] / src / relevance.c
1 /* This file is part of Pazpar2.
2    Copyright (C) 2006-2012 Index Data
3
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <math.h>
26 #include <stdlib.h>
27
28 #include "relevance.h"
29 #include "session.h"
30
31 struct relevance
32 {
33     int *doc_frequency_vec;
34     int *term_frequency_vec_tmp;
35     int *term_pos;
36     int vec_len;
37     struct word_entry *entries;
38     pp2_charset_token_t prt;
39     int rank_cluster;
40     double follow_factor;
41     double lead_decay;
42     int length_divide;
43     NMEM nmem;
44 };
45
46 struct word_entry {
47     const char *norm_str;
48     const char *display_str;
49     int termno;
50     char *ccl_field;
51     struct word_entry *next;
52 };
53
54 static struct word_entry *word_entry_match(struct relevance *r,
55                                            const char *norm_str,
56                                            const char *rank, int *weight)
57 {
58     int i = 1;
59     struct word_entry *entries = r->entries;
60     for (; entries; entries = entries->next, i++)
61     {
62         if (*norm_str && !strcmp(norm_str, entries->norm_str))
63         {
64             const char *cp = 0;
65             int no_read = 0;
66             sscanf(rank, "%d%n", weight, &no_read);
67             rank += no_read;
68             while (*rank == ' ')
69                 rank++;
70             if (no_read > 0 && (cp = strchr(rank, ' ')))
71             {
72                 if ((cp - rank) == strlen(entries->ccl_field) &&
73                     memcmp(entries->ccl_field, rank, cp - rank) == 0)
74                     *weight = atoi(cp + 1);
75             }
76             return entries;
77         }
78     }
79     return 0;
80 }
81
82 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
83                           const char *words, const char *rank,
84                           const char *name)
85 {
86     int *w = r->term_frequency_vec_tmp;
87     const char *norm_str;
88     int i, length = 0;
89     double lead_decay = r->lead_decay;
90     struct word_entry *e;
91     WRBUF wr = cluster->relevance_explain1;
92     int printed_about_field = 0;
93
94     pp2_charset_token_first(r->prt, words, 0);
95     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
96     {
97         w[i] = 0;
98         r->term_pos[i] = 0;
99     }
100
101     assert(rank);
102     while ((norm_str = pp2_charset_token_next(r->prt)))
103     {
104         int local_weight = 0;
105         e = word_entry_match(r, norm_str, rank, &local_weight);
106         if (e)
107         {
108             int res = e->termno;
109             int j;
110
111             if (!printed_about_field)
112             {
113                 printed_about_field = 1;
114                 wrbuf_printf(wr, "field=%s content=", name);
115                 if (strlen(words) > 50)
116                 {
117                     wrbuf_xmlputs_n(wr, words, 49);
118                     wrbuf_puts(wr, " ...");
119                 }
120                 else
121                     wrbuf_xmlputs(wr, words);
122                 wrbuf_puts(wr, ";\n");
123             }
124             assert(res < r->vec_len);
125             w[res] += local_weight / (1 + log2(1 + lead_decay * length));
126             wrbuf_printf(wr, "%s: w[%d] += w(%d) / "
127                          "(1+log2(1+lead_decay(%f) * length(%d)));\n",
128                          e->display_str, res, local_weight, lead_decay, length);
129             j = res - 1;
130             if (j > 0 && r->term_pos[j])
131             {
132                 int d = length + 1 - r->term_pos[j];
133                 wrbuf_printf(wr, "%s: w[%d] += w[%d](%d) * follow(%f) / "
134                              "(1+log2(d(%d));\n",
135                              e->display_str, res, res, w[res],
136                              r->follow_factor, d);
137                 w[res] += w[res] * r->follow_factor / (1 + log2(d));
138             }
139             for (j = 0; j < r->vec_len; j++)
140                 r->term_pos[j] = j < res ? 0 : length + 1;
141         }
142         length++;
143     }
144
145     for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
146     {
147         if (length == 0 || w[i] == 0)
148             continue;
149         wrbuf_printf(wr, "%s: tf[%d] += w[%d](%d)", e->display_str, i, i, w[i]);
150         switch (r->length_divide)
151         {
152         case 0:
153             cluster->term_frequency_vecf[i] += (double) w[i];
154             break;
155         case 1:
156             wrbuf_printf(wr, " / log2(1+length(%d))", length);
157             cluster->term_frequency_vecf[i] +=
158                 (double) w[i] / log2(1 + length);
159             break;
160         case 2:
161             wrbuf_printf(wr, " / length(%d)", length);
162             cluster->term_frequency_vecf[i] += (double) w[i] / length;
163         }
164         cluster->term_frequency_vec[i] += w[i];
165         wrbuf_printf(wr, " (%f);\n", cluster->term_frequency_vecf[i]);
166     }
167
168     cluster->term_frequency_vec[0] += length;
169 }
170
171 static void pull_terms(struct relevance *res, struct ccl_rpn_node *n)
172 {
173     char **words;
174     int numwords;
175     char *ccl_field;
176     int i;
177
178     switch (n->kind)
179     {
180     case CCL_RPN_AND:
181     case CCL_RPN_OR:
182     case CCL_RPN_NOT:
183     case CCL_RPN_PROX:
184         pull_terms(res, n->u.p[0]);
185         pull_terms(res, n->u.p[1]);
186         break;
187     case CCL_RPN_TERM:
188         nmem_strsplit(res->nmem, " ", n->u.t.term, &words, &numwords);
189         for (i = 0; i < numwords; i++)
190         {
191             const char *norm_str;
192
193             ccl_field = nmem_strdup_null(res->nmem, n->u.t.qual);
194
195             pp2_charset_token_first(res->prt, words[i], 0);
196             while ((norm_str = pp2_charset_token_next(res->prt)))
197             {
198                 struct word_entry **e = &res->entries;
199                 while (*e)
200                     e = &(*e)->next;
201                 *e = nmem_malloc(res->nmem, sizeof(**e));
202                 (*e)->norm_str = nmem_strdup(res->nmem, norm_str);
203                 (*e)->ccl_field = ccl_field;
204                 (*e)->termno = res->vec_len++;
205                 (*e)->display_str = nmem_strdup(res->nmem, words[i]);
206                 (*e)->next = 0;
207             }
208         }
209         break;
210     default:
211         break;
212     }
213 }
214
215 struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
216                                        struct ccl_rpn_node *query,
217                                        int rank_cluster,
218                                        double follow_factor, double lead_decay,
219                                        int length_divide)
220 {
221     NMEM nmem = nmem_create();
222     struct relevance *res = nmem_malloc(nmem, sizeof(*res));
223     int i;
224
225     res->nmem = nmem;
226     res->entries = 0;
227     res->vec_len = 1;
228     res->rank_cluster = rank_cluster;
229     res->follow_factor = follow_factor;
230     res->lead_decay = lead_decay;
231     res->length_divide = length_divide;
232     res->prt = pp2_charset_token_create(pft, "relevance");
233
234     pull_terms(res, query);
235
236     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
237     for (i = 0; i < res->vec_len; i++)
238         res->doc_frequency_vec[i] = 0;
239
240     // worker array
241     res->term_frequency_vec_tmp =
242         nmem_malloc(res->nmem,
243                     res->vec_len * sizeof(*res->term_frequency_vec_tmp));
244
245     res->term_pos =
246         nmem_malloc(res->nmem, res->vec_len * sizeof(*res->term_pos));
247
248     return res;
249 }
250
251 void relevance_destroy(struct relevance **rp)
252 {
253     if (*rp)
254     {
255         pp2_charset_token_destroy((*rp)->prt);
256         nmem_destroy((*rp)->nmem);
257         *rp = 0;
258     }
259 }
260
261 void relevance_newrec(struct relevance *r, struct record_cluster *rec)
262 {
263     if (!rec->term_frequency_vec)
264     {
265         int i;
266
267         // term frequency [1,..] . [0] is total length of all fields
268         rec->term_frequency_vec =
269             nmem_malloc(r->nmem,
270                         r->vec_len * sizeof(*rec->term_frequency_vec));
271         for (i = 0; i < r->vec_len; i++)
272             rec->term_frequency_vec[i] = 0;
273
274         // term frequency divided by length of field [1,...]
275         rec->term_frequency_vecf =
276             nmem_malloc(r->nmem,
277                         r->vec_len * sizeof(*rec->term_frequency_vecf));
278         for (i = 0; i < r->vec_len; i++)
279             rec->term_frequency_vecf[i] = 0.0;
280     }
281 }
282
283 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster)
284 {
285     int i;
286
287     for (i = 1; i < r->vec_len; i++)
288         if (cluster->term_frequency_vec[i] > 0)
289             r->doc_frequency_vec[i]++;
290
291     r->doc_frequency_vec[0]++;
292 }
293
294 // Prepare for a relevance-sorted read
295 void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
296 {
297     int i;
298     float *idfvec = xmalloc(rel->vec_len * sizeof(float));
299
300     reclist_enter(reclist);
301     // Calculate document frequency vector for each term.
302     for (i = 1; i < rel->vec_len; i++)
303     {
304         if (!rel->doc_frequency_vec[i])
305             idfvec[i] = 0;
306         else
307         {
308             /* add one to nominator idf(t,D) to ensure a value > 0 */
309             idfvec[i] = log((float) (1 + rel->doc_frequency_vec[0]) /
310                             rel->doc_frequency_vec[i]);
311         }
312     }
313     // Calculate relevance for each document
314     while (1)
315     {
316         int relevance = 0;
317         WRBUF w;
318         struct word_entry *e = rel->entries;
319         struct record_cluster *rec = reclist_read_record(reclist);
320         if (!rec)
321             break;
322         w = rec->relevance_explain2;
323         wrbuf_rewind(w);
324         wrbuf_puts(w, "relevance = 0;\n");
325         for (i = 1; i < rel->vec_len; i++)
326         {
327             float termfreq = (float) rec->term_frequency_vecf[i];
328             int add = 100000 * termfreq * idfvec[i];
329
330             wrbuf_printf(w, "idf[%d] = log(((1 + total(%d))/termoccur(%d));\n",
331                          i, rel->doc_frequency_vec[0],
332                          rel->doc_frequency_vec[i]);
333             wrbuf_printf(w, "%s: relevance += 100000 * tf[%d](%f) * "
334                          "idf[%d](%f) (%d);\n",
335                          e->display_str, i, termfreq, i, idfvec[i], add);
336             relevance += add;
337             e = e->next;
338         }
339         if (!rel->rank_cluster)
340         {
341             struct record *record;
342             int cluster_size = 0;
343
344             for (record = rec->records; record; record = record->next)
345                 cluster_size++;
346
347             wrbuf_printf(w, "score = relevance(%d)/cluster_size(%d);\n",
348                          relevance, cluster_size);
349             relevance /= cluster_size;
350         }
351         else
352         {
353             wrbuf_printf(w, "score = relevance(%d);\n", relevance);
354         }
355         rec->relevance_score = relevance;
356     }
357     reclist_leave(reclist);
358     xfree(idfvec);
359 }
360
361 /*
362  * Local variables:
363  * c-basic-offset: 4
364  * c-file-style: "Stroustrup"
365  * indent-tabs-mode: nil
366  * End:
367  * vim: shiftwidth=4 tabstop=8 expandtab
368  */
369