Improve estimate hit counts for rsprox sets.
[idzebra-moved-to-github.git] / rset / rsprox.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2011 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <assert.h>
24
25 #include <idzebra/util.h>
26 #include <rset.h>
27
28 #ifndef RSET_DEBUG
29 #define RSET_DEBUG 0
30 #endif
31
32 static RSFD r_open(RSET ct, int flag);
33 static void r_close(RSFD rfd);
34 static void r_delete(RSET ct);
35 static int r_forward(RSFD rfd, void *buf, TERMID *term, const void *untilbuf);
36 static int r_read(RSFD rfd, void *buf, TERMID *term);
37 static int r_write(RSFD rfd, const void *buf);
38 static void r_pos(RSFD rfd, double *current, double *total);
39 static void r_get_terms(RSET ct, TERMID *terms, int maxterms, int *curterm);
40
41 static const struct rset_control control = 
42 {
43     "prox",
44     r_delete,
45     r_get_terms,
46     r_open,
47     r_close,
48     r_forward,
49     r_pos,
50     r_read,
51     r_write,
52 };
53
/* Proximity parameters of one prox set; stored in the set's priv pointer. */
struct rset_prox_info {
    /* non-zero: the sign of the position difference is significant;
       zero: r_forward() uses the absolute difference */
    int ordered;
    /* start value of the per-position accept flag in r_forward(); the flag
       is inverted when the relation holds */
    int exclusion;
    /* comparison applied to the position difference:
       1: <   2: <=   3: ==   4: >=   5: >   6: != */
    int relation;
    /* value the position difference is compared against */
    int distance;
};
60
61 struct rset_prox_rfd {
62     RSFD *rfd;
63     char **buf;  /* lookahead key buffers */
64     char *more;  /* more in each lookahead? */
65     TERMID *terms; /* lookahead terms */
66     zint hits;
67 };    
68
69
70 RSET rset_create_prox(NMEM nmem, struct rset_key_control *kcontrol,
71                       int scope,
72                       int rset_no, RSET *rset,
73                       int ordered, int exclusion,
74                       int relation, int distance)
75 {
76     RSET rnew = rset_create_base(&control, nmem, kcontrol, scope, 0,
77                                  rset_no, rset);
78     struct rset_prox_info *info;
79     info = (struct rset_prox_info *) nmem_malloc(rnew->nmem,sizeof(*info));
80     info->ordered = ordered;
81     info->exclusion = exclusion;
82     info->relation = relation;
83     info->distance = distance;
84     rnew->priv = info;
85     return rnew;
86 }
87
88 static void r_delete(RSET ct)
89 {
90 }
91
92 static RSFD r_open(RSET ct, int flag)
93 {
94     RSFD rfd;
95     struct rset_prox_rfd *p;
96     int i;
97
98     if (flag & RSETF_WRITE)
99     {
100         yaz_log(YLOG_FATAL, "prox set type is read-only");
101         return NULL;
102     }
103     rfd = rfd_create_base(ct);
104     if (rfd->priv)
105         p = (struct rset_prox_rfd *)(rfd->priv);
106     else
107     {
108         p = (struct rset_prox_rfd *) nmem_malloc(ct->nmem,sizeof(*p));
109         rfd->priv = p;
110         p->more = nmem_malloc(ct->nmem,sizeof(*p->more) * ct->no_children);
111         p->buf = nmem_malloc(ct->nmem,sizeof(*p->buf) * ct->no_children);
112         p->terms = nmem_malloc(ct->nmem,sizeof(*p->terms) * ct->no_children);
113         for (i = 0; i < ct->no_children; i++) 
114         {
115             p->buf[i] = nmem_malloc(ct->nmem,ct->keycontrol->key_size);
116             p->terms[i] = 0;
117         }
118         p->rfd = nmem_malloc(ct->nmem,sizeof(*p->rfd) * ct->no_children);
119     }
120     yaz_log(YLOG_DEBUG,"rsprox (%s) open [%p] n=%d", 
121             ct->control->desc, rfd, ct->no_children);
122     
123     for (i = 0; i < ct->no_children; i++)
124     {
125         p->rfd[i] = rset_open(ct->children[i], RSETF_READ);
126         p->more[i] = rset_read(p->rfd[i], p->buf[i], &p->terms[i]);
127     }
128     p->hits = 0;
129     return rfd;
130 }
131
132 static void r_close(RSFD rfd)
133 {
134     RSET ct = rfd->rset;
135     struct rset_prox_rfd *p = (struct rset_prox_rfd *)(rfd->priv);
136     
137     int i;
138     for (i = 0; i < ct->no_children; i++)
139         rset_close(p->rfd[i]);
140 }
141
142 static int r_forward(RSFD rfd, void *buf, TERMID *term, const void *untilbuf)
143 {
144     RSET ct = rfd->rset;
145     struct rset_prox_info *info = (struct rset_prox_info *)(ct->priv);
146     struct rset_prox_rfd *p = (struct rset_prox_rfd *)(rfd->priv);
147     const struct rset_key_control *kctrl = ct->keycontrol;
148     int cmp = 0;
149     int i;
150
151     if (untilbuf)
152     {
153         /* it is enough to forward first one. Other will follow. */
154         if (p->more[0] &&   /* was: cmp >=2 */
155             ((kctrl->cmp)(untilbuf, p->buf[0]) >= rfd->rset->scope) ) 
156             p->more[0] = rset_forward(p->rfd[0], p->buf[0], 
157                                       &p->terms[0], untilbuf);
158     }
159     if (info->ordered && info->relation <= 3 && info->exclusion == 0)
160     {
161         while (p->more[0]) 
162         {
163             for (i = 1; i < ct->no_children; i++)
164             {
165                 if (!p->more[i]) 
166                 {
167                     p->more[0] = 0; /* saves us a goto out of while loop. */
168                     break;
169                 }
170                 cmp = (*kctrl->cmp)(p->buf[i], p->buf[i-1]);
171                 if (cmp >= rfd->rset->scope)  /* not same record */
172                 {
173                     p->more[i-1] = rset_forward(p->rfd[i-1],
174                                                 p->buf[i-1],
175                                                 &p->terms[i-1],
176                                                 p->buf[i]);
177                     break;
178                 }
179                 else if (cmp > 0) /* within record and ordered */
180                 {
181                     zint diff = (*kctrl->getseq)(p->buf[i]) -
182                         (*kctrl->getseq)(p->buf[i-1]);
183                     if (info->relation == 3 && diff == info->distance)
184                         continue;
185                     else if (info->relation == 2 && diff <= info->distance)
186                         continue;
187                     else if (info->relation == 1 && diff < info->distance)
188                         continue;
189                     
190                     p->more[i-1] = rset_read(p->rfd[i-1], p->buf[i-1],
191                                              &p->terms[i-1]);
192                     break;
193                 }
194                 else  /* within record - wrong order */
195                 {
196                     p->more[i] = rset_forward(p->rfd[i], p->buf[i],
197                                               &p->terms[i], p->buf[i-1]);
198                     break;
199                 }
200             }
201             if (i == ct->no_children)
202             {
203                 i = ct->no_children-1;
204                 memcpy(buf, p->buf[i], kctrl->key_size);
205                 if (term)
206                     *term = p->terms[i];
207                 p->more[i] = rset_read(p->rfd[i], p->buf[i], &p->terms[i]);
208                 p->hits++;
209                 return 1;
210             }
211         }
212     }
213     else if (ct->no_children == 2)
214     {
215         while (p->more[0] && p->more[1]) 
216         {
217             int cmp = (*kctrl->cmp)(p->buf[0], p->buf[1]);
218             if ( cmp <= - rfd->rset->scope) /* cmp<-1*/
219                 p->more[0] = rset_forward(p->rfd[0], p->buf[0], 
220                                           &p->terms[0],p->buf[1]);
221             else if ( cmp >= rfd->rset->scope ) /* cmp>1 */
222                 p->more[1] = rset_forward(p->rfd[1], p->buf[1], 
223                                           &p->terms[1],p->buf[0]);
224             else
225             {
226                 zint seqno[500]; /* FIXME - why 500 ?? */
227                 int n = 0;
228                 
229                 seqno[n++] = (*kctrl->getseq)(p->buf[0]);
230                 while ((p->more[0] = rset_read(p->rfd[0],
231                                                p->buf[0], &p->terms[0])))
232                 {
233                     cmp = (*kctrl->cmp)(p->buf[0], p->buf[1]);
234                     if (cmp <= - rfd->rset->scope || cmp >= rfd->rset->scope)
235                         break;
236                     if (n < 500)
237                         seqno[n++] = (*kctrl->getseq)(p->buf[0]);
238                 }
239                 /* set up return buffer.. (save buf[1]) */
240                 memcpy(buf, p->buf[1], kctrl->key_size);
241                 if (term)
242                     *term = p->terms[1];
243                 while (1)
244                 {
245                     for (i = 0; i < n; i++)
246                     {
247                         zint diff = (*kctrl->getseq)(p->buf[1]) - seqno[i];
248                         int excl = info->exclusion;
249                         if (!info->ordered && diff < 0)
250                             diff = -diff;
251                         switch (info->relation)
252                         {
253                         case 1:      /* < */
254                             if (diff < info->distance && diff >= 0)
255                                 excl = !excl;
256                             break;
257                         case 2:      /* <= */
258                             if (diff <= info->distance && diff >= 0)
259                                 excl = !excl;
260                             break;
261                         case 3:      /* == */
262                             if (diff == info->distance && diff >= 0)
263                                 excl = !excl;
264                             break;
265                         case 4:      /* >= */
266                             if (diff >= info->distance && diff >= 0)
267                                 excl = !excl;
268                             break;
269                         case 5:      /* > */
270                             if (diff > info->distance && diff >= 0)
271                                 excl = !excl;
272                             break;
273                         case 6:      /* != */
274                             if (diff != info->distance && diff >= 0)
275                                 excl = !excl;
276                             break;
277                         }
278                         if (excl)
279                         {
280                             p->more[1] = rset_read( p->rfd[1], p->buf[1],
281                                                     &p->terms[1]);
282                             p->hits++;
283                             return 1;
284                         }
285                     }
286                     p->more[1] = rset_read(p->rfd[1], p->buf[1], &p->terms[1]);
287                     if (!p->more[1])
288                         break;
289                     cmp = (*kctrl->cmp)(buf, p->buf[1]);
290                     if (cmp <= - rfd->rset->scope || cmp >= rfd->rset->scope)
291                         break;
292                 }
293             }
294         }
295     }
296     return 0;
297 }
298
299
300 static int r_read(RSFD rfd, void *buf, TERMID *term)
301 {
302     return r_forward(rfd, buf, term, 0);
303 }
304
305 static int r_write(RSFD rfd, const void *buf)
306 {
307     yaz_log(YLOG_FATAL, "prox set type is read-only");
308     return -1;
309 }
310
311 static void r_pos(RSFD rfd, double *current, double *total)
312 {
313     RSET ct = rfd->rset;
314     struct rset_prox_rfd *p = (struct rset_prox_rfd *)(rfd->priv);
315     int i;
316     double ratio = 0.0;
317     
318     for (i = 0; i < ct->no_children; i++)
319     {
320         double cur, tot;
321         rset_pos(p->rfd[i], &cur, &tot);
322         if (tot > 0.0)
323         {
324             double nratio = cur / tot;
325             if (ratio < nratio)
326                 ratio = nratio;
327         }
328     }
329     *current = (double) p->hits;
330     if (ratio > 0.0)
331         *total = *current/ratio;
332     else
333         *total = 0.0;
334     
335     yaz_log(YLOG_DEBUG, "prox_pos: [%d] %0.1f/%0.1f= %0.4f ",
336             i, *current, *total, ratio);
337 }
338
339 static void r_get_terms(RSET ct, TERMID *terms, int maxterms, int *curterm)
340 {
341     int i;
342     for (i = 0; i < ct->no_children; i++)
343         rset_getterms(ct->children[i], terms, maxterms, curterm);
344 }
345
346 /*
347  * Local variables:
348  * c-basic-offset: 4
349  * c-file-style: "Stroustrup"
350  * indent-tabs-mode: nil
351  * End:
352  * vim: shiftwidth=4 tabstop=8 expandtab
353  */
354