Fixed bug #446: iso2709 filter SEGVs on bad input data.
[idzebra-moved-to-github.git] / rset / rsbetween.c
1 /* $Id: rsbetween.c,v 1.15.2.4 2004-12-17 13:43:09 heikki Exp $
2    Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
3    Index Data Aps
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra.  If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
20 02111-1307, USA.
21 */
22
23
24 /* rsbetween is (mostly) used for xml searches. It returns the hits of the
25  * "middle" rset, that are in between the "left" and "right" rsets. For
26  * example "Shakespeare" in between "<title>" and </title>. The thing is 
27  * complicated by the inclusion of attributes (from their own rset). If attrs
28  * specified, they must match the "left" rset (start tag). "Hamlet" between
29  * "<title lang=eng>" and "</title>". (This assumes that the attributes are
30  * indexed to the same seqno as the tags).
31
32 */ 
33
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <assert.h>
38
39 #include <rsbetween.h>
40 #include <zebrautl.h>
41
42 #define RSBETWEEN_DEBUG 0
43 #if RSBETWEEN_DEBUG
44 #define log_level LOG_DEBUG
45 #endif
46
47 static void *r_create_between(RSET ct, const struct rset_control *sel, void *parms);
48 static RSFD r_open_between (RSET ct, int flag);
49 static void r_close_between (RSFD rfd);
50 static void r_delete_between (RSET ct);
51 static void r_rewind_between (RSFD rfd);
52 static int r_forward_between(RSET ct, RSFD rfd, void *buf, int *term_index,
53                      int (*cmpfunc)(const void *p1, const void *p2),
54                      const void *untilbuf);
55 static int r_read (RSFD rfd, void *buf, int *term_index);
56 static int r_write_between (RSFD rfd, const void *buf);
57
58 static const struct rset_control control_between = 
59 {
60     "between",
61     r_create_between,
62     r_open_between,
63     r_close_between,
64     r_delete_between,
65     r_rewind_between,
66     r_forward_between,  /* rset_default_forward,  */
67     rset_default_pos,
68     r_read,
69     r_write_between,
70 };
71
72
73 const struct rset_control *rset_kind_between = &control_between;
74
75 #define WHICH_L 0
76 #define WHICH_M 1
77 #define WHICH_R 2
78 #define WHICH_A 3
79
80 struct rset_between_info {
81     int key_size;
82     RSET rset_l;
83     RSET rset_m;
84     RSET rset_r;
85     RSET rset_attr;
86     int term_index_s;
87     int (*cmp)(const void *p1, const void *p2);
88     char *(*printer)(const void *p1, char *buf);
89     struct rset_between_rfd *rfd_list;
90 };
91
92 struct rset_between_rfd {
93     RSFD rfd_l;
94     RSFD rfd_m;
95     RSFD rfd_r;
96     RSFD rfd_attr;
97     int  more_l;
98     int  more_m;
99     int  more_r;
100     int  more_attr;
101     int term_index_l;
102     int term_index_m;
103     int term_index_r;
104     void *buf_l;
105     void *buf_m;
106     void *buf_r;
107     void *buf_attr;
108     int level;
109     struct rset_between_rfd *next;
110     struct rset_between_info *info;
111     /* 1.4 backport */
112     void *recbuf; /* a key that tells which record we are in */
113     void *startbuf; /* the start tag */
114     int startbufok; /* we have seen the first start tag */
115     void *attrbuf;  /* the attr tag. If these two match, we have attr match */
116     int attrbufok; /* we have seen the first attr tag, can compare */
117     int depth; /* number of start-tags without end-tags */
118     int attrdepth; /* on what depth the attr matched */
119     int hits; /* number of hits returned so far */
120     /* Stuff for the multi-and read, also backported */
121     /* uses more_l/m/r/a etc from above */
122     int tailcount;
123     int tailbits[4]; 
124     int and_eof; 
125 };    
126
127 #if RSBETWEEN_DEBUG
128 static void log2 (struct rset_between_rfd *p, char *msg, int cmp_l, int cmp_r)
129 {
130     char buf_l[32];
131     char buf_m[32];
132     char buf_r[32];
133     logf(LOG_DEBUG,"btw: %s l=%s(%d/%d) m=%s(%d) r=%s(%d/%d), lev=%d",
134       msg, 
135       (*p->info->printer)(p->buf_l, buf_l), p->more_l, cmp_l,
136       (*p->info->printer)(p->buf_m, buf_m), p->more_m,
137       (*p->info->printer)(p->buf_r, buf_r), p->more_r, cmp_r,
138       p->level);
139 }
140 #endif
141
142 static void *r_create_between (RSET ct, const struct rset_control *sel,
143                                void *parms)
144 {
145     rset_between_parms *between_parms = (rset_between_parms *) parms;
146     struct rset_between_info *info;
147
148     info = (struct rset_between_info *) xmalloc (sizeof(*info));
149     info->key_size = between_parms->key_size;
150     info->rset_l = between_parms->rset_l;
151     info->rset_m = between_parms->rset_m;
152     info->rset_r = between_parms->rset_r;
153     info->rset_attr = between_parms->rset_attr;
154     if (rset_is_volatile(info->rset_l) || 
155         rset_is_volatile(info->rset_m) ||
156         rset_is_volatile(info->rset_r))
157         ct->flags |= RSET_FLAG_VOLATILE;
158     info->cmp = between_parms->cmp;
159     info->printer = between_parms->printer;
160     info->rfd_list = NULL;
161     
162     info->term_index_s = info->rset_l->no_rset_terms;
163     if (info->rset_m)
164     {
165         ct->no_rset_terms =
166             info->rset_l->no_rset_terms + 
167             info->rset_m->no_rset_terms + 
168             info->rset_r->no_rset_terms;
169         ct->rset_terms = (RSET_TERM *)
170             xmalloc (sizeof (*ct->rset_terms) * ct->no_rset_terms);
171         memcpy (ct->rset_terms, info->rset_l->rset_terms,
172                 info->rset_l->no_rset_terms * sizeof(*ct->rset_terms));
173         memcpy (ct->rset_terms + info->rset_l->no_rset_terms,
174                 info->rset_m->rset_terms,
175                 info->rset_m->no_rset_terms * sizeof(*ct->rset_terms));
176         memcpy (ct->rset_terms + info->rset_l->no_rset_terms + 
177                 info->rset_m->no_rset_terms,
178                 info->rset_r->rset_terms,
179                 info->rset_r->no_rset_terms * sizeof(*ct->rset_terms));
180     }
181     else
182     {
183         ct->no_rset_terms =
184             info->rset_l->no_rset_terms + 
185             info->rset_r->no_rset_terms;
186         ct->rset_terms = (RSET_TERM *)
187             xmalloc (sizeof (*ct->rset_terms) * ct->no_rset_terms);
188         memcpy (ct->rset_terms, info->rset_l->rset_terms,
189                 info->rset_l->no_rset_terms * sizeof(*ct->rset_terms));
190         memcpy (ct->rset_terms + info->rset_l->no_rset_terms,
191                 info->rset_r->rset_terms,
192                 info->rset_r->no_rset_terms * sizeof(*ct->rset_terms));
193     }
194
195     return info;
196 }
197
198 static RSFD r_open_between (RSET ct, int flag)
199 {
200     struct rset_between_info *info = (struct rset_between_info *) ct->buf;
201     struct rset_between_rfd *rfd;
202
203     if (flag & RSETF_WRITE)
204     {
205         logf (LOG_FATAL, "between set type is read-only");
206         return NULL;
207     }
208     rfd = (struct rset_between_rfd *) xmalloc (sizeof(*rfd));
209     rfd->next = info->rfd_list;
210     info->rfd_list = rfd;
211     rfd->info = info;
212
213     rfd->buf_l = xmalloc (info->key_size);
214     rfd->buf_m = xmalloc (info->key_size);
215     rfd->buf_r = xmalloc (info->key_size);
216     rfd->buf_attr = xmalloc (info->key_size);
217
218     rfd->rfd_l = rset_open (info->rset_l, RSETF_READ);
219     rfd->rfd_m = rset_open (info->rset_m, RSETF_READ);
220     rfd->rfd_r = rset_open (info->rset_r, RSETF_READ);
221     
222     rfd->more_l = rset_read (info->rset_l, rfd->rfd_l, rfd->buf_l,
223                              &rfd->term_index_l);
224     rfd->more_m = rset_read (info->rset_m, rfd->rfd_m, rfd->buf_m,
225                              &rfd->term_index_m);
226     rfd->more_r = rset_read (info->rset_r, rfd->rfd_r, rfd->buf_r,
227                              &rfd->term_index_r);
228     if (info->rset_attr)
229     {
230         int dummy;
231         rfd->rfd_attr = rset_open (info->rset_attr, RSETF_READ);
232         rfd->more_attr = rset_read (info->rset_attr, rfd->rfd_attr,
233                                     rfd->buf_attr, &dummy);
234     }
235     rfd->level=0;
236     rfd->recbuf=xmalloc (info->key_size); 
237     rfd->startbuf=xmalloc (info->key_size);
238     rfd->attrbuf=xmalloc (info->key_size); 
239     rfd->startbufok=0; 
240     rfd->attrbufok=0; 
241     rfd->depth=0;
242     rfd->attrdepth=0;
243     rfd->hits=-1; /* signals uninitialized */
244     rfd->tailcount=0;
245     rfd->tailbits[WHICH_L]=0; 
246     rfd->tailbits[WHICH_M]=0; 
247     rfd->tailbits[WHICH_R]=0; 
248     rfd->tailbits[WHICH_A]=0; 
249     rfd->and_eof=0; 
250     return rfd;
251 }
252
253 static void r_close_between (RSFD rfd)
254 {
255     struct rset_between_info *info = ((struct rset_between_rfd*)rfd)->info;
256     struct rset_between_rfd **rfdp;
257     
258     for (rfdp = &info->rfd_list; *rfdp; rfdp = &(*rfdp)->next)
259         if (*rfdp == rfd)
260         {
261             xfree ((*rfdp)->buf_l);
262             xfree ((*rfdp)->buf_m);
263             xfree ((*rfdp)->buf_r);
264             xfree ((*rfdp)->buf_attr);
265             xfree ((*rfdp)->recbuf);
266             xfree ((*rfdp)->startbuf);
267             xfree ((*rfdp)->attrbuf);
268             rset_close (info->rset_l, (*rfdp)->rfd_l);
269             rset_close (info->rset_m, (*rfdp)->rfd_m);
270             rset_close (info->rset_r, (*rfdp)->rfd_r);
271             if (info->rset_attr)
272                 rset_close (info->rset_attr, (*rfdp)->rfd_attr);
273             
274             *rfdp = (*rfdp)->next;
275             xfree (rfd);
276             return;
277         }
278     logf (LOG_FATAL, "r_close_between but no rfd match!");
279     assert (0);
280 }
281
282 static void r_delete_between (RSET ct)
283 {
284     struct rset_between_info *info = (struct rset_between_info *) ct->buf;
285
286     assert (info->rfd_list == NULL);
287     xfree (ct->rset_terms);
288     rset_delete (info->rset_l);
289     rset_delete (info->rset_m);
290     rset_delete (info->rset_r);
291     if (info->rset_attr)
292         rset_delete (info->rset_attr);
293     xfree (info);
294 }
295
296 static void r_rewind_between (RSFD rfd)
297 {
298     struct rset_between_info *info = ((struct rset_between_rfd*)rfd)->info;
299     struct rset_between_rfd *p = (struct rset_between_rfd *) rfd;
300
301 #if RSBETWEEN_DEBUG
302     logf (LOG_DEBUG, "rsbetween_rewind");
303 #endif
304     rset_rewind (info->rset_l, p->rfd_l);
305     rset_rewind (info->rset_m, p->rfd_m);
306     rset_rewind (info->rset_r, p->rfd_r);
307     p->more_l = rset_read (info->rset_l, p->rfd_l, p->buf_l, &p->term_index_l);
308     p->more_m = rset_read (info->rset_m, p->rfd_m, p->buf_m, &p->term_index_m);
309     p->more_r = rset_read (info->rset_r, p->rfd_r, p->buf_r, &p->term_index_r);
310     if (info->rset_attr)
311     {
312         int dummy;
313         rset_rewind (info->rset_attr, p->rfd_attr);
314         p->more_attr = rset_read (info->rset_attr, p->rfd_attr, p->buf_attr,
315                                   &dummy);
316     }
317     p->level=0;
318 }
319
320
321
322 static int r_forward_between(RSET ct, RSFD rfd, void *buf, int *term_index,
323                      int (*cmpfunc)(const void *p1, const void *p2),
324                      const void *untilbuf)
325 {
326     struct rset_between_info *info = ((struct rset_between_rfd*)rfd)->info;
327     struct rset_between_rfd *p = (struct rset_between_rfd *) rfd;
328     int rc;
329 #if RSBETWEEN_DEBUG
330     log2( p, "fwd: before forward", 0,0);
331 #endif
332     /* It is enough to forward the m pointer here, the read will */
333     /* naturally forward the l, m, and attr pointers */
334     if (p->more_m)
335         p->more_m=rset_forward(info->rset_m,p->rfd_m, p->buf_m,
336                         &p->term_index_m, info->cmp,untilbuf);
337 #if RSBETWEEN_DEBUG
338     log2( p, "fwd: after forward M", 0,0);
339 #endif
340     /* rc = r_read_between(rfd, buf, term_index);*/
341     rc = r_read(rfd, buf, term_index);
342 #if RSBETWEEN_DEBUG
343     log2( p, "fwd: after forward", 0,0);
344 #endif
345     return rc;
346 }
347
348
349
350 static int r_write_between (RSFD rfd, const void *buf)
351 {
352     logf (LOG_FATAL, "between set type is read-only");
353     return -1;
354 }
355
356
357
358 /* Heikki's back-port of the cleaner 1.4 algorithm */
359
360 static void checkattr(struct rset_between_rfd *p)
361 {
362     struct rset_between_info *info =p->info;
363     int cmp;
364     if (p->attrdepth)
365         return; /* already found one */
366     if (!info->rset_attr) 
367     {
368         p->attrdepth=-1; /* matches always */
369         return;
370     }
371     if ( p->startbufok && p->attrbufok )
372     { /* have buffers to compare */
373         cmp=(*info->cmp)(p->startbuf,p->attrbuf);
374         if (0==cmp) /* and the keys match */
375         {
376             p->attrdepth=p->depth;
377 #if RSBETWEEN_DEBUG
378             yaz_log(log_level, "found attribute match at depth %d",p->attrdepth);
379 #endif
380         }
381     }
382 }
383
384
385 /* Implements a multi-and between the l,r, and m rfds. Returns all */
386 /* hits from those, provided that all three point to the same record */
387 /* See rsmultiandor.c in zebra 1.4 for details */
388 static int read_anded(struct rset_between_rfd *p,void *buf,
389                       int *term, int *which) {
390     RSFD rfds[4];
391     RSET rsets[4];
392     void *bufs[4];
393     int *terms[4];
394     int dummyterm;
395     struct rset_between_info *info =p->info;
396     int i, mintail;
397     int n;
398     int cmp;
399
400     /* make the individual args into arrays, to match rsmultiandor */
401     rfds[WHICH_L]=p->rfd_l;
402     rfds[WHICH_M]=p->rfd_m;
403     rfds[WHICH_R]=p->rfd_r;
404     rfds[WHICH_A]=p->rfd_attr;
405     rsets[WHICH_L]=info->rset_l;
406     rsets[WHICH_M]=info->rset_m;
407     rsets[WHICH_R]=info->rset_r;
408     rsets[WHICH_A]=info->rset_attr;
409     bufs[WHICH_L]=p->buf_l;
410     bufs[WHICH_M]=p->buf_m;
411     bufs[WHICH_R]=p->buf_r;
412     bufs[WHICH_A]=p->buf_attr;
413     terms[WHICH_L]=&(p->term_index_l);
414     terms[WHICH_M]=&(p->term_index_m);
415     terms[WHICH_R]=&(p->term_index_r);
416     terms[WHICH_A]=&dummyterm;
417     dummyterm=0;
418     if (info->rset_attr)
419         n=4;
420     else
421         n=3;
422     while (1) {
423         if (p->tailcount)
424         { /* we are tailing */
425             mintail=0;
426             while ((mintail<n) && !p->tailbits[mintail])
427                 mintail++; /* first tail */
428             for (i=mintail+1;i<n;i++)
429             {
430                 if (p->tailbits[i])
431                 {
432                     cmp=(*info->cmp)(bufs[i],bufs[mintail]);
433                     if (cmp<0)
434                         mintail=i;
435                 }
436             }
437             /* return the lowest tail */
438             memcpy(buf, bufs[mintail], info->key_size);
439             *which=mintail;
440             *term=*terms[mintail];
441             /* and read that one further */
442             if (!rset_read(rsets[mintail],rfds[mintail], 
443                            bufs[mintail],terms[mintail]))
444             {
445                 p->and_eof=1; 
446                 p->tailbits[mintail]=0;
447                 (p->tailcount)--;
448                 return 1;
449             }
450             /* still a tail? */
451             cmp=(info->cmp)(bufs[mintail],buf);
452             if (cmp>=2) 
453             {
454                 p->tailbits[mintail]=0;
455                 (p->tailcount)--;
456             }
457             return 1;
458         }
459         else
460         { /* not tailing */
461             if (p->and_eof)
462                 return 0; /* game over */
463             i=1; /* assume items[0] is highest up */
464             while (i<n) 
465             {
466                 cmp=(*info->cmp)(bufs[0],bufs[i]);
467                 if (cmp<=-2)
468                 { /* [0] was behid, forward it and start anew */
469                     if (!rset_forward(rsets[0],rfds[0], bufs[0], 
470                                       terms[0], info->cmp, bufs[i])) 
471                     {
472                         p->and_eof=1;
473                         return 0;
474                     }
475                     i=0; /* all over again */
476                 }
477                 else if (cmp>=2)
478                 {/* [0] was ahead, forward i */
479                     if (!rset_forward(rsets[i],rfds[i], bufs[i], 
480                                       terms[i], info->cmp, bufs[0])) 
481                     {
482                         p->and_eof=1;
483                         return 0;
484                     }
485
486                 } else
487                     i++; 
488             } /* while i */
489             /* now all are within the same records, set tails and let upper 
490              * if return hits */
491             for (i=0; i<n;i++)
492                 p->tailbits[i]=1;
493             p->tailcount=n;
494         }
495     } 
496 } /* read_anded */
497
498 static int r_read (RSFD rfd, void *buf, int *term)
499 {
500     struct rset_between_rfd *p=(struct rset_between_rfd *)rfd;
501     struct rset_between_info *info =p->info;
502     int cmp;
503     int thisterm=0;
504     int which; 
505     *term=0; /* just in case, should not be necessary */
506 #if RSBETWEEN_DEBUG
507     yaz_log(log_level,"btw: == read: term=%p",term);
508 #endif
509     while ( read_anded(p,buf,&thisterm,&which) )
510     {
511 #if RSBETWEEN_DEBUG
512         yaz_log(log_level,"btw: read loop term=%p d=%d ad=%d",
513                 term, p->depth, p->attrdepth);
514 #endif
515         if (p->hits<0) 
516         {/* first time? */
517             memcpy(p->recbuf,buf,info->key_size);
518             p->hits=0;
519             cmp=2;
520         }
521         else {
522             cmp=(*info->cmp)(buf,p->recbuf);
523 #if RSBETWEEN_DEBUG
524             yaz_log(log_level, "btw: cmp=%d",cmp);
525 #endif
526         }
527
528         if (cmp>=2)
529         { 
530 #if RSBETWEEN_DEBUG
531             yaz_log(log_level,"btw: new record");
532 #endif
533             p->depth=0;
534             p->attrdepth=0;
535             memcpy(p->recbuf,buf,info->key_size);
536         }
537
538 #if RSBETWEEN_DEBUG
539         yaz_log(log_level,"btw:  which: %d", which); 
540 #endif
541         if (which==WHICH_L)
542         {
543             p->depth++;
544 #if RSBETWEEN_DEBUG
545             yaz_log(log_level,"btw: read start tag. d=%d",p->depth);
546 #endif
547             memcpy(p->startbuf,buf,info->key_size);
548             p->startbufok=1;
549             checkattr(rfd);  /* in case we already saw the attr here */
550         }
551         else if (which==WHICH_R)  
552         {
553             if (p->depth == p->attrdepth)
554                 p->attrdepth=0; /* ending the tag with attr match */
555             p->depth--;
556 #if RSBETWEEN_DEBUG
557             yaz_log(log_level,"btw: read end tag. d=%d ad=%d",
558                     p->depth, p->attrdepth);
559 #endif
560         }
561         else if (which==WHICH_A)
562         {
563 #if RSBETWEEN_DEBUG
564             yaz_log(log_level,"btw: read attr");
565 #endif
566             memcpy(p->attrbuf,buf,info->key_size);
567             p->attrbufok=1;
568             checkattr(rfd); /* in case the start tag came first */
569         }
570         else 
571         { /* mut be a real hit */
572             if (p->depth && p->attrdepth)
573             {
574                 p->hits++;
575 #if RSBETWEEN_DEBUG
576                 yaz_log(log_level,"btw: got a hit h=%d d=%d ad=%d t=%d+%d", 
577                         p->hits,p->depth,p->attrdepth,
578                         info->rset_m->no_rset_terms,thisterm);
579 #endif
580                 *term= info->rset_m->no_rset_terms + thisterm;
581                 return 1; /* everything else is in place already */
582             } else
583             {
584 #if RSBETWEEN_DEBUG
585                 yaz_log(log_level, "btw: Ignoring hit. h=%d d=%d ad=%d",
586                         p->hits,p->depth,p->attrdepth);
587 #endif
588             }
589         }
590     } /* while read */
591
592     return 0;
593
594 }  /* r_read */
595