From 4da0cd2978c9a902be772e95302e6522175402fd Mon Sep 17 00:00:00 2001 From: Heikki Levanto Date: Fri, 6 Aug 2004 09:43:03 +0000 Subject: [PATCH] estimatehits config option, actually estimating the hits pos function in rsisamb, rsbool, rstemp seem to work --- include/rset.h | 7 ++--- index/retrieve.c | 4 +-- index/zsets.c | 32 +++++++++++++++++++--- isamb/isamb.c | 18 ++++++++----- rset/rsbool.c | 79 ++++++++++++++++++++++++++++++++++++++++-------------- rset/rsisamb.c | 7 ++++- rset/rstemp.c | 20 +++++++++++--- 7 files changed, 127 insertions(+), 40 deletions(-) diff --git a/include/rset.h b/include/rset.h index 09dcf6d..e42a6e2 100644 --- a/include/rset.h +++ b/include/rset.h @@ -1,4 +1,4 @@ -/* $Id: rset.h,v 1.24 2004-08-04 09:59:03 heikki Exp $ +/* $Id: rset.h,v 1.25 2004-08-06 09:43:03 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -103,8 +103,9 @@ RSET rset_dup (RSET rs); #define rset_forward(rs, fd, buf, indx, cmpfunc, untilbuf) \ (*(rs)->control->f_forward)((rs), (fd), (buf), (indx), (cmpfunc), (untilbuf)) -/* int rset_count(RSET rs); */ -#define rset_count(rs) (*(rs)->control->f_count)(rs) +/* int rset_pos(RSET rs, RSFD fd, zint *current, zint *total); */ +#define rset_pos(rs,fd,cur,tot) \ + (*(rs)->control->f_pos)( (fd),(cur),(tot)) /* int rset_read(RSET rs, void *buf, int *indx); */ #define rset_read(rs, fd, buf, indx) (*(rs)->control->f_read)((fd), (buf), indx) diff --git a/index/retrieve.c b/index/retrieve.c index 909c41f..bb55d7a 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -1,4 +1,4 @@ -/* $Id: retrieve.c,v 1.22 2004-08-04 08:35:23 adam Exp $ +/* $Id: retrieve.c,v 1.23 2004-08-06 09:43:03 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -101,7 +101,7 @@ int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, ODR stream, if (!rec) { logf (LOG_DEBUG, "rec_get fail on sysno=" ZINT_FORMAT, sysno); - *basenamep = 0; + *basenamep = 0; return 14; } 
recordAttr = rec_init_attr (zh->reg->zei, rec); diff --git a/index/zsets.c b/index/zsets.c index a26ac40..69d6590 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -1,4 +1,4 @@ -/* $Id: zsets.c,v 1.50 2004-08-04 08:35:24 adam Exp $ +/* $Id: zsets.c,v 1.51 2004-08-06 09:43:03 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -718,7 +718,7 @@ RSET resultSetRef (ZebraHandle zh, const char *resultSetId) void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) { - int kno = 0; + zint kno = 0; struct it_key key; RSFD rfd; int term_index, i; @@ -726,6 +726,9 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) struct rank_control *rc; struct zset_sort_info *sort_info; const char *rank_handler_name = res_get_def(zh->res, "rank", "rank-1"); + zint cur,tot; + zint est=-2; /* -2 not done, -1 can't do, >0 actual estimate*/ + zint esthits; sort_info = zebraSet->sort_info; sort_info->num_entries = 0; @@ -753,6 +756,8 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) void *handle = (*rc->begin) (zh->reg, rank_class->class_handle, rset); (zebraSet->hits)++; + esthits=atoi(res_get_def(zh->res,"estimatehits","0")); + if (!esthits) est=-1; /* can not do */ do { #if IT_KEY_NEW @@ -770,8 +775,26 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) psysno = this_sys; } (*rc->add) (handle, this_sys, term_index); + if ( (est==-2) && (zebraSet->hits==esthits)) + { /* time to estimate the hits */ + float f; + rset_pos(rset,rfd,&cur,&tot); + if (tot>0) { + f=1.0*cur/tot; + est=(zint)(zebraSet->hits/f); + /* FIXME - round the guess to 3 digits */ + logf(LOG_LOG, "Estimating hits (%s) " + ZINT_FORMAT"->%d" + "; "ZINT_FORMAT"->"ZINT_FORMAT, + rset->control->desc, + cur, zebraSet->hits, + tot,est); + zebraSet->hits=est; + } + } } - while (rset_read (rset, rfd, &key, &term_index)); + while (rset_read (rset, rfd, &key, &term_index) && (est<0) ); + score = (*rc->calc) (handle, psysno); 
resultSetInsertRank (zh, sort_info, psysno, score, 'A'); (*rc->end) (zh->reg, handle); @@ -785,7 +808,8 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) rset->rset_terms[i]->flags, rset->rset_terms[i]->count); - yaz_log (LOG_LOG, "%d keys, %d distinct sysnos", kno, zebraSet->hits); + yaz_log (LOG_LOG, ZINT_FORMAT " keys, %d distinct sysnos", + kno, zebraSet->hits); } ZebraRankClass zebraRankLookup (ZebraHandle zh, const char *name) diff --git a/isamb/isamb.c b/isamb/isamb.c index 5bcc0c9..3753fd9 100644 --- a/isamb/isamb.c +++ b/isamb/isamb.c @@ -1,4 +1,4 @@ -/* $Id: isamb.c,v 1.49 2004-08-04 09:59:03 heikki Exp $ +/* $Id: isamb.c,v 1.50 2004-08-06 09:43:03 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -1152,10 +1152,6 @@ int isamb_pp_read (ISAMB_PP pp, void *buf) #if NEW_FORWARD == 1 -/* -#undef ISAMB_DEBUB -#define ISAMB_DEBUG 1 -*/ static int isamb_pp_on_right_node(ISAMB_PP pp, int level, const void *untilbuf) { /* looks one node higher to see if we should be on this node at all */ /* useful in backing off quickly, and in avoiding tail descends */ @@ -1811,24 +1807,30 @@ static void isamb_pp_leaf_pos( ISAMB_PP pp, char *end=p->bytes+p->size; char *cur=p->bytes+p->offset; char *dst; + void *decodeClientData; assert(p->offset <= p->size); assert(cur <= end); assert(p->leaf); *current=0; *total=0; + decodeClientData = (pp->isamb->method->codec.start)(); + while(src < end) { dst=dummybuf; - (*pp->isamb->method->codec.decode)(p->decodeClientData,&dst, &src); + (*pp->isamb->method->codec.decode)(decodeClientData,&dst, &src); assert(dst<(char*) dummybuf+100); /*FIXME */ (*total)++; if (src<=cur) (*current)++; } +#if ISAMB_DEBUG logf(LOG_DEBUG, "isamb_pp_leaf_pos: cur="ZINT_FORMAT" tot="ZINT_FORMAT " ofs=%d sz=%d lev=%d", *current, *total, p->offset, p->size, pp->level); +#endif assert(src==end); + (pp->isamb->method->codec.stop)(decodeClientData); } static void isamb_pp_upper_pos( ISAMB_PP pp, zint 
*current, zint *total, @@ -1840,12 +1842,16 @@ static void isamb_pp_upper_pos( ISAMB_PP pp, zint *current, zint *total, char *cur=p->bytes+p->offset; zint item_size; ISAMB_P child; + assert(level>=0); assert(!p->leaf); + +#if ISAMB_DEBUG logf(LOG_DEBUG,"isamb_pp_upper_pos at beginning l=%d " "cur="ZINT_FORMAT" tot="ZINT_FORMAT " ofs=%d sz=%d pos=" ZINT_FORMAT, level, *current, *total, p->offset, p->size, p->pos); +#endif assert (p->offset <= p->size); decode_ptr (&src, &child ); /* first child */ while(src < end) { diff --git a/rset/rsbool.c b/rset/rsbool.c index 1a6b3c9..bc1a6dd 100644 --- a/rset/rsbool.c +++ b/rset/rsbool.c @@ -1,4 +1,4 @@ -/* $Id: rsbool.c,v 1.33 2004-08-04 09:59:03 heikki Exp $ +/* $Id: rsbool.c,v 1.34 2004-08-06 09:43:03 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -26,6 +26,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include +#include #include #ifndef RSET_DEBUG @@ -40,7 +41,7 @@ static void r_rewind (RSFD rfd); static int r_forward(RSET ct, RSFD rfd, void *buf, int *term_index, int (*cmpfunc)(const void *p1, const void *p2), const void *untilbuf); -/* static void r_pos (RSFD rfd, int *current, int *total); */ +static void r_pos (RSFD rfd, zint *current, zint *total); static int r_read_and (RSFD rfd, void *buf, int *term_index); static int r_read_or (RSFD rfd, void *buf, int *term_index); static int r_read_not (RSFD rfd, void *buf, int *term_index); @@ -55,7 +56,7 @@ static const struct rset_control control_and = r_delete, r_rewind, r_forward, /* rset_default_forward, */ - rset_default_pos, + r_pos, /* rset_default_pos */ r_read_and, r_write, }; @@ -68,12 +69,8 @@ static const struct rset_control control_or = r_close, r_delete, r_rewind, -#if 1 r_forward, -#else - rset_default_forward, -#endif - rset_default_pos, + r_pos, r_read_or, r_write, }; @@ -87,7 +84,7 @@ static const struct rset_control control_not = r_delete, r_rewind, r_forward, - 
rset_default_pos, + r_pos, r_read_not, r_write, }; @@ -108,6 +105,7 @@ struct rset_bool_info { }; struct rset_bool_rfd { + zint hits; RSFD rfd_l; RSFD rfd_r; int more_l; @@ -165,6 +163,7 @@ static RSFD r_open (RSET ct, int flag) rfd->next = info->rfd_list; info->rfd_list = rfd; rfd->info = info; + rfd->hits=0; rfd->buf_l = xmalloc (info->key_size); rfd->buf_r = xmalloc (info->key_size); @@ -219,6 +218,7 @@ static void r_rewind (RSFD rfd) rset_rewind (info->rset_r, p->rfd_r); p->more_l = rset_read (info->rset_l, p->rfd_l, p->buf_l, &p->term_index_l); p->more_r = rset_read (info->rset_r, p->rfd_r, p->buf_r, &p->term_index_r); + p->hits=0; } static int r_forward (RSET ct, RSFD rfd, void *buf, int *term_index, @@ -278,6 +278,8 @@ static int r_read_and (RSFD rfd, void *buf, int *term_index) struct rset_bool_rfd *p = (struct rset_bool_rfd *) rfd; struct rset_bool_info *info = p->info; + { zint cur,tot; r_pos(rfd, &cur, &tot); } + while (p->more_l || p->more_r) { int cmp; @@ -315,6 +317,7 @@ static int r_read_and (RSFD rfd, void *buf, int *term_index) key_logdump(LOG_DEBUG,buf); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } else if (cmp == -1) @@ -329,6 +332,7 @@ static int r_read_and (RSFD rfd, void *buf, int *term_index) rfd, p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } else if (cmp > 1) /* cmp == 2 */ @@ -349,6 +353,7 @@ static int r_read_and (RSFD rfd, void *buf, int *term_index) p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } #else @@ -366,6 +371,7 @@ static int r_read_and (RSFD rfd, void *buf, int *term_index) rfd, p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } else @@ -400,6 +406,7 @@ static int r_read_and (RSFD rfd, void *buf, int *term_index) rfd, p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } #else @@ -416,6 +423,7 @@ static int 
r_read_and (RSFD rfd, void *buf, int *term_index) rfd, p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } else @@ -447,6 +455,7 @@ static int r_read_or (RSFD rfd, void *buf, int *term_index) struct rset_bool_rfd *p = (struct rset_bool_rfd *) rfd; struct rset_bool_info *info = p->info; + { zint cur,tot; r_pos(rfd, &cur, &tot); } while (p->more_l || p->more_r) { int cmp; @@ -470,6 +479,7 @@ static int r_read_or (RSFD rfd, void *buf, int *term_index) p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } else if (cmp > 0) @@ -483,6 +493,7 @@ static int r_read_or (RSFD rfd, void *buf, int *term_index) p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } else @@ -496,6 +507,7 @@ static int r_read_or (RSFD rfd, void *buf, int *term_index) p->more_l, p->more_r, cmp); (*info->log_item)(LOG_DEBUG, buf, ""); #endif + p->hits++; return 1; } } @@ -507,6 +519,7 @@ static int r_read_not (RSFD rfd, void *buf, int *term_index) struct rset_bool_rfd *p = (struct rset_bool_rfd *) rfd; struct rset_bool_info *info = p->info; + { zint cur,tot; r_pos(rfd, &cur, &tot); } while (p->more_l || p->more_r) { int cmp; @@ -523,20 +536,16 @@ static int r_read_not (RSFD rfd, void *buf, int *term_index) *term_index = p->term_index_l; p->more_l = rset_read (info->rset_l, p->rfd_l, p->buf_l, &p->term_index_l); + p->hits++; return 1; } else if (cmp > 1) - { -#if 0 - p->more_r = rset_read (info->rset_r, p->rfd_r, p->buf_r, - &p->term_index_r); -#else - p->more_r = rset_forward( - info->rset_r, p->rfd_r, - p->buf_r, &p->term_index_r, - (info->cmp), p->buf_l); -#endif - } + { + p->more_r = rset_forward( + info->rset_r, p->rfd_r, + p->buf_r, &p->term_index_r, + (info->cmp), p->buf_l); + } else { memcpy (buf, p->buf_l, info->key_size); @@ -568,3 +577,33 @@ static int r_write (RSFD rfd, const void *buf) return -1; } +static void r_pos (RSFD rfd, zint *current, zint 
*total) +{ + struct rset_bool_rfd *p = (struct rset_bool_rfd *) rfd; + struct rset_bool_info *info = p->info; + zint lcur,ltot; + zint rcur,rtot; + float r; + ltot=-1; rtot=-1; + rset_pos(info->rset_l, p->rfd_l, &lcur, &ltot); + rset_pos(info->rset_r, p->rfd_r, &rcur, &rtot); + if ( (rtot<0) && (ltot<0)) { /*no position */ + *current=rcur; /* return same as you got */ + *total=rtot; /* probably -1 for not available */ + } + if ( rtot<0) { rtot=0; rcur=0;} /* if only one useful, use it */ + if ( ltot<0) { ltot=0; lcur=0;} + if ( rtot+ltot == 0 ) { /* empty rset */ + *current=0; + *total=0; + return; + } + r=1.0*(lcur+rcur)/(ltot+rtot); /* weighed average of l and r */ + *current=p->hits; + *total=(zint)(0.5+*current/r); +#if RSET_DEBUG + yaz_log(LOG_DEBUG,"bool_pos: (%s/%s) "ZINT_FORMAT"/"ZINT_FORMAT"= %0.4f ", + info->rset_l->control->desc, info->rset_r->control->desc, + *current, *total, r); +#endif +} diff --git a/rset/rsisamb.c b/rset/rsisamb.c index 0bc7107..ffab998 100644 --- a/rset/rsisamb.c +++ b/rset/rsisamb.c @@ -1,4 +1,4 @@ -/* $Id: rsisamb.c,v 1.11 2004-08-04 09:59:03 heikki Exp $ +/* $Id: rsisamb.c,v 1.12 2004-08-06 09:43:04 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -177,6 +177,10 @@ static void r_pos (RSFD rfd, zint *current, zint *total) struct rset_pp_info *pinfo = (struct rset_pp_info *) rfd; assert(rfd); isamb_pp_pos(pinfo->pt, current, total); +#if RSET_DEBUG + logf(LOG_DEBUG,"isamb.r_pos returning "ZINT_FORMAT"/"ZINT_FORMAT, + *current, *total); +#endif } static int r_read (RSFD rfd, void *buf, int *term_index) @@ -184,6 +188,7 @@ static int r_read (RSFD rfd, void *buf, int *term_index) struct rset_pp_info *pinfo = (struct rset_pp_info *) rfd; int r; *term_index = 0; + r = isamb_pp_read(pinfo->pt, buf); if (r > 0) { diff --git a/rset/rstemp.c b/rset/rstemp.c index f3fc8de..7b5420d 100644 --- a/rset/rstemp.c +++ b/rset/rstemp.c @@ -1,4 +1,4 @@ -/* $Id: rstemp.c,v 1.38 2004-08-03 14:54:41 heikki 
Exp $ +/* $Id: rstemp.c,v 1.39 2004-08-06 09:43:04 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -42,6 +42,7 @@ static void r_rewind (RSFD rfd); /* static int r_count (RSET ct);*/ static int r_read (RSFD rfd, void *buf, int *term_index); static int r_write (RSFD rfd, const void *buf); +static void r_pos (RSFD rfd, zint *current, zint *total); static const struct rset_control control = { @@ -52,7 +53,7 @@ static const struct rset_control control = r_delete, r_rewind, rset_default_forward, - rset_default_pos, + r_pos, r_read, r_write, }; @@ -69,7 +70,7 @@ struct rset_temp_info { size_t pos_buf; /* position of first byte in window */ size_t pos_border; /* position of last byte+1 in window */ int dirty; /* window is dirty */ - int hits; /* no of hits */ + zint hits; /* no of hits */ char *temp_path; int (*cmp)(const void *p1, const void *p2); struct rset_temp_rfd *rfd_list; @@ -81,6 +82,7 @@ struct rset_temp_rfd { int *countp; void *buf; size_t pos_cur; /* current position in set */ + zint cur; /* number of the current hit */ }; static void *r_create(RSET ct, const struct rset_control *sel, void *parms) @@ -97,7 +99,7 @@ static void *r_create(RSET ct, const struct rset_control *sel, void *parms) info->pos_end = 0; info->pos_buf = 0; info->dirty = 0; - info->hits = -1; + info->hits = 0; info->cmp = temp_parms->cmp; info->rfd_list = NULL; @@ -305,6 +307,7 @@ static void r_rewind (RSFD rfd) ((struct rset_temp_rfd *)rfd)->pos_cur = 0; info->pos_buf = 0; r_reread (rfd); + ((struct rset_temp_rfd *)rfd)->cur=0; } /* @@ -340,6 +343,7 @@ static int r_read (RSFD rfd, void *buf, int *term_index) memcpy (mrfd->buf, buf, mrfd->info->key_size); (*mrfd->countp)++; } + mrfd->cur++; return 1; } @@ -363,5 +367,13 @@ static int r_write (RSFD rfd, const void *buf) mrfd->pos_cur = nc; if (nc > info->pos_end) info->pos_border = info->pos_end = nc; + info->hits++; return 1; } + +static void r_pos (RSFD rfd, zint *current, zint *total) +{ + struct 
rset_temp_rfd *mrfd = (struct rset_temp_rfd*) rfd; + *current=mrfd->cur; + *total=mrfd->info->hits; +} -- 1.7.10.4