From 73bedd5ba9152c9c107b502fae65723b551aff09 Mon Sep 17 00:00:00 2001 From: Heikki Levanto Date: Tue, 26 Oct 2004 15:32:11 +0000 Subject: [PATCH] Re-established rank-1. Gets same order of results, but slightly different scores, due to using a better estimate for term occurrences. --- index/index.h | 10 ++++--- index/livcode.c | 11 ++++++-- index/rank1.c | 74 +++++++++++++++++++++++++++++---------------------- index/zebraapi.c | 6 ++++- index/zrpn.c | 4 +-- index/zsets.c | 30 ++++++++++----------- index/zvrank.c | 13 ++++++--- rset/rsmultiandor.c | 10 +++---- 8 files changed, 93 insertions(+), 65 deletions(-) diff --git a/index/index.h b/index/index.h index 0e873de..39ad880 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.123 2004-09-28 12:39:55 adam Exp $ +/* $Id: index.h,v 1.124 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -318,7 +318,9 @@ struct rank_control { char *name; void *(*create)(ZebraHandle zh); void (*destroy)(struct zebra_register *reg, void *class_handle); - void *(*begin)(struct zebra_register *reg, void *class_handle, RSET rset); + void *(*begin)(struct zebra_register *reg, + void *class_handle, RSET rset, NMEM nmem, + TERMID *terms, int numterms); /* ### Could add parameters to begin: * char *index; // author, title, etc. 
* int dbsize; // number of records in database @@ -326,7 +328,7 @@ struct rank_control { */ void (*end)(struct zebra_register *reg, void *set_handle); int (*calc)(void *set_handle, zint sysno); - void (*add)(void *set_handle, int seqno, int term_index); + void (*add)(void *set_handle, int seqno, TERMID term); }; struct term_set_entry { @@ -379,7 +381,7 @@ void resultSetSort (ZebraHandle zh, NMEM nmem, void resultSetSortSingle (ZebraHandle zh, NMEM nmem, ZebraSet sset, RSET rset, Z_SortKeySpecList *sort_sequence, int *sort_status); -void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset); +void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset, NMEM nmem); void resultSetInvalidate (ZebraHandle zh); int zebra_server_lock_init (ZebraService zh); diff --git a/index/livcode.c b/index/livcode.c index cd85e34..ba86b2e 100644 --- a/index/livcode.c +++ b/index/livcode.c @@ -11,10 +11,12 @@ rights reserved. Licensed under the Academic Free License version 1.1. http://opensource.org/licenses/academic.php -$Id: livcode.c,v 1.3 2004-08-20 14:44:46 heikki Exp $ +$Id: livcode.c,v 1.4 2004-10-26 15:32:11 heikki Exp $ */ +#ifdef SKIPTHIS /* Need to fix the interface - FIXME */ + #include #include #ifdef WIN32 @@ -538,6 +540,7 @@ struct rank_set_info { int last_pos; int no_entries; int no_rank_entries; + NMEM nmem; struct rank_term_info *entries; }; @@ -584,7 +587,8 @@ static void destroy (struct zebra_register *reg, void *class_handle) * each result set. The returned handle is a "set handle" and * will be used in each of the handlers below. */ -static void *begin (struct zebra_register *reg, void *class_handle, RSET rset) +static void *begin (struct zebra_register *reg, void *class_handle, + RSET rset, NMEM nmem) { struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si)); int i; @@ -594,6 +598,7 @@ static void *begin (struct zebra_register *reg, void *class_handle, RSET rset) /* do about this ??? 
*/ si->no_entries = 0; /* rset->no_rset_terms; */ /* FIXME ??? */ si->no_rank_entries = 0; + si->nmem=nmem; si->entries = (struct rank_term_info *) xmalloc (sizeof(*si->entries)*si->no_entries); for (i = 0; i < si->no_entries; i++) @@ -706,3 +711,5 @@ static struct rank_control rank_control = { }; struct rank_control *rankliv_class = &rank_control; +#endif + diff --git a/index/rank1.c b/index/rank1.c index 05f0761..581e78a 100644 --- a/index/rank1.c +++ b/index/rank1.c @@ -1,4 +1,4 @@ -/* $Id: rank1.c,v 1.17 2004-08-20 14:44:46 heikki Exp $ +/* $Id: rank1.c,v 1.18 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -30,7 +30,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #endif -#define DEBUG_RANK 0 +#define DEBUG_RANK 1 #include "index.h" @@ -44,6 +44,8 @@ struct rank_term_info { int global_inv; int rank_flag; int rank_weight; + TERMID term; + int term_index; }; struct rank_set_info { @@ -51,6 +53,7 @@ struct rank_set_info { int no_entries; int no_rank_entries; struct rank_term_info *entries; + NMEM nmem; }; static int log2_int (unsigned g) @@ -67,8 +70,8 @@ static int log2_int (unsigned g) */ static void *create (ZebraHandle zh) { - struct rank_class_info *ci = (struct rank_class_info *) - xmalloc (sizeof(*ci)); + struct rank_class_info *ci = + (struct rank_class_info *) xmalloc (sizeof(*ci)); yaz_log (LOG_DEBUG, "rank-1 create"); return ci; @@ -88,54 +91,58 @@ static void destroy (struct zebra_register *reg, void *class_handle) } -/* +/** * begin: Prepares beginning of "real" ranking. Called once for * each result set. The returned handle is a "set handle" and * will be used in each of the handlers below. 
*/ -static void *begin (struct zebra_register *reg, void *class_handle, RSET rset) +static void *begin (struct zebra_register *reg, + void *class_handle, RSET rset, NMEM nmem, + TERMID *terms, int numterms) { - struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si)); + struct rank_set_info *si = + (struct rank_set_info *) nmem_malloc (nmem,sizeof(*si)); int i; #if DEBUG_RANK yaz_log (LOG_LOG, "rank-1 begin"); #endif - si->no_entries = 0; /* rset->no_rset_terms; */ /* FIXME - what to do here*/ - /* Now that we don't count term occurrences, ranking will have to */ - /* different! */ + si->no_entries = numterms; si->no_rank_entries = 0; + si->nmem=nmem; si->entries = (struct rank_term_info *) - xmalloc (sizeof(*si->entries)*si->no_entries); - for (i = 0; i < si->no_entries; i++) + nmem_malloc (si->nmem, sizeof(*si->entries)*numterms); + for (i = 0; i < numterms; i++) { - zint g = 0; /* rset->rset_terms[i]->nn; */ /* FIXME ??? */ + zint g = rset_count(terms[i]->rset); #if DEBUG_RANK - yaz_log(LOG_LOG, "i=%d flags=%s", i, rset->rset_terms[i]->flags); + yaz_log(LOG_LOG, "i=%d flags=%s '%s'", i, + terms[i]->flags, terms[i]->name ); #endif - if (0) /* (!strncmp (rset->rset_terms[i]->flags, "rank,", 5)) */ - /* FIXME */ /* ??? */ + if (!strncmp (terms[i]->flags, "rank,", 5)) { - const char *cp = "w"; - /*= strstr(rset->rset_terms[i]->flags+4, ",w=");*/ - /* FIXME ??? 
*/ + const char *cp = strstr(terms[i]->flags+4, ",w="); si->entries[i].rank_flag = 1; if (cp) si->entries[i].rank_weight = atoi (cp+3); else si->entries[i].rank_weight = 34; #if DEBUG_RANK - yaz_log (LOG_LOG, " i=%d weight=%d", i, - si->entries[i].rank_weight); + yaz_log (LOG_LOG, " i=%d weight=%d g="ZINT_FORMAT, i, + si->entries[i].rank_weight, g); #endif (si->no_rank_entries)++; } else si->entries[i].rank_flag = 0; - si->entries[i].local_occur = 0; + si->entries[i].local_occur = 0; /* FIXME */ si->entries[i].global_occur = g; si->entries[i].global_inv = 32 - log2_int (g); - yaz_log (LOG_DEBUG, " global_inv = %d g = " ZINT_FORMAT, (int) (32-log2_int (g)), g); + yaz_log (LOG_DEBUG, " global_inv = %d g = " ZINT_FORMAT, + (int) (32-log2_int (g)), g); + si->entries[i].term=terms[i]; + si->entries[i].term_index=i; + terms[i]->rankpriv=&(si->entries[i]); } return si; } @@ -146,25 +153,28 @@ static void *begin (struct zebra_register *reg, void *class_handle, RSET rset) */ static void end (struct zebra_register *reg, void *set_handle) { - struct rank_set_info *si = (struct rank_set_info *) set_handle; yaz_log (LOG_DEBUG, "rank-1 end"); - xfree (si->entries); - xfree (si); + /* no need to free anything, they are in nmems */ } -/* + +/** * add: Called for each word occurence in a result set. This routine * should be as fast as possible. This routine should "incrementally" * update the score. 
*/ -static void add (void *set_handle, int seqno, int term_index) +static void add (void *set_handle, int seqno, TERMID term) { struct rank_set_info *si = (struct rank_set_info *) set_handle; + struct rank_term_info *ti= (struct rank_term_info *) term->rankpriv; + assert(si); + assert(term); + assert(ti); #if DEBUG_RANK - yaz_log (LOG_LOG, "rank-1 add seqno=%d term_index=%d", seqno, term_index); + yaz_log (LOG_LOG, "rank-1 add seqno=%d term=%s", seqno, term->name); #endif si->last_pos = seqno; - si->entries[term_index].local_occur++; + ti->local_occur++; } /* @@ -187,7 +197,7 @@ static int calc (void *set_handle, zint sysno) for (i = 0; i < si->no_entries; i++) { #if DEBUG_RANK - yaz_log(LOG_LOG, "i=%d rank_flag=%d lo=%d", + yaz_log(LOG_LOG, "calc: i=%d rank_flag=%d lo=%d", i, si->entries[i].rank_flag, si->entries[i].local_occur); #endif if (si->entries[i].rank_flag && (lo = si->entries[i].local_occur)) @@ -197,7 +207,7 @@ static int calc (void *set_handle, zint sysno) divisor = si->no_rank_entries * (8+log2_int (si->last_pos/si->no_entries)); score = score / divisor; #if DEBUG_RANK - yaz_log (LOG_LOG, "sysno=" ZINT_FORMAT " score=%d", sysno, score); + yaz_log (LOG_LOG, "calc sysno=" ZINT_FORMAT " score=%d", sysno, score); #endif if (score > 1000) score = 1000; diff --git a/index/zebraapi.c b/index/zebraapi.c index 9896bf6..e97e27c 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.136 2004-10-15 10:07:32 heikki Exp $ +/* $Id: zebraapi.c,v 1.137 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -292,8 +292,10 @@ struct zebra_register *zebra_register_open (ZebraService zs, const char *name, reg->ptr_i=0; zebraRankInstall (reg, rank1_class); +#ifdef SKIPTHIS /* FIXME - those ranks not yet converted to new interface */ zebraRankInstall (reg, rankzv_class); zebraRankInstall (reg, rankliv_class); +#endif recordCompression = res_get_def (res, "recordCompression", 
"none"); if (!strcmp (recordCompression, "none")) @@ -812,7 +814,9 @@ int zebra_search_RPN (ZebraHandle zh, ODR o, if (zebra_begin_read (zh)) return 1; +#ifdef SKIPTHIS /* FIXME - livcode rank not yet available */ zebra_livcode_transform(zh, query); +#endif resultSetAddRPN (zh, odr_extract_mem(o), query, zh->num_basenames, zh->basenames, setname); diff --git a/index/zrpn.c b/index/zrpn.c index 18e6dcc..c185b95 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -1,4 +1,4 @@ -/* $Id: zrpn.c,v 1.157 2004-10-15 10:07:32 heikki Exp $ +/* $Id: zrpn.c,v 1.158 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -2304,7 +2304,7 @@ RSET rpn_search (ZebraHandle zh, NMEM nmem, NMEM rset_nmem, ; sort_sequence->num_specs = i; if (!i) - resultSetRank (zh, sset, rset); + resultSetRank (zh, sset, rset, rset_nmem); else { logf (LOG_DEBUG, "resultSetSortSingle in rpn_search"); diff --git a/index/zsets.c b/index/zsets.c index c284b96..efe14c7 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -1,4 +1,4 @@ -/* $Id: zsets.c,v 1.68 2004-10-22 11:33:28 heikki Exp $ +/* $Id: zsets.c,v 1.69 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -727,13 +727,14 @@ RSET resultSetRef (ZebraHandle zh, const char *resultSetId) return NULL; } -void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) +void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset, NMEM nmem) { zint kno = 0; struct it_key key; RSFD rfd; TERMID termid; TERMID *terms; + int numterms; int i,n; ZebraRankClass rank_class; struct rank_control *rc; @@ -747,6 +748,12 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) sort_info = zebraSet->sort_info; sort_info->num_entries = 0; zebraSet->hits = 0; + n=0; + rset_getterms(rset,0,0,&n); + terms=malloc( sizeof(*terms)*n); + numterms=0; + rset_getterms(rset,terms,n,&numterms); + rfd = rset_open (rset, RSETF_READ); rank_class = 
zebraRankLookup (zh, rank_handler_name); @@ -762,14 +769,16 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) zint psysno = key.mem[0]; int score; void *handle = - (*rc->begin) (zh->reg, rank_class->class_handle, rset); + (*rc->begin) (zh->reg, rank_class->class_handle, rset, nmem, + terms, numterms); (zebraSet->hits)++; esthits=atoi(res_get_def(zh->res,"estimatehits","0")); if (!esthits) est=-1; /* can not do */ do { - zint this_sys = key.mem[0]; + zint this_sys = key.mem[0]; /* FIXME - assumes scope==2 */ + zint seqno = key.mem[1]; /* FIXME - assumes scope==2 */ kno++; if (this_sys != psysno) { @@ -779,9 +788,7 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) (zebraSet->hits)++; psysno = this_sys; } - /* FIXME - Ranking is broken, since rsets no longer have */ - /* term lists! */ - /* (*rc->add) (handle, this_sys, term_index); */ + (*rc->add) (handle, seqno, termid); if ( (est==-2) && (zebraSet->hits==esthits)) { /* time to estimate the hits */ @@ -811,15 +818,9 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) (*rc->end) (zh->reg, handle); } rset_close (rfd); - n=0; - rset_getterms(rset,0,0,&n); - terms=xmalloc( sizeof(*terms)*n); - i=n; - n=0; - rset_getterms(rset,terms,i,&n); - for (i = 0; i < n; i++) + for (i = 0; i < numterms; i++) { yaz_log (LOG_LOG, "term=\"%s\" " " type=%s count=" ZINT_FORMAT, @@ -827,7 +828,6 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) terms[i]->flags, rset_count(terms[i]->rset)); } - xfree(terms); yaz_log (LOG_DEBUG, ZINT_FORMAT " keys, "ZINT_FORMAT" distinct sysnos", kno, zebraSet->hits); } diff --git a/index/zvrank.c b/index/zvrank.c index e2c19c0..ecebce3 100644 --- a/index/zvrank.c +++ b/index/zvrank.c @@ -1,4 +1,4 @@ -/* $Id: zvrank.c,v 1.10 2004-08-20 14:44:46 heikki Exp $ +/* $Id: zvrank.c,v 1.11 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -40,6 +40,8 @@ fernuni-hagen.de> ** 
"ntc-atn", "atc-atn", etc. */ +#if SKIPTHIS /* FIXME - Disabled while changing the interface to ranking */ + #include /* for log */ #include @@ -61,8 +63,7 @@ static double blog(double x) { /* structures */ -struct rank_class_info { /* now we need this */ - int dummy; +struct rank_class_info { char rscheme[8]; /* name of weighting scheme */ }; @@ -76,6 +77,7 @@ struct rs_info { /* for result set */ char rscheme[8]; /* name of weighting scheme */ /**/ int veclen; + NMEM nmem; void (*d_tf_fct)(void *, void *); /* doc term frequency function */ void (*d_idf_fct)(void *, void *); /* doc idf function */ void (*d_norm_fct)(void *, void *); /* doc normalization function */ @@ -684,7 +686,8 @@ static void zv_destroy (struct zebra_register *reg, void *class_handle) { * each result set. The returned handle is a "set handle" and * will be used in each of the handlers below. */ -static void *zv_begin(struct zebra_register *reg, void *class_handle, RSET rset) +static void *zv_begin(struct zebra_register *reg, void *class_handle, + RSET rset, NMEM nmem) { struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs)); struct rank_class_info *ci=(struct rank_class_info *)class_handle; @@ -697,6 +700,7 @@ static void *zv_begin(struct zebra_register *reg, void *class_handle, RSET rset) /* FIXME - Now that we don't have term lists in rsets, what do */ /* we do here ??? 
*/ zv_init(rs, ci->rscheme); + rs->nmem=nmem; rs->veclen=veclen; prn_rs(rs); @@ -815,4 +819,5 @@ static struct rank_control rank_control_vsm = { struct rank_control *rankzv_class = &rank_control_vsm; +#endif /* SKIPTHIS */ /* EOF */ diff --git a/rset/rsmultiandor.c b/rset/rsmultiandor.c index 4f4bdcd..9a88534 100644 --- a/rset/rsmultiandor.c +++ b/rset/rsmultiandor.c @@ -1,4 +1,4 @@ -/* $Id: rsmultiandor.c,v 1.7 2004-10-22 10:12:52 heikki Exp $ +/* $Id: rsmultiandor.c,v 1.8 2004-10-26 15:32:11 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -562,17 +562,19 @@ static void r_pos (RSFD rfd, double *current, double *total) int i; for (i=0; i<info->no_rsets; i++){ rset_pos(mrfd->items[i].fd, &cur, &tot); - logf(LOG_DEBUG, "r_pos: %d %0.1f %0.1f", i, cur,tot); + /*logf(LOG_LOG, "r_pos: %d %0.1f %0.1f", i, cur,tot); */ scur += cur; stot += tot; } if (stot <1.0) { /* nothing there */ *current=0; *total=0; + /* logf(LOG_LOG, "r_pos: NULL %0.1f %0.1f", *current, *total);*/ return; } *current=mrfd->hits; *total=*current*stot/scur; + /*logf(LOG_LOG, "r_pos: = %0.1f %0.1f", *current, *total);*/ } @@ -595,12 +597,10 @@ static void r_get_terms(RSET ct, TERMID *terms, int maxterms, int *curterm) for (i=0;i<info->no_rsets;i++) { rset_getterms(info->rsets[i], terms, maxterms, curterm); - yaz_log(LOG_DEBUG,"rsmulti: getterms: i=%d *cur=%d",i,*curterm); - /* FIXME - remove this log once we know it works */ if ( ( (*curterm) > firstterm+1 ) && ( (*curterm) <= maxterms ) && ( terms[(*curterm)-1] == terms[firstterm] ) ) - *curterm--; /* forget the term, seen that before */ + (*curterm)--; /* forget the term, seen that before */ } } -- 1.7.10.4