From 2fefff35eeb40ba802bb3ee11674a6037b84659c Mon Sep 17 00:00:00 2001 From: Heikki Levanto Date: Thu, 28 Oct 2004 10:37:15 +0000 Subject: [PATCH] Removed livcode ranking Fixed zvrank Added tests for zvrank Started to refactor tests/api --- index/Makefile.am | 4 +- index/livcode.c | 715 --------------------------------------------- index/rank1.c | 4 +- index/zebraapi.c | 11 +- index/zvrank.c | 47 +-- test/api/Makefile.am | 6 +- test/api/rankingrecords.h | 55 ++++ test/api/t10.c | 138 +++++++++ test/api/t9.c | 85 ++++++ test/api/testlib.c | 113 +++++++ 10 files changed, 426 insertions(+), 752 deletions(-) delete mode 100644 index/livcode.c create mode 100644 test/api/rankingrecords.h create mode 100644 test/api/t10.c create mode 100644 test/api/t9.c create mode 100644 test/api/testlib.c diff --git a/index/Makefile.am b/index/Makefile.am index feadea7..906e26b 100644 --- a/index/Makefile.am +++ b/index/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.28 2004-09-27 10:44:49 adam Exp $ +## $Id: Makefile.am,v 1.29 2004-10-28 10:37:15 heikki Exp $ noinst_PROGRAMS = apitest kdump @@ -7,7 +7,7 @@ lib_LTLIBRARIES = libidzebra-api.la libidzebra_api_la_SOURCES = dir.c dirs.c trav.c kinput.c kcompare.c \ attribute.c symtab.c recindex.c recstat.c lockutil.c \ zebraapi.c zinfo.c invstat.c sortidx.c compact.c zsets.c zrpn.c \ - rank1.c trunc.c retrieve.c extract.c livcode.c \ + rank1.c trunc.c retrieve.c extract.c \ index.h recindex.h recindxp.h \ zinfo.h zserver.h zvrank.c diff --git a/index/livcode.c b/index/livcode.c deleted file mode 100644 index ba86b2e..0000000 --- a/index/livcode.c +++ /dev/null @@ -1,715 +0,0 @@ -/* - -The University of Liverpool - -Modifications to Zebra 1.1 / YAZ 1.7 to enable ranking -by attribute weight. - -Copyright (c) 2001-2002 The University of Liverpool. All -rights reserved. - -Licensed under the Academic Free License version 1.1. 
-http://opensource.org/licenses/academic.php - -$Id: livcode.c,v 1.4 2004-10-26 15:32:11 heikki Exp $ - -*/ - -#ifdef SKIPTHIS /* Need to fix the interface - FIXME */ - -#include -#include -#ifdef WIN32 -#include -#else -#include -#endif -#include - -#include "index.h" -#include "zserver.h" - -/* -** These functions/routines -** 1. reads in and builds a linked list of rank attr/rank score pairs -** 2. expand a simple query into a paired list of complex/simple nodes. -*/ - -typedef struct rstype -{ - struct rstype *next_rsnode ; - int rank ; - int score ; - char *rankstr ; -} rsnode, *refrsnode ; - -refrsnode start_rsnode = NULL ; - -/* -** Function/Routine prototypes -*/ -static int search_for_score( char *rankstr ) ; -static char *search_for_rankstr( int rank ) ; -static int search_for_rank( int rank ) ; -static refrsnode set_rsnode( int rank, int score ) ; -static int read_zrank_file(ZebraHandle zh) ; - -static void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ; -static void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ; -static Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) ; -static Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) ; -static Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) ; -static Z_Operand *set_operand( Z_Operand *thisop, int newattr ) ; -static int check_operand_attrs( Z_Operand *thisop ) ; - -/* -** search_for_score() -** given a rank-string traverse down the linked list ; -** return its score if found otherwise return -1. -*/ -int search_for_score( char *rankstr ) -{ - refrsnode node = start_rsnode ; - int rank ; - - if ( sscanf( rankstr,"%d",&rank ) ) - { - while ( node ) - { - if ( node->rank == rank ) return node->score ; - node = node->next_rsnode ; - } - } - return -1 ; -} - -/* -** search_for_rankstr() -** given a rank traverse down the linked list ; -** return its string if found otherwise return NULL. 
-*/ -char *search_for_rankstr( int rank ) -{ - refrsnode node = start_rsnode ; - - while ( node ) - { - if ( node->rank == rank ) return node->rankstr ; - node = node->next_rsnode ; - } - return "rank" ; -} - -/* -** search_for_rank() -** given a rank traverse down the linked list ; -** return 1 if found otherwise return 0. -*/ -int search_for_rank( int rank ) -{ - refrsnode node = start_rsnode ; - - while ( node ) - { - if ( node->rank == rank ) return 1 ; - node = node->next_rsnode ; - } - return 0 ; -} - -/* -** set_rsnode() -** given a rank and a score, build the rest of the rsnode structure. -*/ -refrsnode set_rsnode( int rank, int score ) -{ -#define BUFFMAX 128 - refrsnode node = (refrsnode)malloc( sizeof(rsnode) ) ; - char buff[BUFFMAX] ; - - node->next_rsnode = NULL ; - node->rank = rank ; - node->score = score ; - - sprintf( buff,"%d",rank ) ; - node->rankstr = (char *)malloc( strlen(buff)+1 ) ; - strcpy( node->rankstr, buff ) ; - - return node ; -} - -/* -** read_zrank_file(zh) -** read in the rankfile and build the rank/score linked list ; -** return 0 : can't open the zebra config. file -** return 0 : can't find the rankfile entry in the zebra config. file -** return 0 : can't open the rankfile itself -** return the number of distinct ranks read in. 
-*/ -int read_zrank_file(ZebraHandle zh) -{ -#define LINEMAX 256 - char line[ LINEMAX ] ; - char rname[ LINEMAX ] ; - char *lineptr ; - FILE *ifd ; - int rank = 0 ; - int score = 0 ; - int numranks = 0 ; - - /* - ** open the zebra configuration file and look for the "rankfile:" - ** entry which contains the path/name of the rankfile - */ - - const char *rankfile = res_get_def(zh->res, "rankfile", 0); - const char *profilePath = res_get_def(zh->res, "profilePath", - DEFAULT_PROFILE_PATH); - - if (!rankfile) - { - yaz_log(LOG_LOG, "rankfile entry not found in config file" ) ; - return 0 ; - } - ifd = yaz_path_fopen(profilePath, rankfile, "r" ) ; - if ( ifd ) - { - while ( (lineptr = fgets( line,LINEMAX,ifd )) ) - { - if ( sscanf( lineptr,"rankfile: %s", rname ) == 1 ) - rankfile = rname ; - } - - /* - ** open the rankfile and read the rank/score pairs - ** ignore 1016 - ** ignore duplicate ranks - ** ignore ranks without +ve scores - */ - if ( rankfile ) - { - if ( !(ifd = fopen( rankfile, "r" )) ) - { - logf( LOG_LOG, "unable to open rankfile %s",rankfile ) ; - return 0; - } - - while ( (lineptr = fgets( line,LINEMAX,ifd )) ) - { - sscanf( lineptr,"%d : %d", &rank,&score ) ; - if ( ( score > 0 ) && ( rank != 1016 ) ) - { - refrsnode new_rsnode ; - - if ( search_for_rank( rank ) == 0 ) - { - new_rsnode = set_rsnode( rank,score ) ; - new_rsnode->next_rsnode = start_rsnode ; - start_rsnode = new_rsnode ; - numranks++ ; - } - } - } - } - else - { - yaz_log(LOG_WARN|LOG_ERRNO, "unable to open config file (%s)", - rankfile); - } - } - return numranks ; -} - -/* -** set_operand() -** build an operand "node" - hav to make a complete copy of thisop and -** then insert newattr in the appropriate place -** -*/ -Z_Operand *set_operand( Z_Operand *thisop, int newattr ) -{ - Z_Operand *operand ; - Z_AttributesPlusTerm *attributesplusterm ; - Z_AttributeList *attributelist ; - Z_AttributeElement *attributeelement ; - Z_AttributeElement *attrptr ; - Z_AttributeElement **attrptrptr 
; - Z_Term *term ; - Odr_oct *general ; - int i ; - - operand = (Z_Operand *) - malloc( sizeof( Z_Operand ) ) ; - attributesplusterm = (Z_AttributesPlusTerm *) - malloc( sizeof( Z_AttributesPlusTerm ) ) ; - attributelist = (Z_AttributeList *) - malloc( sizeof( Z_AttributeList ) ) ; - attributeelement = (Z_AttributeElement *) - malloc( sizeof( Z_AttributeElement ) ) ; - term = (Z_Term *) - malloc( sizeof( Z_Term ) ) ; - general = (Odr_oct *) - malloc( sizeof( Odr_oct ) ) ; - - operand->which = Z_Operand_APT ; - operand->u.attributesPlusTerm = attributesplusterm ; - - attributesplusterm->attributes = attributelist ; - attributesplusterm->term = term ; - - attributelist->num_attributes = thisop->u.attributesPlusTerm-> - attributes->num_attributes ; - - attrptr = (Z_AttributeElement *) malloc( sizeof(Z_AttributeElement) * - attributelist->num_attributes ) ; - attrptrptr = (Z_AttributeElement **) malloc( sizeof(Z_AttributeElement) * - attributelist->num_attributes ) ; - - attributelist->attributes = attrptrptr ; - - for ( i = 0 ; i < attributelist->num_attributes ; i++ ) - { - *attrptr = *thisop->u.attributesPlusTerm->attributes->attributes[i] ; - - attrptr->attributeType = (int *)malloc( sizeof(int *) ) ; - *attrptr->attributeType = *thisop->u.attributesPlusTerm->attributes-> - attributes[i]->attributeType; - - attrptr->value.numeric = (int *)malloc( sizeof(int *) ) ; - *attrptr->value.numeric = *thisop->u.attributesPlusTerm->attributes-> - attributes[i]->value.numeric; - - if ( (*attrptr->attributeType == 1) && - (*attrptr->value.numeric == 1016) ) - { - *attrptr->value.numeric = newattr ; - } - *attrptrptr++ = attrptr++ ; - } - - term->which = Z_Term_general ; - term->u.general = general ; - - general->len = thisop->u.attributesPlusTerm->term->u.general->len ; - general->size = thisop->u.attributesPlusTerm->term->u.general->size ; - general->buf = malloc( general->size ) ; - strcpy( general->buf, - thisop->u.attributesPlusTerm->term->u.general->buf ) ; - - return 
operand ; -} - -/* -** set_2operands() -** build a complex "node" with two (simple) operand "nodes" as branches -*/ -Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) -{ - Z_Complex *top ; - Z_RPNStructure *s1 ; - Z_RPNStructure *s2 ; - Z_Operator *roperator ; - - top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ; - s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; - s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; - roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ; - - top->roperator = roperator ; - top->roperator->which = Z_Operator_or ; - top->roperator->u.op_or = odr_nullval() ; - - top->s1 = s1 ; - top->s1->which = Z_RPNStructure_simple ; - top->s1->u.simple = sim1 ; - - top->s2 = s2 ; - top->s2->which = Z_RPNStructure_simple ; - top->s2->u.simple = sim2 ; - - return top ; -} - -/* -** set_1complex_1operand() -** build a complex "node" with a complex "node" branch and an -** operand "node" branch -*/ -Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) -{ - Z_Complex *top ; - Z_RPNStructure *s1 ; - Z_RPNStructure *s2 ; - Z_Operator *roperator ; - - top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ; - s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; - s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; - roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ; - - top->roperator = roperator ; - top->roperator->which = Z_Operator_or ; - top->roperator->u.op_or = odr_nullval() ; - - top->s1 = s1 ; - top->s1->which = Z_RPNStructure_complex ; - top->s1->u.complex = comp ; - - top->s2 = s2 ; - top->s2->which = Z_RPNStructure_simple ; - top->s2->u.simple = simp ; - - return top ; -} - -/* -** expand_query() -** expand a simple query into a number of complex queries -*/ -Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) -{ - Z_Complex *top ; - int numattrs = 0 ; - - /* - ** start_rsnode will be set if we have already read the rankfile - ** so don't bother again but we need to 
know the number of attributes - ** in the linked list so traverse it again to find out how many. - */ - if ( start_rsnode ) - { - refrsnode node = start_rsnode ; - while ( node ) - { - numattrs++ ; - node = node->next_rsnode ; - } - } - - /* - ** only expand the query if there are 2 or more attributes - */ - if ( numattrs >= 2 ) - { - refrsnode node = start_rsnode ; - int attr1 ; - int attr2 ; - - attr1 = node->rank ; node = node->next_rsnode ; - attr2 = node->rank ; node = node->next_rsnode ; - - /* - ** this is the special case and has to be done first because the - ** last complex node in the linear list has two simple nodes whereas - ** all the others have a complex and a simple. - */ - top = set_2operands( set_operand( thisop,attr1 ), - set_operand( thisop,attr2 ) ) ; - - /* - ** do the rest as complex/simple pairs - */ - while ( node ) - { - attr1 = node->rank ; node = node->next_rsnode ; - top = set_1complex_1operand( top,set_operand( thisop,attr1 ) ) ; - } - /* - ** finally add the 1016 rank attribute at the top of the tree - */ - top = set_1complex_1operand( top,set_operand( thisop,1016 ) ) ; - - return top ; - } - else return NULL ; -} - -/* -** check_operand_attrs() -** loop through the attributes of a particular operand -** return 1 if (type==1 && value==1016) && (type==2 && value==102) -** otherwise return 0 -*/ -int check_operand_attrs( Z_Operand *thisop ) -{ - Z_AttributeElement *attrptr ; - int cond1 = 0 ; - int cond2 = 0 ; - int numattrs ; - int i ; - - numattrs = thisop->u.attributesPlusTerm->attributes->num_attributes ; - - for ( i = 0 ; i < numattrs ; i++ ) - { - attrptr = thisop->u.attributesPlusTerm->attributes->attributes[i] ; - - if ( (*attrptr->attributeType == 1) && - (*attrptr->value.numeric == 1016) ) - cond1 = 1 ; - - if ( (*attrptr->attributeType == 2) && - (*attrptr->value.numeric == 102) ) - cond2 = 1 ; - } - - return (cond1 & cond2) ; -} - -/* -** convert_simple2complex() -** -*/ -void convert_simple2complex(ZebraHandle zh, 
Z_RPNStructure *rpnstruct ) -{ - Z_Complex *complex = NULL ; - Z_Operand *operand = rpnstruct->u.simple ; - - if ( check_operand_attrs( operand ) ) - { - complex = expand_query(zh, operand ) ; - - if ( complex ) - { - /* - ** Everything is complete so replace the original - ** operand with the newly built complex structure - ** This is it ... no going back!! - */ - rpnstruct->which = Z_RPNStructure_complex ; - rpnstruct->u.complex = complex ; - } - } -} - -/* -** walk_complex_query() -** recursively traverse the tree expanding any simple queries we find -*/ -void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) -{ - if ( rpnstruct->which == Z_RPNStructure_simple ) - { - convert_simple2complex(zh, rpnstruct ) ; - } - else - { - walk_complex_query(zh, rpnstruct->u.complex->s1 ) ; - walk_complex_query(zh, rpnstruct->u.complex->s2 ) ; - } -} - -void zebra_livcode_transform(ZebraHandle zh, Z_RPNQuery *query) -{ - /* - ** Got a search request, - ** 1. if it is a simple query, see if it suitable for expansion - ** i.e. the attributes are of the form ... - ** (type==1 && value==1016) && (type==2 && value==102) - ** or - ** 2. if it is complex, traverse the complex query tree and expand - ** any simples simples as above - */ -#if LIV_CODE - Z_RPNStructure *rpnstruct = query->RPNStructure ; - - if ( rpnstruct->which == Z_RPNStructure_simple ) - { - convert_simple2complex(zh, rpnstruct ) ; - } - else if ( rpnstruct->which == Z_RPNStructure_complex ) - { - walk_complex_query(zh, rpnstruct ) ; - } -#endif -} - - -struct rank_class_info { - int dummy; -}; - -struct rank_term_info { - int local_occur; - int global_occur; - int global_inv; - int rank_flag; -}; - -struct rank_set_info { - int last_pos; - int no_entries; - int no_rank_entries; - NMEM nmem; - struct rank_term_info *entries; -}; - -static int log2_int (unsigned g) -{ - int n = 0; - while ((g = g>>1)) - n++; - return n; -} - -/* - * create: Creates/Initialises this rank handler. 
This routine is - * called exactly once. The routine returns the class_handle. - */ -static void *create (ZebraHandle zh) -{ - struct rank_class_info *ci = (struct rank_class_info *) - xmalloc (sizeof(*ci)); - - logf (LOG_DEBUG, "livrank create"); - - read_zrank_file(zh) ; - - return ci; -} - -/* - * destroy: Destroys this rank handler. This routine is called - * when the handler is no longer needed - i.e. when the server - * dies. The class_handle was previously returned by create. - */ -static void destroy (struct zebra_register *reg, void *class_handle) -{ - struct rank_class_info *ci = (struct rank_class_info *) class_handle; - - logf (LOG_DEBUG, "livrank destroy"); - xfree (ci); -} - - -/* - * begin: Prepares beginning of "real" ranking. Called once for - * each result set. The returned handle is a "set handle" and - * will be used in each of the handlers below. - */ -static void *begin (struct zebra_register *reg, void *class_handle, - RSET rset, NMEM nmem) -{ - struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si)); - int i; - - logf (LOG_DEBUG, "livrank begin"); - /* FIXME - Now that we don't have term counts in rsets, what do we */ - /* do about this ??? */ - si->no_entries = 0; /* rset->no_rset_terms; */ /* FIXME ??? */ - si->no_rank_entries = 0; - si->nmem=nmem; - si->entries = (struct rank_term_info *) - xmalloc (sizeof(*si->entries)*si->no_entries); - for (i = 0; i < si->no_entries; i++) - { - const char *flags = ""; /* rset->rset_terms[i]->flags; *//* FIXME ???*/ - int g = 0; /* rset->rset_terms[i]->nn; */ /* FIXME ??? 
*/ - const char *cp = strstr(flags, ",u="); - - si->entries[i].rank_flag = 1; - if (cp) - { - char *t = search_for_rankstr(atoi(cp+3)); - if (t) - si->entries[i].rank_flag = search_for_score(t) ; - } - if ( si->entries[i].rank_flag ) - (si->no_rank_entries)++; - - si->entries[i].local_occur = 0; - si->entries[i].global_occur = g; - si->entries[i].global_inv = 32 - log2_int (g); - logf (LOG_DEBUG, "-------- %d ------", 32 - log2_int (g)); - } - return si; -} - -/* - * end: Terminates ranking process. Called after a result set - * has been ranked. - */ -static void end (struct zebra_register *reg, void *set_handle) -{ - struct rank_set_info *si = (struct rank_set_info *) set_handle; - logf (LOG_DEBUG, "livrank end"); - xfree (si->entries); - xfree (si); -} - -/* - * add: Called for each word occurence in a result set. This routine - * should be as fast as possible. This routine should "incrementally" - * update the score. - */ -static void add (void *set_handle, int seqno, int term_index) -{ - struct rank_set_info *si = (struct rank_set_info *) set_handle; - logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index); - si->last_pos = seqno; - si->entries[term_index].local_occur++; -} - -/* - * calc: Called for each document in a result. This handler should - * produce a score based on previous call(s) to the add handler. The - * score should be between 0 and 1000. If score cannot be obtained - * -1 should be returned. 
- */ -static int calc (void *set_handle, zint sysno) -{ - int i, lo, divisor, score = 0; - struct rank_set_info *si = (struct rank_set_info *) set_handle; - - logf (LOG_DEBUG, "livrank calc sysno=" ZINT_FORMAT, sysno); - - if (!si->no_rank_entries) - return -1; - for (i = 0; i < si->no_entries; i++) - { - score += si->entries[i].local_occur * si->entries[i].rank_flag ; - } - for (i = 0; i < si->no_entries; i++) - if (si->entries[i].rank_flag && (lo = si->entries[i].local_occur)) - score += (8+log2_int (lo)) * si->entries[i].global_inv; - score *= 34; - divisor = si->no_rank_entries * (8+log2_int (si->last_pos/si->no_entries)); - score = score / divisor; - if (score > 1000) - score = 1000; - for (i = 0; i < si->no_entries; i++) - si->entries[i].local_occur = 0; - return score; -} - -/* - * Pseudo-meta code with sequence of calls as they occur in a - * server. Handlers are prefixed by --: - * - * server init - * -- create - * foreach search - * rank result set - * -- begin - * foreach record - * foreach word - * -- add - * -- calc - * -- end - * -- destroy - * server close - */ - -static struct rank_control rank_control = { - "livrank", - create, - destroy, - begin, - end, - calc, - add, -}; - -struct rank_control *rankliv_class = &rank_control; -#endif - diff --git a/index/rank1.c b/index/rank1.c index 581e78a..0eaf3b1 100644 --- a/index/rank1.c +++ b/index/rank1.c @@ -1,4 +1,4 @@ -/* $Id: rank1.c,v 1.18 2004-10-26 15:32:11 heikki Exp $ +/* $Id: rank1.c,v 1.19 2004-10-28 10:37:15 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -30,7 +30,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #endif -#define DEBUG_RANK 1 +#define DEBUG_RANK 0 #include "index.h" diff --git a/index/zebraapi.c b/index/zebraapi.c index e97e27c..6a3069a 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.137 2004-10-26 15:32:11 heikki Exp $ +/* $Id: zebraapi.c,v 1.138 2004-10-28 
10:37:15 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -292,10 +292,7 @@ struct zebra_register *zebra_register_open (ZebraService zs, const char *name, reg->ptr_i=0; zebraRankInstall (reg, rank1_class); -#ifdef SKIPTHIS /* FIXME - those ranks not yet converted to new interface */ zebraRankInstall (reg, rankzv_class); - zebraRankInstall (reg, rankliv_class); -#endif recordCompression = res_get_def (res, "recordCompression", "none"); if (!strcmp (recordCompression, "none")) @@ -814,10 +811,6 @@ int zebra_search_RPN (ZebraHandle zh, ODR o, if (zebra_begin_read (zh)) return 1; -#ifdef SKIPTHIS /* FIXME - livcode rank not yet available */ - zebra_livcode_transform(zh, query); -#endif - resultSetAddRPN (zh, odr_extract_mem(o), query, zh->num_basenames, zh->basenames, setname); @@ -900,6 +893,8 @@ int zebra_records_retrieve (ZebraHandle zh, ODR stream, &recs[i].len, &recs[i].base); recs[i].errString = NULL; + recs[i].score=poset[i].score; + recs[i].sysno=poset[i].sysno; } else { diff --git a/index/zvrank.c b/index/zvrank.c index ecebce3..a993276 100644 --- a/index/zvrank.c +++ b/index/zvrank.c @@ -1,4 +1,4 @@ -/* $Id: zvrank.c,v 1.11 2004-10-26 15:32:11 heikki Exp $ +/* $Id: zvrank.c,v 1.12 2004-10-28 10:37:15 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -40,7 +40,6 @@ fernuni-hagen.de> ** "ntc-atn", "atc-atn", etc. 
*/ -#if SKIPTHIS /* FIXME - Disabled while changing the interface to ranking */ #include /* for log */ @@ -645,6 +644,7 @@ static void zv_init(RS rs, const char *rscheme) { rs->db_terms=500000; /* assign correct value here (for debugging) */ rs->db_f_max=50; /* assign correct value here */ rs->db_f_max_str="a"; /* assign correct value here (for debugging) */ + /* FIXME - get those values from somewhere */ zv_init_scheme(rs, rscheme); return; } @@ -687,38 +687,41 @@ static void zv_destroy (struct zebra_register *reg, void *class_handle) { * will be used in each of the handlers below. */ static void *zv_begin(struct zebra_register *reg, void *class_handle, - RSET rset, NMEM nmem) + RSET rset, NMEM nmem, TERMID *terms, int numterms) { - struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs)); + struct rs_info *rs=(struct rs_info *)nmem_malloc(nmem,sizeof(*rs)); struct rank_class_info *ci=(struct rank_class_info *)class_handle; int i; int veclen; + int *ip; zint gocc; /**/ yaz_log(LOG_DEBUG, "zv_begin"); - veclen= 0 ; /* rset->no_rset_terms;*/ /* smaller vector here */ - /* FIXME - Now that we don't have term lists in rsets, what do */ - /* we do here ??? 
*/ + veclen= numterms; zv_init(rs, ci->rscheme); rs->nmem=nmem; rs->veclen=veclen; prn_rs(rs); - rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc)); - rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen); + rs->qdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->qdoc)); + rs->qdoc->terms=(struct ts_info *)nmem_malloc(nmem, + sizeof(*rs->qdoc->terms)*rs->veclen); rs->qdoc->veclen=veclen; rs->qdoc->d_f_max=1; /* no duplicates */ rs->qdoc->d_f_max_str=""; - rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc)); - rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen); + rs->rdoc=(struct ds_info *)nmem_malloc(nmem,sizeof(*rs->rdoc)); + rs->rdoc->terms=(struct ts_info *)nmem_malloc(nmem, + sizeof(*rs->rdoc->terms)*rs->veclen); rs->rdoc->veclen=veclen; rs->rdoc->d_f_max=10; /* just a guess */ rs->rdoc->d_f_max_str=""; /* yaz_log(LOG_DEBUG, "zv_begin_init"); */ for (i = 0; i < rs->veclen; i++) { - gocc= 0; /* rset->rset_terms[i]->nn; */ /* FIXME ??? */ + gocc= rset_count(terms[i]->rset); + terms[i]->rankpriv=ip=nmem_malloc(nmem, sizeof(int)); + *ip=i; /* save the index for add() */ /* yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */ rs->qdoc->terms[i].gocc=gocc; rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */ @@ -737,13 +740,8 @@ static void *zv_begin(struct zebra_register *reg, void *class_handle, */ static void zv_end (struct zebra_register *reg, void *rsi) { - RS rs=(RS)rsi; yaz_log(LOG_DEBUG, "zv_end"); - xfree(rs->qdoc->terms); - xfree(rs->rdoc->terms); - xfree(rs->qdoc); - xfree(rs->rdoc); - xfree(rs); + /* they all are nmem'd */ return; } @@ -752,10 +750,13 @@ static void zv_end (struct zebra_register *reg, void *rsi) * should be as fast as possible. This routine should "incrementally" * update the score. 
*/ -static void zv_add (void *rsi, int seqno, int i) { +static void zv_add (void *rsi, int seqno, TERMID term) { RS rs=(RS)rsi; - /* yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d term_index=%d", seqno, term_index);*/ + int *ip = term->rankpriv; + int i=*ip; rs->rdoc->terms[i].locc++; + yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d '%s' term_index=%d cnt=%d", + seqno, term->name, i, rs->rdoc->terms[i].locc ); } /* @@ -782,8 +783,9 @@ static int zv_calc (void *rsi, zint sysno) (*rs->d_norm_fct)(rs, rs->rdoc); dscore=rs->sim_fct(rs->qdoc, rs->rdoc); } - score = (int) dscore * 1000; - yaz_log (LOG_LOG, "sysno=" ZINT_FORMAT " score=%d", sysno, score); + score = (int) (dscore * 1000 +.5); + yaz_log (LOG_DEBUG, "zv_calc: sysno=" ZINT_FORMAT " score=%d", + sysno, score); if (score > 1000) /* should not happen */ score = 1000; return (int) score; @@ -819,5 +821,4 @@ static struct rank_control rank_control_vsm = { struct rank_control *rankzv_class = &rank_control_vsm; -#endif /* SKIPTHIS */ /* EOF */ diff --git a/test/api/Makefile.am b/test/api/Makefile.am index ceb5608..0bd1cac 100644 --- a/test/api/Makefile.am +++ b/test/api/Makefile.am @@ -1,9 +1,9 @@ -# $Id: Makefile.am,v 1.19 2004-10-24 13:34:45 adam Exp $ +# $Id: Makefile.am,v 1.20 2004-10-28 10:37:15 heikki Exp $ noinst_PROGRAMS = testclient testclient_SOURCES = testclient.c -check_PROGRAMS = t1 t2 t3 t4 t5 t6 t7 t8 +check_PROGRAMS = t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 TESTS = $(check_PROGRAMS) EXTRA_DIST=zebra.cfg zebra6.cfg zebra8.cfg @@ -16,6 +16,8 @@ t5_SOURCES = t5.c t6_SOURCES = t6.c t7_SOURCES = t7.c t8_SOURCES = t8.c +t9_SOURCES = t9.c testlib.c +t10_SOURCES = t10.c testlib.c AM_CPPFLAGS = -I$(top_srcdir)/include $(YAZINC) diff --git a/test/api/rankingrecords.h b/test/api/rankingrecords.h new file mode 100644 index 0000000..aae0365 --- /dev/null +++ b/test/api/rankingrecords.h @@ -0,0 +1,55 @@ +/* $Id: rankingrecords.h,v 1.1 2004-10-28 10:37:15 heikki Exp $ + Copyright (C) 
1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 + Index Data Aps + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +/** rankingrecords.h - some test data for t9, and t10 */ + +const char *recs[] = { + "\n" + " The first title\n" + " \n" + " The first common word is the: the the the \n" + " The second common word is word \n" + " but all have the foo bar \n" + " \n" + "\n", + + "\n" + " The second title\n" + " \n" + " The first common word is the: the \n" + " The second common word is foo: foo foo \n" + " but all have the foo bar \n" + " \n" + "\n", + + "\n" + " The third title\n" + " \n" + " The first common word is the: the \n" + " The third common word is bar: bar \n" + " but all have the foo bar \n" + " \n" + "\n", + + 0 }; + + diff --git a/test/api/t10.c b/test/api/t10.c new file mode 100644 index 0000000..f333448 --- /dev/null +++ b/test/api/t10.c @@ -0,0 +1,138 @@ +/* $Id: t10.c,v 1.1 2004-10-28 10:37:15 heikki Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 + Index Data Aps + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. 
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra. If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
+/** t10.c - test zv-rank */
+
+/* NOTE(review): the header names of the three #include lines below are
+ * missing in this copy of the patch; restore them from the original commit. */
+#include
+#include
+#include
+#include "testlib.h"
+#include "rankingrecords.h"
+
+/* Shorthand for RankingQuery() that passes the call site's line number.
+ * Defined here but not used below: main() calls RankingQuery() directly. */
+#define qry(zh,query,hits,string,score) \
+    RankingQuery(__LINE__,(zh),(query),(hits),(string),(score))
+
+/** One test case: a weighting scheme plus, for each of the three queries
+ *  issued in main(), the text expected in the first record and its score. */
+struct tst {
+    char *schema;   /* value set for the "zvrank.weighting-scheme" resource */
+    char *hit1;     /* expected first record of the query for "the" */
+    int score1;     /* expected score of that record */
+    char *hit2;     /* expected first record of the query "@or foo bar" */
+    int score2;     /* expected score of that record */
+    char *hit3;     /* expected first record of "@or @or the foo bar" */
+    int score3;     /* expected score of that record */
+};
+
+
+/* Weighting schemes to exercise, one row per scheme. The all-zero row ends
+ * the table: the loop in main() stops at the first row whose schema is 0. */
+struct tst tests[] = {
+    {"ntc-atn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"ntc-ntn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"ntc-btn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"ntc-apn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"ntc-npn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"ntc-bpn", "first title", 1000, "first title", 1000, "third title", 826 },
+
+    {"atc-atn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"atc-ntn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"atc-btn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"atc-apn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"atc-npn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"atc-bpn", "first title", 1000, "first title", 1000, "first title", 972 },
+
+    {"npc-atn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"npc-ntn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"npc-btn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"npc-apn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"npc-npn", "first title", 1000, "first title", 1000, "third title", 826 },
+    {"npc-bpn", "first title", 1000, "first title", 1000, "third title", 826 },
+
+    {"apc-atn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"apc-ntn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"apc-btn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"apc-apn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"apc-npn", "first title", 1000, "first title", 1000, "first title", 972 },
+    {"apc-bpn", "first title", 1000, "first title", 1000, "first title", 972 },
+
+    {0,0,0,0,0,0,0},
+};
+
+/**
+ * Test driver for the zv-rank weighting schemes.
+ *
+ * Starts a service from zebrazv.cfg, calls zebra_init() on the "Default"
+ * database and adds every entry of recs[] inside one transaction. Then, for
+ * each row of tests[], it opens a fresh handle, sets the
+ * "zvrank.weighting-scheme" resource to the row's schema and runs three
+ * queries through RankingQuery(), each expecting 3 hits.
+ *
+ * Exits with status 1 if zebra_init() returns non-zero; otherwise reaches
+ * exit(0) once all queries have returned. argc and argv are not used.
+ */
+int main(int argc, char **argv)
+{
+    int i;
+    char *addinfo;
+    ZebraService zs;
+    ZebraHandle zh;
+
+    yaz_log_init_file("t10.log");
+    /* yaz_log_init_level(LOG_ALL); */
+
+    nmem_init ();
+
+    zs = start_service("zebrazv.cfg");
+    zh = zebra_open (zs);
+    zebra_select_database(zh, "Default");
+    logf(LOG_LOG,"going to call init");
+    i=zebra_init(zh);
+    logf(LOG_LOG,"init returned %d",i);
+    if (i) {
+        printf("init failed with %d\n",i);
+        /* i is reused here to receive the error code from zebra_result() */
+        zebra_result(zh, &i, &addinfo);
+        printf(" Error %d %s\n",i,addinfo);
+        exit(1);
+    }
+
+    /* recs[] is not defined in this file (presumably it comes from
+     * rankingrecords.h); the loop stops at its first null entry */
+    zebra_begin_trans (zh, 1);
+    for (i = 0; recs[i]; i++)
+        zebra_add_record (zh, recs[i], strlen(recs[i]));
+    zebra_end_trans (zh);
+    zebra_commit (zh);
+
+    // yaz_log_init_level(LOG_ALL);
+
+    zebra_close(zh);
+
+
+    /* one pass per weighting scheme, each with its own handle */
+    for (i=0; tests[i].schema; i++)
+    {
+        zh = zebra_open (zs);
+        zebra_select_database(zh, "Default");
+        zebra_set_resource(zh, "zvrank.weighting-scheme", tests[i].schema);
+        logf(LOG_LOG,"============%d: %s ============", i,tests[i].schema);
+
+        RankingQuery( __LINE__, zh, "@attr 1=1016 @attr 2=102 the",
+                3, tests[i].hit1, tests[i].score1);
+        RankingQuery( __LINE__, zh, "@attr 1=1016 @attr 2=102 @or foo bar",
+                3, tests[i].hit2, tests[i].score2);
+        RankingQuery( __LINE__, zh,
+                "@attr 1=1016 @attr 2=102 @or @or the foo bar",
+                3, tests[i].hit3, tests[i].score3);
+
+        zebra_close(zh);
+    }
+
+    zebra_stop (zs);
+
+    nmem_exit ();
+    xmalloc_trav ("x");
+    logf(LOG_LOG,"============ ALL TESTS PASSED OK ============");
+    exit (0);
+}
diff --git a/test/api/t9.c b/test/api/t9.c
new file mode 100644
index 0000000..f22cfdb
--- /dev/null
+++ b/test/api/t9.c
@@ -0,0 +1,85 @@
+/* $Id: t9.c,v 1.1 2004-10-28 10:37:15 heikki Exp $
+   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
+   Index Data Aps
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra. If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
+/** t9.c - test rank-1 */
+
+/* NOTE(review): the header names of the three #include lines below are
+ * missing in this copy of the patch; restore them from the original commit. */
+#include
+#include
+#include
+#include "testlib.h"
+#include "rankingrecords.h"
+
+/* Shorthand for RankingQuery() that passes the call site's line number */
+#define qry(zh,query,hits,string,score) \
+    RankingQuery(__LINE__,(zh),(query),(hits),(string),(score))
+
+/**
+ * Test driver for rank-1 ranking.
+ *
+ * Starts a service with an empty configuration name, calls zebra_init() on
+ * the "Default" database, adds every entry of recs[] inside one transaction
+ * and then runs three queries through the qry() macro, each expecting 3 hits,
+ * a given text in the first record and a given score.
+ *
+ * Exits with status 1 if zebra_init() returns non-zero; otherwise reaches
+ * exit(0) once all queries have returned. argc and argv are not used.
+ */
+int main(int argc, char **argv)
+{
+    int i;
+    char *addinfo;
+    ZebraService zs;
+    ZebraHandle zh;
+
+    yaz_log_init_file("t9.log");
+    /* yaz_log_init_level(LOG_ALL); */
+
+    nmem_init ();
+
+    zs = start_service(""); /* default to zebra.cfg */
+    zh = zebra_open (zs);
+    zebra_select_database(zh, "Default");
+    logf(LOG_LOG,"going to call init");
+    i=zebra_init(zh);
+    logf(LOG_LOG,"init returned %d",i);
+    if (i) {
+        printf("init failed with %d\n",i);
+        /* i is reused here to receive the error code from zebra_result() */
+        zebra_result(zh, &i, &addinfo);
+        printf(" Error %d %s\n",i,addinfo);
+        exit(1);
+    }
+
+    /* recs[] is not defined in this file (presumably it comes from
+     * rankingrecords.h); the loop stops at its first null entry */
+    zebra_begin_trans (zh, 1);
+    for (i = 0; recs[i]; i++)
+        zebra_add_record (zh, recs[i], strlen(recs[i]));
+    zebra_end_trans (zh);
+
+    /* the database is selected a second time on the same handle */
+    zebra_select_database(zh, "Default");
+
+    qry( zh, "@attr 1=1016 @attr 2=102 the",
+            3, "first title", 872 );
+
+    qry( zh, "@attr 1=1016 @attr 2=102 foo",
+            3, "second title", 850 );
+
+    /* get the record with the most significant hit, that is the 'bar' */
+    /* as that is the rarest of my search words */
+    qry( zh, "@attr 1=1016 @attr 2=102 @or @or the foo bar",
+            3, "third title", 895 );
+
+
+    /* NOTE(review): t10.c commits right after zebra_end_trans(), before it
+     * queries; here the commit follows the queries - confirm this ordering
+     * is intended. */
+    zebra_commit (zh);
+    zebra_close (zh);
+    zebra_stop (zs);
+
+    nmem_exit ();
+    xmalloc_trav ("x");
+    exit (0);
+}
diff --git a/test/api/testlib.c b/test/api/testlib.c
new file mode 100644
index 0000000..6513bbf
--- /dev/null
+++ b/test/api/testlib.c
@@ -0,0 +1,113 @@
+/* $Id: testlib.c,v 1.1 2004-10-28 10:37:15 heikki Exp $
+   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
+   Index Data Aps
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+ +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +/** testlib - utilities for the api tests */ + +#include +#include +#include + + +/* read zebra.cfg from env var srcdir if it exists; otherwise current dir */ +ZebraService start_service(char *cfgname) +{ + char cfg[256]; + char *srcdir = getenv("srcdir"); + if (!srcdir || ! *srcdir) + srcdir="."; + if (!cfgname || ! *cfgname ) + cfgname="zebra.cfg"; + /*sprintf(cfg, "%.200s%szebra.cfg", srcdir ? srcdir : "", srcdir ? "/" : ""); */ + + sprintf(cfg, "%.200s/%s",srcdir, cfgname); + return zebra_start(cfg); +} + +/** + * makes a query, checks number of hits, and for the first hit, that + * it contains the given string, and that it gets the right score + */ +void RankingQuery(int lineno, ZebraHandle zh, char *query, + int exphits, char *firstrec, int firstscore ) +{ + ZebraRetrievalRecord retrievalRecord[10]; + ODR odr_output = odr_createmem (ODR_DECODE); + ODR odr_input = odr_createmem (ODR_DECODE); + YAZ_PQF_Parser parser = yaz_pqf_create(); + Z_RPNQuery *rpn = yaz_pqf_parse(parser, odr_input, query); + const char *setname="rsetname"; + int hits; + int rc; + int i; + + logf(LOG_LOG,"======================================"); + logf(LOG_LOG,"qry[%d]: %s", lineno, query); + + if (!rpn) { + printf("Error: Parse failed \n%s\n",query); + exit(1); + } + rc=zebra_search_RPN (zh, odr_input, rpn, setname, &hits); + if (rc) { + printf("Error: search returned %d \n%s\n",rc,query); + exit (1); + } + + if (hits != exphits) { + printf("Error: search returned %d hits instead of %d\n", + hits, exphits); + exit (1); + } + 
yaz_pqf_destroy(parser); + + for (i = 0; i<10; i++) + { + retrievalRecord[i].position = i+1; + retrievalRecord[i].score = i+20000; + } + + rc=zebra_records_retrieve (zh, odr_output, setname, 0, + VAL_TEXT_XML, hits, retrievalRecord); + + if (rc) { + printf("Error: retrieve returned %d \n%s\n",rc,query); + exit (1); + } + + if (!strstr(retrievalRecord[0].buf, firstrec)) + { + printf("Error: Got the wrong record first\n"); + printf("Expected '%s' but got \n",firstrec); + printf("%.*s\n",retrievalRecord[0].len,retrievalRecord[0].buf); + exit(1); + } + + if (retrievalRecord[0].score != firstscore) + { + printf("Error: first rec got score %d instead of %d\n", + retrievalRecord[0].score, firstscore); + exit(1); + } + odr_destroy (odr_output); + odr_destroy (odr_input); +} + -- 1.7.10.4