3 The University of Liverpool
5 Modifications to Zebra 1.1 / YAZ 1.7 to enable ranking
8 Copyright (c) 2001-2002 The University of Liverpool. All
11 Licensed under the Academic Free License version 1.1.
12 http://opensource.org/licenses/academic.php
14 $Id: livcode.c,v 1.1.2.1 2006-10-27 11:06:46 adam Exp $
31 ** These functions/routines
32 ** 1. reads in and builds a linked list of rank attr/rank score pairs
33 ** 2. expand a simple query into a paired list of complex/simple nodes.
38 struct rstype *next_rsnode ;
42 } rsnode, *refrsnode ;
44 refrsnode start_rsnode = NULL ;
47 ** Function/Routine prototypes
49 static int search_for_score( char *rankstr ) ;
50 static char *search_for_rankstr( int rank ) ;
51 static int search_for_rank( int rank ) ;
52 static refrsnode set_rsnode( int rank, int score ) ;
53 static int read_zrank_file(ZebraHandle zh) ;
55 static void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ;
56 static void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ;
57 static Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) ;
58 static Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) ;
59 static Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) ;
60 static Z_Operand *set_operand( Z_Operand *thisop, int newattr ) ;
61 static int check_operand_attrs( Z_Operand *thisop ) ;
65 ** given a rank-string traverse down the linked list ;
66 ** return its score if found otherwise return -1.
68 int search_for_score( char *rankstr )
70 refrsnode node = start_rsnode ;
73 if ( sscanf( rankstr,"%d",&rank ) )
77 if ( node->rank == rank ) return node->score ;
78 node = node->next_rsnode ;
85 ** search_for_rankstr()
86 ** given a rank traverse down the linked list ;
87 ** return its string if found otherwise return NULL.
89 char *search_for_rankstr( int rank )
91 refrsnode node = start_rsnode ;
95 if ( node->rank == rank ) return node->rankstr ;
96 node = node->next_rsnode ;
103 ** given a rank traverse down the linked list ;
104 ** return 1 if found otherwise return 0.
106 int search_for_rank( int rank )
108 refrsnode node = start_rsnode ;
112 if ( node->rank == rank ) return 1 ;
113 node = node->next_rsnode ;
120 ** given a rank and a score, build the rest of the rsnode structure.
122 refrsnode set_rsnode( int rank, int score )
125 refrsnode node = (refrsnode)malloc( sizeof(rsnode) ) ;
128 node->next_rsnode = NULL ;
130 node->score = score ;
132 sprintf( buff,"%d",rank ) ;
133 node->rankstr = (char *)malloc( strlen(buff)+1 ) ;
134 strcpy( node->rankstr, buff ) ;
140 ** read_zrank_file(zh)
141 ** read in the rankfile and build the rank/score linked list ;
142 ** return 0 : can't open the zebra config. file
143 ** return 0 : can't find the rankfile entry in the zebra config. file
144 ** return 0 : can't open the rankfile itself
145 ** return the number of distinct ranks read in.
147 int read_zrank_file(ZebraHandle zh)
150 char line[ LINEMAX ] ;
151 char rname[ LINEMAX ] ;
159 ** open the zebra configuration file and look for the "rankfile:"
160 ** entry which contains the path/name of the rankfile
163 const char *rankfile = res_get_def(zh->res, "rankfile", 0);
164 const char *profilePath = res_get_def(zh->res, "profilePath",
165 DEFAULT_PROFILE_PATH);
169 yaz_log(LOG_LOG, "rankfile entry not found in config file" ) ;
172 ifd = yaz_path_fopen(profilePath, rankfile, "r" ) ;
175 while ( (lineptr = fgets( line,LINEMAX,ifd )) )
177 if ( sscanf( lineptr,"rankfile: %s", rname ) == 1 )
182 ** open the rankfile and read the rank/score pairs
184 ** ignore duplicate ranks
185 ** ignore ranks without +ve scores
189 if ( !(ifd = fopen( rankfile, "r" )) )
191 logf( LOG_LOG, "unable to open rankfile %s",rankfile ) ;
195 while ( (lineptr = fgets( line,LINEMAX,ifd )) )
197 sscanf( lineptr,"%d : %d", &rank,&score ) ;
198 if ( ( score > 0 ) && ( rank != 1016 ) )
200 refrsnode new_rsnode ;
202 if ( search_for_rank( rank ) == 0 )
204 new_rsnode = set_rsnode( rank,score ) ;
205 new_rsnode->next_rsnode = start_rsnode ;
206 start_rsnode = new_rsnode ;
214 yaz_log(LOG_WARN|LOG_ERRNO, "unable to open config file (%s)",
223 ** build an operand "node" - hav to make a complete copy of thisop and
224 ** then insert newattr in the appropriate place
227 Z_Operand *set_operand( Z_Operand *thisop, int newattr )
230 Z_AttributesPlusTerm *attributesplusterm ;
231 Z_AttributeList *attributelist ;
232 Z_AttributeElement *attributeelement ;
233 Z_AttributeElement *attrptr ;
234 Z_AttributeElement **attrptrptr ;
239 operand = (Z_Operand *)
240 malloc( sizeof( Z_Operand ) ) ;
241 attributesplusterm = (Z_AttributesPlusTerm *)
242 malloc( sizeof( Z_AttributesPlusTerm ) ) ;
243 attributelist = (Z_AttributeList *)
244 malloc( sizeof( Z_AttributeList ) ) ;
245 attributeelement = (Z_AttributeElement *)
246 malloc( sizeof( Z_AttributeElement ) ) ;
248 malloc( sizeof( Z_Term ) ) ;
249 general = (Odr_oct *)
250 malloc( sizeof( Odr_oct ) ) ;
252 operand->which = Z_Operand_APT ;
253 operand->u.attributesPlusTerm = attributesplusterm ;
255 attributesplusterm->attributes = attributelist ;
256 attributesplusterm->term = term ;
258 attributelist->num_attributes = thisop->u.attributesPlusTerm->
259 attributes->num_attributes ;
261 attrptr = (Z_AttributeElement *) malloc( sizeof(Z_AttributeElement) *
262 attributelist->num_attributes ) ;
263 attrptrptr = (Z_AttributeElement **) malloc( sizeof(Z_AttributeElement) *
264 attributelist->num_attributes ) ;
266 attributelist->attributes = attrptrptr ;
268 for ( i = 0 ; i < attributelist->num_attributes ; i++ )
270 *attrptr = *thisop->u.attributesPlusTerm->attributes->attributes[i] ;
272 attrptr->attributeType = (int *)malloc( sizeof(int *) ) ;
273 *attrptr->attributeType = *thisop->u.attributesPlusTerm->attributes->
274 attributes[i]->attributeType;
276 attrptr->value.numeric = (int *)malloc( sizeof(int *) ) ;
277 *attrptr->value.numeric = *thisop->u.attributesPlusTerm->attributes->
278 attributes[i]->value.numeric;
280 if ( (*attrptr->attributeType == 1) &&
281 (*attrptr->value.numeric == 1016) )
283 *attrptr->value.numeric = newattr ;
285 *attrptrptr++ = attrptr++ ;
288 term->which = Z_Term_general ;
289 term->u.general = general ;
291 general->len = thisop->u.attributesPlusTerm->term->u.general->len ;
292 general->size = thisop->u.attributesPlusTerm->term->u.general->size ;
293 general->buf = malloc( general->size ) ;
294 strcpy( (char *) general->buf,
296 thisop->u.attributesPlusTerm->term->u.general->buf ) ;
303 ** build a complex "node" with two (simple) operand "nodes" as branches
305 Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 )
310 Z_Operator *roperator ;
312 top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ;
313 s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
314 s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
315 roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ;
317 top->roperator = roperator ;
318 top->roperator->which = Z_Operator_or ;
319 top->roperator->u.op_or = odr_nullval() ;
322 top->s1->which = Z_RPNStructure_simple ;
323 top->s1->u.simple = sim1 ;
326 top->s2->which = Z_RPNStructure_simple ;
327 top->s2->u.simple = sim2 ;
333 ** set_1complex_1operand()
334 ** build a complex "node" with a complex "node" branch and an
335 ** operand "node" branch
337 Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp )
342 Z_Operator *roperator ;
344 top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ;
345 s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
346 s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
347 roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ;
349 top->roperator = roperator ;
350 top->roperator->which = Z_Operator_or ;
351 top->roperator->u.op_or = odr_nullval() ;
354 top->s1->which = Z_RPNStructure_complex ;
355 top->s1->u.complex = comp ;
358 top->s2->which = Z_RPNStructure_simple ;
359 top->s2->u.simple = simp ;
366 ** expand a simple query into a number of complex queries
368 Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop )
374 ** start_rsnode will be set if we have already read the rankfile
375 ** so don't bother again but we need to know the number of attributes
376 ** in the linked list so traverse it again to find out how many.
380 refrsnode node = start_rsnode ;
384 node = node->next_rsnode ;
389 ** only expand the query if there are 2 or more attributes
393 refrsnode node = start_rsnode ;
397 attr1 = node->rank ; node = node->next_rsnode ;
398 attr2 = node->rank ; node = node->next_rsnode ;
401 ** this is the special case and has to be done first because the
402 ** last complex node in the linear list has two simple nodes whereas
403 ** all the others have a complex and a simple.
405 top = set_2operands( set_operand( thisop,attr1 ),
406 set_operand( thisop,attr2 ) ) ;
409 ** do the rest as complex/simple pairs
413 attr1 = node->rank ; node = node->next_rsnode ;
414 top = set_1complex_1operand( top,set_operand( thisop,attr1 ) ) ;
417 ** finally add the 1016 rank attribute at the top of the tree
419 top = set_1complex_1operand( top,set_operand( thisop,1016 ) ) ;
427 ** check_operand_attrs()
428 ** loop through the attributes of a particular operand
429 ** return 1 if (type==1 && value==1016) && (type==2 && value==102)
430 ** otherwise return 0
432 int check_operand_attrs( Z_Operand *thisop )
434 Z_AttributeElement *attrptr ;
440 numattrs = thisop->u.attributesPlusTerm->attributes->num_attributes ;
442 for ( i = 0 ; i < numattrs ; i++ )
444 attrptr = thisop->u.attributesPlusTerm->attributes->attributes[i] ;
446 if ( (*attrptr->attributeType == 1) &&
447 (*attrptr->value.numeric == 1016) )
450 if ( (*attrptr->attributeType == 2) &&
451 (*attrptr->value.numeric == 102) )
455 return (cond1 & cond2) ;
459 ** convert_simple2complex()
462 void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct )
464 Z_Complex *complex = NULL ;
465 Z_Operand *operand = rpnstruct->u.simple ;
467 if ( check_operand_attrs( operand ) )
469 complex = expand_query(zh, operand ) ;
474 ** Everything is complete so replace the original
475 ** operand with the newly built complex structure
476 ** This is it ... no going back!!
478 rpnstruct->which = Z_RPNStructure_complex ;
479 rpnstruct->u.complex = complex ;
485 ** walk_complex_query()
486 ** recursively traverse the tree expanding any simple queries we find
488 void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct )
490 if ( rpnstruct->which == Z_RPNStructure_simple )
492 convert_simple2complex(zh, rpnstruct ) ;
496 walk_complex_query(zh, rpnstruct->u.complex->s1 ) ;
497 walk_complex_query(zh, rpnstruct->u.complex->s2 ) ;
501 void zebra_livcode_transform(ZebraHandle zh, Z_RPNQuery *query)
504 ** Got a search request,
505 ** 1. if it is a simple query, see if it suitable for expansion
506 ** i.e. the attributes are of the form ...
507 ** (type==1 && value==1016) && (type==2 && value==102)
509 ** 2. if it is complex, traverse the complex query tree and expand
510 ** any simples simples as above
513 Z_RPNStructure *rpnstruct = query->RPNStructure ;
515 if ( rpnstruct->which == Z_RPNStructure_simple )
517 convert_simple2complex(zh, rpnstruct ) ;
519 else if ( rpnstruct->which == Z_RPNStructure_complex )
521 walk_complex_query(zh, rpnstruct ) ;
527 struct rank_class_info {
531 struct rank_term_info {
538 struct rank_set_info {
542 struct rank_term_info *entries;
545 static int log2_int (unsigned g)
554 * create: Creates/Initialises this rank handler. This routine is
555 * called exactly once. The routine returns the class_handle.
557 static void *create (ZebraHandle zh)
559 struct rank_class_info *ci = (struct rank_class_info *)
560 xmalloc (sizeof(*ci));
562 logf (LOG_DEBUG, "livrank create");
564 read_zrank_file(zh) ;
570 * destroy: Destroys this rank handler. This routine is called
571 * when the handler is no longer needed - i.e. when the server
572 * dies. The class_handle was previously returned by create.
574 static void destroy (struct zebra_register *reg, void *class_handle)
576 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
578 logf (LOG_DEBUG, "livrank destroy");
584 * begin: Prepares beginning of "real" ranking. Called once for
585 * each result set. The returned handle is a "set handle" and
586 * will be used in each of the handlers below.
588 static void *begin (struct zebra_register *reg, void *class_handle, RSET rset)
590 struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si));
593 logf (LOG_DEBUG, "livrank begin");
594 si->no_entries = rset->no_rset_terms;
595 si->no_rank_entries = 0;
596 si->entries = (struct rank_term_info *)
597 xmalloc (sizeof(*si->entries)*si->no_entries);
598 for (i = 0; i < si->no_entries; i++)
600 const char *flags = rset->rset_terms[i]->flags;
601 int g = rset->rset_terms[i]->nn;
602 const char *cp = strstr(flags, ",u=");
604 si->entries[i].rank_flag = 1;
607 char *t = search_for_rankstr(atoi(cp+3));
609 si->entries[i].rank_flag = search_for_score(t) ;
611 if ( si->entries[i].rank_flag )
612 (si->no_rank_entries)++;
614 si->entries[i].local_occur = 0;
615 si->entries[i].global_occur = g;
616 si->entries[i].global_inv = 32 - log2_int (g);
617 logf (LOG_DEBUG, "-------- %d ------", 32 - log2_int (g));
623 * end: Terminates ranking process. Called after a result set
626 static void end (struct zebra_register *reg, void *set_handle)
628 struct rank_set_info *si = (struct rank_set_info *) set_handle;
629 logf (LOG_DEBUG, "livrank end");
635 * add: Called for each word occurence in a result set. This routine
636 * should be as fast as possible. This routine should "incrementally"
639 static void add (void *set_handle, int seqno, int term_index)
641 struct rank_set_info *si = (struct rank_set_info *) set_handle;
642 logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index);
643 si->last_pos = seqno;
644 si->entries[term_index].local_occur++;
648 * calc: Called for each document in a result. This handler should
649 * produce a score based on previous call(s) to the add handler. The
650 * score should be between 0 and 1000. If score cannot be obtained
651 * -1 should be returned.
653 static int calc (void *set_handle, int sysno)
655 int i, lo, divisor, score = 0;
656 struct rank_set_info *si = (struct rank_set_info *) set_handle;
658 logf (LOG_DEBUG, "livrank calc sysno=%d", sysno);
660 if (!si->no_rank_entries)
662 for (i = 0; i < si->no_entries; i++)
664 score += si->entries[i].local_occur * si->entries[i].rank_flag ;
666 for (i = 0; i < si->no_entries; i++)
667 if (si->entries[i].rank_flag && (lo = si->entries[i].local_occur))
668 score += (8+log2_int (lo)) * si->entries[i].global_inv;
670 divisor = si->no_rank_entries * (8+log2_int (si->last_pos/si->no_entries));
671 score = score / divisor;
674 for (i = 0; i < si->no_entries; i++)
675 si->entries[i].local_occur = 0;
680 * Pseudo-meta code with sequence of calls as they occur in a
681 * server. Handlers are prefixed by --:
697 static struct rank_control rank_control = {
707 struct rank_control *rankliv_class = &rank_control;