X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Franksimilarity.c;h=4ee613a23a4213514774cb1248469c60bddb6513;hb=8de059a4446c5ad8f95e8090d1dc97f0c2bb8d9e;hp=8c822b27c189943e348b1f00d4c72d706074c51c;hpb=746f36f2b12f2a342dc0213cb03458359a493c8f;p=idzebra-moved-to-github.git diff --git a/index/ranksimilarity.c b/index/ranksimilarity.c index 8c822b2..4ee613a 100644 --- a/index/ranksimilarity.c +++ b/index/ranksimilarity.c @@ -1,4 +1,4 @@ -/* $Id: ranksimilarity.c,v 1.1 2006-05-03 09:31:26 marc Exp $ +/* $Id: ranksimilarity.c,v 1.3 2006-05-04 10:11:09 marc Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -37,44 +37,87 @@ static int log_level = 0; static int log_initialized = 0; struct ranksimilarity_class_info { - int dummy; + int dummy; }; +/** term specific info and statistics to be used under ranking */ struct ranksimilarity_term_info { - int local_occur; - zint global_occur; - int global_inv; - int rank_flag; - int rank_weight; - TERMID term; - int term_index; + + /** frequency of term within document field */ + int freq_term_docfield; + + /** frequency of term within result set of given term */ + zint freq_term_resset; + + /** number of docs within result set */ + zint no_docs_resset; + + /** number of terms in this field */ + zint no_terms_field; + + /** number of docs with this field in database*/ + zint no_docs_field; + + /** rank flag is one if term is to be included in ranking */ + int rank_flag; + + /** relative ranking weight of term */ + int term_weight; + + /** term id used to access term name and other info */ + TERMID term; + + /** index number in terms[i] array */ + int term_index; }; struct ranksimilarity_set_info { - int last_pos; - int no_entries; - int no_rank_entries; - struct ranksimilarity_term_info *entries; - NMEM nmem; + int last_pos; + + /** number of terms in query */ + int no_terms_query; + + /** number of terms in query which are included in ranking */ + int no_ranked_terms_query; + + /** number of documents in entire collection */ + zint no_docs_database; + + /** array of size no_terms_query with statistics gathered per term */ + struct ranksimilarity_term_info *entries; + + NMEM nmem; }; +/* local clean-up function */ +static void ranksimilar_rec_reset(struct ranksimilarity_set_info *si) +{ + int i; + + for (i = 0; i < si->no_terms_query; i++) + { + si->entries[i].freq_term_docfield = 0; + } +} + + /* * create: Creates/Initialises this rank handler. This routine is * called exactly once. The routine returns the class_handle. */ static void *create (ZebraHandle zh) { - struct ranksimilarity_class_info *ci = - (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci)); + struct ranksimilarity_class_info *ci = + (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci)); - if (!log_initialized) + if (!log_initialized) { - log_level = yaz_log_module_level("ranksimilarity"); - log_initialized = 1; + log_level = yaz_log_module_level("rank-similarity"); + log_initialized = 1; } - yaz_log(log_level, "create()"); - return 0; + yaz_log(log_level, "create()"); + return 0; } /* @@ -84,10 +127,10 @@ static void *create (ZebraHandle zh) */ static void destroy (struct zebra_register *reg, void *class_handle) { - struct ranksimilarity_class_info *ci - = (struct ranksimilarity_class_info *) class_handle; - yaz_log(log_level, "destroy()"); - xfree (ci); + struct ranksimilarity_class_info *ci + = (struct ranksimilarity_class_info *) class_handle; + yaz_log(log_level, "destroy()"); + xfree (ci); } @@ -100,53 +143,112 @@ static void *begin (struct zebra_register *reg, void *class_handle, RSET rset, NMEM nmem, TERMID *terms, int numterms) { - struct ranksimilarity_set_info *si = - (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si)); - int i; + struct ranksimilarity_set_info *si = + (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si)); + int i; + + yaz_log(log_level, "begin() numterms=%d", numterms); + + /* setting database global statistics */ + si->no_docs_database = -1; /* TODO */ + + /* setting query statistics */ + si->no_terms_query = numterms; + si->no_ranked_terms_query = 0; + + /* setting internal data structures */ + si->nmem=nmem; + si->entries = (struct ranksimilarity_term_info *) + nmem_malloc (si->nmem, sizeof(*si->entries)*numterms); - yaz_log(log_level, "begin()"); + /* reset the counts for the next term */ + ranksimilar_rec_reset(si); - /* count how many terms are ranked (2=102 or similar) */ - si->no_entries = numterms; - si->no_rank_entries = 0; - si->nmem=nmem; - si->entries = (struct ranksimilarity_term_info *) - nmem_malloc (si->nmem, sizeof(*si->entries)*numterms); - /* looping all terms in a specific field of query */ - for (i = 0; i < numterms; i++) + /* looping all terms in a specific field of query */ + for (i = 0; i < numterms; i++) { - struct ord_list *ol = terms[i]->ol; - - yaz_log(log_level, "begin() term i=%d flags=%s '%s'", i, - terms[i]->flags, terms[i]->name ); - - for (; ol; ol = ol->next) - { - int index_type = 0; - const char *db = 0; - const char *string_index = 0; - int set = -1; - int use = -1; - - zebraExplain_lookup_ord(reg->zei, - ol->ord, &index_type, &db, &set, &use, - &string_index); - - if (string_index) - yaz_log(log_level, "begin() ord=%d index_type=%c db=%s str-index=%s", - ol->ord, index_type, db, string_index); - else - yaz_log(log_level, "begin() ord=%d index_type=%c db=%s set=%d use=%d", - ol->ord, index_type, db, set, use); - } - if (!strncmp (terms[i]->flags, "rank,", 5)) - (si->no_rank_entries)++; - - /* setting next entry in term */ - terms[i]->rankpriv = &(si->entries[i]); + struct ord_list *ol = NULL; + + + /* adding to number of rank entries */ + if (strncmp (terms[i]->flags, "rank,", 5)) + { + si->entries[i].rank_flag = 0; + yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s not ranked", + i, terms[i]->name, terms[i]->flags); + } + else + { + const char *cp = strstr(terms[i]->flags+4, ",w="); + + (si->no_ranked_terms_query)++; + ol = terms[i]->ol; + si->entries[i].rank_flag = 1; + + /* notice that the call to rset_count(rset) has he side-effect of setting + rset->hits_limit = rset_count(rset) ??? */ + si->entries[i].freq_term_resset = rset_count(terms[i]->rset); + /* si->entries[i].freq_term_resset = terms[i]->rset->hits_count; */ + + + yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = %d", + i, rset_count(terms[i]->rset)); + yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = %d", + i, terms[i]->rset->hits_limit); + yaz_log(log_level, "begin() terms[%d]->rset->hits_count = %d", + i, terms[i]->rset->hits_count); + yaz_log(log_level, "begin() terms[%d]->rset->hits_round = %d", + i, terms[i]->rset->hits_round); + yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d", + i, terms[i]->rset->hits_approx); + + + si->entries[i].no_docs_resset = -1; /*TODO*/ + si->entries[i].no_docs_field = -1; /*TODO*/ + si->entries[i].no_terms_field = -1; /*TODO*/ + + if (cp) + si->entries[i].term_weight = atoi (cp+3); + else + si->entries[i].term_weight = 34; /* sqrroot of 1000 */ + + yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s", + i, terms[i]->name, terms[i]->flags); + + /* looping indexes where term terms[i] is found */ + for (; ol; ol = ol->next) + { + int index_type = 0; + const char *db = 0; + const char *string_index = 0; + int set = -1; + int use = -1; + + zebraExplain_lookup_ord(reg->zei, + ol->ord, &index_type, &db, &set, &use, + &string_index); + + if (string_index) + yaz_log(log_level, + "begin() index: ord=%d type=%c db=%s str-index=%s", + ol->ord, index_type, db, string_index); + else + yaz_log(log_level, + "begin() index: ord=%d type=%c db=%s set=%d use=%d", + ol->ord, index_type, db, set, use); + } + + } + + si->entries[i].term = terms[i]; + si->entries[i].term_index=i; + + /* setting next entry in term */ + terms[i]->rankpriv = &(si->entries[i]); } - return si; + + return si; } /* @@ -155,7 +257,7 @@ static void *begin (struct zebra_register *reg, */ static void end (struct zebra_register *reg, void *set_handle) { - yaz_log(log_level, "end()"); + yaz_log(log_level, "end()"); } @@ -166,22 +268,22 @@ static void end (struct zebra_register *reg, void *set_handle) */ static void add (void *set_handle, int seqno, TERMID term) { - struct ranksimilarity_set_info *si = (struct ranksimilarity_set_info *) set_handle; + struct ranksimilarity_set_info *si + = (struct ranksimilarity_set_info *) set_handle; struct ranksimilarity_term_info *ti; - assert(si); - if (!term) + assert(si); + if (!term) { - /* yaz_log(log_level, "add() NULL term"); */ - return; + /* yaz_log(log_level, "add() seqno=%d NULL term", seqno); */ + return; } - - ti= (struct ranksimilarity_term_info *) term->rankpriv; - assert(ti); - si->last_pos = seqno; - ti->local_occur++; - /* yaz_log(log_level, "add() seqno=%d term=%s count=%d", - seqno, term->name,ti->local_occur); */ + ti= (struct ranksimilarity_term_info *) term->rankpriv; + assert(ti); + si->last_pos = seqno; + ti->freq_term_docfield++; + /* yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d", + seqno, term->name, ti->freq_term_docfield); */ } /* @@ -197,28 +299,62 @@ static int calc (void *set_handle, zint sysno, zint staticrank, struct ranksimilarity_set_info *si = (struct ranksimilarity_set_info *) set_handle; - yaz_log(log_level, "calc()"); - if (!si->no_rank_entries) - return -1; /* ranking not enabled for any terms */ + yaz_log(log_level, "calc() sysno = %d", sysno); + yaz_log(log_level, "calc() staticrank = %d", staticrank); + + yaz_log(log_level, "calc() si->no_terms_query = %d", + si->no_terms_query); + yaz_log(log_level, "calc() si->no_ranked_terms_query = %d", + si->no_ranked_terms_query); + yaz_log(log_level, "calc() si->no_docs_database = %d", + si->no_docs_database); - /* here you put in your own score function */ - - /* reset the counts for the next term */ - for (i = 0; i < si->no_entries; i++) - si->entries[i].local_occur = 0; + if (!si->no_ranked_terms_query) + return -1; /* ranking not enabled for any terms */ + /* if we set *stop_flag = 1, we stop processing (of result set list) */ - /* staticrank = 0 is highest, MAXINT lowest */ /* here goes your formula to compute a scoring function */ /* you may use all the gathered statistics here */ + for (i = 0; i < si->no_terms_query; i++) + { + yaz_log(log_level, "calc() entries[%d] termid %d", + i, si->entries[i].term); + if (si->entries[i].term){ + yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s", + i, si->entries[i].term->name, si->entries[i].term->flags); + yaz_log(log_level, "calc() entries[%d] rank_flag %d", + i, si->entries[i].rank_flag ); + yaz_log(log_level, "calc() entries[%d] term_weight %d", + i, si->entries[i].term_weight ); + yaz_log(log_level, "calc() entries[%d] freq_term_docfield %d", + i, si->entries[i].freq_term_docfield ); + yaz_log(log_level, "calc() entries[%d] freq_term_resset %d", + i, si->entries[i].freq_term_resset ); + yaz_log(log_level, "calc() entries[%d] no_docs_resset %d", + i, si->entries[i].no_docs_resset ); + yaz_log(log_level, "calc() entries[%d] no_docs_field %d", + i, si->entries[i].no_docs_field ); + yaz_log(log_level, "calc() entries[%d] no_terms_field %d", + i, si->entries[i].no_terms_field ); + } + } + + /* reset the counts for the next term */ + ranksimilar_rec_reset(si); + + + /* staticrank = 0 is highest, MAXINT lowest */ score = INT_MAX - staticrank; /* but score is reverse (logical) */ + /* debugging statistics output */ + yaz_log(log_level, "calc() statistics: score = %d", score); return score; } @@ -242,13 +378,13 @@ static int calc (void *set_handle, zint sysno, zint staticrank, */ static struct rank_control rank_control = { - "rank-similarity", - create, - destroy, - begin, - end, - calc, - add, + "rank-similarity", + create, + destroy, + begin, + end, + calc, + add, }; struct rank_control *rank_similarity_class = &rank_control;