From f0e68a4ee36180d52d69369d0bb09a33392bb643 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 27 Feb 2003 22:55:40 +0000 Subject: [PATCH] Fix memory leak for sortKeys.buf. Add Johannes Leveling's zvrank. Enable with 'rank: zvrank' in zebra.cfg. --- index/Makefile.am | 4 +- index/extract.c | 6 +- index/main.c | 3 +- index/zebraapi.c | 14 +- index/zsets.c | 4 +- index/zvrank.c | 848 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 867 insertions(+), 12 deletions(-) create mode 100644 index/zvrank.c diff --git a/index/Makefile.am b/index/Makefile.am index 227c635..1159faf 100644 --- a/index/Makefile.am +++ b/index/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.20 2003-02-10 09:42:42 adam Exp $ +## $Id: Makefile.am,v 1.21 2003-02-27 22:55:40 adam Exp $ noinst_PROGRAMS = apitest kdump @@ -9,7 +9,7 @@ libzebra_a_SOURCES = dir.c dirs.c trav.c kinput.c kcompare.c \ zebraapi.c zinfo.c invstat.c sortidx.c compact.c zsets.c zrpn.c \ rank1.c trunc.c retrieve.c extract.c \ index.h recindex.h recindxp.h \ - zebraapi.h zinfo.h zserver.h + zebraapi.h zinfo.h zserver.h zvrank.c libzebra_a_LIBADD = \ ../rset/librset.a ../dict/libdict.a ../isams/libisams.a \ diff --git a/index/extract.c b/index/extract.c index fb6e228..bf94e3f 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.137 2003-02-26 12:30:54 pop Exp $ +/* $Id: extract.c,v 1.138 2003-02-27 22:55:40 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -440,8 +440,6 @@ static int recordExtract (ZebraHandle zh, zh->reg->keys.prevAttrSet = -1; zh->reg->keys.prevSeqNo = 0; zh->reg->sortKeys.buf_used = 0; - zh->reg->sortKeys.buf_max = 0; - zh->reg->sortKeys.buf = 0; recordOffset = fi->file_moffset; extractCtrl.offset = fi->file_moffset; @@ -975,8 +973,6 @@ int bufferExtractRecord (ZebraHandle zh, zh->reg->keys.prevAttrSet = -1; zh->reg->keys.prevSeqNo = 0; zh->reg->sortKeys.buf_used = 0; - zh->reg->sortKeys.buf_max = 0; - zh->reg->sortKeys.buf = 0; if (*recordType) { logf (LOG_DEBUG, "Record type explicitly specified: %s", recordType); diff --git a/index/main.c b/index/main.c index 8cdc106..6881e54 100644 --- a/index/main.c +++ b/index/main.c @@ -1,4 +1,4 @@ -/* $Id: main.c,v 1.100 2002-10-22 12:51:08 adam Exp $ +/* $Id: main.c,v 1.101 2003-02-27 22:55:40 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -287,6 +287,7 @@ int main (int argc, char **argv) (double) (tms2.tms_stime - tms1.tms_stime)/100); } #endif + nmem_exit(); exit (0); return 0; } diff --git a/index/zebraapi.c b/index/zebraapi.c index 2ed3b39..97ac2d7 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.84 2003-02-25 21:46:52 adam Exp $ +/* $Id: zebraapi.c,v 1.85 2003-02-27 22:55:40 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -219,6 +219,8 @@ struct zebra_register *zebra_register_open (ZebraService zs, const char *name, reg->keys.buf_max = 0; reg->keys.buf = 0; + reg->sortKeys.buf = 0; + reg->sortKeys.buf_max = 0; reg->records = 0; reg->dict = 0; @@ -234,6 +236,7 @@ struct zebra_register *zebra_register_open (ZebraService zs, const char *name, reg->key_file_no = 0; zebraRankInstall (reg, rank1_class); + zebraRankInstall (reg, rankzv_class); recordCompression = res_get_def (res, "recordCompression", "none"); if (!strcmp (recordCompression, "none")) @@ -398,6 +401,9 @@ static void zebra_register_close (ZebraService zs, struct zebra_register *reg) bfs_destroy (reg->bfs); data1_destroy (reg->dh); + xfree (reg->sortKeys.buf); + xfree (reg->keys.buf); + xfree (reg->key_buf); xfree (reg->name); xfree (reg); @@ -430,6 +436,7 @@ void zebra_close (ZebraHandle zh) { ZebraService zs; struct zebra_session **sp; + int i; if (!zh) return; @@ -442,13 +449,16 @@ void zebra_close (ZebraHandle zh) return ; resultSetDestroy (zh, -1, 0, 0); - if (zh->reg) zebra_register_close (zh->service, zh->reg); zebra_close_res (zh); xfree (zh->record_encoding); + for (i = 0; i < zh->num_basenames; i++) + xfree (zh->basenames[i]); + xfree (zh->basenames); + if (zh->iconv_to_utf8 != 0) yaz_iconv_close (zh->iconv_to_utf8); if (zh->iconv_from_utf8 != 0) diff --git a/index/zsets.c b/index/zsets.c index 9ced5ee..ac28c8e 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -1,4 +1,4 @@ -/* $Id: zsets.c,v 1.41 2003-01-13 22:45:22 adam Exp $ +/* $Id: zsets.c,v 1.42 2003-02-27 22:55:40 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -717,7 +717,7 @@ void resultSetRank (ZebraHandle zh, ZebraSet zebraSet, RSET rset) yaz_log (LOG_LOG, "resultSetRank"); - rank_class = zebraRankLookup (zh, "rank-1"); + rank_class = zebraRankLookup (zh, res_get_def(zh->res, "rank", "rank-1")); rc = rank_class->control; if (rset_read (rset, rfd, &key, &term_index)) diff --git a/index/zvrank.c b/index/zvrank.c new file mode 100644 index 0000000..7265545 --- /dev/null +++ b/index/zvrank.c @@ -0,0 +1,848 @@ +/* $Id: zvrank.c,v 1.1 2003-02-27 22:55:40 adam Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 + Index Data Aps + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +/* zvrank.c */ +/* Vector Space Model for Zebra */ +/* +** six (seven) letter identifier for weighting schema +** best document weighting: +** tfc nfc (tpc npc) [original naming] +** ntc atc npc apc [SMART naming, used here] +** best query weighting: +** nfx tfx bfx (npx tpx bpx) [original naming] +** atn ntn btn apn npn bpn [SMART naming] +*/ + +#include /* for log */ + +#include +#include +#ifdef WIN32 +#include +#else +#include +#endif + +#include "index.h" + +double blog2(double x) { /* sometimes log_e or log_10 is used */ + if (x <= 0) + return 0; + return log(x); +} + +/* structures */ + +struct rs_info { /* for result set */ + int db_docs; /* number of documents in database (collection) */ + int db_terms; /* number of distinct terms in database */ + int db_f_max; /* maximum of f_t in database */ + char *db_f_max_str; /* string (most frequent term) */ + /**/ + char rschema[8]; /* name of ranking schema */ + /**/ + int veclen; + void (*d_tf_fct)(void *, void *); /* doc term frequency function */ + void (*d_idf_fct)(void *, void *); /* doc idf function */ + void (*d_norm_fct)(void *, void *); /* doc normalization function */ + /**/ + void (*q_tf_fct)(void *, void *); /* query term frequency function */ + void (*q_idf_fct)(void *, void *); /* query idf function */ + void (*q_norm_fct)(void *, void *); /* query normalization function */ + + double (*sim_fct)(void *, void *); /* similarity function (scoring function) */ + struct ds_info *qdoc; + struct ds_info *rdoc; +}; +typedef struct rs_info *RS; + +void prn_rs(RS rs) { + int i; + fprintf(stdout, "* RS:\n"); + fprintf(stdout, " db_docs: %d\n", rs->db_docs); + fprintf(stdout, " db_terms: %d\n", rs->db_terms); + fprintf(stdout, " f_max: %d\n", rs->db_f_max); + fprintf(stdout, " f_max_str: %s\n", rs->db_f_max_str); + fprintf(stdout, " veclen: %d\n", rs->veclen); + /* rschema implies functions */ + fprintf(stdout, " rschema: %s\n", rs->rschema); + return; +} + +struct ds_info { /* document info */ + char *docid; /* unique doc identifier */ + int docno; /* doc number */ + int doclen; /* document length */ + int d_f_max; /* maximum number of any term in doc */ + char *d_f_max_str; /* most frequent term in d */ + int veclen; /* vector length */ + struct ts_info *terms; + double docsim; /* similarity in [0, ..., 1] (= score/1000) */ +}; +typedef struct ds_info* DS; + +void prn_ds(DS ds) { + fprintf(stdout, " * DS:\n"); + fprintf(stdout, " docid: %s\n", ds->docid); + fprintf(stdout, " docno: %d\n", ds->docno); + fprintf(stdout, " doclen: %d\n", ds->doclen); + fprintf(stdout, " d_f_max: %d\n", ds->d_f_max); + fprintf(stdout, " d_f_max_str:%s\n", ds->d_f_max_str); + fprintf(stdout, " veclen: %d\n", ds->veclen); + return; +} + +struct ts_info { /* term info */ + char *name; + int *id; + /**/ + int gocc; + int locc; + double tf; + double idf; + double wt; +}; +typedef struct ts_info *TS; + +void prn_ts(TS ts) { + fprintf(stdout, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f\n", + ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt); + return; +} + +/* end structures */ + +/* *** */ + +/* +** weighting functions +*/ + +/* calculate new term frequency vector */ +void tf_none(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i; + int veclen; + int freq; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + freq=ds->terms[i].locc; + ds->terms[i].tf=freq; + } + return; +} + +void tf_binary(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i; + int veclen; + int freq; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + freq=ds->terms[i].locc; + if (freq > 0) + ds->terms[i].tf=1.0; + else + ds->terms[i].tf=0.0; + } + return; +} + +void tf_max_norm(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int tf_max; + int i; + int veclen; + int freq; + /**/ + tf_max=rs->db_f_max; + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + freq=ds->terms[i].locc; + if ((freq > 0) && + (tf_max > 0)) + ds->terms[i].tf=freq/tf_max; + else + ds->terms[i].tf=0.0; + } + return; +} + +void tf_aug_norm(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + double K; + double tf_max; + int i; + int veclen; + int freq; + /**/ + tf_max=rs->db_f_max; + veclen=ds->veclen; + K=0.5; + for (i=0; i < veclen; i++) { + freq=ds->terms[i].locc; + if ((freq > 0) && + (tf_max > 0)) + ds->terms[i].tf=K+(1-K)*(freq/tf_max); + else + ds->terms[i].tf=0.0; + } + return; +} + +void tf_square(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i; + int veclen; + int freq; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + freq=ds->terms[i].locc; + if (freq > 0) + ds->terms[i].tf=freq*freq; + else + ds->terms[i].tf=0.0; + } + return; +} + +void tf_log(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i; + int veclen; + int freq; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + freq=ds->terms[i].locc; + if (freq > 0) + ds->terms[i].tf=1+blog2(freq); + else + ds->terms[i].tf=0.0; + } + return; +} + +/* calculate inverse document frequency vector */ +void idf_none(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i, veclen; + int gocc; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + ds->terms[i].idf=1.0; + } + return; +} + +void idf_tfidf(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int num_docs; + int i, veclen; + int gocc; + double idf; + /**/ + veclen=ds->veclen; + num_docs=rs->db_docs; + for (i=0; i < veclen; i++) { + gocc=ds->terms[i].gocc; + if (gocc==0) + idf=0.0; + else + idf=blog2(num_docs/gocc); + ds->terms[i].idf=idf; + } + return; +} + +void idf_prob(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int num_docs; + int i, veclen; + int gocc; + double idf; + /**/ + veclen=ds->veclen; + num_docs=rs->db_docs; + for (i=0; i < veclen; i++) { + gocc=ds->terms[i].gocc; + if (gocc==0) + idf=0.0; + else + idf=(num_docs-gocc)/gocc; + ds->terms[i].idf=idf; + } + return; +} + +void idf_freq(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int num_docs; + int i, veclen; + int gocc; + double idf; + /**/ + veclen=ds->veclen; + num_docs=rs->db_docs; + if (num_docs==0) + idf=0.0; + else + idf=1/num_docs; + for (i=0; i < veclen; i++) { + // gocc=ds->terms[i].gocc; + ds->terms[i].idf=idf; + } + return; +} + +void idf_squared(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int num_docs; + int i, veclen; + int gocc; + double idf; + /**/ + veclen=ds->veclen; + num_docs=rs->db_docs; + for (i=0; i < veclen; i++) { + gocc=ds->terms[i].gocc; + if (gocc==0.0) + idf=0.0; + else + idf=blog2(num_docs/gocc); + idf=idf*idf; + ds->terms[i].idf=idf; + } + return; +} + +/* calculate normalized weight (tf-idf) vector */ +void norm_none(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i, veclen; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; + } + return; +} + +void norm_sum(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i, veclen; + double tfs=0.0; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; + tfs+=ds->terms[i].wt; + } + for (i=0; i < veclen; i++) { + if (tfs > 0) + ds->terms[i].wt=ds->terms[i].wt/tfs; + } + return; +} + +void norm_cosine(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i, veclen; + double tfs=0.0; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; + tfs+=(ds->terms[i].wt*ds->terms[i].wt); + } + for (i=0; i < veclen; i++) { + if (tfs > 0) + ds->terms[i].wt=ds->terms[i].wt/tfs; + } + return; +} + +void norm_fourth(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i, veclen; + double tfs=0.0, fr; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; + fr=(ds->terms[i].wt*ds->terms[i].wt); + fr=fr*fr; + tfs+=fr; /* ^ 4 */ + } + for (i=0; i < veclen; i++) { + if (tfs > 0) + ds->terms[i].wt=ds->terms[i].wt/tfs; + } + return; +} + +void norm_max(void *rsi, void *dsi) { + RS rs=(RS)rsi; + DS ds=(DS)dsi; + int i, veclen; + double tfm; + /**/ + veclen=ds->veclen; + for (i=0; i < veclen; i++) { + ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf; + if (ds->terms[i].wt > tfm) + tfm=ds->terms[i].wt; + } + for (i=0; i < veclen; i++) { + if (tfm > 0) + ds->terms[i].wt=ds->terms[i].wt/tfm; + } + return; +} + +/* add: norm_pivot, ... */ + +double sim_cosine(void *dsi1, void *dsi2) { + DS ds1=(DS)dsi1; + DS ds2=(DS)dsi2; + int i, veclen; + double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0; + double v1, v2; + /**/ + veclen=ds1->veclen; /* and ds2->veclen */ + for (i=0; i < veclen; i++) { + v1=ds1->terms[i].wt; + v2=ds2->terms[i].wt; + smul +=(v1*v2); + sqr11+=(v1*v1); + sqr22+=(v2*v2); + } + sdiv=sqrt(sqr11*sqr22); + if (sdiv==0.0) + return 0.0; + return (smul/sdiv); + } + +/* add: norm_jaccard, norm_dice, ... */ + +/* end weighting functions */ + +/* *** */ + +/* best-fully-weighted */ +const char* def_rschema="ntc-atn"; + +/* prototype */ +void zv_init_schema(RS, const char*); + +void zv_init(RS rs) { + char *sname="ntc-atn";/* obtain from configuration file */ + fprintf(stdout, "zv_init\n"); + /* alloc rs */ + rs->db_docs=100000; /* assign correct value here */ + rs->db_terms=500000; /* assign correct value here */ + rs->db_f_max=50; /* assign correct value here */ + rs->db_f_max_str="a"; /* assign correct value here */ + zv_init_schema(rs, sname); + return; +} + +void zv_init_schema(RS rs, const char *sname) { + int slen; + char c0, c1, c2, c3, c4, c5, c6; + /**/ + fprintf(stdout, "zv_init_schema\n"); + slen=strlen(sname); + if (slen>0) c0=sname[0]; else c0=def_rschema[0]; + if (slen>0) c1=sname[1]; else c0=def_rschema[1]; + if (slen>0) c2=sname[2]; else c0=def_rschema[2]; + c3='-'; + if (slen>0) c4=sname[4]; else c0=def_rschema[4]; + if (slen>0) c5=sname[5]; else c0=def_rschema[5]; + if (slen>0) c6=sname[6]; else c0=def_rschema[6]; + /**/ + /* assign doc functions */ + switch (c0) { + case 'b': + rs->d_tf_fct=tf_binary; + rs->rschema[0]='b'; + break; + case 'm': + rs->d_tf_fct=tf_max_norm; + rs->rschema[0]='m'; + break; + case 'a': + rs->d_tf_fct=tf_aug_norm; + rs->rschema[0]='a'; + break; + case 's': + rs->d_tf_fct=tf_square; + rs->rschema[0]='s'; + break; + case 'l': + rs->d_tf_fct=tf_log; + rs->rschema[0]='l'; + break; + default: /* 'n' */ + rs->d_tf_fct=tf_none; + rs->rschema[0]='n'; + } + switch (c1) { + case 't': + rs->d_idf_fct=idf_tfidf; + rs->rschema[1]='t'; + break; + case 'p': + rs->d_idf_fct=idf_prob; + rs->rschema[1]='p'; + break; + case 'f': + rs->d_idf_fct=idf_freq; + rs->rschema[1]='f'; + break; + case 's': + rs->d_idf_fct=idf_squared; + rs->rschema[1]='s'; + break; + default: /* 'n' */ + rs->d_idf_fct=idf_none; + rs->rschema[1]='n'; + } + switch (c2) { + case 's': + rs->d_norm_fct=norm_sum; + rs->rschema[2]='s'; + break; + case 'c': + rs->d_norm_fct=norm_cosine; + rs->rschema[2]='c'; + break; + case 'f': + rs->d_norm_fct=norm_fourth; + rs->rschema[2]='t'; + break; + case 'm': + rs->d_norm_fct=norm_max; + rs->rschema[2]='m'; + break; + default: /* 'n' */ + rs->d_norm_fct=norm_none; + rs->rschema[2]='n'; + } + /**/ + rs->rschema[3]='-'; + /* assign query functions */ + switch (c4) { + case 'b': + rs->q_tf_fct=tf_binary; + rs->rschema[4]='b'; + break; + case 'm': + rs->q_tf_fct=tf_max_norm; + rs->rschema[4]='m'; + break; + case 'a': + rs->q_tf_fct=tf_aug_norm; + rs->rschema[4]='a'; + break; + case 's': + rs->q_tf_fct=tf_square; + rs->rschema[4]='s'; + break; + case 'l': + rs->q_tf_fct=tf_log; + rs->rschema[4]='l'; + break; + default: /* 'n' */ + rs->q_tf_fct=tf_none; + rs->rschema[4]='n'; + } + switch (c5) { + case 't': + rs->q_idf_fct=idf_tfidf; + rs->rschema[5]='t'; + break; + case 'p': + rs->q_idf_fct=idf_prob; + rs->rschema[5]='p'; + break; + case 'f': + rs->q_idf_fct=idf_freq; + rs->rschema[5]='f'; + break; + case 's': + rs->q_idf_fct=idf_squared; + rs->rschema[5]='s'; + break; + default: /* 'n' */ + rs->q_idf_fct=idf_none; + rs->rschema[5]='n'; + } + switch (c6) { + case 's': + rs->q_norm_fct=norm_sum; + rs->rschema[6]='s'; + break; + case 'c': + rs->q_norm_fct=norm_cosine; + rs->rschema[6]='c'; + break; + case 'f': + rs->q_norm_fct=norm_fourth; + rs->rschema[6]='f'; + break; + case 'm': + rs->q_norm_fct=norm_max; + rs->rschema[6]='m'; + break; + default: /* 'n' */ + rs->q_norm_fct=norm_none; + rs->rschema[6]='n'; + } + rs->rschema[7]='\0'; + /**/ + rs->sim_fct=sim_cosine; + fprintf(stdout, "zv_schema %s\n", rs->rschema); + + return; +} + +/******/ + +struct rank_class_info { /* where do we need this ? */ + int dummy; +}; + +/* +struct rank_term_info { + int local_occur; + int global_occur; + int global_inv; + int rank_flag; + int rank_weight; +}; + +struct rank_set_info { + int last_pos; + int no_entries; + int no_rank_entries; + struct rank_term_info *entries; +}; +*/ + + +/* + * zv_create: Creates/Initialises this rank handler. This routine is + * called exactly once. The routine returns the class_handle. + */ +static void *zv_create (struct zebra_register *reg) { + struct rank_class_info *ci = (struct rank_class_info *) + xmalloc (sizeof(*ci)); + fprintf(stdout, "zv_create\n"); + logf (LOG_DEBUG, "zv_create"); + return ci; +} + +/* + * zv_destroy: Destroys this rank handler. This routine is called + * when the handler is no longer needed - i.e. when the server + * dies. The class_handle was previously returned by create. + */ +static void zv_destroy (struct zebra_register *reg, void *class_handle) { + struct rank_class_info *ci = (struct rank_class_info *) class_handle; + fprintf(stdout, "zv_destroy\n"); + logf (LOG_DEBUG, "zv_destroy"); + xfree (ci); +} + + +/* + * zv_begin: Prepares beginning of "real" ranking. Called once for + * each result set. The returned handle is a "set handle" and + * will be used in each of the handlers below. + */ +static void *zv_begin (struct zebra_register *reg, void *class_handle, RSET rset) +{ + struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs)); + int i; + int veclen, gocc; + /**/ + logf (LOG_DEBUG, "rank-1 zvbegin"); + fprintf(stdout, "zv_begin\n"); + veclen=rset->no_rset_terms; /* smaller vector here */ + zv_init(rs); + rs->veclen=veclen; + prn_rs(rs); + + rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc)); + rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen); + rs->qdoc->veclen=veclen; + + rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc)); + rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen); + rs->rdoc->veclen=veclen; + /* + si->no_entries = rset->no_rset_terms; + si->no_rank_entries = 0; + si->entries = (struct rank_term_info *) + xmalloc (sizeof(*si->entries)*si->no_entries); + */ + /* fprintf(stdout, "zv_begin_init\n"); */ + for (i = 0; i < rs->veclen; i++) + { + + gocc=rset->rset_terms[i]->nn; + /* fprintf(stdout, "zv_begin_init i=%d gocc=%d\n", i, gocc); */ + if (!strncmp (rset->rset_terms[i]->flags, "rank,", 5)) { + yaz_log (LOG_LOG, "%s", rset->rset_terms[i]->flags); + /*si->entries[i].rank_flag = 1; + (si->no_rank_entries)++; + */ + } else { + /* si->entries[i].rank_flag = 0; */ + } + rs->qdoc->terms[i].gocc=gocc; + rs->qdoc->terms[i].locc=1; /* assume query has no duplicates */ + rs->rdoc->terms[i].gocc=gocc; + rs->rdoc->terms[i].locc=0; + } + return rs; +} + +/* + * zv_end: Terminates ranking process. Called after a result set + * has been ranked. + */ +static void zv_end (struct zebra_register *reg, void *rsi) +{ + RS rs=(RS)rsi; + fprintf(stdout, "zv_end\n"); + logf (LOG_DEBUG, "rank-1 end"); + xfree(rs->qdoc->terms); + xfree(rs->rdoc->terms); + xfree(rs->qdoc); + xfree(rs->rdoc); + xfree(rs); + return; +} + +/* + * zv_add: Called for each word occurence in a result set. This routine + * should be as fast as possible. This routine should "incrementally" + * update the score. + */ +static void zv_add (void *rsi, int seqno, int i) { + RS rs=(RS)rsi; + /*logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index);*/ + /*si->last_pos = seqno;*/ + rs->rdoc->terms[i].locc++; +} + +/* + * zv_calc: Called for each document in a result. This handler should + * produce a score based on previous call(s) to the add handler. The + * score should be between 0 and 1000. If score cannot be obtained + * -1 should be returned. + */ +static int zv_calc (void *rsi, int sysno) +{ + int i, veclen; //lo, divisor, score = 0; + int score=0; + double dscore=0.0; + RS rs=(RS)rsi; + /* fprintf(stdout, "zv_calc\n"); */ + /**/ + veclen=rs->veclen; + if (veclen==0) + return -1; + for (i = 0; i < veclen; i++) { + (*rs->q_tf_fct)(rs, rs->qdoc); /* we should actually do this once */ + (*rs->q_idf_fct)(rs, rs->qdoc); + (*rs->q_norm_fct)(rs, rs->qdoc); + + (*rs->d_tf_fct)(rs, rs->rdoc); + (*rs->d_idf_fct)(rs, rs->rdoc); + (*rs->d_norm_fct)(rs, rs->rdoc); + + dscore=rs->sim_fct(rs->qdoc, rs->rdoc); + } + score = dscore * 1000; + yaz_log (LOG_LOG, "sysno=%d score=%d", sysno, score); + if (score > 1000) + score = 1000; + /* + for (i = 0; i < si->no_entries; i++) + si->entries[i].local_occur = 0; + */ + return score; +} + +/* + * Pseudo-meta code with sequence of calls as they occur in a + * server. Handlers are prefixed by --: + * + * server init + * -- create + * foreach search + * rank result set + * -- begin + * foreach record + * foreach word + * -- add + * -- calc + * -- end + * -- destroy + * server close + */ + +static struct rank_control rank_control_vsm = { + "zvrank", /* "zv_rank", */ /* zvrank */ + zv_create, + zv_destroy, + zv_begin, + zv_end, + zv_calc, + zv_add, +}; + +struct rank_control *rankzv_class = &rank_control_vsm; + +/* EOF */ -- 1.7.10.4