From 267ea81ca8353540128c7f1c7d7f43148b4bc49f Mon Sep 17 00:00:00 2001 From: Sebastian Hammer Date: Sun, 26 Nov 2006 05:15:43 +0000 Subject: [PATCH] Adding relevance ranking, etc. --- default.bib | 13 +++++ http.c | 22 +------- pazpar2.c | 49 +++++++++++------ pazpar2.h | 6 +++ reclists.c | 22 +++----- reclists.h | 16 +++++- relevance.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++---------- relevance.h | 12 ++--- 8 files changed, 218 insertions(+), 91 deletions(-) create mode 100644 default.bib diff --git a/default.bib b/default.bib new file mode 100644 index 0000000..8069ecc --- /dev/null +++ b/default.bib @@ -0,0 +1,13 @@ +# CCL field mappings +# $Id: default.bib,v 1.1 2006-11-26 05:15:43 quinn Exp $ +# +# The rule below is used when no fields are specified +term u=1016 t=l,r s=al + +# Rules for some BIB-1 fields +au u=1 s=al +ti u=4 s=al +su u=21 s=al +isbn u=7 +issn u=8 +date u=30 r=r diff --git a/http.c b/http.c index f2d6906..0a61044 100644 --- a/http.c +++ b/http.c @@ -1,5 +1,5 @@ /* - * $Id: http.c,v 1.2 2006-11-24 20:29:07 quinn Exp $ + * $Id: http.c,v 1.3 2006-11-26 05:15:43 quinn Exp $ */ #include @@ -394,10 +394,8 @@ static void http_destroy(IOCHAN i) { struct http_channel *s = iochan_getdata(i); - yaz_log(YLOG_DEBUG, "Destroying http channel"); if (s->proxy) { - yaz_log(YLOG_DEBUG, "Destroying Proxy channel"); if (s->proxy->iochan) { close(iochan_getfd(s->proxy->iochan)); @@ -429,8 +427,6 @@ static int http_proxy(struct http_request *rq) struct http_header *hp; struct http_buf *requestbuf; - yaz_log(YLOG_DEBUG, "Proxy request"); - if (!p) // This is a new connection. Create a proxy channel { int sock; @@ -438,7 +434,6 @@ static int http_proxy(struct http_request *rq) int one = 1; int flags; - yaz_log(YLOG_DEBUG, "Creating a new proxy channel"); if (!(pe = getprotobyname("tcp"))) { abort(); } @@ -500,13 +495,10 @@ static void http_io(IOCHAN i, int event) struct http_buf *htbuf; case EVENT_INPUT: - yaz_log(YLOG_DEBUG, "HTTP Input event"); - htbuf = http_buf_create(); res = read(iochan_getfd(i), htbuf->buf, HTTP_BUF_SIZE -1); if (res <= 0 && errno != EAGAIN) { - yaz_log(YLOG_WARN|YLOG_ERRNO, "HTTP read"); http_buf_destroy(htbuf); http_destroy(i); return; @@ -519,11 +511,7 @@ static void http_io(IOCHAN i, int event) } if ((reqlen = request_check(hc->iqueue)) <= 2) - { - yaz_log(YLOG_DEBUG, "We don't have a complete HTTP request yet"); return; - } - yaz_log(YLOG_DEBUG, "We think we have a complete HTTP request (len %d)", reqlen); nmem_reset(hc->nmem); if (!(request = http_parse_request(hc, &hc->iqueue, reqlen))) @@ -552,7 +540,6 @@ static void http_io(IOCHAN i, int event) return; } http_buf_enqueue(&hc->oqueue, hb); - yaz_log(YLOG_DEBUG, "Response ready"); iochan_setflags(i, EVENT_OUTPUT); // Turns off input selecting } if (hc->iqueue) @@ -564,7 +551,6 @@ static void http_io(IOCHAN i, int event) break; case EVENT_OUTPUT: - yaz_log(YLOG_DEBUG, "HTTP output event"); if (hc->oqueue) { struct http_buf *wb = hc->oqueue; @@ -575,7 +561,6 @@ static void http_io(IOCHAN i, int event) http_destroy(i); return; } - yaz_log(YLOG_DEBUG, "HTTP Wrote %d octets", res); if (res == wb->len) { hc->oqueue = hc->oqueue->next; @@ -587,10 +572,8 @@ static void http_io(IOCHAN i, int event) wb->offset += res; } if (!hc->oqueue) { - yaz_log(YLOG_DEBUG, "Writing finished"); if (!strcmp(hc->version, "1.0")) { - yaz_log(YLOG_DEBUG, "Closing 1.0 connection"); http_destroy(i); return; } @@ -620,10 +603,8 @@ static void proxy_io(IOCHAN pi, int event) struct http_buf *htbuf; case EVENT_INPUT: - yaz_log(YLOG_DEBUG, "Proxy input event"); htbuf = http_buf_create(); res = read(iochan_getfd(pi), htbuf->buf, HTTP_BUF_SIZE -1); - yaz_log(YLOG_DEBUG, "Proxy read %d bytes.", res); if (res == 0 || (res < 0 && errno != EINPROGRESS)) { if (hc->oqueue) @@ -650,7 +631,6 @@ static void proxy_io(IOCHAN pi, int event) iochan_setflag(hc->iochan, EVENT_OUTPUT); break; case EVENT_OUTPUT: - yaz_log(YLOG_DEBUG, "Proxy output event"); if (!(htbuf = pc->oqueue)) { iochan_clearflag(pi, EVENT_OUTPUT); diff --git a/pazpar2.c b/pazpar2.c index c2d4fbf..4bfb6e6 100644 --- a/pazpar2.c +++ b/pazpar2.c @@ -1,4 +1,4 @@ -/* $Id: pazpar2.c,v 1.4 2006-11-24 20:29:07 quinn Exp $ */ +/* $Id: pazpar2.c,v 1.5 2006-11-26 05:15:43 quinn Exp $ */ #include #include @@ -16,6 +16,7 @@ #include #include #include +#include #include "pazpar2.h" #include "eventl.h" @@ -83,7 +84,7 @@ static struct parameters { struct timeval base_time; int toget; int chunk; - void *ccl_filter; + CCL_bibset ccl_filter; } global_parameters = { 30, @@ -514,10 +515,18 @@ static void extract_subject(struct session *s, const char *rec) } } +static void pull_relevance_keys(struct session *s, struct record *head, struct record *rec) +{ + relevance_newrec(s->relevance, head); + relevance_countwords(s->relevance, head, rec->merge_key, strlen(rec->merge_key)); + relevance_donerecord(s->relevance, head); +} + struct record *ingest_record(struct target *t, char *buf, int len) { struct session *s = t->session; struct record *res; + struct record *head; const char *recbuf; wrbuf_rewind(s->wrbuf); @@ -541,10 +550,11 @@ struct record *ingest_record(struct target *t, char *buf, int len) res->target = t; res->next_cluster = 0; res->target_offset = -1; + res->term_frequency_vec = 0; - yaz_log(YLOG_DEBUG, "Key: %s", res->merge_key); + head = reclist_insert(s->reclist, res); - reclist_insert(s->reclist, res); + pull_relevance_keys(s, head, res); return res; } @@ -583,7 +593,6 @@ void ingest_records(struct target *t, Z_Records *r) rec = ingest_record(t, buf, len); if (!rec) continue; - yaz_log(YLOG_DEBUG, "Ingested a fooking record"); } } @@ -600,10 +609,6 @@ static void do_presentResponse(IOCHAN i, Z_APDU *a) t->diagnostic = *recs->u.nonSurrogateDiagnostic->condition; t->state = Error; } - else - { - yaz_log(YLOG_DEBUG, "Got Records!"); - } } if (!*r->presentStatus && t->state != Error) @@ -697,7 +702,6 @@ static void handler(IOCHAN i, int event) t->state = Failed; return; } - yaz_log(YLOG_DEBUG, "Successfully decoded %d oct PDU", len); switch (a->which) { case Z_APDU_initResponse: @@ -856,11 +860,11 @@ void search(struct session *s, char *query) } if (live_channels) { - const char *t[] = { "aa", "ab", 0 }; + const char *p[] = { query, 0 }; int maxrecs = live_channels * global_parameters.toget; s->termlist = termlist_create(s->nmem, maxrecs, 15); s->reclist = reclist_create(s->nmem, maxrecs); - relevance_create(s->nmem, t, 1000); + s->relevance = relevance_create(s->nmem, p, maxrecs); } } @@ -925,15 +929,17 @@ struct record **show(struct session *s, int start, int *num) // FIXME -- skip initial records - reclist_rewind(s->reclist); + relevance_prepare_read(s->relevance, s->reclist); for (i = 0; i < *num; i++) { - recs[i] = reclist_read_record(s->reclist); - if (!recs[i]) + struct record *r = reclist_read_record(s->reclist); + if (!r) { *num = i; break; } + recs[i] = r; + yaz_log(YLOG_DEBUG, "%d: %s%s", r->relevance, r->merge_key, r->next_cluster ? " (cluster)": ""); } return recs; } @@ -967,9 +973,15 @@ void statistics(struct session *s, struct statistics *stat) stat->num_connections = i; } -static void *load_cclfile(const char *fn) +static CCL_bibset load_cclfile(const char *fn) { - return 0; + CCL_bibset res = ccl_qual_mk(); + if (ccl_qual_fname(res, fn) < 0) + { + yaz_log(YLOG_FATAL|YLOG_ERRNO, "%s", fn); + exit(1); + } + return res; } int main(int argc, char **argv) @@ -1006,6 +1018,9 @@ int main(int argc, char **argv) } + if (!global_parameters.ccl_filter) + load_cclfile("default.bib"); + event_loop(&channel_list); return 0; diff --git a/pazpar2.h b/pazpar2.h index aa33a65..cb4e70f 100644 --- a/pazpar2.h +++ b/pazpar2.h @@ -1,14 +1,19 @@ #ifndef PAZPAR2_H #define PAZPAR2_H +struct record; + #include #include "termlists.h" +#include "relevance.h" struct record { struct target *target; int target_offset; char *buf; char *merge_key; + int relevance; + int *term_frequency_vec; struct record *next_cluster; }; @@ -20,6 +25,7 @@ struct session { NMEM nmem; WRBUF wrbuf; struct termlist *termlist; + struct relevance *relevance; struct reclist *reclist; yaz_marc_t yaz_marc; }; diff --git a/reclists.c b/reclists.c index 004efcd..3a7ac5c 100644 --- a/reclists.c +++ b/reclists.c @@ -1,5 +1,5 @@ /* - * $Id: reclists.c,v 1.1 2006-11-24 20:29:07 quinn Exp $ + * $Id: reclists.c,v 1.2 2006-11-26 05:15:43 quinn Exp $ */ #include @@ -15,20 +15,6 @@ struct reclist_bucket struct reclist_bucket *next; }; -struct reclist -{ - struct reclist_bucket **hashtable; - int hashtable_size; - int hashmask; - - struct record **flatlist; - int flatlist_size; - int num_records; - int pointer; - - NMEM nmem; -}; - struct record *reclist_read_record(struct reclist *l) { if (l->pointer < l->num_records) @@ -81,10 +67,11 @@ struct reclist *reclist_create(NMEM nmem, int numrecs) return res; } -void reclist_insert(struct reclist *l, struct record *record) +struct record *reclist_insert(struct reclist *l, struct record *record) { unsigned int bucket; struct reclist_bucket **p; + struct record *head; bucket = hash(record->merge_key) & l->hashmask; for (p = &l->hashtable[bucket]; *p; p = &(*p)->next) @@ -96,6 +83,7 @@ void reclist_insert(struct reclist *l, struct record *record) yaz_log(YLOG_LOG, "Found a matching record: %s", record->merge_key); record->next_cluster = existing->next_cluster; existing->next_cluster = record; + head = existing; break; } } @@ -109,7 +97,9 @@ void reclist_insert(struct reclist *l, struct record *record) new->next = 0; *p = new; l->flatlist[l->num_records++] = record; + head = record; } + return head; } diff --git a/reclists.h b/reclists.h index b80522e..f9d38c3 100644 --- a/reclists.h +++ b/reclists.h @@ -1,10 +1,22 @@ #ifndef RECLISTS_H #define RECLISTS_H -struct reclist; +struct reclist +{ + struct reclist_bucket **hashtable; + int hashtable_size; + int hashmask; + + struct record **flatlist; + int flatlist_size; + int num_records; + int pointer; + + NMEM nmem; +}; struct reclist *reclist_create(NMEM, int numrecs); -void reclist_insert(struct reclist *tl, struct record *record); +struct record * reclist_insert(struct reclist *tl, struct record *record); struct record *reclist_read_record(struct reclist *l); void reclist_rewind(struct reclist *l); diff --git a/relevance.c b/relevance.c index fc85e38..221b709 100644 --- a/relevance.c +++ b/relevance.c @@ -1,28 +1,22 @@ /* - * $Id: relevance.c,v 1.1 2006-11-24 20:29:07 quinn Exp $ + * $Id: relevance.c,v 1.2 2006-11-26 05:15:43 quinn Exp $ */ #include +#include +#include #include "relevance.h" #include "pazpar2.h" struct relevance { - struct relevance_record *records; - int num_records; int *doc_frequency_vec; int vec_len; struct word_trie *wt; NMEM nmem; }; -struct relevance_record -{ - struct record *record; - int *term_frequency_vec; -}; - // We use this data structure to recognize terms in input records, // and map them to record term vectors for counting. struct word_trie @@ -55,21 +49,53 @@ static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term, else { c -= 'a'; - if (!n->list[c].child) - { - struct word_trie *new = create_word_trie_node(nmem); - n->list[c].child = new; - } if (!*(++term)) n->list[c].termno = num; else + { + if (!n->list[c].child) + { + struct word_trie *new = create_word_trie_node(nmem); + n->list[c].child = new; + } word_trie_addterm(nmem, n->list[c].child, term, num); + } break; } } +} + +#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1) + +static int word_trie_match(struct word_trie *t, const char *word, int len, int *skipped) +{ + int c = raw_char(tolower(*word)); + + if (!len) + return 0; + + word++; len--; + (*skipped)++; + if (!len || raw_char(*word) < 0) + { + if (t->list[c].termno > 0) + return t->list[c].termno; + else + return 0; + } + else + { + if (t->list[c].child) + { + return word_trie_match(t->list[c].child, word, len, skipped); + } + else + return 0; + } } + static struct word_trie *build_word_trie(NMEM nmem, const char **terms) { struct word_trie *res = create_word_trie_node(nmem); @@ -93,38 +119,123 @@ struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs) res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int)); bzero(res->doc_frequency_vec, res->vec_len * sizeof(int)); res->nmem = nmem; - res->num_records = 0; - res->records = nmem_malloc(nmem, numrecs * sizeof(struct relevance_record *)); res->wt = build_word_trie(nmem, terms); return res; } -struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec) +void relevance_newrec(struct relevance *r, struct record *rec) { - struct relevance_record *res = nmem_malloc(r->nmem, - sizeof(struct relevance_record)); - res->record = rec; - res->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int)); - bzero(res->term_frequency_vec, r->vec_len * sizeof(int)); - return res; + if (!rec->term_frequency_vec) + { + rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int)); + bzero(rec->term_frequency_vec, r->vec_len * sizeof(int)); + } } -void relevance_countwords(struct relevance_record *rec, const char *words, int len) + +// FIXME. The definition of a word is crude here.. should support +// some form of localization mechanism? +void relevance_countwords(struct relevance *r, struct record *head, + const char *words, int len) { + while (len) + { + char c; + int res; + int skipped; + while (len && (c = raw_char(tolower(*words))) < 0) + { + words++; + len--; + } + if (!len) + return; + skipped = 0; + if ((res = word_trie_match(r->wt, words, len, &skipped))) + { + words += skipped; + len -= skipped; + head->term_frequency_vec[res]++; + } + else + { + while (len && (c = raw_char(tolower(*words))) >= 0) + { + words++; + len--; + } + } + head->term_frequency_vec[0]++; + } } -void relevance_donerecord(struct relevance_record *rec) +void relevance_donerecord(struct relevance *r, struct record *head) { + int i; + + for (i = 1; i < r->vec_len; i++) + if (head->term_frequency_vec[i] > 0) + r->doc_frequency_vec[i]++; + + r->doc_frequency_vec[0]++; } -// Prepare for a relevance-sorted read of up to num entries -void relevance_prepare_read(struct relevance *r, int num) +#ifdef FLOAT_REL +static int comp(const void *p1, const void *p2) +{ + float res; + struct record **r1 = (struct record **) p1; + struct record **r2 = (struct record **) p2; + res = (*r2)->relevance - (*r1)->relevance; + if (res > 0) + return 1; + else if (res < 0) + return -1; + else + return 0; +} +#else +static int comp(const void *p1, const void *p2) { + struct record **r1 = (struct record **) p1; + struct record **r2 = (struct record **) p2; + return (*r2)->relevance - (*r1)->relevance; } +#endif -struct record *relevance_read(struct relevance *r) +// Prepare for a relevance-sorted read of up to num entries +void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) { - return 0; + int i; + float *idfvec = xmalloc(rel->vec_len * sizeof(float)); + + // Calculate document frequency vector for each term. + for (i = 1; i < rel->vec_len; i++) + { + if (!rel->doc_frequency_vec[i]) + idfvec[i] = 0; + else + idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]); + } + // Calculate relevance for each document + for (i = 0; i < reclist->num_records; i++) + { + int t; + struct record *rec = reclist->flatlist[i]; + float relevance; + relevance = 0; + for (t = 1; t < rel->vec_len; t++) + { + float termfreq; + if (!rec->term_frequency_vec[0]) + break; + termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0]; + relevance += termfreq * idfvec[t]; + } + rec->relevance = (int) (relevance * 100000); + } + qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp); + reclist->pointer = 0; } /* diff --git a/relevance.h b/relevance.h index 3a2bc15..02d661e 100644 --- a/relevance.h +++ b/relevance.h @@ -4,17 +4,17 @@ #include #include "pazpar2.h" +#include "reclists.h" struct relevance; -struct relevance_record; struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs); -struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec); -void relevance_countwords(struct relevance_record *rec, const char *words, int len); -void relevance_donerecord(struct relevance_record *rec); +void relevance_newrec(struct relevance *r, struct record *rec); +void relevance_countwords(struct relevance *r, struct record *rec, + const char *words, int len); +void relevance_donerecord(struct relevance *r, struct record *rec); -void relevance_prepare_read(struct relevance *r, int num); -struct record *relevance_read(struct relevance *r); +void relevance_prepare_read(struct relevance *rel, struct reclist *rec); #endif -- 1.7.10.4