From 94bf7abfff6a30fab5567d8275db14122ed01822 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 7 Jun 2005 11:36:38 +0000 Subject: [PATCH] Added snippet utilities and snippet window implementation. XSLT filter gets snippet as simple string for now. --- include/idzebra/Makefile.am | 5 +- include/idzebra/recctrl.h | 5 +- include/idzebra/snippet.h | 55 +++++++++++++++ index/extract.c | 38 +++++++++- index/index.h | 9 ++- index/retrieve.c | 38 ++++++++-- index/zebraapi.c | 9 ++- index/zsets.c | 12 ++-- recctrl/xslt.c | 58 ++++++++++++++- test/xslt/marcschema.xml | 3 +- util/Makefile.am | 4 +- util/snippet.c | 163 +++++++++++++++++++++++++++++++++++++++++++ 12 files changed, 378 insertions(+), 21 deletions(-) create mode 100644 include/idzebra/snippet.h create mode 100644 util/snippet.c diff --git a/include/idzebra/Makefile.am b/include/idzebra/Makefile.am index 0c6f616..d92c6f1 100644 --- a/include/idzebra/Makefile.am +++ b/include/idzebra/Makefile.am @@ -1,5 +1,6 @@ -# $Id: Makefile.am,v 1.6 2005-03-30 09:25:23 adam Exp $ +# $Id: Makefile.am,v 1.7 2005-06-07 11:36:38 adam Exp $ pkginclude_HEADERS=api.h version.h res.h recctrl.h data1.h recgrs.h \ - zebramap.h bfile.h dict.h isam-codec.h isams.h isamc.h isamb.h util.h + zebramap.h bfile.h dict.h isam-codec.h isams.h isamc.h isamb.h util.h \ + snippet.h diff --git a/include/idzebra/recctrl.h b/include/idzebra/recctrl.h index 0a39e70..03e21af 100644 --- a/include/idzebra/recctrl.h +++ b/include/idzebra/recctrl.h @@ -1,4 +1,4 @@ -/* $Id: recctrl.h,v 1.10 2005-04-28 08:20:39 adam Exp $ +/* $Id: recctrl.h,v 1.11 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -29,6 +29,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include #include +#include #include YAZ_BEGIN_CDECL @@ -87,6 +88,8 @@ struct recRetrieveCtrl { int recordSize; /* size of record in bytes */ char *fname; /* name of file (or NULL if internal) */ data1_handle dh; + zebra_snippets *hit_snippet; + zebra_snippets *doc_snippet; /* response */ oid_value output_format; diff --git a/include/idzebra/snippet.h b/include/idzebra/snippet.h new file mode 100644 index 0000000..e778e3c --- /dev/null +++ b/include/idzebra/snippet.h @@ -0,0 +1,55 @@ +/* $Id: snippet.h,v 1.1 2005-06-07 11:36:38 adam Exp $ + Copyright (C) 1995-2005 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +#ifndef SNIPPET_H +#define SNIPPET_H + +#include + +YAZ_BEGIN_CDECL + +struct zebra_snippet_word { + zint seqno; + int ord; + char *term; + int match; + struct zebra_snippet_word *next; +}; + +typedef struct zebra_snippets zebra_snippets; +typedef struct zebra_snippet_word zebra_snippet_word; + +zebra_snippets *zebra_snippets_create(); +void zebra_snippets_destroy(zebra_snippets *l); +void zebra_snippets_append(zebra_snippets *l, + zint seqno, int ord, const char *term); +void zebra_snippets_append_match(zebra_snippets *l, + zint seqno, int ord, const char *term, + int match); +zebra_snippet_word *zebra_snippets_list(zebra_snippets *l); +void zebra_snippets_log(zebra_snippets *l, int log_level); +zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, + int window_size); + +YAZ_END_CDECL + +#endif diff --git a/index/extract.c b/index/extract.c index 02ee660..ca6d012 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.183 2005-05-31 13:01:36 adam Exp $ +/* $Id: extract.c,v 1.184 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -1473,6 +1473,42 @@ void extract_add_it_key (ZebraHandle zh, keys->buf_used = dst - keys->buf; } +ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, struct recKeys *reckeys, + zebra_snippets *snippets) +{ + void *decode_handle = iscz1_start(); + int off = 0; + int seqno = 0; + NMEM nmem = nmem_create(); + + yaz_log(YLOG_LOG, "zebra_rec_keys_snippets buf=%p sz=%d", reckeys->buf, + reckeys->buf_used); + assert(reckeys->buf); + while (off < reckeys->buf_used) + { + const char *src = reckeys->buf + off; + struct it_key key; + char *dst = (char*) &key; + char dst_buf[IT_MAX_WORD]; + char *dst_term = dst_buf; + + iscz1_decode(decode_handle, &dst, &src); + assert(key.len <= 4 && key.len > 2); + + seqno = (int) key.mem[key.len-1]; + + zebra_term_untrans_iconv(zh, nmem, src[0], &dst_term, src+1); + zebra_snippets_append(snippets, seqno, key.mem[0], dst_term); + while (*src++) + ; + off = src - reckeys->buf; + nmem_reset(nmem); + } + nmem_destroy(nmem); + iscz1_stop(decode_handle); + return ZEBRA_OK; +} + void print_rec_keys(ZebraHandle zh, struct recKeys *reckeys) { void *decode_handle = iscz1_start(); diff --git a/index/index.h b/index/index.h index 7a921ef..6d656ac 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.140 2005-06-06 21:31:08 adam Exp $ +/* $Id: index.h,v 1.141 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -416,7 +416,8 @@ extern struct rank_control *rank1_class; extern struct rank_control *rankzv_class; extern struct rank_control *rankliv_class; -int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, ODR stream, +int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, + zebra_snippets *hit_snippet, ODR stream, oid_value input_format, Z_RecordComposition *comp, oid_value *output_format, char **rec_bufp, int *rec_lenp, char **basenamep, @@ -465,6 +466,10 @@ int zebra_record_int_read (void *fh, char *buf, size_t count); void zebra_record_int_end (void *fh, off_t offset); void print_rec_keys(ZebraHandle zh, struct recKeys *reckeys); +ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, struct recKeys *reckeys, + zebra_snippets *snippets); +ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname, + zint sysno, zebra_snippets *snippets); void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, int cmd, struct recKeys *reckeys); diff --git a/index/retrieve.c b/index/retrieve.c index acdbb79..8dbe8a5 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -1,4 +1,4 @@ -/* $Id: retrieve.c,v 1.30 2005-05-31 13:01:37 adam Exp $ +/* $Id: retrieve.c,v 1.31 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -82,7 +82,8 @@ void zebra_record_int_end (void *fh, off_t off) fc->offset_end = off; } -int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, ODR stream, +int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, + zebra_snippets *hit_snippet, ODR stream, oid_value input_format, Z_RecordComposition *comp, oid_value *output_format, char **rec_bufp, int *rec_lenp, char **basenamep, @@ -211,16 +212,43 @@ int zebra_record_fetch (ZebraHandle zh, SYSNO sysno, int score, ODR stream, retrieveCtrl.res = zh->res; retrieveCtrl.rec_buf = 0; retrieveCtrl.rec_len = -1; - + retrieveCtrl.hit_snippet = hit_snippet; + retrieveCtrl.doc_snippet = zebra_snippets_create(); + if (1) { + /* snippets code */ struct recKeys reckeys; + zebra_snippets *snippet; reckeys.buf = rec->info[recInfo_delKeys]; reckeys.buf_used = rec->size[recInfo_delKeys]; - print_rec_keys(zh, &reckeys); + zebra_snippets_rec_keys(zh, &reckeys, retrieveCtrl.doc_snippet); + + + yaz_log(YLOG_LOG, "DOC SNIPPET:"); + zebra_snippets_log(retrieveCtrl.doc_snippet, YLOG_LOG); + yaz_log(YLOG_LOG, "HIT SNIPPET:"); + zebra_snippets_log(retrieveCtrl.hit_snippet, YLOG_LOG); + + snippet = zebra_snippets_window(retrieveCtrl.doc_snippet, + retrieveCtrl.hit_snippet, + 10); + + yaz_log(YLOG_LOG, "WINDOW SNIPPET:"); + zebra_snippets_log(snippet, YLOG_LOG); + + (*rt->retrieve)(clientData, &retrieveCtrl); + + zebra_snippets_destroy(snippet); } - (*rt->retrieve)(clientData, &retrieveCtrl); + else + { + (*rt->retrieve)(clientData, &retrieveCtrl); + } + + zebra_snippets_destroy(retrieveCtrl.doc_snippet); + *output_format = retrieveCtrl.output_format; *rec_bufp = (char *) retrieveCtrl.rec_buf; *rec_lenp = retrieveCtrl.rec_len; diff --git a/index/zebraapi.c b/index/zebraapi.c index f87f209..5ee16a2 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.173 2005-06-02 11:59:53 adam Exp $ +/* $Id: zebraapi.c,v 1.174 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -944,14 +944,18 @@ ZEBRA_RES zebra_records_retrieve(ZebraHandle zh, ODR stream, { char *buf; int len; + zebra_snippets *hit_snippet = zebra_snippets_create(); - zebra_get_hit_vector(zh, setname, poset[i].sysno); + zebra_snippets_hit_vector(zh, setname, poset[i].sysno, + hit_snippet); recs[i].errCode = zebra_record_fetch(zh, poset[i].sysno, poset[i].score, + hit_snippet, stream, input_format, comp, &recs[i].format, &buf, &len, &recs[i].base, &recs[i].errString); + recs[i].len = len; if (len > 0) { @@ -962,6 +966,7 @@ ZEBRA_RES zebra_records_retrieve(ZebraHandle zh, ODR stream, recs[i].buf = buf; recs[i].score = poset[i].score; recs[i].sysno = poset[i].sysno; + zebra_snippets_destroy(hit_snippet); } else { diff --git a/index/zsets.c b/index/zsets.c index 3ec27f7..b4650a1 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -1,4 +1,4 @@ -/* $Id: zsets.c,v 1.86 2005-06-06 21:31:08 adam Exp $ +/* $Id: zsets.c,v 1.87 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -1033,8 +1033,8 @@ ZEBRA_RES zebra_result_set_term_info(ZebraHandle zh, const char *setname, return ZEBRA_FAIL; } -ZEBRA_RES zebra_get_hit_vector(ZebraHandle zh, const char *setname, - zint sysno) +ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname, + zint sysno, zebra_snippets *snippets) { ZebraSet sset = resultSetGet(zh, setname); yaz_log(YLOG_LOG, "zebra_get_hit_vector setname=%s zysno=" ZINT_FORMAT, @@ -1074,14 +1074,18 @@ ZEBRA_RES zebra_get_hit_vector(ZebraHandle zh, const char *setname, { struct ord_list *ol; key_logdump_txt(YLOG_LOG, &key, termid->name); - yaz_log(YLOG_LOG, " type=%d", termid->type); for (ol = termid->ol; ol; ol = ol->next) + { yaz_log(YLOG_LOG, " ord=%d", ol->ord); + zebra_snippets_append(snippets, key.mem[key.len-1], + ol->ord, termid->name); + } } } rset_close(rsfd); rset_delete(rset_comb); + nmem_destroy(nmem); } return ZEBRA_OK; } diff --git a/recctrl/xslt.c b/recctrl/xslt.c index 9fe383a..be771d3 100644 --- a/recctrl/xslt.c +++ b/recctrl/xslt.c @@ -1,4 +1,4 @@ -/* $Id: xslt.c,v 1.7 2005-06-01 07:32:46 adam Exp $ +/* $Id: xslt.c,v 1.8 2005-06-07 11:36:38 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -57,6 +57,16 @@ struct filter_info { #define ZEBRA_SCHEMA_IDENTITY_NS "http://indexdata.dk/zebra/identity/1" static const char *zebra_index_ns = ZEBRA_INDEX_NS; +static void set_param_xml(const char **params, const char *name, + const char *value, ODR odr) +{ + while (*params) + params++; + params[0] = name; + params[1] = value; + params[2] = 0; +} + static void set_param_str(const char **params, const char *name, const char *value, ODR odr) { @@ -412,6 +422,51 @@ static int ioclose_ret(void *context) } +static const char *snippet_doc(struct recRetrieveCtrl *p) +{ + const char *xml_doc_str; + int ord = 0; + WRBUF wrbuf = wrbuf_alloc(); + zebra_snippets *res = + zebra_snippets_window(p->doc_snippet, p->hit_snippet, 10); + zebra_snippet_word *w = zebra_snippets_list(res); + +#if 1 + wrbuf_printf(wrbuf, "\'"); +#else + wrbuf_printf(wrbuf, "\n"); +#endif + for (; w; w = w->next) + { + if (ord == 0) + ord = w->ord; + else if (ord != w->ord) + break; +#if 1 + wrbuf_printf(wrbuf, "%s%s%s ", + w->match ? "*" : "", + w->term, + w->match ? "*" : ""); +#else + wrbuf_printf(wrbuf, " ", + (w->match ? "match='1'" : ""), + w->ord, w->seqno); + wrbuf_xmlputs(wrbuf, w->term); + wrbuf_printf(wrbuf, "\n"); +#endif + } +#if 1 + wrbuf_printf(wrbuf, "\'"); +#else + wrbuf_printf(wrbuf, "\n"); +#endif + xml_doc_str = odr_strdup(p->odr, wrbuf_buf(wrbuf)); + + zebra_snippets_destroy(res); + wrbuf_free(wrbuf, 1); + return xml_doc_str; +} + static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) { const char *esn = ZEBRA_SCHEMA_IDENTITY_NS; @@ -447,6 +502,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p) set_param_int(params, "score", p->score, p->odr); set_param_int(params, "size", p->recordSize, p->odr); + set_param_xml(params, "snippet", snippet_doc(p), p->odr); doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */, 0 /* URL */, 0 /* encoding */, diff --git a/test/xslt/marcschema.xml b/test/xslt/marcschema.xml index f4cf959..7ffa22b 100644 --- a/test/xslt/marcschema.xml +++ b/test/xslt/marcschema.xml @@ -1,8 +1,9 @@ - + + diff --git a/util/Makefile.am b/util/Makefile.am index 8c9783c..11c6178 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.13 2005-03-30 09:25:25 adam Exp $ +## $Id: Makefile.am,v 1.14 2005-06-07 11:36:42 adam Exp $ lib_LTLIBRARIES = libidzebra-util.la @@ -14,6 +14,6 @@ AM_CPPFLAGS = -I$(srcdir)/../include $(YAZINC) -DDEFAULT_PROFILE_PATH=\"$(pkgdat LDADD = libidzebra-util.la $(YAZLALIB) libidzebra_util_la_SOURCES = zint.c res.c charmap.c zebramap.c passwddb.c \ - zebra-lock.c dirent.c xpath.c atoi_zn.c + zebra-lock.c dirent.c xpath.c atoi_zn.c snippet.c passtest_SOURCES = passtest.c diff --git a/util/snippet.c b/util/snippet.c new file mode 100644 index 0000000..c7222dd --- /dev/null +++ b/util/snippet.c @@ -0,0 +1,163 @@ +/* $Id: snippet.c,v 1.1 2005-06-07 11:36:43 adam Exp $ + Copyright (C) 1995-2005 + Index Data ApS + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + +#include +#include +#include +#include + +struct zebra_snippets { + NMEM nmem; + zebra_snippet_word *front; + zebra_snippet_word *tail; +}; + +zebra_snippets *zebra_snippets_create() +{ + NMEM nmem = nmem_create(); + zebra_snippets *l = nmem_malloc(nmem, sizeof(*l)); + l->nmem = nmem; + l->front = l->tail = 0; + return l; +} + +void zebra_snippets_destroy(zebra_snippets *l) +{ + if (l) + nmem_destroy(l->nmem); +} + +void zebra_snippets_append(zebra_snippets *l, + zint seqno, int ord, const char *term) +{ + zebra_snippets_append_match(l, seqno, ord, term, 0); +} + +void zebra_snippets_append_match(zebra_snippets *l, + zint seqno, int ord, const char *term, + int match) +{ + struct zebra_snippet_word *w = nmem_malloc(l->nmem, sizeof(*w)); + + w->next = 0; + if (l->tail) + l->tail->next = w; + else + l->front = w; + l->tail = w; + + w->seqno = seqno; + w->ord = ord; + w->term = nmem_strdup(l->nmem, term); + w->match = match; +} + +zebra_snippet_word *zebra_snippets_list(zebra_snippets *l) +{ + return l->front; +} + +void zebra_snippets_log(zebra_snippets *l, int log_level) +{ + zebra_snippet_word *w; + for (w = l->front; w; w = w->next) + yaz_log(log_level, "term=%s%s seqno=" ZINT_FORMAT " ord=%d", + w->term, (w->match ? "*" : ""), w->seqno, w->ord); +} + +zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, + int window_size) +{ + int ord = -1; + + zebra_snippets *result = zebra_snippets_create(); + + while(1) + { + int window_start; + zebra_snippet_word *hit_w, *doc_w; + int min_ord = 0; /* not set yet */ + for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next) + if (hit_w->ord > ord && + (min_ord == 0 || hit_w->ord < min_ord)) + min_ord = hit_w->ord; + if (min_ord == 0) + break; + ord = min_ord; + + int first_seq_no_best_window = 0; + int last_seq_no_best_window = 0; + int number_best_window = 0; + + for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next) + { + if (hit_w->ord == ord) + { + zebra_snippet_word *look_w = hit_w; + int number_this = 0; + int seq_no_last = 0; + while (look_w && look_w->seqno < hit_w->seqno + window_size) + { + if (look_w->ord == ord) + { + seq_no_last = look_w->seqno; + number_this++; + } + look_w = look_w->next; + } + if (number_this > number_best_window) + { + number_best_window = number_this; + first_seq_no_best_window = hit_w->seqno; + last_seq_no_best_window = seq_no_last; + } + } + } + yaz_log(YLOG_LOG, "ord=%d", ord); + yaz_log(YLOG_LOG, "first_seq_no_best_window=%d", first_seq_no_best_window); + yaz_log(YLOG_LOG, "last_seq_no_best_window=%d", last_seq_no_best_window); + yaz_log(YLOG_LOG, "number_best_window=%d", number_best_window); + + window_start = (first_seq_no_best_window + last_seq_no_best_window - + window_size) / 2; + for (doc_w = zebra_snippets_list(doc); doc_w; doc_w = doc_w->next) + if (doc_w->ord == ord + && doc_w->seqno >= window_start + && doc_w->seqno < window_start + window_size) + { + int match = 0; + for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next) + { + if (hit_w->ord == ord && hit_w->seqno == doc_w->seqno) + + { + match = 1; + break; + } + } + zebra_snippets_append_match(result, doc_w->seqno, ord, + doc_w->term, match); + } + } + return result; +} + -- 1.7.10.4