From 6988ba91e363565638c27a8d5895ad9afc409e75 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 21 Aug 2007 11:06:46 +0000 Subject: [PATCH] Generic snippet support. Unlike previous versions of snippet implementations for Zebra, this is not tied to a specific filter. The snippet(s) are returned as an XML record with one or more snippets in it - for the special element set name zebra::snippet. --- doc/architecture.xml | 11 ++- include/idzebra/snippet.h | 28 ++++++-- index/extract.c | 172 +++++++++++++++++++++++++++++++++++++++++---- index/index.h | 9 ++- index/retrieve.c | 163 +++++++++++++++++++++++++++++++++++------- index/zebraapi.c | 5 +- index/zsets.c | 4 +- util/snippet.c | 148 +++++++++++++++++++++++++++++++++----- 8 files changed, 472 insertions(+), 68 deletions(-) diff --git a/doc/architecture.xml b/doc/architecture.xml index dca8925..aaafe2c 100644 --- a/doc/architecture.xml +++ b/doc/architecture.xml @@ -1,5 +1,5 @@ - + Overview of &zebra; Architecture
@@ -496,6 +496,15 @@ &acro.xml; and &acro.sutrs; + + + zebra::snippet + + + Get snippet for record. + + &acro.xml; + diff --git a/include/idzebra/snippet.h b/include/idzebra/snippet.h index e4cb07f..474b9a4 100644 --- a/include/idzebra/snippet.h +++ b/include/idzebra/snippet.h @@ -1,4 +1,4 @@ -/* $Id: snippet.h,v 1.8 2007-01-15 20:08:24 adam Exp $ +/* $Id: snippet.h,v 1.9 2007-08-21 11:06:46 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -32,7 +32,10 @@ struct zebra_snippet_word { int ord; char *term; int match; + int mark; + int ws; struct zebra_snippet_word *next; + struct zebra_snippet_word *prev; }; typedef struct zebra_snippets zebra_snippets; @@ -46,23 +49,38 @@ void zebra_snippets_destroy(zebra_snippets *l); YAZ_EXPORT void zebra_snippets_append(zebra_snippets *l, - zint seqno, int ord, const char *term); + zint seqno, int ws, int ord, const char *term); + +YAZ_EXPORT +void zebra_snippets_appendn(zebra_snippets *l, + zint seqno, int ws, int ord, + const char *term, size_t term_len); YAZ_EXPORT void zebra_snippets_append_match(zebra_snippets *l, - zint seqno, int ord, const char *term, + zint seqno, int ws, int ord, + const char *term, size_t term_len, int match); YAZ_EXPORT zebra_snippet_word *zebra_snippets_list(zebra_snippets *l); YAZ_EXPORT -void zebra_snippets_log(zebra_snippets *l, int log_level); +const zebra_snippet_word *zebra_snippets_constlist(const zebra_snippets *l); + +YAZ_EXPORT +void zebra_snippets_log(const zebra_snippets *l, int log_level, int all); YAZ_EXPORT -zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, +zebra_snippets *zebra_snippets_window(const zebra_snippets *doc, + const zebra_snippets *hit, int window_size); +YAZ_EXPORT +void zebra_snippets_ring(zebra_snippets *doc, const zebra_snippets *hit, + int before, int after); + + YAZ_END_CDECL #endif diff --git a/index/extract.c b/index/extract.c index 1312bfe..64e39d3 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: 
extract.c,v 1.258 2007-05-08 14:27:23 adam Exp $ +/* $Id: extract.c,v 1.259 2007-08-21 11:06:47 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -91,6 +91,20 @@ static void logRecord (ZebraHandle zh) } } +static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl) +{ + int i; + for (i = 0; i<256; i++) + { + if (zebra_maps_is_positioned(zh->reg->zebra_maps, i)) + ctrl->seqno[i] = 1; + else + ctrl->seqno[i] = 0; + } + ctrl->flagShowRecords = !zh->m_flag_rw; +} + + static void extract_add_index_string (RecWord *p, zinfo_index_category_t cat, const char *str, int length); @@ -108,6 +122,147 @@ static void extract_init(struct recExtractCtrl *p, RecWord *w) w->segment = 0; } +struct snip_rec_info { + ZebraHandle zh; + zebra_snippets *snippets; +}; + + +static void snippet_add_complete_field(RecWord *p) +{ + +} + +static void snippet_add_incomplete_field(RecWord *p, int ord) +{ + struct snip_rec_info *h = p->extractCtrl->handle; + ZebraHandle zh = h->zh; + const char *b = p->term_buf; + int remain = p->term_len; + int first = 1; + const char **map = 0; + const char *start = b; + + if (remain > 0) + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); + + while (map) + { + char buf[IT_MAX_WORD+1]; + const char *last = b; + int i, remain; + + /* Skip spaces */ + while (map && *map && **map == *CHR_SPACE) + { + remain = p->term_len - (b - p->term_buf); + last = b; + if (remain > 0) + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, + remain, 0); + else + map = 0; + } + if (!map) + break; + if (start != last) + { + zebra_snippets_appendn(h->snippets, p->seqno, 1, ord, + start, last - start); + + } + start = last; + + i = 0; + while (map && *map && **map != *CHR_SPACE) + { + const char *cp = *map; + + while (i < IT_MAX_WORD && *cp) + buf[i++] = *(cp++); + remain = p->term_len - (b - p->term_buf); + last = b; + if (remain > 0) + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); + else + map = 0; + } 
+ if (!i) + return; + + if (first) + { + first = 0; + if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type)) + { + /* first in field marker */ + p->seqno++; + } + } + if (start != last) + zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, + start, last - start); + start = last; + p->seqno++; + } + +} + +static void snippet_token_add(RecWord *p) +{ + struct snip_rec_info *h = p->extractCtrl->handle; + ZebraHandle zh = h->zh; + + if (zebra_maps_is_index(zh->reg->zebra_maps, p->index_type)) + { + ZebraExplainInfo zei = zh->reg->zei; + int ch = zebraExplain_lookup_attr_str( + zei, zinfo_index_category_index, p->index_type, p->index_name); + + if (zebra_maps_is_complete (h->zh->reg->zebra_maps, p->index_type)) + snippet_add_complete_field (p); + else + snippet_add_incomplete_field(p, ch); + } +} + +static void snippet_schema_add( + struct recExtractCtrl *p, Odr_oid *oid) +{ + +} + +void extract_snippet(ZebraHandle zh, zebra_snippets *sn, + struct ZebraRecStream *stream, + RecType rt, void *recTypeClientData) +{ + struct recExtractCtrl extractCtrl; + struct snip_rec_info info; + int r; + + extractCtrl.stream = stream; + extractCtrl.first_record = 1; + extractCtrl.init = extract_init; + extractCtrl.tokenAdd = snippet_token_add; + extractCtrl.schemaAdd = snippet_schema_add; + assert(zh->reg); + assert(zh->reg->dh); + + extractCtrl.dh = zh->reg->dh; + + info.zh = zh; + info.snippets = sn; + extractCtrl.handle = &info; + extractCtrl.match_criteria[0] = '\0'; + extractCtrl.staticrank = 0; + extractCtrl.action = action_insert; + + init_extractCtrl(zh, &extractCtrl); + + r = (*rt->extract)(recTypeClientData, &extractCtrl); + +} + static void searchRecordKey(ZebraHandle zh, zebra_rec_keys_t reckeys, const char *index_name, @@ -305,19 +460,6 @@ struct recordLogInfo { struct recordGroup *rGroup; }; -static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl) -{ - int i; - for (i = 0; i<256; i++) - { - if 
(zebra_maps_is_positioned(zh->reg->zebra_maps, i)) - ctrl->seqno[i] = 1; - else - ctrl->seqno[i] = 0; - } - ctrl->flagShowRecords = !zh->m_flag_rw; -} - static void all_matches_add(struct recExtractCtrl *ctrl) { RecWord word; @@ -1289,7 +1431,7 @@ ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, assert(index_type); zebra_term_untrans_iconv(zh, nmem, index_type, &dst_term, str); - zebra_snippets_append(snippets, seqno, ord, dst_term); + zebra_snippets_append(snippets, seqno, 0, ord, dst_term); nmem_reset(nmem); } } diff --git a/index/index.h b/index/index.h index 1f6e466..8738d1a 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.198 2007-05-08 12:50:04 adam Exp $ +/* $Id: index.h,v 1.199 2007-08-21 11:06:47 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -307,7 +307,8 @@ ZEBRA_RES resultSetRank(ZebraHandle zh, ZebraSet zebraSet, RSET rset, NMEM nmem); void resultSetInvalidate(ZebraHandle zh); -int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, +int zebra_record_fetch(ZebraHandle zh, const char *setname, + zint sysno, int score, zebra_snippets *hit_snippet, ODR stream, const Odr_oid *input_format, Z_RecordComposition *comp, const Odr_oid **output_format, char **rec_bufp, @@ -316,6 +317,10 @@ int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, void extract_get_fname_tmp(ZebraHandle zh, char *fname, int no); +void extract_snippet(ZebraHandle zh, zebra_snippets *sn, + struct ZebraRecStream *stream, RecType rt, + void *recTypeClientData); + void zebra_index_merge(ZebraHandle zh); ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, diff --git a/index/retrieve.c b/index/retrieve.c index f166dc2..d3ca5ad 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -1,4 +1,4 @@ -/* $Id: retrieve.c,v 1.70 2007-05-08 12:50:04 adam Exp $ +/* $Id: retrieve.c,v 1.71 2007-08-21 11:06:47 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -34,14 +34,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 
02110-1301 USA #include "index.h" #include +#include #include #include #define ZEBRA_XML_HEADER_STR "reg->zei, *rec); @@ -399,7 +400,131 @@ static void retrieve_puts_int(WRBUF wrbuf, const char *name, wrbuf_printf(wrbuf, "%s %i\n", name, value); } -int zebra_special_fetch(ZebraHandle zh, zint sysno, int score, ODR odr, + +static void snippet_xml_record(ZebraHandle zh, WRBUF wrbuf, zebra_snippets *doc) +{ + const zebra_snippet_word *doc_w; + int mark_state = 0; + + wrbuf_printf(wrbuf, "%s>\n", ZEBRA_XML_HEADER_STR); + for (doc_w = zebra_snippets_constlist(doc); doc_w; doc_w = doc_w->next) + { + if (doc_w->mark) + { + int index_type; + const char *db = 0; + const char *string_index = 0; + + zebraExplain_lookup_ord(zh->reg->zei, doc_w->ord, + &index_type, &db, &string_index); + + if (mark_state == 0) + { + wrbuf_printf(wrbuf, " ", index_type); + } + if (doc_w->match) + wrbuf_puts(wrbuf, ""); + /* not printing leading ws */ + if (mark_state || !doc_w->ws || doc_w->match) + wrbuf_xmlputs(wrbuf, doc_w->term); + if (doc_w->match) + wrbuf_puts(wrbuf, ""); + } + else if (mark_state == 1) + { + wrbuf_puts(wrbuf, "\n"); + } + mark_state = doc_w->mark; + } + if (mark_state == 1) + { + wrbuf_puts(wrbuf, "\n"); + } + wrbuf_printf(wrbuf, ""); +} + +int zebra_special_snippet_fetch(ZebraHandle zh, const char *setname, + zint sysno, ODR odr, + const char *elemsetname, + const Odr_oid *input_format, + const Odr_oid **output_format, + char **rec_bufp, int *rec_lenp) +{ + int return_code = 0; + Record rec; + + rec = rec_get(zh->reg->records, sysno); + if (!rec) + { + yaz_log(YLOG_WARN, "rec_get fail on sysno=" ZINT_FORMAT, sysno); + return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + } + else + { + const char *file_type = rec->info[recInfo_fileType]; + void *recTypeClientData; + RecType rt = recType_byName(zh->reg->recTypes, zh->res, + file_type, &recTypeClientData); + zebra_snippets *hit_snippet = zebra_snippets_create(); + WRBUF wrbuf = wrbuf_alloc(); + + 
zebra_snippets_hit_vector(zh, setname, sysno, hit_snippet); + + if (!rt) + return_code = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS; + else + { + struct ZebraRecStream stream; + + return_code = zebra_create_record_stream(zh, &rec, &stream); + if (return_code == 0) + { + zebra_snippets *rec_snippet = zebra_snippets_create(); + extract_snippet(zh, rec_snippet, &stream, + rt, recTypeClientData); + +#if 0 + /* for debugging purposes */ + yaz_log(YLOG_LOG, "---------------------------"); + yaz_log(YLOG_LOG, "REC SNIPPET:"); + zebra_snippets_log(rec_snippet, YLOG_LOG, 1); + yaz_log(YLOG_LOG, "---------------------------"); + yaz_log(YLOG_LOG, "HIT SNIPPET:"); + zebra_snippets_log(hit_snippet, YLOG_LOG, 1); +#endif + + zebra_snippets_ring(rec_snippet, hit_snippet, 5, 5); + +#if 0 + yaz_log(YLOG_LOG, "---------------------------"); + yaz_log(YLOG_LOG, "RING SNIPPET:"); + zebra_snippets_log(rec_snippet, YLOG_LOG, 1); +#endif + + snippet_xml_record(zh, wrbuf, rec_snippet); + + *output_format = yaz_oid_recsyn_xml; + + + zebra_snippets_destroy(rec_snippet); + } + stream.destroy(&stream); + } + if (return_code == 0) + { + *rec_lenp = wrbuf_len(wrbuf); + *rec_bufp = odr_strdup(odr, wrbuf_cstr(wrbuf)); + } + wrbuf_destroy(wrbuf); + rec_free(&rec); + zebra_snippets_destroy(hit_snippet); + } + return return_code; +} + +int zebra_special_fetch(ZebraHandle zh, const char *setname, + zint sysno, int score, ODR odr, const char *elemsetname, const Odr_oid *input_format, const Odr_oid **output_format, @@ -411,6 +536,13 @@ int zebra_special_fetch(ZebraHandle zh, zint sysno, int score, ODR odr, /* *rec_lenp = 0; */ + if (elemsetname && 0 == strcmp(elemsetname, "snippet")) + { + return zebra_special_snippet_fetch(zh, setname, sysno, odr, + elemsetname + 7, + input_format, output_format, + rec_bufp, rec_lenp); + } /* processing zebra::meta::sysno elemset without fetching binary data */ if (elemsetname && 0 == strcmp(elemsetname, "meta::sysno")) @@ -558,7 +690,8 @@ int 
zebra_special_fetch(ZebraHandle zh, zint sysno, int score, ODR odr, } -int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, +int zebra_record_fetch(ZebraHandle zh, const char *setname, + zint sysno, int score, zebra_snippets *hit_snippet, ODR odr, const Odr_oid *input_format, Z_RecordComposition *comp, const Odr_oid **output_format, @@ -579,7 +712,7 @@ int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, /* processing zebra special elementset names of form 'zebra:: */ if (elemsetname && 0 == strncmp(elemsetname, "zebra::", 7)) - return zebra_special_fetch(zh, sysno, score, odr, + return zebra_special_fetch(zh, setname, sysno, score, odr, elemsetname + 7, input_format, output_format, rec_bufp, rec_lenp); @@ -610,7 +743,6 @@ int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, if (rec) { - zebra_snippets *snippet; zebra_rec_keys_t reckeys = zebra_rec_keys_open(); RecType rt; struct recRetrieveCtrl retrieveCtrl; @@ -641,22 +773,6 @@ int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, zebra_rec_keys_to_snippets(zh, reckeys, retrieveCtrl.doc_snippet); zebra_rec_keys_close(reckeys); -#if 0 - /* for debugging purposes */ - yaz_log(YLOG_LOG, "DOC SNIPPET:"); - zebra_snippets_log(retrieveCtrl.doc_snippet, YLOG_LOG); - yaz_log(YLOG_LOG, "HIT SNIPPET:"); - zebra_snippets_log(retrieveCtrl.hit_snippet, YLOG_LOG); -#endif - snippet = zebra_snippets_window(retrieveCtrl.doc_snippet, - retrieveCtrl.hit_snippet, - 10); -#if 0 - /* for debugging purposes */ - yaz_log(YLOG_LOG, "WINDOW SNIPPET:"); - zebra_snippets_log(snippet, YLOG_LOG); -#endif - if (!(rt = recType_byName(zh->reg->recTypes, zh->res, file_type, &clientData))) { @@ -679,7 +795,6 @@ int zebra_record_fetch(ZebraHandle zh, zint sysno, int score, *addinfo = retrieveCtrl.addinfo; } - zebra_snippets_destroy(snippet); zebra_snippets_destroy(retrieveCtrl.doc_snippet); stream.destroy(&stream); diff --git a/index/zebraapi.c b/index/zebraapi.c index 10a4954..93c9ad9 100644 --- 
a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.256 2007-05-21 11:54:59 adam Exp $ +/* $Id: zebraapi.c,v 1.257 2007-08-21 11:06:47 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -1144,7 +1144,8 @@ ZEBRA_RES zebra_records_retrieve(ZebraHandle zh, ODR stream, hit_snippet); #endif recs[i].errCode = - zebra_record_fetch(zh, poset[i].sysno, poset[i].score, + zebra_record_fetch(zh, setname, + poset[i].sysno, poset[i].score, hit_snippet, stream, input_format, comp, &recs[i].format, &buf, &len, diff --git a/index/zsets.c b/index/zsets.c index fb997da..cdfc2d2 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -1,4 +1,4 @@ -/* $Id: zsets.c,v 1.121 2007-04-16 08:44:32 adam Exp $ +/* $Id: zsets.c,v 1.122 2007-08-21 11:06:47 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -1273,7 +1273,7 @@ ZEBRA_RES zebra_snippets_hit_vector(ZebraHandle zh, const char *setname, struct ord_list *ol; for (ol = termid->ol; ol; ol = ol->next) { - zebra_snippets_append(snippets, key.mem[key.len-1], + zebra_snippets_append(snippets, key.mem[key.len-1], 0, ol->ord, termid->name); } } diff --git a/util/snippet.c b/util/snippet.c index 6d45a01..b299e5f 100644 --- a/util/snippet.c +++ b/util/snippet.c @@ -1,4 +1,4 @@ -/* $Id: snippet.c,v 1.12 2007-01-15 15:10:26 adam Exp $ +/* $Id: snippet.c,v 1.13 2007-08-21 11:06:47 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -21,6 +21,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include +#include #include #include #include @@ -47,28 +48,46 @@ void zebra_snippets_destroy(zebra_snippets *l) } void zebra_snippets_append(zebra_snippets *l, - zint seqno, int ord, const char *term) + zint seqno, int ws, int ord, const char *term) { - zebra_snippets_append_match(l, seqno, ord, term, 0); + zebra_snippets_append_match(l, seqno, ws, ord, term, strlen(term), 0); } +void zebra_snippets_appendn(zebra_snippets *l, + zint seqno, int ws, int ord, const char *term, + size_t term_len) +{ + 
zebra_snippets_append_match(l, seqno, ws, ord, term, term_len, 0); +} + + void zebra_snippets_append_match(zebra_snippets *l, - zint seqno, int ord, const char *term, + zint seqno, int ws, int ord, + const char *term, size_t term_len, int match) { struct zebra_snippet_word *w = nmem_malloc(l->nmem, sizeof(*w)); w->next = 0; + w->prev = l->tail; if (l->tail) + { l->tail->next = w; + } else + { l->front = w; + } l->tail = w; w->seqno = seqno; + w->ws = ws; w->ord = ord; - w->term = nmem_strdup(l->nmem, term); + w->term = nmem_malloc(l->nmem, term_len+1); + memcpy(w->term, term, term_len); + w->term[term_len] = '\0'; w->match = match; + w->mark = 0; } zebra_snippet_word *zebra_snippets_list(zebra_snippets *l) @@ -76,19 +95,28 @@ zebra_snippet_word *zebra_snippets_list(zebra_snippets *l) return l->front; } -void zebra_snippets_log(zebra_snippets *l, int log_level) +const zebra_snippet_word *zebra_snippets_constlist(const zebra_snippets *l) +{ + return l->front; +} + +void zebra_snippets_log(const zebra_snippets *l, int log_level, int all) { zebra_snippet_word *w; for (w = l->front; w; w = w->next) - yaz_log(log_level, "term=%s%s seqno=" ZINT_FORMAT " ord=%d", - w->term, (w->match ? "*" : ""), w->seqno, w->ord); + { + if (all || w->mark) + yaz_log(log_level, "term='%s'%s mark=%d seqno=" ZINT_FORMAT " ord=%d", + w->term, (w->match && !w->ws ? 
"*" : ""), w->mark, + w->seqno, w->ord); + } } -zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, +zebra_snippets *zebra_snippets_window(const zebra_snippets *doc, + const zebra_snippets *hit, int window_size) { int ord = -1; - zebra_snippets *result = zebra_snippets_create(); if (window_size == 0) window_size = 1000000; @@ -99,10 +127,10 @@ zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, zint first_seq_no_best_window = 0; zint last_seq_no_best_window = 0; int number_best_window = 0; - zebra_snippet_word *hit_w, *doc_w; + const zebra_snippet_word *hit_w, *doc_w; int min_ord = 0; /* not set yet */ - for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next) + for (hit_w = zebra_snippets_constlist(hit); hit_w; hit_w = hit_w->next) if (hit_w->ord > ord && (min_ord == 0 || hit_w->ord < min_ord)) { @@ -112,11 +140,11 @@ zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, break; ord = min_ord; - for (hit_w = zebra_snippets_list(hit); hit_w; hit_w = hit_w->next) + for (hit_w = zebra_snippets_constlist(hit); hit_w; hit_w = hit_w->next) { if (hit_w->ord == ord) { - zebra_snippet_word *look_w = hit_w; + const zebra_snippet_word *look_w = hit_w; int number_this = 0; zint seq_no_last = 0; while (look_w && look_w->seqno < hit_w->seqno + window_size) @@ -145,13 +173,13 @@ zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, window_start = (first_seq_no_best_window + last_seq_no_best_window - window_size) / 2; - for (doc_w = zebra_snippets_list(doc); doc_w; doc_w = doc_w->next) + for (doc_w = zebra_snippets_constlist(doc); doc_w; doc_w = doc_w->next) if (doc_w->ord == ord && doc_w->seqno >= window_start && doc_w->seqno < window_start + window_size) { int match = 0; - for (hit_w = zebra_snippets_list(hit); hit_w; + for (hit_w = zebra_snippets_constlist(hit); hit_w; hit_w = hit_w->next) { if (hit_w->ord == ord && hit_w->seqno == doc_w->seqno) @@ -162,11 
+190,97 @@ zebra_snippets *zebra_snippets_window(zebra_snippets *doc, zebra_snippets *hit, } } zebra_snippets_append_match(result, doc_w->seqno, - ord, doc_w->term, match); + doc_w->ws, + ord, doc_w->term, + strlen(doc_w->term), match); } } return result; } + +static void zebra_snippets_clear(zebra_snippets *sn) +{ + zebra_snippet_word *w; + + for (w = zebra_snippets_list(sn); w; w = w->next) + { + w->mark = 0; + w->match = 0; + } +} + +void zebra_snippets_ring(zebra_snippets *doc, const zebra_snippets *hit, + int before, int after) +{ + int ord = -1; + + zebra_snippets_clear(doc); + while (1) + { + const zebra_snippet_word *hit_w; + zebra_snippet_word *doc_w; + int min_ord = 0; /* not set yet */ + + for (hit_w = zebra_snippets_constlist(hit); hit_w; hit_w = hit_w->next) + if (hit_w->ord > ord && + (min_ord == 0 || hit_w->ord < min_ord)) + { + min_ord = hit_w->ord; + } + if (min_ord == 0) + break; + ord = min_ord; + + for (hit_w = zebra_snippets_constlist(hit); hit_w; hit_w = hit_w->next) + { + if (hit_w->ord == ord) + { + for (doc_w = zebra_snippets_list(doc); doc_w; doc_w = doc_w->next) + { + if (doc_w->ord == ord && doc_w->seqno == hit_w->seqno + && !doc_w->ws) + { + doc_w->match = 1; + doc_w->mark = 1; + break; + } + + } + /* mark following terms */ + if (doc_w) + { + zebra_snippet_word *w = doc_w->next; + while (w) + if (w->ord == ord + && hit_w->seqno - before < w->seqno + && hit_w->seqno + after > w->seqno) + { + w->mark = 1; + w = w->next; + } + else + break; + } + /* mark preceding terms */ + if (doc_w) + { + zebra_snippet_word *w = doc_w->prev; + while (w) + if (w->ord == ord + && hit_w->seqno - before < w->seqno + && hit_w->seqno + after > w->seqno) + { + w->mark = 1; + w = w->prev; + } + else + break; + } + } + } + } +} + /* * Local variables: -- 1.7.10.4