From deb0cef3d4d19dc6508b2fed71711b3fb1be26a2 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Wed, 16 Aug 2006 13:16:35 +0000 Subject: [PATCH] Experimental segment facility (for matching of words within one field/segment). --- NEWS | 3 ++ configure.ac | 4 +- include/idzebra/recctrl.h | 3 +- include/idzebra/version.h | 6 +-- index/extract.c | 103 ++++++++++++++++----------------------------- index/index.h | 3 +- index/kcompare.c | 14 +++--- index/kcontrol.c | 15 ++++--- index/limit.c | 4 +- index/safari.c | 59 ++++++++++++++++++++------ index/zebraapi.c | 10 ++++- test/api/safari.cfg | 4 +- test/api/safari1.c | 52 +++++++++++------------ 13 files changed, 148 insertions(+), 132 deletions(-) diff --git a/NEWS b/NEWS index 3ef80ac..902a796 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +Experimental segment facility (for matching of words within one +field/segment). + --- 2.0.0 2006/08/14 New record filter (record type) 'alvis' which uses XSLT transformations diff --git a/configure.ac b/configure.ac index 749280a..0dd501c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,8 @@ dnl Zebra, Index Data ApS, 1995-2006 -dnl $Id: configure.ac,v 1.26 2006-08-14 12:18:46 adam Exp $ +dnl $Id: configure.ac,v 1.27 2006-08-16 13:16:35 adam Exp $ dnl AC_PREREQ(2.59) -AC_INIT([idzebra],[2.0.0],[adam@indexdata.dk]) +AC_INIT([idzebra],[2.0.1],[adam@indexdata.dk]) AC_CONFIG_SRCDIR(configure.ac) AC_CONFIG_AUX_DIR(config) AM_INIT_AUTOMAKE([1.8]) diff --git a/include/idzebra/recctrl.h b/include/idzebra/recctrl.h index c202d8d..3c45251 100644 --- a/include/idzebra/recctrl.h +++ b/include/idzebra/recctrl.h @@ -1,4 +1,4 @@ -/* $Id: recctrl.h,v 1.27 2006-08-15 14:28:33 adam Exp $ +/* $Id: recctrl.h,v 1.28 2006-08-16 13:16:35 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -55,6 +55,7 @@ typedef struct { const char *term_buf; int term_len; zint seqno; + zint segment; zint record_id; zint section_id; struct recExtractCtrl *extractCtrl; diff --git a/include/idzebra/version.h b/include/idzebra/version.h index f3a7283..491b537 100644 --- a/include/idzebra/version.h +++ b/include/idzebra/version.h @@ -1,4 +1,4 @@ -/* $Id: version.h,v 1.7 2006-08-14 10:40:14 adam Exp $ +/* $Id: version.h,v 1.8 2006-08-16 13:16:35 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -22,9 +22,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #ifndef ZEBRAVER -#define ZEBRAVER "2.0.0" +#define ZEBRAVER "2.0.1" -#define ZEBRADATE "$Date: 2006-08-14 10:40:14 $" +#define ZEBRADATE "$Date: 2006-08-16 13:16:35 $" #endif /* diff --git a/index/extract.c b/index/extract.c index 4b76e03..e4973ab 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.226 2006-08-15 14:28:33 adam Exp $ +/* $Id: extract.c,v 1.227 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -112,7 +112,7 @@ static void extract_add_index_string (RecWord *p, static void extract_set_store_data_prepare(struct recExtractCtrl *p); -static void extract_init (struct recExtractCtrl *p, RecWord *w) +static void extract_init(struct recExtractCtrl *p, RecWord *w) { w->seqno = 1; w->index_name = "any"; @@ -120,6 +120,7 @@ static void extract_init (struct recExtractCtrl *p, RecWord *w) w->extractCtrl = p; w->record_id = 0; w->section_id = 0; + w->segment = 0; } static void searchRecordKey(ZebraHandle zh, @@ -153,7 +154,7 @@ static void searchRecordKey(ZebraHandle zh, zint seqno; while (zebra_rec_keys_read(reckeys, &str, &slen, &key)) { - assert(key.len <= 4 && key.len > 2); + assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); seqno = key.mem[key.len-1]; @@ -1374,10 +1375,11 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) { int ch = 0; + int i, j = 0; struct it_key key_out; - zint *keyp = key_out.mem; - assert(key_in.len == 4); + assert(key_in.len >= 2); + assert(key_in.len <= IT_KEY_LEVEL_MAX); /* check for buffer overflow */ if (zh->reg->key_buf_used + 1024 > @@ -1389,6 +1391,9 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] = (char*)zh->reg->key_buf + zh->reg->key_buf_used; + /* key_in.mem[0] ord/ch */ + /* key_in.mem[1] filter specified record ID */ + /* encode the ordinal value (field/use/attribute) .. */ ch = CAST_ZINT_TO_INT(key_in.mem[0]); zh->reg->key_buf_used += @@ -1411,19 +1416,17 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, (long) staticrank); staticrank = 0; } - *keyp++ = staticrank; - key_out.len = 4; + key_out.mem[j++] = staticrank; } - else - key_out.len = 3; if (key_in.mem[1]) /* filter specified record ID */ - *keyp++ = key_in.mem[1]; + key_out.mem[j++] = key_in.mem[1]; else - *keyp++ = sysno; - *keyp++ = key_in.mem[2]; /* section_id */ - *keyp++ = key_in.mem[3]; /* sequence .. */ - + key_out.mem[j++] = sysno; + for (i = 2; i < key_in.len; i++) + key_out.mem[j++] = key_in.mem[i]; + key_out.len = j; + memcpy((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key_out, sizeof(key_out)); (zh->reg->key_buf_used) += sizeof(key_out); @@ -1578,7 +1581,7 @@ ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, zint seqno; int index_type; - assert(key.len <= 4 && key.len > 2); + assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); seqno = key.mem[key.len-1]; ord = CAST_ZINT_TO_INT(key.mem[0]); @@ -1610,7 +1613,7 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys) int index_type; int ord = CAST_ZINT_TO_INT(key.mem[0]); const char *db = 0; - assert(key.len <= 4 && key.len > 2); + assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0); @@ -1630,37 +1633,21 @@ static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat, struct it_key key; ZebraHandle zh = p->extractCtrl->handle; ZebraExplainInfo zei = zh->reg->zei; - int ch; - - if (!p->index_name) - return; + int ch, i; ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name); if (ch < 0) ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name); - key.len = 4; - key.mem[0] = ch; - key.mem[1] = p->record_id; - key.mem[2] = p->section_id; - key.mem[3] = p->seqno; + i = 0; + key.mem[i++] = ch; + key.mem[i++] = p->record_id; + key.mem[i++] = p->section_id; -#if 0 - if (1) - { - char strz[80]; - int i; - - strz[0] = 0; - for (i = 0; iattrSet, p->attrUse, p->record_id, p->section_id, p->seqno, - strz); - } -#endif + if (zh->m_segment_indexing) + key.mem[i++] = p->segment; + key.mem[i++] = p->seqno; + key.len = i; zebra_rec_keys_write(zh->reg->keys, str, length, &key); } @@ -1672,45 +1659,27 @@ static void extract_add_sort_string(RecWord *p, const char *str, int length) ZebraExplainInfo zei = zh->reg->zei; int ch; zinfo_index_category_t cat = zinfo_index_category_sort; - - - if (!p->index_name) - return; ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name); if (ch < 0) ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name); - key.len = 4; + key.len = 2; key.mem[0] = ch; key.mem[1] = p->record_id; - key.mem[2] = p->section_id; - key.mem[3] = p->seqno; -#if 0 - if (1) - { - char strz[80]; - int i; - - strz[0] = 0; - for (i = 0; iattrSet, p->attrUse, p->record_id, p->section_id, p->seqno, - strz); - } -#endif zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key); } -static void extract_add_string (RecWord *p, const char *string, int length) +static void extract_add_string(RecWord *p, const char *string, int length) { ZebraHandle zh = p->extractCtrl->handle; assert (length > 0); - if (zebra_maps_is_sort (zh->reg->zebra_maps, p->index_type)) - extract_add_sort_string (p, string, length); + + if (!p->index_name) + return; + + if (zebra_maps_is_sort(zh->reg->zebra_maps, p->index_type)) + extract_add_sort_string(p, string, length); else { extract_add_index_string(p, zinfo_index_category_index, diff --git a/index/index.h b/index/index.h index 1e8fa72..be2e858 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.172 2006-08-15 14:28:34 adam Exp $ +/* $Id: index.h,v 1.173 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -243,6 +243,7 @@ struct zebra_session { int shadow_enable; int m_staticrank; + int m_segment_indexing; zint records_inserted; zint records_updated; diff --git a/index/kcompare.c b/index/kcompare.c index 3fd1c7e..00050c7 100644 --- a/index/kcompare.c +++ b/index/kcompare.c @@ -1,4 +1,4 @@ -/* $Id: kcompare.c,v 1.60 2006-08-14 10:40:15 adam Exp $ +/* $Id: kcompare.c,v 1.61 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -68,7 +68,7 @@ int key_compare_it (const void *p1, const void *p2) int i, l = ((struct it_key *) p1)->len; if (((struct it_key *) p2)->len > l) l = ((struct it_key *) p2)->len; - assert (l <= 4 && l > 0); + assert (l <= IT_KEY_LEVEL_MAX && l > 0); for (i = 0; i < l; i++) { if (((struct it_key *) p1)->mem[i] != ((struct it_key *) p2)->mem[i]) @@ -97,7 +97,7 @@ int key_compare (const void *p1, const void *p2) l = i1.len; if (i2.len > l) l = i2.len; - assert (l <= 4 && l > 0); + assert (l <= IT_KEY_LEVEL_MAX && l > 0); for (i = 0; i < l; i++) { if (i1.mem[i] != i2.mem[i]) @@ -122,7 +122,7 @@ zint key_get_segment(const void *p) { struct it_key k; memcpy (&k, p, sizeof(k)); - return k.mem[k.len-1] / KEY_SEGMENT_SIZE; + return k.mem[k.len-2]; } int key_qsort_compare (const void *p1, const void *p2) @@ -156,7 +156,7 @@ void key_init(struct it_key *key) { int i; key->len = 0; - for (i = 0; imem[i] = 0; } @@ -165,7 +165,7 @@ void iscz1_reset (void *vp) struct iscz1_code_info *p = (struct iscz1_code_info *) vp; int i; p->key.len = 0; - for (i = 0; i< IT_KEY_LEVEL_MAX; i++) + for (i = 0; i < IT_KEY_LEVEL_MAX; i++) p->key.mem[i] = 0; } @@ -223,7 +223,7 @@ void iscz1_encode (void *vp, char **dst, const char **src) /* deal with leader + delta encoding .. */ d = 0; - assert(tkey.len > 0 && tkey.len <= 4); + assert(tkey.len > 0 && tkey.len <= IT_KEY_LEVEL_MAX); for (i = 0; i < tkey.len; i++) { d = tkey.mem[i] - p->key.mem[i]; diff --git a/index/kcontrol.c b/index/kcontrol.c index 4594a90..e5122a7 100644 --- a/index/kcontrol.c +++ b/index/kcontrol.c @@ -1,4 +1,4 @@ -/* $Id: kcontrol.c,v 1.4 2006-08-14 10:40:15 adam Exp $ +/* $Id: kcontrol.c,v 1.5 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -62,16 +62,21 @@ struct rset_key_control *zebra_key_control_create(ZebraHandle zh) kc->context = cp; kc->key_size = sizeof(struct it_key); - kc->scope = 2; kc->cmp = key_compare_it; kc->key_logdump_txt = key_logdump_txt; kc->getseq = key_get_seq; - res_val = zebra_get_resource(zh, "segment", 0); - kc->get_segment = 0; - if (res_val && atoi(res_val)) + + if (zh->m_segment_indexing) { + kc->scope = 3; /* segment + seq is "same" record */ kc->get_segment = key_get_segment; } + else + { + kc->scope = 2; /* seq is "same" record */ + kc->get_segment = 0; + } + zebra_limit_for_rset(zh->m_limit, &kc->filter_func, &cp->filter_destroy, diff --git a/index/limit.c b/index/limit.c index 577167b..d418bca 100644 --- a/index/limit.c +++ b/index/limit.c @@ -1,4 +1,4 @@ -/* $Id: limit.c,v 1.8 2006-08-14 10:40:15 adam Exp $ +/* $Id: limit.c,v 1.9 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -68,8 +68,6 @@ static int zebra_limit_filter_cb(const void *buf, void *data) #if ZEBRA_LIMIT_DEBUG yaz_log(YLOG_LOG, "zebra_limit_filter_cb zl=%p key->len=%d", zl, key->len); #endif - if (key->len != 3) - return 1; for (i = 0; zl->ids[i]; i++) { #if ZEBRA_LIMIT_DEBUG diff --git a/index/safari.c b/index/safari.c index 056b4ce..8327bda 100644 --- a/index/safari.c +++ b/index/safari.c @@ -1,4 +1,4 @@ -/* $Id: safari.c,v 1.2 2006-08-14 10:40:15 adam Exp $ +/* $Id: safari.c,v 1.3 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -29,13 +29,20 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include struct filter_info { - char *sep; + int segments; }; -static void *filter_init (Res res, RecType recType) +static void *filter_init(Res res, RecType recType) { struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo)); - tinfo->sep = 0; + tinfo->segments = 0; + return tinfo; +} + +static void *filter_init2(Res res, RecType recType) +{ + struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo)); + tinfo->segments = 1; return tinfo; } @@ -47,7 +54,6 @@ static ZEBRA_RES filter_config(void *clientData, Res res, const char *args) static void filter_destroy(void *clientData) { struct filter_info *tinfo = clientData; - xfree (tinfo->sep); xfree (tinfo); } @@ -118,8 +124,6 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p) yaz_log(YLOG_LOG, "filter_extract off=%ld", (long) (*fi->p->tellf)(fi->p->fh)); #endif - xfree(tinfo->sep); - tinfo->sep = 0; (*p->init)(p, &recWord); if (!fi_gets(fi, line, sizeof(line)-1)) @@ -135,13 +139,29 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p) #if 0 yaz_log(YLOG_LOG, "safari line: %s", line); #endif - if (sscanf(line, ZINT_FORMAT " " ZINT_FORMAT " " ZINT_FORMAT " %39s %n", - &recWord.record_id, &recWord.section_id, &recWord.seqno, - field, &nor) < 4) - { - yaz_log(YLOG_WARN, "Bad safari record line: %s", line); - return RECCTRL_EXTRACT_ERROR_GENERIC; - } + if (tinfo->segments) + { + if (sscanf(line, ZINT_FORMAT " " ZINT_FORMAT " " ZINT_FORMAT + ZINT_FORMAT " %39s %n", + &recWord.record_id, &recWord.section_id, + &recWord.segment, + &recWord.seqno, + field, &nor) < 5) + { + yaz_log(YLOG_WARN, "Bad safari record line: %s", line); + return RECCTRL_EXTRACT_ERROR_GENERIC; + } + } + else + { + if (sscanf(line, ZINT_FORMAT " " ZINT_FORMAT " " ZINT_FORMAT " %39s %n", + &recWord.record_id, &recWord.section_id, &recWord.seqno, + field, &nor) < 4) + { + yaz_log(YLOG_WARN, "Bad safari record line: %s", line); + return RECCTRL_EXTRACT_ERROR_GENERIC; + } + } for (cp = line + nor; *cp == ' '; cp++) ; recWord.index_name = field; @@ -258,6 +278,16 @@ static struct recType filter_type = { filter_retrieve }; +static struct recType filter_type2 = { + 0, + "safari2", + filter_init2, + filter_config, + filter_destroy, + filter_extract, + filter_retrieve +}; + RecType #ifdef IDZEBRA_STATIC_SAFARI idzebra_filter_safari @@ -267,6 +297,7 @@ idzebra_filter [] = { &filter_type, + &filter_type2, 0, }; /* diff --git a/index/zebraapi.c b/index/zebraapi.c index a1673f7..61268c2 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.224 2006-08-14 10:40:15 adam Exp $ +/* $Id: zebraapi.c,v 1.225 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -140,6 +140,7 @@ ZebraHandle zebra_open(ZebraService zs, Res res) zh->shadow_enable = 1; zh->m_staticrank = 0; + zh->m_segment_indexing = 0; default_encoding = res_get_def(zh->session_res, "encoding", "ISO-8859-1"); @@ -767,6 +768,13 @@ static void zebra_select_register (ZebraHandle zh, const char *new_reg) if (res_get_int(zh->res, "staticrank", &zh->m_staticrank) == ZEBRA_OK) yaz_log(YLOG_LOG, "static rank set and is %d", zh->m_staticrank); } + if (zh->res) + { + if (res_get_int(zh->res, "segment", &zh->m_segment_indexing) == + ZEBRA_OK) + yaz_log(YLOG_LOG, "segment indexing set and is %d", + zh->m_segment_indexing); + } } void map_basenames_func (void *vp, const char *name, const char *value) diff --git a/test/api/safari.cfg b/test/api/safari.cfg index 557ab57..ae323f9 100644 --- a/test/api/safari.cfg +++ b/test/api/safari.cfg @@ -1,9 +1,9 @@ -# $Id: safari.cfg,v 1.4 2006-07-04 14:10:32 adam Exp $ +# $Id: safari.cfg,v 1.5 2006-08-16 13:16:37 adam Exp $ profilepath: ${srcdir:-.}/../../tab attset: bib1.att -recordType: safari +recordType: safari2 segment: 1024 diff --git a/test/api/safari1.c b/test/api/safari1.c index 9c71bf8..dbe2fde 100644 --- a/test/api/safari1.c +++ b/test/api/safari1.c @@ -1,4 +1,4 @@ -/* $Id: safari1.c,v 1.13 2006-08-14 10:40:22 adam Exp $ +/* $Id: safari1.c,v 1.14 2006-08-16 13:16:37 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -28,43 +28,43 @@ const char *myrec[] = { "1234\n" /* ID first record */ /* chunk owner seq idx term */ - "00024338 125060 1 any the\n" - "00024338 125060 2 any art\n" - "00024338 125060 3 any mand\n" + "00024338 125060 0 1 any the\n" + "00024338 125060 0 2 any art\n" + "00024338 125060 0 3 any mand\n" , "5678\n" /* other record - same owner id */ - "00024339 125060 1 any den\n" - "00024339 125060 2 any gamle\n" - "00024339 125060 3 any mand\n" + "00024339 125060 0 1 any den\n" + "00024339 125060 0 2 any gamle\n" + "00024339 125060 0 3 any mand\n" , "5678\n" /* same record chunk id as before .. */ - "00024339 125060 1 any the\n" - "00024339 125060 2 any gamle\n" - "00024339 125060 3 any mand\n" + "00024339 125060 0 1 any the\n" + "00024339 125060 0 2 any gamle\n" + "00024339 125060 0 3 any mand\n" , "1000\n" /* separate record */ - "00024339 125061 1 any the\n" - "00024339 125061 2 any gamle\n" - "00024339 125061 3 any mand\n" + "00024339 125061 0 1 any the\n" + "00024339 125061 0 2 any gamle\n" + "00024339 125061 0 3 any mand\n" , "1001\n" /* separate record */ - "00024340 125062 1 any the\n" - "00024340 125062 1 any the\n" /* DUP KEY, bug #432 */ - "00024340 125062 2 any old\n" - "00024340 125062 3 any mand\n" + "00024340 125062 0 1 any the\n" + "00024340 125062 0 1 any the\n" /* DUP KEY, bug #432 */ + "00024340 125062 0 2 any old\n" + "00024340 125062 0 3 any mand\n" , "1002\n" /* segment testing record */ - "00024341 125062 1 title a\n" - "00024341 125062 2 title b\n" + "00024341 125062 0 1 title a\n" + "00024341 125062 0 2 title b\n" - "00024341 125062 1024 title b\n" - "00024341 125062 1025 title c\n" - "00024341 125062 1026 title d\n" - "00024341 125062 1027 title e\n" - "00024341 125062 1028 title f\n" + "00024341 125062 1 1024 title b\n" + "00024341 125062 1 1025 title c\n" + "00024341 125062 1 1026 title d\n" + "00024341 125062 1 1027 title e\n" + "00024341 125062 1 1028 title f\n" - "00024341 125062 2048 title g\n" - "00024341 125062 2049 title c\n" + "00024341 125062 2 2048 title g\n" + "00024341 125062 2 2049 title c\n" , 0 -- 1.7.10.4