X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Frpnscan.c;h=670a30dbc6f92a488318fbec5438d1cf7eb3bd63;hb=bfe7ce5c9c47f3f4ad1ac76d4232b9807d5ee158;hp=487075fcd3f253e6b73d3ab797d937782f328210;hpb=2a0be28b861bf340b8213ad17438daf49e5cb8b0;p=idzebra-moved-to-github.git diff --git a/index/rpnscan.c b/index/rpnscan.c index 487075f..670a30d 100644 --- a/index/rpnscan.c +++ b/index/rpnscan.c @@ -1,5 +1,5 @@ -/* $Id: rpnscan.c,v 1.2 2006-09-21 10:10:07 adam Exp $ - Copyright (C) 1995-2006 +/* $Id: rpnscan.c,v 1.19 2007-11-01 14:56:07 adam Exp $ + Copyright (C) 1995-2007 Index Data ApS This file is part of the Zebra server. @@ -33,25 +33,19 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include "index.h" #include +#include #include #include #include +#include -struct scan_info_entry { - char *term; - ISAM_P isam_p; -}; +#define RPN_MAX_ORDS 32 -struct scan_info { - struct scan_info_entry *list; - ODR odr; - int before, after; - char prefix[20]; -}; +static int log_scan = YLOG_LOG; /* convert APT SCAN term to internal cmap */ static ZEBRA_RES trans_scan_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, - char *termz, int reg_type) + char *termz, zebra_map_t zm) { char termz0[IT_MAX_WORD]; @@ -69,7 +63,7 @@ static ZEBRA_RES trans_scan_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, while ((len = (cp_end - cp)) > 0) { - map = zebra_maps_input(zh->reg->zebra_maps, reg_type, &cp, len, 0); + map = zebra_maps_input(zm, &cp, len, 0); if (**map == *CHR_SPACE) space_map = *map; else @@ -87,7 +81,7 @@ static ZEBRA_RES trans_scan_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, return ZEBRA_OK; } -static void count_set(ZebraHandle zh, RSET rset, zint *count) +static void count_set(ZebraHandle zh, RSET rset, zint *count, zint approx_limit) { zint psysno = 0; struct it_key key; @@ -95,7 +89,7 @@ static void count_set(ZebraHandle zh, RSET rset, zint *count) yaz_log(YLOG_DEBUG, "count_set"); - rset->hits_limit = zh->approx_limit; + rset->hits_limit = approx_limit; *count = 0; rfd = rset_open(rset, RSETF_READ); @@ -108,319 +102,396 @@ static void count_set(ZebraHandle zh, RSET rset, zint *count) break; } } - rset_close (rfd); + rset_close(rfd); *count = rset->hits_count; } -static int scan_handle (char *name, const char *info, int pos, void *client) +static void get_first_snippet_from_rset(ZebraHandle zh, + RSET rset, zebra_snippets *snippets, + zint *sysno) { - int len_prefix, idx; - struct scan_info *scan_info = (struct scan_info *) client; + struct it_key key; + RSFD rfd; + TERMID termid; + size_t sysno_mem_index = 0; + + if (zh->m_staticrank) + sysno_mem_index = 1; + + yaz_log(YLOG_DEBUG, "get_first_snippet_from_rset"); + + rfd = rset_open(rset, RSETF_READ); + *sysno = 0; + while (rset_read(rfd, &key, &termid)) + { + if (key.mem[sysno_mem_index] != *sysno) + { + if (*sysno) + break; + *sysno = key.mem[sysno_mem_index]; + } + if (termid) + { + struct ord_list *ol; + for (ol = termid->ol; ol; ol = ol->next) + { + zebra_snippets_append(snippets, key.mem[key.len-1], 0, + ol->ord, termid->name); + } + } + } + rset_close(rfd); +} + +struct scan2_info_entry { + WRBUF term; + char prefix[20]; + ISAM_P isam_p; + int pos_to_save; + int ord; +}; + +static int scan_handle2(char *name, const char *info, int pos, void *client) +{ + int len_prefix; + struct scan2_info_entry *scan_info = (struct scan2_info_entry *) client; + + if (scan_info->pos_to_save != pos) + return 0; len_prefix = strlen(scan_info->prefix); - if (memcmp (name, scan_info->prefix, len_prefix)) + if (memcmp(name, scan_info->prefix, len_prefix)) return 1; - if (pos > 0) - idx = scan_info->after - pos + scan_info->before; - else - idx = - pos - 1; - /* skip special terms.. of no interest */ - if (name[len_prefix] < 4) + /* skip special terms such as first-in-field specials */ + if (name[len_prefix] < CHR_BASE_CHAR) return 1; - if (idx < 0) - return 0; - scan_info->list[idx].term = (char *) - odr_malloc(scan_info->odr, strlen(name + len_prefix)+1); - strcpy(scan_info->list[idx].term, name + len_prefix); - assert (*info == sizeof(ISAM_P)); - memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAM_P)); + wrbuf_rewind(scan_info->term); + wrbuf_puts(scan_info->term, name+len_prefix); + + assert(*info == sizeof(ISAM_P)); + memcpy(&scan_info->isam_p, info+1, sizeof(ISAM_P)); return 0; } -#define RPN_MAX_ORDS 32 - -static ZEBRA_RES rpn_scan_ver1(ZebraHandle zh, ODR stream, - Z_AttributesPlusTerm *zapt, - int *position, int *num_entries, - ZebraScanEntry **list, - int *is_partial, RSET limit_set, - int return_zero, - int index_type, int ord_no, int *ords) +static int scan_save_set(ZebraHandle zh, ODR stream, NMEM nmem, + struct rset_key_control *kc, + Z_AttributesPlusTerm *zapt, + RSET limit_set, + const char *term, + const char *index_type, + struct scan2_info_entry *ar, int ord_no, + ZebraScanEntry *glist, int pos) { - int pos = *position; - int num = *num_entries; - int before; - int after; int i; - struct scan_info *scan_info_array; - char termz[IT_MAX_WORD+20]; - ZebraScanEntry *glist; - NMEM rset_nmem = 0; - struct rset_key_control *kc = 0; - int ptr[RPN_MAX_ORDS]; - - before = pos-1; - if (before < 0) - before = 0; - after = 1+num-pos; - if (after < 0) - after = 0; - yaz_log(YLOG_DEBUG, "rpn_scan pos=%d num=%d before=%d " - "after=%d before+after=%d", - pos, num, before, after, before+after); - scan_info_array = (struct scan_info *) - odr_malloc(stream, ord_no * sizeof(*scan_info_array)); + RSET rset = 0; + zint approx_limit = zh->approx_limit; + AttrType global_hits_limit_attr; + int l; + attr_init_APT(&global_hits_limit_attr, zapt, 12); + + l = attr_find(&global_hits_limit_attr, NULL); + if (l != -1) + approx_limit = l; + for (i = 0; i < ord_no; i++) { - int j, prefix_len = 0; - int before_tmp = before, after_tmp = after; - struct scan_info *scan_info = scan_info_array + i; - struct rpn_char_map_info rcmi; + if (ar[i].isam_p && strcmp(wrbuf_cstr(ar[i].term), term) == 0) + { + struct ord_list *ol = ord_list_create(nmem); + RSET rset_t; + + ol = ord_list_append(nmem, ol, ar[i].ord); + + assert(ol); + rset_t = rset_trunc( + zh, &ar[i].isam_p, 1, + wrbuf_buf(ar[i].term), wrbuf_len(ar[i].term), + NULL, 1, zapt->term->which, nmem, + kc, kc->scope, ol, index_type, + 0 /* hits_limit_value */, + 0 /* term_ref_id_str */); + if (!rset) + rset = rset_t; + else + { + RSET rsets[2]; + + rsets[0] = rset; + rsets[1] = rset_t; + rset = rset_create_or(nmem, kc, kc->scope, 0 /* termid */, + 2, rsets); + } + ar[i].isam_p = 0; + } + } + if (rset) + { + zint count; + /* merge with limit_set if given */ + if (limit_set) + { + RSET rsets[2]; + rsets[0] = rset; + rsets[1] = rset_dup(limit_set); + + rset = rset_create_and(nmem, kc, kc->scope, 2, rsets); + } + /* count it */ + count_set(zh, rset, &count, approx_limit); - rpn_char_map_prepare (zh->reg, index_type, &rcmi); + if (pos != -1) + { + zint sysno; + int code = -1; + zebra_snippets *rec_snippets = zebra_snippets_create(); + zebra_snippets *hit_snippets = zebra_snippets_create(); - scan_info->before = before; - scan_info->after = after; - scan_info->odr = stream; + glist[pos].term = 0; + glist[pos].display_term = 0; + + get_first_snippet_from_rset(zh, rset, hit_snippets, &sysno); + if (sysno) + code = zebra_get_rec_snippets(zh, sysno, rec_snippets); + + if (code == 0) + { + const struct zebra_snippet_word *w = + zebra_snippets_lookup(rec_snippets, hit_snippets); + if (w) + { + glist[pos].display_term = odr_strdup(stream, w->term); + } + } + if (!glist[pos].term) + zebra_term_untrans_iconv(zh, stream->mem, index_type, + &glist[pos].term, term); + glist[pos].occurrences = count; + zebra_snippets_destroy(rec_snippets); + zebra_snippets_destroy(hit_snippets); + } + rset_delete(rset); + if (count > 0) + return 1; + else + return 0; + } + return 0; +} - scan_info->list = (struct scan_info_entry *) - odr_malloc(stream, (before+after) * sizeof(*scan_info->list)); - for (j = 0; jlist[j].term = NULL; +static ZEBRA_RES rpn_scan_norm(ZebraHandle zh, ODR stream, NMEM nmem, + struct rset_key_control *kc, + Z_AttributesPlusTerm *zapt, + int *position, int *num_entries, + ZebraScanEntry **list, + int *is_partial, RSET limit_set, + const char *index_type, + int ord_no, int *ords) +{ + struct scan2_info_entry *ar = nmem_malloc(nmem, sizeof(*ar) * ord_no); + struct rpn_char_map_info rcmi; + zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, index_type); + int i, dif; + int after_pos; + int pos = 0; - prefix_len += key_SU_encode (ords[i], termz + prefix_len); - termz[prefix_len] = 0; - strcpy(scan_info->prefix, termz); + ZebraScanEntry *glist = (ZebraScanEntry *) + odr_malloc(stream, *num_entries * sizeof(*glist)); - if (trans_scan_term(zh, zapt, termz+prefix_len, index_type) == - ZEBRA_FAIL) - return ZEBRA_FAIL; - - dict_scan(zh->reg->dict, termz, &before_tmp, &after_tmp, - scan_info, scan_handle); + *is_partial = 0; + if (*position > *num_entries+1) + { + *is_partial = 1; + *position = 1; + *num_entries = 0; + return ZEBRA_OK; } - glist = (ZebraScanEntry *) - odr_malloc(stream, (before+after)*sizeof(*glist)); + rpn_char_map_prepare(zh->reg, zm, &rcmi); - rset_nmem = nmem_create(); - kc = zebra_key_control_create(zh); + for (i = 0; i < ord_no; i++) + ar[i].term = wrbuf_alloc(); - /* consider terms after main term */ for (i = 0; i < ord_no; i++) - ptr[i] = before; - - *is_partial = 0; - for (i = 0; i= 0 && - (tst = scan_info_array[j].list[ptr[j]].term) && - (!mterm || strcmp (tst, mterm) < 0)) - { - j0 = j; - mterm = tst; - } + for (i = 0; i < ord_no; i++) + wrbuf_destroy(ar[i].term); + return ZEBRA_FAIL; } - if (j0 == -1) - break; /* no value found, stop */ + wrbuf_rewind(ar[i].term); + wrbuf_puts(ar[i].term, termz + prefix_len); + ar[i].isam_p = 0; + ar[i].ord = ords[i]; + } + /** deal with terms before position .. */ + /* the glist index starts at zero (unlike scan positions */ + for (pos = *position-2; pos >= 0; ) + { + const char *hi = 0; - /* get result set for first one , but only if it's within bounds */ - if (lo >= 0) - { - /* get result set for first term */ - zebra_term_untrans_iconv(zh, stream->mem, index_type, - &glist[lo].term, mterm); - rset = rset_trunc(zh, &scan_info_array[j0].list[ptr[j0]].isam_p, 1, - glist[lo].term, strlen(glist[lo].term), - NULL, 0, zapt->term->which, rset_nmem, - kc, kc->scope, 0, index_type, 0 /* hits_limit */, - 0 /* term_ref_id_str */); - } - ptr[j0]++; /* move index for this set .. */ - /* get result set for remaining scan terms */ - for (j = j0+1; j= 0 && - (tst = scan_info_array[j].list[ptr[j]].term) && - !strcmp (tst, mterm)) + if (ar[i].isam_p == 0) { - if (lo >= 0) - { - RSET rsets[2]; - - rsets[0] = rset; - rsets[1] = - rset_trunc( - zh, &scan_info_array[j].list[ptr[j]].isam_p, 1, - glist[lo].term, - strlen(glist[lo].term), NULL, 0, - zapt->term->which,rset_nmem, - kc, kc->scope, 0, index_type, 0 /* hits_limit */, - 0 /* term_ref_id_str */ ); - rset = rset_create_or(rset_nmem, kc, - kc->scope, 0 /* termid */, - 2, rsets); - } - ptr[j]++; + char termz[IT_MAX_WORD+20]; + int before = 1; + int after = 0; + + ar[i].pos_to_save = -1; + + strcpy(termz, ar[i].prefix); + strcat(termz, wrbuf_cstr(ar[i].term)); + dict_scan(zh->reg->dict, termz, &before, &after, + ar+i, scan_handle2); } } - if (lo >= 0) - { - zint count; - /* merge with limit_set if given */ - if (limit_set) - { - RSET rsets[2]; - rsets[0] = rset; - rsets[1] = rset_dup(limit_set); - - rset = rset_create_and(rset_nmem, kc, kc->scope, 2, rsets); - } - /* count it */ - count_set(zh, rset, &count); - glist[lo].occurrences = count; - rset_delete(rset); - } + /* get maximum after scan */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p + && (hi == 0 || strcmp(wrbuf_cstr(ar[i].term), hi) > 0)) + hi = wrbuf_cstr(ar[i].term); + } + if (!hi) + break; + if (scan_save_set(zh, stream, nmem, kc, zapt, limit_set, hi, + index_type, ar, ord_no, glist, + (pos >= 0 && pos < *num_entries) ? pos : -1)) + --pos; } - if (i < after) + /* see if we got all terms before.. */ + dif = 1 + pos; + if (dif > 0) { - *num_entries -= (after-i); + /* did not get all terms; adjust the real position and reduce + number of entries */ + yaz_log(YLOG_LOG, "before terms dif=%d", dif); + glist = glist + dif; + *num_entries -= dif; + *position -= dif; *is_partial = 1; - if (*num_entries < 0) - { - (*kc->dec)(kc); - nmem_destroy(rset_nmem); - *num_entries = 0; - return ZEBRA_OK; - } } - /* consider terms before main term */ - for (i = 0; i= 0 && - (tst = scan_info_array[j].list[before-1-ptr[j]].term) && - (!mterm || strcmp (tst, mterm) > 0)) - { - j0 = j; - mterm = tst; - } - } - if (j0 == -1) - break; - - zebra_term_untrans_iconv(zh, stream->mem, index_type, - &glist[lo].term, mterm); - - rset = rset_trunc - (zh, &scan_info_array[j0].list[before-1-ptr[j0]].isam_p, 1, - glist[lo].term, strlen(glist[lo].term), - NULL, 0, zapt->term->which, rset_nmem, - kc, kc->scope, 0, index_type, 0 /* hits_limit */, - 0 /* term_ref_id_str */); - - ptr[j0]++; - - for (j = j0+1; j= 0 && - (tst = scan_info_array[j].list[before-1-ptr[j]].term) && - !strcmp (tst, mterm)) - { - RSET rsets[2]; - - rsets[0] = rset; - rsets[1] = rset_trunc( - zh, - &scan_info_array[j].list[before-1-ptr[j]].isam_p, 1, - glist[lo].term, - strlen(glist[lo].term), NULL, 0, - zapt->term->which, rset_nmem, - kc, kc->scope, 0, index_type, 0 /* hits_limit */, - 0 /* term_ref_id_str */); - rset = rset_create_or(rset_nmem, kc, - kc->scope, 0 /* termid */, 2, rsets); - - ptr[j]++; - } - } - if (limit_set) - { - RSET rsets[2]; - rsets[0] = rset; - rsets[1] = rset_dup(limit_set); - - rset = rset_create_and(rset_nmem, kc, kc->scope, 2, rsets); - } - count_set(zh, rset, &count); - glist[lo].occurrences = count; - rset_delete (rset); + char termz[IT_MAX_WORD+20]; + int prefix_len = 0; + + prefix_len = key_SU_encode(ords[i], termz); + termz[prefix_len] = 0; + strcpy(ar[i].prefix, termz); + + if (trans_scan_term(zh, zapt, termz+prefix_len, zm) == + ZEBRA_FAIL) + return ZEBRA_FAIL; + wrbuf_rewind(ar[i].term); + wrbuf_puts(ar[i].term, termz + prefix_len); + ar[i].isam_p = 0; + ar[i].ord = ords[i]; } - (*kc->dec)(kc); - nmem_destroy(rset_nmem); - i = before-i; - if (i) + + after_pos = 1; /* immediate term first.. */ + for (pos = *position-1; pos < *num_entries; ) + { + const char *lo = 0; + + /* scan on all minimum terms */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p == 0) + { + char termz[IT_MAX_WORD+20]; + int before = 0; + int after = after_pos; + + ar[i].pos_to_save = 1; + + strcpy(termz, ar[i].prefix); + strcat(termz, wrbuf_cstr(ar[i].term)); + dict_scan(zh->reg->dict, termz, &before, &after, + ar+i, scan_handle2); + } + } + after_pos = 2; /* next round we grab following term */ + + /* get minimum after scan */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p + && (lo == 0 || strcmp(wrbuf_cstr(ar[i].term), lo) < 0)) + lo = wrbuf_cstr(ar[i].term); + } + if (!lo) + break; + if (scan_save_set(zh, stream, nmem, kc, zapt, limit_set, lo, + index_type, ar, ord_no, glist, + (pos >= 0 && pos < *num_entries) ? pos : -1)) + pos++; + + } + if (pos != *num_entries) { + if (pos >= 0) + *num_entries = pos; + else + *num_entries = 0; *is_partial = 1; - *position -= i; - *num_entries -= i; - if (*num_entries <= 0) - { - *num_entries = 0; - return ZEBRA_OK; - } } - - *list = glist + i; /* list is set to first 'real' entry */ - - yaz_log(YLOG_DEBUG, "position = %d, num_entries = %d", - *position, *num_entries); + + *list = glist; + + for (i = 0; i < ord_no; i++) + wrbuf_destroy(ar[i].term); + return ZEBRA_OK; } +struct scan1_info_entry { + char *term; + ISAM_P isam_p; +}; + +struct scan_info { + struct scan1_info_entry *list; + ODR odr; + int before, after; + char prefix[20]; +}; ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, - oid_value attributeset, + const Odr_oid *attributeset, int num_bases, char **basenames, int *position, int *num_entries, ZebraScanEntry **list, - int *is_partial, RSET limit_set, int return_zero) + int *is_partial, RSET limit_set) { int base_no; int ords[RPN_MAX_ORDS], ord_no = 0; - unsigned index_type; + const char *index_type; char *search_type = NULL; char rank_type[128]; int complete_flag; int sort_flag; + NMEM nmem; + ZEBRA_RES res; + struct rset_key_control *kc = 0; *list = 0; *is_partial = 0; - if (attributeset == VAL_NONE) - attributeset = VAL_BIB1; + if (!attributeset) + attributeset = yaz_oid_attset_bib_1; if (!limit_set) /* no limit set given already */ { @@ -438,19 +509,26 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, if (termset_value_numeric != -2) { - sprintf(resname, "%d", termset_value_numeric); termset_name = resname; } else termset_name = termset_value_string; - limit_set = resultSetRef (zh, termset_name); + limit_set = resultSetRef(zh, termset_name); + + if (!limit_set) + { + zebra_setError(zh, + YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST, + termset_name); + return ZEBRA_FAIL; + } } } - yaz_log(YLOG_DEBUG, "position = %d, num = %d set=%d", - *position, *num_entries, attributeset); + yaz_log(YLOG_DEBUG, "position = %d, num = %d", + *position, *num_entries); if (zebra_maps_attr(zh->reg->zebra_maps, zapt, &index_type, &search_type, rank_type, &complete_flag, &sort_flag)) @@ -469,7 +547,7 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, { int ord; - if (zebraExplain_curDatabase (zh->reg->zei, basenames[base_no])) + if (zebraExplain_curDatabase(zh->reg->zei, basenames[base_no])) { zebra_setError(zh, YAZ_BIB1_DATABASE_UNAVAILABLE, basenames[base_no]); @@ -486,15 +564,25 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, *num_entries = 0; /* zebra_apt_get_ord should set error reason */ return ZEBRA_FAIL; } - /* prepare dictionary scanning */ if (*num_entries < 1) { *num_entries = 0; - return ZEBRA_OK; + return ZEBRA_OK; } - return rpn_scan_ver1(zh, stream, zapt, position, num_entries, list, - is_partial, limit_set, return_zero, - index_type, ord_no, ords); + nmem = nmem_create(); + kc = zebra_key_control_create(zh); + + if (sort_flag) + res = rpn_facet(zh, stream, nmem, kc, zapt, position, num_entries, + list, + is_partial, limit_set, index_type, ord_no, ords); + else + res = rpn_scan_norm(zh, stream, nmem, kc, zapt, position, num_entries, + list, + is_partial, limit_set, index_type, ord_no, ords); + nmem_destroy(nmem); + (*kc->dec)(kc); + return res; } /*