X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Frpnscan.c;h=02bf5dabf8fa98d3386ab5eb82b75978fe94405d;hb=918c8b1ec479083d82c390d5dceb4899654cb666;hp=ad68857a36deb16bdbc5138df23c679d85396340;hpb=1ab2b4589da1d33372cc1f9a87afdac160ca11de;p=idzebra-moved-to-github.git diff --git a/index/rpnscan.c b/index/rpnscan.c index ad68857..02bf5da 100644 --- a/index/rpnscan.c +++ b/index/rpnscan.c @@ -1,4 +1,4 @@ -/* $Id: rpnscan.c,v 1.1 2006-09-21 08:56:52 adam Exp $ +/* $Id: rpnscan.c,v 1.3 2006-09-21 20:22:34 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -33,21 +33,14 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include "index.h" #include +#include #include #include #include -struct scan_info_entry { - char *term; - ISAM_P isam_p; -}; +#define RPN_MAX_ORDS 32 -struct scan_info { - struct scan_info_entry *list; - ODR odr; - int before, after; - char prefix[20]; -}; +int log_scan = YLOG_LOG; /* convert APT SCAN term to internal cmap */ static ZEBRA_RES trans_scan_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, @@ -112,131 +105,321 @@ static void count_set(ZebraHandle zh, RSET rset, zint *count) *count = rset->hits_count; } -static int scan_handle (char *name, const char *info, int pos, void *client) +struct scan2_info_entry { + WRBUF term; + char prefix[20]; + ISAM_P isam_p; + int pos_to_save; +}; + +static int scan_handle2(char *name, const char *info, int pos, void *client) { - int len_prefix, idx; - struct scan_info *scan_info = (struct scan_info *) client; + int len_prefix; + struct scan2_info_entry *scan_info = (struct scan2_info_entry *) client; + + if (scan_info->pos_to_save != pos) + return 0; len_prefix = strlen(scan_info->prefix); if (memcmp (name, scan_info->prefix, len_prefix)) return 1; - if (pos > 0) - idx = scan_info->after - pos + scan_info->before; - else - idx = - pos - 1; - /* skip special terms.. of no interest */ - if (name[len_prefix] < 4) + /* skip special terms such as first-in-field specials */ + if (name[len_prefix] < CHR_BASE_CHAR) return 1; - if (idx < 0) - return 0; - scan_info->list[idx].term = (char *) - odr_malloc(scan_info->odr, strlen(name + len_prefix)+1); - strcpy(scan_info->list[idx].term, name + len_prefix); + wrbuf_rewind(scan_info->term); + wrbuf_puts(scan_info->term, name+len_prefix); + assert (*info == sizeof(ISAM_P)); - memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAM_P)); + memcpy (&scan_info->isam_p, info+1, sizeof(ISAM_P)); return 0; } -#define RPN_MAX_ORDS 32 - -ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, - oid_value attributeset, - int num_bases, char **basenames, - int *position, int *num_entries, ZebraScanEntry **list, - int *is_partial, RSET limit_set, int return_zero) +static int scan_save_set(ZebraHandle zh, ODR stream, NMEM nmem, + struct rset_key_control *kc, + Z_AttributesPlusTerm *zapt, + RSET limit_set, + const char *term, + int index_type, + struct scan2_info_entry *ar, int ord_no, + ZebraScanEntry *glist, int pos) { int i; - int pos = *position; - int num = *num_entries; - int before; - int after; - int base_no; - char termz[IT_MAX_WORD+20]; - struct scan_info *scan_info_array; - ZebraScanEntry *glist; - int ords[RPN_MAX_ORDS], ord_no = 0; - int ptr[RPN_MAX_ORDS]; - - unsigned index_type; - char *search_type = NULL; - char rank_type[128]; - int complete_flag; - int sort_flag; - NMEM rset_nmem = NULL; - struct rset_key_control *kc = 0; - - *list = 0; - *is_partial = 0; - - if (attributeset == VAL_NONE) - attributeset = VAL_BIB1; - - if (!limit_set) + RSET rset = 0; + for (i = 0; i < ord_no; i++) { - AttrType termset; - int termset_value_numeric; - const char *termset_value_string; - attr_init_APT(&termset, zapt, 8); - termset_value_numeric = - attr_find_ex(&termset, NULL, &termset_value_string); - if (termset_value_numeric != -1) + if (ar[i].isam_p && strcmp(wrbuf_buf(ar[i].term), term) == 0) { - char resname[32]; - const char *termset_name = 0; - - if (termset_value_numeric != -2) + RSET rset_t = rset_trunc( + zh, &ar[i].isam_p, 1, + wrbuf_buf(ar[i].term), wrbuf_len(ar[i].term), + NULL, 0, zapt->term->which, nmem, + kc, kc->scope, 0, index_type, + 0 /* hits_limit */, + 0 /* term_ref_id_str */); + if (!rset) + rset = rset_t; + else { + RSET rsets[2]; - sprintf(resname, "%d", termset_value_numeric); - termset_name = resname; + rsets[0] = rset; + rsets[1] = rset_t; + rset = rset_create_or(nmem, kc, kc->scope, 0 /* termid */, + 2, rsets); } - else - termset_name = termset_value_string; + ar[i].isam_p = 0; + } + } + if (rset) + { + zint count; + /* merge with limit_set if given */ + if (limit_set) + { + RSET rsets[2]; + rsets[0] = rset; + rsets[1] = rset_dup(limit_set); - limit_set = resultSetRef (zh, termset_name); + rset = rset_create_and(nmem, kc, kc->scope, 2, rsets); + } + /* count it */ + count_set(zh, rset, &count); + rset_delete(rset); + if (count > 0) + { + if (pos != -1) + { + zebra_term_untrans_iconv(zh, stream->mem, index_type, + &glist[pos].term, term); + glist[pos].occurrences = count; + } + return 1; } } + return 0; +} - yaz_log(YLOG_DEBUG, "position = %d, num = %d set=%d", - pos, num, attributeset); - - if (zebra_maps_attr(zh->reg->zebra_maps, zapt, &index_type, &search_type, - rank_type, &complete_flag, &sort_flag)) +static ZEBRA_RES rpn_scan_ver2(ZebraHandle zh, ODR stream, NMEM nmem, + struct rset_key_control *kc, + Z_AttributesPlusTerm *zapt, + int *position, int *num_entries, + ZebraScanEntry **list, + int *is_partial, RSET limit_set, + int index_type, int ord_no, int *ords) +{ + struct scan2_info_entry *ar = nmem_malloc(nmem, sizeof(*ar) * ord_no); + struct rpn_char_map_info rcmi; + int i, dif; + int pos = 0; + + ZebraScanEntry *glist = (ZebraScanEntry *) + odr_malloc(stream, *num_entries * sizeof(*glist)); + + *is_partial = 0; + if (*position > *num_entries+1) { + *is_partial = 1; + *position = 1; *num_entries = 0; - zebra_setError(zh, YAZ_BIB1_UNSUPP_ATTRIBUTE_TYPE, 0); - return ZEBRA_FAIL; + return ZEBRA_OK; } - for (base_no = 0; base_no < num_bases && ord_no < RPN_MAX_ORDS; base_no++) + rpn_char_map_prepare (zh->reg, index_type, &rcmi); + + for (i = 0; i < ord_no; i++) + ar[i].term = wrbuf_alloc(); + + for (i = 0; i < ord_no; i++) { - int ord; + char termz[IT_MAX_WORD+20]; + int prefix_len = 0; + + prefix_len = key_SU_encode (ords[i], termz); + termz[prefix_len] = 0; + strcpy(ar[i].prefix, termz); + + if (trans_scan_term(zh, zapt, termz+prefix_len, index_type) == + ZEBRA_FAIL) + return ZEBRA_FAIL; + wrbuf_rewind(ar[i].term); + wrbuf_puts(ar[i].term, termz + prefix_len); + ar[i].isam_p = 0; + } + /** deal with terms before position .. */ + /* the glist index starts at zero (unlike scan positions */ + for (pos = *position-2; pos >= 0; ) + { + const char *hi = 0; - if (zebraExplain_curDatabase (zh->reg->zei, basenames[base_no])) - { - zebra_setError(zh, YAZ_BIB1_DATABASE_UNAVAILABLE, - basenames[base_no]); - *num_entries = 0; - return ZEBRA_FAIL; - } - if (zebra_apt_get_ord(zh, zapt, index_type, 0, attributeset, &ord) - != ZEBRA_OK) - continue; - ords[ord_no++] = ord; + /* scan on all maximum terms */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p == 0) + { + char termz[IT_MAX_WORD+20]; + int before = 1; + int after = 0; + + ar[i].pos_to_save = -1; + + strcpy(termz, ar[i].prefix); + strcat(termz, wrbuf_buf(ar[i].term)); + dict_scan(zh->reg->dict, termz, &before, &after, + ar+i, scan_handle2); + } + } + /* get maximum after scan */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p + && (hi == 0 || strcmp(wrbuf_buf(ar[i].term), hi) > 0)) + hi = wrbuf_buf(ar[i].term); + } + if (!hi) + break; + if (scan_save_set(zh, stream, nmem, kc, zapt, limit_set, hi, + index_type, ar, ord_no, glist, + (pos >= 0 && pos < *num_entries) ? pos : -1)) + --pos; } - if (ord_no == 0) + /* see if we got all terms before.. */ + dif = 1 + pos; + if (dif > 0) { - *num_entries = 0; - return ZEBRA_FAIL; + /* did not get all terms; adjust the real position and reduce + number of entries */ + yaz_log(YLOG_LOG, "before terms dif=%d", dif); + glist = glist + dif; + *num_entries -= dif; + *position -= dif; + *is_partial = 1; } - /* prepare dictionary scanning */ - if (num < 1) + for (i = 0; i < ord_no; i++) { - *num_entries = 0; - return ZEBRA_OK; + char termz[IT_MAX_WORD+20]; + int prefix_len = 0; + + prefix_len = key_SU_encode (ords[i], termz); + termz[prefix_len] = 0; + strcpy(ar[i].prefix, termz); + + if (trans_scan_term(zh, zapt, termz+prefix_len, index_type) == + ZEBRA_FAIL) + return ZEBRA_FAIL; + wrbuf_rewind(ar[i].term); + wrbuf_puts(ar[i].term, termz + prefix_len); + ar[i].isam_p = 0; } + + for (pos = *position-1; pos < *num_entries; ) + { + const char *lo = 0; + + /* scan on all minimum terms */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p == 0) + { + char termz[IT_MAX_WORD+20]; + int before = 0; + int after = (pos == *position-1) ? 1 : 2; + + ar[i].pos_to_save = 1; + + strcpy(termz, ar[i].prefix); + strcat(termz, wrbuf_buf(ar[i].term)); + dict_scan(zh->reg->dict, termz, &before, &after, + ar+i, scan_handle2); + } + } + /* get minimum after scan */ + for (i = 0; i < ord_no; i++) + { + if (ar[i].isam_p + && (lo == 0 || strcmp(wrbuf_buf(ar[i].term), lo) < 0)) + lo = wrbuf_buf(ar[i].term); + } + if (!lo) + break; + if (scan_save_set(zh, stream, nmem, kc, zapt, limit_set, lo, + index_type, ar, ord_no, glist, + (pos >= 0 && pos < *num_entries) ? pos : -1)) + pos++; + + } + if (pos != *num_entries) + { + if (pos >= 0) + *num_entries = pos; + else + *num_entries = 0; + *is_partial = 1; + } + + *list = glist; + + return ZEBRA_OK; +} + +struct scan1_info_entry { + char *term; + ISAM_P isam_p; +}; + +struct scan_info { + struct scan1_info_entry *list; + ODR odr; + int before, after; + char prefix[20]; +}; + +static int scan_handle1(char *name, const char *info, int pos, void *client) +{ + int len_prefix, idx; + struct scan_info *scan_info = (struct scan_info *) client; + + len_prefix = strlen(scan_info->prefix); + if (memcmp (name, scan_info->prefix, len_prefix)) + return 1; + if (pos > 0) + idx = scan_info->after - pos + scan_info->before; + else + idx = - pos - 1; + + /* skip special terms such as first-in-field specials */ + if (name[len_prefix] < CHR_BASE_CHAR) + return 1; + + if (idx < 0) + return 0; + scan_info->list[idx].term = (char *) + odr_malloc(scan_info->odr, strlen(name + len_prefix)+1); + strcpy(scan_info->list[idx].term, name + len_prefix); + assert (*info == sizeof(ISAM_P)); + memcpy (&scan_info->list[idx].isam_p, info+1, sizeof(ISAM_P)); + return 0; +} + +static ZEBRA_RES rpn_scan_ver1(ZebraHandle zh, ODR stream, NMEM rset_nmem, + struct rset_key_control *kc, + Z_AttributesPlusTerm *zapt, + int *position, int *num_entries, + ZebraScanEntry **list, + int *is_partial, RSET limit_set, + int index_type, int ord_no, int *ords) +{ + int pos = *position; + int num = *num_entries; + int before; + int after; + int i; + struct scan_info *scan_info_array; + char termz[IT_MAX_WORD+20]; + ZebraScanEntry *glist; + int ptr[RPN_MAX_ORDS]; + before = pos-1; if (before < 0) before = 0; @@ -261,7 +444,7 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, scan_info->after = after; scan_info->odr = stream; - scan_info->list = (struct scan_info_entry *) + scan_info->list = (struct scan1_info_entry *) odr_malloc(stream, (before+after) * sizeof(*scan_info->list)); for (j = 0; jlist[j].term = NULL; @@ -275,14 +458,11 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, return ZEBRA_FAIL; dict_scan(zh->reg->dict, termz, &before_tmp, &after_tmp, - scan_info, scan_handle); + scan_info, scan_handle1); } glist = (ZebraScanEntry *) odr_malloc(stream, (before+after)*sizeof(*glist)); - rset_nmem = nmem_create(); - kc = zebra_key_control_create(zh); - /* consider terms after main term */ for (i = 0; i < ord_no; i++) ptr[i] = before; @@ -374,8 +554,6 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, *is_partial = 1; if (*num_entries < 0) { - (*kc->dec)(kc); - nmem_destroy(rset_nmem); *num_entries = 0; return ZEBRA_OK; } @@ -453,8 +631,6 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, glist[lo].occurrences = count; rset_delete (rset); } - (*kc->dec)(kc); - nmem_destroy(rset_nmem); i = before-i; if (i) { @@ -475,6 +651,111 @@ ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, return ZEBRA_OK; } + +ZEBRA_RES rpn_scan(ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, + oid_value attributeset, + int num_bases, char **basenames, + int *position, int *num_entries, ZebraScanEntry **list, + int *is_partial, RSET limit_set) +{ + int base_no; + int ords[RPN_MAX_ORDS], ord_no = 0; + + unsigned index_type; + char *search_type = NULL; + char rank_type[128]; + int complete_flag; + int sort_flag; + NMEM nmem; + ZEBRA_RES res; + struct rset_key_control *kc = 0; + + *list = 0; + *is_partial = 0; + + if (attributeset == VAL_NONE) + attributeset = VAL_BIB1; + + if (!limit_set) /* no limit set given already */ + { + /* see if there is a @attr 8=set */ + AttrType termset; + int termset_value_numeric; + const char *termset_value_string; + attr_init_APT(&termset, zapt, 8); + termset_value_numeric = + attr_find_ex(&termset, NULL, &termset_value_string); + if (termset_value_numeric != -1) + { + char resname[32]; + const char *termset_name = 0; + + if (termset_value_numeric != -2) + { + + sprintf(resname, "%d", termset_value_numeric); + termset_name = resname; + } + else + termset_name = termset_value_string; + + limit_set = resultSetRef (zh, termset_name); + } + } + + yaz_log(YLOG_DEBUG, "position = %d, num = %d set=%d", + *position, *num_entries, attributeset); + + if (zebra_maps_attr(zh->reg->zebra_maps, zapt, &index_type, &search_type, + rank_type, &complete_flag, &sort_flag)) + { + *num_entries = 0; + zebra_setError(zh, YAZ_BIB1_UNSUPP_ATTRIBUTE_TYPE, 0); + return ZEBRA_FAIL; + } + if (num_bases > RPN_MAX_ORDS) + { + zebra_setError(zh, YAZ_BIB1_TOO_MANY_DATABASES_SPECIFIED, 0); + return ZEBRA_FAIL; + } + + for (base_no = 0; base_no < num_bases; base_no++) + { + int ord; + + if (zebraExplain_curDatabase (zh->reg->zei, basenames[base_no])) + { + zebra_setError(zh, YAZ_BIB1_DATABASE_UNAVAILABLE, + basenames[base_no]); + *num_entries = 0; + return ZEBRA_FAIL; + } + if (zebra_apt_get_ord(zh, zapt, index_type, 0, attributeset, &ord) + != ZEBRA_OK) + continue; + ords[ord_no++] = ord; + } + if (ord_no == 0) + { + *num_entries = 0; /* zebra_apt_get_ord should set error reason */ + return ZEBRA_FAIL; + } + if (*num_entries < 1) + { + *num_entries = 0; + return ZEBRA_OK; + } + nmem = nmem_create(); + kc = zebra_key_control_create(zh); + + res = rpn_scan_ver2(zh, stream, nmem, kc, zapt, position, num_entries, + list, + is_partial, limit_set, index_type, ord_no, ords); + nmem_destroy(nmem); + (*kc->dec)(kc); + return res; +} + /* * Local variables: * c-basic-offset: 4