From 99dfd244ebcc60d73eb50500f67207ae1aa591d8 Mon Sep 17 00:00:00 2001
From: Adam Dickmeiss
Date: Thu, 8 Nov 2007 21:21:58 +0000
Subject: [PATCH] First successful test with ICU sortkeys in dictionary.

---
Review notes (free text in this position, before the first file diff, is
not part of the commit message and is ignored when the patch is applied):

 * NOTE(review): this copy of the patch had lost its line breaks and
   indentation.  Indentation, blank context lines and trailing whitespace
   below were re-derived from the hunk line counts and are assumptions;
   apply with whitespace-insensitive matching and verify against the tree.
 * index/rpnsearch.c, term_100_icu(): the term is appended to dst_term with
   strcat(); no destination size is passed to the function, so the caller
   has to guarantee room (not verifiable from this patch).
 * index/rpnsearch.c, term_100_icu(): the loop index is an int compared
   against size_t res_len, and "res_buf[i] < 32" is evaluated on plain
   char, so the outcome for bytes >= 0x80 depends on the platform's char
   signedness.  The space_split parameter is not used in the body.
 * index/rpnsearch.c: term_100_icu() logs unconditionally at YLOG_LOG, and
   the dict_lookup_grep message in string_term() moves from log_level_rpn
   to YLOG_LOG.
 * util/zebramap.c, zebra_map_tokenize(): "UErrorCode status;" is shown as
   unchanged context with no initializer before it is handed to
   icu_chain_assign_cstr() -- TODO confirm it is set to U_ZERO_ERROR.
 * util/zebramap.c, zebra_map_tokenize(): the token returned is now the ICU
   sort key and its length is taken with strlen(), i.e. the key is treated
   as a NUL-terminated string.
 * util/zebramap.c, parse_command(): the "debug" value is converted with
   atoi(), so a non-numeric argument is not reported.
 * test/api/t17.idx: "debug 1" is switched on for the ICU word index.

 index/extract.c   |    3 +--
 index/rpnsearch.c |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 test/api/t17.idx  |    3 ++-
 util/zebramap.c   |   49 +++++++++++++++++++++++++++++++++++++------------
 4 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/index/extract.c b/index/extract.c
index 6ac93de..167793a 100644
--- a/index/extract.c
+++ b/index/extract.c
@@ -1,4 +1,4 @@
-/* $Id: extract.c,v 1.268 2007-11-06 10:29:59 adam Exp $
+/* $Id: extract.c,v 1.269 2007-11-08 21:21:58 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -1716,7 +1716,6 @@ static void extract_add_icu(RecWord *p, zebra_map_t zm)
         key.mem[i++] = p->seqno;
         key.len = i;
 
-        yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf);
         zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key);
 
         p->seqno++;
diff --git a/index/rpnsearch.c b/index/rpnsearch.c
index 2698c58..d07bf3f 100644
--- a/index/rpnsearch.c
+++ b/index/rpnsearch.c
@@ -1,4 +1,4 @@
-/* $Id: rpnsearch.c,v 1.20 2007-11-01 14:10:03 adam Exp $
+/* $Id: rpnsearch.c,v 1.21 2007-11-08 21:21:58 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -65,7 +65,10 @@ void rpn_char_map_prepare(struct zebra_register *reg, zebra_map_t zm,
                           struct rpn_char_map_info *map_info)
 {
     map_info->zm = zm;
-    dict_grep_cmap(reg->dict, map_info, rpn_char_map_handler);
+    if (zebra_maps_is_icu(zm))
+        dict_grep_cmap(reg->dict, 0, 0);
+    else
+        dict_grep_cmap(reg->dict, map_info, rpn_char_map_handler);
 }
 
 #define TERM_COUNT
@@ -232,6 +235,38 @@ static void add_non_space(const char *start, const char *end,
     }
 }
 
+
+static int term_100_icu(zebra_map_t zm,
+                        const char **src, WRBUF term_dict, int space_split,
+                        char *dst_term)
+{
+    int no = 0;
+    const char *res_buf = 0;
+    size_t res_len = 0;
+    int r = zebra_map_tokenize(zm, *src, strlen(*src),
+                               &res_buf, &res_len);
+
+    yaz_log(YLOG_LOG, "term_100_icu r=%d", r);
+    if (r)
+        strcat(dst_term, *src);
+    *src += strlen(*src);
+    while (r)
+    {
+        int i;
+        no++;
+        for (i = 0; i < res_len; i++)
+        {
+            if (strchr(REGEX_CHARS, res_buf[i]))
+                wrbuf_putc(term_dict, '\\');
+            if (res_buf[i] < 32)
+                wrbuf_putc(term_dict, 1);
+            wrbuf_putc(term_dict, res_buf[i]);
+        }
+        r = zebra_map_tokenize(zm, 0, 0, &res_buf, &res_len);
+    }
+    return no;
+}
+
 /* term_100: handle term, where trunc = none(no operators at all) */
 static int term_100(zebra_map_t zm,
                     const char **src, WRBUF term_dict, int space_split,
@@ -245,6 +280,9 @@ static int term_100(zebra_map_t zm,
     const char *space_start = 0;
     const char *space_end = 0;
 
+    if (zebra_maps_is_icu(zm))
+        return term_100_icu(zm, src, term_dict, space_split, dst_term);
+
     if (!term_pre(zm, src, NULL, NULL, !space_split))
         return 0;
     s0 = *src;
@@ -950,7 +988,7 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
     char ord_buf[32];
     int ord_len, i;
     zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, index_type);
-    
+
     *ol = ord_list_create(stream);
 
     rpn_char_map_prepare(zh->reg, zm, &rcmi);
@@ -1096,8 +1134,13 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
         const char *input = wrbuf_cstr(term_dict) + prefix_len;
         esc_str(buf, sizeof(buf), input, strlen(input));
     }
-    yaz_log(log_level_rpn, "dict_lookup_grep: %s",
-            wrbuf_cstr(term_dict) + prefix_len);
+    {
+        WRBUF pr_wr = wrbuf_alloc();
+
+        wrbuf_verbose_str(pr_wr, wrbuf_buf(term_dict), wrbuf_len(term_dict));
+        yaz_log(YLOG_LOG, "dict_lookup_grep: %s", wrbuf_cstr(pr_wr));
+        wrbuf_destroy(pr_wr);
+    }
     r = dict_lookup_grep(zh->reg->dict, wrbuf_cstr(term_dict), regex_range,
                          grep_info, &max_pos,
                          ord_len /* number of "exact" chars */,
diff --git a/test/api/t17.idx b/test/api/t17.idx
index 6159a15..de511b3 100644
--- a/test/api/t17.idx
+++ b/test/api/t17.idx
@@ -1,5 +1,5 @@
 # Zebra indexes as referred to from the *.abs-files.
-# $Id: t17.idx,v 1.3 2007-11-08 13:35:36 adam Exp $
+# $Id: t17.idx,v 1.4 2007-11-08 21:21:58 adam Exp $
 #
 
 # Traditional word index
@@ -12,6 +12,7 @@ alwaysmatches 1
 firstinfield 1
 # simplechain dummy
 icuchain words-icu.xml
+debug 1
 
 # Phrase index
 # Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1)
diff --git a/util/zebramap.c b/util/zebramap.c
index 202806f..ae74a27 100644
--- a/util/zebramap.c
+++ b/util/zebramap.c
@@ -1,4 +1,4 @@
-/* $Id: zebramap.c,v 1.70 2007-11-08 13:35:36 adam Exp $
+/* $Id: zebramap.c,v 1.71 2007-11-08 21:21:58 adam Exp $
    Copyright (C) 1995-2007
    Index Data ApS
 
@@ -47,6 +47,7 @@ struct zebra_map {
     int first_in_field;
     int type;
     int use_chain;
+    int debug;
     union {
         struct {
             int entry_size;
@@ -61,7 +62,8 @@ struct zebra_map {
 #if YAZ_HAVE_ICU
     struct icu_chain *icu_chain;
 #endif
-    WRBUF simple_buf;
+    WRBUF input_str;
+    WRBUF print_str;
     size_t simple_off;
     struct zebra_map *next;
 };
@@ -92,7 +94,8 @@ void zebra_maps_close(zebra_maps_t zms)
 #if YAZ_HAVE_XML2
         xmlFreeDoc(zm->doc);
 #endif
-        wrbuf_destroy(zm->simple_buf);
+        wrbuf_destroy(zm->input_str);
+        wrbuf_destroy(zm->print_str);
         zm = zm->next;
     }
     wrbuf_destroy(zms->wrbuf_1);
@@ -109,6 +112,7 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type,
     zm->id = nmem_strdup(zms->nmem, index_type);
     zm->maptab_name = 0;
     zm->use_chain = 0;
+    zm->debug = 0;
     zm->maptab = 0;
     zm->type = map_type;
     zm->completeness = 0;
@@ -128,7 +132,8 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type,
 #if YAZ_HAVE_XML2
     zm->doc = 0;
 #endif
-    zm->simple_buf = wrbuf_alloc();
+    zm->input_str = wrbuf_alloc();
+    zm->print_str = wrbuf_alloc();
     return zm;
 }
 
@@ -265,6 +270,10 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv,
         return -1;
 #endif
     }
+    else if (!yaz_matchstr(argv[0], "debug") && argc == 2)
+    {
+        zm->debug = atoi(argv[1]);
+    }
     else
     {
         yaz_log(YLOG_WARN, "%s:%d: Unrecognized directive '%s'",
@@ -605,8 +614,8 @@ WRBUF zebra_replace(zebra_map_t zm, const char *ex_list,
 static int tokenize_simple(zebra_map_t zm,
                            const char **result_buf, size_t *result_len)
 {
-    char *buf = wrbuf_buf(zm->simple_buf);
-    size_t len = wrbuf_len(zm->simple_buf);
+    char *buf = wrbuf_buf(zm->input_str);
+    size_t len = wrbuf_len(zm->input_str);
     size_t i = zm->simple_off;
     size_t start;
 
@@ -638,8 +647,8 @@ int zebra_map_tokenize(zebra_map_t zm,
 
     if (buf)
     {
-        wrbuf_rewind(zm->simple_buf);
-        wrbuf_write(zm->simple_buf, buf, len);
+        wrbuf_rewind(zm->input_str);
+        wrbuf_write(zm->input_str, buf, len);
         zm->simple_off = 0;
     }
 
@@ -651,19 +660,35 @@ int zebra_map_tokenize(zebra_map_t zm,
         UErrorCode status;
         if (buf)
         {
-            yaz_log(YLOG_LOG, "assicn_cstr %s", wrbuf_cstr(zm->simple_buf));
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_verbose_str(zm->print_str, wrbuf_buf(zm->input_str),
+                                  wrbuf_len(zm->input_str));
+
+                yaz_log(YLOG_LOG, "input %s",
+                        wrbuf_cstr(zm->print_str));
+            }
             icu_chain_assign_cstr(zm->icu_chain,
-                                  wrbuf_cstr(zm->simple_buf),
+                                  wrbuf_cstr(zm->input_str),
                                   &status);
             assert(U_SUCCESS(status));
         }
         while (icu_chain_next_token(zm->icu_chain, &status))
         {
             assert(U_SUCCESS(status));
-            *result_buf = icu_chain_token_norm(zm->icu_chain);
+            *result_buf = icu_chain_token_sortkey(zm->icu_chain);
             assert(*result_buf);
 
-            yaz_log(YLOG_LOG, "got result %s", *result_buf);
+            *result_len = strlen(*result_buf);
+
+            if (zm->debug)
+            {
+                wrbuf_rewind(zm->print_str);
+                wrbuf_verbose_str(zm->print_str, *result_buf, *result_len);
+                yaz_log(YLOG_LOG, "output %s", wrbuf_cstr(zm->print_str));
+            }
+
             if (**result_buf != '\0')
                 return 1;
         }
-- 
1.7.10.4