From 85ad68ab178a261dc548284ee68aae9107cbfaaf Mon Sep 17 00:00:00 2001
From: Adam Dickmeiss
Date: Tue, 28 Aug 2012 15:50:51 +0200
Subject: [PATCH 1/1] Start work on ICU based regexp searches

---
Review note: in term_102_icu() the two backwards scans for '\x01' used
"--i >= 0" on a size_t index. That test is always true for an unsigned
type, so a buffer without '\x01' (or with res_len == 0) made the index
wrap around and res_buf was read out of bounds. The scans now test
"i > 0" before decrementing; when no '\x01' is found the index ends at 0.
The change replaces lines one-for-one, so hunk counts are unaffected.

 index/rpnsearch.c            |  113 ++++++++++++++++++++++++++++++++++++++----
 test/api/test_icu_indexing.c |   66 +++++++++++++++++++++---
 2 files changed, 163 insertions(+), 16 deletions(-)

diff --git a/index/rpnsearch.c b/index/rpnsearch.c
index f119eb5..8b474a9 100644
--- a/index/rpnsearch.c
+++ b/index/rpnsearch.c
@@ -234,20 +234,109 @@ static void add_non_space(const char *start, const char *end,
 }
 
 
+static int term_102_icu(zebra_map_t zm,
+                        const char **src, WRBUF term_dict, int space_split,
+                        WRBUF display_term)
+{
+    int no_terms = 0;
+    const char *s0 = *src, *s1;
+    while (*s0 == ' ')
+        s0++;
+    s1 = s0;
+    for (;;)
+    {
+        if (*s1 == ' ' && space_split)
+            break;
+        else if (*s1 && !strchr(REGEX_CHARS "-", *s1))
+            s1++;
+        else
+        {
+            /* EOF or regex reserved char: s0..s1 is a literal run */
+            if (s0 != s1)
+            {
+                const char *res_buf = 0;
+                size_t res_len = 0;
+                const char *display_buf;
+                size_t display_len;
+
+                zebra_map_tokenize_start(zm, s0, s1 - s0);
+
+                if (zebra_map_tokenize_next(zm, &res_buf, &res_len,
+                                            &display_buf, &display_len))
+                {
+                    size_t i = res_len; /* unsigned: test before --i */
+                    while (i > 0 && res_buf[--i] != '\x01') /* last \x01 */
+                        ;
+                    if (i > 0)
+                    {
+                        while (i > 0 && res_buf[--i] != '\x01') /* previous */
+                            ;
+                    }
+                    res_len = i; /* keep only the bytes before that \x01 */
+                    for (i = 0; i < res_len; i++)
+                    {
+                        if (strchr(REGEX_CHARS "\\", res_buf[i]))
+                            wrbuf_putc(term_dict, '\\');
+                        if (res_buf[i] < 32)
+                            wrbuf_putc(term_dict, '\x01');
+
+                        wrbuf_putc(term_dict, res_buf[i]);
+                    }
+                    wrbuf_write(display_term, display_buf, display_len);
+
+                    no_terms++;
+                }
+            }
+            if (*s1 == '\0')
+                break;
+
+            wrbuf_putc(term_dict, *s1);
+            wrbuf_putc(display_term, *s1);
+
+            s1++;
+            s0 = s1;
+        }
+    }
+    if (no_terms)
+        wrbuf_puts(term_dict, "\x01\x01.*");
+    *src = s1;
+    return no_terms;
+}
+
 static int term_100_icu(zebra_map_t zm,
                         const char **src, WRBUF term_dict, int space_split,
                         WRBUF display_term,
                         int mode)
 {
-    int i;
+    size_t i;
     const char *res_buf = 0;
     size_t res_len = 0;
     const char *display_buf;
     size_t display_len;
+    const char *s0 = *src, *s1;
+
+    while (*s0 == ' ')
+        s0++;
+
+    if (*s0 == '\0')
+        return 0;
+
+    if (space_split)
+    {
+        s1 = s0;
+        while (*s1 && *s1 != ' ')
+            s1++;
+    }
+    else
+        s1 = s0 + strlen(s0);
+
+    *src = s1;
+
+    zebra_map_tokenize_start(zm, s0, s1 - s0);
+
     if (!zebra_map_tokenize_next(zm, &res_buf, &res_len,
                                  &display_buf, &display_len))
     {
-        *src += strlen(*src);
         return 0;
     }
     wrbuf_write(display_term, display_buf, display_len);
@@ -279,15 +368,14 @@ static int term_100_icu(zebra_map_t zm,
         if (strchr(REGEX_CHARS "\\", res_buf[i]))
             wrbuf_putc(term_dict, '\\');
         if (res_buf[i] < 32)
-            wrbuf_putc(term_dict, 1);
-        
+            wrbuf_putc(term_dict, '\x01');
+
         wrbuf_putc(term_dict, res_buf[i]);
     }
     if (mode & 1)
         wrbuf_puts(term_dict, ".*");
     else if (mode)
         wrbuf_puts(term_dict, "\x01\x01.*");
-    
     return 1;
 }
 
@@ -1053,6 +1141,13 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
                 return ZEBRA_OK;
             }
             break;
+        case 102:
+            if (!term_102_icu(zm, &termp, term_dict, space_split, display_term))
+            {
+                *term_sub = 0;
+                return ZEBRA_OK;
+            }
+            break;
         case 1:  /* right truncation */
             if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 1))
             {
@@ -1389,8 +1484,6 @@ static ZEBRA_RES search_terms_list(ZebraHandle zh,
                                    struct rset_key_control *kc)
 {
     zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, index_type);
-    if (zebra_maps_is_icu(zm))
-        zebra_map_tokenize_start(zm, termz, strlen(termz));
     return search_terms_chrmap(zh, zapt, termz, attributeSet, hits_limit,
                                stream, index_type, complete_flag,
                                rank_type, xpath_use,
@@ -2063,7 +2156,7 @@ static RSET xpath_trunc(ZebraHandle zh, NMEM stream,
         return rset_create_null(rset_nmem, kc, 0);
     else
     {
-        int i, r, max_pos;
+        int i, max_pos;
         char ord_buf[32];
         RSET rset;
         WRBUF term_dict = wrbuf_alloc();
@@ -2081,8 +2174,8 @@ static RSET xpath_trunc(ZebraHandle zh, NMEM stream,
         wrbuf_puts(term_dict, term);
 
         grep_info.isam_p_indx = 0;
-        r = dict_lookup_grep(zh->reg->dict, wrbuf_cstr(term_dict), 0,
-                             &grep_info, &max_pos, 0, grep_handle);
+        dict_lookup_grep(zh->reg->dict, wrbuf_cstr(term_dict), 0,
+                         &grep_info, &max_pos, 0, grep_handle);
         yaz_log(YLOG_DEBUG, "%s %d positions", term,
                 grep_info.isam_p_indx);
         rset = rset_trunc(zh, grep_info.isam_p_buf,
diff --git a/test/api/test_icu_indexing.c b/test/api/test_icu_indexing.c
index b584451..dbe408e 100644
--- a/test/api/test_icu_indexing.c
+++ b/test/api/test_icu_indexing.c
@@ -45,7 +45,7 @@ const char *myrec[] = {
         "\nMy computer\n\n",
         "\nMy x computer\n\n",
         "\nMy computer x\n\n" ,
-        "\n" char_ae "\n\n" ,
+        "\n" char_ae "rme\n\n" ,
         "\nB" char_aring "d\n"
         "זיהוי סדר הארועים בסיפור המרד הגדול מאת צביה בן-שלום 提示:直接点击数据库名称,将进入单库检索 Ngày xửa ngày xưa D.W. all wet\n\n" ,
         0} ;
@@ -65,7 +65,53 @@ static void tst(int argc, char **argv)
 
     YAZ_CHECK(tl_query(zh, "@attr 1=title computer", 3));
 
-    YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 5=1 comput", 3));
+    YAZ_CHECK(tl_query(zh, "@attr 5=1 @attr 1=title computer", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=1 @attr 1=title compute", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=1 @attr 1=title computee", 0));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=1 @attr 1=title co", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=2 @attr 1=title computer", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=2 @attr 1=title compute", 0));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=2 @attr 1=title er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=3 @attr 1=title computer", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=3 @attr 1=title compute", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=3 @attr 1=title er", 4));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=3 @attr 1=title ompute", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title com.*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title cm.*er", 0));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title com.*ër", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title com?m.*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title coy?m.*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title co[m].*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title co[mn].*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title co[m-n].*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title co[a-z].*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 @attr 1=title co[a-n].*er", 3));
+
+    YAZ_CHECK(tl_query(zh, "@attr 1=title com.*ër", 0));
+
+    YAZ_CHECK(tl_query(zh, "@attr 1=title @and @attr 5=102 com.*er x", 2));
+
+    YAZ_CHECK(tl_query(zh, "@attr 1=title @and x @attr 5=102 com.*er", 2));
 
     YAZ_CHECK(tl_query(zh, "@attr 1=title .computer.", 3));
 
@@ -75,14 +121,20 @@ static void tst(int argc, char **argv)
 
     YAZ_CHECK(tl_query(zh, "@attr 1=title mY", 3));
 
-    YAZ_CHECK(tl_query(zh, char_ae, 1));
-    YAZ_CHECK(tl_query(zh, char_AE, 1));
+    YAZ_CHECK(tl_query(zh, char_ae "rme", 1));
+    YAZ_CHECK(tl_query(zh, char_AE "RME", 1));
 
     YAZ_CHECK(tl_query(zh, "b" char_aring "d", 1));
     YAZ_CHECK(tl_query(zh, "B" char_Aring "D", 1));
 
     YAZ_CHECK(tl_query(zh, "b" char_aring1 "d", 1));
     YAZ_CHECK(tl_query(zh, "B" char_Aring1 "D", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 b" char_aring "d", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 b.d", 1));
+
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 " char_ae "rme", 1));
+    YAZ_CHECK(tl_query(zh, "@attr 5=102 " "..rme", 1));
+
     /* Abstract searches . Chinese mostly */
     YAZ_CHECK(tl_query(zh, "@attr 1=abstract בן", 1));
     YAZ_CHECK(tl_query(zh, "@attr 1=abstract צביה", 1));
@@ -98,6 +150,8 @@ static void tst(int argc, char **argv)
     YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בס", 1));
     YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 ב", 1));
 
+    YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=102 בן", 1));
+
     /* phrase search */
     YAZ_CHECK(tl_query(zh, "@attr 1=title {my computer}", 2));
     YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=1 {my computer}", 2));
@@ -115,7 +169,7 @@ static void tst(int argc, char **argv)
     /* scan */
     {
         /* word search */
-        const char *ent[] = { char_ae, "B" char_aring "d", "computer",
+        const char *ent[] = { char_ae "rme", "B" char_aring "d", "computer",
                               "My", "x", 0 };
         YAZ_CHECK(tl_scan(zh, "@attr 1=title 0", 1, 10, 1, 5, 1, ent));
     }
@@ -126,7 +180,7 @@ static void tst(int argc, char **argv)
     }
     {
         /* phrase search */
-        const char *ent[] = { char_ae, "B" char_aring "d", "My computer" };
+        const char *ent[] = { char_ae "rme", "B" char_aring "d", "My computer" };
         YAZ_CHECK(tl_scan(zh, "@attr 1=title @attr 6=2 0", 1, 3, 1, 3, 0, ent));
     }
 
-- 
1.7.10.4