From e1be1f5267e2be257664ded166b6890e4f24db83 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Wed, 26 Mar 2008 15:03:45 +0100 Subject: [PATCH] Handle right-truncation for ICU normalized terms. String searches now handle right-truncation for ICU normalized terms. That is, @attr 5=1 is supported as well as no-truncation, @attr 5=100. All other truncation attributes produce an error. We use some pretty heuristic methods to determine where to put the .* in our resulting regular expression. --- dfa/dfa.c | 14 +-- dict/lookgrep.c | 2 + include/dfa.h | 1 + index/rpnsearch.c | 263 ++++++++++++++++++++++++++++++++--------------------- test/api/t17.c | 14 ++- 5 files changed, 180 insertions(+), 114 deletions(-) diff --git a/dfa/dfa.c b/dfa/dfa.c index 8450d2a..f909f2e 100644 --- a/dfa/dfa.c +++ b/dfa/dfa.c @@ -1032,9 +1032,12 @@ static struct DFA_parse *dfa_parse_init (void) parse_info->rule = 0; parse_info->root = NULL; + /* initialize the anyset which by default does not include \n */ parse_info->anyset = mk_BSet (&parse_info->charset); res_BSet (parse_info->charset, parse_info->anyset); + add_BSet (parse_info->charset, parse_info->anyset, '\n'); com_BSet (parse_info->charset, parse_info->anyset); + parse_info->use_Tnode = parse_info->max_Tnode = 0; parse_info->start = parse_info->end = NULL; parse_info->charMap = NULL; @@ -1096,6 +1099,11 @@ struct DFA *dfa_init (void) return dfa; } +void dfa_anyset_includes_nl(struct DFA *dfa) +{ + add_BSet (dfa->parse_info->charset, dfa->parse_info->anyset, '\n'); +} + void dfa_set_cmap (struct DFA *dfa, void *vp, const char **(*cmap)(void *vp, const char **from, int len)) { @@ -1117,12 +1125,6 @@ int dfa_parse (struct DFA *dfa, const char **pattern) assert (dfa->parse_info); parse_info = dfa->parse_info; - if (!parse_info->cmap) - { - res_BSet (parse_info->charset, parse_info->anyset); - add_BSet (parse_info->charset, parse_info->anyset, '\n'); - com_BSet (parse_info->charset, parse_info->anyset); - } do_parse (parse_info, 
pattern, &top); if (parse_info->err_code) return parse_info->err_code; diff --git a/dict/lookgrep.c b/dict/lookgrep.c index b0ad4ac..b3de2af 100644 --- a/dict/lookgrep.c +++ b/dict/lookgrep.c @@ -390,6 +390,8 @@ int dict_lookup_grep(Dict dict, const char *pattern, int range, void *client, dfa_verbose = 1; #endif + dfa_anyset_includes_nl(dfa); + yaz_log(YLOG_DEBUG, "dict_lookup_grep range=%d", range); for (i = 0; pattern[i]; i++) { diff --git a/include/dfa.h b/include/dfa.h index 045cd69..eceb0a6 100644 --- a/include/dfa.h +++ b/include/dfa.h @@ -61,6 +61,7 @@ struct DFA { }; struct DFA *dfa_init (void); +void dfa_anyset_includes_nl(struct DFA *dfa); void dfa_set_cmap (struct DFA *dfa, void *vp, const char **(*cmap)(void *vp, const char **from, int len)); int dfa_parse (struct DFA *, const char **); diff --git a/index/rpnsearch.c b/index/rpnsearch.c index a117ad2..8b63758 100644 --- a/index/rpnsearch.c +++ b/index/rpnsearch.c @@ -238,7 +238,8 @@ static void add_non_space(const char *start, const char *end, static int term_100_icu(zebra_map_t zm, const char **src, WRBUF term_dict, int space_split, - WRBUF display_term) + WRBUF display_term, + int right_trunc) { int i; const char *res_buf = 0; @@ -252,14 +253,38 @@ static int term_100_icu(zebra_map_t zm, return 0; } wrbuf_write(display_term, display_buf, display_len); + if (right_trunc) + { + /* ICU sort keys seem to be of the form + basechars \x01 accents \x01 length + For now we'll just right truncate from basechars . This + may give false hits due to accents not being used. + */ + i = res_len; + while (--i >= 0 && res_buf[i] != '\x01') + ; + if (i > 0) + { + while (--i >= 0 && res_buf[i] != '\x01') + ; + } + if (i == 0) + { /* did not find base chars at all. 
Throw error */ + return -1; + } + res_len = i; /* reduce res_len */ + } for (i = 0; i < res_len; i++) { if (strchr(REGEX_CHARS "\\", res_buf[i])) wrbuf_putc(term_dict, '\\'); if (res_buf[i] < 32) wrbuf_putc(term_dict, 1); + wrbuf_putc(term_dict, res_buf[i]); } + if (right_trunc) + wrbuf_puts(term_dict, ".*"); return 1; } @@ -275,9 +300,6 @@ static int term_100(zebra_map_t zm, const char *space_start = 0; const char *space_end = 0; - if (zebra_maps_is_icu(zm)) - return term_100_icu(zm, src, term_dict, space_split, display_term); - if (!term_pre(zm, src, NULL, NULL, !space_split)) return 0; s0 = *src; @@ -1013,113 +1035,144 @@ static ZEBRA_RES string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, wrbuf_putc(term_dict, ')'); prefix_len = wrbuf_len(term_dict); - - switch (truncation_value) - { - case -1: /* not specified */ - case 100: /* do not truncate */ - if (!string_relation(zh, zapt, &termp, term_dict, - attributeSet, - zm, space_split, display_term, - &relation_error)) + + if (zebra_maps_is_icu(zm)) + { + /* ICU case */ + switch (truncation_value) { - if (relation_error) + case -1: /* not specified */ + case 100: /* do not truncate */ + if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 0)) { - zebra_setError(zh, relation_error, 0); - return ZEBRA_FAIL; + *term_sub = 0; + return ZEBRA_OK; } - *term_sub = 0; - return ZEBRA_OK; - } - break; - case 1: /* right truncation */ - wrbuf_putc(term_dict, '('); - if (!term_100(zm, &termp, term_dict, space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - wrbuf_puts(term_dict, ".*)"); - break; - case 2: /* keft truncation */ - wrbuf_puts(term_dict, "(.*"); - if (!term_100(zm, &termp, term_dict, space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - wrbuf_putc(term_dict, ')'); - break; - case 3: /* left&right truncation */ - wrbuf_puts(term_dict, "(.*"); - if (!term_100(zm, &termp, term_dict, space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - 
wrbuf_puts(term_dict, ".*)"); - break; - case 101: /* process # in term */ - wrbuf_putc(term_dict, '('); - if (!term_101(zm, &termp, term_dict, space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - wrbuf_puts(term_dict, ")"); - break; - case 102: /* Regexp-1 */ - wrbuf_putc(term_dict, '('); - if (!term_102(zm, &termp, term_dict, space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - wrbuf_putc(term_dict, ')'); - break; - case 103: /* Regexp-2 */ - regex_range = 1; - wrbuf_putc(term_dict, '('); - if (!term_103(zm, &termp, term_dict, &regex_range, - space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - wrbuf_putc(term_dict, ')'); - break; - case 104: /* process # and ! in term */ - wrbuf_putc(term_dict, '('); - if (!term_104(zm, &termp, term_dict, space_split, display_term)) - { - *term_sub = 0; - return ZEBRA_OK; - } - wrbuf_putc(term_dict, ')'); - break; - case 105: /* process * and ! in term */ - wrbuf_putc(term_dict, '('); - if (!term_105(zm, &termp, term_dict, space_split, display_term, 1)) - { - *term_sub = 0; - return ZEBRA_OK; + break; + case 1: /* right truncation */ + if (!term_100_icu(zm, &termp, term_dict, space_split, display_term, 1)) + { + *term_sub = 0; + return ZEBRA_OK; + } + break; + default: + zebra_setError_zint(zh, + YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE, + truncation_value); + return ZEBRA_FAIL; } - wrbuf_putc(term_dict, ')'); - break; - case 106: /* process * and ! in term */ - wrbuf_putc(term_dict, '('); - if (!term_105(zm, &termp, term_dict, space_split, display_term, 0)) + } + else + { + /* non-ICU case. 
using string.chr and friends */ + switch (truncation_value) { - *term_sub = 0; - return ZEBRA_OK; + case -1: /* not specified */ + case 100: /* do not truncate */ + if (!string_relation(zh, zapt, &termp, term_dict, + attributeSet, + zm, space_split, display_term, + &relation_error)) + { + if (relation_error) + { + zebra_setError(zh, relation_error, 0); + return ZEBRA_FAIL; + } + *term_sub = 0; + return ZEBRA_OK; + } + break; + case 1: /* right truncation */ + wrbuf_putc(term_dict, '('); + if (!term_100(zm, &termp, term_dict, space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_puts(term_dict, ".*)"); + break; + case 2: /* left truncation */ + wrbuf_puts(term_dict, "(.*"); + if (!term_100(zm, &termp, term_dict, space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_putc(term_dict, ')'); + break; + case 3: /* left&right truncation */ + wrbuf_puts(term_dict, "(.*"); + if (!term_100(zm, &termp, term_dict, space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_puts(term_dict, ".*)"); + break; + case 101: /* process # in term */ + wrbuf_putc(term_dict, '('); + if (!term_101(zm, &termp, term_dict, space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_puts(term_dict, ")"); + break; + case 102: /* Regexp-1 */ + wrbuf_putc(term_dict, '('); + if (!term_102(zm, &termp, term_dict, space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_putc(term_dict, ')'); + break; + case 103: /* Regexp-2 */ + regex_range = 1; + wrbuf_putc(term_dict, '('); + if (!term_103(zm, &termp, term_dict, &regex_range, + space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_putc(term_dict, ')'); + break; + case 104: /* process # and ! in term */ + wrbuf_putc(term_dict, '('); + if (!term_104(zm, &termp, term_dict, space_split, display_term)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_putc(term_dict, ')'); + break; + case 105: /* process * and ! 
in term */ + wrbuf_putc(term_dict, '('); + if (!term_105(zm, &termp, term_dict, space_split, display_term, 1)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_putc(term_dict, ')'); + break; + case 106: /* process * and ! in term */ + wrbuf_putc(term_dict, '('); + if (!term_105(zm, &termp, term_dict, space_split, display_term, 0)) + { + *term_sub = 0; + return ZEBRA_OK; + } + wrbuf_putc(term_dict, ')'); + break; + default: + zebra_setError_zint(zh, + YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE, + truncation_value); + return ZEBRA_FAIL; } - wrbuf_putc(term_dict, ')'); - break; - default: - zebra_setError_zint(zh, - YAZ_BIB1_UNSUPP_TRUNCATION_ATTRIBUTE, - truncation_value); - return ZEBRA_FAIL; } if (1) { diff --git a/test/api/t17.c b/test/api/t17.c index 107204d..57d57f2 100644 --- a/test/api/t17.c +++ b/test/api/t17.c @@ -60,9 +60,11 @@ static void tst(int argc, char **argv) /* simple term */ YAZ_CHECK(tl_query(zh, "@attr 1=title notfound", 0)); - - YAZ_CHECK(tl_query(zh, "@attr 1=title computer", 3)); + YAZ_CHECK(tl_query(zh, "@attr 1=title computer", 3)); + + YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 5=1 comput", 3)); + YAZ_CHECK(tl_query(zh, "@attr 1=title .computer.", 3)); YAZ_CHECK(tl_query(zh, "@attr 1=title x", 2)); @@ -84,9 +86,16 @@ static void tst(int argc, char **argv) YAZ_CHECK(tl_query(zh, "@attr 1=abstract צביה", 1)); YAZ_CHECK(tl_query(zh, "@attr 1=abstract הגדול", 1)); YAZ_CHECK(tl_query(zh, "@attr 1=abstract בסיפור", 1)); + YAZ_CHECK(tl_query(zh, "@attr 1=abstract בסיפ", 0)); YAZ_CHECK(tl_query(zh, "@attr 1=abstract 点", 1)); YAZ_CHECK(tl_query(zh, "@attr 1=abstract wet", 1)); + YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בסיפ", 1)); + YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 סיפ", 0)); + YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בסי", 1)); + YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 בס", 1)); + YAZ_CHECK(tl_query(zh, "@attr 1=abstract @attr 5=1 ב", 1)); + /* phrase search */ YAZ_CHECK(tl_query(zh, "@attr 1=title {my 
computer}", 2)); YAZ_CHECK(tl_query(zh, "@attr 1=title @attr 6=1 {my computer}", 2)); @@ -112,7 +121,6 @@ static void tst(int argc, char **argv) const char *ent[] = { char_ae, "B" char_aring "d", "My computer" }; YAZ_CHECK(tl_scan(zh, "@attr 1=title @attr 6=2 0", 1, 3, 1, 3, 0, ent)); } - YAZ_CHECK(tl_close_down(zh, zs)); #endif -- 1.7.10.4