From 0cac58c528d39e9838a4ee6e3f76bc5ca6bd30bf Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Fri, 11 Mar 2005 17:56:32 +0000 Subject: [PATCH] Implemented the 'equivalent' directive for .chr-files. --- NEWS | 2 + include/charmap.h | 7 +- include/idzebra/zebramap.h | 6 +- index/zrpn.c | 148 ++++++++++++++++++++++++++++++++---------- test/charmap/charmap1.c | 14 +++- test/charmap/string.utf8.chr | 13 ++-- test/charmap/x.xml | 1 + util/charmap.c | 105 ++++++++++++++++++++++-------- util/zebramap.c | 28 +++++++- 9 files changed, 250 insertions(+), 74 deletions(-) diff --git a/NEWS b/NEWS index d8a4e85..ac1300d 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,5 @@ +Implemented the 'equivalent' directive for .chr-files. + Added 'melm' directive to absyn format to simplify config files for MARC-style databases. See tab/marc21.abs for an example. diff --git a/include/charmap.h b/include/charmap.h index 2283dd6..929b5dd 100644 --- a/include/charmap.h +++ b/include/charmap.h @@ -1,4 +1,4 @@ -/* $Id: charmap.h,v 1.11 2005-01-15 19:38:24 adam Exp $ +/* $Id: charmap.h,v 1.12 2005-03-11 17:56:32 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -46,9 +46,8 @@ YAZ_EXPORT void chrmaptab_destroy (chrmaptab tab); YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len, int first); YAZ_EXPORT const char **chr_map_input_x(chrmaptab t, const char **from, int *len, int first); -YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab, - const char **from, int len, - const char **qmap); +YAZ_EXPORT const char **chr_map_q_input(chrmaptab maptab, + const char **from, int len, int first); YAZ_EXPORT const char *chr_map_output(chrmaptab t, const char **from, int len); diff --git a/include/idzebra/zebramap.h b/include/idzebra/zebramap.h index 26f6de5..980fc23 100644 --- a/include/idzebra/zebramap.h +++ b/include/idzebra/zebramap.h @@ -1,4 +1,4 @@ -/* $Id: zebramap.h,v 1.3 2005-01-15 19:38:24 adam Exp $ +/* $Id: zebramap.h,v 1.4 2005-03-11 17:56:33 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -41,6 +41,10 @@ const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id, const char **from, int len, int first); YAZ_EXPORT +const char **zebra_maps_search (ZebraMaps zms, unsigned reg_id, + const char **from, int len, int *q_map_match); + +YAZ_EXPORT const char *zebra_maps_output(ZebraMaps, unsigned reg_id, const char **from); YAZ_EXPORT diff --git a/index/zrpn.c b/index/zrpn.c index 758608b..dfb7ab9 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -1,4 +1,4 @@ -/* $Id: zrpn.c,v 1.170 2005-03-05 09:19:15 adam Exp $ +/* $Id: zrpn.c,v 1.171 2005-03-11 17:56:34 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -295,6 +295,33 @@ static int term_pre(ZebraMaps zebra_maps, int reg_type, const char **src, return *s0; } + +static void esc_str(char *out_buf, int out_size, + const char *in_buf, int in_size) +{ + int k; + + assert(out_buf); + assert(in_buf); + assert(out_size > 20); + *out_buf = '\0'; + for (k = 0; k 126) + pc = '?'; + else + pc = c; + sprintf(out_buf +strlen(out_buf), "%02X:%c ", c, pc); + if (strlen(out_buf) > out_size-20) + { + strcat(out_buf, ".."); + break; + } + } +} + #define REGEX_CHARS " []()|.*+?!" /* term_100: handle term, where trunc = none(no operators at all) */ @@ -302,7 +329,7 @@ static int term_100(ZebraMaps zebra_maps, int reg_type, const char **src, char *dst, int space_split, char *dst_term) { - const char *s0, *s1; + const char *s0; const char **map; int i = 0; int j = 0; @@ -315,8 +342,10 @@ static int term_100(ZebraMaps zebra_maps, int reg_type, s0 = *src; while (*s0) { - s1 = s0; - map = zebra_maps_input(zebra_maps, reg_type, &s0, strlen(s0), 0); + const char *s1 = s0; + int q_map_match = 0; + map = zebra_maps_search(zebra_maps, reg_type, &s0, strlen(s0), + &q_map_match); if (space_split) { if (**map == *CHR_SPACE) @@ -343,14 +372,26 @@ static int term_100(ZebraMaps zebra_maps, int reg_type, space_start = space_end = 0; } } - /* add non-space char */ - while (s1 < s0) - { - if (strchr(REGEX_CHARS, *s1)) - dst[i++] = '\\'; - dst_term[j++] = *s1; - dst[i++] = *s1++; - } + /* add non-space char */ + memcpy(dst_term+j, s1, s0 - s1); + j += (s0 - s1); + if (!q_map_match) + { + while (s1 < s0) + { + if (strchr(REGEX_CHARS, *s1)) + dst[i++] = '\\'; + dst[i++] = *s1++; + } + } + else + { + char tmpbuf[80]; + esc_str(tmpbuf, sizeof(tmpbuf), map[0], strlen(map[0])); + + strcpy(dst + i, map[0]); + i += strlen(map[0]); + } } dst[i] = '\0'; dst_term[j] = '\0'; @@ -363,7 +404,7 @@ static int term_101(ZebraMaps zebra_maps, int reg_type, const char **src, char *dst, int space_split, char *dst_term) { - const char *s0, *s1; + const char *s0; const char **map; int i = 0; int j = 0; @@ -381,17 +422,33 @@ static int term_101(ZebraMaps zebra_maps, int reg_type, } else { - s1 = s0; - map = zebra_maps_input(zebra_maps, reg_type, &s0, strlen(s0), 0); + const char *s1 = s0; + int q_map_match = 0; + map = zebra_maps_search(zebra_maps, reg_type, &s0, strlen(s0), + &q_map_match); if (space_split && **map == *CHR_SPACE) break; - while (s1 < s0) - { - if (strchr(REGEX_CHARS, *s1)) - dst[i++] = '\\'; - dst_term[j++] = *s1; - dst[i++] = *s1++; - } + + /* add non-space char */ + memcpy(dst_term+j, s1, s0 - s1); + j += (s0 - s1); + if (!q_map_match) + { + while (s1 < s0) + { + if (strchr(REGEX_CHARS, *s1)) + dst[i++] = '\\'; + dst[i++] = *s1++; + } + } + else + { + char tmpbuf[80]; + esc_str(tmpbuf, sizeof(tmpbuf), map[0], strlen(map[0])); + + strcpy(dst + i, map[0]); + i += strlen(map[0]); + } } } dst[i] = '\0'; @@ -407,7 +464,7 @@ static int term_103(ZebraMaps zebra_maps, int reg_type, const char **src, { int i = 0; int j = 0; - const char *s0, *s1; + const char *s0; const char **map; if (!term_pre(zebra_maps, reg_type, src, "^\\()[].*+?|", "(", !space_split)) @@ -430,22 +487,39 @@ static int term_103(ZebraMaps zebra_maps, int reg_type, const char **src, } else { - s1 = s0; - map = zebra_maps_input(zebra_maps, reg_type, &s0, strlen(s0), 0); - if (**map == *CHR_SPACE) + const char *s1 = s0; + int q_map_match = 0; + map = zebra_maps_search(zebra_maps, reg_type, &s0, strlen(s0), + &q_map_match); + if (space_split && **map == *CHR_SPACE) break; - while (s1 < s0) - { - if (strchr(REGEX_CHARS, *s1)) - dst[i++] = '\\'; - dst_term[j++] = *s1; - dst[i++] = *s1++; - } + + /* add non-space char */ + memcpy(dst_term+j, s1, s0 - s1); + j += (s0 - s1); + if (!q_map_match) + { + while (s1 < s0) + { + if (strchr(REGEX_CHARS, *s1)) + dst[i++] = '\\'; + dst[i++] = *s1++; + } + } + else + { + char tmpbuf[80]; + esc_str(tmpbuf, sizeof(tmpbuf), map[0], strlen(map[0])); + + strcpy(dst + i, map[0]); + i += strlen(map[0]); + } } } dst[i] = '\0'; dst_term[j] = '\0'; *src = s0; + return i; } @@ -1173,6 +1247,12 @@ static int string_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt, } if (attr_ok) { + char buf[80]; + const char *input = term_dict + prefix_len; + esc_str(buf, sizeof(buf), input, strlen(input)); + } + if (attr_ok) + { yaz_log(log_level_rpn, "dict_lookup_grep: %s", term_dict+prefix_len); r = dict_lookup_grep(zh->reg->dict, term_dict, regex_range, grep_info, &max_pos, init_pos, @@ -1915,7 +1995,7 @@ static RSET xpath_trunc(ZebraHandle zh, NMEM stream, grep_info.isam_p_indx = 0; r = dict_lookup_grep(zh->reg->dict, term_dict, 0, &grep_info, &max_pos, 0, grep_handle); - yaz_log (YLOG_LOG, "%s %d positions", term, + yaz_log (YLOG_DEBUG, "%s %d positions", term, grep_info.isam_p_indx); rset = rset_trunc(zh, grep_info.isam_p_buf, grep_info.isam_p_indx, term, strlen(term), diff --git a/test/charmap/charmap1.c b/test/charmap/charmap1.c index ad3db3d..137e872 100644 --- a/test/charmap/charmap1.c +++ b/test/charmap/charmap1.c @@ -1,4 +1,4 @@ -/* $Id: charmap1.c,v 1.3 2005-01-15 19:38:35 adam Exp $ +/* $Id: charmap1.c,v 1.4 2005-03-11 17:56:36 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -46,5 +46,17 @@ int main(int argc, char **argv) do_query(__LINE__, zh, "@term string ḥ", 1); + /* search for UNICODE A ring */ + do_query(__LINE__, zh, "@term string lås", 1); + + /* search for aa */ + do_query(__LINE__, zh, "@term string laas", 1); + + /* search for aa (regular) */ + do_query(__LINE__, zh, "@attr 5=102 @term string lås", 1); + + /* search for aaa */ + do_query(__LINE__, zh, "@term string laaas", 0); + return close_down(zh, zs, 0); } diff --git a/test/charmap/string.utf8.chr b/test/charmap/string.utf8.chr index d67402f..aaa2cbf 100644 --- a/test/charmap/string.utf8.chr +++ b/test/charmap/string.utf8.chr @@ -1,4 +1,4 @@ -# $Id: string.utf8.chr,v 1.1 2004-03-09 15:12:15 adam Exp $ +# $Id: string.utf8.chr,v 1.2 2005-03-11 17:56:36 adam Exp $ # Define the basic value-set. *Beware* of changing this without re-indexing # your databases. @@ -15,10 +15,10 @@ space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~ # Characters to be considered equivalent for searching purposes. -# equivalent æä(ae) -# equivalent øö(oe) -# equivalent å(aa) -# equivalent uü +equivalent æä(ae) +equivalent øö(oe) +equivalent å(aa) +equivalent uü # Supplemental mappings @@ -26,3 +26,6 @@ space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~ map \L1E25 h # Latin letter with H with dot below map \L1E24 h + + + diff --git a/test/charmap/x.xml b/test/charmap/x.xml index cf5970f..93405cf 100644 --- a/test/charmap/x.xml +++ b/test/charmap/x.xml @@ -2,6 +2,7 @@ h æ + laas <Acronym> UUCCSEIS </Acronym> diff --git a/util/charmap.c b/util/charmap.c index 8a5d2b8..bfcd963 100644 --- a/util/charmap.c +++ b/util/charmap.c @@ -1,4 +1,4 @@ -/* $Id: charmap.c,v 1.35 2005-01-16 23:14:58 adam Exp $ +/* $Id: charmap.c,v 1.36 2005-03-11 17:56:36 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -76,6 +76,15 @@ typedef struct chrwork } chrwork; /* + * Callback for equivalent stuff + */ +typedef struct +{ + NMEM nmem; + int no_eq; + char *eq[CHR_MAXEQUIV]; +} chr_equiv_work; +/* * Add an entry to the character map. */ static chr_t_entry *set_map_string(chr_t_entry *root, NMEM nmem, @@ -211,6 +220,20 @@ const char **chr_map_input(chrmaptab maptab, const char **from, int len, int fir return (const char **) (res->target); } +const char **chr_map_q_input(chrmaptab maptab, + const char **from, int len, int first) +{ + chr_t_entry *t = maptab->q_input; + chr_t_entry *res; + int len_tmp[2]; + + len_tmp[0] = len; + len_tmp[1] = -1; + if (!(res = find_entry_x(t, from, len_tmp, first))) + return 0; + return (const char **) (res->target); +} + const char *chr_map_output(chrmaptab maptab, const char **from, int len) { unsigned char c = ** (unsigned char **) from; @@ -399,6 +422,18 @@ static void fun_mkstring(const char *s, void *data, int num) } /* + * Create an unmodified string (scan_string handler). + */ +static void fun_add_equivalent_string(const char *s, void *data, int num) +{ + chr_equiv_work *arg = (chr_equiv_work *) data; + + if (arg->no_eq == CHR_MAXEQUIV) + return; + arg->eq[arg->no_eq++] = nmem_strdup(arg->nmem, s); +} + +/* * Add a map to the string contained in the argument. */ static void fun_add_map(const char *s, void *data, int num) @@ -413,21 +448,6 @@ static void fun_add_map(const char *s, void *data, int num) yaz_log (YLOG_DEBUG, " %3d", (unsigned char) *s); } -/* - * Add a query map to the string contained in the argument. - */ -static void fun_add_qmap(const char *s, void *data, int num) -{ - chrwork *arg = (chrwork *) data; - - assert(arg->map->q_input); - yaz_log (YLOG_DEBUG, "set qmap %.*s", (int) strlen(s), s); - set_map_string(arg->map->q_input, arg->map->nmem, s, - strlen(s), arg->string, 0); - for (s = arg->string; *s; s++) - yaz_log (YLOG_DEBUG, " %3d", (unsigned char) *s); -} - static int scan_to_utf8 (yaz_iconv_t t, ucs4_t *from, size_t inlen, char *outbuf, size_t outbytesleft) { @@ -690,29 +710,58 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, ++errors; } } - else if (!yaz_matchstr(argv[0], "qmap")) + else if (!yaz_matchstr(argv[0], "equivalent")) { - chrwork buf; + chr_equiv_work w; - if (argc != 3) + if (argc != 2) { - yaz_log(YLOG_FATAL, "charmap directive qmap requires 2 args"); + yaz_log(YLOG_FATAL, "equivalent requires 1 argument"); ++errors; } - buf.map = res; - buf.string[0] = '\0'; - if (scan_string(argv[2], t_unicode, t_utf8, - fun_mkstring, &buf, 0) < 0) + w.nmem = res->nmem; + w.no_eq = 0; + if (scan_string(argv[1], t_unicode, t_utf8, + fun_add_equivalent_string, &w, 0) < 0) { - yaz_log(YLOG_FATAL, "Bad qmap target"); + yaz_log(YLOG_FATAL, "equivalent: invalid string"); ++errors; } - if (scan_string(argv[1], t_unicode, t_utf8, - fun_add_qmap, &buf, 0) < 0) + else if (w.no_eq == 0) { - yaz_log(YLOG_FATAL, "Bad qmap source"); + yaz_log(YLOG_FATAL, "equivalent: no strings"); ++errors; } + else + { + char *result_str; + int i, slen = 5; + + /* determine length of regular expression */ + for (i = 0; i<w.no_eq; i++) + slen += strlen(w.eq[i]) + 1; + result_str = nmem_malloc(res->nmem, slen + 5); + + /* build the regular expression */ + *result_str = '\0'; + slen = 0; + for (i = 0; i<w.no_eq; i++) + { + result_str[slen++] = i ? '|' : '('; + strcpy(result_str + slen, w.eq[i]); + slen += strlen(w.eq[i]); + } + result_str[slen++] = ')'; + result_str[slen] = '\0'; + + /* each eq will map to this regular expression */ + for (i = 0; i<w.no_eq; i++) + { + set_map_string(res->q_input, res->nmem, + w.eq[i], strlen(w.eq[i]), + result_str, 0); + } + } } else if (!yaz_matchstr(argv[0], "encoding")) { diff --git a/util/zebramap.c b/util/zebramap.c index 005793b..0a95eda 100644 --- a/util/zebramap.c +++ b/util/zebramap.c @@ -1,4 +1,4 @@ -/* $Id: zebramap.c,v 1.39 2005-01-16 23:14:58 adam Exp $ +/* $Id: zebramap.c,v 1.40 2005-03-11 17:56:36 adam Exp $ Copyright (C) 1995-2005 Index Data ApS @@ -305,6 +305,32 @@ const char **zebra_maps_input (ZebraMaps zms, unsigned reg_id, return zms->temp_map_ptr; } +const char **zebra_maps_search(ZebraMaps zms, unsigned reg_id, + const char **from, int len, int *q_map_match) +{ + chrmaptab maptab; + + *q_map_match = 0; + maptab = zebra_charmap_get (zms, reg_id); + if (maptab) + { + const char **map; + map = chr_map_q_input(maptab, from, len, 0); + if (map && map[0]) + { + *q_map_match = 1; + return map; + } + map = chr_map_input(maptab, from, len, 0); + if (map) + return map; + } + zms->temp_map_str[0] = **from; + + (*from)++; + return zms->temp_map_ptr; +} + const char *zebra_maps_output(ZebraMaps zms, unsigned reg_id, const char **from) { -- 1.7.10.4