X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=util%2Fzebramap.c;h=b92566bce52d4b79a1a40f8207613671923569f6;hb=4d9f66ed35ee70e5670cd67cc9e2da7b5de93bf7;hp=08870cf1f03502f4d19c05dfeb3c027985733861;hpb=18dd1a93498eca4352323fb277ea437d24106b1f;p=idzebra-moved-to-github.git diff --git a/util/zebramap.c b/util/zebramap.c index 08870cf..b92566b 100644 --- a/util/zebramap.c +++ b/util/zebramap.c @@ -1,4 +1,4 @@ -/* $Id: zebramap.c,v 1.62 2007-11-05 11:27:24 adam Exp $ +/* $Id: zebramap.c,v 1.67 2007-11-07 11:22:58 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -28,6 +28,9 @@ #include #include +#if HAVE_ICU +#include +#endif #include #define ZEBRA_MAP_TYPE_SORT 1 @@ -43,6 +46,7 @@ struct zebra_map { int alwaysmatches; int first_in_field; int type; + int use_chain; union { struct { int entry_size; @@ -50,7 +54,16 @@ struct zebra_map { } u; chrmaptab maptab; const char *maptab_name; + const char *locale; zebra_maps_t zebra_maps; +#if YAZ_HAVE_XML2 + xmlDocPtr doc; +#endif +#if HAVE_ICU + struct icu_chain *icu_chain; +#endif + WRBUF simple_buf; + size_t simple_off; struct zebra_map *next; }; @@ -61,7 +74,7 @@ struct zebra_maps_s { char temp_map_str[2]; const char *temp_map_ptr[2]; WRBUF wrbuf_1; - int no_maps; + int no_files_read; zebra_map_t map_list; zebra_map_t last_map; }; @@ -73,6 +86,14 @@ void zebra_maps_close(zebra_maps_t zms) { if (zm->maptab) chrmaptab_destroy(zm->maptab); +#if HAVE_ICU + if (zm->icu_chain) + icu_chain_destroy(zm->icu_chain); +#endif +#if YAZ_HAVE_XML2 + xmlFreeDoc(zm->doc); +#endif + wrbuf_destroy(zm->simple_buf); zm = zm->next; } wrbuf_destroy(zms->wrbuf_1); @@ -88,6 +109,8 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type, zm->zebra_maps = zms; zm->id = nmem_strdup(zms->nmem, index_type); zm->maptab_name = 0; + zm->use_chain = 0; + zm->locale = 0; zm->maptab = 0; zm->type = map_type; zm->completeness = 0; @@ -101,9 +124,13 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type, zms->map_list = zm; zms->last_map = zm; zm->next = 0; - - zms->no_maps++; - +#if HAVE_ICU + zm->icu_chain = 0; +#endif +#if YAZ_HAVE_XML2 + zm->doc = 0; +#endif + zm->simple_buf = wrbuf_alloc(); return zm; } @@ -191,6 +218,65 @@ static int parse_command(zebra_maps_t zms, int argc, char **argv, return -1; } } + else if (!yaz_matchstr(argv[0], "locale")) + { + zm->locale = nmem_strdup(zms->nmem, argv[1]); + } + else if (!yaz_matchstr(argv[0], "simplechain")) + { + zm->use_chain = 1; +#if HAVE_ICU + zm->icu_chain = 0; +#endif + } + else if (!yaz_matchstr(argv[0], "icuchain")) + { +#if YAZ_HAVE_XML2 + if (!zm->locale) + { + yaz_log(YLOG_WARN, "%s:%d: locale required before icuchain", + fname, lineno); + return -1; + } + zm->doc = xmlParseFile(argv[1]); + if (!zm->doc) + { + yaz_log(YLOG_WARN, "%s:%d: Could not load icuchain config '%s'", + fname, lineno, argv[1]); + return -1; + } + else + { +#if HAVE_ICU + UErrorCode status; + xmlNode *xml_node = xmlDocGetRootElement(zm->doc); + zm->icu_chain = + icu_chain_xml_config(xml_node, zm->locale, +/* not sure about sort for this function yet.. */ +#if 1 + 1, +#else + zm->type == ZEBRA_MAP_TYPE_SORT, +#endif + &status); + if (!zm->icu_chain) + { + yaz_log(YLOG_WARN, "%s:%d: Failed to load ICU chain %s", + fname, lineno, argv[1]); + } + zm->use_chain = 1; +#else + yaz_log(YLOG_WARN, "%s:%d: ICU support unavailable", + fname, lineno); + return -1; +#endif + } +#else + yaz_log(YLOG_WARN, "%s:%d: XML support unavailable", + fname, lineno); + return -1; +#endif + } else { yaz_log(YLOG_WARN, "%s:%d: Unrecognized directive '%s'", @@ -224,6 +310,8 @@ ZEBRA_RES zebra_maps_read_file(zebra_maps_t zms, const char *fname) if (failures) return ZEBRA_FAIL; + + (zms->no_files_read)++; return ZEBRA_OK; } @@ -233,7 +321,6 @@ zebra_maps_t zebra_maps_open(Res res, const char *base_path, zebra_maps_t zms = (zebra_maps_t) xmalloc(sizeof(*zms)); zms->nmem = nmem_create(); - zms->no_maps = 0; zms->tabpath = profile_path ? nmem_strdup(zms->nmem, profile_path) : 0; zms->tabroot = 0; if (base_path) @@ -249,6 +336,7 @@ zebra_maps_t zebra_maps_open(Res res, const char *base_path, zms->wrbuf_1 = wrbuf_alloc(); + zms->no_files_read = 0; return zms; } @@ -268,13 +356,9 @@ zebra_map_t zebra_map_get_or_add(zebra_maps_t zms, const char *id) { zm = zebra_add_map(zms, id, ZEBRA_MAP_TYPE_INDEX); - /* no reason to warn if no maps are installed at ALL - Note that zebra_add_maps increments no_maps .. - */ - if (zms->no_maps > 1) + /* no reason to warn if no maps are read from file */ + if (zms->no_files_read) yaz_log(YLOG_WARN, "Unknown register type: %s", id); - else - zms->no_maps = 0; zm->maptab_name = nmem_strdup(zms->nmem, "@"); zm->completeness = 0; @@ -528,6 +612,91 @@ WRBUF zebra_replace(zebra_map_t zm, const char *ex_list, return zm->zebra_maps->wrbuf_1; } +#define SE_CHARS ";,.()-/?<> \r\n\t" + +static int tokenize_simple(zebra_map_t zm, + const char **result_buf, size_t *result_len) +{ + char *buf = wrbuf_buf(zm->simple_buf); + size_t len = wrbuf_len(zm->simple_buf); + size_t i = zm->simple_off; + size_t start; + + while (i < len && strchr(SE_CHARS, buf[i])) + i++; + start = i; + while (i < len && !strchr(SE_CHARS, buf[i])) + { + if (buf[i] > 32 && buf[i] < 127) + buf[i] = tolower(buf[i]); + i++; + } + + zm->simple_off = i; + if (start != i) + { + *result_buf = buf + start; + *result_len = i - start; + return 1; + } + return 0; + } + +int zebra_map_tokenize(zebra_map_t zm, + const char *buf, size_t len, + const char **result_buf, size_t *result_len) +{ + assert(zm->use_chain); + + if (buf) + { + wrbuf_rewind(zm->simple_buf); + wrbuf_write(zm->simple_buf, buf, len); + zm->simple_off = 0; + } + +#if HAVE_ICU + if (!zm->icu_chain) + return tokenize_simple(zm, result_buf, result_len); + else + { + UErrorCode status; + if (buf) + { + yaz_log(YLOG_LOG, "assicn_cstr %s", wrbuf_cstr(zm->simple_buf)); + icu_chain_assign_cstr(zm->icu_chain, + wrbuf_cstr(zm->simple_buf), + &status); + assert(U_SUCCESS(status)); + } + while (icu_chain_next_token(zm->icu_chain, &status)) + { + assert(U_SUCCESS(status)); + *result_buf = icu_chain_token_norm(zm->icu_chain); + assert(*result_buf); + yaz_log(YLOG_LOG, "got result %s", *result_buf); + *result_len = strlen(*result_buf); + if (**result_buf != '\0') + return 1; + } + assert(U_SUCCESS(status)); + } + return 0; +#else + return tokenize_simple(zm, result_buf, result_len); +#endif +} + +int zebra_maps_is_icu(zebra_map_t zm) +{ +#if HAVE_ICU + return zm->use_chain; +#else + return 0; +#endif +} + + /* * Local variables: * c-basic-offset: 4