X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=util%2Fzebramap.c;h=ea467981429d87cf312788f5604f048ace293950;hp=b96e9f0e459ddac5be0600ad5b5fe3b332b27e47;hb=250de4ed23a44f5eb3552db317eef0d0fbe3265c;hpb=6f7dfe3c3f09f7104c1ae7616c9d207edeab308d diff --git a/util/zebramap.c b/util/zebramap.c index b96e9f0..ea46798 100644 --- a/util/zebramap.c +++ b/util/zebramap.c @@ -1,25 +1,25 @@ -/* $Id: zebramap.c,v 1.60 2007-10-30 19:17:15 adam Exp $ - Copyright (C) 1995-2007 - Index Data ApS - - This file is part of the Zebra server. - - Zebra is free software; you can redistribute it and/or modify it under - the terms of the GNU General Public License as published by the Free - Software Foundation; either version 2, or (at your option) any later - version. - - Zebra is distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. - - You should have received a copy of the GNU General Public License - along with Zebra; see the file LICENSE.zebra. If not, write to the - Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. +/* This file is part of the Zebra server. + Copyright (C) 2004-2013 Index Data + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#if HAVE_CONFIG_H +#include +#endif #include #include #include @@ -28,6 +28,9 @@ #include #include +#if YAZ_HAVE_ICU +#include +#endif #include #define ZEBRA_MAP_TYPE_SORT 1 @@ -37,23 +40,31 @@ #define ZEBRA_REPLACE_ANY 300 struct zebra_map { - unsigned reg_id; + const char *id; int completeness; int positioned; int alwaysmatches; int first_in_field; int type; + int use_chain; + int debug; union { struct { - int dummy; - } index; - struct { int entry_size; } sort; } u; chrmaptab maptab; const char *maptab_name; zebra_maps_t zebra_maps; +#if YAZ_HAVE_XML2 + xmlDocPtr doc; +#endif +#if YAZ_HAVE_ICU + struct icu_chain *icu_chain; +#endif + WRBUF input_str; + WRBUF print_str; + size_t simple_off; struct zebra_map *next; }; @@ -63,11 +74,10 @@ struct zebra_maps_s { NMEM nmem; char temp_map_str[2]; const char *temp_map_ptr[2]; - struct zebra_map **lookup_array; WRBUF wrbuf_1; - int no_maps; + int no_files_read; zebra_map_t map_list; - zebra_map_t *last_map; + zebra_map_t last_map; }; void zebra_maps_close(zebra_maps_t zms) @@ -77,6 +87,15 @@ void zebra_maps_close(zebra_maps_t zms) { if (zm->maptab) chrmaptab_destroy(zm->maptab); +#if YAZ_HAVE_ICU + if (zm->icu_chain) + icu_chain_destroy(zm->icu_chain); +#endif +#if YAZ_HAVE_XML2 + xmlFreeDoc(zm->doc); +#endif + wrbuf_destroy(zm->input_str); + wrbuf_destroy(zm->print_str); zm = zm->next; } wrbuf_destroy(zms->wrbuf_1); @@ -90,8 +109,10 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type, zebra_map_t zm = (zebra_map_t) nmem_malloc(zms->nmem, sizeof(*zm)); zm->zebra_maps = zms; - zm->reg_id = index_type[0]; + zm->id = nmem_strdup(zms->nmem, index_type); zm->maptab_name = 0; + zm->use_chain = 0; + zm->debug = 0; zm->maptab = 0; zm->type = map_type; zm->completeness = 0; @@ -99,15 +120,177 @@ zebra_map_t zebra_add_map(zebra_maps_t zms, const char *index_type, zm->alwaysmatches = 0; zm->first_in_field = 0; + if (zms->last_map) + zms->last_map->next = zm; + else + zms->map_list = zm; + zms->last_map = zm; zm->next = 0; - *zms->last_map = zm; - zms->last_map = &zm->next; - - zms->no_maps++; - +#if YAZ_HAVE_ICU + zm->icu_chain = 0; +#endif +#if YAZ_HAVE_XML2 + zm->doc = 0; +#endif + zm->input_str = wrbuf_alloc(); + zm->print_str = wrbuf_alloc(); return zm; } +static int parse_command(zebra_maps_t zms, int argc, char **argv, + const char *fname, int lineno) +{ + zebra_map_t zm = zms->last_map; + if (argc == 1) + { + yaz_log(YLOG_WARN, "%s:%d: Missing arguments for '%s'", + fname, lineno, argv[0]); + return -1; + } + if (argc > 2) + { + yaz_log(YLOG_WARN, "%s:%d: Too many arguments for '%s'", + fname, lineno, argv[0]); + return -1; + } + if (!yaz_matchstr(argv[0], "index")) + { + zm = zebra_add_map(zms, argv[1], ZEBRA_MAP_TYPE_INDEX); + zm->positioned = 1; + } + else if (!yaz_matchstr(argv[0], "sort")) + { + zm = zebra_add_map(zms, argv[1], ZEBRA_MAP_TYPE_SORT); + zm->u.sort.entry_size = 80; + } + else if (!yaz_matchstr(argv[0], "staticrank")) + { + zm = zebra_add_map(zms, argv[1], ZEBRA_MAP_TYPE_STATICRANK); + zm->completeness = 1; + } + else if (!zm) + { + yaz_log(YLOG_WARN, "%s:%d: Missing sort/index before '%s'", + fname, lineno, argv[0]); + return -1; + } + else if (!yaz_matchstr(argv[0], "charmap") && argc == 2) + { + if (zm->type != ZEBRA_MAP_TYPE_STATICRANK) + zm->maptab_name = nmem_strdup(zms->nmem, argv[1]); + else + { + yaz_log(YLOG_WARN|YLOG_FATAL, "%s:%d: charmap for " + "staticrank is invalid", fname, lineno); + yaz_log(YLOG_LOG, "Type is %d", zm->type); + return -1; + } + } + else if (!yaz_matchstr(argv[0], "completeness") && argc == 2) + { + zm->completeness = atoi(argv[1]); + } + else if (!yaz_matchstr(argv[0], "position") && argc == 2) + { + zm->positioned = atoi(argv[1]); + } + else if (!yaz_matchstr(argv[0], "alwaysmatches") && argc == 2) + { + if (zm->type != ZEBRA_MAP_TYPE_STATICRANK) + zm->alwaysmatches = atoi(argv[1]); + else + { + yaz_log(YLOG_WARN|YLOG_FATAL, "%s:%d: alwaysmatches for " + "staticrank is invalid", fname, lineno); + return -1; + } + } + else if (!yaz_matchstr(argv[0], "firstinfield") && argc == 2) + { + zm->first_in_field = atoi(argv[1]); + } + else if (!yaz_matchstr(argv[0], "entrysize") && argc == 2) + { + if (zm->type == ZEBRA_MAP_TYPE_SORT) + zm->u.sort.entry_size = atoi(argv[1]); + else + { + yaz_log(YLOG_WARN, + "%s:%d: entrysize only valid in sort section", + fname, lineno); + return -1; + } + } + else if (!yaz_matchstr(argv[0], "simplechain")) + { + zm->use_chain = 1; +#if YAZ_HAVE_ICU + zm->icu_chain = 0; +#endif + } + else if (!yaz_matchstr(argv[0], "icuchain")) + { + char full_path[1024]; + if (!yaz_filepath_resolve(argv[1], zms->tabpath, zms->tabroot, + full_path)) + { + yaz_log(YLOG_WARN, "%s:%d: Could not locate icuchain config '%s'", + fname, lineno, argv[1]); + return -1; + } +#if YAZ_HAVE_XML2 + zm->doc = xmlParseFile(full_path); + if (!zm->doc) + { + yaz_log(YLOG_WARN, "%s:%d: Could not load icuchain config '%s'", + fname, lineno, argv[1]); + return -1; + } + else + { +#if YAZ_HAVE_ICU + UErrorCode status; + xmlNode *xml_node = xmlDocGetRootElement(zm->doc); + zm->icu_chain = + icu_chain_xml_config(xml_node, +/* not sure about sort for this function yet.. */ +#if 1 + 1, +#else + zm->type == ZEBRA_MAP_TYPE_SORT, +#endif + &status); + if (!zm->icu_chain) + { + yaz_log(YLOG_WARN, "%s:%d: Failed to load ICU chain %s", + fname, lineno, argv[1]); + } + zm->use_chain = 1; +#else + yaz_log(YLOG_WARN, "%s:%d: ICU support unavailable", + fname, lineno); + return -1; +#endif + } +#else + yaz_log(YLOG_WARN, "%s:%d: XML support unavailable", + fname, lineno); + return -1; +#endif + } + else if (!yaz_matchstr(argv[0], "debug") && argc == 2) + { + zm->debug = atoi(argv[1]); + } + else + { + yaz_log(YLOG_WARN, "%s:%d: Unrecognized directive '%s'", + fname, lineno, argv[0]); + return -1; + } + return 0; +} + ZEBRA_RES zebra_maps_read_file(zebra_maps_t zms, const char *fname) { FILE *f; @@ -116,7 +299,6 @@ ZEBRA_RES zebra_maps_read_file(zebra_maps_t zms, const char *fname) int argc; int lineno = 0; int failures = 0; - zebra_map_t zm = 0; if (!(f = yaz_fopen(zms->tabpath, fname, "r", zms->tabroot))) { @@ -125,112 +307,31 @@ ZEBRA_RES zebra_maps_read_file(zebra_maps_t zms, const char *fname) } while ((argc = readconf_line(f, &lineno, line, 512, argv, 10))) { - if (argc == 1) - { - yaz_log(YLOG_WARN, "%s:%d: Missing arguments for '%s'", - fname, lineno, argv[0]); - failures++; - break; - } - if (argc > 2) - { - yaz_log(YLOG_WARN, "%s:%d: Too many arguments for '%s'", - fname, lineno, argv[0]); - failures++; - break; - } - if (!yaz_matchstr(argv[0], "index")) - { - zm = zebra_add_map(zms, argv[1], ZEBRA_MAP_TYPE_INDEX); - zm->positioned = 1; - } - else if (!yaz_matchstr(argv[0], "sort")) - { - zm = zebra_add_map(zms, argv[1], ZEBRA_MAP_TYPE_SORT); - zm->u.sort.entry_size = 80; - } - else if (!yaz_matchstr(argv[0], "staticrank")) - { - zm = zebra_add_map(zms, argv[1], ZEBRA_MAP_TYPE_STATICRANK); - zm->completeness = 1; - } - else if (!zm) - { - yaz_log(YLOG_WARN, "%s:%d: Missing sort/index before '%s'", - fname, lineno, argv[0]); + int r = parse_command(zms, argc, argv, fname, lineno); + if (r) failures++; - } - else if (!yaz_matchstr(argv[0], "charmap") && argc == 2) - { - if (zm->type != ZEBRA_MAP_TYPE_STATICRANK) - zm->maptab_name = nmem_strdup(zms->nmem, argv[1]); - else - { - yaz_log(YLOG_WARN|YLOG_FATAL, "%s:%d: charmap for " - "staticrank is invalid", fname, lineno); - yaz_log(YLOG_LOG, "Type is %d", zm->type); - failures++; - } - } - else if (!yaz_matchstr(argv[0], "completeness") && argc == 2) - { - zm->completeness = atoi(argv[1]); - } - else if (!yaz_matchstr(argv[0], "position") && argc == 2) - { - zm->positioned = atoi(argv[1]); - } - else if (!yaz_matchstr(argv[0], "alwaysmatches") && argc == 2) - { - if (zm->type != ZEBRA_MAP_TYPE_STATICRANK) - zm->alwaysmatches = atoi(argv[1]); - else - { - yaz_log(YLOG_WARN|YLOG_FATAL, "%s:%d: alwaysmatches for " - "staticrank is invalid", fname, lineno); - failures++; - } - } - else if (!yaz_matchstr(argv[0], "firstinfield") && argc == 2) - { - zm->first_in_field = atoi(argv[1]); - } - else if (!yaz_matchstr(argv[0], "entrysize") && argc == 2) - { - if (zm->type == ZEBRA_MAP_TYPE_SORT) - zm->u.sort.entry_size = atoi(argv[1]); - } - else - { - yaz_log(YLOG_WARN, "%s:%d: Unrecognized directive '%s'", - fname, lineno, argv[0]); - failures++; - } } yaz_fclose(f); - for (zm = zms->map_list; zm; zm = zm->next) - zms->lookup_array[zm->reg_id] = zm; - if (failures) return ZEBRA_FAIL; + + (zms->no_files_read)++; return ZEBRA_OK; } zebra_maps_t zebra_maps_open(Res res, const char *base_path, - const char *profile_path) + const char *profile_path) { zebra_maps_t zms = (zebra_maps_t) xmalloc(sizeof(*zms)); - int i; zms->nmem = nmem_create(); - zms->no_maps = 0; zms->tabpath = profile_path ? nmem_strdup(zms->nmem, profile_path) : 0; zms->tabroot = 0; if (base_path) zms->tabroot = nmem_strdup(zms->nmem, base_path); zms->map_list = 0; - zms->last_map = &zms->map_list; + zms->last_map = 0; zms->temp_map_str[0] = '\0'; zms->temp_map_str[1] = '\0'; @@ -238,47 +339,41 @@ zebra_maps_t zebra_maps_open(Res res, const char *base_path, zms->temp_map_ptr[0] = zms->temp_map_str; zms->temp_map_ptr[1] = NULL; - zms->lookup_array = (zebra_map_t *) - nmem_malloc(zms->nmem, sizeof(*zms->lookup_array)*256); zms->wrbuf_1 = wrbuf_alloc(); - for (i = 0; i<256; i++) - zms->lookup_array[i] = 0; + zms->no_files_read = 0; return zms; } -zebra_map_t zebra_map_get(zebra_maps_t zms, unsigned reg_id) +void zebra_maps_define_default_sort(zebra_maps_t zms) { - assert(reg_id >= 0 && reg_id <= 255); - return zms->lookup_array[reg_id]; + zebra_map_t zm = zebra_add_map(zms, "s", ZEBRA_MAP_TYPE_SORT); + zm->u.sort.entry_size = 80; } -zebra_map_t zebra_map_get_or_add(zebra_maps_t zms, unsigned reg_id) +zebra_map_t zebra_map_get(zebra_maps_t zms, const char *id) { - struct zebra_map *zm = zebra_map_get(zms, reg_id); + zebra_map_t zm; + for (zm = zms->map_list; zm; zm = zm->next) + if (!strcmp(zm->id, id)) + break; + return zm; +} + +zebra_map_t zebra_map_get_or_add(zebra_maps_t zms, const char *id) +{ + struct zebra_map *zm = zebra_map_get(zms, id); if (!zm) { - char name[2]; - name[0] = reg_id; - name[1] = '\0'; - - zm = zebra_add_map(zms, name, ZEBRA_MAP_TYPE_INDEX); - - /* no reason to warn if no maps are installed at ALL - Note that zebra_add_maps increments no_maps .. - */ - if (zms->no_maps > 1) - yaz_log(YLOG_WARN, "Unknown register type: %c", reg_id); - else - zms->no_maps = 0; + zm = zebra_add_map(zms, id, ZEBRA_MAP_TYPE_INDEX); + + /* no reason to warn if no maps are read from file */ + if (zms->no_files_read) + yaz_log(YLOG_WARN, "Unknown register type: %s", id); zm->maptab_name = nmem_strdup(zms->nmem, "@"); zm->completeness = 0; zm->positioned = 1; - zm->next = zms->map_list; - zms->map_list = zm->next; - - zms->lookup_array[zm->reg_id & 255] = zm; } return zm; } @@ -306,7 +401,7 @@ const char **zebra_maps_input(zebra_map_t zm, chrmaptab maptab = zebra_charmap_get(zm); if (maptab) return chr_map_input(maptab, from, len, first); - + zm->zebra_maps->temp_map_str[0] = **from; (*from)++; @@ -317,7 +412,7 @@ const char **zebra_maps_search(zebra_map_t zm, const char **from, int len, int *q_map_match) { chrmaptab maptab; - + *q_map_match = 0; maptab = zebra_charmap_get(zm); if (maptab) @@ -352,7 +447,7 @@ const char *zebra_maps_output(zebra_map_t zm, /* ------------------------------------ */ int zebra_maps_is_complete(zebra_map_t zm) -{ +{ if (zm) return zm->completeness; return 0; @@ -378,7 +473,7 @@ int zebra_maps_is_staticrank(zebra_map_t zm) return zm->type == ZEBRA_MAP_TYPE_STATICRANK; return 0; } - + int zebra_maps_is_sort(zebra_map_t zm) { if (zm) @@ -477,11 +572,11 @@ int zebra_maps_attr(zebra_maps_t zms, Z_AttributesPlusTerm *zapt, break; case 106: /* document-text */ *search_type = "or-list"; - break; + break; case -1: case 1: /* phrase */ case 2: /* word */ - case 108: /* string */ + case 108: /* string */ *search_type = "phrase"; break; case 107: /* local-number */ @@ -528,9 +623,135 @@ WRBUF zebra_replace(zebra_map_t zm, const char *ex_list, return zm->zebra_maps->wrbuf_1; } +#define SE_CHARS ";,.()-/?<> \r\n\t" + +static int tokenize_simple(zebra_map_t zm, + const char **result_buf, size_t *result_len) +{ + char *buf = wrbuf_buf(zm->input_str); + size_t len = wrbuf_len(zm->input_str); + size_t i = zm->simple_off; + size_t start; + + while (i < len && strchr(SE_CHARS, buf[i])) + i++; + start = i; + while (i < len && !strchr(SE_CHARS, buf[i])) + { + if (buf[i] > 32 && buf[i] < 127) + buf[i] = tolower(buf[i]); + i++; + } + + zm->simple_off = i; + if (start != i) + { + *result_buf = buf + start; + *result_len = i - start; + return 1; + } + return 0; + } + + +int zebra_map_tokenize_next(zebra_map_t zm, + const char **result_buf, size_t *result_len, + const char **display_buf, size_t *display_len) +{ + assert(zm->use_chain); + +#if YAZ_HAVE_ICU + if (!zm->icu_chain) + return tokenize_simple(zm, result_buf, result_len); + else + { + UErrorCode status; + while (icu_chain_next_token(zm->icu_chain, &status)) + { + if (!U_SUCCESS(status)) + return 0; + *result_buf = icu_chain_token_sortkey(zm->icu_chain); + assert(*result_buf); + + *result_len = strlen(*result_buf); + + if (display_buf) + { + *display_buf = icu_chain_token_display(zm->icu_chain); + if (display_len) + *display_len = strlen(*display_buf); + } + if (zm->debug) + { + wrbuf_rewind(zm->print_str); + wrbuf_write_escaped(zm->print_str, *result_buf, *result_len); + yaz_log(YLOG_LOG, "output %s", wrbuf_cstr(zm->print_str)); + } + + if (**result_buf != '\0') + return 1; + } + } + return 0; +#else + return tokenize_simple(zm, result_buf, result_len); +#endif +} + +int zebra_map_tokenize_start(zebra_map_t zm, + const char *buf, size_t len) +{ +#if YAZ_HAVE_ICU + int ret; +#endif + assert(zm->use_chain); + + wrbuf_rewind(zm->input_str); + wrbuf_write(zm->input_str, buf, len); + zm->simple_off = 0; +#if YAZ_HAVE_ICU + if (zm->icu_chain) + { + UErrorCode status; + if (zm->debug) + { + wrbuf_rewind(zm->print_str); + wrbuf_write_escaped(zm->print_str, wrbuf_buf(zm->input_str), + wrbuf_len(zm->input_str)); + + yaz_log(YLOG_LOG, "input %s", + wrbuf_cstr(zm->print_str)); + } + ret = icu_chain_assign_cstr(zm->icu_chain, + wrbuf_cstr(zm->input_str), &status); + if (!ret && !U_SUCCESS(status)) + { + if (zm->debug) + { + yaz_log(YLOG_WARN, "bad encoding for input"); + } + return -1; + } + } +#endif + return 0; +} + +int zebra_maps_is_icu(zebra_map_t zm) +{ + assert(zm); +#if YAZ_HAVE_ICU + return zm->use_chain; +#else + return 0; +#endif +} + + /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab