X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;ds=sidebyside;f=src%2Fcharsets.c;h=5fa6c39d16064daedb686e1fc4a45880fc581d27;hb=8961ed761e348e972f00d015284ce75c16b1648c;hp=397e73b8d337a85083b28a82592d2a2036ea5211;hpb=6a3018550c63b06a788bfeae83de16318965d43b;p=pazpar2-moved-to-github.git diff --git a/src/charsets.c b/src/charsets.c index 397e73b..5fa6c39 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -1,5 +1,5 @@ /* This file is part of Pazpar2. - Copyright (C) 2006-2011 Index Data + Copyright (C) 2006-2012 Index Data Pazpar2 is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -45,32 +45,32 @@ static pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node); static pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn); static pp2_charset_t pp2_charset_create_a_to_z(void); static void pp2_charset_destroy(pp2_charset_t pct); -static pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct); +static pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct); /* charset handle */ struct pp2_charset_s { - const char *(*token_next_handler)(pp2_relevance_token_t prt); - const char *(*get_sort_handler)(pp2_relevance_token_t prt); - const char *(*get_display_handler)(pp2_relevance_token_t prt); + const char *(*token_next_handler)(pp2_charset_token_t prt); + const char *(*get_sort_handler)(pp2_charset_token_t prt); + const char *(*get_display_handler)(pp2_charset_token_t prt); #if YAZ_HAVE_ICU struct icu_chain * icu_chn; UErrorCode icu_sts; #endif }; -static const char *pp2_relevance_token_null(pp2_relevance_token_t prt); -static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); -static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt); -static const char *pp2_get_display_ascii(pp2_relevance_token_t prt); +static const char *pp2_charset_token_null(pp2_charset_token_t prt); +static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt); +static const char *pp2_get_sort_ascii(pp2_charset_token_t prt); +static const char *pp2_get_display_ascii(pp2_charset_token_t prt); #if YAZ_HAVE_ICU -static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); -static const char *pp2_get_sort_icu(pp2_relevance_token_t prt); -static const char *pp2_get_display_icu(pp2_relevance_token_t prt); +static const char *pp2_charset_token_icu(pp2_charset_token_t prt); +static const char *pp2_get_sort_icu(pp2_charset_token_t prt); +static const char *pp2_get_display_icu(pp2_charset_token_t prt); #endif /* tokenzier handle */ -struct pp2_relevance_token_s { +struct pp2_charset_token_s { const char *cp; /* unnormalized buffer we're tokenizing */ const char *last_cp; /* pointer to last token we're dealing with */ pp2_charset_t pct; /* our main charset handle (type+config) */ @@ -160,23 +160,26 @@ int pp2_charset_fact_define(pp2_charset_fact_t pft, { int r; pp2_charset_t pct; - xmlChar *id; + xmlChar *id = 0; assert(xml_node); pct = pp2_charset_create_xml(xml_node); if (!pct) return -1; - id = xmlGetProp(xml_node, (xmlChar*) "id"); - if (id) - default_id = (const char *) id; if (!default_id) { - yaz_log(YLOG_WARN, "Missing id for icu_chain"); - pp2_charset_destroy(pct); - return -1; + id = xmlGetProp(xml_node, (xmlChar*) "id"); + if (!id) + { + yaz_log(YLOG_WARN, "Missing id for icu_chain"); + pp2_charset_destroy(pct); + return -1; + } + default_id = (const char *) id; } r = pp2_charset_fact_add(pft, pct, default_id); - xmlFree(id); + if (id) + xmlFree(id); return r; } @@ -198,7 +201,7 @@ pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node) //xmlChar *xmlstr = 0; //int size = 0; //xmlDocDumpMemory(icu_doc, size); - + yaz_log(YLOG_FATAL, "Could not parse ICU chain config:\n" "<%s>\n ... \n", xml_node->name, xml_node->name); @@ -209,7 +212,7 @@ pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node) yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n" "<%s>\n ... \n", xml_node->name, xml_node->name); - yaz_log(YLOG_FATAL, + yaz_log(YLOG_FATAL, "But no ICU support is compiled into the YAZ library."); return 0; #endif // YAZ_HAVE_ICU @@ -218,7 +221,7 @@ pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node) pp2_charset_t pp2_charset_create_a_to_z(void) { pp2_charset_t pct = pp2_charset_create(0); - pct->token_next_handler = pp2_relevance_token_a_to_z; + pct->token_next_handler = pp2_charset_token_a_to_z; return pct; } @@ -226,7 +229,7 @@ pp2_charset_t pp2_charset_create(struct icu_chain *icu_chn) { pp2_charset_t pct = xmalloc(sizeof(*pct)); - pct->token_next_handler = pp2_relevance_token_null; + pct->token_next_handler = pp2_charset_token_null; pct->get_sort_handler = pp2_get_sort_ascii; pct->get_display_handler = pp2_get_display_ascii; #if YAZ_HAVE_ICU @@ -235,7 +238,7 @@ pp2_charset_t pp2_charset_create(struct icu_chain *icu_chn) { pct->icu_chn = icu_chn; pct->icu_sts = U_ZERO_ERROR; - pct->token_next_handler = pp2_relevance_token_icu; + pct->token_next_handler = pp2_charset_token_icu; pct->get_sort_handler = pp2_get_sort_icu; pct->get_display_handler = pp2_get_display_icu; } @@ -251,19 +254,19 @@ void pp2_charset_destroy(pp2_charset_t pct) xfree(pct); } -pp2_relevance_token_t pp2_relevance_create(pp2_charset_fact_t pft, - const char *id) +pp2_charset_token_t pp2_charset_token_create(pp2_charset_fact_t pft, + const char *id) { struct pp2_charset_entry *pce; for (pce = pft->list; pce; pce = pce->next) if (!strcmp(id, pce->name)) - return pp2_relevance_tokenize(pce->pct); + return pp2_charset_tokenize(pce->pct); return 0; } -pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct) +pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct) { - pp2_relevance_token_t prt = xmalloc(sizeof(*prt)); + pp2_charset_token_t prt = xmalloc(sizeof(*prt)); assert(pct); @@ -281,17 +284,16 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct) return prt; } -void pp2_relevance_first(pp2_relevance_token_t prt, - const char *buf, - int skip_article) -{ +void pp2_charset_token_first(pp2_charset_token_t prt, + const char *buf, int skip_article) +{ if (skip_article) { const char *p = buf; char firstword[64]; char *pout = firstword; char articles[] = "the den der die des an a "; // must end in space - + for (; *p && *p != ' ' && pout - firstword < (sizeof(firstword)-2); p++) *pout++ = tolower(*(unsigned char *)p); *pout++ = ' '; @@ -313,32 +315,32 @@ void pp2_relevance_first(pp2_relevance_token_t prt, #endif // YAZ_HAVE_ICU } -void pp2_relevance_token_destroy(pp2_relevance_token_t prt) +void pp2_charset_token_destroy(pp2_charset_token_t prt) { assert(prt); #if YAZ_HAVE_ICU if (prt->iter) icu_iter_destroy(prt->iter); #endif - if(prt->norm_str) + if(prt->norm_str) wrbuf_destroy(prt->norm_str); - if(prt->sort_str) + if(prt->sort_str) wrbuf_destroy(prt->sort_str); xfree(prt); } -const char *pp2_relevance_token_next(pp2_relevance_token_t prt) +const char *pp2_charset_token_next(pp2_charset_token_t prt) { assert(prt); return (prt->pct->token_next_handler)(prt); } -const char *pp2_get_sort(pp2_relevance_token_t prt) +const char *pp2_get_sort(pp2_charset_token_t prt) { return prt->pct->get_sort_handler(prt); } -const char *pp2_get_display(pp2_relevance_token_t prt) +const char *pp2_get_display(pp2_charset_token_t prt) { return prt->pct->get_display_handler(prt); } @@ -347,7 +349,7 @@ const char *pp2_get_display(pp2_relevance_token_t prt) /* original tokenizer with our tokenize interface, but we add +1 to ensure no '\0' are in our string (except for EOF) */ -static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) +static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt) { const char *cp = prt->cp; int c; @@ -374,7 +376,7 @@ static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt) return wrbuf_cstr(prt->norm_str); } -static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt) +static const char *pp2_get_sort_ascii(pp2_charset_token_t prt) { if (prt->last_cp == 0) return 0; @@ -383,7 +385,7 @@ static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt) char *tmp = xstrdup(prt->last_cp); char *result = 0; result = normalize7bit_mergekey(tmp); - + wrbuf_rewind(prt->sort_str); wrbuf_puts(prt->sort_str, result); xfree(tmp); @@ -391,7 +393,7 @@ static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt) } } -static const char *pp2_get_display_ascii(pp2_relevance_token_t prt) +static const char *pp2_get_display_ascii(pp2_charset_token_t prt) { if (prt->last_cp == 0) return 0; @@ -401,7 +403,7 @@ static const char *pp2_get_display_ascii(pp2_relevance_token_t prt) } } -static const char *pp2_relevance_token_null(pp2_relevance_token_t prt) +static const char *pp2_charset_token_null(pp2_charset_token_t prt) { const char *cp = prt->cp; @@ -413,7 +415,7 @@ static const char *pp2_relevance_token_null(pp2_relevance_token_t prt) } #if YAZ_HAVE_ICU -static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) +static const char *pp2_charset_token_icu(pp2_charset_token_t prt) { if (icu_iter_next(prt->iter)) { @@ -422,12 +424,12 @@ static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) return 0; } -static const char *pp2_get_sort_icu(pp2_relevance_token_t prt) +static const char *pp2_get_sort_icu(pp2_charset_token_t prt) { return icu_iter_get_sortkey(prt->iter); } -static const char *pp2_get_display_icu(pp2_relevance_token_t prt) +static const char *pp2_get_display_icu(pp2_charset_token_t prt) { return icu_iter_get_display(prt->iter); }