From 57b393132b1e795da47e50c13260a1346c8029e9 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 25 Dec 2008 15:37:18 +0100 Subject: [PATCH] Using the ICU wrapper from YAZ rather than its own. Using the ICU wrapper from YAZ rather than its own. The behavior is similar but the ICU XML format is changed a bit. YAZ 3.0.39 produces proper warnings so that content can be updated. --- NEWS | 5 + configure.ac | 3 +- debian/control | 3 +- doc/pazpar2_conf.xml | 13 +- src/Makefile.am | 28 +- src/charsets.c | 45 +- src/client.c | 2 +- src/icu_I18N.c | 1220 -------------------------------------------------- src/icu_I18N.h | 282 ------------ src/icu_chain_test.c | 554 ----------------------- src/pazpar2.c | 2 +- src/test_icu_I18N.c | 691 ---------------------------- 12 files changed, 40 insertions(+), 2808 deletions(-) delete mode 100644 src/icu_I18N.c delete mode 100644 src/icu_I18N.h delete mode 100644 src/icu_chain_test.c delete mode 100644 src/test_icu_I18N.c diff --git a/NEWS b/NEWS index 45508d7..21e9b97 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,8 @@ +Pazpar2 no longer uses its own ICU wrapper. Instead the ICU wrapper +library part of YAZ is used. + +Added SRU client support. + Automatically computes pz:nativesyntax if not provided (works for XML and MARC) --- 1.0.13 2008/11/24 diff --git a/configure.ac b/configure.ac index 17709df..43d7370 100644 --- a/configure.ac +++ b/configure.ac @@ -19,7 +19,7 @@ AC_PROG_RANLIB AC_LANG(C) -YAZ_INIT([static threads],[3.0.37]) +YAZ_INIT([static icu threads],[3.0.39]) if test -z "$YAZLIB"; then AC_MSG_ERROR([YAZ development libraries missing]) fi @@ -27,7 +27,6 @@ YAZ_DOC AC_CHECK_HEADERS([sys/time.h sys/socket.h unistd.h netinet/in.h netdb.h arpa/inet.h]) AC_CHECK_FUNCS([getaddrinfo]) -AC_CHECK_ICU([3.6],[ICU_CPPFLAGS="$ICU_CPPFLAGS -D HAVE_ICU=1"]) AC_CONFIG_FILES([ Doxyfile Makefile diff --git a/debian/control b/debian/control index 175e8bc..af68095 100644 --- a/debian/control +++ b/debian/control @@ -2,8 +2,7 @@ Source: pazpar2 Section: net Priority: extra Maintainer: Adam Dickmeiss -Build-Depends: debhelper (>= 5), autotools-dev, libyaz3-dev, - libicu36-dev | libicu-dev, docbook-xsl +Build-Depends: debhelper (>= 5), autotools-dev, libyaz3-dev, docbook-xsl Standards-Version: 3.7.2 Package: pazpar2 diff --git a/doc/pazpar2_conf.xml b/doc/pazpar2_conf.xml index b3a6281..cc45924 100644 --- a/doc/pazpar2_conf.xml +++ b/doc/pazpar2_conf.xml @@ -125,7 +125,7 @@ - normalize + transform Normalization and transformation of tokens follows @@ -152,17 +152,6 @@ - index - - - Finally the 'index' element instruction - without - any 'rule' attribute - is used to store the tokens - after chain processing in the relevance ranking - unit of Pazpar2. It will always be the last - instruction in the chain. - - - diff --git a/src/Makefile.am b/src/Makefile.am index 9195380..d027124 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,10 +2,7 @@ sbin_PROGRAMS = pazpar2 -noinst_PROGRAMS = icu_chain_test - check_PROGRAMS = test_config \ - test_icu_I18N \ test_record \ test_reclists \ test_relevance \ @@ -20,11 +17,10 @@ MAINTAINERCLEANFILES = Makefile.in cconfig.h CONFIG_CLEAN_FILES=*.log -AM_CFLAGS = $(YAZINC) $(ICU_CPPFLAGS) - +AM_CFLAGS = $(YAZINC) + libpazpar2_a_SOURCES = pazpar2_config.c pazpar2_config.h eventl.c eventl.h \ http.c http_command.c http_command.h http.h \ - icu_I18N.h icu_I18N.c \ logic.c pazpar2.h \ record.h record.c reclists.c reclists.h \ relevance.c relevance.h termlists.c termlists.h \ @@ -36,29 +32,23 @@ libpazpar2_a_SOURCES = pazpar2_config.c pazpar2_config.h eventl.c eventl.h \ dirent.c direntz.h pazpar2_SOURCES = pazpar2.c -pazpar2_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) - -icu_chain_test_SOURCES = icu_chain_test.c icu_I18N.c -icu_chain_test_LDADD = $(YAZLIB) $(ICU_LIBS) +pazpar2_LDADD = libpazpar2.a $(YAZLIB) test_config_SOURCES = test_config.c -test_config_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) +test_config_LDADD = libpazpar2.a $(YAZLIB) test_record_SOURCES = test_record.c -test_record_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) +test_record_LDADD = libpazpar2.a $(YAZLIB) test_reclists_SOURCES = test_reclists.c -test_reclists_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) +test_reclists_LDADD = libpazpar2.a $(YAZLIB) test_relevance_SOURCES = test_relevance.c -test_relevance_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) +test_relevance_LDADD = libpazpar2.a $(YAZLIB) test_sel_thread_SOURCES = test_sel_thread.c -test_sel_thread_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) - -test_icu_I18N_SOURCES = test_icu_I18N.c -test_icu_I18N_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) +test_sel_thread_LDADD = libpazpar2.a $(YAZLIB) test_normalize_SOURCES = test_normalize.c -test_normalize_LDADD = libpazpar2.a $(YAZLIB) $(ICU_LIBS) +test_normalize_LDADD = libpazpar2.a $(YAZLIB) diff --git a/src/charsets.c b/src/charsets.c index f722416..0d63c11 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -34,27 +34,28 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include "charsets.h" #include "normalize7bit.h" -#ifdef HAVE_ICU -#include "icu_I18N.h" -#endif // HAVE_ICU +#if YAZ_HAVE_ICU +#include +#endif + /* charset handle */ struct pp2_charset_s { const char *(*token_next_handler)(pp2_relevance_token_t prt); const char *(*get_sort_handler)(pp2_relevance_token_t prt, int skip); -#ifdef HAVE_ICU +#if YAZ_HAVE_ICU struct icu_chain * icu_chn; UErrorCode icu_sts; -#endif // HAVE_ICU +#endif }; static const char *pp2_relevance_token_a_to_z(pp2_relevance_token_t prt); static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, int skip_article); -#ifdef HAVE_ICU +#if YAZ_HAVE_ICU static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt); static const char *pp2_get_sort_icu(pp2_relevance_token_t prt, int skip_article); -#endif // HAVE_ICU +#endif /* tokenzier handle */ struct pp2_relevance_token_s { @@ -68,14 +69,14 @@ struct pp2_relevance_token_s { pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node) { -#ifdef HAVE_ICU +#if YAZ_HAVE_ICU UErrorCode status = U_ZERO_ERROR; struct icu_chain *chain = 0; if (xml_node) xml_node = xml_node->children; while (xml_node && xml_node->type != XML_ELEMENT_NODE) xml_node = xml_node->next; - chain = icu_chain_xml_config(xml_node, &status); + chain = icu_chain_xml_config(xml_node, 1, &status); if (!chain || U_FAILURE(status)){ //xmlDocPtr icu_doc = 0; //xmlChar *xmlstr = 0; @@ -88,17 +89,14 @@ pp2_charset_t pp2_charset_create_xml(xmlNode *xml_node) return 0; } return pp2_charset_create(chain); -#else // HAVE_ICU +#else // YAZ_HAVE_ICU yaz_log(YLOG_FATAL, "Error: ICU support requested with element:\n" "<%s>\n ... \n", xml_node->name, xml_node->name); yaz_log(YLOG_FATAL, - "But no ICU support compiled into pazpar2 server."); - yaz_log(YLOG_FATAL, - "Please install libicu36-dev and icu-doc or similar, " - "re-configure and re-compile"); + "But no ICU support is compiled into the YAZ library."); return 0; -#endif // HAVE_ICU +#endif // YAZ_HAVE_ICU } @@ -108,7 +106,7 @@ pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn) pct->token_next_handler = pp2_relevance_token_a_to_z; pct->get_sort_handler = pp2_get_sort_ascii; -#ifdef HAVE_ICU +#if YAZ_HAVE_ICU pct->icu_chn = 0; if (icu_chn) { @@ -117,7 +115,7 @@ pp2_charset_t pp2_charset_create(struct icu_chain * icu_chn) pct->token_next_handler = pp2_relevance_token_icu; pct->get_sort_handler = pp2_get_sort_icu; } -#endif // HAVE_ICU +#endif // YAZ_HAVE_ICU return pct; } @@ -139,7 +137,7 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, prt->last_cp = 0; prt->pct = pct; -#ifdef HAVE_ICU +#if YAZ_HAVE_ICU if (pct->icu_chn) { int ok = 0; @@ -148,7 +146,7 @@ pp2_relevance_token_t pp2_relevance_tokenize(pp2_charset_t pct, //printf("\nfield ok: %d '%s'\n", ok, buf); prt->pct = pct; } -#endif // HAVE_ICU +#endif // YAZ_HAVE_ICU return prt; } @@ -224,7 +222,7 @@ static const char *pp2_get_sort_ascii(pp2_relevance_token_t prt, } -#ifdef HAVE_ICU +#if YAZ_HAVE_ICU static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) { if (icu_chain_next_token(prt->pct->icu_chn, &prt->pct->icu_sts)) @@ -233,7 +231,7 @@ static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) { return 0; } - return icu_chain_get_norm(prt->pct->icu_chn); + return icu_chain_token_norm(prt->pct->icu_chn); } return 0; } @@ -241,11 +239,10 @@ static const char *pp2_relevance_token_icu(pp2_relevance_token_t prt) static const char *pp2_get_sort_icu(pp2_relevance_token_t prt, int skip_article) { - return icu_chain_get_sort(prt->pct->icu_chn); + return icu_chain_token_sortkey(prt->pct->icu_chn); } -#endif // HAVE_ICU - +#endif // YAZ_HAVE_ICU /* diff --git a/src/client.c b/src/client.c index 9cb5140..33f5efa 100644 --- a/src/client.c +++ b/src/client.c @@ -475,8 +475,8 @@ void client_start_search(struct client *cl) if (cl->cqlquery) { - yaz_log(YLOG_LOG, "Search %s CQL: %s", sdb->database->url, cl->cqlquery); ZOOM_query q = ZOOM_query_create(); + yaz_log(YLOG_LOG, "Search %s CQL: %s", sdb->database->url, cl->cqlquery); ZOOM_query_cql(q, cl->cqlquery); rs = ZOOM_connection_search(link, q); ZOOM_query_destroy(q); diff --git a/src/icu_I18N.c b/src/icu_I18N.c deleted file mode 100644 index f40b529..0000000 --- a/src/icu_I18N.c +++ /dev/null @@ -1,1220 +0,0 @@ -/* This file is part of Pazpar2. - Copyright (C) 2006-2008 Index Data - -Pazpar2 is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2, or (at your option) any later -version. - -Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*/ - -#if HAVE_CONFIG_H -#include -#endif - -#define USE_TIMING 0 -#if USE_TIMING -#include -#endif - - -#ifdef HAVE_ICU -#include "icu_I18N.h" - -#include - -#include -#include -#include - -#include /* some more string fcns*/ -#include /* char names */ - - -//#include -//#include /* Basic ICU data types */ -#include -//#include /* C Converter API */ -//#include -//#include -/* #include */ - - - - -int icu_check_status (UErrorCode status) -{ - if(U_FAILURE(status)){ - yaz_log(YLOG_WARN, - "ICU: %d %s\n", status, u_errorName(status)); - return 0; - } - return 1; - -} - - - -struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity) -{ - struct icu_buf_utf16 * buf16 - = (struct icu_buf_utf16 *) malloc(sizeof(struct icu_buf_utf16)); - - buf16->utf16 = 0; - buf16->utf16_len = 0; - buf16->utf16_cap = 0; - - if (capacity > 0){ - buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity); - buf16->utf16[0] = (UChar) 0; - buf16->utf16_cap = capacity; - } - return buf16; -}; - -struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, - size_t capacity) -{ - if (buf16){ - if (capacity > 0){ - if (0 == buf16->utf16) - buf16->utf16 = (UChar *) malloc(sizeof(UChar) * capacity); - else - buf16->utf16 - = (UChar *) realloc(buf16->utf16, sizeof(UChar) * capacity); - buf16->utf16[0] = (UChar) 0; - buf16->utf16_len = 0; - buf16->utf16_cap = capacity; - } - else { - if (buf16->utf16) - free(buf16->utf16); - buf16->utf16 = 0; - buf16->utf16_len = 0; - buf16->utf16_cap = 0; - } - } - - return buf16; -}; - - -struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16) -{ - if(!dest16 || !src16 - || dest16 == src16) - return 0; - - if (dest16->utf16_cap < src16->utf16_len) - icu_buf_utf16_resize(dest16, src16->utf16_len * 2); - - u_strncpy(dest16->utf16, src16->utf16, src16->utf16_len); - dest16->utf16_len = src16->utf16_len; - - return dest16; -}; - - -void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16) -{ - if (buf16){ - if (buf16->utf16) - free(buf16->utf16); - free(buf16); - } -}; - - - - - - -struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity) -{ - struct icu_buf_utf8 * buf8 - = (struct icu_buf_utf8 *) malloc(sizeof(struct icu_buf_utf8)); - - buf8->utf8 = 0; - buf8->utf8_len = 0; - buf8->utf8_cap = 0; - - if (capacity > 0){ - buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); - buf8->utf8[0] = (uint8_t) 0; - buf8->utf8_cap = capacity; - } - return buf8; -}; - - - -struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, - size_t capacity) -{ - if (buf8){ - if (capacity > 0){ - if (0 == buf8->utf8) - buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); - else - buf8->utf8 - = (uint8_t *) realloc(buf8->utf8, - sizeof(uint8_t) * capacity); - buf8->utf8_cap = capacity; - } - else { - if (buf8->utf8) - free(buf8->utf8); - buf8->utf8 = 0; - buf8->utf8_len = 0; - buf8->utf8_cap = 0; - } - } - - return buf8; -}; - - -struct icu_buf_utf8 * icu_buf_utf8_copy(struct icu_buf_utf8 * dest8, - struct icu_buf_utf8 * src8) -{ - if(!dest8 || !src8 - || dest8 == src8) - return 0; - - - if (dest8->utf8_cap < src8->utf8_len) - icu_buf_utf8_resize(dest8, src8->utf8_len * 2); - - strncpy((char*) dest8->utf8, (char*) src8->utf8, src8->utf8_len); - - return dest8; -}; - - -const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8) -{ - if (!src8 || src8->utf8_len == 0) - return ""; - if (src8->utf8_len == src8->utf8_cap) - src8 = icu_buf_utf8_resize(src8, src8->utf8_len * 2 + 1); - src8->utf8[src8->utf8_len] = '\0'; - return (const char *) src8->utf8; -} - - -void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8) -{ - if (buf8){ - if (buf8->utf8) - free(buf8->utf8); - free(buf8); - } -}; - - - -UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, - struct icu_buf_utf8 * src8, - UErrorCode * status) -{ - int32_t utf16_len = 0; - - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - (const char *) src8->utf8, src8->utf8_len, status); - - // check for buffer overflow, resize and retry - if (*status == U_BUFFER_OVERFLOW_ERROR - //|| dest16->utf16_len > dest16->utf16_cap - ){ - icu_buf_utf16_resize(dest16, utf16_len * 2); - *status = U_ZERO_ERROR; - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - (const char *) src8->utf8, src8->utf8_len, status); - } - - //if (*status != U_BUFFER_OVERFLOW_ERROR - if (U_SUCCESS(*status) - && utf16_len <= dest16->utf16_cap) - dest16->utf16_len = utf16_len; - else { - dest16->utf16[0] = (UChar) 0; - dest16->utf16_len = 0; - } - - return *status; -}; - - - -UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, - const char * src8cstr, - UErrorCode * status) -{ - size_t src8cstr_len = 0; - int32_t utf16_len = 0; - - src8cstr_len = strlen(src8cstr); - - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - src8cstr, src8cstr_len, status); - - // check for buffer overflow, resize and retry - if (*status == U_BUFFER_OVERFLOW_ERROR - //|| dest16->utf16_len > dest16->utf16_cap - ){ - icu_buf_utf16_resize(dest16, utf16_len * 2); - *status = U_ZERO_ERROR; - u_strFromUTF8(dest16->utf16, dest16->utf16_cap, - &utf16_len, - src8cstr, src8cstr_len, status); - } - - // if (*status != U_BUFFER_OVERFLOW_ERROR - if (U_SUCCESS(*status) - && utf16_len <= dest16->utf16_cap) - dest16->utf16_len = utf16_len; - else { - dest16->utf16[0] = (UChar) 0; - dest16->utf16_len = 0; - } - - return *status; -}; - - - - -UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, - struct icu_buf_utf16 * src16, - UErrorCode * status) -{ - int32_t utf8_len = 0; - - u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, - &utf8_len, - src16->utf16, src16->utf16_len, status); - - // check for buffer overflow, resize and retry - if (*status == U_BUFFER_OVERFLOW_ERROR - //|| dest8->utf8_len > dest8->utf8_cap - ){ - icu_buf_utf8_resize(dest8, utf8_len * 2); - *status = U_ZERO_ERROR; - u_strToUTF8((char *) dest8->utf8, dest8->utf8_cap, - &utf8_len, - src16->utf16, src16->utf16_len, status); - - } - - //if (*status != U_BUFFER_OVERFLOW_ERROR - if (U_SUCCESS(*status) - && utf8_len <= dest8->utf8_cap) - dest8->utf8_len = utf8_len; - else { - dest8->utf8[0] = (uint8_t) 0; - dest8->utf8_len = 0; - } - - return *status; -}; - - - -struct icu_casemap * icu_casemap_create(const char *locale, char action, - UErrorCode *status) -{ - struct icu_casemap * casemap - = (struct icu_casemap *) malloc(sizeof(struct icu_casemap)); - strcpy(casemap->locale, locale); - casemap->action = action; - - switch(casemap->action) { - case 'l': - break; - case 'u': - break; - case 't': - break; - case 'f': - break; - default: - icu_casemap_destroy(casemap); - return 0; - } - - return casemap; -}; - -void icu_casemap_destroy(struct icu_casemap * casemap) -{ - if (casemap) - free(casemap); -}; - - -int icu_casemap_casemap(struct icu_casemap * casemap, - struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - UErrorCode *status) -{ - if(!casemap) - return 0; - - return icu_utf16_casemap(dest16, src16, - casemap->locale, casemap->action, status); -}; - - -int icu_utf16_casemap(struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - const char *locale, char action, - UErrorCode *status) -{ - int32_t dest16_len = 0; - - switch(action) { - case 'l': - dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 'u': - dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 't': - dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - 0, locale, status); - break; - case 'f': - dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - U_FOLD_CASE_DEFAULT, status); - break; - - default: - return U_UNSUPPORTED_ERROR; - break; - } - - // check for buffer overflow, resize and retry - if (*status == U_BUFFER_OVERFLOW_ERROR - && dest16 != src16 // do not resize if in-place conversion - //|| dest16_len > dest16->utf16_cap - ){ - icu_buf_utf16_resize(dest16, dest16_len * 2); - *status = U_ZERO_ERROR; - - - switch(action) { - case 'l': - dest16_len = u_strToLower(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 'u': - dest16_len = u_strToUpper(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - locale, status); - break; - case 't': - dest16_len = u_strToTitle(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - 0, locale, status); - break; - case 'f': - dest16_len = u_strFoldCase(dest16->utf16, dest16->utf16_cap, - src16->utf16, src16->utf16_len, - U_FOLD_CASE_DEFAULT, status); - break; - - default: - return U_UNSUPPORTED_ERROR; - break; - } - } - - if (U_SUCCESS(*status) - && dest16_len <= dest16->utf16_cap) - dest16->utf16_len = dest16_len; - else { - dest16->utf16[0] = (UChar) 0; - dest16->utf16_len = 0; - } - - return *status; -}; - - - -UErrorCode icu_sortkey8_from_utf16(UCollator *coll, - struct icu_buf_utf8 * dest8, - struct icu_buf_utf16 * src16, - UErrorCode * status) -{ - - int32_t sortkey_len = 0; - - sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, - dest8->utf8, dest8->utf8_cap); - - // check for buffer overflow, resize and retry - if (sortkey_len > dest8->utf8_cap) { - icu_buf_utf8_resize(dest8, sortkey_len * 2); - sortkey_len = ucol_getSortKey(coll, src16->utf16, src16->utf16_len, - dest8->utf8, dest8->utf8_cap); - } - - if (U_SUCCESS(*status) - && sortkey_len > 0) - dest8->utf8_len = sortkey_len; - else { - dest8->utf8[0] = (UChar) 0; - dest8->utf8_len = 0; - } - - return sortkey_len; -}; - - - -struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, - UErrorCode *status) -{ - struct icu_tokenizer * tokenizer - = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer)); - - strcpy(tokenizer->locale, locale); - tokenizer->action = action; - tokenizer->bi = 0; - tokenizer->buf16 = 0; - tokenizer->token_count = 0; - tokenizer->token_id = 0; - tokenizer->token_start = 0; - tokenizer->token_end = 0; - - - switch(tokenizer->action) { - case 'l': - tokenizer->bi - = ubrk_open(UBRK_LINE, tokenizer->locale, - 0, 0, status); - break; - case 's': - tokenizer->bi - = ubrk_open(UBRK_SENTENCE, tokenizer->locale, - 0, 0, status); - break; - case 'w': - tokenizer->bi - = ubrk_open(UBRK_WORD, tokenizer->locale, - 0, 0, status); - break; - case 'c': - tokenizer->bi - = ubrk_open(UBRK_CHARACTER, tokenizer->locale, - 0, 0, status); - break; - case 't': - tokenizer->bi - = ubrk_open(UBRK_TITLE, tokenizer->locale, - 0, 0, status); - break; - default: - *status = U_UNSUPPORTED_ERROR; - return 0; - break; - } - - // ICU error stuff is a very funny business - if (U_SUCCESS(*status)) - return tokenizer; - - // freeing if failed - icu_tokenizer_destroy(tokenizer); - return 0; -}; - -void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) -{ - if (tokenizer) { - if (tokenizer->bi) - ubrk_close(tokenizer->bi); - free(tokenizer); - } -}; - -int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * src16, - UErrorCode *status) -{ - if (!tokenizer || !tokenizer->bi || !src16) - return 0; - - - tokenizer->buf16 = src16; - tokenizer->token_count = 0; - tokenizer->token_id = 0; - tokenizer->token_start = 0; - tokenizer->token_end = 0; - - ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status); - - - if (U_FAILURE(*status)) - return 0; - - return 1; -}; - -int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * tkn16, - UErrorCode *status) -{ - int32_t tkn_start = 0; - int32_t tkn_end = 0; - int32_t tkn_len = 0; - - - if (!tokenizer || !tokenizer->bi - || !tokenizer->buf16 || !tokenizer->buf16->utf16_len) - return 0; - - // never change tokenizer->buf16 and keep always invariant - // 0 <= tokenizer->token_start - // <= tokenizer->token_end - // <= tokenizer->buf16->utf16_len - // returns length of token - - if (0 == tokenizer->token_end) // first call - tkn_start = ubrk_first(tokenizer->bi); - else //successive calls - tkn_start = tokenizer->token_end; - - // get next position - tkn_end = ubrk_next(tokenizer->bi); - - // repairing invariant at end of ubrk, which is UBRK_DONE = -1 - if (UBRK_DONE == tkn_end) - tkn_end = tokenizer->buf16->utf16_len; - - // copy out if everything is well - if(U_FAILURE(*status)) - return 0; - - // everything OK, now update internal state - tkn_len = tkn_end - tkn_start; - - if (0 < tkn_len){ - tokenizer->token_count++; - tokenizer->token_id++; - } else { - tokenizer->token_id = 0; - } - tokenizer->token_start = tkn_start; - tokenizer->token_end = tkn_end; - - - // copying into token buffer if it exists - if (tkn16){ - if (tkn16->utf16_cap < tkn_len) - icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2); - - u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], - tkn_len); - - tkn16->utf16_len = tkn_len; - } - - return tkn_len; -} - - -int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_id; -}; - -int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_start; -}; - -int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_end; -}; - -int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer) -{ - return (tokenizer->token_end - tokenizer->token_start); -}; - -int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer) -{ - return tokenizer->token_count; -}; - - - -struct icu_normalizer * icu_normalizer_create(const char *rules, char action, - UErrorCode *status) -{ - - struct icu_normalizer * normalizer - = (struct icu_normalizer *) malloc(sizeof(struct icu_normalizer)); - - normalizer->action = action; - normalizer->trans = 0; - normalizer->rules16 = icu_buf_utf16_create(0); - icu_utf16_from_utf8_cstr(normalizer->rules16, rules, status); - - switch(normalizer->action) { - case 'f': - normalizer->trans - = utrans_openU(normalizer->rules16->utf16, - normalizer->rules16->utf16_len, - UTRANS_FORWARD, - 0, 0, - normalizer->parse_error, status); - // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans); - break; - case 'r': - normalizer->trans - = utrans_openU(normalizer->rules16->utf16, - normalizer->rules16->utf16_len, - UTRANS_REVERSE , - 0, 0, - normalizer->parse_error, status); - // yaz_log(YLOG_LOG, "utrans_open %p", normalizer->trans); - break; - default: - *status = U_UNSUPPORTED_ERROR; - return 0; - break; - } - - if (U_SUCCESS(*status)) - return normalizer; - - // freeing if failed - icu_normalizer_destroy(normalizer); - return 0; -}; - - -void icu_normalizer_destroy(struct icu_normalizer * normalizer){ - if (normalizer) { - if (normalizer->rules16) - icu_buf_utf16_destroy(normalizer->rules16); - if (normalizer->trans) - { - // yaz_log(YLOG_LOG, "utrans_close %p", normalizer->trans); - utrans_close(normalizer->trans); - } - free(normalizer); - } -}; - - - -int icu_normalizer_normalize(struct icu_normalizer * normalizer, - struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - UErrorCode *status) -{ - if (!normalizer || !normalizer->trans || !src16 || !dest16) - return 0; - - if (!icu_buf_utf16_copy(dest16, src16)) - return 0; - - utrans_transUChars (normalizer->trans, - dest16->utf16, &(dest16->utf16_len), - dest16->utf16_cap, - 0, &(src16->utf16_len), status); - - if (U_FAILURE(*status)){ - dest16->utf16[0] = (UChar) 0; - dest16->utf16_len = 0; - } - - return dest16->utf16_len; -} - - - - -struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, - enum icu_chain_step_type type, - const uint8_t * rule, - struct icu_buf_utf16 * buf16, - UErrorCode *status) -{ - struct icu_chain_step * step = 0; - - if(!chain || !type || !rule) - return 0; - - step = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step)); - - step->type = type; - - step->buf16 = buf16; - - // create auxilary objects - switch(step->type) { - case ICU_chain_step_type_display: - break; - case ICU_chain_step_type_index: - break; - case ICU_chain_step_type_sortkey: - break; - case ICU_chain_step_type_casemap: - step->u.casemap = icu_casemap_create((char *) chain->locale, - (char) rule[0], status); - break; - case ICU_chain_step_type_normalize: - step->u.normalizer = icu_normalizer_create((char *) rule, 'f', status); - break; - case ICU_chain_step_type_tokenize: - step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, - (char) rule[0], status); - break; - default: - break; - } - - return step; -}; - - -void icu_chain_step_destroy(struct icu_chain_step * step){ - - if (!step) - return; - - icu_chain_step_destroy(step->previous); - - switch(step->type) { - case ICU_chain_step_type_display: - break; - case ICU_chain_step_type_index: - break; - case ICU_chain_step_type_sortkey: - break; - case ICU_chain_step_type_casemap: - icu_casemap_destroy(step->u.casemap); - icu_buf_utf16_destroy(step->buf16); - break; - case ICU_chain_step_type_normalize: - icu_normalizer_destroy(step->u.normalizer); - icu_buf_utf16_destroy(step->buf16); - break; - case ICU_chain_step_type_tokenize: - icu_tokenizer_destroy(step->u.tokenizer); - icu_buf_utf16_destroy(step->buf16); - break; - default: - break; - } - free(step); -}; - - - -struct icu_chain * icu_chain_create(const uint8_t * identifier, - const uint8_t * locale) -{ - - struct icu_chain * chain - = (struct icu_chain *) malloc(sizeof(struct icu_chain)); - - strncpy((char *) chain->identifier, (const char *) identifier, 128); - chain->identifier[128 - 1] = '\0'; - strncpy((char *) chain->locale, (const char *) locale, 16); - chain->locale[16 - 1] = '\0'; - - chain->token_count = 0; - - chain->display8 = icu_buf_utf8_create(0); - chain->norm8 = icu_buf_utf8_create(0); - chain->sort8 = icu_buf_utf8_create(0); - - chain->src16 = icu_buf_utf16_create(0); - - chain->steps = 0; - - return chain; -}; - - -void icu_chain_destroy(struct icu_chain * chain) -{ - if (chain){ - icu_buf_utf8_destroy(chain->display8); - icu_buf_utf8_destroy(chain->norm8); - icu_buf_utf8_destroy(chain->sort8); - - icu_buf_utf16_destroy(chain->src16); - - icu_chain_step_destroy(chain->steps); - free(chain); - } -}; - - - -struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, - UErrorCode * status){ - - xmlNode *node = 0; - struct icu_chain * chain = 0; - xmlChar *xml_id = 0; - xmlChar *xml_locale = 0; - - if (!xml_node - ||xml_node->type != XML_ELEMENT_NODE - || strcmp((const char *) xml_node->name, "icu_chain")) - - return 0; - - xml_id = xmlGetProp(xml_node, (xmlChar *) "id"); - xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale"); - - if (!xml_id || !strlen((const char *) xml_id) - || !xml_locale || !strlen((const char *) xml_locale)) - return 0; - - chain = icu_chain_create((const uint8_t *) xml_id, - (const uint8_t *) xml_locale); - - xmlFree(xml_id); - xmlFree(xml_locale); - if (!chain) - return 0; - - for (node = xml_node->children; node; node = node->next) - { - xmlChar *xml_rule = 0; - struct icu_chain_step * step = 0; - if (node->type != XML_ELEMENT_NODE) - continue; - - xml_rule = xmlGetProp(node, (xmlChar *) "rule"); - - if (!strcmp((const char *) node->name, - (const char *) "casemap")){ - step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, - (const uint8_t *) xml_rule, status); - } - else if (!strcmp((const char *) node->name, - (const char *) "normalize")){ - step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, - (const uint8_t *) xml_rule, status); - } - else if (!strcmp((const char *) node->name, - (const char *) "tokenize")){ - step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, - (const uint8_t *) xml_rule, status); - } - else if (!strcmp((const char *) node->name, - (const char *) "display")){ - step = icu_chain_insert_step(chain, ICU_chain_step_type_display, - (const uint8_t *) "", status); - } - else if (!strcmp((const char *) node->name, - (const char *) "index")){ - step = icu_chain_insert_step(chain, ICU_chain_step_type_index, - (const uint8_t *) "", status); - } - else if (!strcmp((const char *) node->name, - (const char *) "sortkey")){ - step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, - (const uint8_t *) "", status); - } - - xmlFree(xml_rule); - if (!step || U_FAILURE(*status)){ - icu_chain_destroy(chain); - return 0; - } - - - } - - return chain; -}; - - - -struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, - enum icu_chain_step_type type, - const uint8_t * rule, - UErrorCode *status) -{ - struct icu_chain_step * step = 0; - struct icu_buf_utf16 * src16 = 0; - struct icu_buf_utf16 * buf16 = 0; - - if (!chain || !type || !rule) - return 0; - - // assign utf16 src buffers as needed - if (chain->steps && chain->steps->buf16) - src16 = chain->steps->buf16; - else if (chain->src16) - src16 = chain->src16; - else - return 0; - - - // create utf16 destination buffers as needed, or - switch(type) { - case ICU_chain_step_type_display: - buf16 = src16; - break; - case ICU_chain_step_type_index: - buf16 = src16; - break; - case ICU_chain_step_type_sortkey: - buf16 = src16; - break; - case ICU_chain_step_type_casemap: - buf16 = icu_buf_utf16_create(0); - break; - case ICU_chain_step_type_normalize: - buf16 = icu_buf_utf16_create(0); - break; - case ICU_chain_step_type_tokenize: - buf16 = icu_buf_utf16_create(0); - break; - default: - break; - } - - // create actual chain step with this buffer - step = icu_chain_step_create(chain, type, rule, buf16, status); - - step->previous = chain->steps; - chain->steps = step; - - return step; -}; - - -int icu_chain_step_next_token(struct icu_chain * chain, - struct icu_chain_step * step, - UErrorCode *status) -{ - struct icu_buf_utf16 * src16 = 0; - - //printf("icu_chain_step_next_token %d\n", (int) step); - - if (!chain || !chain->src16 || !step || !step->more_tokens) - return 0; - - // assign utf16 src buffers as neeed, advance in previous steps - // tokens until non-zero token met, and setting stop condition - if (step->previous){ - src16 = step->previous->buf16; - if (step->need_new_token) - //while (step->more_tokens && !src16->utf16_len) - step->more_tokens - = icu_chain_step_next_token(chain, step->previous, status); - } - else { // first step can only work once on chain->src16 input buffer - src16 = chain->src16; - step->more_tokens = 1; - } - - // stop if nothing to process - // i.e new token source was not properly assigned - if (!step->more_tokens || !src16) // || !src16->utf16_len - return 0; - - //printf("icu_chain_step_next_token %d working\n", (int) step); - - - // perform the work, eventually put this steps output in - // step->buf16 or the chains UTF8 output buffers - switch(step->type) { - case ICU_chain_step_type_display: - icu_utf16_to_utf8(chain->display8, src16, status); - break; - case ICU_chain_step_type_index: - icu_utf16_to_utf8(chain->norm8, src16, status); - break; - case ICU_chain_step_type_sortkey: - icu_utf16_to_utf8(chain->sort8, src16, status); - break; - case ICU_chain_step_type_casemap: - icu_casemap_casemap(step->u.casemap, - step->buf16, src16, status); - break; - case ICU_chain_step_type_normalize: - icu_normalizer_normalize(step->u.normalizer, - step->buf16, src16, status); - break; - case ICU_chain_step_type_tokenize: - // attach to new src16 token only first time during splitting - if (step->need_new_token){ - icu_tokenizer_attach(step->u.tokenizer, src16, status); - step->need_new_token = 0; - } - // splitting one src16 token into multiple buf16 tokens - step->more_tokens - = icu_tokenizer_next_token(step->u.tokenizer, - step->buf16, status); - // make sure to get new previous token if this one had been used up - if (step->previous && !step->more_tokens){ - if (icu_chain_step_next_token(chain, step->previous, status)){ - icu_tokenizer_attach(step->u.tokenizer, src16, status); - step->need_new_token = 0; - step->more_tokens - = icu_tokenizer_next_token(step->u.tokenizer, - step->buf16, status); - } - } - if (0 == step->more_tokens) - return 0; - break; - default: - return 0; - break; - } - - - - // stop further token processing if last step and - // new tokens are needed from previous (non-existing) step - if (!step->previous && step->need_new_token) - step->more_tokens = 0; - - //printf("%d %d %d\n", - // step->more_tokens, src16->utf16_len, step->buf16->utf16_len); - - - if (U_FAILURE(*status)) - return 0; - - return 1; -}; - - - -int icu_chain_assign_cstr(struct icu_chain * chain, - const char * src8cstr, - UErrorCode *status) -{ - struct icu_chain_step * stp = 0; - - if (!chain || !src8cstr) - return 0; - - stp = chain->steps; - - // clear token count - chain->token_count = 0; - - // clear all steps stop states - - while (stp){ - stp->more_tokens = 1; - stp->need_new_token = 1; - stp = stp->previous; - } - - // finally convert UTF8 to UTF16 string - icu_utf16_from_utf8_cstr(chain->src16, src8cstr, status); - - if (U_FAILURE(*status)) - return 0; - - return 1; -}; - - - -int icu_chain_next_token(struct icu_chain * chain, - UErrorCode *status) -{ - int success = 0; - - if (!chain || !chain->steps) - return 0; - - success = icu_chain_step_next_token(chain, chain->steps, status); - - if (success){ - chain->token_count++; - return chain->token_count; - } - - return 0; -}; - -int icu_chain_get_token_count(struct icu_chain * chain) -{ - if (!chain) - return 0; - - return chain->token_count; -}; - - - -const char * icu_chain_get_display(struct icu_chain * chain) -{ - if (chain->display8) - return icu_buf_utf8_to_cstr(chain->display8); - - return 0; -}; - -const char * icu_chain_get_norm(struct icu_chain * chain) -{ - if (chain->norm8) - return icu_buf_utf8_to_cstr(chain->norm8); - - return 0; -}; - -const char * icu_chain_get_sort(struct icu_chain * chain) -{ - if (chain->sort8) - return icu_buf_utf8_to_cstr(chain->sort8); - - return 0; -}; - - - - -#endif // HAVE_ICU - - - - -/* - * Local variables: - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * vim: shiftwidth=4 tabstop=8 expandtab - */ diff --git a/src/icu_I18N.h b/src/icu_I18N.h deleted file mode 100644 index dab51fc..0000000 --- a/src/icu_I18N.h +++ /dev/null @@ -1,282 +0,0 @@ -/* This file is part of Pazpar2. - Copyright (C) 2006-2008 Index Data - -Pazpar2 is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2, or (at your option) any later -version. - -Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*/ - -#ifndef ICU_I18NL_H -#define ICU_I18NL_H - -#include - -#include -#include - -#include /* Basic ICU data types */ -#include /* char names */ - -//#include -#include -//#include /* C Converter API */ -//#include /* some more string fcns*/ -//#include -#include -//#include -#include - - - -// declared structs and functions - -int icu_check_status (UErrorCode status); - -struct icu_buf_utf16 -{ - UChar * utf16; - int32_t utf16_len; - int32_t utf16_cap; -}; - -struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity); -struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16, - size_t capacity); -struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16); -void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16); - - - -struct icu_buf_utf8 -{ - uint8_t * utf8; - int32_t utf8_len; - int32_t utf8_cap; -}; - -struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity); -struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, - size_t capacity); -void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8); - - -UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16, - struct icu_buf_utf8 * src8, - UErrorCode * status); - -UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16, - const char * src8cstr, - UErrorCode * status); - - -UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8, - struct icu_buf_utf16 * src16, - UErrorCode * status); - -struct icu_casemap -{ - char locale[16]; - char action; -}; - -struct icu_casemap * icu_casemap_create(const char *locale, char action, - UErrorCode *status); - -void icu_casemap_destroy(struct icu_casemap * casemap); - -int icu_casemap_casemap(struct icu_casemap * casemap, - struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - UErrorCode *status); - -int icu_utf16_casemap(struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - const char *locale, char action, - UErrorCode *status); - -UErrorCode icu_sortkey8_from_utf16(UCollator *coll, - struct icu_buf_utf8 * dest8, - struct icu_buf_utf16 * src16, - UErrorCode * status); - -struct icu_tokenizer -{ - char locale[16]; - char action; - UBreakIterator* bi; - struct icu_buf_utf16 * buf16; - int32_t token_count; - int32_t token_id; - int32_t token_start; - int32_t token_end; - // keep always invariant - // 0 <= token_start - // <= token_end - // <= buf16->utf16_len - // and invariant - // 0 <= token_id <= token_count -}; - -struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, - UErrorCode *status); - -void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer); - -int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * src16, UErrorCode *status); - -int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, - struct icu_buf_utf16 * tkn16, - UErrorCode *status); - -int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer); -int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); - - - -struct icu_normalizer -{ - char action; - struct icu_buf_utf16 * rules16; - UParseError parse_error[256]; - UTransliterator * trans; -}; - -struct icu_normalizer * icu_normalizer_create(const char *rules, char action, - UErrorCode *status); - - -void icu_normalizer_destroy(struct icu_normalizer * normalizer); - -int icu_normalizer_normalize(struct icu_normalizer * normalizer, - struct icu_buf_utf16 * dest16, - struct icu_buf_utf16 * src16, - UErrorCode *status); - - -#if 0 -struct icu_token -{ - int32_t token_id; - uint8_t * display8; - uint8_t * norm8; - uint8_t * sort8; -} -#endif - - -enum icu_chain_step_type { - ICU_chain_step_type_none, // - ICU_chain_step_type_display, // convert to utf8 display format - ICU_chain_step_type_index, // convert to utf8 index format - ICU_chain_step_type_sortkey, // convert to utf8 sortkey format - ICU_chain_step_type_casemap, // apply utf16 charmap - ICU_chain_step_type_normalize, // apply utf16 normalization - ICU_chain_step_type_tokenize // apply utf16 tokenization -}; - - - -struct icu_chain_step -{ - // type and action object - enum icu_chain_step_type type; - union { - struct icu_casemap * casemap; - struct icu_normalizer * normalizer; - struct icu_tokenizer * tokenizer; - } u; - // temprary post-action utf16 buffer - struct icu_buf_utf16 * buf16; - struct icu_chain_step * previous; - int more_tokens; - int need_new_token; -}; - - -struct icu_chain; - -struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain, - enum icu_chain_step_type type, - const uint8_t * rule, - struct icu_buf_utf16 * buf16, - UErrorCode *status); - - -void icu_chain_step_destroy(struct icu_chain_step * step); - - -struct icu_chain -{ - uint8_t identifier[128]; - uint8_t locale[16]; - - // number of tokens returned so far - int32_t token_count; - - // utf8 output buffers - struct icu_buf_utf8 * display8; - struct icu_buf_utf8 * norm8; - struct icu_buf_utf8 * sort8; - - // utf16 source buffer - struct icu_buf_utf16 * src16; - - // linked list of chain steps - struct icu_chain_step * steps; -}; - -struct icu_chain * icu_chain_create(const uint8_t * identifier, - const uint8_t * locale); - -void icu_chain_destroy(struct icu_chain * chain); - -struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, - UErrorCode * status); - - -struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain, - enum icu_chain_step_type type, - const uint8_t * rule, - UErrorCode *status); - - -int icu_chain_step_next_token(struct icu_chain * chain, - struct icu_chain_step * step, - UErrorCode *status); - -int icu_chain_assign_cstr(struct icu_chain * chain, - const char * src8cstr, - UErrorCode *status); - -int icu_chain_next_token(struct icu_chain * chain, - UErrorCode *status); - -int icu_chain_get_token_count(struct icu_chain * chain); - -const char * icu_chain_get_display(struct icu_chain * chain); - -const char * icu_chain_get_norm(struct icu_chain * chain); - -const char * icu_chain_get_sort(struct icu_chain * chain); - - - - - -#endif // ICU_I18NL_H diff --git a/src/icu_chain_test.c b/src/icu_chain_test.c deleted file mode 100644 index 873551c..0000000 --- a/src/icu_chain_test.c +++ /dev/null @@ -1,554 +0,0 @@ -/* This file is part of Pazpar2. - Copyright (C) 2006-2008 Index Data - -Pazpar2 is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2, or (at your option) any later -version. - -Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*/ - -#if HAVE_CONFIG_H -#include -#endif - -#include - -#include -#include - -//#include -#include - - -#ifdef HAVE_ICU - -#include -#include - -#include "icu_I18N.h" - -/* commando line and config parameters */ -static struct config_t { - char conffile[1024]; - char print[1024]; - int xmloutput; - struct icu_chain * chain; - FILE * infile; - FILE * outfile; -} config; - - - -void print_option_error(const struct config_t *p_config) -{ - fprintf(stderr, "Calling error, valid options are :\n"); - fprintf(stderr, "icu_chain_test\n" - " [-c (path/to/config/file.xml)]\n" - " [-p (a|c|l|t)] print ICU info \n" - " [-x] XML output\n" - "\n" - "Examples:\n" - "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n" - "./icu_chain_test -p c\n" - "./icu_chain_test -p l -x\n" - "./icu_chain_test -p t -x\n" - "\n" - "Example ICU chain XML configuration file:\n" - "\n" - " \n" - " \n" - " \n" - " \n" - " \n" - " \n" - " \n" - "\n" - ); - exit(1); -} - -void read_params(int argc, char **argv, struct config_t *p_config) -{ - char *arg; - int ret; - - /* set default parameters */ - p_config->conffile[0] = 0; - p_config->print[0] = 0; - p_config->xmloutput = 0; - p_config->chain = 0; - p_config->infile = stdin; - p_config->outfile = stdout; - - /* set up command line parameters */ - - while ((ret = options("c:p:x", argv, argc, &arg)) != -2) - { - switch (ret) - { - case 'c': - strcpy(p_config->conffile, arg); - break; - case 'p': - strcpy(p_config->print, arg); - break; - case 'x': - p_config->xmloutput = 1; - break; - default: - print_option_error(p_config); - } - } - - if ((!strlen(p_config->conffile) - && !strlen(p_config->print)) - || !config.infile - || !config.outfile) - - print_option_error(p_config); -}; - - -/* UConverter *conv; */ -/* conv = ucnv_open("utf-8", &status); */ -/* assert(U_SUCCESS(status)); */ - -/* *ustr16_len */ -/* = ucnv_toUChars(conv, ustr16, 1024, */ -/* (const char *) *xstr8, strlen((const char *) *xstr8), */ -/* &status); */ - - - -/* ucnv_fromUChars(conv, */ -/* (char *) *xstr8, strlen((const char *) *xstr8), */ -/* ustr16, *ustr16_len, */ -/* &status); */ -/* ucnv_close(conv); */ - - -static void print_icu_converters(const struct config_t *p_config) -{ - int32_t count; - int32_t i; - - count = ucnv_countAvailable(); - if (p_config->xmloutput) - fprintf(config.outfile, "\n", - count, ucnv_getDefaultName()); - else { - fprintf(config.outfile, "Available ICU converters: %d\n", count); - fprintf(config.outfile, "Default ICU Converter is: '%s'\n", - ucnv_getDefaultName()); - } - - for(i=0;ixmloutput) - fprintf(config.outfile, "\n", - ucnv_getAvailableName(i)); - else - fprintf(config.outfile, "%s ", ucnv_getAvailableName(i)); - } - - if (p_config->xmloutput) - fprintf(config.outfile, "\n"); - else - fprintf(config.outfile, "\n"); -} - -static void print_icu_transliterators(const struct config_t *p_config) -{ - int32_t count; - int32_t i; - - count = utrans_countAvailableIDs(); - - int32_t buf_cap = 128; - char buf[buf_cap]; - - if (p_config->xmloutput) - fprintf(config.outfile, "\n", count); - else - fprintf(config.outfile, "Available ICU transliterators: %d\n", count); - - for(i = 0; i xmloutput) - fprintf(config.outfile, "\n", buf); - else - fprintf(config.outfile, " %s", buf); - } - - if (p_config->xmloutput){ - fprintf(config.outfile, "\n"); - } - else - { - fprintf(config.outfile, "\n\nUnicode Set Patterns:\n" - " Pattern Description\n" - " Ranges [a-z] The lower case letters a through z\n" - " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" - " String [abc{def}] chars a, b and c, and string 'def'\n" - " Categories [\\p{Letter}] Perl General Category 'Letter'.\n" - " Categories [:Letter:] Posix General Category 'Letter'.\n" - "\n" - " Combination Example\n" - " Union [[:Greek:] [:letter:]]\n" - " Intersection [[:Greek:] & [:letter:]]\n" - " Set Complement [[:Greek:] - [:letter:]]\n" - " Complement [^[:Greek:] [:letter:]]\n" - "\n" - "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n" - "\n" - "Examples:\n" - " [:Punctuation:] Any-Remove\n" - " [:Cased-Letter:] Any-Upper\n" - " [:Control:] Any-Remove\n" - " [:Decimal_Number:] Any-Remove\n" - " [:Final_Punctuation:] Any-Remove\n" - " [:Georgian:] Any-Upper\n" - " [:Katakana:] Any-Remove\n" - " [:Arabic:] Any-Remove\n" - " [:Punctuation:] Remove\n" - " [[:Punctuation:]-[.,]] Remove\n" - " [:Line_Separator:] Any-Remove\n" - " [:Math_Symbol:] Any-Remove\n" - " Lower; [:^Letter:] Remove (word tokenization)\n" - " [:^Number:] Remove (numeric tokenization)\n" - " [:^Katagana:] Remove (remove everything except Katagana)\n" - " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n" - " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n" - " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n" - " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n" - "\n" - "see http://icu.sourceforge.net/userguide/Transform.html\n" - " http://www.unicode.org/Public/UNIDATA/UCD.html\n" - " http://icu.sourceforge.net/userguide/Transform.html\n" - " http://icu.sourceforge.net/userguide/TransformRule.html\n" - ); - - - fprintf(config.outfile, "\n\n"); - - } -} - -static void print_icu_xml_locales(const struct config_t *p_config) -{ - int32_t count; - int32_t i; - UErrorCode status = U_ZERO_ERROR; - - UChar keyword[64]; - int32_t keyword_len = 0; - char keyword_str[128]; - int32_t keyword_str_len = 0; - - UChar language[64]; - int32_t language_len = 0; - char lang_str[128]; - int32_t lang_str_len = 0; - - UChar script[64]; - int32_t script_len = 0; - char script_str[128]; - int32_t script_str_len = 0; - - UChar location[64]; - int32_t location_len = 0; - char location_str[128]; - int32_t location_str_len = 0; - - UChar variant[64]; - int32_t variant_len = 0; - char variant_str[128]; - int32_t variant_str_len = 0; - - UChar name[64]; - int32_t name_len = 0; - char name_str[128]; - int32_t name_str_len = 0; - - UChar localname[64]; - int32_t localname_len = 0; - char localname_str[128]; - int32_t localname_str_len = 0; - - count = uloc_countAvailable() ; - - if (p_config->xmloutput){ - - fprintf(config.outfile, "\n", - count, uloc_getDefault(), ucol_countAvailable()); - } - - for(i=0;ixmloutput){ - fprintf(config.outfile, ""); - if (strlen(localname_str)) - fprintf(config.outfile, "%s", localname_str); - fprintf(config.outfile, "\n"); - } - else if (1 == p_config->xmloutput){ - fprintf(config.outfile, "%s", uloc_getAvailable(i)); - fprintf(config.outfile, " | "); - if (strlen(name_str)) - fprintf(config.outfile, "%s", name_str); - fprintf(config.outfile, " | "); - if (strlen(localname_str)) - fprintf(config.outfile, "%s", localname_str); - fprintf(config.outfile, "\n"); - } - else - fprintf(config.outfile, "%s ", uloc_getAvailable(i)); - } - if (p_config->xmloutput) - fprintf(config.outfile, "\n"); - else - fprintf(config.outfile, "\n"); - - if(U_FAILURE(status)) { - fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status)); - exit(status); - } -} - - -static void print_info(const struct config_t *p_config) -{ - if (p_config->xmloutput) - fprintf(config.outfile, "\n" - "\n"); - - if ('c' == config.print[0]) - print_icu_converters(&config); - else if ('l' == config.print[0]) - print_icu_xml_locales(&config); - else if ('t' == config.print[0]) - print_icu_transliterators(&config); - else { - print_icu_converters(&config); - print_icu_xml_locales(&config); - print_icu_transliterators(&config); - } - - if (p_config->xmloutput) - fprintf(config.outfile, "\n"); - - exit(0); -}; - - - -static void process_text_file(const struct config_t *p_config) -{ - char *line = 0; - char linebuf[1024]; - - xmlDoc *doc = xmlParseFile(config.conffile); - xmlNode *xml_node = xmlDocGetRootElement(doc); - - long unsigned int token_count = 0; - long unsigned int line_count = 0; - - UErrorCode status = U_ZERO_ERROR; - int success = 0; - - if (! xml_node) { - printf("Could not parse XML config file '%s' \n", - config.conffile); - exit (1); - } - - - config.chain = icu_chain_xml_config(xml_node, &status); - - if (config.chain && U_SUCCESS(status)) - success = 1; - else { - printf("Could not set up ICU chain from config file '%s' \n", - config.conffile); - exit (1); - } - - if (p_config->xmloutput) - fprintf(config.outfile, - "\n" - "\n" - "\n"); - - // read input lines for processing - while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile))) - { - success = icu_chain_assign_cstr(config.chain, line, &status); - line_count++; - - while (success && icu_chain_next_token(config.chain, &status)){ - if (U_FAILURE(status)) - success = 0; - else { - token_count++; - if (p_config->xmloutput) - fprintf(config.outfile, - "\n", - token_count, - line_count, - icu_chain_get_norm(config.chain), - icu_chain_get_display(config.chain)); - else - fprintf(config.outfile, "%lu %lu '%s' '%s'\n", - token_count, - line_count, - icu_chain_get_norm(config.chain), - icu_chain_get_display(config.chain)); - } - } - - } - - if (p_config->xmloutput) - fprintf(config.outfile, - "\n" - "\n"); - - icu_chain_destroy(config.chain); - xmlFreeDoc(doc); - if (line) - free(line); -}; - -#endif // HAVE_ICU - - -int main(int argc, char **argv) -{ - -#ifdef HAVE_ICU - - read_params(argc, argv, &config); - - if (config.conffile && strlen(config.conffile)) - process_text_file(&config); - - if (config.print && strlen(config.print)) - print_info(&config); - -#else // HAVE_ICU - - printf("ICU not available on your system.\n" - "Please install libicu36-dev and icu-doc or similar, " - "re-configure and re-compile\n"); - - -#endif // HAVE_ICU - - return(0); -}; - - -/* - * Local variables: - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * vim: shiftwidth=4 tabstop=8 expandtab - */ - diff --git a/src/pazpar2.c b/src/pazpar2.c index 0ede6be..1631734 100644 --- a/src/pazpar2.c +++ b/src/pazpar2.c @@ -61,7 +61,7 @@ static void show_version(void) yaz_version(yaz_version_str, 0); printf("Configuration:"); -#if HAVE_ICU +#if YAZ_HAVE_ICU printf(" icu:?"); #endif printf(" yaz:%s", yaz_version_str); diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c deleted file mode 100644 index bc55d7d..0000000 --- a/src/test_icu_I18N.c +++ /dev/null @@ -1,691 +0,0 @@ -/* This file is part of Pazpar2. - Copyright (C) 2006-2008 Index Data - -Pazpar2 is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2, or (at your option) any later -version. - -Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -*/ - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - - -#if HAVE_CONFIG_H -#include -#endif - -#define USE_TIMING 0 -#if USE_TIMING -#include -#endif - -#include - - - -#ifdef HAVE_ICU -#include "icu_I18N.h" - -#include -#include - -//#include -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - - -#define MAX_KEY_SIZE 256 -struct icu_termmap -{ - uint8_t sort_key[MAX_KEY_SIZE]; // standard C string '\0' terminated - char disp_term[MAX_KEY_SIZE]; // standard C utf-8 string -}; - - - -int icu_termmap_cmp(const void *vp1, const void *vp2) -{ - struct icu_termmap *itmp1 = *(struct icu_termmap **) vp1; - struct icu_termmap *itmp2 = *(struct icu_termmap **) vp2; - - int cmp = 0; - - cmp = strcmp((const char *)itmp1->sort_key, - (const char *)itmp2->sort_key); - return cmp; -}; - - - - -int test_icu_casemap(const char * locale, char action, - const char * src8cstr, const char * chk8cstr) -{ - int success = 0; - UErrorCode status = U_ZERO_ERROR; - - struct icu_buf_utf8 * src8 = icu_buf_utf8_create(0); - struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0); - struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); - struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0); - - - int src8cstr_len = strlen(src8cstr); - int chk8cstr_len = strlen(chk8cstr); - - // converting to UTF16 - icu_utf16_from_utf8_cstr(src16, src8cstr, &status); - - // perform case mapping - icu_utf16_casemap(dest16, src16, locale, action, &status); - - // converting to UTF8 - icu_utf16_to_utf8(dest8, dest16, &status); - - - - // determine success - if (dest8->utf8 - && (dest8->utf8_len == strlen(chk8cstr)) - && !strcmp(chk8cstr, (const char *) dest8->utf8)) - success = 1; - else - success = 0; - - // report failures - if (!success){ - printf("\nERROR\n"); - printf("original string: '%s' (%d)\n", src8cstr, src8cstr_len); - printf("icu_casemap '%s:%c' '%s' (%d)\n", - locale, action, dest8->utf8, dest8->utf8_len); - printf("expected string: '%s' (%d)\n", chk8cstr, chk8cstr_len); - } - - // clean the buffers - icu_buf_utf8_destroy(src8); - icu_buf_utf8_destroy(dest8); - icu_buf_utf16_destroy(src16); - icu_buf_utf16_destroy(dest16); - - - return success; -} - - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -void test_icu_I18N_casemap(int argc, char **argv) -{ - - // Locale 'en' - - // sucessful tests - YAZ_CHECK(test_icu_casemap("en", 'l', - "A ReD fOx hunTS sQUirriLs", - "a red fox hunts squirrils")); - - YAZ_CHECK(test_icu_casemap("en", 'u', - "A ReD fOx hunTS sQUirriLs", - "A RED FOX HUNTS SQUIRRILS")); - - YAZ_CHECK(test_icu_casemap("en", 'f', - "A ReD fOx hunTS sQUirriLs", - "a red fox hunts squirrils")); - - YAZ_CHECK(test_icu_casemap("en", 't', - "A ReD fOx hunTS sQUirriLs", - "A Red Fox Hunts Squirrils")); - - - // Locale 'da' - - // sucess expected - YAZ_CHECK(test_icu_casemap("da", 'l', - "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", - "åh æble, øs fløde i åen efter blåbærgrøden")); - - YAZ_CHECK(test_icu_casemap("da", 'u', - "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", - "ÅH ÆBLE, ØS FLØDE I ÅEN EFTER BLÅBÆRGRØDEN")); - - YAZ_CHECK(test_icu_casemap("da", 'f', - "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", - "åh æble, øs fløde i åen efter blåbærgrøden")); - - YAZ_CHECK(test_icu_casemap("da", 't', - "åh ÆbLE, øs fLØde i Åen efter bLåBærGRødeN", - "Åh Æble, Øs Fløde I Åen Efter Blåbærgrøden")); - - // Locale 'de' - - // sucess expected - YAZ_CHECK(test_icu_casemap("de", 'l', - "zWÖlf ärgerliche Würste rollen ÜBer die StRAße", - "zwölf ärgerliche würste rollen über die straße")); - - YAZ_CHECK(test_icu_casemap("de", 'u', - "zWÖlf ärgerliche Würste rollen ÜBer die StRAße", - "ZWÖLF ÄRGERLICHE WÜRSTE ROLLEN ÜBER DIE STRASSE")); - - YAZ_CHECK(test_icu_casemap("de", 'f', - "zWÖlf ärgerliche Würste rollen ÜBer die StRAße", - "zwölf ärgerliche würste rollen über die strasse")); - - YAZ_CHECK(test_icu_casemap("de", 't', - "zWÖlf ärgerliche Würste rollen ÜBer die StRAße", - "Zwölf Ärgerliche Würste Rollen Über Die Straße")); - -} - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -int test_icu_sortmap(const char * locale, int src_list_len, - const char ** src_list, const char ** chk_list) -{ - int success = 1; - - UErrorCode status = U_ZERO_ERROR; - - struct icu_buf_utf8 * buf8 = icu_buf_utf8_create(0); - struct icu_buf_utf16 * buf16 = icu_buf_utf16_create(0); - - int i; - - struct icu_termmap * list[src_list_len]; - - UCollator *coll = ucol_open(locale, &status); - icu_check_status(status); - - if(U_FAILURE(status)) - return 0; - - // assigning display terms and sort keys using buf 8 and buf16 - for( i = 0; i < src_list_len; i++) - { - - list[i] = (struct icu_termmap *) malloc(sizeof(struct icu_termmap)); - - // copy display term - strcpy(list[i]->disp_term, src_list[i]); - - // transforming to UTF16 - icu_utf16_from_utf8_cstr(buf16, list[i]->disp_term, &status); - icu_check_status(status); - - // computing sortkeys - icu_sortkey8_from_utf16(coll, buf8, buf16, &status); - icu_check_status(status); - - // assigning sortkeys - memcpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len); - //strncpy(list[i]->sort_key, buf8->utf8, buf8->utf8_len); - //strcpy((char *) list[i]->sort_key, (const char *) buf8->utf8); - } - - - // do the sorting - qsort(list, src_list_len, - sizeof(struct icu_termmap *), icu_termmap_cmp); - - // checking correct sorting - for (i = 0; i < src_list_len; i++){ - if (0 != strcmp(list[i]->disp_term, chk_list[i])){ - success = 0; - } - } - - if(!success){ - printf("\nERROR\n"); - printf("Input str: '%s' : ", locale); - for (i = 0; i < src_list_len; i++) { - printf(" '%s'", list[i]->disp_term); - } - printf("\n"); - printf("ICU sort: '%s' : ", locale); - for (i = 0; i < src_list_len; i++) { - printf(" '%s'", list[i]->disp_term); - //printf("(%d|%d)", list[i]->sort_key[0],list[i]->sort_key[1]); - } - printf("\n"); - printf("Expected: '%s' : ", locale); - for (i = 0; i < src_list_len; i++) { - printf(" '%s'", chk_list[i]); - } - printf("\n"); - } - - - - for( i = 0; i < src_list_len; i++) - free(list[i]); - - - ucol_close(coll); - - icu_buf_utf8_destroy(buf8); - icu_buf_utf16_destroy(buf16); - - return success; -} - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -void test_icu_I18N_sortmap(int argc, char **argv) -{ - - // sucessful tests - size_t en_1_len = 6; - const char * en_1_src[6] = {"z", "K", "a", "A", "Z", "k"}; - const char * en_1_cck[6] = {"a", "A", "k", "K", "z", "Z"}; - YAZ_CHECK(test_icu_sortmap("en", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(test_icu_sortmap("en_AU", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(test_icu_sortmap("en_CA", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(test_icu_sortmap("en_GB", en_1_len, en_1_src, en_1_cck)); - YAZ_CHECK(test_icu_sortmap("en_US", en_1_len, en_1_src, en_1_cck)); - - // sucessful tests - size_t da_1_len = 6; - const char * da_1_src[6] = {"z", "å", "o", "æ", "a", "ø"}; - const char * da_1_cck[6] = {"a", "o", "z", "æ", "ø", "å"}; - YAZ_CHECK(test_icu_sortmap("da", da_1_len, da_1_src, da_1_cck)); - YAZ_CHECK(test_icu_sortmap("da_DK", da_1_len, da_1_src, da_1_cck)); - - // sucessful tests - size_t de_1_len = 9; - const char * de_1_src[9] = {"u", "ä", "o", "t", "s", "ß", "ü", "ö", "a"}; - const char * de_1_cck[9] = {"a","ä", "o", "ö", "s", "ß", "t", "u", "ü"}; - YAZ_CHECK(test_icu_sortmap("de", de_1_len, de_1_src, de_1_cck)); - YAZ_CHECK(test_icu_sortmap("de_AT", de_1_len, de_1_src, de_1_cck)); - YAZ_CHECK(test_icu_sortmap("de_DE", de_1_len, de_1_src, de_1_cck)); - -} - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - - - - -int test_icu_normalizer(const char * rules8cstr, - const char * src8cstr, - const char * chk8cstr) -{ - int success = 0; - - UErrorCode status = U_ZERO_ERROR; - - struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); - struct icu_buf_utf16 * dest16 = icu_buf_utf16_create(0); - struct icu_buf_utf8 * dest8 = icu_buf_utf8_create(0); - struct icu_normalizer * normalizer - = icu_normalizer_create(rules8cstr, 'f', &status); - icu_check_status(status); - - icu_utf16_from_utf8_cstr(src16, src8cstr, &status); - icu_check_status(status); - - icu_normalizer_normalize(normalizer, dest16, src16, &status); - icu_check_status(status); - - icu_utf16_to_utf8(dest8, dest16, &status); - icu_check_status(status); - - - if(!strcmp((const char *) dest8->utf8, - (const char *) chk8cstr)) - success = 1; - else { - success = 0; - printf("Normalization\n"); - printf("Rules: '%s'\n", rules8cstr); - printf("Input: '%s'\n", src8cstr); - printf("Normalized: '%s'\n", dest8->utf8); - printf("Expected: '%s'\n", chk8cstr); - } - - - icu_normalizer_destroy(normalizer); - icu_buf_utf16_destroy(src16); - icu_buf_utf16_destroy(dest16); - icu_buf_utf8_destroy(dest8); - - return success; -}; - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -void test_icu_I18N_normalizer(int argc, char **argv) -{ - - YAZ_CHECK(test_icu_normalizer("[:Punctuation:] Any-Remove", - "Don't shoot!", - "Dont shoot")); - - YAZ_CHECK(test_icu_normalizer("[:Control:] Any-Remove", - "Don't\n shoot!", - "Don't shoot!")); - - YAZ_CHECK(test_icu_normalizer("[:Decimal_Number:] Any-Remove", - "This is 4 you!", - "This is you!")); - - YAZ_CHECK(test_icu_normalizer("Lower; [:^Letter:] Remove", - "Don't shoot!", - "dontshoot")); - - YAZ_CHECK(test_icu_normalizer("[:^Number:] Remove", - "Monday 15th of April", - "15")); - - YAZ_CHECK(test_icu_normalizer("Lower;" - "[[:WhiteSpace:][:Punctuation:]] Remove", - " word4you? ", - "word4you")); - - - YAZ_CHECK(test_icu_normalizer("NFD; [:Nonspacing Mark:] Remove; NFC", - "à côté de l'alcôve ovoïde", - "a cote de l'alcove ovoide")); - -} - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -int test_icu_tokenizer(const char * locale, char action, - const char * src8cstr, int count) -{ - int success = 1; - - UErrorCode status = U_ZERO_ERROR; - struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); - struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0); - struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0); - - //printf("Input: '%s'\n", src8cstr); - - // transforming to UTF16 - icu_utf16_from_utf8_cstr(src16, src8cstr, &status); - icu_check_status(status); - - // set up tokenizer - struct icu_tokenizer * tokenizer - = icu_tokenizer_create(locale, action, &status); - icu_check_status(status); - YAZ_CHECK(tokenizer); - - // attach text buffer to tokenizer - icu_tokenizer_attach(tokenizer, src16, &status); - icu_check_status(status); - YAZ_CHECK(tokenizer->bi); - - // perform work on tokens - //printf("Tokens: "); - while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){ - icu_check_status(status); - - // converting to UTF8 - icu_utf16_to_utf8(tkn8, tkn16, &status); - - //printf("token %d %d %d %d '%s'\n", - // - // icu_tokenizer_token_start(tokenizer), - // icu_tokenizer_token_end(tokenizer), - // icu_tokenizer_token_length(tokenizer), - // tkn8->utf8); - } - - if (count != icu_tokenizer_token_count(tokenizer)){ - success = 0; - printf("\nTokenizer '%s:%c' Error: \n", locale, action); - printf("Input: '%s'\n", src8cstr); - printf("Tokens: %d", icu_tokenizer_token_count(tokenizer)); - printf(", expected: %d\n", count); - } - - icu_tokenizer_destroy(tokenizer); - icu_buf_utf16_destroy(src16); - icu_buf_utf16_destroy(tkn16); - icu_buf_utf8_destroy(tkn8); - - return success; -} - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -void test_icu_I18N_tokenizer(int argc, char **argv) -{ - - - const char * en_str - = "O Romeo, Romeo! wherefore art thou Romeo?"; - - YAZ_CHECK(test_icu_tokenizer("en", 's', en_str, 2)); - YAZ_CHECK(test_icu_tokenizer("en", 'l', en_str, 7)); - YAZ_CHECK(test_icu_tokenizer("en", 'w', en_str, 16)); - YAZ_CHECK(test_icu_tokenizer("en", 'c', en_str, 41)); - - - - const char * da_str - = "Blåbærtærte. Denne kage stammer fra Finland. " - "Den er med blåbær, men alle sommerens forskellige bær kan bruges."; - - YAZ_CHECK(test_icu_tokenizer("da", 's', da_str, 3)); - YAZ_CHECK(test_icu_tokenizer("dar", 'l', da_str, 17)); - YAZ_CHECK(test_icu_tokenizer("da", 'w', da_str, 37)); - YAZ_CHECK(test_icu_tokenizer("da", 'c', da_str, 110)); - -} - - -void test_icu_I18N_chain(int argc, char **argv) -{ - const char * en_str - = "O Romeo, Romeo! wherefore art thou\t Romeo?"; - - printf("ICU chain:\ninput: '%s'\n", en_str); - - UErrorCode status = U_ZERO_ERROR; - //struct icu_chain_step * step = 0; - struct icu_chain * chain = 0; - - - const char * xml_str = "" - "" - "" - "" - "" - "" - "" - "" - ""; - - - xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); - xmlNode *xml_node = xmlDocGetRootElement(doc); - YAZ_CHECK(xml_node); - - - chain = icu_chain_xml_config(xml_node, &status); - -#if 0 - chain = icu_chain_create((uint8_t *) "en:word", (uint8_t *) "en"); - step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, - (const uint8_t *) "[:Control:] Any-Remove", - &status); - step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, - (const uint8_t *) "s", - &status); - step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, - (const uint8_t *) "l", - &status); - step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, - (const uint8_t *) - "[[:WhiteSpace:][:Punctuation:]] Any-Remove", - &status); - step = icu_chain_insert_step(chain, ICU_chain_step_type_display, - (const uint8_t *)"", - &status); -/* step = icu_chain_insert_step(chain, ICU_chain_step_type_normalize, */ -/* (const uint8_t *) "Lower", */ -/* &status); */ - step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, - (const uint8_t *) "l", - &status); - step = icu_chain_insert_step(chain, ICU_chain_step_type_index, - (const uint8_t *)"", - &status); -/* step = icu_chain_insert_step(chain, ICU_chain_step_type_sortkey, */ -/* (const uint8_t *)"", */ -/* &status); */ - -#endif - - xmlFreeDoc(doc); - YAZ_CHECK(chain); - - YAZ_CHECK(icu_chain_assign_cstr(chain, en_str, &status)); - - while (icu_chain_next_token(chain, &status)){ - printf("%d '%s' '%s'\n", - icu_chain_get_token_count(chain), - icu_chain_get_norm(chain), - icu_chain_get_display(chain)); - } - - YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7); - - - YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); - - while (icu_chain_next_token(chain, &status)){ - printf("%d '%s' '%s'\n", - icu_chain_get_token_count(chain), - icu_chain_get_norm(chain), - icu_chain_get_display(chain)); - } - - - YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3); - - icu_chain_destroy(chain); -} - - -void test_bug_1140(void) -{ - const char * en_str - = "O Romeo, Romeo! wherefore art thou\t Romeo?"; - - printf("ICU chain:\ninput: '%s'\n", en_str); - - UErrorCode status = U_ZERO_ERROR; - //struct icu_chain_step * step = 0; - struct icu_chain * chain = 0; - - const char * xml_str = "" - - /* if the first rule is normalize instead. Then it works */ -#if 0 - "" -#endif - "" - "" - "" - "" - "" - "" - ""; - - - xmlDoc *doc = xmlParseMemory(xml_str, strlen(xml_str)); - xmlNode *xml_node = xmlDocGetRootElement(doc); - YAZ_CHECK(xml_node); - - chain = icu_chain_xml_config(xml_node, &status); - - xmlFreeDoc(doc); - YAZ_CHECK(chain); - - YAZ_CHECK(icu_chain_assign_cstr( - chain, "O Romeo, Romeo! wherefore art thou\t Romeo?", - &status)); - - while (icu_chain_next_token(chain, &status)) - ; - - YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 7); - - YAZ_CHECK(icu_chain_assign_cstr(chain, "what is this?", &status)); - - while (icu_chain_next_token(chain, &status)){ - printf("%d '%s' '%s'\n", - icu_chain_get_token_count(chain), - icu_chain_get_norm(chain), - icu_chain_get_display(chain)); - } - - /* we expect 'what' 'is' 'this', i.e. 3 tokens */ - YAZ_CHECK_EQ(icu_chain_get_token_count(chain), 3); - - icu_chain_destroy(chain); -} - -#endif // HAVE_ICU - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - -int main(int argc, char **argv) -{ - - YAZ_CHECK_INIT(argc, argv); - YAZ_CHECK_LOG(); - -#ifdef HAVE_ICU - - //test_icu_I18N_casemap_failures(argc, argv); - test_icu_I18N_casemap(argc, argv); - test_icu_I18N_sortmap(argc, argv); - test_icu_I18N_normalizer(argc, argv); - test_icu_I18N_tokenizer(argc, argv); - test_icu_I18N_chain(argc, argv); - test_bug_1140(); - -#else // HAVE_ICU - - printf("ICU unit tests omitted.\n" - "Please install libicu36-dev and icu-doc or similar\n"); - YAZ_CHECK(0 == 0); - -#endif // HAVE_ICU - - YAZ_CHECK_TERM; -} - - -// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 - - - -/* - * Local variables: - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * vim: shiftwidth=4 tabstop=8 expandtab - */ -- 1.7.10.4