X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=util%2Fyaz-icu.c;h=1386a16f53cc3aeac32ffcd0eed4b96309e1871d;hp=dd972493866f57b166c76aa57e9e7b2eebc8a5cd;hb=28d3e3b08a856a66cb90ebf08787f8fa27a772eb;hpb=fffad7b88b5fcc5ce744151034d1df379919d6d6 diff --git a/util/yaz-icu.c b/util/yaz-icu.c index dd97249..1386a16 100644 --- a/util/yaz-icu.c +++ b/util/yaz-icu.c @@ -1,8 +1,6 @@ -/* - * Copyright (C) 1995-2007, Index Data ApS +/* This file is part of the YAZ toolkit. + * Copyright (C) Index Data * See the file LICENSE for details. - * - * $Id: yaz-icu.c,v 1.8 2007-11-07 09:50:24 adam Exp $ */ #if HAVE_CONFIG_H @@ -13,36 +11,44 @@ #include #include +#include #include - -#if HAVE_ICU +#if YAZ_HAVE_ICU #include #include +#include +#include +#include +#include -#include +#include +#include +#include /* commando line and config parameters */ -static struct config_t { +struct config_t { char conffile[1024]; char print[1024]; int xmloutput; - struct icu_chain * chain; + int sortoutput; + int org_output; + yaz_icu_chain_t chain; FILE * infile; FILE * outfile; -} config; +}; - - void print_option_error(const struct config_t *p_config) -{ - fprintf(stderr, "Calling error, valid options are :\n"); - fprintf(stderr, "yaz-icu\n" - " [-c (path/to/config/file.xml)]\n" - " [-p (a|c|l|t)] print ICU info \n" - " [-x] XML output\n" +{ + fprintf(stderr, "yaz-icu [options] [infile]\n" + "Options:\n" + " -c file XML configuration\n" + " -p a|c|l|t Print ICU info \n" + " -s Show sort normalization key\n" + " -o Show org positions\n" + " -x XML output instread of text\n" "\n" "Examples:\n" "cat hugetextfile.txt | ./yaz-icu -c config.xml \n" @@ -51,10 +57,10 @@ void print_option_error(const struct config_t *p_config) "./yaz-icu -p t -x\n" "\n" "Example ICU chain XML configuration file:\n" - "\n" - " \n" + "\n" + " \n" " \n" - " \n" + " \n" " \n" "\n" ); @@ -62,21 +68,23 @@ void print_option_error(const struct config_t *p_config) } void read_params(int argc, char **argv, struct config_t *p_config) -{ +{ char *arg; int ret; - + /* set default parameters */ p_config->conffile[0] = 0; p_config->print[0] = 0; p_config->xmloutput = 0; + p_config->sortoutput = 0; p_config->chain = 0; - p_config->infile = stdin; + p_config->infile = 0; p_config->outfile = stdout; - + p_config->org_output = 0; + /* set up command line parameters */ - - while ((ret = options("c:p:x", argv, argc, &arg)) != -2) + + while ((ret = options("c:op:sx", argv, argc, &arg)) != -2) { switch (ret) { @@ -86,40 +94,41 @@ void read_params(int argc, char **argv, struct config_t *p_config) case 'p': strcpy(p_config->print, arg); break; + case 's': + p_config->sortoutput = 1; + break; case 'x': p_config->xmloutput = 1; break; + case 'o': + p_config->org_output = 1; + break; + case 0: + if (p_config->infile) + { + fprintf(stderr, "yaz-icu: only one input file may be given\n"); + print_option_error(p_config); + } + p_config->infile = fopen(arg, "r"); + if (!p_config->infile) + { + fprintf(stderr, "yaz-icu: cannot open %s : %s\n", + arg, strerror(errno)); + exit(1); + } + break; default: + fprintf(stderr, "yaz_icu: invalid option: %s\n", arg); print_option_error(p_config); } } - - if ((!strlen(p_config->conffile) - && !strlen(p_config->print)) - || !config.infile - || !config.outfile) - - print_option_error(p_config); -} - -/* UConverter *conv; */ -/* conv = ucnv_open("utf-8", &status); */ -/* assert(U_SUCCESS(status)); */ - -/* *ustr16_len */ -/* = ucnv_toUChars(conv, ustr16, 1024, */ -/* (const char *) *xstr8, strlen((const char *) *xstr8), */ -/* &status); */ - - - -/* ucnv_fromUChars(conv, */ -/* (char *) *xstr8, strlen((const char *) *xstr8), */ -/* ustr16, *ustr16_len, */ -/* &status); */ -/* ucnv_close(conv); */ + if (p_config->infile == 0) + p_config->infile = stdin; + if (!strlen(p_config->conffile) && !strlen(p_config->print)) + print_option_error(p_config); +} static void print_icu_converters(const struct config_t *p_config) { @@ -128,55 +137,56 @@ static void print_icu_converters(const struct config_t *p_config) count = ucnv_countAvailable(); if (p_config->xmloutput) - fprintf(config.outfile, "\n", + fprintf(p_config->outfile, "\n", count, ucnv_getDefaultName()); - else { - fprintf(config.outfile, "Available ICU converters: %d\n", count); - fprintf(config.outfile, "Default ICU Converter is: '%s'\n", + else + { + fprintf(p_config->outfile, "Available ICU converters: %d\n", count); + fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n", ucnv_getDefaultName()); } - - for(i=0;ixmloutput) - fprintf(config.outfile, "\n", + fprintf(p_config->outfile, "\n", ucnv_getAvailableName(i)); - else - fprintf(config.outfile, "%s ", ucnv_getAvailableName(i)); + else + fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i)); } - + if (p_config->xmloutput) - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "\n"); else - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "\n"); } static void print_icu_transliterators(const struct config_t *p_config) { - int32_t buf_cap = 128; - char buf[buf_cap]; - int32_t i; - int32_t count = utrans_countAvailableIDs(); - + UErrorCode status; + UEnumeration *en = utrans_openIDs(&status); + int32_t count = uenum_count(en, &status); + const char *name; + int32_t length; + if (p_config->xmloutput) - fprintf(config.outfile, "\n", count); - else - fprintf(config.outfile, "Available ICU transliterators: %d\n", count); - - for(i = 0; i outfile, "\n", count); + else + fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count); + + while ((name = uenum_next(en, &length, &status))) { - utrans_getAvailableID(i, buf, buf_cap); if (p_config->xmloutput) - fprintf(config.outfile, "\n", buf); + fprintf(p_config->outfile, "\n", name); else - fprintf(config.outfile, " %s", buf); - } - - if (p_config->xmloutput){ - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "%s\n", name); } + uenum_close(en); + if (p_config->xmloutput) + fprintf(p_config->outfile, "\n"); else { - fprintf(config.outfile, "\n\nUnicode Set Patterns:\n" + fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n" " Pattern Description\n" " Ranges [a-z] The lower case letters a through z\n" " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n" @@ -213,15 +223,13 @@ static void print_icu_transliterators(const struct config_t *p_config) " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n" " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n" "\n" - "see http://icu.sourceforge.net/userguide/Transform.html\n" - " http://www.unicode.org/Public/UNIDATA/UCD.html\n" - " http://icu.sourceforge.net/userguide/Transform.html\n" - " http://icu.sourceforge.net/userguide/TransformRule.html\n" + "see http://userguide.icu-project.org/transforms/general\n" + " http://www.unicode.org/reports/tr44/\n" ); - - - fprintf(config.outfile, "\n\n"); - + + + fprintf(p_config->outfile, "\n\n"); + } } @@ -230,7 +238,7 @@ static void print_icu_xml_locales(const struct config_t *p_config) int32_t count; int32_t i; UErrorCode status = U_ZERO_ERROR; - + UChar keyword[64]; int32_t keyword_len = 0; char keyword_str[128]; @@ -268,28 +276,33 @@ static void print_icu_xml_locales(const struct config_t *p_config) count = uloc_countAvailable() ; - if (p_config->xmloutput){ - - fprintf(config.outfile, "\n", + if (p_config->xmloutput) + { + fprintf(p_config->outfile, "\n", count, uloc_getDefault(), ucol_countAvailable()); } - - for(i=0;ioutfile, "Available ICU locales: %d\n", count); + fprintf(p_config->outfile, "Default locale is: %s\n", uloc_getDefault()); + } + + for (i = 0; i < count; i++) { - keyword_len - = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", - keyword, 64, + keyword_len + = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", + keyword, 64, &status); u_strToUTF8(keyword_str, 128, &keyword_str_len, keyword, keyword_len, &status); - - - language_len - = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", - language, 64, + + + language_len + = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", + language, 64, &status); u_strToUTF8(lang_str, 128, &lang_str_len, @@ -297,45 +310,45 @@ static void print_icu_xml_locales(const struct config_t *p_config) &status); - script_len - = uloc_getDisplayScript(uloc_getAvailable(i), "en", - script, 64, + script_len + = uloc_getDisplayScript(uloc_getAvailable(i), "en", + script, 64, &status); u_strToUTF8(script_str, 128, &script_str_len, script, script_len, &status); - location_len - = uloc_getDisplayCountry(uloc_getAvailable(i), "en", - location, 64, + location_len + = uloc_getDisplayCountry(uloc_getAvailable(i), "en", + location, 64, &status); u_strToUTF8(location_str, 128, &location_str_len, location, location_len, &status); - variant_len - = uloc_getDisplayVariant(uloc_getAvailable(i), "en", - variant, 64, + variant_len + = uloc_getDisplayVariant(uloc_getAvailable(i), "en", + variant, 64, &status); u_strToUTF8(variant_str, 128, &variant_str_len, variant, variant_len, &status); - name_len - = uloc_getDisplayName(uloc_getAvailable(i), "en", - name, 64, + name_len + = uloc_getDisplayName(uloc_getAvailable(i), "en", + name, 64, &status); u_strToUTF8(name_str, 128, &name_str_len, name, name_len, &status); - localname_len - = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), - localname, 64, + localname_len + = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), + localname, 64, &status); u_strToUTF8(localname_str, 128, &localname_str_len, @@ -343,51 +356,49 @@ static void print_icu_xml_locales(const struct config_t *p_config) &status); - if (p_config->xmloutput){ - fprintf(config.outfile, "xmloutput) + { + fprintf(p_config->outfile, "outfile, " language=\"%s\"", lang_str); if (strlen(script_str)) - fprintf(config.outfile, " script=\"%s\"", script_str); + fprintf(p_config->outfile, " script=\"%s\"", script_str); if (strlen(location_str)) - fprintf(config.outfile, " location=\"%s\"", location_str); + fprintf(p_config->outfile, " location=\"%s\"", location_str); if (strlen(variant_str)) - fprintf(config.outfile, " variant=\"%s\"", variant_str); + fprintf(p_config->outfile, " variant=\"%s\"", variant_str); if (strlen(name_str)) - fprintf(config.outfile, " name=\"%s\"", name_str); + fprintf(p_config->outfile, " name=\"%s\"", name_str); if (strlen(localname_str)) - fprintf(config.outfile, " localname=\"%s\"", localname_str); - fprintf(config.outfile, ">"); + fprintf(p_config->outfile, " localname=\"%s\"", localname_str); + fprintf(p_config->outfile, ">"); if (strlen(localname_str)) - fprintf(config.outfile, "%s", localname_str); - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "%s", localname_str); + fprintf(p_config->outfile, "\n"); } - else if (1 == p_config->xmloutput){ - fprintf(config.outfile, "%s", uloc_getAvailable(i)); - fprintf(config.outfile, " | "); + else if (1 == p_config->xmloutput) + { + fprintf(p_config->outfile, "%s", uloc_getAvailable(i)); + fprintf(p_config->outfile, " | "); if (strlen(name_str)) - fprintf(config.outfile, "%s", name_str); - fprintf(config.outfile, " | "); + fprintf(p_config->outfile, "%s", name_str); + fprintf(p_config->outfile, " | "); if (strlen(localname_str)) - fprintf(config.outfile, "%s", localname_str); - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "%s", localname_str); + fprintf(p_config->outfile, "\n"); } else - fprintf(config.outfile, "%s ", uloc_getAvailable(i)); + fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i)); } if (p_config->xmloutput) - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "\n"); else - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "\n"); - if(U_FAILURE(status)) { + if (U_FAILURE(status)) + { fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status)); - exit(status); + exit(2); } } @@ -395,148 +406,181 @@ static void print_icu_xml_locales(const struct config_t *p_config) static void print_info(const struct config_t *p_config) { if (p_config->xmloutput) - fprintf(config.outfile, "\n" + fprintf(p_config->outfile, "\n" "\n"); - if ('c' == config.print[0]) - print_icu_converters(&config); - else if ('l' == config.print[0]) - print_icu_xml_locales(&config); - else if ('t' == config.print[0]) - print_icu_transliterators(&config); + if ('c' == p_config->print[0]) + print_icu_converters(p_config); + else if ('l' == p_config->print[0]) + print_icu_xml_locales(p_config); + else if ('t' == p_config->print[0]) + print_icu_transliterators(p_config); else { - print_icu_converters(&config); - print_icu_xml_locales(&config); - print_icu_transliterators(&config); + print_icu_converters(p_config); + print_icu_xml_locales(p_config); + print_icu_transliterators(p_config); } if (p_config->xmloutput) - fprintf(config.outfile, "\n"); + fprintf(p_config->outfile, "\n"); exit(0); } -static void process_text_file(const struct config_t *p_config) +static void process_text_file(struct config_t *p_config) { char *line = 0; char linebuf[1024]; - - xmlDoc *doc = xmlParseFile(config.conffile); + + xmlDoc *doc = xmlParseFile(p_config->conffile); xmlNode *xml_node = xmlDocGetRootElement(doc); - xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale"); - long unsigned int token_count = 0; - long unsigned int line_count = 0; - + long unsigned int token_count = 0; + long unsigned int line_count = 0; + UErrorCode status = U_ZERO_ERROR; - int success = 0; - - if (! xml_node) { + + if (!xml_node) + { printf("Could not parse XML config file '%s' \n", - config.conffile); - exit (1); + p_config->conffile); + exit(1); } - if (!xml_locale || !strlen((const char *) xml_locale)) - return; - - config.chain = icu_chain_xml_config(xml_node, (const char *) xml_locale, 0, - &status); - - xmlFree(xml_locale); + p_config->chain = icu_chain_xml_config(xml_node, 1, &status); - - if (config.chain && U_SUCCESS(status)) - success = 1; - else { + if (!p_config->chain || !U_SUCCESS(status)) + { printf("Could not set up ICU chain from config file '%s' \n", - config.conffile); - exit (1); + p_config->conffile); + exit(1); } - - if (p_config->xmloutput) - fprintf(config.outfile, + fprintf(p_config->outfile, "\n" "\n" "\n"); - + /* read input lines for processing */ - while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile))) + while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile))) { - success = icu_chain_assign_cstr(config.chain, line, &status); + WRBUF sw = wrbuf_alloc(); + WRBUF cdata = wrbuf_alloc(); + int success = icu_chain_assign_cstr(p_config->chain, line, &status); line_count++; - while (success && icu_chain_next_token(config.chain, &status)){ + while (success && icu_chain_next_token(p_config->chain, &status)) + { if (U_FAILURE(status)) success = 0; - else { + else + { + size_t start, len; + const char *sortkey = icu_chain_token_sortkey(p_config->chain); + + icu_chain_get_org_info(p_config->chain, &start, &len); + wrbuf_rewind(sw); + wrbuf_puts_escaped(sw, sortkey); token_count++; - if (p_config->xmloutput) - fprintf(config.outfile, - "\n", - token_count, - line_count, - icu_chain_token_norm(config.chain), - icu_chain_token_display(config.chain)); + if (p_config->xmloutput) + { + fprintf(p_config->outfile, + "chain)); + fprintf(p_config->outfile, " norm=\"%s\"", + wrbuf_cstr(cdata)); + + wrbuf_rewind(cdata); + wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain)); + fprintf(p_config->outfile, " display=\"%s\"", + wrbuf_cstr(cdata)); + + if (p_config->sortoutput) + { + wrbuf_rewind(cdata); + wrbuf_xmlputs(cdata, wrbuf_cstr(sw)); + fprintf(p_config->outfile, " sortkey=\"%s\"", + wrbuf_cstr(cdata)); + } + fprintf(p_config->outfile, "/>\n"); + } else - fprintf(config.outfile, "%lu %lu '%s' '%s'\n", + { + fprintf(p_config->outfile, "%lu %lu '%s' '%s'", token_count, line_count, - icu_chain_token_norm(config.chain), - icu_chain_token_display(config.chain)); + icu_chain_token_norm(p_config->chain), + icu_chain_token_display(p_config->chain)); + if (p_config->sortoutput) + { + fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw)); + } + if (p_config->org_output) + { + fprintf(p_config->outfile, " %ld+%ld", + (long) start, (long) len); + } + fprintf(p_config->outfile, "\n"); + } } } - + wrbuf_destroy(sw); + wrbuf_destroy(cdata); } if (p_config->xmloutput) - fprintf(config.outfile, + fprintf(p_config->outfile, "\n" "\n"); - icu_chain_destroy(config.chain); + icu_chain_destroy(p_config->chain); xmlFreeDoc(doc); if (line) free(line); } -#endif /* HAVE_ICU */ +#endif /* YAZ_HAVE_ICU */ -int main(int argc, char **argv) +int main(int argc, char **argv) { +#if YAZ_HAVE_ICU + struct config_t config; -#if HAVE_ICU - + yaz_enable_panic_backtrace(*argv); read_params(argc, argv, &config); if (config.conffile && strlen(config.conffile)) process_text_file(&config); - + if (config.print && strlen(config.print)) print_info(&config); -#else /* HAVE_ICU */ + u_cleanup(); +#else /* YAZ_HAVE_ICU */ printf("ICU not available on your system.\n" - "Please install libicu36-dev and icu-doc or similar, " + "Please install libicu-dev and icu-doc or similar, " "re-configure and re-compile\n"); -#endif /* HAVE_ICU */ + exit(3); +#endif /* YAZ_HAVE_ICU */ - return(0); + return 0; } /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab