-/*
- * Copyright (C) 1995-2007, Index Data ApS
+/* This file is part of the YAZ toolkit.
+ * Copyright (C) 1995-2013 Index Data
* See the file LICENSE for details.
- *
- * $Id: yaz-icu.c,v 1.3 2007-10-24 14:48:18 marc Exp $
*/
#if HAVE_CONFIG_H
#include <stdio.h>
#include <stdlib.h>
+#include <errno.h>
#include <yaz/options.h>
-
-#if HAVE_ICU
+#if YAZ_HAVE_ICU
#include <unicode/ucnv.h>
#include <unicode/ustring.h>
+#include <unicode/ucol.h>
+#include <unicode/ubrk.h>
+#include <unicode/utrans.h>
+#include <unicode/uclean.h>
-#include <yaz/icu_I18N.h>
+#include <yaz/icu.h>
+#include <yaz/wrbuf.h>
/* command line and config parameters */
/* Settings shared by the yaz-icu helpers; read_params() fills in defaults and option values. */
-static struct config_t {
+struct config_t {
char conffile[1024]; /* XML configuration file name; handed to xmlParseFile() */
char print[1024]; /* -p argument; print[0] selects which ICU info is listed */
int xmloutput; /* non-zero (set by -x): emit XML instead of plain text */
- struct icu_chain * chain;
+ int sortoutput; /* non-zero (set by -s): also show each token's sort key */
+ int org_output; /* non-zero (set by -o): text output also shows start+length of the token */
+ yaz_icu_chain_t chain; /* assigned from icu_chain_xml_config() in process_text_file() */
FILE * infile; /* input stream: the file named on the command line, else stdin */
FILE * outfile; /* output stream; read_params() sets it to stdout */
-} config;
+};
-
-
/*
 * Write the yaz-icu usage summary, two example invocations and a sample
 * ICU chain configuration to stderr, then terminate with exit status 1.
 * p_config is not consulted; the function never returns.
 */
void print_option_error(const struct config_t *p_config)
-{
- fprintf(stderr, "Calling error, valid options are :\n");
- fprintf(stderr, "yaz-icu\n"
- " [-c (path/to/config/file.xml)]\n"
- " [-p (a|c|l|t)] print ICU info \n"
- " [-x] XML output\n"
+{
+ fprintf(stderr, "yaz-icu [options] [infile]\n"
+ "Options:\n"
+ " -c file XML configuration\n"
+ " -p a|c|l|t Print ICU info \n"
+ " -s Show sort normalization key\n"
+ " -o Show org positions\n"
+ " -x XML output instead of text\n"
"\n"
"Examples:\n"
"cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
"./yaz-icu -p t -x\n"
"\n"
"Example ICU chain XML configuration file:\n"
- "<icu_chain id=\"en:word\" locale=\"en\">\n"
- " <normalize rule=\"[:Control:] Any-Remove\"/>\n"
+ "<icu_chain locale=\"en\">\n"
+ " <transform rule=\"[:Control:] Any-Remove\"/>\n"
" <tokenize rule=\"l\"/>\n"
- " <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
- " <display/>\n"
+ " <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
" <casemap rule=\"l\"/>\n"
- " <index/>\n"
- " <sortkey/>\n"
"</icu_chain>\n"
);
exit(1);
}
/*
 * Parse the command line into *p_config.
 * Every field is first reset to its default (outfile is stdout); the option
 * string "c:op:sx" is then walked with options().  A single non-option
 * argument names the input file; when none is given stdin is used.
 * The usage text is printed (and the program exits) on an unknown option,
 * on a second input file, or when neither a configuration file nor a -p
 * selection ends up being set.
 */
void read_params(int argc, char **argv, struct config_t *p_config)
-{
+{
char *arg;
int ret;
-
+
/* set default parameters */
p_config->conffile[0] = 0;
p_config->print[0] = 0;
p_config->xmloutput = 0;
+ p_config->sortoutput = 0;
p_config->chain = 0;
- p_config->infile = stdin;
+ p_config->infile = 0;
p_config->outfile = stdout;
-
+ p_config->org_output = 0;
+
/* set up command line parameters */
-
- while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
+
+ /* NOTE(review): "c:" is declared in the option string but no `case 'c'` is visible in this hunk - confirm that conffile is filled in by code not shown here */
+ while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
{
switch (ret)
{
case 'p':
- strcpy(p_config->print, arg);
+ /* bounded copy: arg comes straight from the command line and print[] is a fixed-size buffer */
+ snprintf(p_config->print, sizeof(p_config->print), "%s", arg);
break;
+ case 's':
+ p_config->sortoutput = 1;
+ break;
case 'x':
p_config->xmloutput = 1;
break;
+ case 'o':
+ p_config->org_output = 1;
+ break;
+ case 0:
+ if (p_config->infile)
+ {
+ fprintf(stderr, "yaz-icu: only one input file may be given\n");
+ print_option_error(p_config);
+ }
+ p_config->infile = fopen(arg, "r");
+ if (!p_config->infile)
+ {
+ fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
+ arg, strerror(errno));
+ exit(1);
+ }
+ break;
default:
+ fprintf(stderr, "yaz-icu: invalid option: %s\n", arg);
print_option_error(p_config);
}
}
-
- if ((!strlen(p_config->conffile)
- && !strlen(p_config->print))
- || !config.infile
- || !config.outfile)
-
- print_option_error(p_config);
-}
-
-/* UConverter *conv; */
-/* conv = ucnv_open("utf-8", &status); */
-/* assert(U_SUCCESS(status)); */
-
-/* *ustr16_len */
-/* = ucnv_toUChars(conv, ustr16, 1024, */
-/* (const char *) *xstr8, strlen((const char *) *xstr8), */
-/* &status); */
-
-
-
-/* ucnv_fromUChars(conv, */
-/* (char *) *xstr8, strlen((const char *) *xstr8), */
-/* ustr16, *ustr16_len, */
-/* &status); */
-/* ucnv_close(conv); */
+ /* no input file on the command line: read from stdin */
+ if (p_config->infile == 0)
+ p_config->infile = stdin;
+ /* there is nothing to do unless a configuration file or a -p selection was supplied */
+ if (!strlen(p_config->conffile) && !strlen(p_config->print))
+ print_option_error(p_config);
+}
/*
 * List the available ICU converters on p_config->outfile: first the count and
 * the default converter name, then one entry per converter.  With xmloutput
 * set the list is wrapped in a <converters> element, otherwise it is plain
 * text followed by an empty line.
 * NOTE(review): the declarations of `count` and `i` are not visible in this view.
 */
static void print_icu_converters(const struct config_t *p_config)
{
count = ucnv_countAvailable();
if (p_config->xmloutput)
- fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
+ fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
count, ucnv_getDefaultName());
- else {
- fprintf(config.outfile, "Available ICU converters: %d\n", count);
- fprintf(config.outfile, "Default ICU Converter is: '%s'\n",
+ else
+ {
+ fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
+ fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
ucnv_getDefaultName());
}
-
- for(i=0;i<count;i++){
+
+ for (i = 0; i < count; i++)
+ {
if (p_config->xmloutput)
- fprintf(config.outfile, "<converter id=\"%s\"/>\n",
+ fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
ucnv_getAvailableName(i));
- else
- fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
+ else
+ fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
}
-
+
if (p_config->xmloutput)
- fprintf(config.outfile, "</converters>\n");
+ fprintf(p_config->outfile, "</converters>\n");
else
- fprintf(config.outfile, "\n");
+ fprintf(p_config->outfile, "\n");
}
/*
 * List the IDs of the available ICU transliterators on p_config->outfile.
 * The IDs are walked through the enumeration returned by utrans_openIDs().
 * With xmloutput set the list is wrapped in a <transliterators> element;
 * in text mode the list is followed by a short note on Unicode set patterns.
 */
static void print_icu_transliterators(const struct config_t *p_config)
{
- int32_t buf_cap = 128;
- char buf[buf_cap];
- int32_t i;
- int32_t count = utrans_countAvailableIDs();
-
+ /* must start as U_ZERO_ERROR: ICU functions do nothing when the incoming error code already signals a failure */
+ UErrorCode status = U_ZERO_ERROR;
+ UEnumeration *en = utrans_openIDs(&status);
+ int32_t count = uenum_count(en, &status);
+ const char *name;
+ int32_t length;
+
if (p_config->xmloutput)
- fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
- else
- fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
-
- for(i = 0; i <count; i++)
+ fprintf(p_config->outfile, "<transliterators count=\"%d\">\n", count);
+ else
+ fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
+
+ while ((name = uenum_next(en, &length, &status)))
{
- utrans_getAvailableID(i, buf, buf_cap);
if (p_config->xmloutput)
- fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
+ fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name);
else
- fprintf(config.outfile, " %s", buf);
- }
-
- if (p_config->xmloutput){
- fprintf(config.outfile, "</transliterators>\n");
+ fprintf(p_config->outfile, "%s\n", name);
}
+ uenum_close(en);
+ if (p_config->xmloutput)
+ fprintf(p_config->outfile, "</transliterators>\n");
else
{
- fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
+ fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n"
" Pattern Description\n"
" Ranges [a-z] The lower case letters a through z\n"
" Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
" [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
" [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
"\n"
- "see http://icu.sourceforge.net/userguide/Transform.html\n"
- " http://www.unicode.org/Public/UNIDATA/UCD.html\n"
- " http://icu.sourceforge.net/userguide/Transform.html\n"
- " http://icu.sourceforge.net/userguide/TransformRule.html\n"
+ "see http://userguide.icu-project.org/transforms/general\n"
+ " http://www.unicode.org/reports/tr44/\n"
);
-
-
- fprintf(config.outfile, "\n\n");
-
+
+
+ fprintf(p_config->outfile, "\n\n");
+
}
}
/*
 * Body of the locale listing (invoked as print_icu_xml_locales() from print_info()).
 * For every available ICU locale the English display strings are fetched into
 * 64-UChar buffers and converted to UTF-8 in 128-byte buffers; the locale is
 * then written to p_config->outfile as a <locale> element (xmloutput set) or
 * as its bare ID on a line of its own.
 * NOTE(review): the function signature, several declarations and some call
 * arguments are not visible in this view.
 */
int32_t count;
int32_t i;
UErrorCode status = U_ZERO_ERROR;
-
+
UChar keyword[64];
int32_t keyword_len = 0;
char keyword_str[128];
count = uloc_countAvailable() ;
- if (p_config->xmloutput){
-
- fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
+ if (p_config->xmloutput)
+ {
+ fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
count, uloc_getDefault(), ucol_countAvailable());
}
-
- for(i=0;i<count;i++)
+ else
+ {
+ fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
+ fprintf(p_config->outfile, "Default locale is: %s\n", uloc_getDefault());
+ }
+
+ for (i = 0; i < count; i++)
{
- keyword_len
- = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
- keyword, 64,
+ keyword_len
+ = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
+ keyword, 64,
&status);
u_strToUTF8(keyword_str, 128, &keyword_str_len,
keyword, keyword_len,
&status);
-
-
- language_len
- = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
- language, 64,
+
+
+ language_len
+ = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
+ language, 64,
&status);
u_strToUTF8(lang_str, 128, &lang_str_len,
&status);
- script_len
- = uloc_getDisplayScript(uloc_getAvailable(i), "en",
- script, 64,
+ script_len
+ = uloc_getDisplayScript(uloc_getAvailable(i), "en",
+ script, 64,
&status);
u_strToUTF8(script_str, 128, &script_str_len,
script, script_len,
&status);
- location_len
- = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
- location, 64,
+ location_len
+ = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
+ location, 64,
&status);
u_strToUTF8(location_str, 128, &location_str_len,
location, location_len,
&status);
- variant_len
- = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
- variant, 64,
+ variant_len
+ = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
+ variant, 64,
&status);
u_strToUTF8(variant_str, 128, &variant_str_len,
variant, variant_len,
&status);
- name_len
- = uloc_getDisplayName(uloc_getAvailable(i), "en",
- name, 64,
+ name_len
+ = uloc_getDisplayName(uloc_getAvailable(i), "en",
+ name, 64,
&status);
u_strToUTF8(name_str, 128, &name_str_len,
name, name_len,
&status);
/* same call with the locale as its own display locale: the name as written in that locale */
- localname_len
- = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
- localname, 64,
+ localname_len
+ = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
+ localname, 64,
&status);
u_strToUTF8(localname_str, 128, &localname_str_len,
&status);
- if (p_config->xmloutput){
- fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
- /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
- /* if (strlen(keyword_str)) */
- /* fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
- /* if (ucol_getAvailable(i)) */
- /* fprintf(config.outfile, " collation=\"1\""); */
+ if (p_config->xmloutput)
+ {
+ fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
if (strlen(lang_str))
- fprintf(config.outfile, " language=\"%s\"", lang_str);
+ fprintf(p_config->outfile, " language=\"%s\"", lang_str);
if (strlen(script_str))
- fprintf(config.outfile, " script=\"%s\"", script_str);
+ fprintf(p_config->outfile, " script=\"%s\"", script_str);
if (strlen(location_str))
- fprintf(config.outfile, " location=\"%s\"", location_str);
+ fprintf(p_config->outfile, " location=\"%s\"", location_str);
if (strlen(variant_str))
- fprintf(config.outfile, " variant=\"%s\"", variant_str);
+ fprintf(p_config->outfile, " variant=\"%s\"", variant_str);
if (strlen(name_str))
- fprintf(config.outfile, " name=\"%s\"", name_str);
+ fprintf(p_config->outfile, " name=\"%s\"", name_str);
if (strlen(localname_str))
- fprintf(config.outfile, " localname=\"%s\"", localname_str);
- fprintf(config.outfile, ">");
+ fprintf(p_config->outfile, " localname=\"%s\"", localname_str);
+ fprintf(p_config->outfile, ">");
if (strlen(localname_str))
- fprintf(config.outfile, "%s", localname_str);
- fprintf(config.outfile, "</locale>\n");
+ fprintf(p_config->outfile, "%s", localname_str);
+ fprintf(p_config->outfile, "</locale>\n");
}
/* NOTE(review): this branch cannot be taken - the preceding `if (p_config->xmloutput)` already handles every non-zero value, 1 included */
- else if (1 == p_config->xmloutput){
- fprintf(config.outfile, "%s", uloc_getAvailable(i));
- fprintf(config.outfile, " | ");
+ else if (1 == p_config->xmloutput)
+ {
+ fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
+ fprintf(p_config->outfile, " | ");
if (strlen(name_str))
- fprintf(config.outfile, "%s", name_str);
- fprintf(config.outfile, " | ");
+ fprintf(p_config->outfile, "%s", name_str);
+ fprintf(p_config->outfile, " | ");
if (strlen(localname_str))
- fprintf(config.outfile, "%s", localname_str);
- fprintf(config.outfile, "\n");
+ fprintf(p_config->outfile, "%s", localname_str);
+ fprintf(p_config->outfile, "\n");
}
else
- fprintf(config.outfile, "%s ", uloc_getAvailable(i));
+ fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i));
}
if (p_config->xmloutput)
- fprintf(config.outfile, "</locales>\n");
+ fprintf(p_config->outfile, "</locales>\n");
else
- fprintf(config.outfile, "\n");
+ fprintf(p_config->outfile, "\n");
/* status is shared by every ICU call above and is only inspected here, after all output has been written */
- if(U_FAILURE(status)) {
+ if (U_FAILURE(status))
+ {
fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
- exit(status);
+ exit(2);
}
}
/*
 * Print the ICU information selected by p_config->print[0]:
 * 'c' converters, 'l' locales, 't' transliterators, anything else all three.
 * With xmloutput set the listing is wrapped in an XML declaration and an
 * <icu> element.  Ends the program with exit(0), so it does not return.
 */
static void print_info(const struct config_t *p_config)
{
if (p_config->xmloutput)
- fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+ fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<icu>\n");
- if ('c' == config.print[0])
- print_icu_converters(&config);
- else if ('l' == config.print[0])
- print_icu_xml_locales(&config);
- else if ('t' == config.print[0])
- print_icu_transliterators(&config);
+ if ('c' == p_config->print[0])
+ print_icu_converters(p_config);
+ else if ('l' == p_config->print[0])
+ print_icu_xml_locales(p_config);
+ else if ('t' == p_config->print[0])
+ print_icu_transliterators(p_config);
else {
- print_icu_converters(&config);
- print_icu_xml_locales(&config);
- print_icu_transliterators(&config);
+ print_icu_converters(p_config);
+ print_icu_xml_locales(p_config);
+ print_icu_transliterators(p_config);
}
if (p_config->xmloutput)
- fprintf(config.outfile, "</icu>\n");
+ fprintf(p_config->outfile, "</icu>\n");
exit(0);
}
/*
 * Build an ICU chain from the XML file named by p_config->conffile, feed it
 * every line read from p_config->infile and write one record per token to
 * p_config->outfile: a <token/> element when xmloutput is set, otherwise a
 * text line "token-number line-number 'norm' 'display'" with the sort key
 * and the start+length position appended on request.
 * Exits with status 1 when the configuration cannot be parsed or the chain
 * cannot be set up.
 */
-static void process_text_file(const struct config_t *p_config)
+static void process_text_file(struct config_t *p_config)
{
char *line = 0;
char linebuf[1024];
-
- xmlDoc *doc = xmlParseFile(config.conffile);
+
+ xmlDoc *doc = xmlParseFile(p_config->conffile);
xmlNode *xml_node = xmlDocGetRootElement(doc);
- xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
- long unsigned int token_count = 0;
- long unsigned int line_count = 0;
-
+ long unsigned int token_count = 0;
+ long unsigned int line_count = 0;
+
UErrorCode status = U_ZERO_ERROR;
- int success = 0;
-
- if (! xml_node) {
+
+ if (!xml_node)
+ {
printf("Could not parse XML config file '%s' \n",
- config.conffile);
- exit (1);
+ p_config->conffile);
+ exit(1);
}
- if (!xml_locale || !strlen((const char *) xml_locale))
- return;
-
- config.chain = icu_chain_xml_config(xml_node, (uint8_t *) "en", &status);
-
- xmlFree(xml_locale);
+ p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
-
- if (config.chain && U_SUCCESS(status))
- success = 1;
- else {
+ if (!p_config->chain || !U_SUCCESS(status))
+ {
printf("Could not set up ICU chain from config file '%s' \n",
- config.conffile);
- exit (1);
+ p_config->conffile);
+ if (!U_SUCCESS(status))
+ printf("ICU Error: %d %s\n", status, u_errorName(status));
+ exit(1);
}
-
-
if (p_config->xmloutput)
- fprintf(config.outfile,
+ fprintf(p_config->outfile,
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<icu>\n"
"<tokens>\n");
-
+
/* read input lines for processing */
/* line_count goes up once per fgets() call, i.e. once per buffer-sized piece of input */
- while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
+ while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
{
- success = icu_chain_assign_cstr(config.chain, line, &status);
/* sw receives the escaped sort key of each token; cdata is scratch space for XML-escaped attribute values */
+ WRBUF sw = wrbuf_alloc();
+ WRBUF cdata = wrbuf_alloc();
+ int success = icu_chain_assign_cstr(p_config->chain, line, &status);
line_count++;
- while (success && icu_chain_next_token(config.chain, &status)){
+ while (success && icu_chain_next_token(p_config->chain, &status))
+ {
if (U_FAILURE(status))
success = 0;
- else {
+ else
+ {
+ size_t start, len;
+ const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+ icu_chain_get_org_info(p_config->chain, &start, &len);
+ wrbuf_rewind(sw);
+ wrbuf_puts_escaped(sw, sortkey);
token_count++;
- if (p_config->xmloutput)
- fprintf(config.outfile,
- "<token id=\%lu\" line=\"%lu\""
- " norm=\"%s\" display=\"%s\"/>\n",
- token_count,
- line_count,
- icu_chain_get_norm(config.chain),
- icu_chain_get_display(config.chain));
+ if (p_config->xmloutput)
+ {
+ fprintf(p_config->outfile,
+ "<token id=\"%lu\" line=\"%lu\"",
+ token_count, line_count);
+
+ wrbuf_rewind(cdata);
+ wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
+ fprintf(p_config->outfile, " norm=\"%s\"",
+ wrbuf_cstr(cdata));
+
+ wrbuf_rewind(cdata);
+ wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
+ fprintf(p_config->outfile, " display=\"%s\"",
+ wrbuf_cstr(cdata));
+
+ if (p_config->sortoutput)
+ {
+ wrbuf_rewind(cdata);
+ wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
+ fprintf(p_config->outfile, " sortkey=\"%s\"",
+ wrbuf_cstr(cdata));
+ }
+ fprintf(p_config->outfile, "/>\n");
+ }
else
- fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
+ {
+ fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
token_count,
line_count,
- icu_chain_get_norm(config.chain),
- icu_chain_get_display(config.chain));
+ icu_chain_token_norm(p_config->chain),
+ icu_chain_token_display(p_config->chain));
+ if (p_config->sortoutput)
+ {
+ fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
+ }
+ if (p_config->org_output)
+ {
+ fprintf(p_config->outfile, " %ld+%ld",
+ (long) start, (long) len);
+ }
+ fprintf(p_config->outfile, "\n");
+ }
}
}
-
+ wrbuf_destroy(sw);
+ wrbuf_destroy(cdata);
}
if (p_config->xmloutput)
- fprintf(config.outfile,
+ fprintf(p_config->outfile,
"</tokens>\n"
"</icu>\n");
- icu_chain_destroy(config.chain);
+ icu_chain_destroy(p_config->chain);
xmlFreeDoc(doc);
/* NOTE(review): line is only ever assigned from fgets() and is 0 once the loop above has ended, so this free() is never reached; it would be wrong if it were, since a non-zero value points into linebuf */
if (line)
free(line);
}
-#endif /* HAVE_ICU */
+#endif /* YAZ_HAVE_ICU */
/*
 * Program entry point.  With ICU support compiled in, the command line is
 * parsed, the input is run through the configured chain when a configuration
 * file name is set, and ICU information is printed when a -p selection is
 * set.  Without ICU support a notice is printed and the exit status is 3.
 */
-int main(int argc, char **argv)
+int main(int argc, char **argv)
{
-
-#if HAVE_ICU
+#if YAZ_HAVE_ICU
+ struct config_t config;
read_params(argc, argv, &config);
if (config.conffile && strlen(config.conffile))
process_text_file(&config);
-
+
/* print_info() finishes with exit(0), so the u_cleanup() below only runs when nothing was selected with -p */
if (config.print && strlen(config.print))
print_info(&config);
-#else /* HAVE_ICU */
+ u_cleanup();
+#else /* YAZ_HAVE_ICU */
printf("ICU not available on your system.\n"
- "Please install libicu36-dev and icu-doc or similar, "
+ "Please install libicu-dev and icu-doc or similar, "
"re-configure and re-compile\n");
-#endif /* HAVE_ICU */
+ exit(3);
+#endif /* YAZ_HAVE_ICU */
- return(0);
+ return 0;
}
/*
* Local variables:
* c-basic-offset: 4
+ * c-file-style: "Stroustrup"
* indent-tabs-mode: nil
* End:
* vim: shiftwidth=4 tabstop=8 expandtab