1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2011 Index Data
3 * See the file LICENSE for details.
16 #include <yaz/options.h>
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h>
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
27 #include <yaz/wrbuf.h>
29 /* command line and config parameters */
30 static struct config_t {
35 yaz_icu_chain_t chain; /* ICU chain; assigned from icu_chain_xml_config() in process_text_file() */
/*
 * Write the yaz-icu usage text to stderr: the option summary, a usage
 * example and a sample ICU chain XML configuration.
 *
 * p_config: the parsed configuration (not referenced in the lines shown here).
 */
42 void print_option_error(const struct config_t *p_config)
44 fprintf(stderr, "yaz-icu [options] [infile]\n"
46 " -c file XML configuration\n"
47 " -p a|c|l|t Print ICU info \n"
48 " -s Show sort normalization key\n"
49 " -x XML output instead of text\n"
52 "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
57 "Example ICU chain XML configuration file:\n"
58 "<icu_chain locale=\"en\">\n"
59 " <transform rule=\"[:Control:] Any-Remove\"/>\n"
60 " <tokenize rule=\"l\"/>\n"
61 " <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
62 " <casemap rule=\"l\"/>\n"
/*
 * Fill *p_config from the command line.
 *
 * Resets the configuration to its defaults, then walks the arguments with
 * options() using the spec "c:p:xs":
 *   -c file  copied into p_config->conffile
 *   -p what  copied into p_config->print
 *   -s       sets p_config->sortoutput
 *   -x       sets p_config->xmloutput
 * A non-option argument is opened for reading as the input file; a second
 * one is rejected. When no input file is given, stdin is used. When neither
 * a config file nor a print selector was supplied, the usage text is printed.
 */
68 void read_params(int argc, char **argv, struct config_t *p_config)
73 /* set default parameters */
74 p_config->conffile[0] = 0;
75 p_config->print[0] = 0;
76 p_config->xmloutput = 0;
77 p_config->sortoutput = 0;
80 p_config->outfile = stdout;
82 /* set up command line parameters */
84 while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
/* NOTE(review): the two strcpy() calls below are unbounded; the size of the
 * destination buffers is not visible here - confirm an over-long argument
 * cannot overflow them. */
89 strcpy(p_config->conffile, arg);
92 strcpy(p_config->print, arg);
95 p_config->sortoutput = 1;
98 p_config->xmloutput = 1;
101 if (p_config->infile)
103 fprintf(stderr, "yaz-icu: only one input file may be given\n");
104 print_option_error(p_config);
106 p_config->infile = fopen(arg, "r");
107 if (!p_config->infile)
109 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
110 arg, strerror(errno));
/* Program name spelled "yaz-icu" to match the other diagnostics. */
115 fprintf(stderr, "yaz-icu: invalid option: %s\n", arg);
116 print_option_error(p_config);
/* No input file named on the command line: read from stdin. */
120 if (p_config->infile == 0)
121 p_config->infile = stdin;
/* Nothing to do without either a config file (-c) or a print selector (-p). */
123 if (!strlen(p_config->conffile) && !strlen(p_config->print))
124 print_option_error(p_config);
128 /* UConverter *conv; */
129 /* conv = ucnv_open("utf-8", &status); */
130 /* assert(U_SUCCESS(status)); */
133 /* = ucnv_toUChars(conv, ustr16, 1024, */
134 /* (const char *) *xstr8, strlen((const char *) *xstr8), */
139 /* ucnv_fromUChars(conv, */
140 /* (char *) *xstr8, strlen((const char *) *xstr8), */
141 /* ustr16, *ustr16_len, */
143 /* ucnv_close(conv); */
/*
 * List the ICU converters reported by ucnv_countAvailable() /
 * ucnv_getAvailableName(), together with the default converter name.
 *
 * p_config->xmloutput selects XML (<converters>/<converter>) or plain text.
 * Output goes to the file-scope config.outfile, not to p_config->outfile.
 */
146 static void print_icu_converters(const struct config_t *p_config)
151 count = ucnv_countAvailable();
152 if (p_config->xmloutput)
153 fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
154 count, ucnv_getDefaultName());
157 fprintf(config.outfile, "Available ICU converters: %d\n", count);
158 fprintf(config.outfile, "Default ICU Converter is: '%s'\n",
159 ucnv_getDefaultName());
/* One entry per available converter. */
162 for (i = 0; i < count; i++)
164 if (p_config->xmloutput)
165 fprintf(config.outfile, "<converter id=\"%s\"/>\n",
166 ucnv_getAvailableName(i));
168 fprintf(config.outfile, "%s\n", ucnv_getAvailableName(i));
171 if (p_config->xmloutput)
172 fprintf(config.outfile, "</converters>\n");
174 fprintf(config.outfile, "\n");
/*
 * List the ICU transliterator IDs obtained from utrans_openIDs(), followed by
 * help text on Unicode set patterns and example transform rules.
 *
 * p_config->xmloutput selects XML (<transliterators>/<transliterator>) or
 * plain text. Output goes to the file-scope config.outfile.
 */
177 static void print_icu_transliterators(const struct config_t *p_config)
180 UEnumeration *en = utrans_openIDs(&status);
181 int32_t count = uenum_count(en, &status);
185 if (p_config->xmloutput)
186 fprintf(config.outfile, "<transliterators count=\"%d\">\n", count);
188 fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
/* uenum_next() returns NULL once the enumeration is exhausted. */
190 while ((name = uenum_next(en, &length, &status)))
192 if (p_config->xmloutput)
193 fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", name);
195 fprintf(config.outfile, "%s\n", name);
198 if (p_config->xmloutput)
199 fprintf(config.outfile, "</transliterators>\n");
202 fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
203 " Pattern Description\n"
204 " Ranges [a-z] The lower case letters a through z\n"
205 " Named Chars [abc123] The six characters a,b,c,1,2 and 3\n"
206 " String [abc{def}] chars a, b and c, and string 'def'\n"
207 " Categories [\\p{Letter}] Perl General Category 'Letter'.\n"
208 " Categories [:Letter:] Posix General Category 'Letter'.\n"
210 " Combination Example\n"
211 " Union [[:Greek:] [:letter:]]\n"
212 " Intersection [[:Greek:] & [:letter:]]\n"
213 " Set Complement [[:Greek:] - [:letter:]]\n"
214 " Complement [^[:Greek:] [:letter:]]\n"
216 "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
219 " [:Punctuation:] Any-Remove\n"
220 " [:Cased-Letter:] Any-Upper\n"
221 " [:Control:] Any-Remove\n"
222 " [:Decimal_Number:] Any-Remove\n"
223 " [:Final_Punctuation:] Any-Remove\n"
224 " [:Georgian:] Any-Upper\n"
225 " [:Katakana:] Any-Remove\n"
226 " [:Arabic:] Any-Remove\n"
227 " [:Punctuation:] Remove\n"
228 " [[:Punctuation:]-[.,]] Remove\n"
229 " [:Line_Separator:] Any-Remove\n"
230 " [:Math_Symbol:] Any-Remove\n"
231 " Lower; [:^Letter:] Remove (word tokenization)\n"
232 " [:^Number:] Remove (numeric tokenization)\n"
/* NOTE(review): "Katagana" in the help text below looks like a misspelling
 * of "Katakana" (compare the [:Katakana:] example above) - confirm. */
233 " [:^Katagana:] Remove (remove everything except Katagana)\n"
234 " Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
235 " NFD; [:Nonspacing Mark:] Remove; NFC (removes accents from characters)\n"
236 " [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
237 " [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
239 "see http://userguide.icu-project.org/transforms/general\n"
240 " http://www.unicode.org/reports/tr44/\n"
244 fprintf(config.outfile, "\n\n");
/*
 * List the locales reported by uloc_countAvailable() / uloc_getAvailable().
 *
 * For every locale the English display keyword, language, script, country
 * and variant, plus the display name in English and in the locale itself,
 * are fetched and converted to UTF-8 with u_strToUTF8() into 128-byte
 * buffers. With p_config->xmloutput set, each locale becomes a <locale>
 * element whose attributes are the non-empty strings; otherwise the locale
 * id is printed on its own line. Output goes to the file-scope
 * config.outfile. An ICU failure status is reported on stderr at the end.
 */
249 static void print_icu_xml_locales(const struct config_t *p_config)
253 UErrorCode status = U_ZERO_ERROR;
256 int32_t keyword_len = 0;
257 char keyword_str[128];
258 int32_t keyword_str_len = 0;
261 int32_t language_len = 0;
263 int32_t lang_str_len = 0;
266 int32_t script_len = 0;
267 char script_str[128];
268 int32_t script_str_len = 0;
271 int32_t location_len = 0;
272 char location_str[128];
273 int32_t location_str_len = 0;
276 int32_t variant_len = 0;
277 char variant_str[128];
278 int32_t variant_str_len = 0;
281 int32_t name_len = 0;
283 int32_t name_str_len = 0;
286 int32_t localname_len = 0;
287 char localname_str[128];
288 int32_t localname_str_len = 0;
290 count = uloc_countAvailable() ;
292 if (p_config->xmloutput)
294 fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
295 count, uloc_getDefault(), ucol_countAvailable());
299 fprintf(config.outfile, "Available ICU locales: %d\n", count);
300 fprintf(config.outfile, "Default locale is: %s\n", uloc_getDefault());
/* Per-locale lookup of display strings, each converted to UTF-8. */
303 for (i = 0; i < count; i++)
307 = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
311 u_strToUTF8(keyword_str, 128, &keyword_str_len,
312 keyword, keyword_len,
317 = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
321 u_strToUTF8(lang_str, 128, &lang_str_len,
322 language, language_len,
327 = uloc_getDisplayScript(uloc_getAvailable(i), "en",
331 u_strToUTF8(script_str, 128, &script_str_len,
336 = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
340 u_strToUTF8(location_str, 128, &location_str_len,
341 location, location_len,
345 = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
349 u_strToUTF8(variant_str, 128, &variant_str_len,
350 variant, variant_len,
354 = uloc_getDisplayName(uloc_getAvailable(i), "en",
358 u_strToUTF8(name_str, 128, &name_str_len,
363 = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
367 u_strToUTF8(localname_str, 128, &localname_str_len,
368 localname, localname_len,
372 if (p_config->xmloutput)
374 fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
375 /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
376 /* if (strlen(keyword_str)) */
377 /* fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
378 /* if (ucol_getAvailable(i)) */
379 /* fprintf(config.outfile, " collation=\"1\""); */
/* NOTE(review): the strings below are written into XML attributes and
 * element text without XML escaping - confirm display names cannot
 * contain '"', '<' or '&'. */
380 if (strlen(lang_str))
381 fprintf(config.outfile, " language=\"%s\"", lang_str);
382 if (strlen(script_str))
383 fprintf(config.outfile, " script=\"%s\"", script_str);
384 if (strlen(location_str))
385 fprintf(config.outfile, " location=\"%s\"", location_str);
386 if (strlen(variant_str))
387 fprintf(config.outfile, " variant=\"%s\"", variant_str);
388 if (strlen(name_str))
389 fprintf(config.outfile, " name=\"%s\"", name_str);
390 if (strlen(localname_str))
391 fprintf(config.outfile, " localname=\"%s\"", localname_str);
392 fprintf(config.outfile, ">");
393 if (strlen(localname_str))
394 fprintf(config.outfile, "%s", localname_str);
395 fprintf(config.outfile, "</locale>\n");
/* NOTE(review): this branch appears unreachable - any non-zero xmloutput
 * (including 1) is already handled by the preceding
 * `if (p_config->xmloutput)`. Confirm the intended condition. */
397 else if (1 == p_config->xmloutput)
399 fprintf(config.outfile, "%s", uloc_getAvailable(i));
400 fprintf(config.outfile, " | ");
401 if (strlen(name_str))
402 fprintf(config.outfile, "%s", name_str);
403 fprintf(config.outfile, " | ");
404 if (strlen(localname_str))
405 fprintf(config.outfile, "%s", localname_str);
406 fprintf(config.outfile, "\n");
409 fprintf(config.outfile, "%s\n", uloc_getAvailable(i));
411 if (p_config->xmloutput)
412 fprintf(config.outfile, "</locales>\n");
414 fprintf(config.outfile, "\n");
/* Report an ICU failure status, if any, once the listing is complete. */
416 if (U_FAILURE(status))
418 fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
/*
 * Print the ICU information selected by the first character of config.print:
 * 'c' converters, 'l' locales, 't' transliterators; any other value prints
 * all three. In XML mode the output is wrapped in an XML declaration and a
 * closing </icu> tag.
 *
 * NOTE(review): apart from the xmloutput tests, this function reads the
 * file-scope `config` and passes &config on, rather than using p_config.
 */
424 static void print_info(const struct config_t *p_config)
426 if (p_config->xmloutput)
427 fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
430 if ('c' == config.print[0])
431 print_icu_converters(&config);
432 else if ('l' == config.print[0])
433 print_icu_xml_locales(&config);
434 else if ('t' == config.print[0])
435 print_icu_transliterators(&config);
/* Fallback: no recognised selector, so print every section. */
437 print_icu_converters(&config);
438 print_icu_xml_locales(&config);
439 print_icu_transliterators(&config);
442 if (p_config->xmloutput)
443 fprintf(config.outfile, "</icu>\n");
/*
 * Run the input text through the ICU chain described by the XML config file.
 *
 * Parses config.conffile with xmlParseFile(), builds config.chain with
 * icu_chain_xml_config(), then reads config.infile line by line with
 * fgets(). Each line is assigned to the chain and its tokens are written to
 * config.outfile, either as <token .../> elements (XML mode) or as quoted
 * text fields. The sort key is added when p_config->sortoutput is set. The
 * chain is destroyed at the end.
 */
450 static void process_text_file(const struct config_t *p_config)
455 xmlDoc *doc = xmlParseFile(config.conffile);
456 xmlNode *xml_node = xmlDocGetRootElement(doc);
458 long unsigned int token_count = 0;
459 long unsigned int line_count = 0;
461 UErrorCode status = U_ZERO_ERROR;
/* NOTE(review): this and the following diagnostics use printf (stdout),
 * whereas read_params() reports errors on stderr. */
465 printf("Could not parse XML config file '%s' \n",
470 config.chain = icu_chain_xml_config(xml_node, 1, &status);
472 if (!config.chain || !U_SUCCESS(status))
474 printf("Could not set up ICU chain from config file '%s' \n",
476 if (!U_SUCCESS(status))
477 printf("ICU Error: %d %s\n", status, u_errorName(status));
481 if (p_config->xmloutput)
482 fprintf(config.outfile,
483 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
487 /* read input lines for processing */
/* NOTE(review): fgets() already reserves room for the terminating NUL, so
 * the "-1" only leaves one byte of linebuf unused. */
488 while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
/* Per-line scratch buffers: sw receives the escaped sort key, cdata the
 * XML-escaped attribute values. */
490 WRBUF sw = wrbuf_alloc();
491 WRBUF cdata = wrbuf_alloc();
492 int success = icu_chain_assign_cstr(config.chain, line, &status);
/* Emit one record per token produced by the chain for this line. */
495 while (success && icu_chain_next_token(config.chain, &status))
497 if (U_FAILURE(status))
501 const char *sortkey = icu_chain_token_sortkey(config.chain);
503 wrbuf_puts_escaped(sw, sortkey);
505 if (p_config->xmloutput)
507 fprintf(config.outfile,
508 "<token id=\"%lu\" line=\"%lu\"",
509 token_count, line_count);
512 wrbuf_xmlputs(cdata, icu_chain_token_norm(config.chain));
513 fprintf(config.outfile, " norm=\"%s\"",
517 wrbuf_xmlputs(cdata, icu_chain_token_display(config.chain));
518 fprintf(config.outfile, " display=\"%s\"",
521 if (p_config->sortoutput)
524 wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
525 fprintf(config.outfile, " sortkey=\"%s\"",
528 fprintf(config.outfile, "/>\n");
532 fprintf(config.outfile, "%lu %lu '%s' '%s'",
535 icu_chain_token_norm(config.chain),
536 icu_chain_token_display(config.chain));
537 if (p_config->sortoutput)
539 fprintf(config.outfile, " '%s'", wrbuf_cstr(sw));
541 fprintf(config.outfile, "\n");
546 wrbuf_destroy(cdata);
549 if (p_config->xmloutput)
550 fprintf(config.outfile,
554 icu_chain_destroy(config.chain);
560 #endif /* YAZ_HAVE_ICU */
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line into the file-scope
 * `config`, then run process_text_file() when a config file was given. A
 * second branch is taken when a print selector was given; its body is not
 * visible here (presumably print_info() - confirm).
 * Without ICU support: print a notice explaining how to enable it.
 */
563 int main(int argc, char **argv)
568 read_params(argc, argv, &config);
/* NOTE(review): read_params() writes conffile[0] and print[0]; if these
 * members are arrays, the `config.conffile &&` and `config.print &&`
 * operands are always true and only the strlen() tests matter. */
570 if (config.conffile && strlen(config.conffile))
571 process_text_file(&config);
573 if (config.print && strlen(config.print))
576 #else /* YAZ_HAVE_ICU */
578 printf("ICU not available on your system.\n"
579 "Please install libicu-dev and icu-doc or similar, "
580 "re-configure and re-compile\n");
584 #endif /* YAZ_HAVE_ICU */
593 * c-file-style: "Stroustrup"
594 * indent-tabs-mode: nil
596 * vim: shiftwidth=4 tabstop=8 expandtab