63a38db65f8a9fff3761d4891c1cb6882a9480b1
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2010 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14
15 #include <yaz/options.h>
16
17 #if YAZ_HAVE_ICU
18
19 #include <unicode/ucnv.h>
20 #include <unicode/ustring.h>
21 #include <unicode/ucol.h> 
22 #include <unicode/ubrk.h>
23 #include <unicode/utrans.h>
24
25 #include <yaz/icu.h>
26 #include <yaz/wrbuf.h>
27
28 /* commando line and config parameters */
29 static struct config_t { 
30     char conffile[1024];
31     char print[1024];
32     int xmloutput;
33     int sortoutput;
34     yaz_icu_chain_t chain;
35     FILE * infile;
36     FILE * outfile;
37 } config;
38
39
40   
/*
 * Write the usage text (options, examples and a sample ICU chain
 * configuration) to stderr and terminate the process with exit status 1.
 * This function does not return.  p_config is not read.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "yaz-icu\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-s] Show sort normalization key\n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain locale=\"en\">\n"
        "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <casemap rule=\"l\"/>\n"
        "</icu_chain>\n";

    (void) p_config;

    fputs("Calling error, valid options are :\n", stderr);
    fputs(usage_text, stderr);
    exit(1);
}
66
67 void read_params(int argc, char **argv, struct config_t *p_config)
68 {    
69     char *arg;
70     int ret;
71     
72     /* set default parameters */
73     p_config->conffile[0] = 0;
74     p_config->print[0] = 0;
75     p_config->xmloutput = 0;
76     p_config->sortoutput = 0;
77     p_config->chain = 0;
78     p_config->infile = stdin;
79     p_config->outfile = stdout;
80     
81     /* set up command line parameters */
82     
83     while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
84     {
85         switch (ret)
86         {
87         case 'c':
88             strcpy(p_config->conffile, arg);
89             break;
90         case 'p':
91             strcpy(p_config->print, arg);
92             break;
93         case 's':
94             p_config->sortoutput = 1;
95             break;
96         case 'x':
97             p_config->xmloutput = 1;
98             break;
99         default:
100             printf("Got %d\n", ret);
101             print_option_error(p_config);
102         }
103     }
104     
105     if ((!strlen(p_config->conffile)
106          && !strlen(p_config->print))
107         || !config.infile
108         || !config.outfile)
109         
110         print_option_error(p_config);
111 }
112
113
114 /*     UConverter *conv; */
115 /*     conv = ucnv_open("utf-8", &status); */
116 /*     assert(U_SUCCESS(status)); */
117
118 /*     *ustr16_len  */
119 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
120 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
121 /*                       &status); */
122   
123
124
125 /*      ucnv_fromUChars(conv, */
126 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
127 /*                      ustr16, *ustr16_len, */
128 /*                      &status); */
129 /*      ucnv_close(conv); */
130
131
132 static void print_icu_converters(const struct config_t *p_config)
133 {
134     int32_t count;
135     int32_t i;
136
137     count = ucnv_countAvailable();
138     if (p_config->xmloutput)
139         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
140                 count, ucnv_getDefaultName());
141     else {    
142         fprintf(config.outfile, "Available ICU converters: %d\n", count);
143         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
144                 ucnv_getDefaultName());
145     }
146     
147     for(i=0;i<count;i++)
148     {
149         if (p_config->xmloutput)
150             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
151                     ucnv_getAvailableName(i));
152         else     
153             fprintf(config.outfile, "%s\n", ucnv_getAvailableName(i));
154     }
155     
156     if (p_config->xmloutput)
157         fprintf(config.outfile, "</converters>\n");
158     else
159         fprintf(config.outfile, "\n");
160 }
161
162 static void print_icu_transliterators(const struct config_t *p_config)
163 {
164     UErrorCode status;
165     UEnumeration *en = utrans_openIDs(&status);
166     int32_t count = uenum_count(en, &status);
167     const char *name;
168     int32_t length;
169
170     if (p_config->xmloutput)
171         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
172     else 
173         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
174
175     while ((name = uenum_next(en, &length, &status)))
176     {
177         if (p_config->xmloutput)
178             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", name);
179         else
180             fprintf(config.outfile, "%s\n", name);
181     }
182     uenum_close(en);
183     if (p_config->xmloutput)
184     {
185         fprintf(config.outfile, "</transliterators>\n");
186     }
187     else
188     {
189         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
190                 "   Pattern         Description\n"
191                 "   Ranges          [a-z]       The lower case letters a through z\n"
192                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
193                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
194                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
195                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
196                 "\n"
197                 "   Combination     Example\n"
198                 "   Union           [[:Greek:] [:letter:]]\n"
199                 "   Intersection    [[:Greek:] & [:letter:]]\n"
200                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
201                 "   Complement      [^[:Greek:] [:letter:]]\n"
202                 "\n"
203              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
204                 "\n"
205                 "Examples:\n"
206                 "   [:Punctuation:] Any-Remove\n"
207                 "   [:Cased-Letter:] Any-Upper\n"
208                 "   [:Control:] Any-Remove\n"
209                 "   [:Decimal_Number:] Any-Remove\n"
210                 "   [:Final_Punctuation:] Any-Remove\n"
211                 "   [:Georgian:] Any-Upper\n"
212                 "   [:Katakana:] Any-Remove\n"
213                 "   [:Arabic:] Any-Remove\n"
214                 "   [:Punctuation:] Remove\n"
215                 "   [[:Punctuation:]-[.,]] Remove\n"
216                 "   [:Line_Separator:] Any-Remove\n"
217                 "   [:Math_Symbol:] Any-Remove\n"
218                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
219                 "   [:^Number:] Remove (numeric tokenization)\n"
220                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
221                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
222                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
223                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
224                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
225                 "\n"
226                 "see http://userguide.icu-project.org/transforms/general\n"
227                 "    http://www.unicode.org/reports/tr44/\n"
228             );
229         
230         
231         fprintf(config.outfile, "\n\n");
232         
233     }
234 }
235
236 static void print_icu_xml_locales(const struct config_t *p_config)
237 {
238     int32_t count;
239     int32_t i;
240     UErrorCode status = U_ZERO_ERROR;
241     
242     UChar keyword[64];
243     int32_t keyword_len = 0;
244     char keyword_str[128];
245     int32_t keyword_str_len = 0;
246
247     UChar language[64];
248     int32_t language_len = 0;
249     char lang_str[128];
250     int32_t lang_str_len = 0;
251
252     UChar script[64];
253     int32_t script_len = 0;
254     char script_str[128];
255     int32_t script_str_len = 0;
256
257     UChar location[64];
258     int32_t location_len = 0;
259     char location_str[128];
260     int32_t location_str_len = 0;
261
262     UChar variant[64];
263     int32_t variant_len = 0;
264     char variant_str[128];
265     int32_t variant_str_len = 0;
266
267     UChar name[64];
268     int32_t name_len = 0;
269     char name_str[128];
270     int32_t name_str_len = 0;
271
272     UChar localname[64];
273     int32_t localname_len = 0;
274     char localname_str[128];
275     int32_t localname_str_len = 0;
276
277     count = uloc_countAvailable() ;
278
279     if (p_config->xmloutput)
280     {
281         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
282                 count, uloc_getDefault(), ucol_countAvailable());
283     }
284     else
285     {
286         fprintf(config.outfile, "Available ICU locales: %d\n", count);
287         fprintf(config.outfile, "Default locale is: %s\n",  uloc_getDefault());
288     }
289   
290     for(i=0;i<count;i++) 
291     {
292
293         keyword_len 
294             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
295                                      keyword, 64, 
296                                      &status);
297
298         u_strToUTF8(keyword_str, 128, &keyword_str_len,
299                     keyword, keyword_len,
300                     &status);
301     
302     
303         language_len 
304             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
305                                       language, 64, 
306                                       &status);
307
308         u_strToUTF8(lang_str, 128, &lang_str_len,
309                     language, language_len,
310                     &status);
311
312
313         script_len 
314             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
315                                     script, 64, 
316                                     &status);
317
318         u_strToUTF8(script_str, 128, &script_str_len,
319                     script, script_len,
320                     &status);
321
322         location_len 
323             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
324                                      location, 64, 
325                                      &status);
326
327         u_strToUTF8(location_str, 128, &location_str_len,
328                     location, location_len,
329                     &status);
330
331         variant_len 
332             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
333                                      variant, 64, 
334                                      &status);
335
336         u_strToUTF8(variant_str, 128, &variant_str_len,
337                     variant, variant_len,
338                     &status);
339
340         name_len 
341             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
342                                   name, 64, 
343                                   &status);
344
345         u_strToUTF8(name_str, 128, &name_str_len,
346                     name, name_len,
347                     &status);
348
349         localname_len 
350             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
351                                   localname, 64, 
352                                   &status);
353
354         u_strToUTF8(localname_str, 128, &localname_str_len,
355                     localname, localname_len,
356                     &status);
357
358
359         if (p_config->xmloutput)
360         {
361             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
362             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
363             /* if (strlen(keyword_str)) */
364             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
365             /* if (ucol_getAvailable(i)) */
366             /*   fprintf(config.outfile, " collation=\"1\""); */
367             if (strlen(lang_str))
368                 fprintf(config.outfile, " language=\"%s\"", lang_str);
369             if (strlen(script_str))
370                 fprintf(config.outfile, " script=\"%s\"", script_str);
371             if (strlen(location_str))
372                 fprintf(config.outfile, " location=\"%s\"", location_str);
373             if (strlen(variant_str))
374                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
375             if (strlen(name_str))
376                 fprintf(config.outfile, " name=\"%s\"", name_str);
377             if (strlen(localname_str))
378                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
379             fprintf(config.outfile, ">");
380             if (strlen(localname_str))
381                 fprintf(config.outfile, "%s", localname_str);
382             fprintf(config.outfile, "</locale>\n"); 
383         }
384         else if (1 == p_config->xmloutput)
385         {
386             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
387             fprintf(config.outfile, " | ");
388             if (strlen(name_str))
389                 fprintf(config.outfile, "%s", name_str);
390             fprintf(config.outfile, " | ");
391             if (strlen(localname_str))
392                 fprintf(config.outfile, "%s", localname_str);
393             fprintf(config.outfile, "\n");
394         }
395         else
396             fprintf(config.outfile, "%s\n", uloc_getAvailable(i));
397     }
398     if (p_config->xmloutput)
399         fprintf(config.outfile, "</locales>\n");
400     else
401         fprintf(config.outfile, "\n");
402
403     if(U_FAILURE(status))
404     {
405         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
406         exit(2);
407     }
408 }
409
410
411 static void print_info(const struct config_t *p_config)
412 {
413     if (p_config->xmloutput)
414         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
415                 "<icu>\n");
416
417     if ('c' == config.print[0])
418         print_icu_converters(&config);
419     else if ('l' == config.print[0])
420         print_icu_xml_locales(&config);
421     else if ('t' == config.print[0])
422         print_icu_transliterators(&config);
423     else {
424         print_icu_converters(&config);
425         print_icu_xml_locales(&config);
426         print_icu_transliterators(&config);
427     }
428
429     if (p_config->xmloutput)
430         fprintf(config.outfile, "</icu>\n");
431
432     exit(0);
433 }
434
435
436
437 static void process_text_file(const struct config_t *p_config)
438 {
439     char *line = 0;
440     char linebuf[1024];
441  
442     xmlDoc *doc = xmlParseFile(config.conffile);  
443     xmlNode *xml_node = xmlDocGetRootElement(doc);
444
445     long unsigned int token_count = 0;    
446     long unsigned int line_count = 0;    
447     
448     UErrorCode status = U_ZERO_ERROR;
449     
450     if (!xml_node)
451     {   
452         printf("Could not parse XML config file '%s' \n",
453                 config.conffile);
454         exit(1);
455     }
456
457     config.chain = icu_chain_xml_config(xml_node, 1, &status);
458
459     if (!config.chain || !U_SUCCESS(status))
460     {   
461         printf("Could not set up ICU chain from config file '%s' \n",
462                 config.conffile);
463         if (!U_SUCCESS(status))
464             printf("ICU Error: %d %s\n", status, u_errorName(status));
465         exit(1);
466     }
467
468     if (p_config->xmloutput)
469         fprintf(config.outfile,
470                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
471                 "<icu>\n"
472                 "<tokens>\n");
473     
474     /* read input lines for processing */
475     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
476     {
477         WRBUF sw = wrbuf_alloc();
478         WRBUF cdata = wrbuf_alloc();
479         int success = icu_chain_assign_cstr(config.chain, line, &status);
480         line_count++;
481
482         while (success && icu_chain_next_token(config.chain, &status))
483         {
484             if (U_FAILURE(status))
485                 success = 0;
486             else
487             {
488                 const char *sortkey = icu_chain_token_sortkey(config.chain);
489                 wrbuf_rewind(sw);
490                 wrbuf_puts_escaped(sw, sortkey);
491                 token_count++;
492                 if (p_config->xmloutput)                    
493                 {
494                     fprintf(config.outfile, 
495                             "<token id=\"%lu\" line=\"%lu\"",
496                             token_count, line_count);
497
498                     wrbuf_rewind(cdata);
499                     wrbuf_xmlputs(cdata, icu_chain_token_norm(config.chain));
500                     fprintf(config.outfile, " norm=\"%s\"",
501                             wrbuf_cstr(cdata));
502
503                     wrbuf_rewind(cdata);
504                     wrbuf_xmlputs(cdata, icu_chain_token_display(config.chain));
505                     fprintf(config.outfile, " display=\"%s\"",
506                             wrbuf_cstr(cdata));
507                     
508                     if (p_config->sortoutput)
509                     {
510                         wrbuf_rewind(cdata);
511                         wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
512                         fprintf(config.outfile, " sortkey=\"%s\"",
513                                 wrbuf_cstr(cdata));
514                     }
515                     fprintf(config.outfile, "/>\n");
516                 }
517                 else
518                 {
519                     fprintf(config.outfile, "%lu %lu '%s' '%s'",
520                             token_count,
521                             line_count,
522                             icu_chain_token_norm(config.chain),
523                             icu_chain_token_display(config.chain));
524                     if (p_config->sortoutput)
525                     {
526                         fprintf(config.outfile, " '%s'", wrbuf_cstr(sw));
527                     }
528                     fprintf(config.outfile, "\n");
529                 }
530             }
531         }
532         wrbuf_destroy(sw);
533         wrbuf_destroy(cdata);
534     }
535
536     if (p_config->xmloutput)
537         fprintf(config.outfile,
538                 "</tokens>\n"
539                 "</icu>\n");
540     
541     icu_chain_destroy(config.chain);
542     xmlFreeDoc(doc);
543     if (line)
544         free(line);
545 }
546
547 #endif /* YAZ_HAVE_ICU */
548
549
/*
 * Program entry point.
 *
 * With ICU support compiled in, the command line is parsed into the global
 * configuration.  If a configuration file was named (-c) the input is
 * tokenized with it; if ICU information was requested (-p) it is printed,
 * and print_info() then ends the process itself.  Otherwise the function
 * returns 0.
 *
 * Without ICU support a notice is printed and the process exits with
 * status 3.
 */
int main(int argc, char **argv)
{

#if YAZ_HAVE_ICU

    read_params(argc, argv, &config);

    /* conffile and print are arrays, so only their content can tell
       whether the option was given */
    if (config.conffile[0] != '\0')
        process_text_file(&config);

    if (config.print[0] != '\0')
        print_info(&config);

#else /* YAZ_HAVE_ICU */

    printf("ICU not available on your system.\n"
           "Please install libicu-dev and icu-doc or similar, "
           "re-configure and re-compile\n");


    exit(3);
#endif /* YAZ_HAVE_ICU */

    return 0;
}
575
576
577 /*
578  * Local variables:
579  * c-basic-offset: 4
580  * c-file-style: "Stroustrup"
581  * indent-tabs-mode: nil
582  * End:
583  * vim: shiftwidth=4 tabstop=8 expandtab
584  */
585