fe86bedda61c7018783baae3ee7abe7d2fbab893
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2011 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <errno.h>
15
16 #include <yaz/options.h>
17
18 #if YAZ_HAVE_ICU
19
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h> 
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
25
26 #include <yaz/icu.h>
27 #include <yaz/wrbuf.h>
28
29 /* commando line and config parameters */
30 static struct config_t { 
31     char conffile[1024];
32     char print[1024];
33     int xmloutput;
34     int sortoutput;
35     yaz_icu_chain_t chain;
36     FILE * infile;
37     FILE * outfile;
38 } config;
39
40
41   
/*
 * Write the usage text (options, example invocations and a sample ICU
 * chain configuration) to stderr and terminate the process with exit
 * status 1.  This function does not return.
 *
 * p_config: not used by this function.
 */
void print_option_error(const struct config_t *p_config)
{
    (void) p_config; /* silence unused-parameter warnings */

    fprintf(stderr, "yaz-icu [options] [infile]\n"
            "Options:\n"
            "   -c file         XML configuration\n"
            "   -p a|c|l|t      Print ICU info \n"
            "   -s              Show sort normalization key\n"
            "   -x              XML output instead of text\n"
            "\n"
            "Examples:\n"
            "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
            "./yaz-icu -p c\n"
            "./yaz-icu -p l -x\n"
            "./yaz-icu -p t -x\n"
            "\n"
            "Example ICU chain XML configuration file:\n"
            "<icu_chain locale=\"en\">\n"
            "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
            "  <tokenize rule=\"l\"/>\n"
            "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
            "  <casemap rule=\"l\"/>\n"
            "</icu_chain>\n"
          );
    exit(1);
}
67
68 void read_params(int argc, char **argv, struct config_t *p_config)
69 {    
70     char *arg;
71     int ret;
72     
73     /* set default parameters */
74     p_config->conffile[0] = 0;
75     p_config->print[0] = 0;
76     p_config->xmloutput = 0;
77     p_config->sortoutput = 0;
78     p_config->chain = 0;
79     p_config->infile = 0;
80     p_config->outfile = stdout;
81     
82     /* set up command line parameters */
83     
84     while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
85     {
86         switch (ret)
87         {
88         case 'c':
89             strcpy(p_config->conffile, arg);
90             break;
91         case 'p':
92             strcpy(p_config->print, arg);
93             break;
94         case 's':
95             p_config->sortoutput = 1;
96             break;
97         case 'x':
98             p_config->xmloutput = 1;
99             break;
100         case 0:
101             if (p_config->infile)
102             {
103                 fprintf(stderr, "yaz-icu: only one input file may be given\n");
104                 print_option_error(p_config);
105             }
106             p_config->infile = fopen(arg, "r");
107             if (!p_config->infile)
108             {
109                 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
110                         arg, strerror(errno));
111                 exit(1);
112             }
113             break;
114         default:
115             fprintf(stderr, "yaz_icu: invalid option: %s\n", arg);
116             print_option_error(p_config);
117         }
118     }
119
120     if (p_config->infile == 0)
121         p_config->infile = stdin;
122
123     if (!strlen(p_config->conffile) && !strlen(p_config->print))
124         print_option_error(p_config);
125 }
126
127
128 /*     UConverter *conv; */
129 /*     conv = ucnv_open("utf-8", &status); */
130 /*     assert(U_SUCCESS(status)); */
131
132 /*     *ustr16_len  */
133 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
134 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
135 /*                       &status); */
136   
137
138
139 /*      ucnv_fromUChars(conv, */
140 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
141 /*                      ustr16, *ustr16_len, */
142 /*                      &status); */
143 /*      ucnv_close(conv); */
144
145
146 static void print_icu_converters(const struct config_t *p_config)
147 {
148     int32_t count;
149     int32_t i;
150
151     count = ucnv_countAvailable();
152     if (p_config->xmloutput)
153         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
154                 count, ucnv_getDefaultName());
155     else
156     {    
157         fprintf(config.outfile, "Available ICU converters: %d\n", count);
158         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
159                 ucnv_getDefaultName());
160     }
161     
162     for (i = 0; i < count; i++)
163     {
164         if (p_config->xmloutput)
165             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
166                     ucnv_getAvailableName(i));
167         else     
168             fprintf(config.outfile, "%s\n", ucnv_getAvailableName(i));
169     }
170     
171     if (p_config->xmloutput)
172         fprintf(config.outfile, "</converters>\n");
173     else
174         fprintf(config.outfile, "\n");
175 }
176
177 static void print_icu_transliterators(const struct config_t *p_config)
178 {
179     UErrorCode status;
180     UEnumeration *en = utrans_openIDs(&status);
181     int32_t count = uenum_count(en, &status);
182     const char *name;
183     int32_t length;
184
185     if (p_config->xmloutput)
186         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
187     else 
188         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
189
190     while ((name = uenum_next(en, &length, &status)))
191     {
192         if (p_config->xmloutput)
193             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", name);
194         else
195             fprintf(config.outfile, "%s\n", name);
196     }
197     uenum_close(en);
198     if (p_config->xmloutput)
199         fprintf(config.outfile, "</transliterators>\n");
200     else
201     {
202         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
203                 "   Pattern         Description\n"
204                 "   Ranges          [a-z]       The lower case letters a through z\n"
205                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
206                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
207                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
208                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
209                 "\n"
210                 "   Combination     Example\n"
211                 "   Union           [[:Greek:] [:letter:]]\n"
212                 "   Intersection    [[:Greek:] & [:letter:]]\n"
213                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
214                 "   Complement      [^[:Greek:] [:letter:]]\n"
215                 "\n"
216              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
217                 "\n"
218                 "Examples:\n"
219                 "   [:Punctuation:] Any-Remove\n"
220                 "   [:Cased-Letter:] Any-Upper\n"
221                 "   [:Control:] Any-Remove\n"
222                 "   [:Decimal_Number:] Any-Remove\n"
223                 "   [:Final_Punctuation:] Any-Remove\n"
224                 "   [:Georgian:] Any-Upper\n"
225                 "   [:Katakana:] Any-Remove\n"
226                 "   [:Arabic:] Any-Remove\n"
227                 "   [:Punctuation:] Remove\n"
228                 "   [[:Punctuation:]-[.,]] Remove\n"
229                 "   [:Line_Separator:] Any-Remove\n"
230                 "   [:Math_Symbol:] Any-Remove\n"
231                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
232                 "   [:^Number:] Remove (numeric tokenization)\n"
233                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
234                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
235                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
236                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
237                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
238                 "\n"
239                 "see http://userguide.icu-project.org/transforms/general\n"
240                 "    http://www.unicode.org/reports/tr44/\n"
241             );
242         
243         
244         fprintf(config.outfile, "\n\n");
245         
246     }
247 }
248
249 static void print_icu_xml_locales(const struct config_t *p_config)
250 {
251     int32_t count;
252     int32_t i;
253     UErrorCode status = U_ZERO_ERROR;
254     
255     UChar keyword[64];
256     int32_t keyword_len = 0;
257     char keyword_str[128];
258     int32_t keyword_str_len = 0;
259
260     UChar language[64];
261     int32_t language_len = 0;
262     char lang_str[128];
263     int32_t lang_str_len = 0;
264
265     UChar script[64];
266     int32_t script_len = 0;
267     char script_str[128];
268     int32_t script_str_len = 0;
269
270     UChar location[64];
271     int32_t location_len = 0;
272     char location_str[128];
273     int32_t location_str_len = 0;
274
275     UChar variant[64];
276     int32_t variant_len = 0;
277     char variant_str[128];
278     int32_t variant_str_len = 0;
279
280     UChar name[64];
281     int32_t name_len = 0;
282     char name_str[128];
283     int32_t name_str_len = 0;
284
285     UChar localname[64];
286     int32_t localname_len = 0;
287     char localname_str[128];
288     int32_t localname_str_len = 0;
289
290     count = uloc_countAvailable() ;
291
292     if (p_config->xmloutput)
293     {
294         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
295                 count, uloc_getDefault(), ucol_countAvailable());
296     }
297     else
298     {
299         fprintf(config.outfile, "Available ICU locales: %d\n", count);
300         fprintf(config.outfile, "Default locale is: %s\n",  uloc_getDefault());
301     }
302   
303     for (i = 0; i < count; i++) 
304     {
305
306         keyword_len 
307             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
308                                      keyword, 64, 
309                                      &status);
310
311         u_strToUTF8(keyword_str, 128, &keyword_str_len,
312                     keyword, keyword_len,
313                     &status);
314     
315     
316         language_len 
317             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
318                                       language, 64, 
319                                       &status);
320
321         u_strToUTF8(lang_str, 128, &lang_str_len,
322                     language, language_len,
323                     &status);
324
325
326         script_len 
327             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
328                                     script, 64, 
329                                     &status);
330
331         u_strToUTF8(script_str, 128, &script_str_len,
332                     script, script_len,
333                     &status);
334
335         location_len 
336             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
337                                      location, 64, 
338                                      &status);
339
340         u_strToUTF8(location_str, 128, &location_str_len,
341                     location, location_len,
342                     &status);
343
344         variant_len 
345             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
346                                      variant, 64, 
347                                      &status);
348
349         u_strToUTF8(variant_str, 128, &variant_str_len,
350                     variant, variant_len,
351                     &status);
352
353         name_len 
354             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
355                                   name, 64, 
356                                   &status);
357
358         u_strToUTF8(name_str, 128, &name_str_len,
359                     name, name_len,
360                     &status);
361
362         localname_len 
363             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
364                                   localname, 64, 
365                                   &status);
366
367         u_strToUTF8(localname_str, 128, &localname_str_len,
368                     localname, localname_len,
369                     &status);
370
371
372         if (p_config->xmloutput)
373         {
374             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
375             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
376             /* if (strlen(keyword_str)) */
377             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
378             /* if (ucol_getAvailable(i)) */
379             /*   fprintf(config.outfile, " collation=\"1\""); */
380             if (strlen(lang_str))
381                 fprintf(config.outfile, " language=\"%s\"", lang_str);
382             if (strlen(script_str))
383                 fprintf(config.outfile, " script=\"%s\"", script_str);
384             if (strlen(location_str))
385                 fprintf(config.outfile, " location=\"%s\"", location_str);
386             if (strlen(variant_str))
387                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
388             if (strlen(name_str))
389                 fprintf(config.outfile, " name=\"%s\"", name_str);
390             if (strlen(localname_str))
391                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
392             fprintf(config.outfile, ">");
393             if (strlen(localname_str))
394                 fprintf(config.outfile, "%s", localname_str);
395             fprintf(config.outfile, "</locale>\n"); 
396         }
397         else if (1 == p_config->xmloutput)
398         {
399             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
400             fprintf(config.outfile, " | ");
401             if (strlen(name_str))
402                 fprintf(config.outfile, "%s", name_str);
403             fprintf(config.outfile, " | ");
404             if (strlen(localname_str))
405                 fprintf(config.outfile, "%s", localname_str);
406             fprintf(config.outfile, "\n");
407         }
408         else
409             fprintf(config.outfile, "%s\n", uloc_getAvailable(i));
410     }
411     if (p_config->xmloutput)
412         fprintf(config.outfile, "</locales>\n");
413     else
414         fprintf(config.outfile, "\n");
415
416     if (U_FAILURE(status))
417     {
418         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
419         exit(2);
420     }
421 }
422
423
424 static void print_info(const struct config_t *p_config)
425 {
426     if (p_config->xmloutput)
427         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
428                 "<icu>\n");
429
430     if ('c' == config.print[0])
431         print_icu_converters(&config);
432     else if ('l' == config.print[0])
433         print_icu_xml_locales(&config);
434     else if ('t' == config.print[0])
435         print_icu_transliterators(&config);
436     else {
437         print_icu_converters(&config);
438         print_icu_xml_locales(&config);
439         print_icu_transliterators(&config);
440     }
441
442     if (p_config->xmloutput)
443         fprintf(config.outfile, "</icu>\n");
444
445     exit(0);
446 }
447
448
449
450 static void process_text_file(const struct config_t *p_config)
451 {
452     char *line = 0;
453     char linebuf[1024];
454  
455     xmlDoc *doc = xmlParseFile(config.conffile);  
456     xmlNode *xml_node = xmlDocGetRootElement(doc);
457
458     long unsigned int token_count = 0;    
459     long unsigned int line_count = 0;    
460     
461     UErrorCode status = U_ZERO_ERROR;
462     
463     if (!xml_node)
464     {   
465         printf("Could not parse XML config file '%s' \n",
466                 config.conffile);
467         exit(1);
468     }
469
470     config.chain = icu_chain_xml_config(xml_node, 1, &status);
471
472     if (!config.chain || !U_SUCCESS(status))
473     {   
474         printf("Could not set up ICU chain from config file '%s' \n",
475                 config.conffile);
476         if (!U_SUCCESS(status))
477             printf("ICU Error: %d %s\n", status, u_errorName(status));
478         exit(1);
479     }
480
481     if (p_config->xmloutput)
482         fprintf(config.outfile,
483                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
484                 "<icu>\n"
485                 "<tokens>\n");
486     
487     /* read input lines for processing */
488     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
489     {
490         WRBUF sw = wrbuf_alloc();
491         WRBUF cdata = wrbuf_alloc();
492         int success = icu_chain_assign_cstr(config.chain, line, &status);
493         line_count++;
494
495         while (success && icu_chain_next_token(config.chain, &status))
496         {
497             if (U_FAILURE(status))
498                 success = 0;
499             else
500             {
501                 const char *sortkey = icu_chain_token_sortkey(config.chain);
502                 wrbuf_rewind(sw);
503                 wrbuf_puts_escaped(sw, sortkey);
504                 token_count++;
505                 if (p_config->xmloutput)                    
506                 {
507                     fprintf(config.outfile, 
508                             "<token id=\"%lu\" line=\"%lu\"",
509                             token_count, line_count);
510
511                     wrbuf_rewind(cdata);
512                     wrbuf_xmlputs(cdata, icu_chain_token_norm(config.chain));
513                     fprintf(config.outfile, " norm=\"%s\"",
514                             wrbuf_cstr(cdata));
515
516                     wrbuf_rewind(cdata);
517                     wrbuf_xmlputs(cdata, icu_chain_token_display(config.chain));
518                     fprintf(config.outfile, " display=\"%s\"",
519                             wrbuf_cstr(cdata));
520                     
521                     if (p_config->sortoutput)
522                     {
523                         wrbuf_rewind(cdata);
524                         wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
525                         fprintf(config.outfile, " sortkey=\"%s\"",
526                                 wrbuf_cstr(cdata));
527                     }
528                     fprintf(config.outfile, "/>\n");
529                 }
530                 else
531                 {
532                     fprintf(config.outfile, "%lu %lu '%s' '%s'",
533                             token_count,
534                             line_count,
535                             icu_chain_token_norm(config.chain),
536                             icu_chain_token_display(config.chain));
537                     if (p_config->sortoutput)
538                     {
539                         fprintf(config.outfile, " '%s'", wrbuf_cstr(sw));
540                     }
541                     fprintf(config.outfile, "\n");
542                 }
543             }
544         }
545         wrbuf_destroy(sw);
546         wrbuf_destroy(cdata);
547     }
548
549     if (p_config->xmloutput)
550         fprintf(config.outfile,
551                 "</tokens>\n"
552                 "</icu>\n");
553     
554     icu_chain_destroy(config.chain);
555     xmlFreeDoc(doc);
556     if (line)
557         free(line);
558 }
559
560 #endif /* YAZ_HAVE_ICU */
561
562
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line, run the text
 * through the configured ICU chain when -c was given, then print ICU
 * information when -p was given.  print_info() ends the process itself
 * with exit status 0; otherwise main returns 0.
 *
 * Without ICU support: print a hint on stdout and exit with status 3.
 */
int main(int argc, char **argv)
{

#if YAZ_HAVE_ICU

    read_params(argc, argv, &config);

    /* conffile and print are arrays, so only their content can be empty */
    if (config.conffile[0])
        process_text_file(&config);

    /* the input is not needed any more; release it unless it is stdin */
    if (config.infile && config.infile != stdin)
    {
        fclose(config.infile);
        config.infile = 0;
    }

    if (config.print[0])
        print_info(&config);

#else /* YAZ_HAVE_ICU */

    (void) argc;
    (void) argv;

    printf("ICU not available on your system.\n"
           "Please install libicu-dev and icu-doc or similar, "
           "re-configure and re-compile\n");


    exit(3);
#endif /* YAZ_HAVE_ICU */

    return 0;
}
588
589
590 /*
591  * Local variables:
592  * c-basic-offset: 4
593  * c-file-style: "Stroustrup"
594  * indent-tabs-mode: nil
595  * End:
596  * vim: shiftwidth=4 tabstop=8 expandtab
597  */
598