yaz-icu.c refactor and cleanup
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2011 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <errno.h>
15
16 #include <yaz/options.h>
17
18 #if YAZ_HAVE_ICU
19
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h> 
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
25 #include <unicode/uclean.h>
26
27 #include <yaz/icu.h>
28 #include <yaz/wrbuf.h>
29
30 /* commando line and config parameters */
31 struct config_t { 
32     char conffile[1024];
33     char print[1024];
34     int xmloutput;
35     int sortoutput;
36     yaz_icu_chain_t chain;
37     FILE * infile;
38     FILE * outfile;
39 };
40   
/* Write the usage text (options, example invocations and a sample ICU
 * chain configuration) to stderr and terminate with exit status 1.
 * The p_config argument is accepted but not used. */
void print_option_error(const struct config_t *p_config)
{
    static const char usage[] =
        "yaz-icu [options] [infile]\n"
        "Options:\n"
        "   -c file         XML configuration\n"
        "   -p a|c|l|t      Print ICU info \n"
        "   -s              Show sort normalization key\n"
        "   -x              XML output instread of text\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain locale=\"en\">\n"
        "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <casemap rule=\"l\"/>\n"
        "</icu_chain>\n";

    (void) p_config;
    fputs(usage, stderr);
    exit(1);
}
66
67 void read_params(int argc, char **argv, struct config_t *p_config)
68 {    
69     char *arg;
70     int ret;
71     
72     /* set default parameters */
73     p_config->conffile[0] = 0;
74     p_config->print[0] = 0;
75     p_config->xmloutput = 0;
76     p_config->sortoutput = 0;
77     p_config->chain = 0;
78     p_config->infile = 0;
79     p_config->outfile = stdout;
80     
81     /* set up command line parameters */
82     
83     while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
84     {
85         switch (ret)
86         {
87         case 'c':
88             strcpy(p_config->conffile, arg);
89             break;
90         case 'p':
91             strcpy(p_config->print, arg);
92             break;
93         case 's':
94             p_config->sortoutput = 1;
95             break;
96         case 'x':
97             p_config->xmloutput = 1;
98             break;
99         case 0:
100             if (p_config->infile)
101             {
102                 fprintf(stderr, "yaz-icu: only one input file may be given\n");
103                 print_option_error(p_config);
104             }
105             p_config->infile = fopen(arg, "r");
106             if (!p_config->infile)
107             {
108                 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
109                         arg, strerror(errno));
110                 exit(1);
111             }
112             break;
113         default:
114             fprintf(stderr, "yaz_icu: invalid option: %s\n", arg);
115             print_option_error(p_config);
116         }
117     }
118
119     if (p_config->infile == 0)
120         p_config->infile = stdin;
121
122     if (!strlen(p_config->conffile) && !strlen(p_config->print))
123         print_option_error(p_config);
124 }
125
126
127 /*     UConverter *conv; */
128 /*     conv = ucnv_open("utf-8", &status); */
129 /*     assert(U_SUCCESS(status)); */
130
131 /*     *ustr16_len  */
132 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
133 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
134 /*                       &status); */
135   
136
137
138 /*      ucnv_fromUChars(conv, */
139 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
140 /*                      ustr16, *ustr16_len, */
141 /*                      &status); */
142 /*      ucnv_close(conv); */
143
144
145 static void print_icu_converters(const struct config_t *p_config)
146 {
147     int32_t count;
148     int32_t i;
149
150     count = ucnv_countAvailable();
151     if (p_config->xmloutput)
152         fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
153                 count, ucnv_getDefaultName());
154     else
155     {    
156         fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
157         fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n", 
158                 ucnv_getDefaultName());
159     }
160     
161     for (i = 0; i < count; i++)
162     {
163         if (p_config->xmloutput)
164             fprintf(p_config->outfile, "<converter id=\"%s\"/>\n", 
165                     ucnv_getAvailableName(i));
166         else     
167             fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
168     }
169     
170     if (p_config->xmloutput)
171         fprintf(p_config->outfile, "</converters>\n");
172     else
173         fprintf(p_config->outfile, "\n");
174 }
175
176 static void print_icu_transliterators(const struct config_t *p_config)
177 {
178     UErrorCode status;
179     UEnumeration *en = utrans_openIDs(&status);
180     int32_t count = uenum_count(en, &status);
181     const char *name;
182     int32_t length;
183
184     if (p_config->xmloutput)
185         fprintf(p_config->outfile, "<transliterators count=\"%d\">\n",  count);
186     else 
187         fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
188
189     while ((name = uenum_next(en, &length, &status)))
190     {
191         if (p_config->xmloutput)
192             fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name);
193         else
194             fprintf(p_config->outfile, "%s\n", name);
195     }
196     uenum_close(en);
197     if (p_config->xmloutput)
198         fprintf(p_config->outfile, "</transliterators>\n");
199     else
200     {
201         fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n"
202                 "   Pattern         Description\n"
203                 "   Ranges          [a-z]       The lower case letters a through z\n"
204                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
205                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
206                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
207                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
208                 "\n"
209                 "   Combination     Example\n"
210                 "   Union           [[:Greek:] [:letter:]]\n"
211                 "   Intersection    [[:Greek:] & [:letter:]]\n"
212                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
213                 "   Complement      [^[:Greek:] [:letter:]]\n"
214                 "\n"
215              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
216                 "\n"
217                 "Examples:\n"
218                 "   [:Punctuation:] Any-Remove\n"
219                 "   [:Cased-Letter:] Any-Upper\n"
220                 "   [:Control:] Any-Remove\n"
221                 "   [:Decimal_Number:] Any-Remove\n"
222                 "   [:Final_Punctuation:] Any-Remove\n"
223                 "   [:Georgian:] Any-Upper\n"
224                 "   [:Katakana:] Any-Remove\n"
225                 "   [:Arabic:] Any-Remove\n"
226                 "   [:Punctuation:] Remove\n"
227                 "   [[:Punctuation:]-[.,]] Remove\n"
228                 "   [:Line_Separator:] Any-Remove\n"
229                 "   [:Math_Symbol:] Any-Remove\n"
230                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
231                 "   [:^Number:] Remove (numeric tokenization)\n"
232                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
233                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
234                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
235                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
236                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
237                 "\n"
238                 "see http://userguide.icu-project.org/transforms/general\n"
239                 "    http://www.unicode.org/reports/tr44/\n"
240             );
241         
242         
243         fprintf(p_config->outfile, "\n\n");
244         
245     }
246 }
247
248 static void print_icu_xml_locales(const struct config_t *p_config)
249 {
250     int32_t count;
251     int32_t i;
252     UErrorCode status = U_ZERO_ERROR;
253     
254     UChar keyword[64];
255     int32_t keyword_len = 0;
256     char keyword_str[128];
257     int32_t keyword_str_len = 0;
258
259     UChar language[64];
260     int32_t language_len = 0;
261     char lang_str[128];
262     int32_t lang_str_len = 0;
263
264     UChar script[64];
265     int32_t script_len = 0;
266     char script_str[128];
267     int32_t script_str_len = 0;
268
269     UChar location[64];
270     int32_t location_len = 0;
271     char location_str[128];
272     int32_t location_str_len = 0;
273
274     UChar variant[64];
275     int32_t variant_len = 0;
276     char variant_str[128];
277     int32_t variant_str_len = 0;
278
279     UChar name[64];
280     int32_t name_len = 0;
281     char name_str[128];
282     int32_t name_str_len = 0;
283
284     UChar localname[64];
285     int32_t localname_len = 0;
286     char localname_str[128];
287     int32_t localname_str_len = 0;
288
289     count = uloc_countAvailable() ;
290
291     if (p_config->xmloutput)
292     {
293         fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
294                 count, uloc_getDefault(), ucol_countAvailable());
295     }
296     else
297     {
298         fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
299         fprintf(p_config->outfile, "Default locale is: %s\n",  uloc_getDefault());
300     }
301   
302     for (i = 0; i < count; i++) 
303     {
304
305         keyword_len 
306             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
307                                      keyword, 64, 
308                                      &status);
309
310         u_strToUTF8(keyword_str, 128, &keyword_str_len,
311                     keyword, keyword_len,
312                     &status);
313     
314     
315         language_len 
316             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
317                                       language, 64, 
318                                       &status);
319
320         u_strToUTF8(lang_str, 128, &lang_str_len,
321                     language, language_len,
322                     &status);
323
324
325         script_len 
326             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
327                                     script, 64, 
328                                     &status);
329
330         u_strToUTF8(script_str, 128, &script_str_len,
331                     script, script_len,
332                     &status);
333
334         location_len 
335             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
336                                      location, 64, 
337                                      &status);
338
339         u_strToUTF8(location_str, 128, &location_str_len,
340                     location, location_len,
341                     &status);
342
343         variant_len 
344             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
345                                      variant, 64, 
346                                      &status);
347
348         u_strToUTF8(variant_str, 128, &variant_str_len,
349                     variant, variant_len,
350                     &status);
351
352         name_len 
353             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
354                                   name, 64, 
355                                   &status);
356
357         u_strToUTF8(name_str, 128, &name_str_len,
358                     name, name_len,
359                     &status);
360
361         localname_len 
362             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
363                                   localname, 64, 
364                                   &status);
365
366         u_strToUTF8(localname_str, 128, &localname_str_len,
367                     localname, localname_len,
368                     &status);
369
370
371         if (p_config->xmloutput)
372         {
373             fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
374             /* fprintf(p_config->outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
375             /* if (strlen(keyword_str)) */
376             /*   fprintf(p_config->outfile, " keyword=\"%s\"", keyword_str); */
377             /* if (ucol_getAvailable(i)) */
378             /*   fprintf(p_config->outfile, " collation=\"1\""); */
379             if (strlen(lang_str))
380                 fprintf(p_config->outfile, " language=\"%s\"", lang_str);
381             if (strlen(script_str))
382                 fprintf(p_config->outfile, " script=\"%s\"", script_str);
383             if (strlen(location_str))
384                 fprintf(p_config->outfile, " location=\"%s\"", location_str);
385             if (strlen(variant_str))
386                 fprintf(p_config->outfile, " variant=\"%s\"", variant_str);
387             if (strlen(name_str))
388                 fprintf(p_config->outfile, " name=\"%s\"", name_str);
389             if (strlen(localname_str))
390                 fprintf(p_config->outfile, " localname=\"%s\"", localname_str);
391             fprintf(p_config->outfile, ">");
392             if (strlen(localname_str))
393                 fprintf(p_config->outfile, "%s", localname_str);
394             fprintf(p_config->outfile, "</locale>\n"); 
395         }
396         else if (1 == p_config->xmloutput)
397         {
398             fprintf(p_config->outfile, "%s", uloc_getAvailable(i)); 
399             fprintf(p_config->outfile, " | ");
400             if (strlen(name_str))
401                 fprintf(p_config->outfile, "%s", name_str);
402             fprintf(p_config->outfile, " | ");
403             if (strlen(localname_str))
404                 fprintf(p_config->outfile, "%s", localname_str);
405             fprintf(p_config->outfile, "\n");
406         }
407         else
408             fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i));
409     }
410     if (p_config->xmloutput)
411         fprintf(p_config->outfile, "</locales>\n");
412     else
413         fprintf(p_config->outfile, "\n");
414
415     if (U_FAILURE(status))
416     {
417         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
418         exit(2);
419     }
420 }
421
422
423 static void print_info(const struct config_t *p_config)
424 {
425     if (p_config->xmloutput)
426         fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
427                 "<icu>\n");
428
429     if ('c' == p_config->print[0])
430         print_icu_converters(p_config);
431     else if ('l' == p_config->print[0])
432         print_icu_xml_locales(p_config);
433     else if ('t' == p_config->print[0])
434         print_icu_transliterators(p_config);
435     else {
436         print_icu_converters(p_config);
437         print_icu_xml_locales(p_config);
438         print_icu_transliterators(p_config);
439     }
440
441     if (p_config->xmloutput)
442         fprintf(p_config->outfile, "</icu>\n");
443
444     exit(0);
445 }
446
447
448
449 static void process_text_file(struct config_t *p_config)
450 {
451     char *line = 0;
452     char linebuf[1024];
453  
454     xmlDoc *doc = xmlParseFile(p_config->conffile);  
455     xmlNode *xml_node = xmlDocGetRootElement(doc);
456
457     long unsigned int token_count = 0;    
458     long unsigned int line_count = 0;    
459     
460     UErrorCode status = U_ZERO_ERROR;
461     
462     if (!xml_node)
463     {   
464         printf("Could not parse XML config file '%s' \n",
465                 p_config->conffile);
466         exit(1);
467     }
468
469     p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
470
471     if (!p_config->chain || !U_SUCCESS(status))
472     {   
473         printf("Could not set up ICU chain from config file '%s' \n",
474                 p_config->conffile);
475         if (!U_SUCCESS(status))
476             printf("ICU Error: %d %s\n", status, u_errorName(status));
477         exit(1);
478     }
479
480     if (p_config->xmloutput)
481         fprintf(p_config->outfile,
482                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
483                 "<icu>\n"
484                 "<tokens>\n");
485     
486     /* read input lines for processing */
487     while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
488     {
489         WRBUF sw = wrbuf_alloc();
490         WRBUF cdata = wrbuf_alloc();
491         int success = icu_chain_assign_cstr(p_config->chain, line, &status);
492         line_count++;
493
494         while (success && icu_chain_next_token(p_config->chain, &status))
495         {
496             if (U_FAILURE(status))
497                 success = 0;
498             else
499             {
500                 const char *sortkey = icu_chain_token_sortkey(p_config->chain);
501                 wrbuf_rewind(sw);
502                 wrbuf_puts_escaped(sw, sortkey);
503                 token_count++;
504                 if (p_config->xmloutput)                    
505                 {
506                     fprintf(p_config->outfile, 
507                             "<token id=\"%lu\" line=\"%lu\"",
508                             token_count, line_count);
509
510                     wrbuf_rewind(cdata);
511                     wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
512                     fprintf(p_config->outfile, " norm=\"%s\"",
513                             wrbuf_cstr(cdata));
514
515                     wrbuf_rewind(cdata);
516                     wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
517                     fprintf(p_config->outfile, " display=\"%s\"",
518                             wrbuf_cstr(cdata));
519                     
520                     if (p_config->sortoutput)
521                     {
522                         wrbuf_rewind(cdata);
523                         wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
524                         fprintf(p_config->outfile, " sortkey=\"%s\"",
525                                 wrbuf_cstr(cdata));
526                     }
527                     fprintf(p_config->outfile, "/>\n");
528                 }
529                 else
530                 {
531                     fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
532                             token_count,
533                             line_count,
534                             icu_chain_token_norm(p_config->chain),
535                             icu_chain_token_display(p_config->chain));
536                     if (p_config->sortoutput)
537                     {
538                         fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
539                     }
540                     fprintf(p_config->outfile, "\n");
541                 }
542             }
543         }
544         wrbuf_destroy(sw);
545         wrbuf_destroy(cdata);
546     }
547
548     if (p_config->xmloutput)
549         fprintf(p_config->outfile,
550                 "</tokens>\n"
551                 "</icu>\n");
552     
553     icu_chain_destroy(p_config->chain);
554     xmlFreeDoc(doc);
555     if (line)
556         free(line);
557 }
558
559 #endif /* YAZ_HAVE_ICU */
560
561
/* Program entry point.
 *
 * With ICU support: parse the command line, tokenize the input when a
 * configuration file was given (-c) and print ICU information when -p was
 * given. print_info() ends the program itself, so the cleanup below is
 * only reached when -p is absent.
 *
 * Without ICU support: print a notice and exit with status 3. */
int main(int argc, char **argv)
{
#if YAZ_HAVE_ICU
    /* struct config_t only exists when ICU is available, so the variable
       has to be declared inside the conditional section as well */
    struct config_t config;

    read_params(argc, argv, &config);

    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

    /* read_params() leaves infile pointing at an opened file or at stdin */
    if (config.infile != stdin)
        fclose(config.infile);

    u_cleanup();
#else /* YAZ_HAVE_ICU */

    printf("ICU not available on your system.\n"
           "Please install libicu-dev and icu-doc or similar, "
           "re-configure and re-compile\n");


    exit(3);
#endif /* YAZ_HAVE_ICU */

    return 0;
}
588
589
590 /*
591  * Local variables:
592  * c-basic-offset: 4
593  * c-file-style: "Stroustrup"
594  * indent-tabs-mode: nil
595  * End:
596  * vim: shiftwidth=4 tabstop=8 expandtab
597  */
598