Renamed 'normalize' rule to 'transform'.
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: yaz-icu.c,v 1.13 2007-11-12 11:11:16 adam Exp $
6  */
7
8 #if HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
12 #include <string.h>
13
14 #include <stdio.h>
15 #include <stdlib.h>
16
17 #include <yaz/options.h>
18
19 #if YAZ_HAVE_ICU
20
21 #include <unicode/ucnv.h>
22 #include <unicode/ustring.h>
23 #include <unicode/ucol.h> 
24 #include <unicode/ubrk.h>
25 #include <unicode/utrans.h>
26
27 #include <yaz/icu.h>
28
/* command line and config parameters */
static struct config_t { 
    char conffile[1024];    /* value of -c: path of the ICU chain XML config file */
    char print[1024];       /* value of -p: first character selects what to list */
    int xmloutput;          /* set to 1 by -x: emit XML instead of plain text */
    yaz_icu_chain_t chain;  /* ICU chain built from conffile by process_text_file() */
    FILE * infile;          /* text input; read_params() defaults it to stdin */
    FILE * outfile;         /* all regular output; read_params() defaults it to stdout */
} config;
38
39
40   
/*
 * Write the usage message, including an example ICU chain configuration,
 * to stderr and terminate the program with exit status 1.
 *
 * p_config is accepted for symmetry with the other helpers but is not read.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "yaz-icu\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain locale=\"en\">\n"
        "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <casemap rule=\"l\"/>\n"
        "</icu_chain>\n";

    (void) p_config;

    /* neither text contains a conversion specifier, so fputs is sufficient */
    fputs("Calling error, valid options are :\n", stderr);
    fputs(usage_text, stderr);

    exit(1);
}
65
66 void read_params(int argc, char **argv, struct config_t *p_config)
67 {    
68     char *arg;
69     int ret;
70     
71     /* set default parameters */
72     p_config->conffile[0] = 0;
73     p_config->print[0] = 0;
74     p_config->xmloutput = 0;
75     p_config->chain = 0;
76     p_config->infile = stdin;
77     p_config->outfile = stdout;
78     
79     /* set up command line parameters */
80     
81     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
82     {
83         switch (ret)
84         {
85         case 'c':
86             strcpy(p_config->conffile, arg);
87             break;
88         case 'p':
89             strcpy(p_config->print, arg);
90             break;
91         case 'x':
92             p_config->xmloutput = 1;
93             break;
94         default:
95             print_option_error(p_config);
96         }
97     }
98     
99     if ((!strlen(p_config->conffile)
100          && !strlen(p_config->print))
101         || !config.infile
102         || !config.outfile)
103         
104         print_option_error(p_config);
105 }
106
107
108 /*     UConverter *conv; */
109 /*     conv = ucnv_open("utf-8", &status); */
110 /*     assert(U_SUCCESS(status)); */
111
112 /*     *ustr16_len  */
113 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
114 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
115 /*                       &status); */
116   
117
118
119 /*      ucnv_fromUChars(conv, */
120 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
121 /*                      ustr16, *ustr16_len, */
122 /*                      &status); */
123 /*      ucnv_close(conv); */
124
125
126 static void print_icu_converters(const struct config_t *p_config)
127 {
128     int32_t count;
129     int32_t i;
130
131     count = ucnv_countAvailable();
132     if (p_config->xmloutput)
133         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
134                 count, ucnv_getDefaultName());
135     else {    
136         fprintf(config.outfile, "Available ICU converters: %d\n", count);
137         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
138                 ucnv_getDefaultName());
139     }
140     
141     for(i=0;i<count;i++)
142     {
143         if (p_config->xmloutput)
144             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
145                     ucnv_getAvailableName(i));
146         else     
147             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
148     }
149     
150     if (p_config->xmloutput)
151         fprintf(config.outfile, "</converters>\n");
152     else
153         fprintf(config.outfile, "\n");
154 }
155
156 static void print_icu_transliterators(const struct config_t *p_config)
157 {
158     int32_t buf_cap = 128;
159     char buf[buf_cap];
160     int32_t i;
161     int32_t count = utrans_countAvailableIDs();
162     
163     if (p_config->xmloutput)
164         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
165     else 
166         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
167     
168     for(i = 0; i <count; i++)
169     {
170         utrans_getAvailableID(i, buf, buf_cap);
171         if (p_config->xmloutput)
172             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
173         else
174             fprintf(config.outfile, " %s", buf);
175     }
176     
177     if (p_config->xmloutput)
178     {
179         fprintf(config.outfile, "</transliterators>\n");
180     }
181     else
182     {
183         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
184                 "   Pattern         Description\n"
185                 "   Ranges          [a-z]       The lower case letters a through z\n"
186                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
187                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
188                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
189                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
190                 "\n"
191                 "   Combination     Example\n"
192                 "   Union           [[:Greek:] [:letter:]]\n"
193                 "   Intersection    [[:Greek:] & [:letter:]]\n"
194                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
195                 "   Complement      [^[:Greek:] [:letter:]]\n"
196                 "\n"
197              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
198                 "\n"
199                 "Examples:\n"
200                 "   [:Punctuation:] Any-Remove\n"
201                 "   [:Cased-Letter:] Any-Upper\n"
202                 "   [:Control:] Any-Remove\n"
203                 "   [:Decimal_Number:] Any-Remove\n"
204                 "   [:Final_Punctuation:] Any-Remove\n"
205                 "   [:Georgian:] Any-Upper\n"
206                 "   [:Katakana:] Any-Remove\n"
207                 "   [:Arabic:] Any-Remove\n"
208                 "   [:Punctuation:] Remove\n"
209                 "   [[:Punctuation:]-[.,]] Remove\n"
210                 "   [:Line_Separator:] Any-Remove\n"
211                 "   [:Math_Symbol:] Any-Remove\n"
212                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
213                 "   [:^Number:] Remove (numeric tokenization)\n"
214                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
215                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
216                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
217                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
218                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
219                 "\n"
220                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
221                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
222                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
223                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
224             );
225         
226         
227         fprintf(config.outfile, "\n\n");
228         
229     }
230 }
231
232 static void print_icu_xml_locales(const struct config_t *p_config)
233 {
234     int32_t count;
235     int32_t i;
236     UErrorCode status = U_ZERO_ERROR;
237     
238     UChar keyword[64];
239     int32_t keyword_len = 0;
240     char keyword_str[128];
241     int32_t keyword_str_len = 0;
242
243     UChar language[64];
244     int32_t language_len = 0;
245     char lang_str[128];
246     int32_t lang_str_len = 0;
247
248     UChar script[64];
249     int32_t script_len = 0;
250     char script_str[128];
251     int32_t script_str_len = 0;
252
253     UChar location[64];
254     int32_t location_len = 0;
255     char location_str[128];
256     int32_t location_str_len = 0;
257
258     UChar variant[64];
259     int32_t variant_len = 0;
260     char variant_str[128];
261     int32_t variant_str_len = 0;
262
263     UChar name[64];
264     int32_t name_len = 0;
265     char name_str[128];
266     int32_t name_str_len = 0;
267
268     UChar localname[64];
269     int32_t localname_len = 0;
270     char localname_str[128];
271     int32_t localname_str_len = 0;
272
273     count = uloc_countAvailable() ;
274
275     if (p_config->xmloutput)
276     {
277         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
278                 count, uloc_getDefault(), ucol_countAvailable());
279     }
280   
281     for(i=0;i<count;i++) 
282     {
283
284         keyword_len 
285             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
286                                      keyword, 64, 
287                                      &status);
288
289         u_strToUTF8(keyword_str, 128, &keyword_str_len,
290                     keyword, keyword_len,
291                     &status);
292     
293     
294         language_len 
295             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
296                                       language, 64, 
297                                       &status);
298
299         u_strToUTF8(lang_str, 128, &lang_str_len,
300                     language, language_len,
301                     &status);
302
303
304         script_len 
305             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
306                                     script, 64, 
307                                     &status);
308
309         u_strToUTF8(script_str, 128, &script_str_len,
310                     script, script_len,
311                     &status);
312
313         location_len 
314             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
315                                      location, 64, 
316                                      &status);
317
318         u_strToUTF8(location_str, 128, &location_str_len,
319                     location, location_len,
320                     &status);
321
322         variant_len 
323             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
324                                      variant, 64, 
325                                      &status);
326
327         u_strToUTF8(variant_str, 128, &variant_str_len,
328                     variant, variant_len,
329                     &status);
330
331         name_len 
332             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
333                                   name, 64, 
334                                   &status);
335
336         u_strToUTF8(name_str, 128, &name_str_len,
337                     name, name_len,
338                     &status);
339
340         localname_len 
341             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
342                                   localname, 64, 
343                                   &status);
344
345         u_strToUTF8(localname_str, 128, &localname_str_len,
346                     localname, localname_len,
347                     &status);
348
349
350         if (p_config->xmloutput)
351         {
352             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
353             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
354             /* if (strlen(keyword_str)) */
355             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
356             /* if (ucol_getAvailable(i)) */
357             /*   fprintf(config.outfile, " collation=\"1\""); */
358             if (strlen(lang_str))
359                 fprintf(config.outfile, " language=\"%s\"", lang_str);
360             if (strlen(script_str))
361                 fprintf(config.outfile, " script=\"%s\"", script_str);
362             if (strlen(location_str))
363                 fprintf(config.outfile, " location=\"%s\"", location_str);
364             if (strlen(variant_str))
365                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
366             if (strlen(name_str))
367                 fprintf(config.outfile, " name=\"%s\"", name_str);
368             if (strlen(localname_str))
369                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
370             fprintf(config.outfile, ">");
371             if (strlen(localname_str))
372                 fprintf(config.outfile, "%s", localname_str);
373             fprintf(config.outfile, "</locale>\n"); 
374         }
375         else if (1 == p_config->xmloutput)
376         {
377             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
378             fprintf(config.outfile, " | ");
379             if (strlen(name_str))
380                 fprintf(config.outfile, "%s", name_str);
381             fprintf(config.outfile, " | ");
382             if (strlen(localname_str))
383                 fprintf(config.outfile, "%s", localname_str);
384             fprintf(config.outfile, "\n");
385         }
386         else
387             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
388     }
389     if (p_config->xmloutput)
390         fprintf(config.outfile, "</locales>\n");
391     else
392         fprintf(config.outfile, "\n");
393
394     if(U_FAILURE(status))
395     {
396         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
397         exit(status);
398     }
399 }
400
401
402 static void print_info(const struct config_t *p_config)
403 {
404     if (p_config->xmloutput)
405         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
406                 "<icu>\n");
407
408     if ('c' == config.print[0])
409         print_icu_converters(&config);
410     else if ('l' == config.print[0])
411         print_icu_xml_locales(&config);
412     else if ('t' == config.print[0])
413         print_icu_transliterators(&config);
414     else {
415         print_icu_converters(&config);
416         print_icu_xml_locales(&config);
417         print_icu_transliterators(&config);
418     }
419
420     if (p_config->xmloutput)
421         fprintf(config.outfile, "</icu>\n");
422
423     exit(0);
424 }
425
426
427
428 static void process_text_file(const struct config_t *p_config)
429 {
430     char *line = 0;
431     char linebuf[1024];
432  
433     xmlDoc *doc = xmlParseFile(config.conffile);  
434     xmlNode *xml_node = xmlDocGetRootElement(doc);
435
436     long unsigned int token_count = 0;    
437     long unsigned int line_count = 0;    
438     
439     UErrorCode status = U_ZERO_ERROR;
440     int success = 0;
441     
442     if (! xml_node)
443     {   
444         printf("Could not parse XML config file '%s' \n",
445                 config.conffile);
446         exit (1);
447     }
448
449     config.chain = icu_chain_xml_config(xml_node, 0, &status);
450
451     if (config.chain && U_SUCCESS(status))
452         success = 1;
453     else {   
454         printf("Could not set up ICU chain from config file '%s' \n",
455                 config.conffile);
456         exit (1);
457     }
458
459     if (p_config->xmloutput)
460         fprintf(config.outfile,
461                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
462                 "<icu>\n"
463                 "<tokens>\n");
464     
465     /* read input lines for processing */
466     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
467     {
468         success = icu_chain_assign_cstr(config.chain, line, &status);
469         line_count++;
470
471         while (success && icu_chain_next_token(config.chain, &status))
472         {
473             if (U_FAILURE(status))
474                 success = 0;
475             else {
476                 token_count++;
477                 if (p_config->xmloutput)                    
478                     fprintf(config.outfile, 
479                             "<token id=\%lu\" line=\"%lu\""
480                             " norm=\"%s\" display=\"%s\"/>\n",
481                             token_count,
482                             line_count,
483                             icu_chain_token_norm(config.chain),
484                             icu_chain_token_display(config.chain));
485                 else
486                     fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
487                             token_count,
488                             line_count,
489                             icu_chain_token_norm(config.chain),
490                             icu_chain_token_display(config.chain));
491             }
492         }
493         
494     }
495
496     if (p_config->xmloutput)
497         fprintf(config.outfile, 
498                 "</tokens>\n"
499                 "</icu>\n");
500
501     icu_chain_destroy(config.chain);
502     xmlFreeDoc(doc);
503     if (line)
504         free(line);
505 }
506
507 #endif /* YAZ_HAVE_ICU */
508
509
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line, tokenize the
 * input when a config file was given (-c), then print ICU information
 * when a selector was given (-p).  Without ICU support: print a notice
 * explaining how to enable it.  Returns 0 in both cases.
 */
int main(int argc, char **argv) 
{
#if YAZ_HAVE_ICU

    read_params(argc, argv, &config);

    /* both members are arrays, so only their contents need testing */
    if (config.conffile[0] != '\0')
        process_text_file(&config);

    if (config.print[0] != '\0')
        print_info(&config);

#else /* YAZ_HAVE_ICU */

    fputs("ICU not available on your system.\n"
          "Please install libicu36-dev and icu-doc or similar, "
          "re-configure and re-compile\n", stdout);

#endif /* YAZ_HAVE_ICU */

    return 0;
}
534
535
536 /*
537  * Local variables:
538  * c-basic-offset: 4
539  * indent-tabs-mode: nil
540  * End:
541  * vim: shiftwidth=4 tabstop=8 expandtab
542  */
543