c6e7a0e2eb11fb0bfebe497194f211737f581391
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: yaz-icu.c,v 1.9 2007-11-08 08:17:18 adam Exp $
6  */
7
8 #if HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
12 #include <string.h>
13
14 #include <stdio.h>
15 #include <stdlib.h>
16
17 #include <yaz/options.h>
18
19
20 #if YAZ_HAVE_ICU
21
22 #include <unicode/ucnv.h>
23 #include <unicode/ustring.h>
24
25 #include <yaz/icu_I18N.h>
26
/* Command line and configuration parameters, shared by the whole program. */
static struct config_t {
    char conffile[1024];        /* argument of -c: ICU chain XML file */
    char print[1024];           /* argument of -p: which ICU info to print */
    int xmloutput;              /* set to 1 by -x */
    struct icu_chain *chain;    /* chain built by process_text_file() */
    FILE *infile;               /* input stream, stdin by default */
    FILE *outfile;              /* output stream, stdout by default */
} config;
36
37
38   
/*
 * Writes the usage summary, a few example invocations and a sample ICU
 * chain configuration to stderr, then terminates the process with exit
 * status 1.  The p_config argument is accepted but not used.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "yaz-icu\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain id=\"en:word\" locale=\"en\">\n"
        "  <normalize rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <casemap rule=\"l\"/>\n"
        "</icu_chain>\n";

    (void) p_config;

    /* Neither string contains a conversion, so fputs emits the same bytes. */
    fputs("Calling error, valid options are :\n", stderr);
    fputs(usage_text, stderr);

    exit(1);
}
63
64 void read_params(int argc, char **argv, struct config_t *p_config)
65 {    
66     char *arg;
67     int ret;
68     
69     /* set default parameters */
70     p_config->conffile[0] = 0;
71     p_config->print[0] = 0;
72     p_config->xmloutput = 0;
73     p_config->chain = 0;
74     p_config->infile = stdin;
75     p_config->outfile = stdout;
76     
77     /* set up command line parameters */
78     
79     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
80     {
81         switch (ret)
82         {
83         case 'c':
84             strcpy(p_config->conffile, arg);
85             break;
86         case 'p':
87             strcpy(p_config->print, arg);
88             break;
89         case 'x':
90             p_config->xmloutput = 1;
91             break;
92         default:
93             print_option_error(p_config);
94         }
95     }
96     
97     if ((!strlen(p_config->conffile)
98          && !strlen(p_config->print))
99         || !config.infile
100         || !config.outfile)
101         
102         print_option_error(p_config);
103 }
104
105
106 /*     UConverter *conv; */
107 /*     conv = ucnv_open("utf-8", &status); */
108 /*     assert(U_SUCCESS(status)); */
109
110 /*     *ustr16_len  */
111 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
112 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
113 /*                       &status); */
114   
115
116
117 /*      ucnv_fromUChars(conv, */
118 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
119 /*                      ustr16, *ustr16_len, */
120 /*                      &status); */
121 /*      ucnv_close(conv); */
122
123
124 static void print_icu_converters(const struct config_t *p_config)
125 {
126     int32_t count;
127     int32_t i;
128
129     count = ucnv_countAvailable();
130     if (p_config->xmloutput)
131         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
132                 count, ucnv_getDefaultName());
133     else {    
134         fprintf(config.outfile, "Available ICU converters: %d\n", count);
135         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
136                 ucnv_getDefaultName());
137     }
138     
139     for(i=0;i<count;i++){
140         if (p_config->xmloutput)
141             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
142                     ucnv_getAvailableName(i));
143         else     
144             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
145     }
146     
147     if (p_config->xmloutput)
148         fprintf(config.outfile, "</converters>\n");
149     else
150         fprintf(config.outfile, "\n");
151 }
152
153 static void print_icu_transliterators(const struct config_t *p_config)
154 {
155     int32_t buf_cap = 128;
156     char buf[buf_cap];
157     int32_t i;
158     int32_t count = utrans_countAvailableIDs();
159     
160     if (p_config->xmloutput)
161         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
162     else 
163         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
164     
165     for(i = 0; i <count; i++)
166     {
167         utrans_getAvailableID(i, buf, buf_cap);
168         if (p_config->xmloutput)
169             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
170         else
171             fprintf(config.outfile, " %s", buf);
172     }
173     
174     if (p_config->xmloutput){
175         fprintf(config.outfile, "</transliterators>\n");
176     }
177     else
178     {
179         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
180                 "   Pattern         Description\n"
181                 "   Ranges          [a-z]       The lower case letters a through z\n"
182                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
183                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
184                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
185                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
186                 "\n"
187                 "   Combination     Example\n"
188                 "   Union           [[:Greek:] [:letter:]]\n"
189                 "   Intersection    [[:Greek:] & [:letter:]]\n"
190                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
191                 "   Complement      [^[:Greek:] [:letter:]]\n"
192                 "\n"
193              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
194                 "\n"
195                 "Examples:\n"
196                 "   [:Punctuation:] Any-Remove\n"
197                 "   [:Cased-Letter:] Any-Upper\n"
198                 "   [:Control:] Any-Remove\n"
199                 "   [:Decimal_Number:] Any-Remove\n"
200                 "   [:Final_Punctuation:] Any-Remove\n"
201                 "   [:Georgian:] Any-Upper\n"
202                 "   [:Katakana:] Any-Remove\n"
203                 "   [:Arabic:] Any-Remove\n"
204                 "   [:Punctuation:] Remove\n"
205                 "   [[:Punctuation:]-[.,]] Remove\n"
206                 "   [:Line_Separator:] Any-Remove\n"
207                 "   [:Math_Symbol:] Any-Remove\n"
208                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
209                 "   [:^Number:] Remove (numeric tokenization)\n"
210                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
211                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
212                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
213                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
214                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
215                 "\n"
216                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
217                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
218                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
219                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
220             );
221         
222         
223         fprintf(config.outfile, "\n\n");
224         
225     }
226 }
227
228 static void print_icu_xml_locales(const struct config_t *p_config)
229 {
230     int32_t count;
231     int32_t i;
232     UErrorCode status = U_ZERO_ERROR;
233     
234     UChar keyword[64];
235     int32_t keyword_len = 0;
236     char keyword_str[128];
237     int32_t keyword_str_len = 0;
238
239     UChar language[64];
240     int32_t language_len = 0;
241     char lang_str[128];
242     int32_t lang_str_len = 0;
243
244     UChar script[64];
245     int32_t script_len = 0;
246     char script_str[128];
247     int32_t script_str_len = 0;
248
249     UChar location[64];
250     int32_t location_len = 0;
251     char location_str[128];
252     int32_t location_str_len = 0;
253
254     UChar variant[64];
255     int32_t variant_len = 0;
256     char variant_str[128];
257     int32_t variant_str_len = 0;
258
259     UChar name[64];
260     int32_t name_len = 0;
261     char name_str[128];
262     int32_t name_str_len = 0;
263
264     UChar localname[64];
265     int32_t localname_len = 0;
266     char localname_str[128];
267     int32_t localname_str_len = 0;
268
269     count = uloc_countAvailable() ;
270
271     if (p_config->xmloutput){
272     
273         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
274                 count, uloc_getDefault(), ucol_countAvailable());
275     }
276   
277     for(i=0;i<count;i++) 
278     {
279
280         keyword_len 
281             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
282                                      keyword, 64, 
283                                      &status);
284
285         u_strToUTF8(keyword_str, 128, &keyword_str_len,
286                     keyword, keyword_len,
287                     &status);
288     
289     
290         language_len 
291             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
292                                       language, 64, 
293                                       &status);
294
295         u_strToUTF8(lang_str, 128, &lang_str_len,
296                     language, language_len,
297                     &status);
298
299
300         script_len 
301             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
302                                     script, 64, 
303                                     &status);
304
305         u_strToUTF8(script_str, 128, &script_str_len,
306                     script, script_len,
307                     &status);
308
309         location_len 
310             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
311                                      location, 64, 
312                                      &status);
313
314         u_strToUTF8(location_str, 128, &location_str_len,
315                     location, location_len,
316                     &status);
317
318         variant_len 
319             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
320                                      variant, 64, 
321                                      &status);
322
323         u_strToUTF8(variant_str, 128, &variant_str_len,
324                     variant, variant_len,
325                     &status);
326
327         name_len 
328             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
329                                   name, 64, 
330                                   &status);
331
332         u_strToUTF8(name_str, 128, &name_str_len,
333                     name, name_len,
334                     &status);
335
336         localname_len 
337             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
338                                   localname, 64, 
339                                   &status);
340
341         u_strToUTF8(localname_str, 128, &localname_str_len,
342                     localname, localname_len,
343                     &status);
344
345
346         if (p_config->xmloutput){
347             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
348             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
349             /* if (strlen(keyword_str)) */
350             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
351             /* if (ucol_getAvailable(i)) */
352             /*   fprintf(config.outfile, " collation=\"1\""); */
353             if (strlen(lang_str))
354                 fprintf(config.outfile, " language=\"%s\"", lang_str);
355             if (strlen(script_str))
356                 fprintf(config.outfile, " script=\"%s\"", script_str);
357             if (strlen(location_str))
358                 fprintf(config.outfile, " location=\"%s\"", location_str);
359             if (strlen(variant_str))
360                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
361             if (strlen(name_str))
362                 fprintf(config.outfile, " name=\"%s\"", name_str);
363             if (strlen(localname_str))
364                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
365             fprintf(config.outfile, ">");
366             if (strlen(localname_str))
367                 fprintf(config.outfile, "%s", localname_str);
368             fprintf(config.outfile, "</locale>\n"); 
369         }
370         else if (1 == p_config->xmloutput){
371             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
372             fprintf(config.outfile, " | ");
373             if (strlen(name_str))
374                 fprintf(config.outfile, "%s", name_str);
375             fprintf(config.outfile, " | ");
376             if (strlen(localname_str))
377                 fprintf(config.outfile, "%s", localname_str);
378             fprintf(config.outfile, "\n");
379         }
380         else
381             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
382     }
383     if (p_config->xmloutput)
384         fprintf(config.outfile, "</locales>\n");
385     else
386         fprintf(config.outfile, "\n");
387
388     if(U_FAILURE(status)) {
389         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
390         exit(status);
391     }
392 }
393
394
395 static void print_info(const struct config_t *p_config)
396 {
397     if (p_config->xmloutput)
398         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
399                 "<icu>\n");
400
401     if ('c' == config.print[0])
402         print_icu_converters(&config);
403     else if ('l' == config.print[0])
404         print_icu_xml_locales(&config);
405     else if ('t' == config.print[0])
406         print_icu_transliterators(&config);
407     else {
408         print_icu_converters(&config);
409         print_icu_xml_locales(&config);
410         print_icu_transliterators(&config);
411     }
412
413     if (p_config->xmloutput)
414         fprintf(config.outfile, "</icu>\n");
415
416     exit(0);
417 }
418
419
420
421 static void process_text_file(const struct config_t *p_config)
422 {
423     char *line = 0;
424     char linebuf[1024];
425  
426     xmlDoc *doc = xmlParseFile(config.conffile);  
427     xmlNode *xml_node = xmlDocGetRootElement(doc);
428     xmlChar *xml_locale = xmlGetProp(xml_node, (xmlChar *) "locale");
429
430     long unsigned int token_count = 0;    
431     long unsigned int line_count = 0;    
432     
433     UErrorCode status = U_ZERO_ERROR;
434     int success = 0;
435     
436     if (! xml_node) {   
437         printf("Could not parse XML config file '%s' \n",
438                 config.conffile);
439         exit (1);
440     }
441
442     if (!xml_locale || !strlen((const char *) xml_locale))
443         return;        
444     
445     config.chain = icu_chain_xml_config(xml_node, (const char *) xml_locale, 0,
446                                         &status);
447
448     xmlFree(xml_locale);
449
450
451     if (config.chain && U_SUCCESS(status))
452         success = 1;
453     else {   
454         printf("Could not set up ICU chain from config file '%s' \n",
455                 config.conffile);
456         exit (1);
457     }
458     
459
460
461     if (p_config->xmloutput)
462         fprintf(config.outfile,
463                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
464                 "<icu>\n"
465                 "<tokens>\n");
466     
467     /* read input lines for processing */
468     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
469     {
470         success = icu_chain_assign_cstr(config.chain, line, &status);
471         line_count++;
472
473         while (success && icu_chain_next_token(config.chain, &status)){
474             if (U_FAILURE(status))
475                 success = 0;
476             else {
477                 token_count++;
478                 if (p_config->xmloutput)                    
479                     fprintf(config.outfile, 
480                             "<token id=\%lu\" line=\"%lu\""
481                             " norm=\"%s\" display=\"%s\"/>\n",
482                             token_count,
483                             line_count,
484                             icu_chain_token_norm(config.chain),
485                             icu_chain_token_display(config.chain));
486                 else
487                     fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
488                             token_count,
489                             line_count,
490                             icu_chain_token_norm(config.chain),
491                             icu_chain_token_display(config.chain));
492             }
493         }
494         
495     }
496
497     if (p_config->xmloutput)
498         fprintf(config.outfile, 
499                 "</tokens>\n"
500                 "</icu>\n");
501
502     icu_chain_destroy(config.chain);
503     xmlFreeDoc(doc);
504     if (line)
505         free(line);
506 }
507
508 #endif /* YAZ_HAVE_ICU */
509
510
/*
 * Program entry point.  With ICU support compiled in, the command line is
 * parsed into the global config; the text input is tokenized when a
 * config file was named, and ICU information is printed when a print
 * selector was given.  Without ICU support only a notice is printed.
 * Returns 0 in either case.
 */
int main(int argc, char **argv)
{
#if YAZ_HAVE_ICU

    read_params(argc, argv, &config);

    /* a non-empty buffer means the option was given */
    if (config.conffile[0] != '\0')
        process_text_file(&config);

    if (config.print[0] != '\0')
        print_info(&config);

#else /* YAZ_HAVE_ICU */

    (void) argc;
    (void) argv;

    fputs("ICU not available on your system.\n"
          "Please install libicu36-dev and icu-doc or similar, "
          "re-configure and re-compile\n", stdout);

#endif /* YAZ_HAVE_ICU */

    return 0;
}
535
536
537 /*
538  * Local variables:
539  * c-basic-offset: 4
540  * indent-tabs-mode: nil
541  * End:
542  * vim: shiftwidth=4 tabstop=8 expandtab
543  */
544