Update source headers for 2008. Omit CVS ID keyword subst.
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2008 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14
15 #include <yaz/options.h>
16
17 #if YAZ_HAVE_ICU
18
19 #include <unicode/ucnv.h>
20 #include <unicode/ustring.h>
21 #include <unicode/ucol.h> 
22 #include <unicode/ubrk.h>
23 #include <unicode/utrans.h>
24
25 #include <yaz/icu.h>
26 #include <yaz/wrbuf.h>
27
28 /* commando line and config parameters */
29 static struct config_t { 
30     char conffile[1024];
31     char print[1024];
32     int xmloutput;
33     yaz_icu_chain_t chain;
34     FILE * infile;
35     FILE * outfile;
36 } config;
37
38
39   
/*
 * Write the usage message (options, example invocations and an example
 * ICU chain configuration) to stderr, then terminate the process with
 * exit status 1.  This function does not return.
 *
 * p_config is not read.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "yaz-icu\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain locale=\"en\">\n"
        "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <casemap rule=\"l\"/>\n"
        "</icu_chain>\n";

    (void) p_config;

    fputs("Calling error, valid options are :\n", stderr);
    fputs(usage_text, stderr);
    exit(1);
}
64
65 void read_params(int argc, char **argv, struct config_t *p_config)
66 {    
67     char *arg;
68     int ret;
69     
70     /* set default parameters */
71     p_config->conffile[0] = 0;
72     p_config->print[0] = 0;
73     p_config->xmloutput = 0;
74     p_config->chain = 0;
75     p_config->infile = stdin;
76     p_config->outfile = stdout;
77     
78     /* set up command line parameters */
79     
80     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
81     {
82         switch (ret)
83         {
84         case 'c':
85             strcpy(p_config->conffile, arg);
86             break;
87         case 'p':
88             strcpy(p_config->print, arg);
89             break;
90         case 'x':
91             p_config->xmloutput = 1;
92             break;
93         default:
94             print_option_error(p_config);
95         }
96     }
97     
98     if ((!strlen(p_config->conffile)
99          && !strlen(p_config->print))
100         || !config.infile
101         || !config.outfile)
102         
103         print_option_error(p_config);
104 }
105
106
107 /*     UConverter *conv; */
108 /*     conv = ucnv_open("utf-8", &status); */
109 /*     assert(U_SUCCESS(status)); */
110
111 /*     *ustr16_len  */
112 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
113 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
114 /*                       &status); */
115   
116
117
118 /*      ucnv_fromUChars(conv, */
119 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
120 /*                      ustr16, *ustr16_len, */
121 /*                      &status); */
122 /*      ucnv_close(conv); */
123
124
125 static void print_icu_converters(const struct config_t *p_config)
126 {
127     int32_t count;
128     int32_t i;
129
130     count = ucnv_countAvailable();
131     if (p_config->xmloutput)
132         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
133                 count, ucnv_getDefaultName());
134     else {    
135         fprintf(config.outfile, "Available ICU converters: %d\n", count);
136         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
137                 ucnv_getDefaultName());
138     }
139     
140     for(i=0;i<count;i++)
141     {
142         if (p_config->xmloutput)
143             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
144                     ucnv_getAvailableName(i));
145         else     
146             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
147     }
148     
149     if (p_config->xmloutput)
150         fprintf(config.outfile, "</converters>\n");
151     else
152         fprintf(config.outfile, "\n");
153 }
154
155 static void print_icu_transliterators(const struct config_t *p_config)
156 {
157     int32_t buf_cap = 128;
158     char buf[128];
159     int32_t i;
160     int32_t count = utrans_countAvailableIDs();
161     
162     if (p_config->xmloutput)
163         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
164     else 
165         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
166     
167     for(i = 0; i <count; i++)
168     {
169         utrans_getAvailableID(i, buf, buf_cap);
170         if (p_config->xmloutput)
171             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
172         else
173             fprintf(config.outfile, " %s", buf);
174     }
175     
176     if (p_config->xmloutput)
177     {
178         fprintf(config.outfile, "</transliterators>\n");
179     }
180     else
181     {
182         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
183                 "   Pattern         Description\n"
184                 "   Ranges          [a-z]       The lower case letters a through z\n"
185                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
186                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
187                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
188                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
189                 "\n"
190                 "   Combination     Example\n"
191                 "   Union           [[:Greek:] [:letter:]]\n"
192                 "   Intersection    [[:Greek:] & [:letter:]]\n"
193                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
194                 "   Complement      [^[:Greek:] [:letter:]]\n"
195                 "\n"
196              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
197                 "\n"
198                 "Examples:\n"
199                 "   [:Punctuation:] Any-Remove\n"
200                 "   [:Cased-Letter:] Any-Upper\n"
201                 "   [:Control:] Any-Remove\n"
202                 "   [:Decimal_Number:] Any-Remove\n"
203                 "   [:Final_Punctuation:] Any-Remove\n"
204                 "   [:Georgian:] Any-Upper\n"
205                 "   [:Katakana:] Any-Remove\n"
206                 "   [:Arabic:] Any-Remove\n"
207                 "   [:Punctuation:] Remove\n"
208                 "   [[:Punctuation:]-[.,]] Remove\n"
209                 "   [:Line_Separator:] Any-Remove\n"
210                 "   [:Math_Symbol:] Any-Remove\n"
211                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
212                 "   [:^Number:] Remove (numeric tokenization)\n"
213                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
214                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
215                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
216                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
217                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
218                 "\n"
219                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
220                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
221                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
222                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
223             );
224         
225         
226         fprintf(config.outfile, "\n\n");
227         
228     }
229 }
230
231 static void print_icu_xml_locales(const struct config_t *p_config)
232 {
233     int32_t count;
234     int32_t i;
235     UErrorCode status = U_ZERO_ERROR;
236     
237     UChar keyword[64];
238     int32_t keyword_len = 0;
239     char keyword_str[128];
240     int32_t keyword_str_len = 0;
241
242     UChar language[64];
243     int32_t language_len = 0;
244     char lang_str[128];
245     int32_t lang_str_len = 0;
246
247     UChar script[64];
248     int32_t script_len = 0;
249     char script_str[128];
250     int32_t script_str_len = 0;
251
252     UChar location[64];
253     int32_t location_len = 0;
254     char location_str[128];
255     int32_t location_str_len = 0;
256
257     UChar variant[64];
258     int32_t variant_len = 0;
259     char variant_str[128];
260     int32_t variant_str_len = 0;
261
262     UChar name[64];
263     int32_t name_len = 0;
264     char name_str[128];
265     int32_t name_str_len = 0;
266
267     UChar localname[64];
268     int32_t localname_len = 0;
269     char localname_str[128];
270     int32_t localname_str_len = 0;
271
272     count = uloc_countAvailable() ;
273
274     if (p_config->xmloutput)
275     {
276         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
277                 count, uloc_getDefault(), ucol_countAvailable());
278     }
279   
280     for(i=0;i<count;i++) 
281     {
282
283         keyword_len 
284             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
285                                      keyword, 64, 
286                                      &status);
287
288         u_strToUTF8(keyword_str, 128, &keyword_str_len,
289                     keyword, keyword_len,
290                     &status);
291     
292     
293         language_len 
294             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
295                                       language, 64, 
296                                       &status);
297
298         u_strToUTF8(lang_str, 128, &lang_str_len,
299                     language, language_len,
300                     &status);
301
302
303         script_len 
304             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
305                                     script, 64, 
306                                     &status);
307
308         u_strToUTF8(script_str, 128, &script_str_len,
309                     script, script_len,
310                     &status);
311
312         location_len 
313             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
314                                      location, 64, 
315                                      &status);
316
317         u_strToUTF8(location_str, 128, &location_str_len,
318                     location, location_len,
319                     &status);
320
321         variant_len 
322             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
323                                      variant, 64, 
324                                      &status);
325
326         u_strToUTF8(variant_str, 128, &variant_str_len,
327                     variant, variant_len,
328                     &status);
329
330         name_len 
331             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
332                                   name, 64, 
333                                   &status);
334
335         u_strToUTF8(name_str, 128, &name_str_len,
336                     name, name_len,
337                     &status);
338
339         localname_len 
340             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
341                                   localname, 64, 
342                                   &status);
343
344         u_strToUTF8(localname_str, 128, &localname_str_len,
345                     localname, localname_len,
346                     &status);
347
348
349         if (p_config->xmloutput)
350         {
351             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
352             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
353             /* if (strlen(keyword_str)) */
354             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
355             /* if (ucol_getAvailable(i)) */
356             /*   fprintf(config.outfile, " collation=\"1\""); */
357             if (strlen(lang_str))
358                 fprintf(config.outfile, " language=\"%s\"", lang_str);
359             if (strlen(script_str))
360                 fprintf(config.outfile, " script=\"%s\"", script_str);
361             if (strlen(location_str))
362                 fprintf(config.outfile, " location=\"%s\"", location_str);
363             if (strlen(variant_str))
364                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
365             if (strlen(name_str))
366                 fprintf(config.outfile, " name=\"%s\"", name_str);
367             if (strlen(localname_str))
368                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
369             fprintf(config.outfile, ">");
370             if (strlen(localname_str))
371                 fprintf(config.outfile, "%s", localname_str);
372             fprintf(config.outfile, "</locale>\n"); 
373         }
374         else if (1 == p_config->xmloutput)
375         {
376             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
377             fprintf(config.outfile, " | ");
378             if (strlen(name_str))
379                 fprintf(config.outfile, "%s", name_str);
380             fprintf(config.outfile, " | ");
381             if (strlen(localname_str))
382                 fprintf(config.outfile, "%s", localname_str);
383             fprintf(config.outfile, "\n");
384         }
385         else
386             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
387     }
388     if (p_config->xmloutput)
389         fprintf(config.outfile, "</locales>\n");
390     else
391         fprintf(config.outfile, "\n");
392
393     if(U_FAILURE(status))
394     {
395         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
396         exit(status);
397     }
398 }
399
400
401 static void print_info(const struct config_t *p_config)
402 {
403     if (p_config->xmloutput)
404         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
405                 "<icu>\n");
406
407     if ('c' == config.print[0])
408         print_icu_converters(&config);
409     else if ('l' == config.print[0])
410         print_icu_xml_locales(&config);
411     else if ('t' == config.print[0])
412         print_icu_transliterators(&config);
413     else {
414         print_icu_converters(&config);
415         print_icu_xml_locales(&config);
416         print_icu_transliterators(&config);
417     }
418
419     if (p_config->xmloutput)
420         fprintf(config.outfile, "</icu>\n");
421
422     exit(0);
423 }
424
425
426
427 static void process_text_file(const struct config_t *p_config)
428 {
429     char *line = 0;
430     char linebuf[1024];
431  
432     xmlDoc *doc = xmlParseFile(config.conffile);  
433     xmlNode *xml_node = xmlDocGetRootElement(doc);
434
435     long unsigned int token_count = 0;    
436     long unsigned int line_count = 0;    
437     
438     UErrorCode status = U_ZERO_ERROR;
439     int success = 0;
440     
441     if (! xml_node)
442     {   
443         printf("Could not parse XML config file '%s' \n",
444                 config.conffile);
445         exit (1);
446     }
447
448     config.chain = icu_chain_xml_config(xml_node, 1, &status);
449
450     if (config.chain && U_SUCCESS(status))
451         success = 1;
452     else {   
453         printf("Could not set up ICU chain from config file '%s' \n",
454                 config.conffile);
455         exit (1);
456     }
457
458     if (p_config->xmloutput)
459         fprintf(config.outfile,
460                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
461                 "<icu>\n"
462                 "<tokens>\n");
463     
464     /* read input lines for processing */
465     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
466     {
467         success = icu_chain_assign_cstr(config.chain, line, &status);
468         line_count++;
469
470         while (success && icu_chain_next_token(config.chain, &status))
471         {
472             WRBUF sw = wrbuf_alloc();
473             if (U_FAILURE(status))
474                 success = 0;
475             else {
476                 const char *sortkey = icu_chain_token_sortkey(config.chain);
477                 wrbuf_rewind(sw);
478                 wrbuf_puts_escaped(sw, sortkey);
479                 token_count++;
480                 if (p_config->xmloutput)                    
481                 {
482                     /* should XML encode this. Bug #1902 */
483                     fprintf(config.outfile, 
484                             "<token id=\"%lu\" line=\"%lu\""
485                             " norm=\"%s\" display=\"%s\" sortkey=\"%s\"/>\n",
486                             token_count,
487                             line_count,
488                             icu_chain_token_norm(config.chain),
489                             icu_chain_token_display(config.chain),
490                             wrbuf_cstr(sw));
491                 }
492                 else
493                     fprintf(config.outfile, "%lu %lu '%s' '%s' '%s'\n",
494                             token_count,
495                             line_count,
496                             icu_chain_token_norm(config.chain),
497                             icu_chain_token_display(config.chain),
498                             wrbuf_cstr(sw));
499             }
500             wrbuf_destroy(sw);
501         }
502         
503     }
504
505     if (p_config->xmloutput)
506         fprintf(config.outfile, 
507                 "</tokens>\n"
508                 "</icu>\n");
509
510     icu_chain_destroy(config.chain);
511     xmlFreeDoc(doc);
512     if (line)
513         free(line);
514 }
515
516 #endif /* YAZ_HAVE_ICU */
517
518
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line, run the text
 * through the configured ICU chain when -c was given, and print ICU
 * information when -p was given (print_info() ends the process itself).
 * Without ICU support: print a notice telling the user to install ICU
 * and rebuild.
 *
 * Returns 0 in both cases.
 */
int main(int argc, char **argv)
{

#if YAZ_HAVE_ICU

    read_params(argc, argv, &config);

    /* conffile and print are arrays, so test their content rather than
       their address (which is never null) */
    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

#else /* YAZ_HAVE_ICU */

    (void) argc;
    (void) argv;

    printf("ICU not available on your system.\n"
           "Please install libicu36-dev and icu-doc or similar, "
           "re-configure and re-compile\n");

#endif /* YAZ_HAVE_ICU */

    return 0;
}
543
544
545 /*
546  * Local variables:
547  * c-basic-offset: 4
548  * indent-tabs-mode: nil
549  * End:
550  * vim: shiftwidth=4 tabstop=8 expandtab
551  */
552