4f0368e3d579e401447a14a984a53e9d46ac5bf3
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: yaz-icu.c,v 1.16 2008-01-14 22:58:06 adam Exp $
6  */
7
8 #if HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
12 #include <string.h>
13
14 #include <stdio.h>
15 #include <stdlib.h>
16
17 #include <yaz/options.h>
18
19 #if YAZ_HAVE_ICU
20
21 #include <unicode/ucnv.h>
22 #include <unicode/ustring.h>
23 #include <unicode/ucol.h> 
24 #include <unicode/ubrk.h>
25 #include <unicode/utrans.h>
26
27 #include <yaz/icu.h>
28 #include <yaz/wrbuf.h>
29
30 /* commando line and config parameters */
31 static struct config_t { 
32     char conffile[1024];
33     char print[1024];
34     int xmloutput;
35     yaz_icu_chain_t chain;
36     FILE * infile;
37     FILE * outfile;
38 } config;
39
40
41   
/*
 * Write the option synopsis, a few invocation examples and a sample ICU
 * chain configuration to stderr, then terminate the process with exit
 * status 1.  This function never returns.
 *
 * p_config is accepted for symmetry with the other helpers but is not read.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "Calling error, valid options are :\n"
        "yaz-icu\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain locale=\"en\">\n"
        "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <casemap rule=\"l\"/>\n"
        "</icu_chain>\n";

    (void) p_config;

    fputs(usage_text, stderr);
    exit(1);
}
66
67 void read_params(int argc, char **argv, struct config_t *p_config)
68 {    
69     char *arg;
70     int ret;
71     
72     /* set default parameters */
73     p_config->conffile[0] = 0;
74     p_config->print[0] = 0;
75     p_config->xmloutput = 0;
76     p_config->chain = 0;
77     p_config->infile = stdin;
78     p_config->outfile = stdout;
79     
80     /* set up command line parameters */
81     
82     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
83     {
84         switch (ret)
85         {
86         case 'c':
87             strcpy(p_config->conffile, arg);
88             break;
89         case 'p':
90             strcpy(p_config->print, arg);
91             break;
92         case 'x':
93             p_config->xmloutput = 1;
94             break;
95         default:
96             print_option_error(p_config);
97         }
98     }
99     
100     if ((!strlen(p_config->conffile)
101          && !strlen(p_config->print))
102         || !config.infile
103         || !config.outfile)
104         
105         print_option_error(p_config);
106 }
107
108
109 /*     UConverter *conv; */
110 /*     conv = ucnv_open("utf-8", &status); */
111 /*     assert(U_SUCCESS(status)); */
112
113 /*     *ustr16_len  */
114 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
115 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
116 /*                       &status); */
117   
118
119
120 /*      ucnv_fromUChars(conv, */
121 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
122 /*                      ustr16, *ustr16_len, */
123 /*                      &status); */
124 /*      ucnv_close(conv); */
125
126
127 static void print_icu_converters(const struct config_t *p_config)
128 {
129     int32_t count;
130     int32_t i;
131
132     count = ucnv_countAvailable();
133     if (p_config->xmloutput)
134         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
135                 count, ucnv_getDefaultName());
136     else {    
137         fprintf(config.outfile, "Available ICU converters: %d\n", count);
138         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
139                 ucnv_getDefaultName());
140     }
141     
142     for(i=0;i<count;i++)
143     {
144         if (p_config->xmloutput)
145             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
146                     ucnv_getAvailableName(i));
147         else     
148             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
149     }
150     
151     if (p_config->xmloutput)
152         fprintf(config.outfile, "</converters>\n");
153     else
154         fprintf(config.outfile, "\n");
155 }
156
157 static void print_icu_transliterators(const struct config_t *p_config)
158 {
159     int32_t buf_cap = 128;
160     char buf[128];
161     int32_t i;
162     int32_t count = utrans_countAvailableIDs();
163     
164     if (p_config->xmloutput)
165         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
166     else 
167         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
168     
169     for(i = 0; i <count; i++)
170     {
171         utrans_getAvailableID(i, buf, buf_cap);
172         if (p_config->xmloutput)
173             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
174         else
175             fprintf(config.outfile, " %s", buf);
176     }
177     
178     if (p_config->xmloutput)
179     {
180         fprintf(config.outfile, "</transliterators>\n");
181     }
182     else
183     {
184         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
185                 "   Pattern         Description\n"
186                 "   Ranges          [a-z]       The lower case letters a through z\n"
187                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
188                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
189                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
190                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
191                 "\n"
192                 "   Combination     Example\n"
193                 "   Union           [[:Greek:] [:letter:]]\n"
194                 "   Intersection    [[:Greek:] & [:letter:]]\n"
195                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
196                 "   Complement      [^[:Greek:] [:letter:]]\n"
197                 "\n"
198              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
199                 "\n"
200                 "Examples:\n"
201                 "   [:Punctuation:] Any-Remove\n"
202                 "   [:Cased-Letter:] Any-Upper\n"
203                 "   [:Control:] Any-Remove\n"
204                 "   [:Decimal_Number:] Any-Remove\n"
205                 "   [:Final_Punctuation:] Any-Remove\n"
206                 "   [:Georgian:] Any-Upper\n"
207                 "   [:Katakana:] Any-Remove\n"
208                 "   [:Arabic:] Any-Remove\n"
209                 "   [:Punctuation:] Remove\n"
210                 "   [[:Punctuation:]-[.,]] Remove\n"
211                 "   [:Line_Separator:] Any-Remove\n"
212                 "   [:Math_Symbol:] Any-Remove\n"
213                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
214                 "   [:^Number:] Remove (numeric tokenization)\n"
215                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
216                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
217                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
218                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
219                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
220                 "\n"
221                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
222                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
223                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
224                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
225             );
226         
227         
228         fprintf(config.outfile, "\n\n");
229         
230     }
231 }
232
233 static void print_icu_xml_locales(const struct config_t *p_config)
234 {
235     int32_t count;
236     int32_t i;
237     UErrorCode status = U_ZERO_ERROR;
238     
239     UChar keyword[64];
240     int32_t keyword_len = 0;
241     char keyword_str[128];
242     int32_t keyword_str_len = 0;
243
244     UChar language[64];
245     int32_t language_len = 0;
246     char lang_str[128];
247     int32_t lang_str_len = 0;
248
249     UChar script[64];
250     int32_t script_len = 0;
251     char script_str[128];
252     int32_t script_str_len = 0;
253
254     UChar location[64];
255     int32_t location_len = 0;
256     char location_str[128];
257     int32_t location_str_len = 0;
258
259     UChar variant[64];
260     int32_t variant_len = 0;
261     char variant_str[128];
262     int32_t variant_str_len = 0;
263
264     UChar name[64];
265     int32_t name_len = 0;
266     char name_str[128];
267     int32_t name_str_len = 0;
268
269     UChar localname[64];
270     int32_t localname_len = 0;
271     char localname_str[128];
272     int32_t localname_str_len = 0;
273
274     count = uloc_countAvailable() ;
275
276     if (p_config->xmloutput)
277     {
278         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
279                 count, uloc_getDefault(), ucol_countAvailable());
280     }
281   
282     for(i=0;i<count;i++) 
283     {
284
285         keyword_len 
286             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
287                                      keyword, 64, 
288                                      &status);
289
290         u_strToUTF8(keyword_str, 128, &keyword_str_len,
291                     keyword, keyword_len,
292                     &status);
293     
294     
295         language_len 
296             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
297                                       language, 64, 
298                                       &status);
299
300         u_strToUTF8(lang_str, 128, &lang_str_len,
301                     language, language_len,
302                     &status);
303
304
305         script_len 
306             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
307                                     script, 64, 
308                                     &status);
309
310         u_strToUTF8(script_str, 128, &script_str_len,
311                     script, script_len,
312                     &status);
313
314         location_len 
315             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
316                                      location, 64, 
317                                      &status);
318
319         u_strToUTF8(location_str, 128, &location_str_len,
320                     location, location_len,
321                     &status);
322
323         variant_len 
324             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
325                                      variant, 64, 
326                                      &status);
327
328         u_strToUTF8(variant_str, 128, &variant_str_len,
329                     variant, variant_len,
330                     &status);
331
332         name_len 
333             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
334                                   name, 64, 
335                                   &status);
336
337         u_strToUTF8(name_str, 128, &name_str_len,
338                     name, name_len,
339                     &status);
340
341         localname_len 
342             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
343                                   localname, 64, 
344                                   &status);
345
346         u_strToUTF8(localname_str, 128, &localname_str_len,
347                     localname, localname_len,
348                     &status);
349
350
351         if (p_config->xmloutput)
352         {
353             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
354             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
355             /* if (strlen(keyword_str)) */
356             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
357             /* if (ucol_getAvailable(i)) */
358             /*   fprintf(config.outfile, " collation=\"1\""); */
359             if (strlen(lang_str))
360                 fprintf(config.outfile, " language=\"%s\"", lang_str);
361             if (strlen(script_str))
362                 fprintf(config.outfile, " script=\"%s\"", script_str);
363             if (strlen(location_str))
364                 fprintf(config.outfile, " location=\"%s\"", location_str);
365             if (strlen(variant_str))
366                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
367             if (strlen(name_str))
368                 fprintf(config.outfile, " name=\"%s\"", name_str);
369             if (strlen(localname_str))
370                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
371             fprintf(config.outfile, ">");
372             if (strlen(localname_str))
373                 fprintf(config.outfile, "%s", localname_str);
374             fprintf(config.outfile, "</locale>\n"); 
375         }
376         else if (1 == p_config->xmloutput)
377         {
378             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
379             fprintf(config.outfile, " | ");
380             if (strlen(name_str))
381                 fprintf(config.outfile, "%s", name_str);
382             fprintf(config.outfile, " | ");
383             if (strlen(localname_str))
384                 fprintf(config.outfile, "%s", localname_str);
385             fprintf(config.outfile, "\n");
386         }
387         else
388             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
389     }
390     if (p_config->xmloutput)
391         fprintf(config.outfile, "</locales>\n");
392     else
393         fprintf(config.outfile, "\n");
394
395     if(U_FAILURE(status))
396     {
397         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
398         exit(status);
399     }
400 }
401
402
403 static void print_info(const struct config_t *p_config)
404 {
405     if (p_config->xmloutput)
406         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
407                 "<icu>\n");
408
409     if ('c' == config.print[0])
410         print_icu_converters(&config);
411     else if ('l' == config.print[0])
412         print_icu_xml_locales(&config);
413     else if ('t' == config.print[0])
414         print_icu_transliterators(&config);
415     else {
416         print_icu_converters(&config);
417         print_icu_xml_locales(&config);
418         print_icu_transliterators(&config);
419     }
420
421     if (p_config->xmloutput)
422         fprintf(config.outfile, "</icu>\n");
423
424     exit(0);
425 }
426
427
428
429 static void process_text_file(const struct config_t *p_config)
430 {
431     char *line = 0;
432     char linebuf[1024];
433  
434     xmlDoc *doc = xmlParseFile(config.conffile);  
435     xmlNode *xml_node = xmlDocGetRootElement(doc);
436
437     long unsigned int token_count = 0;    
438     long unsigned int line_count = 0;    
439     
440     UErrorCode status = U_ZERO_ERROR;
441     int success = 0;
442     
443     if (! xml_node)
444     {   
445         printf("Could not parse XML config file '%s' \n",
446                 config.conffile);
447         exit (1);
448     }
449
450     config.chain = icu_chain_xml_config(xml_node, 1, &status);
451
452     if (config.chain && U_SUCCESS(status))
453         success = 1;
454     else {   
455         printf("Could not set up ICU chain from config file '%s' \n",
456                 config.conffile);
457         exit (1);
458     }
459
460     if (p_config->xmloutput)
461         fprintf(config.outfile,
462                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
463                 "<icu>\n"
464                 "<tokens>\n");
465     
466     /* read input lines for processing */
467     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
468     {
469         success = icu_chain_assign_cstr(config.chain, line, &status);
470         line_count++;
471
472         while (success && icu_chain_next_token(config.chain, &status))
473         {
474             WRBUF sw = wrbuf_alloc();
475             if (U_FAILURE(status))
476                 success = 0;
477             else {
478                 const char *sortkey = icu_chain_token_sortkey(config.chain);
479                 wrbuf_rewind(sw);
480                 wrbuf_puts_escaped(sw, sortkey);
481                 token_count++;
482                 if (p_config->xmloutput)                    
483                 {
484                     /* should XML encode this. Bug #1902 */
485                     fprintf(config.outfile, 
486                             "<token id=\"%lu\" line=\"%lu\""
487                             " norm=\"%s\" display=\"%s\" sortkey=\"%s\"/>\n",
488                             token_count,
489                             line_count,
490                             icu_chain_token_norm(config.chain),
491                             icu_chain_token_display(config.chain),
492                             wrbuf_cstr(sw));
493                 }
494                 else
495                     fprintf(config.outfile, "%lu %lu '%s' '%s' '%s'\n",
496                             token_count,
497                             line_count,
498                             icu_chain_token_norm(config.chain),
499                             icu_chain_token_display(config.chain),
500                             wrbuf_cstr(sw));
501             }
502             wrbuf_destroy(sw);
503         }
504         
505     }
506
507     if (p_config->xmloutput)
508         fprintf(config.outfile, 
509                 "</tokens>\n"
510                 "</icu>\n");
511
512     icu_chain_destroy(config.chain);
513     xmlFreeDoc(doc);
514     if (line)
515         free(line);
516 }
517
518 #endif /* YAZ_HAVE_ICU */
519
520
/*
 * Program entry point.
 *
 * With ICU support compiled in, the command line is parsed into the global
 * config; a configuration file given with -c triggers token processing of
 * the input, and a selector given with -p triggers the ICU information
 * listing.  Without ICU support only a notice is printed.
 * Returns 0 whenever control reaches the end of main.
 */
int main(int argc, char **argv)
{

#if YAZ_HAVE_ICU

    read_params(argc, argv, &config);

    /* conffile and print are arrays, so only their content can be tested */
    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

#else /* YAZ_HAVE_ICU */

    (void) argc;
    (void) argv;

    printf("ICU not available on your system.\n"
           "Please install libicu36-dev and icu-doc or similar, "
           "re-configure and re-compile\n");

#endif /* YAZ_HAVE_ICU */

    return 0;
}
545
546
547 /*
548  * Local variables:
549  * c-basic-offset: 4
550  * indent-tabs-mode: nil
551  * End:
552  * vim: shiftwidth=4 tabstop=8 expandtab
553  */
554