Merge branch 'master' into sru_2_0
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <errno.h>
15
16 #include <yaz/options.h>
17
18 #if YAZ_HAVE_ICU
19
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h>
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
25 #include <unicode/uclean.h>
26
27 #include <yaz/icu.h>
28 #include <yaz/wrbuf.h>
29
30 /* commando line and config parameters */
31 struct config_t {
32     char conffile[1024];
33     char print[1024];
34     int xmloutput;
35     int sortoutput;
36     int org_output;
37     yaz_icu_chain_t chain;
38     FILE * infile;
39     FILE * outfile;
40 };
41
/*
 * Print the usage summary, a few example invocations and a sample ICU chain
 * configuration to stderr, then terminate the process with exit status 1.
 *
 * p_config is accepted for interface compatibility only; it is not used.
 * This function does not return.
 */
void print_option_error(const struct config_t *p_config)
{
    (void) p_config;

    fprintf(stderr, "yaz-icu [options] [infile]\n"
            "Options:\n"
            "   -c file         XML configuration\n"
            "   -p a|c|l|t      Print ICU info \n"
            "   -s              Show sort normalization key\n"
            "   -o              Show org positions\n"
            "   -x              XML output instead of text\n"
            "\n"
            "Examples:\n"
            "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
            "./yaz-icu -p c\n"
            "./yaz-icu -p l -x\n"
            "./yaz-icu -p t -x\n"
            "\n"
            "Example ICU chain XML configuration file:\n"
            "<icu_chain locale=\"en\">\n"
            "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
            "  <tokenize rule=\"l\"/>\n"
            "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
            "  <casemap rule=\"l\"/>\n"
            "</icu_chain>\n"
          );
    exit(1);
}
68
69 void read_params(int argc, char **argv, struct config_t *p_config)
70 {
71     char *arg;
72     int ret;
73
74     /* set default parameters */
75     p_config->conffile[0] = 0;
76     p_config->print[0] = 0;
77     p_config->xmloutput = 0;
78     p_config->sortoutput = 0;
79     p_config->chain = 0;
80     p_config->infile = 0;
81     p_config->outfile = stdout;
82     p_config->org_output = 0;
83
84     /* set up command line parameters */
85
86     while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
87     {
88         switch (ret)
89         {
90         case 'c':
91             strcpy(p_config->conffile, arg);
92             break;
93         case 'p':
94             strcpy(p_config->print, arg);
95             break;
96         case 's':
97             p_config->sortoutput = 1;
98             break;
99         case 'x':
100             p_config->xmloutput = 1;
101             break;
102         case 'o':
103             p_config->org_output = 1;
104             break;
105         case 0:
106             if (p_config->infile)
107             {
108                 fprintf(stderr, "yaz-icu: only one input file may be given\n");
109                 print_option_error(p_config);
110             }
111             p_config->infile = fopen(arg, "r");
112             if (!p_config->infile)
113             {
114                 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
115                         arg, strerror(errno));
116                 exit(1);
117             }
118             break;
119         default:
120             fprintf(stderr, "yaz_icu: invalid option: %s\n", arg);
121             print_option_error(p_config);
122         }
123     }
124
125     if (p_config->infile == 0)
126         p_config->infile = stdin;
127
128     if (!strlen(p_config->conffile) && !strlen(p_config->print))
129         print_option_error(p_config);
130 }
131
132 static void print_icu_converters(const struct config_t *p_config)
133 {
134     int32_t count;
135     int32_t i;
136
137     count = ucnv_countAvailable();
138     if (p_config->xmloutput)
139         fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
140                 count, ucnv_getDefaultName());
141     else
142     {
143         fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
144         fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
145                 ucnv_getDefaultName());
146     }
147
148     for (i = 0; i < count; i++)
149     {
150         if (p_config->xmloutput)
151             fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
152                     ucnv_getAvailableName(i));
153         else
154             fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
155     }
156
157     if (p_config->xmloutput)
158         fprintf(p_config->outfile, "</converters>\n");
159     else
160         fprintf(p_config->outfile, "\n");
161 }
162
163 static void print_icu_transliterators(const struct config_t *p_config)
164 {
165     UErrorCode status;
166     UEnumeration *en = utrans_openIDs(&status);
167     int32_t count = uenum_count(en, &status);
168     const char *name;
169     int32_t length;
170
171     if (p_config->xmloutput)
172         fprintf(p_config->outfile, "<transliterators count=\"%d\">\n",  count);
173     else
174         fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
175
176     while ((name = uenum_next(en, &length, &status)))
177     {
178         if (p_config->xmloutput)
179             fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name);
180         else
181             fprintf(p_config->outfile, "%s\n", name);
182     }
183     uenum_close(en);
184     if (p_config->xmloutput)
185         fprintf(p_config->outfile, "</transliterators>\n");
186     else
187     {
188         fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n"
189                 "   Pattern         Description\n"
190                 "   Ranges          [a-z]       The lower case letters a through z\n"
191                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
192                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
193                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
194                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
195                 "\n"
196                 "   Combination     Example\n"
197                 "   Union           [[:Greek:] [:letter:]]\n"
198                 "   Intersection    [[:Greek:] & [:letter:]]\n"
199                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
200                 "   Complement      [^[:Greek:] [:letter:]]\n"
201                 "\n"
202              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
203                 "\n"
204                 "Examples:\n"
205                 "   [:Punctuation:] Any-Remove\n"
206                 "   [:Cased-Letter:] Any-Upper\n"
207                 "   [:Control:] Any-Remove\n"
208                 "   [:Decimal_Number:] Any-Remove\n"
209                 "   [:Final_Punctuation:] Any-Remove\n"
210                 "   [:Georgian:] Any-Upper\n"
211                 "   [:Katakana:] Any-Remove\n"
212                 "   [:Arabic:] Any-Remove\n"
213                 "   [:Punctuation:] Remove\n"
214                 "   [[:Punctuation:]-[.,]] Remove\n"
215                 "   [:Line_Separator:] Any-Remove\n"
216                 "   [:Math_Symbol:] Any-Remove\n"
217                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
218                 "   [:^Number:] Remove (numeric tokenization)\n"
219                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
220                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
221                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
222                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
223                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
224                 "\n"
225                 "see http://userguide.icu-project.org/transforms/general\n"
226                 "    http://www.unicode.org/reports/tr44/\n"
227             );
228
229
230         fprintf(p_config->outfile, "\n\n");
231
232     }
233 }
234
235 static void print_icu_xml_locales(const struct config_t *p_config)
236 {
237     int32_t count;
238     int32_t i;
239     UErrorCode status = U_ZERO_ERROR;
240
241     UChar keyword[64];
242     int32_t keyword_len = 0;
243     char keyword_str[128];
244     int32_t keyword_str_len = 0;
245
246     UChar language[64];
247     int32_t language_len = 0;
248     char lang_str[128];
249     int32_t lang_str_len = 0;
250
251     UChar script[64];
252     int32_t script_len = 0;
253     char script_str[128];
254     int32_t script_str_len = 0;
255
256     UChar location[64];
257     int32_t location_len = 0;
258     char location_str[128];
259     int32_t location_str_len = 0;
260
261     UChar variant[64];
262     int32_t variant_len = 0;
263     char variant_str[128];
264     int32_t variant_str_len = 0;
265
266     UChar name[64];
267     int32_t name_len = 0;
268     char name_str[128];
269     int32_t name_str_len = 0;
270
271     UChar localname[64];
272     int32_t localname_len = 0;
273     char localname_str[128];
274     int32_t localname_str_len = 0;
275
276     count = uloc_countAvailable() ;
277
278     if (p_config->xmloutput)
279     {
280         fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
281                 count, uloc_getDefault(), ucol_countAvailable());
282     }
283     else
284     {
285         fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
286         fprintf(p_config->outfile, "Default locale is: %s\n",  uloc_getDefault());
287     }
288
289     for (i = 0; i < count; i++)
290     {
291
292         keyword_len
293             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
294                                      keyword, 64,
295                                      &status);
296
297         u_strToUTF8(keyword_str, 128, &keyword_str_len,
298                     keyword, keyword_len,
299                     &status);
300
301
302         language_len
303             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
304                                       language, 64,
305                                       &status);
306
307         u_strToUTF8(lang_str, 128, &lang_str_len,
308                     language, language_len,
309                     &status);
310
311
312         script_len
313             = uloc_getDisplayScript(uloc_getAvailable(i), "en",
314                                     script, 64,
315                                     &status);
316
317         u_strToUTF8(script_str, 128, &script_str_len,
318                     script, script_len,
319                     &status);
320
321         location_len
322             = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
323                                      location, 64,
324                                      &status);
325
326         u_strToUTF8(location_str, 128, &location_str_len,
327                     location, location_len,
328                     &status);
329
330         variant_len
331             = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
332                                      variant, 64,
333                                      &status);
334
335         u_strToUTF8(variant_str, 128, &variant_str_len,
336                     variant, variant_len,
337                     &status);
338
339         name_len
340             = uloc_getDisplayName(uloc_getAvailable(i), "en",
341                                   name, 64,
342                                   &status);
343
344         u_strToUTF8(name_str, 128, &name_str_len,
345                     name, name_len,
346                     &status);
347
348         localname_len
349             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
350                                   localname, 64,
351                                   &status);
352
353         u_strToUTF8(localname_str, 128, &localname_str_len,
354                     localname, localname_len,
355                     &status);
356
357
358         if (p_config->xmloutput)
359         {
360             fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
361             if (strlen(lang_str))
362                 fprintf(p_config->outfile, " language=\"%s\"", lang_str);
363             if (strlen(script_str))
364                 fprintf(p_config->outfile, " script=\"%s\"", script_str);
365             if (strlen(location_str))
366                 fprintf(p_config->outfile, " location=\"%s\"", location_str);
367             if (strlen(variant_str))
368                 fprintf(p_config->outfile, " variant=\"%s\"", variant_str);
369             if (strlen(name_str))
370                 fprintf(p_config->outfile, " name=\"%s\"", name_str);
371             if (strlen(localname_str))
372                 fprintf(p_config->outfile, " localname=\"%s\"", localname_str);
373             fprintf(p_config->outfile, ">");
374             if (strlen(localname_str))
375                 fprintf(p_config->outfile, "%s", localname_str);
376             fprintf(p_config->outfile, "</locale>\n");
377         }
378         else if (1 == p_config->xmloutput)
379         {
380             fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
381             fprintf(p_config->outfile, " | ");
382             if (strlen(name_str))
383                 fprintf(p_config->outfile, "%s", name_str);
384             fprintf(p_config->outfile, " | ");
385             if (strlen(localname_str))
386                 fprintf(p_config->outfile, "%s", localname_str);
387             fprintf(p_config->outfile, "\n");
388         }
389         else
390             fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i));
391     }
392     if (p_config->xmloutput)
393         fprintf(p_config->outfile, "</locales>\n");
394     else
395         fprintf(p_config->outfile, "\n");
396
397     if (U_FAILURE(status))
398     {
399         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
400         exit(2);
401     }
402 }
403
404
405 static void print_info(const struct config_t *p_config)
406 {
407     if (p_config->xmloutput)
408         fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
409                 "<icu>\n");
410
411     if ('c' == p_config->print[0])
412         print_icu_converters(p_config);
413     else if ('l' == p_config->print[0])
414         print_icu_xml_locales(p_config);
415     else if ('t' == p_config->print[0])
416         print_icu_transliterators(p_config);
417     else {
418         print_icu_converters(p_config);
419         print_icu_xml_locales(p_config);
420         print_icu_transliterators(p_config);
421     }
422
423     if (p_config->xmloutput)
424         fprintf(p_config->outfile, "</icu>\n");
425
426     exit(0);
427 }
428
429
430
431 static void process_text_file(struct config_t *p_config)
432 {
433     char *line = 0;
434     char linebuf[1024];
435
436     xmlDoc *doc = xmlParseFile(p_config->conffile);
437     xmlNode *xml_node = xmlDocGetRootElement(doc);
438
439     long unsigned int token_count = 0;
440     long unsigned int line_count = 0;
441
442     UErrorCode status = U_ZERO_ERROR;
443
444     if (!xml_node)
445     {
446         printf("Could not parse XML config file '%s' \n",
447                 p_config->conffile);
448         exit(1);
449     }
450
451     p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
452
453     if (!p_config->chain || !U_SUCCESS(status))
454     {
455         printf("Could not set up ICU chain from config file '%s' \n",
456                 p_config->conffile);
457         if (!U_SUCCESS(status))
458             printf("ICU Error: %d %s\n", status, u_errorName(status));
459         exit(1);
460     }
461
462     if (p_config->xmloutput)
463         fprintf(p_config->outfile,
464                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
465                 "<icu>\n"
466                 "<tokens>\n");
467
468     /* read input lines for processing */
469     while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
470     {
471         WRBUF sw = wrbuf_alloc();
472         WRBUF cdata = wrbuf_alloc();
473         int success = icu_chain_assign_cstr(p_config->chain, line, &status);
474         line_count++;
475
476         while (success && icu_chain_next_token(p_config->chain, &status))
477         {
478             if (U_FAILURE(status))
479                 success = 0;
480             else
481             {
482                 size_t start, len;
483                 const char *sortkey = icu_chain_token_sortkey(p_config->chain);
484
485                 icu_chain_get_org_info(p_config->chain, &start, &len);
486                 wrbuf_rewind(sw);
487                 wrbuf_puts_escaped(sw, sortkey);
488                 token_count++;
489                 if (p_config->xmloutput)
490                 {
491                     fprintf(p_config->outfile,
492                             "<token id=\"%lu\" line=\"%lu\"",
493                             token_count, line_count);
494
495                     wrbuf_rewind(cdata);
496                     wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
497                     fprintf(p_config->outfile, " norm=\"%s\"",
498                             wrbuf_cstr(cdata));
499
500                     wrbuf_rewind(cdata);
501                     wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
502                     fprintf(p_config->outfile, " display=\"%s\"",
503                             wrbuf_cstr(cdata));
504
505                     if (p_config->sortoutput)
506                     {
507                         wrbuf_rewind(cdata);
508                         wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
509                         fprintf(p_config->outfile, " sortkey=\"%s\"",
510                                 wrbuf_cstr(cdata));
511                     }
512                     fprintf(p_config->outfile, "/>\n");
513                 }
514                 else
515                 {
516                     fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
517                             token_count,
518                             line_count,
519                             icu_chain_token_norm(p_config->chain),
520                             icu_chain_token_display(p_config->chain));
521                     if (p_config->sortoutput)
522                     {
523                         fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
524                     }
525                     if (p_config->org_output)
526                     {
527                         fprintf(p_config->outfile, " %ld+%ld",
528                                 (long) start, (long) len);
529                     }
530                     fprintf(p_config->outfile, "\n");
531                 }
532             }
533         }
534         wrbuf_destroy(sw);
535         wrbuf_destroy(cdata);
536     }
537
538     if (p_config->xmloutput)
539         fprintf(p_config->outfile,
540                 "</tokens>\n"
541                 "</icu>\n");
542
543     icu_chain_destroy(p_config->chain);
544     xmlFreeDoc(doc);
545     if (line)
546         free(line);
547 }
548
549 #endif /* YAZ_HAVE_ICU */
550
551
/*
 * Program entry point.
 *
 * With ICU support: parse the command line, tokenize the input when a chain
 * configuration was given with -c, then print ICU information when -p was
 * given (print_info() ends the process itself). Returns 0.
 *
 * Without ICU support: print a notice and exit with status 3.
 */
int main(int argc, char **argv)
{
#if YAZ_HAVE_ICU
    struct config_t config;

    read_params(argc, argv, &config);

    /* both members are arrays, so only their content can be tested */
    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

    u_cleanup();
#else /* YAZ_HAVE_ICU */

    (void) argc;
    (void) argv;

    printf("ICU not available on your system.\n"
           "Please install libicu-dev and icu-doc or similar, "
           "re-configure and re-compile\n");


    exit(3);
#endif /* YAZ_HAVE_ICU */

    return 0;
}
578
579
580 /*
581  * Local variables:
582  * c-basic-offset: 4
583  * c-file-style: "Stroustrup"
584  * indent-tabs-mode: nil
585  * End:
586  * vim: shiftwidth=4 tabstop=8 expandtab
587  */
588