No more manifest files
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <errno.h>
15
16 #include <yaz/options.h>
17
18 #if YAZ_HAVE_ICU
19
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h>
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
25 #include <unicode/uclean.h>
26
27 #include <yaz/icu.h>
28 #include <yaz/wrbuf.h>
29 #include <yaz/backtrace.h>
30
31 /* commando line and config parameters */
32 struct config_t {
33     char conffile[1024];
34     char print[1024];
35     int xmloutput;
36     int sortoutput;
37     int org_output;
38     yaz_icu_chain_t chain;
39     FILE * infile;
40     FILE * outfile;
41 };
42
/*
 * Print the usage text, a few example invocations and a sample ICU chain
 * configuration to stderr, then terminate the process with exit status 1.
 *
 * p_config is accepted for interface compatibility and is not used.
 * This function does not return.
 */
void print_option_error(const struct config_t *p_config)
{
    (void) p_config;

    fprintf(stderr, "yaz-icu [options] [infile]\n"
            "Options:\n"
            "   -c file         XML configuration\n"
            "   -p a|c|l|t      Print ICU info \n"
            "   -s              Show sort normalization key\n"
            "   -o              Show org positions\n"
            "   -x              XML output instead of text\n"
            "\n"
            "Examples:\n"
            "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
            "./yaz-icu -p c\n"
            "./yaz-icu -p l -x\n"
            "./yaz-icu -p t -x\n"
            "\n"
            "Example ICU chain XML configuration file:\n"
            "<icu_chain locale=\"en\">\n"
            "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
            "  <tokenize rule=\"l\"/>\n"
            "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
            "  <casemap rule=\"l\"/>\n"
            "</icu_chain>\n"
          );
    exit(1);
}
69
70 void read_params(int argc, char **argv, struct config_t *p_config)
71 {
72     char *arg;
73     int ret;
74
75     /* set default parameters */
76     p_config->conffile[0] = 0;
77     p_config->print[0] = 0;
78     p_config->xmloutput = 0;
79     p_config->sortoutput = 0;
80     p_config->chain = 0;
81     p_config->infile = 0;
82     p_config->outfile = stdout;
83     p_config->org_output = 0;
84
85     /* set up command line parameters */
86
87     while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
88     {
89         switch (ret)
90         {
91         case 'c':
92             strcpy(p_config->conffile, arg);
93             break;
94         case 'p':
95             strcpy(p_config->print, arg);
96             break;
97         case 's':
98             p_config->sortoutput = 1;
99             break;
100         case 'x':
101             p_config->xmloutput = 1;
102             break;
103         case 'o':
104             p_config->org_output = 1;
105             break;
106         case 0:
107             if (p_config->infile)
108             {
109                 fprintf(stderr, "yaz-icu: only one input file may be given\n");
110                 print_option_error(p_config);
111             }
112             p_config->infile = fopen(arg, "r");
113             if (!p_config->infile)
114             {
115                 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
116                         arg, strerror(errno));
117                 exit(1);
118             }
119             break;
120         default:
121             fprintf(stderr, "yaz_icu: invalid option: %s\n", arg);
122             print_option_error(p_config);
123         }
124     }
125
126     if (p_config->infile == 0)
127         p_config->infile = stdin;
128
129     if (!strlen(p_config->conffile) && !strlen(p_config->print))
130         print_option_error(p_config);
131 }
132
133 static void print_icu_converters(const struct config_t *p_config)
134 {
135     int32_t count;
136     int32_t i;
137
138     count = ucnv_countAvailable();
139     if (p_config->xmloutput)
140         fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
141                 count, ucnv_getDefaultName());
142     else
143     {
144         fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
145         fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
146                 ucnv_getDefaultName());
147     }
148
149     for (i = 0; i < count; i++)
150     {
151         if (p_config->xmloutput)
152             fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
153                     ucnv_getAvailableName(i));
154         else
155             fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
156     }
157
158     if (p_config->xmloutput)
159         fprintf(p_config->outfile, "</converters>\n");
160     else
161         fprintf(p_config->outfile, "\n");
162 }
163
164 static void print_icu_transliterators(const struct config_t *p_config)
165 {
166     UErrorCode status;
167     UEnumeration *en = utrans_openIDs(&status);
168     int32_t count = uenum_count(en, &status);
169     const char *name;
170     int32_t length;
171
172     if (p_config->xmloutput)
173         fprintf(p_config->outfile, "<transliterators count=\"%d\">\n",  count);
174     else
175         fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
176
177     while ((name = uenum_next(en, &length, &status)))
178     {
179         if (p_config->xmloutput)
180             fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name);
181         else
182             fprintf(p_config->outfile, "%s\n", name);
183     }
184     uenum_close(en);
185     if (p_config->xmloutput)
186         fprintf(p_config->outfile, "</transliterators>\n");
187     else
188     {
189         fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n"
190                 "   Pattern         Description\n"
191                 "   Ranges          [a-z]       The lower case letters a through z\n"
192                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
193                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
194                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
195                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
196                 "\n"
197                 "   Combination     Example\n"
198                 "   Union           [[:Greek:] [:letter:]]\n"
199                 "   Intersection    [[:Greek:] & [:letter:]]\n"
200                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
201                 "   Complement      [^[:Greek:] [:letter:]]\n"
202                 "\n"
203              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
204                 "\n"
205                 "Examples:\n"
206                 "   [:Punctuation:] Any-Remove\n"
207                 "   [:Cased-Letter:] Any-Upper\n"
208                 "   [:Control:] Any-Remove\n"
209                 "   [:Decimal_Number:] Any-Remove\n"
210                 "   [:Final_Punctuation:] Any-Remove\n"
211                 "   [:Georgian:] Any-Upper\n"
212                 "   [:Katakana:] Any-Remove\n"
213                 "   [:Arabic:] Any-Remove\n"
214                 "   [:Punctuation:] Remove\n"
215                 "   [[:Punctuation:]-[.,]] Remove\n"
216                 "   [:Line_Separator:] Any-Remove\n"
217                 "   [:Math_Symbol:] Any-Remove\n"
218                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
219                 "   [:^Number:] Remove (numeric tokenization)\n"
220                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
221                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
222                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
223                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
224                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
225                 "\n"
226                 "see http://userguide.icu-project.org/transforms/general\n"
227                 "    http://www.unicode.org/reports/tr44/\n"
228             );
229
230
231         fprintf(p_config->outfile, "\n\n");
232
233     }
234 }
235
236 static void print_icu_xml_locales(const struct config_t *p_config)
237 {
238     int32_t count;
239     int32_t i;
240     UErrorCode status = U_ZERO_ERROR;
241
242     UChar keyword[64];
243     int32_t keyword_len = 0;
244     char keyword_str[128];
245     int32_t keyword_str_len = 0;
246
247     UChar language[64];
248     int32_t language_len = 0;
249     char lang_str[128];
250     int32_t lang_str_len = 0;
251
252     UChar script[64];
253     int32_t script_len = 0;
254     char script_str[128];
255     int32_t script_str_len = 0;
256
257     UChar location[64];
258     int32_t location_len = 0;
259     char location_str[128];
260     int32_t location_str_len = 0;
261
262     UChar variant[64];
263     int32_t variant_len = 0;
264     char variant_str[128];
265     int32_t variant_str_len = 0;
266
267     UChar name[64];
268     int32_t name_len = 0;
269     char name_str[128];
270     int32_t name_str_len = 0;
271
272     UChar localname[64];
273     int32_t localname_len = 0;
274     char localname_str[128];
275     int32_t localname_str_len = 0;
276
277     count = uloc_countAvailable() ;
278
279     if (p_config->xmloutput)
280     {
281         fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
282                 count, uloc_getDefault(), ucol_countAvailable());
283     }
284     else
285     {
286         fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
287         fprintf(p_config->outfile, "Default locale is: %s\n",  uloc_getDefault());
288     }
289
290     for (i = 0; i < count; i++)
291     {
292
293         keyword_len
294             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
295                                      keyword, 64,
296                                      &status);
297
298         u_strToUTF8(keyword_str, 128, &keyword_str_len,
299                     keyword, keyword_len,
300                     &status);
301
302
303         language_len
304             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
305                                       language, 64,
306                                       &status);
307
308         u_strToUTF8(lang_str, 128, &lang_str_len,
309                     language, language_len,
310                     &status);
311
312
313         script_len
314             = uloc_getDisplayScript(uloc_getAvailable(i), "en",
315                                     script, 64,
316                                     &status);
317
318         u_strToUTF8(script_str, 128, &script_str_len,
319                     script, script_len,
320                     &status);
321
322         location_len
323             = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
324                                      location, 64,
325                                      &status);
326
327         u_strToUTF8(location_str, 128, &location_str_len,
328                     location, location_len,
329                     &status);
330
331         variant_len
332             = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
333                                      variant, 64,
334                                      &status);
335
336         u_strToUTF8(variant_str, 128, &variant_str_len,
337                     variant, variant_len,
338                     &status);
339
340         name_len
341             = uloc_getDisplayName(uloc_getAvailable(i), "en",
342                                   name, 64,
343                                   &status);
344
345         u_strToUTF8(name_str, 128, &name_str_len,
346                     name, name_len,
347                     &status);
348
349         localname_len
350             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
351                                   localname, 64,
352                                   &status);
353
354         u_strToUTF8(localname_str, 128, &localname_str_len,
355                     localname, localname_len,
356                     &status);
357
358
359         if (p_config->xmloutput)
360         {
361             fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
362             if (strlen(lang_str))
363                 fprintf(p_config->outfile, " language=\"%s\"", lang_str);
364             if (strlen(script_str))
365                 fprintf(p_config->outfile, " script=\"%s\"", script_str);
366             if (strlen(location_str))
367                 fprintf(p_config->outfile, " location=\"%s\"", location_str);
368             if (strlen(variant_str))
369                 fprintf(p_config->outfile, " variant=\"%s\"", variant_str);
370             if (strlen(name_str))
371                 fprintf(p_config->outfile, " name=\"%s\"", name_str);
372             if (strlen(localname_str))
373                 fprintf(p_config->outfile, " localname=\"%s\"", localname_str);
374             fprintf(p_config->outfile, ">");
375             if (strlen(localname_str))
376                 fprintf(p_config->outfile, "%s", localname_str);
377             fprintf(p_config->outfile, "</locale>\n");
378         }
379         else if (1 == p_config->xmloutput)
380         {
381             fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
382             fprintf(p_config->outfile, " | ");
383             if (strlen(name_str))
384                 fprintf(p_config->outfile, "%s", name_str);
385             fprintf(p_config->outfile, " | ");
386             if (strlen(localname_str))
387                 fprintf(p_config->outfile, "%s", localname_str);
388             fprintf(p_config->outfile, "\n");
389         }
390         else
391             fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i));
392     }
393     if (p_config->xmloutput)
394         fprintf(p_config->outfile, "</locales>\n");
395     else
396         fprintf(p_config->outfile, "\n");
397
398     if (U_FAILURE(status))
399     {
400         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
401         exit(2);
402     }
403 }
404
405
406 static void print_info(const struct config_t *p_config)
407 {
408     if (p_config->xmloutput)
409         fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
410                 "<icu>\n");
411
412     if ('c' == p_config->print[0])
413         print_icu_converters(p_config);
414     else if ('l' == p_config->print[0])
415         print_icu_xml_locales(p_config);
416     else if ('t' == p_config->print[0])
417         print_icu_transliterators(p_config);
418     else {
419         print_icu_converters(p_config);
420         print_icu_xml_locales(p_config);
421         print_icu_transliterators(p_config);
422     }
423
424     if (p_config->xmloutput)
425         fprintf(p_config->outfile, "</icu>\n");
426
427     exit(0);
428 }
429
430
431
432 static void process_text_file(struct config_t *p_config)
433 {
434     char *line = 0;
435     char linebuf[1024];
436
437     xmlDoc *doc = xmlParseFile(p_config->conffile);
438     xmlNode *xml_node = xmlDocGetRootElement(doc);
439
440     long unsigned int token_count = 0;
441     long unsigned int line_count = 0;
442
443     UErrorCode status = U_ZERO_ERROR;
444
445     if (!xml_node)
446     {
447         printf("Could not parse XML config file '%s' \n",
448                 p_config->conffile);
449         exit(1);
450     }
451
452     p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
453
454     if (!p_config->chain || !U_SUCCESS(status))
455     {
456         printf("Could not set up ICU chain from config file '%s' \n",
457                 p_config->conffile);
458         exit(1);
459     }
460
461     if (p_config->xmloutput)
462         fprintf(p_config->outfile,
463                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
464                 "<icu>\n"
465                 "<tokens>\n");
466
467     /* read input lines for processing */
468     while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
469     {
470         WRBUF sw = wrbuf_alloc();
471         WRBUF cdata = wrbuf_alloc();
472         int success = icu_chain_assign_cstr(p_config->chain, line, &status);
473         line_count++;
474
475         while (success && icu_chain_next_token(p_config->chain, &status))
476         {
477             if (U_FAILURE(status))
478                 success = 0;
479             else
480             {
481                 size_t start, len;
482                 const char *org_string = 0;
483                 const char *sortkey = icu_chain_token_sortkey(p_config->chain);
484
485                 icu_chain_get_org_info2(p_config->chain, &start, &len,
486                                         &org_string);
487                 wrbuf_rewind(sw);
488                 wrbuf_puts_escaped(sw, sortkey);
489                 token_count++;
490                 if (p_config->xmloutput)
491                 {
492                     fprintf(p_config->outfile,
493                             "<token id=\"%lu\" line=\"%lu\"",
494                             token_count, line_count);
495
496                     wrbuf_rewind(cdata);
497                     wrbuf_xmlputs(cdata, icu_chain_token_norm(p_config->chain));
498                     fprintf(p_config->outfile, " norm=\"%s\"",
499                             wrbuf_cstr(cdata));
500
501                     wrbuf_rewind(cdata);
502                     wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
503                     fprintf(p_config->outfile, " display=\"%s\"",
504                             wrbuf_cstr(cdata));
505
506                     if (p_config->sortoutput)
507                     {
508                         wrbuf_rewind(cdata);
509                         wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
510                         fprintf(p_config->outfile, " sortkey=\"%s\"",
511                                 wrbuf_cstr(cdata));
512                     }
513                     fprintf(p_config->outfile, "/>\n");
514                 }
515                 else
516                 {
517                     fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
518                             token_count,
519                             line_count,
520                             icu_chain_token_norm(p_config->chain),
521                             icu_chain_token_display(p_config->chain));
522                     if (p_config->sortoutput)
523                     {
524                         fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
525                     }
526                     if (p_config->org_output)
527                     {
528                         fprintf(p_config->outfile, " %ld+%ld",
529                                 (long) start, (long) len);
530                         fputc(' ', p_config->outfile);
531                         fwrite(org_string, 1, start, p_config->outfile);
532                         fputc('*', p_config->outfile);
533                         fwrite(org_string + start, 1, len, p_config->outfile);
534                         fputc('*', p_config->outfile);
535                         fputs(org_string + start + len, p_config->outfile);
536                     }
537                     fprintf(p_config->outfile, "\n");
538                 }
539             }
540         }
541         wrbuf_destroy(sw);
542         wrbuf_destroy(cdata);
543     }
544
545     if (p_config->xmloutput)
546         fprintf(p_config->outfile,
547                 "</tokens>\n"
548                 "</icu>\n");
549
550     icu_chain_destroy(p_config->chain);
551     xmlFreeDoc(doc);
552     if (line)
553         free(line);
554 }
555
556 #endif /* YAZ_HAVE_ICU */
557
558
/*
 * Program entry point.
 *
 * With ICU support: parses the command line, tokenizes the input when a
 * configuration file was given (-c) and prints ICU information when a
 * print mode was given (-p; print_info() ends the process itself), then
 * releases the input file and ICU resources and returns 0.
 *
 * Without ICU support: prints a notice and exits with status 3.
 */
int main(int argc, char **argv)
{
#if YAZ_HAVE_ICU
    struct config_t config;

    yaz_enable_panic_backtrace(*argv);
    read_params(argc, argv, &config);

    /* conffile and print are arrays, so only their content can be empty */
    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

    if (config.infile && config.infile != stdin)
        fclose(config.infile);

    u_cleanup();
#else /* YAZ_HAVE_ICU */
    (void) argc;
    (void) argv;

    printf("ICU not available on your system.\n"
           "Please install libicu-dev and icu-doc or similar, "
           "re-configure and re-compile\n");


    exit(3);
#endif /* YAZ_HAVE_ICU */

    return 0;
}
586
587
588 /*
589  * Local variables:
590  * c-basic-offset: 4
591  * c-file-style: "Stroustrup"
592  * indent-tabs-mode: nil
593  * End:
594  * vim: shiftwidth=4 tabstop=8 expandtab
595  */
596