Reduce ICU buffer copy for transform YAZ-790
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5
6 #if HAVE_CONFIG_H
7 #include "config.h"
8 #endif
9
10 #include <string.h>
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <errno.h>
15
16 #include <yaz/options.h>
17
18 #if YAZ_HAVE_ICU
19
20 #include <unicode/ucnv.h>
21 #include <unicode/ustring.h>
22 #include <unicode/ucol.h>
23 #include <unicode/ubrk.h>
24 #include <unicode/utrans.h>
25 #include <unicode/uclean.h>
26
27 #include <yaz/icu.h>
28 #include <yaz/wrbuf.h>
29
30 /* commando line and config parameters */
31 struct config_t {
32     char conffile[1024];
33     char print[1024];
34     int xmloutput;
35     int sortoutput;
36     int org_output;
37     int count;
38     yaz_icu_chain_t chain;
39     FILE * infile;
40     FILE * outfile;
41 };
42
/*
 * Print the usage summary to stderr and terminate the program with
 * exit status 1.  This function does not return.
 *
 * p_config is accepted for interface symmetry with the other helpers
 * but is not consulted.
 */
void print_option_error(const struct config_t *p_config)
{
    (void) p_config;

    fprintf(stderr, "yaz-icu [options] [infile]\n"
            "Options:\n"
            "   -c file         XML configuration\n"
            "   -p a|c|l|t      Print ICU info \n"
            "   -s              Show sort normalization key\n"
            "   -o              Show org positions\n"
            "   -x              XML output instead of text\n"
            "   -C n            Perform conversions n times (instead of once)\n"
            "\n"
            "Examples:\n"
            "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
            "./yaz-icu -p c\n"
            "./yaz-icu -p l -x\n"
            "./yaz-icu -p t -x\n"
            "\n"
            "Example ICU chain XML configuration file:\n"
            "<icu_chain locale=\"en\">\n"
            "  <transform rule=\"[:Control:] Any-Remove\"/>\n"
            "  <tokenize rule=\"l\"/>\n"
            "  <transform rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
            "  <casemap rule=\"l\"/>\n"
            "</icu_chain>\n"
          );
    exit(1);
}
70
71 void read_params(int argc, char **argv, struct config_t *p_config)
72 {
73     char *arg;
74     int ret;
75
76     /* set default parameters */
77     p_config->conffile[0] = 0;
78     p_config->print[0] = 0;
79     p_config->xmloutput = 0;
80     p_config->sortoutput = 0;
81     p_config->chain = 0;
82     p_config->infile = 0;
83     p_config->outfile = stdout;
84     p_config->org_output = 0;
85     p_config->count = 1;
86
87     /* set up command line parameters */
88
89     while ((ret = options("c:C:op:sx", argv, argc, &arg)) != -2)
90     {
91         switch (ret)
92         {
93         case 'c':
94             strcpy(p_config->conffile, arg);
95             break;
96         case 'C':
97             p_config->count = atoi(arg);
98             break;
99         case 'p':
100             strcpy(p_config->print, arg);
101             break;
102         case 's':
103             p_config->sortoutput = 1;
104             break;
105         case 'x':
106             p_config->xmloutput = 1;
107             break;
108         case 'o':
109             p_config->org_output = 1;
110             break;
111         case 0:
112             if (p_config->infile)
113             {
114                 fprintf(stderr, "yaz-icu: only one input file may be given\n");
115                 print_option_error(p_config);
116             }
117             p_config->infile = fopen(arg, "r");
118             if (!p_config->infile)
119             {
120                 fprintf(stderr, "yaz-icu: cannot open %s : %s\n",
121                         arg, strerror(errno));
122                 exit(1);
123             }
124             break;
125         default:
126             fprintf(stderr, "yaz_icu: invalid option: %s\n", arg);
127             print_option_error(p_config);
128         }
129     }
130
131     if (p_config->infile == 0)
132         p_config->infile = stdin;
133
134     if (!strlen(p_config->conffile) && !strlen(p_config->print))
135         print_option_error(p_config);
136 }
137
138 static void print_icu_converters(const struct config_t *p_config)
139 {
140     int32_t count;
141     int32_t i;
142
143     count = ucnv_countAvailable();
144     if (p_config->xmloutput)
145         fprintf(p_config->outfile, "<converters count=\"%d\" default=\"%s\">\n",
146                 count, ucnv_getDefaultName());
147     else
148     {
149         fprintf(p_config->outfile, "Available ICU converters: %d\n", count);
150         fprintf(p_config->outfile, "Default ICU Converter is: '%s'\n",
151                 ucnv_getDefaultName());
152     }
153
154     for (i = 0; i < count; i++)
155     {
156         if (p_config->xmloutput)
157             fprintf(p_config->outfile, "<converter id=\"%s\"/>\n",
158                     ucnv_getAvailableName(i));
159         else
160             fprintf(p_config->outfile, "%s\n", ucnv_getAvailableName(i));
161     }
162
163     if (p_config->xmloutput)
164         fprintf(p_config->outfile, "</converters>\n");
165     else
166         fprintf(p_config->outfile, "\n");
167 }
168
169 static void print_icu_transliterators(const struct config_t *p_config)
170 {
171     UErrorCode status;
172     UEnumeration *en = utrans_openIDs(&status);
173     int32_t count = uenum_count(en, &status);
174     const char *name;
175     int32_t length;
176
177     if (p_config->xmloutput)
178         fprintf(p_config->outfile, "<transliterators count=\"%d\">\n",  count);
179     else
180         fprintf(p_config->outfile, "Available ICU transliterators: %d\n", count);
181
182     while ((name = uenum_next(en, &length, &status)))
183     {
184         if (p_config->xmloutput)
185             fprintf(p_config->outfile, "<transliterator id=\"%s\"/>\n", name);
186         else
187             fprintf(p_config->outfile, "%s\n", name);
188     }
189     uenum_close(en);
190     if (p_config->xmloutput)
191         fprintf(p_config->outfile, "</transliterators>\n");
192     else
193     {
194         fprintf(p_config->outfile, "\n\nUnicode Set Patterns:\n"
195                 "   Pattern         Description\n"
196                 "   Ranges          [a-z]       The lower case letters a through z\n"
197                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
198                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
199                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
200                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
201                 "\n"
202                 "   Combination     Example\n"
203                 "   Union           [[:Greek:] [:letter:]]\n"
204                 "   Intersection    [[:Greek:] & [:letter:]]\n"
205                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
206                 "   Complement      [^[:Greek:] [:letter:]]\n"
207                 "\n"
208              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
209                 "\n"
210                 "Examples:\n"
211                 "   [:Punctuation:] Any-Remove\n"
212                 "   [:Cased-Letter:] Any-Upper\n"
213                 "   [:Control:] Any-Remove\n"
214                 "   [:Decimal_Number:] Any-Remove\n"
215                 "   [:Final_Punctuation:] Any-Remove\n"
216                 "   [:Georgian:] Any-Upper\n"
217                 "   [:Katakana:] Any-Remove\n"
218                 "   [:Arabic:] Any-Remove\n"
219                 "   [:Punctuation:] Remove\n"
220                 "   [[:Punctuation:]-[.,]] Remove\n"
221                 "   [:Line_Separator:] Any-Remove\n"
222                 "   [:Math_Symbol:] Any-Remove\n"
223                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
224                 "   [:^Number:] Remove (numeric tokenization)\n"
225                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
226                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
227                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
228                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
229                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
230                 "\n"
231                 "see http://userguide.icu-project.org/transforms/general\n"
232                 "    http://www.unicode.org/reports/tr44/\n"
233             );
234
235
236         fprintf(p_config->outfile, "\n\n");
237
238     }
239 }
240
241 static void print_icu_xml_locales(const struct config_t *p_config)
242 {
243     int32_t count;
244     int32_t i;
245     UErrorCode status = U_ZERO_ERROR;
246
247     UChar keyword[64];
248     int32_t keyword_len = 0;
249     char keyword_str[128];
250     int32_t keyword_str_len = 0;
251
252     UChar language[64];
253     int32_t language_len = 0;
254     char lang_str[128];
255     int32_t lang_str_len = 0;
256
257     UChar script[64];
258     int32_t script_len = 0;
259     char script_str[128];
260     int32_t script_str_len = 0;
261
262     UChar location[64];
263     int32_t location_len = 0;
264     char location_str[128];
265     int32_t location_str_len = 0;
266
267     UChar variant[64];
268     int32_t variant_len = 0;
269     char variant_str[128];
270     int32_t variant_str_len = 0;
271
272     UChar name[64];
273     int32_t name_len = 0;
274     char name_str[128];
275     int32_t name_str_len = 0;
276
277     UChar localname[64];
278     int32_t localname_len = 0;
279     char localname_str[128];
280     int32_t localname_str_len = 0;
281
282     count = uloc_countAvailable() ;
283
284     if (p_config->xmloutput)
285     {
286         fprintf(p_config->outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n",
287                 count, uloc_getDefault(), ucol_countAvailable());
288     }
289     else
290     {
291         fprintf(p_config->outfile, "Available ICU locales: %d\n", count);
292         fprintf(p_config->outfile, "Default locale is: %s\n",  uloc_getDefault());
293     }
294
295     for (i = 0; i < count; i++)
296     {
297
298         keyword_len
299             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en",
300                                      keyword, 64,
301                                      &status);
302
303         u_strToUTF8(keyword_str, 128, &keyword_str_len,
304                     keyword, keyword_len,
305                     &status);
306
307
308         language_len
309             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en",
310                                       language, 64,
311                                       &status);
312
313         u_strToUTF8(lang_str, 128, &lang_str_len,
314                     language, language_len,
315                     &status);
316
317
318         script_len
319             = uloc_getDisplayScript(uloc_getAvailable(i), "en",
320                                     script, 64,
321                                     &status);
322
323         u_strToUTF8(script_str, 128, &script_str_len,
324                     script, script_len,
325                     &status);
326
327         location_len
328             = uloc_getDisplayCountry(uloc_getAvailable(i), "en",
329                                      location, 64,
330                                      &status);
331
332         u_strToUTF8(location_str, 128, &location_str_len,
333                     location, location_len,
334                     &status);
335
336         variant_len
337             = uloc_getDisplayVariant(uloc_getAvailable(i), "en",
338                                      variant, 64,
339                                      &status);
340
341         u_strToUTF8(variant_str, 128, &variant_str_len,
342                     variant, variant_len,
343                     &status);
344
345         name_len
346             = uloc_getDisplayName(uloc_getAvailable(i), "en",
347                                   name, 64,
348                                   &status);
349
350         u_strToUTF8(name_str, 128, &name_str_len,
351                     name, name_len,
352                     &status);
353
354         localname_len
355             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i),
356                                   localname, 64,
357                                   &status);
358
359         u_strToUTF8(localname_str, 128, &localname_str_len,
360                     localname, localname_len,
361                     &status);
362
363
364         if (p_config->xmloutput)
365         {
366             fprintf(p_config->outfile, "<locale id=\"%s\"", uloc_getAvailable(i));
367             if (strlen(lang_str))
368                 fprintf(p_config->outfile, " language=\"%s\"", lang_str);
369             if (strlen(script_str))
370                 fprintf(p_config->outfile, " script=\"%s\"", script_str);
371             if (strlen(location_str))
372                 fprintf(p_config->outfile, " location=\"%s\"", location_str);
373             if (strlen(variant_str))
374                 fprintf(p_config->outfile, " variant=\"%s\"", variant_str);
375             if (strlen(name_str))
376                 fprintf(p_config->outfile, " name=\"%s\"", name_str);
377             if (strlen(localname_str))
378                 fprintf(p_config->outfile, " localname=\"%s\"", localname_str);
379             fprintf(p_config->outfile, ">");
380             if (strlen(localname_str))
381                 fprintf(p_config->outfile, "%s", localname_str);
382             fprintf(p_config->outfile, "</locale>\n");
383         }
384         else if (1 == p_config->xmloutput)
385         {
386             fprintf(p_config->outfile, "%s", uloc_getAvailable(i));
387             fprintf(p_config->outfile, " | ");
388             if (strlen(name_str))
389                 fprintf(p_config->outfile, "%s", name_str);
390             fprintf(p_config->outfile, " | ");
391             if (strlen(localname_str))
392                 fprintf(p_config->outfile, "%s", localname_str);
393             fprintf(p_config->outfile, "\n");
394         }
395         else
396             fprintf(p_config->outfile, "%s\n", uloc_getAvailable(i));
397     }
398     if (p_config->xmloutput)
399         fprintf(p_config->outfile, "</locales>\n");
400     else
401         fprintf(p_config->outfile, "\n");
402
403     if (U_FAILURE(status))
404     {
405         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
406         exit(2);
407     }
408 }
409
410
411 static void print_info(const struct config_t *p_config)
412 {
413     if (p_config->xmloutput)
414         fprintf(p_config->outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
415                 "<icu>\n");
416
417     if ('c' == p_config->print[0])
418         print_icu_converters(p_config);
419     else if ('l' == p_config->print[0])
420         print_icu_xml_locales(p_config);
421     else if ('t' == p_config->print[0])
422         print_icu_transliterators(p_config);
423     else {
424         print_icu_converters(p_config);
425         print_icu_xml_locales(p_config);
426         print_icu_transliterators(p_config);
427     }
428
429     if (p_config->xmloutput)
430         fprintf(p_config->outfile, "</icu>\n");
431
432     exit(0);
433 }
434
435
436
437 static void process_text_file(struct config_t *p_config)
438 {
439     char *line = 0;
440     char linebuf[1024];
441
442     xmlDoc *doc = xmlParseFile(p_config->conffile);
443     xmlNode *xml_node = xmlDocGetRootElement(doc);
444
445     long unsigned int token_count = 0;
446     long unsigned int line_count = 0;
447
448     UErrorCode status = U_ZERO_ERROR;
449
450     if (!xml_node)
451     {
452         printf("Could not parse XML config file '%s' \n",
453                 p_config->conffile);
454         exit(1);
455     }
456
457     p_config->chain = icu_chain_xml_config(xml_node, 1, &status);
458
459     if (!p_config->chain || !U_SUCCESS(status))
460     {
461         printf("Could not set up ICU chain from config file '%s' \n",
462                 p_config->conffile);
463         exit(1);
464     }
465
466     if (p_config->xmloutput)
467         fprintf(p_config->outfile,
468                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
469                 "<icu>\n"
470                 "<tokens>\n");
471
472     /* read input lines for processing */
473     while ((line=fgets(linebuf, sizeof(linebuf)-1, p_config->infile)))
474     {
475         WRBUF sw = wrbuf_alloc();
476         WRBUF cdata = wrbuf_alloc();
477         int i;
478         for (i = 0; i < p_config->count; i++)
479         {
480             int success = icu_chain_assign_cstr(p_config->chain, line, &status);
481             line_count++;
482             while (success && icu_chain_next_token(p_config->chain, &status))
483             {
484                 if (U_FAILURE(status))
485                     success = 0;
486                 else
487                 {
488                     size_t start, len;
489                     const char *sortkey =
490                         icu_chain_token_sortkey(p_config->chain);
491                     icu_chain_get_org_info(p_config->chain, &start, &len);
492                     wrbuf_rewind(sw);
493                     wrbuf_puts_escaped(sw, sortkey);
494                     if (i == 0)
495                         token_count++;
496                     if (i > 0)
497                         ;  /* only output on first iteration */
498                     else if (p_config->xmloutput)
499                     {
500                         fprintf(p_config->outfile,
501                                 "<token id=\"%lu\" line=\"%lu\"",
502                                 token_count, line_count);
503                         wrbuf_rewind(cdata);
504                         wrbuf_xmlputs(cdata,
505                                       icu_chain_token_norm(p_config->chain));
506                         fprintf(p_config->outfile, " norm=\"%s\"",
507                                 wrbuf_cstr(cdata));
508                         wrbuf_rewind(cdata);
509                         wrbuf_xmlputs(cdata, icu_chain_token_display(p_config->chain));
510                         fprintf(p_config->outfile, " display=\"%s\"",
511                                 wrbuf_cstr(cdata));
512                         if (p_config->sortoutput)
513                         {
514                             wrbuf_rewind(cdata);
515                             wrbuf_xmlputs(cdata, wrbuf_cstr(sw));
516                             fprintf(p_config->outfile, " sortkey=\"%s\"",
517                                     wrbuf_cstr(cdata));
518                         }
519                         fprintf(p_config->outfile, "/>\n");
520                     }
521                     else
522                     {
523                         fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
524                                 token_count,
525                                 line_count,
526                                 icu_chain_token_norm(p_config->chain),
527                                 icu_chain_token_display(p_config->chain));
528                         if (p_config->sortoutput)
529                         {
530                             fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
531                         }
532                         if (p_config->org_output)
533                         {
534                             fprintf(p_config->outfile, " %ld+%ld",
535                                     (long) start, (long) len);
536                         }
537                         fprintf(p_config->outfile, "\n");
538                     }
539                 }
540             }
541         }
542         wrbuf_destroy(sw);
543         wrbuf_destroy(cdata);
544     }
545
546     if (p_config->xmloutput)
547         fprintf(p_config->outfile,
548                 "</tokens>\n"
549                 "</icu>\n");
550
551     icu_chain_destroy(p_config->chain);
552     xmlFreeDoc(doc);
553     if (line)
554         free(line);
555 }
556
557 #endif /* YAZ_HAVE_ICU */
558
559
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line, process the
 * input text when a configuration file was given (-c), then print ICU
 * information when requested (-p).  print_info() ends the program
 * itself, so the clean-up below only runs when -p was not given.
 *
 * Without ICU support: print a notice and exit with status 3.
 */
int main(int argc, char **argv)
{
#if YAZ_HAVE_ICU
    struct config_t config;

    xmlInitParser();
    LIBXML_TEST_VERSION
    read_params(argc, argv, &config);

    /* the members are arrays, so test their contents, not their address */
    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

    if (config.infile && config.infile != stdin)
        fclose(config.infile);

    u_cleanup();
    xmlCleanupParser();
#else /* YAZ_HAVE_ICU */

    printf("ICU not available on your system.\n"
           "Please install libicu-dev and icu-doc or similar, "
           "re-configure and re-compile\n");


    exit(3);
#endif /* YAZ_HAVE_ICU */
    return 0;
}
588
589
590 /*
591  * Local variables:
592  * c-basic-offset: 4
593  * c-file-style: "Stroustrup"
594  * indent-tabs-mode: nil
595  * End:
596  * vim: shiftwidth=4 tabstop=8 expandtab
597  */
598