Added ICU chain component - which used to be part of Pazpar2.
[yaz-moved-to-github.git] / util / yaz-icu.c
1 /* $Id: yaz-icu.c,v 1.1 2007-10-22 12:21:40 adam Exp $
2    Copyright (c) 2006-2007, Index Data.
3
4 This file is part of Pazpar2.
5
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
10
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE.  If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA.
20  */
21
22 #if HAVE_CONFIG_H
23 #include "cconfig.h"
24 #endif
25
26 #include <string.h>
27
28 #include <stdio.h>
29 #include <stdlib.h>
30
31 //#include <yaz/xmalloc.h>
32 #include <yaz/options.h>
33
34
35 #ifdef HAVE_ICU
36
37 #include <unicode/ucnv.h>
38 #include <unicode/ustring.h>
39
40 #include <yaz/icu_I18N.h>
41
/* Command line and configuration parameters. */
struct config_t {
    char conffile[1024];      /* argument of -c; empty string when not given */
    char print[1024];         /* argument of -p; empty string when not given */
    int xmloutput;            /* set to 1 by -x, 0 otherwise */
    struct icu_chain *chain;  /* ICU chain built from conffile */
    FILE *infile;             /* stream the text to process is read from */
    FILE *outfile;            /* stream all regular output is written to */
};

/* The single, file-local configuration instance used by the whole program. */
static struct config_t config;
51
52
53   
/*
 * Write the usage text (options, invocation examples and a sample ICU chain
 * configuration) to stderr and terminate the process with exit status 1.
 * This function does not return. p_config is accepted but not used.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "yaz-icu\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./yaz-icu -c config.xml \n"
        "./yaz-icu -p c\n"
        "./yaz-icu -p l -x\n"
        "./yaz-icu -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain id=\"en:word\" locale=\"en\">\n"
        "  <normalize rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <display/>\n"
        "  <casemap rule=\"l\"/>\n"
        "  <index/>\n"
        "  <sortkey/>\n"
        "</icu_chain>\n";

    (void) p_config;

    fputs("Calling error, valid options are :\n", stderr);
    fputs(usage_text, stderr);
    exit(1);
}
81
82 void read_params(int argc, char **argv, struct config_t *p_config)
83 {    
84     char *arg;
85     int ret;
86     
87     /* set default parameters */
88     p_config->conffile[0] = 0;
89     p_config->print[0] = 0;
90     p_config->xmloutput = 0;
91     p_config->chain = 0;
92     p_config->infile = stdin;
93     p_config->outfile = stdout;
94     
95     /* set up command line parameters */
96     
97     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
98     {
99         switch (ret)
100         {
101         case 'c':
102             strcpy(p_config->conffile, arg);
103             break;
104         case 'p':
105             strcpy(p_config->print, arg);
106             break;
107         case 'x':
108             p_config->xmloutput = 1;
109             break;
110         default:
111             print_option_error(p_config);
112         }
113     }
114     
115     if ((!strlen(p_config->conffile)
116          && !strlen(p_config->print))
117         || !config.infile
118         || !config.outfile)
119         
120         print_option_error(p_config);
121 };
122
123
124 /*     UConverter *conv; */
125 /*     conv = ucnv_open("utf-8", &status); */
126 /*     assert(U_SUCCESS(status)); */
127
128 /*     *ustr16_len  */
129 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
130 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
131 /*                       &status); */
132   
133
134
135 /*      ucnv_fromUChars(conv, */
136 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
137 /*                      ustr16, *ustr16_len, */
138 /*                      &status); */
139 /*      ucnv_close(conv); */
140
141
142 static void print_icu_converters(const struct config_t *p_config)
143 {
144     int32_t count;
145     int32_t i;
146
147     count = ucnv_countAvailable();
148     if (p_config->xmloutput)
149         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
150                 count, ucnv_getDefaultName());
151     else {    
152         fprintf(config.outfile, "Available ICU converters: %d\n", count);
153         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
154                 ucnv_getDefaultName());
155     }
156     
157     for(i=0;i<count;i++){
158         if (p_config->xmloutput)
159             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
160                     ucnv_getAvailableName(i));
161         else     
162             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
163     }
164     
165     if (p_config->xmloutput)
166         fprintf(config.outfile, "</converters>\n");
167     else
168         fprintf(config.outfile, "\n");
169 }
170
171 static void print_icu_transliterators(const struct config_t *p_config)
172 {
173     int32_t count;
174     int32_t i;
175     
176     count = utrans_countAvailableIDs();
177     
178     int32_t buf_cap = 128;
179     char buf[buf_cap];
180     
181     if (p_config->xmloutput)
182         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
183     else 
184         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
185     
186     for(i = 0; i <count; i++)
187     {
188         utrans_getAvailableID(i, buf, buf_cap);
189         if (p_config->xmloutput)
190             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
191         else
192             fprintf(config.outfile, " %s", buf);
193     }
194     
195     if (p_config->xmloutput){
196         fprintf(config.outfile, "</transliterators>\n");
197     }
198     else
199     {
200         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
201                 "   Pattern         Description\n"
202                 "   Ranges          [a-z]       The lower case letters a through z\n"
203                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
204                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
205                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
206                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
207                 "\n"
208                 "   Combination     Example\n"
209                 "   Union           [[:Greek:] [:letter:]]\n"
210                 "   Intersection    [[:Greek:] & [:letter:]]\n"
211                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
212                 "   Complement      [^[:Greek:] [:letter:]]\n"
213                 "\n"
214              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
215                 "\n"
216                 "Examples:\n"
217                 "   [:Punctuation:] Any-Remove\n"
218                 "   [:Cased-Letter:] Any-Upper\n"
219                 "   [:Control:] Any-Remove\n"
220                 "   [:Decimal_Number:] Any-Remove\n"
221                 "   [:Final_Punctuation:] Any-Remove\n"
222                 "   [:Georgian:] Any-Upper\n"
223                 "   [:Katakana:] Any-Remove\n"
224                 "   [:Arabic:] Any-Remove\n"
225                 "   [:Punctuation:] Remove\n"
226                 "   [[:Punctuation:]-[.,]] Remove\n"
227                 "   [:Line_Separator:] Any-Remove\n"
228                 "   [:Math_Symbol:] Any-Remove\n"
229                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
230                 "   [:^Number:] Remove (numeric tokenization)\n"
231                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
232                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
233                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
234                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
235                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
236                 "\n"
237                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
238                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
239                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
240                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
241             );
242         
243         
244         fprintf(config.outfile, "\n\n");
245         
246     }
247 }
248
249 static void print_icu_xml_locales(const struct config_t *p_config)
250 {
251     int32_t count;
252     int32_t i;
253     UErrorCode status = U_ZERO_ERROR;
254     
255     UChar keyword[64];
256     int32_t keyword_len = 0;
257     char keyword_str[128];
258     int32_t keyword_str_len = 0;
259
260     UChar language[64];
261     int32_t language_len = 0;
262     char lang_str[128];
263     int32_t lang_str_len = 0;
264
265     UChar script[64];
266     int32_t script_len = 0;
267     char script_str[128];
268     int32_t script_str_len = 0;
269
270     UChar location[64];
271     int32_t location_len = 0;
272     char location_str[128];
273     int32_t location_str_len = 0;
274
275     UChar variant[64];
276     int32_t variant_len = 0;
277     char variant_str[128];
278     int32_t variant_str_len = 0;
279
280     UChar name[64];
281     int32_t name_len = 0;
282     char name_str[128];
283     int32_t name_str_len = 0;
284
285     UChar localname[64];
286     int32_t localname_len = 0;
287     char localname_str[128];
288     int32_t localname_str_len = 0;
289
290     count = uloc_countAvailable() ;
291
292     if (p_config->xmloutput){
293     
294         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
295                 count, uloc_getDefault(), ucol_countAvailable());
296     }
297   
298     for(i=0;i<count;i++) 
299     {
300
301         keyword_len 
302             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
303                                      keyword, 64, 
304                                      &status);
305
306         u_strToUTF8(keyword_str, 128, &keyword_str_len,
307                     keyword, keyword_len,
308                     &status);
309     
310     
311         language_len 
312             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
313                                       language, 64, 
314                                       &status);
315
316         u_strToUTF8(lang_str, 128, &lang_str_len,
317                     language, language_len,
318                     &status);
319
320
321         script_len 
322             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
323                                     script, 64, 
324                                     &status);
325
326         u_strToUTF8(script_str, 128, &script_str_len,
327                     script, script_len,
328                     &status);
329
330         location_len 
331             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
332                                      location, 64, 
333                                      &status);
334
335         u_strToUTF8(location_str, 128, &location_str_len,
336                     location, location_len,
337                     &status);
338
339         variant_len 
340             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
341                                      variant, 64, 
342                                      &status);
343
344         u_strToUTF8(variant_str, 128, &variant_str_len,
345                     variant, variant_len,
346                     &status);
347
348         name_len 
349             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
350                                   name, 64, 
351                                   &status);
352
353         u_strToUTF8(name_str, 128, &name_str_len,
354                     name, name_len,
355                     &status);
356
357         localname_len 
358             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
359                                   localname, 64, 
360                                   &status);
361
362         u_strToUTF8(localname_str, 128, &localname_str_len,
363                     localname, localname_len,
364                     &status);
365
366
367         if (p_config->xmloutput){
368             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
369             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
370             /* if (strlen(keyword_str)) */
371             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
372             /* if (ucol_getAvailable(i)) */
373             /*   fprintf(config.outfile, " collation=\"1\""); */
374             if (strlen(lang_str))
375                 fprintf(config.outfile, " language=\"%s\"", lang_str);
376             if (strlen(script_str))
377                 fprintf(config.outfile, " script=\"%s\"", script_str);
378             if (strlen(location_str))
379                 fprintf(config.outfile, " location=\"%s\"", location_str);
380             if (strlen(variant_str))
381                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
382             if (strlen(name_str))
383                 fprintf(config.outfile, " name=\"%s\"", name_str);
384             if (strlen(localname_str))
385                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
386             fprintf(config.outfile, ">");
387             if (strlen(localname_str))
388                 fprintf(config.outfile, "%s", localname_str);
389             fprintf(config.outfile, "</locale>\n"); 
390         }
391         else if (1 == p_config->xmloutput){
392             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
393             fprintf(config.outfile, " | ");
394             if (strlen(name_str))
395                 fprintf(config.outfile, "%s", name_str);
396             fprintf(config.outfile, " | ");
397             if (strlen(localname_str))
398                 fprintf(config.outfile, "%s", localname_str);
399             fprintf(config.outfile, "\n");
400         }
401         else
402             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
403     }
404     if (p_config->xmloutput)
405         fprintf(config.outfile, "</locales>\n");
406     else
407         fprintf(config.outfile, "\n");
408
409     if(U_FAILURE(status)) {
410         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
411         exit(status);
412     }
413 }
414
415
416 static void print_info(const struct config_t *p_config)
417 {
418     if (p_config->xmloutput)
419         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
420                 "<icu>\n");
421
422     if ('c' == config.print[0])
423         print_icu_converters(&config);
424     else if ('l' == config.print[0])
425         print_icu_xml_locales(&config);
426     else if ('t' == config.print[0])
427         print_icu_transliterators(&config);
428     else {
429         print_icu_converters(&config);
430         print_icu_xml_locales(&config);
431         print_icu_transliterators(&config);
432     }
433
434     if (p_config->xmloutput)
435         fprintf(config.outfile, "</icu>\n");
436
437     exit(0);
438 };
439
440
441
442 static void process_text_file(const struct config_t *p_config)
443 {
444     char *line = 0;
445     char linebuf[1024];
446  
447     xmlDoc *doc = xmlParseFile(config.conffile);  
448     xmlNode *xml_node = xmlDocGetRootElement(doc);
449
450     long unsigned int token_count = 0;    
451     long unsigned int line_count = 0;    
452     
453     UErrorCode status = U_ZERO_ERROR;
454     int success = 0;
455     
456     if (! xml_node) {   
457         printf("Could not parse XML config file '%s' \n",
458                 config.conffile);
459         exit (1);
460     }
461
462     
463     config.chain = icu_chain_xml_config(xml_node, &status);
464
465     if (config.chain && U_SUCCESS(status))
466         success = 1;
467     else {   
468         printf("Could not set up ICU chain from config file '%s' \n",
469                 config.conffile);
470         exit (1);
471     }
472     
473     if (p_config->xmloutput)
474         fprintf(config.outfile,
475                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
476                 "<icu>\n"
477                 "<tokens>\n");
478     
479     // read input lines for processing
480     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
481     {
482         success = icu_chain_assign_cstr(config.chain, line, &status);
483         line_count++;
484
485         while (success && icu_chain_next_token(config.chain, &status)){
486             if (U_FAILURE(status))
487                 success = 0;
488             else {
489                 token_count++;
490                 if (p_config->xmloutput)                    
491                     fprintf(config.outfile, 
492                             "<token id=\%lu\" line=\"%lu\""
493                             " norm=\"%s\" display=\"%s\"/>\n",
494                             token_count,
495                             line_count,
496                             icu_chain_get_norm(config.chain),
497                             icu_chain_get_display(config.chain));
498                 else
499                     fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
500                             token_count,
501                             line_count,
502                             icu_chain_get_norm(config.chain),
503                             icu_chain_get_display(config.chain));
504             }
505         }
506         
507     }
508
509     if (p_config->xmloutput)
510         fprintf(config.outfile, 
511                 "</tokens>\n"
512                 "</icu>\n");
513
514     icu_chain_destroy(config.chain);
515     xmlFreeDoc(doc);
516     if (line)
517         free(line);
518 };
519
520 #endif // HAVE_ICU
521
522
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line, process the input
 * text when a config file was given (-c), then print ICU information when
 * requested (-p). Without ICU support only a notice is printed.
 * Returns 0 in both cases.
 */
int main(int argc, char **argv)
{

#ifdef HAVE_ICU

    read_params(argc, argv, &config);

    /* conffile and print are arrays, so only their content can be tested */
    if (config.conffile[0] != '\0')
        process_text_file(&config);

    if (config.print[0] != '\0')
        print_info(&config);

#else /* HAVE_ICU */

    printf("ICU not available on your system.\n"
           "Please install libicu36-dev and icu-doc or similar, "
           "re-configure and re-compile\n");

#endif /* HAVE_ICU */

    return 0;
}
547
548
549 /*
550  * Local variables:
551  * c-basic-offset: 4
552  * indent-tabs-mode: nil
553  * End:
554  * vim: shiftwidth=4 tabstop=8 expandtab
555  */
556