db258936fdbb48e14c7ef9e1dabbff20eb03b3d0
[pazpar2-moved-to-github.git] / src / icu_chain_test.c
1 /* This file is part of Pazpar2.
2    Copyright (C) 2006-2008 Index Data
3
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #if HAVE_CONFIG_H
21 #include "cconfig.h"
22 #endif
23
24 #include <string.h>
25
26 #include <stdio.h>
27 #include <stdlib.h>
28
29 //#include <yaz/xmalloc.h>
30 #include <yaz/options.h>
31
32
33 #ifdef HAVE_ICU
34
35 #include <unicode/ucnv.h>
36 #include <unicode/ustring.h>
37
38 #include "icu_I18N.h"
39
/* Command line and configuration parameters. */
struct config_t {
    char conffile[1024];        /* path of the ICU chain XML file (-c) */
    char print[1024];           /* selector for the ICU info dump (-p) */
    int xmloutput;              /* non-zero when XML output was requested (-x) */
    struct icu_chain *chain;    /* chain built from conffile */
    FILE *infile;               /* text to process; defaults to stdin */
    FILE *outfile;              /* where results go; defaults to stdout */
};

/* The one program-wide configuration instance. */
static struct config_t config;
49
50
51   
/*
 * Write the usage text (options, examples and a sample ICU chain
 * configuration) to stderr and terminate the program with exit code 1.
 * The p_config argument is accepted but not consulted.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage[] =
        "Calling error, valid options are :\n"
        "icu_chain_test\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n"
        "./icu_chain_test -p c\n"
        "./icu_chain_test -p l -x\n"
        "./icu_chain_test -p t -x\n"
        "\n"
        "Example ICU chain XML configuration file:\n"
        "<icu_chain id=\"en:word\" locale=\"en\">\n"
        "  <normalize rule=\"[:Control:] Any-Remove\"/>\n"
        "  <tokenize rule=\"l\"/>\n"
        "  <normalize rule=\"[[:WhiteSpace:][:Punctuation:]] Remove\"/>\n"
        "  <display/>\n"
        "  <casemap rule=\"l\"/>\n"
        "  <index/>\n"
        "  <sortkey/>\n"
        "</icu_chain>\n";

    (void) p_config;

    fputs(usage, stderr);
    exit(1);
}
79
80 void read_params(int argc, char **argv, struct config_t *p_config)
81 {    
82     char *arg;
83     int ret;
84     
85     /* set default parameters */
86     p_config->conffile[0] = 0;
87     p_config->print[0] = 0;
88     p_config->xmloutput = 0;
89     p_config->chain = 0;
90     p_config->infile = stdin;
91     p_config->outfile = stdout;
92     
93     /* set up command line parameters */
94     
95     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
96     {
97         switch (ret)
98         {
99         case 'c':
100             strcpy(p_config->conffile, arg);
101             break;
102         case 'p':
103             strcpy(p_config->print, arg);
104             break;
105         case 'x':
106             p_config->xmloutput = 1;
107             break;
108         default:
109             print_option_error(p_config);
110         }
111     }
112     
113     if ((!strlen(p_config->conffile)
114          && !strlen(p_config->print))
115         || !config.infile
116         || !config.outfile)
117         
118         print_option_error(p_config);
119 };
120
121
122 /*     UConverter *conv; */
123 /*     conv = ucnv_open("utf-8", &status); */
124 /*     assert(U_SUCCESS(status)); */
125
126 /*     *ustr16_len  */
127 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
128 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
129 /*                       &status); */
130   
131
132
133 /*      ucnv_fromUChars(conv, */
134 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
135 /*                      ustr16, *ustr16_len, */
136 /*                      &status); */
137 /*      ucnv_close(conv); */
138
139
140 static void print_icu_converters(const struct config_t *p_config)
141 {
142     int32_t count;
143     int32_t i;
144
145     count = ucnv_countAvailable();
146     if (p_config->xmloutput)
147         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
148                 count, ucnv_getDefaultName());
149     else {    
150         fprintf(config.outfile, "Available ICU converters: %d\n", count);
151         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", 
152                 ucnv_getDefaultName());
153     }
154     
155     for(i=0;i<count;i++){
156         if (p_config->xmloutput)
157             fprintf(config.outfile, "<converter id=\"%s\"/>\n", 
158                     ucnv_getAvailableName(i));
159         else     
160             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
161     }
162     
163     if (p_config->xmloutput)
164         fprintf(config.outfile, "</converters>\n");
165     else
166         fprintf(config.outfile, "\n");
167 }
168
169 static void print_icu_transliterators(const struct config_t *p_config)
170 {
171     int32_t count;
172     int32_t i;
173     
174     count = utrans_countAvailableIDs();
175     
176     int32_t buf_cap = 128;
177     char buf[buf_cap];
178     
179     if (p_config->xmloutput)
180         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
181     else 
182         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
183     
184     for(i = 0; i <count; i++)
185     {
186         utrans_getAvailableID(i, buf, buf_cap);
187         if (p_config->xmloutput)
188             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
189         else
190             fprintf(config.outfile, " %s", buf);
191     }
192     
193     if (p_config->xmloutput){
194         fprintf(config.outfile, "</transliterators>\n");
195     }
196     else
197     {
198         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
199                 "   Pattern         Description\n"
200                 "   Ranges          [a-z]       The lower case letters a through z\n"
201                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
202                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
203                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
204                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
205                 "\n"
206                 "   Combination     Example\n"
207                 "   Union           [[:Greek:] [:letter:]]\n"
208                 "   Intersection    [[:Greek:] & [:letter:]]\n"
209                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
210                 "   Complement      [^[:Greek:] [:letter:]]\n"
211                 "\n"
212              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
213                 "\n"
214                 "Examples:\n"
215                 "   [:Punctuation:] Any-Remove\n"
216                 "   [:Cased-Letter:] Any-Upper\n"
217                 "   [:Control:] Any-Remove\n"
218                 "   [:Decimal_Number:] Any-Remove\n"
219                 "   [:Final_Punctuation:] Any-Remove\n"
220                 "   [:Georgian:] Any-Upper\n"
221                 "   [:Katakana:] Any-Remove\n"
222                 "   [:Arabic:] Any-Remove\n"
223                 "   [:Punctuation:] Remove\n"
224                 "   [[:Punctuation:]-[.,]] Remove\n"
225                 "   [:Line_Separator:] Any-Remove\n"
226                 "   [:Math_Symbol:] Any-Remove\n"
227                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
228                 "   [:^Number:] Remove (numeric tokenization)\n"
229                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
230                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
231                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
232                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
233                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
234                 "\n"
235                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
236                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
237                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
238                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
239             );
240         
241         
242         fprintf(config.outfile, "\n\n");
243         
244     }
245 }
246
247 static void print_icu_xml_locales(const struct config_t *p_config)
248 {
249     int32_t count;
250     int32_t i;
251     UErrorCode status = U_ZERO_ERROR;
252     
253     UChar keyword[64];
254     int32_t keyword_len = 0;
255     char keyword_str[128];
256     int32_t keyword_str_len = 0;
257
258     UChar language[64];
259     int32_t language_len = 0;
260     char lang_str[128];
261     int32_t lang_str_len = 0;
262
263     UChar script[64];
264     int32_t script_len = 0;
265     char script_str[128];
266     int32_t script_str_len = 0;
267
268     UChar location[64];
269     int32_t location_len = 0;
270     char location_str[128];
271     int32_t location_str_len = 0;
272
273     UChar variant[64];
274     int32_t variant_len = 0;
275     char variant_str[128];
276     int32_t variant_str_len = 0;
277
278     UChar name[64];
279     int32_t name_len = 0;
280     char name_str[128];
281     int32_t name_str_len = 0;
282
283     UChar localname[64];
284     int32_t localname_len = 0;
285     char localname_str[128];
286     int32_t localname_str_len = 0;
287
288     count = uloc_countAvailable() ;
289
290     if (p_config->xmloutput){
291     
292         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
293                 count, uloc_getDefault(), ucol_countAvailable());
294     }
295   
296     for(i=0;i<count;i++) 
297     {
298
299         keyword_len 
300             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
301                                      keyword, 64, 
302                                      &status);
303
304         u_strToUTF8(keyword_str, 128, &keyword_str_len,
305                     keyword, keyword_len,
306                     &status);
307     
308     
309         language_len 
310             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
311                                       language, 64, 
312                                       &status);
313
314         u_strToUTF8(lang_str, 128, &lang_str_len,
315                     language, language_len,
316                     &status);
317
318
319         script_len 
320             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
321                                     script, 64, 
322                                     &status);
323
324         u_strToUTF8(script_str, 128, &script_str_len,
325                     script, script_len,
326                     &status);
327
328         location_len 
329             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
330                                      location, 64, 
331                                      &status);
332
333         u_strToUTF8(location_str, 128, &location_str_len,
334                     location, location_len,
335                     &status);
336
337         variant_len 
338             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
339                                      variant, 64, 
340                                      &status);
341
342         u_strToUTF8(variant_str, 128, &variant_str_len,
343                     variant, variant_len,
344                     &status);
345
346         name_len 
347             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
348                                   name, 64, 
349                                   &status);
350
351         u_strToUTF8(name_str, 128, &name_str_len,
352                     name, name_len,
353                     &status);
354
355         localname_len 
356             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
357                                   localname, 64, 
358                                   &status);
359
360         u_strToUTF8(localname_str, 128, &localname_str_len,
361                     localname, localname_len,
362                     &status);
363
364
365         if (p_config->xmloutput){
366             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
367             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
368             /* if (strlen(keyword_str)) */
369             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
370             /* if (ucol_getAvailable(i)) */
371             /*   fprintf(config.outfile, " collation=\"1\""); */
372             if (strlen(lang_str))
373                 fprintf(config.outfile, " language=\"%s\"", lang_str);
374             if (strlen(script_str))
375                 fprintf(config.outfile, " script=\"%s\"", script_str);
376             if (strlen(location_str))
377                 fprintf(config.outfile, " location=\"%s\"", location_str);
378             if (strlen(variant_str))
379                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
380             if (strlen(name_str))
381                 fprintf(config.outfile, " name=\"%s\"", name_str);
382             if (strlen(localname_str))
383                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
384             fprintf(config.outfile, ">");
385             if (strlen(localname_str))
386                 fprintf(config.outfile, "%s", localname_str);
387             fprintf(config.outfile, "</locale>\n"); 
388         }
389         else if (1 == p_config->xmloutput){
390             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
391             fprintf(config.outfile, " | ");
392             if (strlen(name_str))
393                 fprintf(config.outfile, "%s", name_str);
394             fprintf(config.outfile, " | ");
395             if (strlen(localname_str))
396                 fprintf(config.outfile, "%s", localname_str);
397             fprintf(config.outfile, "\n");
398         }
399         else
400             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
401     }
402     if (p_config->xmloutput)
403         fprintf(config.outfile, "</locales>\n");
404     else
405         fprintf(config.outfile, "\n");
406
407     if(U_FAILURE(status)) {
408         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
409         exit(status);
410     }
411 }
412
413
414 static void print_info(const struct config_t *p_config)
415 {
416     if (p_config->xmloutput)
417         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
418                 "<icu>\n");
419
420     if ('c' == config.print[0])
421         print_icu_converters(&config);
422     else if ('l' == config.print[0])
423         print_icu_xml_locales(&config);
424     else if ('t' == config.print[0])
425         print_icu_transliterators(&config);
426     else {
427         print_icu_converters(&config);
428         print_icu_xml_locales(&config);
429         print_icu_transliterators(&config);
430     }
431
432     if (p_config->xmloutput)
433         fprintf(config.outfile, "</icu>\n");
434
435     exit(0);
436 };
437
438
439
440 static void process_text_file(const struct config_t *p_config)
441 {
442     char *line = 0;
443     char linebuf[1024];
444  
445     xmlDoc *doc = xmlParseFile(config.conffile);  
446     xmlNode *xml_node = xmlDocGetRootElement(doc);
447
448     long unsigned int token_count = 0;    
449     long unsigned int line_count = 0;    
450     
451     UErrorCode status = U_ZERO_ERROR;
452     int success = 0;
453     
454     if (! xml_node) {   
455         printf("Could not parse XML config file '%s' \n",
456                 config.conffile);
457         exit (1);
458     }
459
460     
461     config.chain = icu_chain_xml_config(xml_node, &status);
462
463     if (config.chain && U_SUCCESS(status))
464         success = 1;
465     else {   
466         printf("Could not set up ICU chain from config file '%s' \n",
467                 config.conffile);
468         exit (1);
469     }
470     
471     if (p_config->xmloutput)
472         fprintf(config.outfile,
473                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
474                 "<icu>\n"
475                 "<tokens>\n");
476     
477     // read input lines for processing
478     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
479     {
480         success = icu_chain_assign_cstr(config.chain, line, &status);
481         line_count++;
482
483         while (success && icu_chain_next_token(config.chain, &status)){
484             if (U_FAILURE(status))
485                 success = 0;
486             else {
487                 token_count++;
488                 if (p_config->xmloutput)                    
489                     fprintf(config.outfile, 
490                             "<token id=\%lu\" line=\"%lu\""
491                             " norm=\"%s\" display=\"%s\"/>\n",
492                             token_count,
493                             line_count,
494                             icu_chain_get_norm(config.chain),
495                             icu_chain_get_display(config.chain));
496                 else
497                     fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
498                             token_count,
499                             line_count,
500                             icu_chain_get_norm(config.chain),
501                             icu_chain_get_display(config.chain));
502             }
503         }
504         
505     }
506
507     if (p_config->xmloutput)
508         fprintf(config.outfile, 
509                 "</tokens>\n"
510                 "</icu>\n");
511
512     icu_chain_destroy(config.chain);
513     xmlFreeDoc(doc);
514     if (line)
515         free(line);
516 };
517
518 #endif // HAVE_ICU
519
520
/*
 * Program entry point.
 *
 * With ICU support compiled in: parse the command line, process the
 * input text when a chain configuration file was given (-c), and print
 * ICU information when a print selector was given (-p).  Without ICU
 * support only a hint on how to enable it is printed.
 * Returns 0 in both cases.
 */
int main(int argc, char **argv)
{

#ifdef HAVE_ICU

    read_params(argc, argv, &config);

    /* the buffers are arrays, so only their content can be "empty" */
    if (config.conffile[0])
        process_text_file(&config);

    if (config.print[0])
        print_info(&config);

#else /* HAVE_ICU */

    (void) argc;
    (void) argv;

    printf("ICU not available on your system.\n"
           "Please install libicu36-dev and icu-doc or similar, "
           "re-configure and re-compile\n");

#endif /* HAVE_ICU */

    return 0;
}
545
546
547 /*
548  * Local variables:
549  * c-basic-offset: 4
550  * indent-tabs-mode: nil
551  * End:
552  * vim: shiftwidth=4 tabstop=8 expandtab
553  */
554