Reindent according to c-mode. Added GPL header
[pazpar2-moved-to-github.git] / src / icu_chain_test.c
1 /* $Id: icu_chain_test.c,v 1.6 2007-07-05 18:40:24 adam Exp $
2    Copyright (c) 2006-2007, Index Data.
3
4 This file is part of Pazpar2.
5
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
10
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE.  If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA.
20  */
21
22 #if HAVE_CONFIG_H
23 #include "cconfig.h"
24 #endif
25
26 #include <string.h>
27
28 #include <stdio.h>
29 #include <stdlib.h>
30
31 //#include <yaz/xmalloc.h>
32 #include <yaz/options.h>
33
34
35 #ifdef HAVE_ICU
36
37 #include <unicode/ucnv.h>
38 #include <unicode/ustring.h>
39
40 #include "icu_I18N.h"
41
/* Command-line and configuration parameters used throughout this tool. */
static struct config_t {
    char conffile[1024];      /* argument of -c; empty string when not given */
    char print[1024];         /* argument of -p; empty string when not given */
    int xmloutput;            /* 1 when -x was given, otherwise 0 */
    struct icu_chain *chain;  /* chain built from conffile by process_text_file() */
    FILE *infile;             /* input stream; read_params() sets it to stdin */
    FILE *outfile;            /* output stream; read_params() sets it to stdout */
} config;
51
52
53   
/*
 * Write the usage summary to stderr and terminate the process with
 * exit status 1.  This function does not return.
 *
 * p_config is accepted for symmetry with the other helpers; it is not read.
 */
void print_option_error(const struct config_t *p_config)
{
    static const char usage_text[] =
        "Calling error, valid options are :\n"
        "icu_chain_test\n"
        "   [-c (path/to/config/file.xml)]\n"
        "   [-p (a|c|l|t)] print ICU info \n"
        "   [-x] XML output\n"
        "\n"
        "Examples:\n"
        "cat hugetextfile.txt | ./icu_chain_test -c config.xml \n"
        "./icu_chain_test -p c\n"
        "./icu_chain_test -p l -x\n"
        "./icu_chain_test -p t -x\n";

    (void) p_config;

    fputs(usage_text, stderr);
    exit(1);
}
70
71 void read_params(int argc, char **argv, struct config_t *p_config)
72 {    
73     char *arg;
74     int ret;
75     
76     /* set default parameters */
77     p_config->conffile[0] = 0;
78     p_config->print[0] = 0;
79     p_config->xmloutput = 0;
80     p_config->chain = 0;
81     p_config->infile = stdin;
82     p_config->outfile = stdout;
83     
84     /* set up command line parameters */
85     
86     while ((ret = options("c:p:x", argv, argc, &arg)) != -2)
87     {
88         switch (ret)
89         {
90         case 'c':
91             strcpy(p_config->conffile, arg);
92             break;
93         case 'p':
94             strcpy(p_config->print, arg);
95             break;
96         case 'x':
97             p_config->xmloutput = 1;
98             break;
99         default:
100             print_option_error(p_config);
101         }
102     }
103     
104     if ((!strlen(p_config->conffile)
105          && !strlen(p_config->print))
106         || !config.infile
107         || !config.outfile)
108         
109         print_option_error(p_config);
110 };
111
112
113 /*     UConverter *conv; */
114 /*     conv = ucnv_open("utf-8", &status); */
115 /*     assert(U_SUCCESS(status)); */
116
117 /*     *ustr16_len  */
118 /*       = ucnv_toUChars(conv, ustr16, 1024,  */
119 /*                       (const char *) *xstr8, strlen((const char *) *xstr8), */
120 /*                       &status); */
121   
122
123
124 /*      ucnv_fromUChars(conv, */
125 /*                      (char *) *xstr8, strlen((const char *) *xstr8), */
126 /*                      ustr16, *ustr16_len, */
127 /*                      &status); */
128 /*      ucnv_close(conv); */
129
130
131 static void print_icu_converters(const struct config_t *p_config)
132 {
133     int32_t count;
134     int32_t i;
135
136     count = ucnv_countAvailable();
137     if (p_config->xmloutput)
138         fprintf(config.outfile, "<converters count=\"%d\" default=\"%s\">\n",
139                 count, ucnv_getDefaultName());
140     else {    
141         fprintf(config.outfile, "Available ICU converters: %d\n", count);
142         fprintf(config.outfile, "Default ICU Converter is: '%s'\n", ucnv_getDefaultName());
143     }
144     
145     for(i=0;i<count;i++){
146         if (p_config->xmloutput)
147             fprintf(config.outfile, "<converter id=\"%s\"/>\n", ucnv_getAvailableName(i));
148         else     
149             fprintf(config.outfile, "%s ", ucnv_getAvailableName(i));
150     }
151     
152     if (p_config->xmloutput)
153         fprintf(config.outfile, "</converters>\n");
154     else
155         fprintf(config.outfile, "\n");
156 }
157
158 static void print_icu_transliterators(const struct config_t *p_config)
159 {
160     int32_t count;
161     int32_t i;
162     
163     count = utrans_countAvailableIDs();
164     
165     int32_t buf_cap = 128;
166     char buf[buf_cap];
167     
168     if (p_config->xmloutput)
169         fprintf(config.outfile, "<transliterators count=\"%d\">\n",  count);
170     else 
171         fprintf(config.outfile, "Available ICU transliterators: %d\n", count);
172     
173     for(i = 0; i <count; i++)
174     {
175         utrans_getAvailableID(i, buf, buf_cap);
176         if (p_config->xmloutput)
177             fprintf(config.outfile, "<transliterator id=\"%s\"/>\n", buf);
178         else
179             fprintf(config.outfile, " %s", buf);
180     }
181     
182     if (p_config->xmloutput){
183         fprintf(config.outfile, "</transliterators>\n");
184     }
185     else
186     {
187         fprintf(config.outfile, "\n\nUnicode Set Patterns:\n"
188                 "   Pattern         Description\n"
189                 "   Ranges          [a-z]       The lower case letters a through z\n"
190                 "   Named Chars     [abc123] The six characters a,b,c,1,2 and 3\n"
191                 "   String          [abc{def}] chars a, b and c, and string 'def'\n"
192                 "   Categories      [\\p{Letter}] Perl General Category 'Letter'.\n"
193                 "   Categories      [:Letter:] Posix General Category 'Letter'.\n"
194                 "\n"
195                 "   Combination     Example\n"
196                 "   Union           [[:Greek:] [:letter:]]\n"
197                 "   Intersection    [[:Greek:] & [:letter:]]\n"
198                 "   Set Complement  [[:Greek:] - [:letter:]]\n"
199                 "   Complement      [^[:Greek:] [:letter:]]\n"
200                 "\n"
201              "see: http://icu.sourceforge.net/userguide/unicodeSet.html\n"
202                 "\n"
203                 "Examples:\n"
204                 "   [:Punctuation:] Any-Remove\n"
205                 "   [:Cased-Letter:] Any-Upper\n"
206                 "   [:Control:] Any-Remove\n"
207                 "   [:Decimal_Number:] Any-Remove\n"
208                 "   [:Final_Punctuation:] Any-Remove\n"
209                 "   [:Georgian:] Any-Upper\n"
210                 "   [:Katakana:] Any-Remove\n"
211                 "   [:Arabic:] Any-Remove\n"
212                 "   [:Punctuation:] Remove\n"
213                 "   [[:Punctuation:]-[.,]] Remove\n"
214                 "   [:Line_Separator:] Any-Remove\n"
215                 "   [:Math_Symbol:] Any-Remove\n"
216                 "   Lower; [:^Letter:] Remove (word tokenization)\n"
217                 "   [:^Number:] Remove (numeric tokenization)\n"
218                 "   [:^Katagana:] Remove (remove everything except Katagana)\n"
219                 "   Lower;[[:WhiteSpace:][:Punctuation:]] Remove (word tokenization)\n"
220                 "   NFD; [:Nonspacing Mark:] Remove; NFC   (removes accents from characters)\n"
221                 "   [A-Za-z]; Lower(); Latin-Katakana; Katakana-Hiragana (transforms latin and katagana to hiragana)\n"
222                 "   [[:separator:][:start punctuation:][:initial punctuation:]] Remove \n"
223                 "\n"
224                 "see http://icu.sourceforge.net/userguide/Transform.html\n"
225                 "    http://www.unicode.org/Public/UNIDATA/UCD.html\n"
226                 "    http://icu.sourceforge.net/userguide/Transform.html\n"
227                 "    http://icu.sourceforge.net/userguide/TransformRule.html\n"
228             );
229         
230         
231         fprintf(config.outfile, "\n\n");
232         
233     }
234 }
235
236 static void print_icu_xml_locales(const struct config_t *p_config)
237 {
238     int32_t count;
239     int32_t i;
240     UErrorCode status = U_ZERO_ERROR;
241     
242     UChar keyword[64];
243     int32_t keyword_len = 0;
244     char keyword_str[128];
245     int32_t keyword_str_len = 0;
246
247     UChar language[64];
248     int32_t language_len = 0;
249     char lang_str[128];
250     int32_t lang_str_len = 0;
251
252     UChar script[64];
253     int32_t script_len = 0;
254     char script_str[128];
255     int32_t script_str_len = 0;
256
257     UChar location[64];
258     int32_t location_len = 0;
259     char location_str[128];
260     int32_t location_str_len = 0;
261
262     UChar variant[64];
263     int32_t variant_len = 0;
264     char variant_str[128];
265     int32_t variant_str_len = 0;
266
267     UChar name[64];
268     int32_t name_len = 0;
269     char name_str[128];
270     int32_t name_str_len = 0;
271
272     UChar localname[64];
273     int32_t localname_len = 0;
274     char localname_str[128];
275     int32_t localname_str_len = 0;
276
277     count = uloc_countAvailable() ;
278
279     if (p_config->xmloutput){
280     
281         fprintf(config.outfile, "<locales count=\"%d\" default=\"%s\" collations=\"%d\">\n", 
282                 count, uloc_getDefault(), ucol_countAvailable());
283     }
284   
285     for(i=0;i<count;i++) 
286     {
287
288         keyword_len 
289             = uloc_getDisplayKeyword(uloc_getAvailable(i), "en", 
290                                      keyword, 64, 
291                                      &status);
292
293         u_strToUTF8(keyword_str, 128, &keyword_str_len,
294                     keyword, keyword_len,
295                     &status);
296     
297     
298         language_len 
299             = uloc_getDisplayLanguage(uloc_getAvailable(i), "en", 
300                                       language, 64, 
301                                       &status);
302
303         u_strToUTF8(lang_str, 128, &lang_str_len,
304                     language, language_len,
305                     &status);
306
307
308         script_len 
309             = uloc_getDisplayScript(uloc_getAvailable(i), "en", 
310                                     script, 64, 
311                                     &status);
312
313         u_strToUTF8(script_str, 128, &script_str_len,
314                     script, script_len,
315                     &status);
316
317         location_len 
318             = uloc_getDisplayCountry(uloc_getAvailable(i), "en", 
319                                      location, 64, 
320                                      &status);
321
322         u_strToUTF8(location_str, 128, &location_str_len,
323                     location, location_len,
324                     &status);
325
326         variant_len 
327             = uloc_getDisplayVariant(uloc_getAvailable(i), "en", 
328                                      variant, 64, 
329                                      &status);
330
331         u_strToUTF8(variant_str, 128, &variant_str_len,
332                     variant, variant_len,
333                     &status);
334
335         name_len 
336             = uloc_getDisplayName(uloc_getAvailable(i), "en", 
337                                   name, 64, 
338                                   &status);
339
340         u_strToUTF8(name_str, 128, &name_str_len,
341                     name, name_len,
342                     &status);
343
344         localname_len 
345             = uloc_getDisplayName(uloc_getAvailable(i), uloc_getAvailable(i), 
346                                   localname, 64, 
347                                   &status);
348
349         u_strToUTF8(localname_str, 128, &localname_str_len,
350                     localname, localname_len,
351                     &status);
352
353
354         if (p_config->xmloutput){
355             fprintf(config.outfile, "<locale id=\"%s\"", uloc_getAvailable(i)); 
356             /* fprintf(config.outfile, " locale=\"%s\"", uloc_getAvailable(i)); */
357             /* if (strlen(keyword_str)) */
358             /*   fprintf(config.outfile, " keyword=\"%s\"", keyword_str); */
359             /* if (ucol_getAvailable(i)) */
360             /*   fprintf(config.outfile, " collation=\"1\""); */
361             if (strlen(lang_str))
362                 fprintf(config.outfile, " language=\"%s\"", lang_str);
363             if (strlen(script_str))
364                 fprintf(config.outfile, " script=\"%s\"", script_str);
365             if (strlen(location_str))
366                 fprintf(config.outfile, " location=\"%s\"", location_str);
367             if (strlen(variant_str))
368                 fprintf(config.outfile, " variant=\"%s\"", variant_str);
369             if (strlen(name_str))
370                 fprintf(config.outfile, " name=\"%s\"", name_str);
371             if (strlen(localname_str))
372                 fprintf(config.outfile, " localname=\"%s\"", localname_str);
373             fprintf(config.outfile, ">");
374             if (strlen(localname_str))
375                 fprintf(config.outfile, "%s", localname_str);
376             fprintf(config.outfile, "</locale>\n"); 
377         }
378         else if (1 == p_config->xmloutput){
379             fprintf(config.outfile, "%s", uloc_getAvailable(i)); 
380             fprintf(config.outfile, " | ");
381             if (strlen(name_str))
382                 fprintf(config.outfile, "%s", name_str);
383             fprintf(config.outfile, " | ");
384             if (strlen(localname_str))
385                 fprintf(config.outfile, "%s", localname_str);
386             fprintf(config.outfile, "\n");
387         }
388         else
389             fprintf(config.outfile, "%s ", uloc_getAvailable(i));
390     }
391     if (p_config->xmloutput)
392         fprintf(config.outfile, "</locales>\n");
393     else
394         fprintf(config.outfile, "\n");
395
396     if(U_FAILURE(status)) {
397         fprintf(stderr, "ICU Error: %d %s\n", status, u_errorName(status));
398         exit(status);
399     }
400 }
401
402
403 static void print_info(const struct config_t *p_config)
404 {
405     if (p_config->xmloutput)
406         fprintf(config.outfile, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
407                 "<icu>\n");
408
409     if ('c' == config.print[0])
410         print_icu_converters(&config);
411     else if ('l' == config.print[0])
412         print_icu_xml_locales(&config);
413     else if ('t' == config.print[0])
414         print_icu_transliterators(&config);
415     else {
416         print_icu_converters(&config);
417         print_icu_xml_locales(&config);
418         print_icu_transliterators(&config);
419     }
420
421     if (p_config->xmloutput)
422         fprintf(config.outfile, "</icu>\n");
423
424     exit(0);
425 };
426
427
428
429 static void process_text_file(const struct config_t *p_config)
430 {
431     char *line = 0;
432     char linebuf[1024];
433  
434     xmlDoc *doc = xmlParseFile(config.conffile);  
435     xmlNode *xml_node = xmlDocGetRootElement(doc);
436
437     long unsigned int token_count = 0;    
438     long unsigned int line_count = 0;    
439     
440     UErrorCode status = U_ZERO_ERROR;
441     int success = 0;
442     
443     
444     config.chain = icu_chain_xml_config(xml_node, &status);
445
446     if (config.chain && U_SUCCESS(status))
447         success = 1;
448
449     if (p_config->xmloutput)
450         fprintf(config.outfile,
451                 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
452                 "<icu>\n"
453                 "<tokens>\n");
454     
455     // read input lines for processing
456     while ((line=fgets(linebuf, sizeof(linebuf)-1, config.infile)))
457     {
458         success = icu_chain_assign_cstr(config.chain, line, &status);
459         line_count++;
460
461         while (success && icu_chain_next_token(config.chain, &status)){
462             if (U_FAILURE(status))
463                 success = 0;
464             else {
465                 token_count++;
466                 if (p_config->xmloutput)                    
467                     fprintf(config.outfile, 
468                             "<token id=\%lu\" line=\"%lu\""
469                             " norm=\"%s\" display=\"%s\"/>\n",
470                             token_count,
471                             line_count,
472                             icu_chain_get_norm(config.chain),
473                             icu_chain_get_display(config.chain));
474                 else
475                     fprintf(config.outfile, "%lu %lu '%s' '%s'\n",
476                             token_count,
477                             line_count,
478                             icu_chain_get_norm(config.chain),
479                             icu_chain_get_display(config.chain));
480             }
481         }
482         
483     }
484
485     if (p_config->xmloutput)
486         fprintf(config.outfile, 
487                 "</tokens>\n"
488                 "</icu>\n");
489
490     icu_chain_destroy(config.chain);
491     xmlFreeDoc(doc);
492     if (line)
493         free(line);
494 };
495
496 #endif // HAVE_ICU
497
498
/*
 * Program entry point.
 *
 * Built with HAVE_ICU: parses the command line into the global config,
 * runs process_text_file() when a config file was given with -c, and
 * print_info() when -p was given.
 *
 * Built without HAVE_ICU: only prints a notice that ICU is unavailable.
 *
 * Returns 0 whenever control reaches the end of main.
 */
int main(int argc, char **argv)
{
#ifdef HAVE_ICU

    read_params(argc, argv, &config);

    if (config.conffile[0] != '\0')
        process_text_file(&config);

    if (config.print[0] != '\0')
        print_info(&config);

#else /* HAVE_ICU */

    printf("ICU not available on your system.\n"
           "Please install libicu36-dev and icu-doc or similar, "
           "re-configure and re-compile\n");

#endif /* HAVE_ICU */

    return 0;
}
523
524
525 /*
526  * Local variables:
527  * c-basic-offset: 4
528  * indent-tabs-mode: nil
529  * End:
530  * vim: shiftwidth=4 tabstop=8 expandtab
531  */
532