Split ICU wrapper library into several sources
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2009 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/log.h>
21
22 #include <string.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25
26 #include <unicode/ustring.h>  /* some more string fcns*/
27 #include <unicode/uchar.h>    /* char names           */
28
/* The kinds of processing a single chain step can perform. */
enum icu_chain_step_type {
    ICU_chain_step_type_none          = 0, /* no action; rejected on insert */
    ICU_chain_step_type_display       = 1, /* convert to utf8 display format */
    ICU_chain_step_type_casemap       = 2, /* apply utf16 case mapping */
    ICU_chain_step_type_transform     = 3, /* apply utf16 transform (by ID) */
    ICU_chain_step_type_tokenize      = 4, /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate = 5  /* apply utf16 transform (by rule) */
};
37
38 struct icu_chain_step
39 {
40     /* type and action object */
41     enum icu_chain_step_type type;
42     union {
43         struct icu_casemap * casemap;
44         struct icu_transform * transform;
45         struct icu_tokenizer * tokenizer;  
46     } u;
47     /* temprary post-action utf16 buffer */
48     struct icu_buf_utf16 * buf16;  
49     struct icu_chain_step * previous;
50     int more_tokens;
51     int need_new_token;
52 };
53
54 struct icu_chain
55 {
56     char *locale;
57     int sort;
58
59     const char * src8cstr;
60
61     UCollator * coll;
62     
63     /* number of tokens returned so far */
64     int32_t token_count;
65     
66     /* utf8 output buffers */
67     struct icu_buf_utf8 * display8;
68     struct icu_buf_utf8 * norm8;
69     struct icu_buf_utf8 * sort8;
70     
71     /* utf16 source buffer */
72     struct icu_buf_utf16 * src16;
73     
74     /* linked list of chain steps */
75     struct icu_chain_step * steps;
76 };
77
78 int icu_check_status(UErrorCode status)
79 {
80     if (U_FAILURE(status))
81     {
82         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
83         return 0;   
84     }
85     return 1;
86 }
87
88 static struct icu_chain_step *icu_chain_step_create(
89     struct icu_chain * chain,  enum icu_chain_step_type type,
90     const uint8_t * rule, struct icu_buf_utf16 * buf16,
91     UErrorCode *status)
92 {
93     struct icu_chain_step * step = 0;
94     
95     if(!chain || !type || !rule)
96         return 0;
97
98     step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
99
100     step->type = type;
101
102     step->buf16 = buf16;
103
104     /* create auxilary objects */
105     switch (step->type)
106     {
107     case ICU_chain_step_type_display:
108         break;
109     case ICU_chain_step_type_casemap:
110         step->u.casemap = icu_casemap_create(rule[0], status);
111         break;
112     case ICU_chain_step_type_transform:
113         /* rule omitted. Only ID used */
114         step->u.transform = icu_transform_create((const char *) rule, 'f',
115                                                  0, status);
116         break;
117     case ICU_chain_step_type_tokenize:
118         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, 
119                                                  (char) rule[0], status);
120         break;
121     case ICU_chain_step_type_transliterate:
122         /* we pass a dummy ID to utrans_openU.. */
123         step->u.transform = icu_transform_create("custom", 'f',
124                                                  (const char *) rule, status);
125         break;
126     default:
127         break;
128     }
129     return step;
130 }
131
132
133 static void icu_chain_step_destroy(struct icu_chain_step * step)
134 {
135     if (!step)
136         return;
137
138     icu_chain_step_destroy(step->previous);
139
140     switch (step->type)
141     {
142     case ICU_chain_step_type_display:
143         break;
144     case ICU_chain_step_type_casemap:
145         icu_casemap_destroy(step->u.casemap);
146         icu_buf_utf16_destroy(step->buf16);
147         break;
148     case ICU_chain_step_type_transform:
149     case ICU_chain_step_type_transliterate:
150         icu_transform_destroy(step->u.transform);
151         icu_buf_utf16_destroy(step->buf16);
152         break;
153     case ICU_chain_step_type_tokenize:
154         icu_tokenizer_destroy(step->u.tokenizer);
155         icu_buf_utf16_destroy(step->buf16);
156         break;
157     default:
158         break;
159     }
160     xfree(step);
161 }
162
163 struct icu_chain *icu_chain_create(const char *locale, int sort,
164                                    UErrorCode * status)
165 {
166     struct icu_chain * chain 
167         = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
168
169     *status = U_ZERO_ERROR;
170
171     chain->locale = xstrdup(locale);
172
173     chain->sort = sort;
174
175     chain->coll = ucol_open((const char *) chain->locale, status);
176
177     if (U_FAILURE(*status))
178         return 0;
179
180     chain->token_count = 0;
181
182     chain->src8cstr = 0;
183
184     chain->display8 = icu_buf_utf8_create(0);
185     chain->norm8 = icu_buf_utf8_create(0);
186     chain->sort8 = icu_buf_utf8_create(0);
187
188     chain->src16 = icu_buf_utf16_create(0);
189
190     chain->steps = 0;
191
192     return chain;
193 }
194
195 void icu_chain_destroy(struct icu_chain * chain)
196 {
197     if (chain)
198     {
199         if (chain->coll)
200             ucol_close(chain->coll);
201
202         icu_buf_utf8_destroy(chain->display8);
203         icu_buf_utf8_destroy(chain->norm8);
204         icu_buf_utf8_destroy(chain->sort8);
205         
206         icu_buf_utf16_destroy(chain->src16);
207     
208         icu_chain_step_destroy(chain->steps);
209         xfree(chain->locale);
210         xfree(chain);
211     }
212 }
213
214 static struct icu_chain_step *icu_chain_insert_step(
215     struct icu_chain * chain, enum icu_chain_step_type type,
216     const uint8_t * rule, UErrorCode *status);
217
218 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node, 
219                                         int sort,
220                                         UErrorCode * status)
221 {
222     xmlNode *node = 0;
223     struct icu_chain * chain = 0;
224    
225     *status = U_ZERO_ERROR;
226
227     if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
228         return 0;
229     
230     {
231         xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node, 
232                                           (xmlChar *) "locale");
233         
234         if (xml_locale)
235         {
236             chain = icu_chain_create((const char *) xml_locale, sort, status);
237             xmlFree(xml_locale);
238         }
239         
240     }
241     if (!chain)
242         return 0;
243
244     for (node = xml_node->children; node; node = node->next)
245     {
246         xmlChar *xml_rule;
247         struct icu_chain_step * step = 0;
248
249         if (node->type != XML_ELEMENT_NODE)
250             continue;
251
252         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
253
254         if (!strcmp((const char *) node->name, "casemap"))
255             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, 
256                                          (const uint8_t *) xml_rule, status);
257         else if (!strcmp((const char *) node->name, "transform"))
258             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, 
259                                          (const uint8_t *) xml_rule, status);
260         else if (!strcmp((const char *) node->name, "transliterate"))
261             step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate, 
262                                          (const uint8_t *) xml_rule, status);
263         else if (!strcmp((const char *) node->name, "tokenize"))
264             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, 
265                                          (const uint8_t *) xml_rule, status);
266         else if (!strcmp((const char *) node->name, "display"))
267             step = icu_chain_insert_step(chain, ICU_chain_step_type_display, 
268                                          (const uint8_t *) "", status);
269         else if (!strcmp((const char *) node->name, "normalize"))
270         {
271             yaz_log(YLOG_WARN, "Element %s is deprecated. "
272                     "Use transform instead", node->name);
273             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, 
274                                          (const uint8_t *) xml_rule, status);
275         }
276         else if (!strcmp((const char *) node->name, "index")
277                  || !strcmp((const char *) node->name, "sortkey"))
278         {
279             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
280                     "Remove it from the configuration", node->name);
281         }
282         else
283         {
284             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
285             icu_chain_destroy(chain);
286             return 0;
287         }
288         xmlFree(xml_rule);
289         if (step && U_FAILURE(*status))
290         {
291             icu_chain_destroy(chain);
292             return 0;
293         }
294     }
295     return chain;
296 }
297
298 static struct icu_chain_step *icu_chain_insert_step(
299     struct icu_chain * chain, enum icu_chain_step_type type,
300     const uint8_t * rule, UErrorCode *status)
301 {    
302     struct icu_chain_step * step = 0;
303     struct icu_buf_utf16 * src16 = 0;
304     struct icu_buf_utf16 * buf16 = 0;
305
306     if (!chain || !type || !rule)
307         return 0;
308
309     /* assign utf16 src buffers as needed */
310     if (chain->steps && chain->steps->buf16)
311         src16 = chain->steps->buf16;
312     else if (chain->src16)
313         src16 = chain->src16;
314     else
315         return 0;
316
317     /* create utf16 destination buffers as needed, or */
318     switch (type)
319     {
320     case ICU_chain_step_type_display:
321         buf16 = src16;
322         break;
323     case ICU_chain_step_type_casemap:
324         buf16 = icu_buf_utf16_create(0);
325         break;
326     case ICU_chain_step_type_transform:
327     case ICU_chain_step_type_transliterate:
328         buf16 = icu_buf_utf16_create(0);
329         break;
330     case ICU_chain_step_type_tokenize:
331         buf16 = icu_buf_utf16_create(0);
332         break;
333         break;
334     default:
335         break;
336     }
337     /* create actual chain step with this buffer */
338     step = icu_chain_step_create(chain, type, rule, buf16, status);
339
340     step->previous = chain->steps;
341     chain->steps = step;
342
343     return step;
344 }
345
/*
 * Let one step produce its next output.
 *
 * The step takes its input from the previous step's buf16 (first
 * advancing the previous step recursively when a new input token is
 * needed), or from chain->src16 if it is the first step. The result
 * is written to step->buf16, or to chain->display8 for display steps.
 *
 * Returns 1 if the step performed its action without an ICU failure,
 * and 0 if there was nothing to process, the step type is unknown, or
 * *status reports a failure.
 */
static int icu_chain_step_next_token(struct icu_chain * chain,
                                     struct icu_chain_step * step,
                                     UErrorCode *status)
{
    struct icu_buf_utf16 * src16 = 0;
    int got_new_token = 0;

    /* a step whose more_tokens flag is cleared is never run again
       until the flags are set anew */
    if (!chain || !chain->src16 || !step || !step->more_tokens)
        return 0;

    /* assign utf16 src buffers as needed, advance in previous steps
       tokens until non-zero token met, and setting stop condition */

    if (step->previous)
    {
        src16 = step->previous->buf16;
        /* tokens might be killed in previous steps, therefore looping */

        while (step->need_new_token 
               && step->previous->more_tokens
               && !got_new_token)
            got_new_token
                = icu_chain_step_next_token(chain, step->previous, status);
    }
    else 
    { /* first step can only work once on chain->src16 input buffer */
        src16 = chain->src16;
        step->more_tokens = 0;
        got_new_token = 1;
    }

    if (!src16)
        return 0;

    /* stop if nothing to process */
    if (step->need_new_token && !got_new_token)
    {
        step->more_tokens = 0;
        return 0;
    }

    /* either an old token not finished yet, or a new token, thus
       perform the work, eventually put this steps output in 
       step->buf16 or the chains UTF8 output buffers  */

    switch (step->type)
    {
    case ICU_chain_step_type_display:
        /* display steps only fill the chain's display buffer */
        icu_utf16_to_utf8(chain->display8, src16, status);
        break;
    case ICU_chain_step_type_casemap:
        icu_casemap_casemap(step->u.casemap,
                            step->buf16, src16, status,
                            chain->locale);
        break;
    case ICU_chain_step_type_transform:
    case ICU_chain_step_type_transliterate:
        icu_transform_trans(step->u.transform,
                            step->buf16, src16, status);
        break;
    case ICU_chain_step_type_tokenize:
        /* attach to new src16 token only first time during splitting */
        if (step->need_new_token)
        {
            icu_tokenizer_attach(step->u.tokenizer, src16, status);
            step->need_new_token = 0;
        }

        /* splitting one src16 token into multiple buf16 tokens */
        step->more_tokens
            = icu_tokenizer_next_token(step->u.tokenizer,
                                       step->buf16, status);

        /* make sure to get new previous token if this one had been used up
           by recursive call to _same_ step */

        /* NOTE(review): more_tokens is 0 when this branch is entered,
           so the recursive call below appears to return 0 at the
           entry guard without fetching a new input token, and
           need_new_token is not set back to 1 here - confirm whether
           that is intended */
        if (!step->more_tokens)
        {
            step->more_tokens = icu_chain_step_next_token(chain, step, status);
            return step->more_tokens;  /* avoid one token count too much! */
        }
        break;
    default:
        return 0;
        break;
    }

    if (U_FAILURE(*status))
        return 0;

    /* if token disappeared into thin air, tell caller */
    /* if (!step->buf16->utf16_len && !step->more_tokens) */ 
    /*    return 0; */ 

    return 1;
}
442
443 int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr, 
444                           UErrorCode *status)
445 {
446     struct icu_chain_step * stp = 0; 
447
448     if (!chain || !src8cstr)
449         return 0;
450
451     chain->src8cstr = src8cstr;
452
453     stp = chain->steps;
454     
455     /* clear token count */
456     chain->token_count = 0;
457
458     /* clear all steps stop states */
459     while (stp)
460     {
461         stp->more_tokens = 1;
462         stp->need_new_token = 1;
463         stp = stp->previous;
464     }
465     
466     /* finally convert UTF8 to UTF16 string if needed */
467     if (chain->steps || chain->sort)
468         icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
469             
470     if (U_FAILURE(*status))
471         return 0;
472
473     return 1;
474 }
475
476 int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
477 {
478     int got_token = 0;
479     
480     *status = U_ZERO_ERROR;
481
482     if (!chain)
483         return 0;
484
485     /* special case with no steps - same as index type binary */
486     if (!chain->steps)
487     {
488         if (chain->token_count)
489             return 0;
490         else
491         {
492             chain->token_count++;
493             
494             if (chain->sort)
495                 icu_sortkey8_from_utf16(chain->coll,
496                                         chain->sort8, chain->steps->buf16,
497                                         status);
498             return chain->token_count;
499         }
500     }
501     /* usual case, one or more icu chain steps existing */
502     else 
503     {
504         while (!got_token && chain->steps && chain->steps->more_tokens)
505             got_token = icu_chain_step_next_token(chain, chain->steps, status);
506
507         if (got_token)
508         {
509             chain->token_count++;
510
511             icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
512             
513             if (chain->sort)
514                 icu_sortkey8_from_utf16(chain->coll,
515                                         chain->sort8, chain->steps->buf16,
516                                         status);
517             return chain->token_count;
518         }
519     }
520         
521     return 0;
522 }
523
524 int icu_chain_token_number(struct icu_chain * chain)
525 {
526     if (!chain)
527         return 0;
528     
529     return chain->token_count;
530 }
531
532 const char * icu_chain_token_display(struct icu_chain * chain)
533 {
534     if (chain->display8)
535         return icu_buf_utf8_to_cstr(chain->display8);
536     
537     return 0;
538 }
539
540 const char * icu_chain_token_norm(struct icu_chain * chain)
541 {
542     if (!chain->steps)
543         return chain->src8cstr;
544
545     if (chain->norm8)
546         return icu_buf_utf8_to_cstr(chain->norm8);
547     
548     return 0;
549 }
550
551 const char * icu_chain_token_sortkey(struct icu_chain * chain)
552 {
553     if (chain->sort8)
554         return icu_buf_utf8_to_cstr(chain->sort8);
555     
556     return 0;
557 }
558
559 #endif /* YAZ_HAVE_ICU */
560
561 /*
562  * Local variables:
563  * c-basic-offset: 4
564  * c-file-style: "Stroustrup"
565  * indent-tabs-mode: nil
566  * End:
567  * vim: shiftwidth=4 tabstop=8 expandtab
568  */
569