ICU: iterator type
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2009 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/log.h>
21
22 #include <string.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26
27 #include <unicode/ustring.h>  /* some more string fcns*/
28 #include <unicode/uchar.h>    /* char names           */
29
/* Kinds of processing step that can be linked into an ICU chain. */
enum icu_chain_step_type {
    /* zero value; the step constructors reject it via their !type check */
    ICU_chain_step_type_none,
    ICU_chain_step_type_display,   /* convert to utf8 display format */
    ICU_chain_step_type_casemap,   /* apply utf16 casemap */
    ICU_chain_step_type_transform, /* apply utf16 transform */
    ICU_chain_step_type_tokenize,  /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate  /* apply utf16 transliteration rules */
};
38
/* One processing step of a chain; steps are linked through 'previous'. */
struct icu_chain_step
{
    /* type and action object */
    enum icu_chain_step_type type;
    union {
        /* set for casemap steps */
        struct icu_casemap * casemap;
        /* set for transform and transliterate steps */
        struct icu_transform * transform;
        /* set for tokenize steps */
        struct icu_tokenizer * tokenizer;  
    } u;
    /* temporary post-action utf16 buffer */
    struct icu_buf_utf16 * buf16;  
    /* the step that was inserted before this one, or 0 for the first step */
    struct icu_chain_step * previous;
    /* non-zero while this step may still deliver tokens */
    int more_tokens;
    /* non-zero when the step must fetch fresh input before working */
    int need_new_token;
};
54
/* A chain of ICU processing steps together with its working buffers. */
struct icu_chain
{
    /* private copy of the locale name given to icu_chain_create */
    char *locale;
    /* non-zero: also produce a sort key for every token */
    int sort;

    /* UTF-8 input as handed to icu_chain_assign_cstr; the pointer is
       stored, the string is not copied */
    const char * src8cstr;

    /* collator opened for 'locale' */
    UCollator * coll;
    
    /* number of tokens returned so far */
    int32_t token_count;
    
    /* utf8 output buffers */
    struct icu_buf_utf8 * display8;
    struct icu_buf_utf8 * norm8;
    struct icu_buf_utf8 * sort8;
    
    /* utf16 source buffer */
    struct icu_buf_utf16 * src16;
    
    /* linked list of chain steps */
    struct icu_chain_step * steps;
};
78
79 int icu_check_status(UErrorCode status)
80 {
81     if (U_FAILURE(status))
82     {
83         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
84         return 0;   
85     }
86     return 1;
87 }
88
89 static struct icu_chain_step *icu_chain_step_create(
90     struct icu_chain * chain,  enum icu_chain_step_type type,
91     const uint8_t * rule, struct icu_buf_utf16 * buf16,
92     UErrorCode *status)
93 {
94     struct icu_chain_step * step = 0;
95     
96     if(!chain || !type || !rule)
97         return 0;
98
99     step = (struct icu_chain_step *) xmalloc(sizeof(struct icu_chain_step));
100
101     step->type = type;
102
103     step->buf16 = buf16;
104
105     /* create auxilary objects */
106     switch (step->type)
107     {
108     case ICU_chain_step_type_display:
109         break;
110     case ICU_chain_step_type_casemap:
111         step->u.casemap = icu_casemap_create(rule[0], status);
112         break;
113     case ICU_chain_step_type_transform:
114         /* rule omitted. Only ID used */
115         step->u.transform = icu_transform_create((const char *) rule, 'f',
116                                                  0, status);
117         break;
118     case ICU_chain_step_type_tokenize:
119         step->u.tokenizer = icu_tokenizer_create((char *) chain->locale, 
120                                                  (char) rule[0], status);
121         break;
122     case ICU_chain_step_type_transliterate:
123         /* we pass a dummy ID to utrans_openU.. */
124         step->u.transform = icu_transform_create("custom", 'f',
125                                                  (const char *) rule, status);
126         break;
127     default:
128         break;
129     }
130     return step;
131 }
132
133
134 static void icu_chain_step_destroy(struct icu_chain_step * step)
135 {
136     if (!step)
137         return;
138
139     icu_chain_step_destroy(step->previous);
140
141     switch (step->type)
142     {
143     case ICU_chain_step_type_display:
144         break;
145     case ICU_chain_step_type_casemap:
146         icu_casemap_destroy(step->u.casemap);
147         icu_buf_utf16_destroy(step->buf16);
148         break;
149     case ICU_chain_step_type_transform:
150     case ICU_chain_step_type_transliterate:
151         icu_transform_destroy(step->u.transform);
152         icu_buf_utf16_destroy(step->buf16);
153         break;
154     case ICU_chain_step_type_tokenize:
155         icu_tokenizer_destroy(step->u.tokenizer);
156         icu_buf_utf16_destroy(step->buf16);
157         break;
158     default:
159         break;
160     }
161     xfree(step);
162 }
163
164 struct icu_chain *icu_chain_create(const char *locale, int sort,
165                                    UErrorCode * status)
166 {
167     struct icu_chain * chain 
168         = (struct icu_chain *) xmalloc(sizeof(struct icu_chain));
169
170     *status = U_ZERO_ERROR;
171
172     chain->locale = xstrdup(locale);
173
174     chain->sort = sort;
175
176     chain->coll = ucol_open((const char *) chain->locale, status);
177
178     if (U_FAILURE(*status))
179         return 0;
180
181     chain->token_count = 0;
182
183     chain->src8cstr = 0;
184
185     chain->display8 = icu_buf_utf8_create(0);
186     chain->norm8 = icu_buf_utf8_create(0);
187     chain->sort8 = icu_buf_utf8_create(0);
188
189     chain->src16 = icu_buf_utf16_create(0);
190
191     chain->steps = 0;
192
193     return chain;
194 }
195
196 void icu_chain_destroy(struct icu_chain * chain)
197 {
198     if (chain)
199     {
200         if (chain->coll)
201             ucol_close(chain->coll);
202
203         icu_buf_utf8_destroy(chain->display8);
204         icu_buf_utf8_destroy(chain->norm8);
205         icu_buf_utf8_destroy(chain->sort8);
206         
207         icu_buf_utf16_destroy(chain->src16);
208     
209         icu_chain_step_destroy(chain->steps);
210         xfree(chain->locale);
211         xfree(chain);
212     }
213 }
214
/* Forward declaration: the definition follows icu_chain_xml_config,
   which needs to call it. */
static struct icu_chain_step *icu_chain_insert_step(
    struct icu_chain * chain, enum icu_chain_step_type type,
    const uint8_t * rule, UErrorCode *status);
218
219 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node, 
220                                         int sort,
221                                         UErrorCode * status)
222 {
223     xmlNode *node = 0;
224     struct icu_chain * chain = 0;
225    
226     *status = U_ZERO_ERROR;
227
228     if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
229         return 0;
230     
231     {
232         xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node, 
233                                           (xmlChar *) "locale");
234         
235         if (xml_locale)
236         {
237             chain = icu_chain_create((const char *) xml_locale, sort, status);
238             xmlFree(xml_locale);
239         }
240         
241     }
242     if (!chain)
243         return 0;
244
245     for (node = xml_node->children; node; node = node->next)
246     {
247         xmlChar *xml_rule;
248         struct icu_chain_step * step = 0;
249
250         if (node->type != XML_ELEMENT_NODE)
251             continue;
252
253         xml_rule = xmlGetProp(node, (xmlChar *) "rule");
254
255         if (!strcmp((const char *) node->name, "casemap"))
256             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap, 
257                                          (const uint8_t *) xml_rule, status);
258         else if (!strcmp((const char *) node->name, "transform"))
259             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, 
260                                          (const uint8_t *) xml_rule, status);
261         else if (!strcmp((const char *) node->name, "transliterate"))
262             step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate, 
263                                          (const uint8_t *) xml_rule, status);
264         else if (!strcmp((const char *) node->name, "tokenize"))
265             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize, 
266                                          (const uint8_t *) xml_rule, status);
267         else if (!strcmp((const char *) node->name, "display"))
268             step = icu_chain_insert_step(chain, ICU_chain_step_type_display, 
269                                          (const uint8_t *) "", status);
270         else if (!strcmp((const char *) node->name, "normalize"))
271         {
272             yaz_log(YLOG_WARN, "Element %s is deprecated. "
273                     "Use transform instead", node->name);
274             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform, 
275                                          (const uint8_t *) xml_rule, status);
276         }
277         else if (!strcmp((const char *) node->name, "index")
278                  || !strcmp((const char *) node->name, "sortkey"))
279         {
280             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
281                     "Remove it from the configuration", node->name);
282         }
283         else
284         {
285             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
286             icu_chain_destroy(chain);
287             return 0;
288         }
289         xmlFree(xml_rule);
290         if (step && U_FAILURE(*status))
291         {
292             icu_chain_destroy(chain);
293             return 0;
294         }
295     }
296     return chain;
297 }
298
299 static struct icu_chain_step *icu_chain_insert_step(
300     struct icu_chain * chain, enum icu_chain_step_type type,
301     const uint8_t * rule, UErrorCode *status)
302 {    
303     struct icu_chain_step * step = 0;
304     struct icu_buf_utf16 * src16 = 0;
305     struct icu_buf_utf16 * buf16 = 0;
306
307     if (!chain || !type || !rule)
308         return 0;
309
310     /* assign utf16 src buffers as needed */
311     if (chain->steps && chain->steps->buf16)
312         src16 = chain->steps->buf16;
313     else if (chain->src16)
314         src16 = chain->src16;
315     else
316         return 0;
317
318     /* create utf16 destination buffers as needed, or */
319     switch (type)
320     {
321     case ICU_chain_step_type_display:
322         buf16 = src16;
323         break;
324     case ICU_chain_step_type_casemap:
325         buf16 = icu_buf_utf16_create(0);
326         break;
327     case ICU_chain_step_type_transform:
328     case ICU_chain_step_type_transliterate:
329         buf16 = icu_buf_utf16_create(0);
330         break;
331     case ICU_chain_step_type_tokenize:
332         buf16 = icu_buf_utf16_create(0);
333         break;
334         break;
335     default:
336         break;
337     }
338     /* create actual chain step with this buffer */
339     step = icu_chain_step_create(chain, type, rule, buf16, status);
340
341     step->previous = chain->steps;
342     chain->steps = step;
343
344     return step;
345 }
346
/* Advance one step of the chain by one token.
 *
 * The step pulls its input from the previous step (recursively), or from
 * chain->src16 when it is the first step, and then performs its own
 * action. Display steps write to chain->display8; all other steps write
 * to step->buf16.
 *
 * Returns 1 when the step performed its action without an ICU failure,
 * 0 when there is nothing (more) to process, the step type is unknown,
 * or *status signals failure. Tokenize steps may also return the value
 * of their own recursive call, see below. */
static int icu_chain_step_next_token(struct icu_chain * chain,
                                     struct icu_chain_step * step,
                                     UErrorCode *status)
{
    struct icu_buf_utf16 * src16 = 0;
    int got_new_token = 0;

    /* nothing to do without a chain, a source buffer, a step, or when
       this step has already run dry */
    if (!chain || !chain->src16 || !step || !step->more_tokens)
        return 0;

    /* assign utf16 src buffers as needed, advance in previous steps
       tokens until non-zero token met, and setting stop condition */

    if (step->previous)
    {
        src16 = step->previous->buf16;
        /* tokens might be killed in previous steps, therefore looping */

        while (step->need_new_token 
               && step->previous->more_tokens
               && !got_new_token)
            got_new_token
                = icu_chain_step_next_token(chain, step->previous, status);
    }
    else 
    { /* first step can only work once on chain->src16 input buffer */
        src16 = chain->src16;
        step->more_tokens = 0;
        got_new_token = 1;
    }

    if (!src16)
        return 0;

    /* stop if nothing to process */
    if (step->need_new_token && !got_new_token)
    {
        step->more_tokens = 0;
        return 0;
    }

    /* either an old token not finished yet, or a new token, thus
       perform the work, eventually put this steps output in 
       step->buf16 or the chains UTF8 output buffers  */

    switch (step->type)
    {
    case ICU_chain_step_type_display:
        icu_utf16_to_utf8(chain->display8, src16, status);
        break;
    case ICU_chain_step_type_casemap:
        icu_casemap_casemap(step->u.casemap,
                            step->buf16, src16, status,
                            chain->locale);
        break;
    case ICU_chain_step_type_transform:
    case ICU_chain_step_type_transliterate:
        icu_transform_trans(step->u.transform,
                            step->buf16, src16, status);
        break;
    case ICU_chain_step_type_tokenize:
        /* attach to new src16 token only first time during splitting */
        if (step->need_new_token)
        {
            icu_tokenizer_attach(step->u.tokenizer, src16, status);
            step->need_new_token = 0;
        }

        /* splitting one src16 token into multiple buf16 tokens */
        step->more_tokens
            = icu_tokenizer_next_token(step->u.tokenizer,
                                       step->buf16, status);

        /* make sure to get new previous token if this one had been used up
           by recursive call to _same_ step */

        /* NOTE(review): step->more_tokens is 0 when the recursive call
           below is made, so the guard at the top of this function makes
           it return 0 at once; need_new_token is also never set back to
           1 here. Confirm whether fetching a fresh input token was
           really intended to work. */
        if (!step->more_tokens)
        {
            step->more_tokens = icu_chain_step_next_token(chain, step, status);
            return step->more_tokens;  /* avoid one token count too much! */
        }
        break;
    default:
        return 0;
        break;
    }

    if (U_FAILURE(*status))
        return 0;

    /* if token disappeared into thin air, tell caller */
    /* if (!step->buf16->utf16_len && !step->more_tokens) */ 
    /*    return 0; */ 

    return 1;
}
443
/* Iterator that delivers the tokens a chain produces for one input string. */
struct icu_iter {
    /* chain whose steps are applied; not released by icu_iter_destroy */
    struct icu_chain *chain;
    /* token computed ahead of time, handed out by the next call to
       icu_iter_next; 0 when there are no more tokens */
    struct icu_buf_utf16 *next;
    /* ICU status of the most recent operations */
    UErrorCode status;
    /* utf8 text written by display steps */
    struct icu_buf_utf8 *display;
    /* sort key of the token last returned, filled in when chain->sort is set */
    struct icu_buf_utf8 *sort8;
};
451
452 static void utf16_print(struct icu_buf_utf16 *src16)
453 {
454     UErrorCode status = U_ZERO_ERROR;
455     const char *p;
456     struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
457     icu_utf16_to_utf8(dst8, src16, &status);
458
459     assert(status != 1234);
460     if (U_FAILURE(status))
461     {
462         printf("utf8:failure\n");
463     }
464     else
465     {
466         p = icu_buf_utf8_to_cstr(dst8);
467         printf("utf8:%s\n", p);
468     }
469     icu_buf_utf8_destroy(dst8);
470 }
471
472 struct icu_buf_utf16 *icu_iter_invoke(struct icu_iter *iter,
473                                       struct icu_chain_step *step,
474                                       struct icu_buf_utf16 *src)
475 {
476     if (!step)
477         return src;
478     else
479     {
480         struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
481         
482         switch (step->type)
483         {
484         case ICU_chain_step_type_casemap:
485             if (dst)
486             {
487                 struct icu_buf_utf16 *src = dst;
488
489                 dst = icu_buf_utf16_create(0);
490                 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
491                                     iter->chain->locale);
492                 icu_buf_utf16_destroy(src);
493             }
494             break;
495         case ICU_chain_step_type_tokenize:
496             if (dst)
497             {
498                 struct icu_buf_utf16 *src = dst;
499
500                 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
501                 icu_buf_utf16_destroy(src);
502             }
503             dst = icu_buf_utf16_create(0);
504             iter->status = U_ZERO_ERROR;
505             if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
506             {
507                 icu_buf_utf16_destroy(dst);
508                 dst = 0;
509             }
510             break;
511         case ICU_chain_step_type_transform:
512         case ICU_chain_step_type_transliterate:
513             if (dst)
514             {
515                 struct icu_buf_utf16 *src = dst;
516                 dst = icu_buf_utf16_create(0);
517                 icu_transform_trans(step->u.transform, dst, src, &iter->status);
518                 icu_buf_utf16_destroy(src);
519             }
520             break;
521         case ICU_chain_step_type_display:
522             if (dst)
523                 icu_utf16_to_utf8(iter->display, dst, &iter->status);
524             break;
525         default:
526             assert(0);
527         }
528         return dst;
529     }
530 }
531
532 struct icu_iter *icu_iter_create(struct icu_chain *chain,
533                                  const char *src8cstr)
534 {
535     if (!src8cstr)
536         return 0;
537     else
538     {
539         struct icu_buf_utf16 *src16 = icu_buf_utf16_create(0);
540         struct icu_iter *iter = xmalloc(sizeof(*iter));
541         iter->chain = chain;
542         iter->status = U_ZERO_ERROR;
543         iter->display = icu_buf_utf8_create(0);
544         iter->sort8 = icu_buf_utf8_create(0);
545
546         icu_utf16_from_utf8_cstr(src16, src8cstr, &iter->status);
547         iter->next = icu_iter_invoke(iter, chain->steps, src16);
548         return iter;
549     }
550 }
551
552 void icu_iter_destroy(struct icu_iter *iter)
553 {
554     if (iter)
555     {
556         icu_buf_utf8_destroy(iter->display);
557         icu_buf_utf8_destroy(iter->sort8);
558         xfree(iter);
559     }
560 }
561
562 int icu_iter_next(struct icu_iter *iter, struct icu_buf_utf8 *result)
563 {
564     struct icu_buf_utf16 *last = iter->next;
565     if (!last)
566         return 0;
567     else
568     {
569         if (iter->chain->sort)
570         {        
571             icu_sortkey8_from_utf16(iter->chain->coll,
572                                     iter->sort8, last,
573                                     &iter->status);
574         }
575         icu_utf16_to_utf8(result, last, &iter->status);
576         iter->next = icu_iter_invoke(iter, iter->chain->steps, 0);
577         icu_buf_utf16_destroy(last);
578         return 1;
579     }
580 }
581
582 const char *icu_iter_get_sortkey(struct icu_iter *iter)
583 {
584     return icu_buf_utf8_to_cstr(iter->sort8);
585 }
586
587 const char *icu_iter_get_display(struct icu_iter *iter)
588
589     return icu_buf_utf8_to_cstr(iter->display);   
590 }
591
592 int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr, 
593                           UErrorCode *status)
594 {
595     struct icu_chain_step * stp = 0; 
596
597     if (!chain || !src8cstr)
598         return 0;
599
600     chain->src8cstr = src8cstr;
601
602     stp = chain->steps;
603     
604     /* clear token count */
605     chain->token_count = 0;
606
607     /* clear all steps stop states */
608     while (stp)
609     {
610         stp->more_tokens = 1;
611         stp->need_new_token = 1;
612         stp = stp->previous;
613     }
614     
615     /* finally convert UTF8 to UTF16 string if needed */
616     if (chain->steps || chain->sort)
617         icu_utf16_from_utf8_cstr(chain->src16, chain->src8cstr, status);
618             
619     if (U_FAILURE(*status))
620         return 0;
621
622     return 1;
623 }
624
625 int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
626 {
627     int got_token = 0;
628     
629     *status = U_ZERO_ERROR;
630
631     if (!chain)
632         return 0;
633
634     /* special case with no steps - same as index type binary */
635     if (!chain->steps)
636     {
637         if (chain->token_count)
638             return 0;
639         else
640         {
641             chain->token_count++;
642             
643             if (chain->sort)
644                 icu_sortkey8_from_utf16(chain->coll,
645                                         chain->sort8, chain->steps->buf16,
646                                         status);
647             return chain->token_count;
648         }
649     }
650     /* usual case, one or more icu chain steps existing */
651     else 
652     {
653         while (!got_token && chain->steps && chain->steps->more_tokens)
654             got_token = icu_chain_step_next_token(chain, chain->steps, status);
655
656         if (got_token)
657         {
658             chain->token_count++;
659
660             icu_utf16_to_utf8(chain->norm8, chain->steps->buf16, status);
661             
662             if (chain->sort)
663                 icu_sortkey8_from_utf16(chain->coll,
664                                         chain->sort8, chain->steps->buf16,
665                                         status);
666             return chain->token_count;
667         }
668     }
669         
670     return 0;
671 }
672
673 int icu_chain_token_number(struct icu_chain * chain)
674 {
675     if (!chain)
676         return 0;
677     
678     return chain->token_count;
679 }
680
681 const char * icu_chain_token_display(struct icu_chain * chain)
682 {
683     if (chain->display8)
684         return icu_buf_utf8_to_cstr(chain->display8);
685     
686     return 0;
687 }
688
689 const char * icu_chain_token_norm(struct icu_chain * chain)
690 {
691     if (!chain->steps)
692         return chain->src8cstr;
693
694     if (chain->norm8)
695         return icu_buf_utf8_to_cstr(chain->norm8);
696     
697     return 0;
698 }
699
700 const char * icu_chain_token_sortkey(struct icu_chain * chain)
701 {
702     if (chain->sort8)
703         return icu_buf_utf8_to_cstr(chain->sort8);
704     
705     return 0;
706 }
707
708 #endif /* YAZ_HAVE_ICU */
709
710 /*
711  * Local variables:
712  * c-basic-offset: 4
713  * c-file-style: "Stroustrup"
714  * indent-tabs-mode: nil
715  * End:
716  * vim: shiftwidth=4 tabstop=8 expandtab
717  */
718