fa6c96a9b0f609d29de35c9c28d3bc981f3baef7
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/stemmer.h>
21
22 #include <yaz/log.h>
23 #include <yaz/nmem.h>
24 #include <yaz/nmem_xml.h>
25 #include <string.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <assert.h>
29
30 #include <unicode/ustring.h>  /* some more string fcns*/
31 #include <unicode/uchar.h>    /* char names           */
32
/** Kinds of processing step that can be placed in an ICU chain. */
enum icu_chain_step_type {
    ICU_chain_step_type_none = 0,           /* no action */
    ICU_chain_step_type_display = 1,        /* convert to utf8 display format */
    ICU_chain_step_type_casemap = 2,        /* apply utf16 case mapping */
    ICU_chain_step_type_transform = 3,      /* apply utf16 transform (by ID) */
    ICU_chain_step_type_tokenize = 4,       /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate = 5,  /* apply utf16 transform (custom rules) */
    YAZ_chain_step_type_stemming = 6,       /* apply utf16 stemming (YAZ) */
    ICU_chain_step_type_join = 7            /* join tokens with a separator */
};
43
44 struct icu_chain_step
45 {
46     /* type and action object */
47     enum icu_chain_step_type type;
48     union {
49         struct icu_casemap   *casemap;
50         struct icu_transform *transform;
51         struct icu_tokenizer *tokenizer;
52         yaz_stemmer_p         stemmer;
53         struct icu_buf_utf16 *join;
54     } u;
55     struct icu_chain_step *previous;
56 };
57
58 struct icu_chain
59 {
60     yaz_icu_iter_t iter;
61     char *locale;
62     int sort;
63
64     UCollator *coll;
65
66     /* linked list of chain steps */
67     struct icu_chain_step *csteps;
68 };
69
70 int icu_check_status(UErrorCode status)
71 {
72     if (U_FAILURE(status))
73     {
74         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
75         return 0;
76     }
77     return 1;
78 }
79
80 static struct icu_chain_step *icu_chain_insert_step(
81     struct icu_chain *chain, enum icu_chain_step_type type,
82     const char *rule, UErrorCode *status)
83 {
84     struct icu_chain_step *step = 0;
85
86     assert(chain);
87     assert(type);
88
89     step = (struct icu_chain_step *) xmalloc(sizeof(*step));
90     step->type = type;
91
92     switch (step->type)
93     {
94     case ICU_chain_step_type_display:
95         break;
96     case ICU_chain_step_type_casemap:
97         assert(rule);
98         step->u.casemap = icu_casemap_create(rule[0], status);
99         break;
100     case ICU_chain_step_type_transform:
101         assert(rule);
102         /* rule omitted. Only ID used */
103         step->u.transform = icu_transform_create(rule, 'f', 0, status);
104         break;
105     case ICU_chain_step_type_tokenize:
106         assert(rule);
107         step->u.tokenizer = icu_tokenizer_create(chain->locale, rule[0], status);
108         break;
109     case ICU_chain_step_type_transliterate:
110         assert(rule);
111         /* we pass a dummy ID to utrans_openU.. */
112         step->u.transform = icu_transform_create("custom", 'f', rule, status);
113         break;
114     case YAZ_chain_step_type_stemming:
115         assert(rule);
116         step->u.stemmer = yaz_stemmer_create(chain->locale, rule, status);
117         break;
118     case ICU_chain_step_type_join:
119         assert(rule);
120         step->u.join = icu_buf_utf16_create(0);
121         icu_utf16_from_utf8_cstr(step->u.join, rule, status);
122         break;
123     default:
124         break;
125     }
126     step->previous = chain->csteps;
127     chain->csteps = step;
128
129     return step;
130 }
131
132
133 static void icu_chain_step_destroy(struct icu_chain_step *step)
134 {
135     if (!step)
136         return;
137
138     icu_chain_step_destroy(step->previous);
139
140     switch (step->type)
141     {
142     case ICU_chain_step_type_display:
143         break;
144     case ICU_chain_step_type_casemap:
145         icu_casemap_destroy(step->u.casemap);
146         break;
147     case ICU_chain_step_type_transform:
148     case ICU_chain_step_type_transliterate:
149         icu_transform_destroy(step->u.transform);
150         break;
151     case ICU_chain_step_type_tokenize:
152         icu_tokenizer_destroy(step->u.tokenizer);
153         break;
154     case YAZ_chain_step_type_stemming:
155         yaz_stemmer_destroy(step->u.stemmer);
156         break;
157     case ICU_chain_step_type_join:
158         icu_buf_utf16_destroy(step->u.join);
159         break;
160     default:
161         break;
162     }
163     xfree(step);
164 }
165
166 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
167 {
168     struct icu_chain_step *step = 0;
169     struct icu_chain_step **sp = &step;
170     while (old)
171     {
172         *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
173         (*sp)->type = old->type;
174
175         switch ((*sp)->type)
176         {
177         case ICU_chain_step_type_display:
178             break;
179         case ICU_chain_step_type_casemap:
180             (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
181             break;
182         case ICU_chain_step_type_transform:
183         case ICU_chain_step_type_transliterate:
184             (*sp)->u.transform = icu_transform_clone(old->u.transform);
185             break;
186         case ICU_chain_step_type_tokenize:
187             (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
188             break;
189         case YAZ_chain_step_type_stemming:
190             (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer);
191             break;
192         case ICU_chain_step_type_none:
193             break;
194         case ICU_chain_step_type_join:
195             (*sp)->u.join = icu_buf_utf16_create(0);
196             (*sp)->u.join = icu_buf_utf16_copy((*sp)->u.join, old->u.join);
197             break;
198         }
199         old = old->previous;
200         sp = &(*sp)->previous;
201     }
202     *sp = 0;
203     return step;
204 }
205
206 struct icu_chain *icu_chain_create(const char *locale, int sort,
207                                    UErrorCode *status)
208 {
209     struct icu_chain *chain;
210     UCollator *coll = ucol_open(locale, status);
211
212     if (U_FAILURE(*status))
213         return 0;
214
215     chain = (struct icu_chain *) xmalloc(sizeof(*chain));
216     chain->iter = 0;
217     chain->locale = xstrdup(locale);
218     chain->sort = sort;
219     chain->coll = coll;
220     chain->csteps = 0;
221
222     return chain;
223 }
224
225 void icu_chain_destroy(struct icu_chain *chain)
226 {
227     if (chain)
228     {
229         if (chain->coll)
230             ucol_close(chain->coll);
231
232         if (chain->iter)
233             icu_iter_destroy(chain->iter);
234         icu_chain_step_destroy(chain->csteps);
235         xfree(chain->locale);
236         xfree(chain);
237     }
238 }
239
240 struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
241                                        int sort,
242                                        UErrorCode *status)
243 {
244     xmlNode *node = 0;
245     int no_errors = 0;
246     struct icu_chain *chain = 0;
247     NMEM nmem = 0;
248
249     *status = U_ZERO_ERROR;
250
251     if (xml_node && xml_node->type == XML_ELEMENT_NODE)
252     {
253         xmlChar *xml_locale = xmlGetProp((xmlNode *) xml_node,
254                                          (xmlChar *) "locale");
255         if (xml_locale)
256         {
257             chain = icu_chain_create((const char *) xml_locale, sort, status);
258             xmlFree(xml_locale);
259         }
260     }
261
262     if (!chain)
263         return 0;
264
265     nmem = nmem_create();
266     for (node = xml_node->children; node; node = node->next)
267     {
268         char *rule = 0;
269         struct icu_chain_step *step = 0;
270         struct _xmlAttr *attr;
271
272         nmem_reset(nmem);
273         if (node->type != XML_ELEMENT_NODE)
274             continue;
275
276         for (attr = node->properties; attr; attr = attr->next)
277         {
278             if (!strcmp((const char *) attr->name, "rule"))
279             {
280                 rule = nmem_text_node_cdata(attr->children, nmem);
281             }
282             else
283             {
284                 yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
285                         "element '%s'", attr->name, node->name);
286                 no_errors++;
287             }
288         }
289         if (!rule && node->children)
290             rule = nmem_text_node_cdata(node->children, nmem);
291
292         if (!rule && strcmp((const char *) node->name, "display"))
293         {
294             yaz_log(YLOG_WARN, "Missing attribute 'rule' for element %s",
295                     (const char *) node->name);
296             no_errors++;
297             continue;
298         }
299         if (!strcmp((const char *) node->name, "casemap"))
300             step = icu_chain_insert_step(chain,
301                                          ICU_chain_step_type_casemap,
302                                          rule, status);
303         else if (!strcmp((const char *) node->name, "transform"))
304             step = icu_chain_insert_step(chain,
305                                          ICU_chain_step_type_transform,
306                                          rule, status);
307         else if (!strcmp((const char *) node->name, "transliterate"))
308             step = icu_chain_insert_step(chain,
309                                          ICU_chain_step_type_transliterate,
310                                          rule, status);
311         else if (!strcmp((const char *) node->name, "tokenize"))
312             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
313                                          rule, status);
314         else if (!strcmp((const char *) node->name, "display"))
315             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
316                                          rule, status);
317         else if (!strcmp((const char *) node->name, "stemming"))
318             step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
319                                          rule, status);
320         else if (!strcmp((const char *) node->name, "join"))
321             step = icu_chain_insert_step(chain, ICU_chain_step_type_join,
322                                          rule, status);
323         else if (!strcmp((const char *) node->name, "normalize"))
324         {
325             yaz_log(YLOG_WARN, "Element %s is deprecated. "
326                     "Use transform instead", node->name);
327             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
328                                          rule, status);
329         }
330         else if (!strcmp((const char *) node->name, "index")
331                  || !strcmp((const char *) node->name, "sortkey"))
332         {
333             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
334                     "Remove it from the configuration", node->name);
335         }
336         else
337         {
338             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
339             no_errors++;
340             continue;
341         }
342         if (!step)
343         {
344             yaz_log(YLOG_WARN, "Step not created for %s", node->name);
345             no_errors++;
346         }
347         if (step && U_FAILURE(*status))
348         {
349             yaz_log(YLOG_WARN, "ICU Error %d %s for element %s, rule %s",
350                     *status, u_errorName(*status), node->name, rule ?
351                     rule : "");
352             no_errors++;
353             break;
354         }
355     }
356     nmem_destroy(nmem);
357     if (no_errors)
358     {
359         icu_chain_destroy(chain);
360         return 0;
361     }
362     return chain;
363 }
364
365 struct icu_iter {
366     struct icu_chain *chain;
367     struct icu_buf_utf16 *last;
368     struct icu_buf_utf16 *org;
369     struct icu_buf_utf8 *org8;
370     UErrorCode status;
371     struct icu_buf_utf8 *display;
372     struct icu_buf_utf8 *sort8;
373     struct icu_buf_utf8 *result;
374     int token_count;
375     size_t org_start;
376     size_t org_len;
377     size_t utf8_base;
378     size_t utf16_base;
379     struct icu_chain_step *steps;
380 };
381
382 void icu_utf16_print(struct icu_buf_utf16 *src16)
383 {
384     UErrorCode status = U_ZERO_ERROR;
385     const char *p;
386     struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
387     icu_utf16_to_utf8(dst8, src16, &status);
388
389     if (U_FAILURE(status))
390     {
391         printf("failure");
392     }
393     else
394     {
395         p = icu_buf_utf8_to_cstr(dst8);
396         printf("%s", p);
397     }
398     icu_buf_utf8_destroy(dst8);
399 }
400
401 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
402                                       struct icu_chain_step *step,
403                                       struct icu_buf_utf16 *src)
404 {
405     if (!step)
406         return src;
407     else
408     {
409         struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
410
411         switch (step->type)
412         {
413         case ICU_chain_step_type_casemap:
414             if (dst)
415             {
416                 struct icu_buf_utf16 *src = dst;
417
418                 dst = icu_buf_utf16_create(0);
419                 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
420                                     iter->chain->locale);
421                 icu_buf_utf16_destroy(src);
422             }
423             break;
424         case ICU_chain_step_type_tokenize:
425             if (dst)
426             {
427                 struct icu_buf_utf16 *src = dst;
428
429                 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
430                 if (step->previous)
431                 {   /* no need to copy if it's already the same */
432                     iter->utf8_base = iter->utf16_base = 0;
433                     icu_buf_utf16_copy(iter->org, src);
434                 }
435                 icu_buf_utf16_destroy(src);
436             }
437             dst = icu_buf_utf16_create(0);
438             iter->status = U_ZERO_ERROR;
439             if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
440                                           &iter->org_start, &iter->org_len))
441             {
442                 icu_buf_utf16_destroy(dst);
443                 dst = 0;
444             }
445             break;
446         case ICU_chain_step_type_transform:
447         case ICU_chain_step_type_transliterate:
448             if (dst)
449             {
450                 struct icu_buf_utf16 *src = dst;
451                 dst = icu_buf_utf16_create(0);
452                 icu_transform_trans(step->u.transform, dst, src, &iter->status);
453                 icu_buf_utf16_destroy(src);
454             }
455             break;
456         case ICU_chain_step_type_display:
457             if (dst)
458                 icu_utf16_to_utf8(iter->display, dst, &iter->status);
459             break;
460         case YAZ_chain_step_type_stemming:
461             if (dst)
462             {
463                 struct icu_buf_utf16 *src = dst;
464                 dst = icu_buf_utf16_create(0);
465                 yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
466                 icu_buf_utf16_destroy(src);
467             }
468             break;
469         case ICU_chain_step_type_join:
470             if (dst)
471             {
472                 while (1)
473                 {
474                     struct icu_buf_utf16 *dst1 =
475                         icu_iter_invoke(iter, step->previous, 0);
476
477                     if (!dst1)
478                         break; 
479                     dst = icu_buf_utf16_append(dst, step->u.join);
480                     dst = icu_buf_utf16_append(dst, dst1);
481                     icu_buf_utf16_destroy(dst1);
482                 }
483             }
484             break;
485         default:
486             assert(0);
487         }
488         return dst;
489     }
490 }
491
492 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
493 {
494     yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
495     iter->chain = chain;
496     iter->status = U_ZERO_ERROR;
497     iter->display = icu_buf_utf8_create(0);
498     iter->sort8 = icu_buf_utf8_create(0);
499     iter->result = icu_buf_utf8_create(0);
500     iter->org = icu_buf_utf16_create(0);
501     iter->org8 = 0;
502     iter->last = 0; /* no last returned string (yet) */
503     iter->steps = icu_chain_step_clone(chain->csteps);
504     iter->token_count = 0;
505
506     return iter;
507 }
508
509 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
510 {
511     struct icu_buf_utf16 *src = icu_buf_utf16_create(0);
512     icu_utf16_from_utf8_cstr(src, src8cstr, &iter->status);
513     icu_buf_utf16_copy(iter->org, src);
514     iter->token_count = 0;
515     iter->org_start = 0;
516     iter->utf8_base = iter->utf16_base = 0;
517     iter->org_len = src->utf16_len;
518     iter->last = icu_iter_invoke(iter, iter->steps, src);
519 }
520
521 void icu_iter_destroy(yaz_icu_iter_t iter)
522 {
523     if (iter)
524     {
525         icu_buf_utf8_destroy(iter->display);
526         icu_buf_utf8_destroy(iter->sort8);
527         icu_buf_utf8_destroy(iter->result);
528         icu_buf_utf16_destroy(iter->org);
529         icu_buf_utf8_destroy(iter->org8);
530         icu_chain_step_destroy(iter->steps);
531         xfree(iter);
532     }
533 }
534
535 int icu_iter_next(yaz_icu_iter_t iter)
536 {
537     if (iter->token_count && iter->last)
538         iter->last = icu_iter_invoke(iter, iter->steps, 0);
539     if (!iter->last)
540         return 0;
541     else
542     {
543         iter->token_count++;
544         if (iter->chain->sort)
545         {
546             icu_sortkey8_from_utf16(iter->chain->coll,
547                                     iter->sort8, iter->last,
548                                     &iter->status);
549         }
550         icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
551         icu_buf_utf16_destroy(iter->last);
552
553         return 1;
554     }
555 }
556
557 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
558 {
559     return icu_buf_utf8_to_cstr(iter->result);
560 }
561
562 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
563 {
564     return icu_buf_utf8_to_cstr(iter->sort8);
565 }
566
567 const char *icu_iter_get_display(yaz_icu_iter_t iter)
568 {
569     return icu_buf_utf8_to_cstr(iter->display);
570 }
571
572 int icu_iter_get_token_number(yaz_icu_iter_t iter)
573 {
574     return iter->token_count;
575 }
576
577
578 void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len,
579                             const char **cstr)
580 {
581     int32_t len1 = 0, len2 = 0;
582     UErrorCode status = U_ZERO_ERROR;
583
584     if (iter->org_start < iter->utf16_base)
585     {
586         iter->utf8_base = 0;
587         iter->utf16_base = 0;
588     }
589     u_strToUTF8(0, 0, &len1,
590                 iter->org->utf16 + iter->utf16_base,
591                 iter->org_start - iter->utf16_base,
592                 &status);
593
594     status = U_ZERO_ERROR;
595
596     *start = len1 + iter->utf8_base;
597
598     u_strToUTF8(0, 0, &len2,
599                 iter->org->utf16 + iter->utf16_base,
600                 iter->org_start - iter->utf16_base + iter->org_len,
601                 &status);
602
603     *len = len2 - len1;
604
605     if (cstr)
606     {
607         if (!iter->org8)
608             iter->org8 = icu_buf_utf8_create(0);
609         status = U_ZERO_ERROR;
610         icu_utf16_to_utf8(iter->org8, iter->org, &status);
611         *cstr = icu_buf_utf8_to_cstr(iter->org8);
612     }
613     iter->utf8_base = *start;
614     iter->utf16_base = iter->org_start;
615 }
616
617 void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
618 {
619     icu_iter_get_org_info2(iter, start, len, 0);
620 }
621
622 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
623                           UErrorCode *status)
624 {
625     if (chain->iter)
626         icu_iter_destroy(chain->iter);
627     chain->iter = icu_iter_create(chain);
628     icu_iter_first(chain->iter, src8cstr);
629     return 1;
630 }
631
632 int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
633 {
634     *status = U_ZERO_ERROR;
635     return icu_iter_next(chain->iter);
636 }
637
638 int icu_chain_token_number(struct icu_chain *chain)
639 {
640     if (chain && chain->iter)
641         return chain->iter->token_count;
642     return 0;
643 }
644
645 const char *icu_chain_token_display(struct icu_chain *chain)
646 {
647     if (chain->iter)
648         return icu_iter_get_display(chain->iter);
649     return 0;
650 }
651
652 const char *icu_chain_token_norm(struct icu_chain *chain)
653 {
654     if (chain->iter)
655         return icu_iter_get_norm(chain->iter);
656     return 0;
657 }
658
659 const char *icu_chain_token_sortkey(struct icu_chain *chain)
660 {
661     if (chain->iter)
662         return icu_iter_get_sortkey(chain->iter);
663     return 0;
664 }
665
666 void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
667 {
668     if (chain->iter)
669         icu_iter_get_org_info(chain->iter, start, len);
670 }
671
672 void icu_chain_get_org_info2(struct icu_chain *chain, size_t *start,
673                              size_t *len, const char **cstr)
674 {
675     if (chain->iter)
676         icu_iter_get_org_info2(chain->iter, start, len, cstr);
677 }
678
679
680 #endif /* YAZ_HAVE_ICU */
681
682 /*
683  * Local variables:
684  * c-basic-offset: 4
685  * c-file-style: "Stroustrup"
686  * indent-tabs-mode: nil
687  * End:
688  * vim: shiftwidth=4 tabstop=8 expandtab
689  */
690