CCL: fix other inherited attributes
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/stemmer.h>
21
22 #include <yaz/log.h>
23 #include <yaz/nmem.h>
24 #include <yaz/nmem_xml.h>
25 #include <yaz/xml_get.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <assert.h>
30
31 #include <unicode/ustring.h>  /* some more string fcns*/
32 #include <unicode/uchar.h>    /* char names           */
33
/** \brief kinds of processing step that a chain can contain */
enum icu_chain_step_type {
    ICU_chain_step_type_none,           /* no step; icu_chain_insert_step asserts against it */
    ICU_chain_step_type_display,        /* convert to utf8 display format */
    ICU_chain_step_type_casemap,        /* apply utf16 charmap */
    ICU_chain_step_type_transform,      /* apply utf16 transform */
    ICU_chain_step_type_tokenize,       /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate,  /* apply utf16 transliteration (custom rules) */
    YAZ_chain_step_type_stemming,       /* apply utf16 stemming (YAZ) */
    ICU_chain_step_type_join            /* join tokens, separated by the rule string */
};
44
/** \brief one step of a chain; steps are linked newest-first via previous */
struct icu_chain_step
{
    /* type and action object */
    enum icu_chain_step_type type;
    /* the member in use is selected by type; display steps use none of them */
    union {
        struct icu_casemap   *casemap;    /* casemap steps */
        struct icu_transform *transform;  /* transform and transliterate steps */
        struct icu_tokenizer *tokenizer;  /* tokenize steps */
        yaz_stemmer_p         stemmer;    /* stemming steps */
        struct icu_buf_utf16 *join;       /* join steps: the separator string */
    } u;
    /* step that is applied before this one; 0 for the first step */
    struct icu_chain_step *previous;
};
58
/** \brief a configured chain: locale, collator and the list of steps */
struct icu_chain
{
    /* iterator behind the icu_chain_* token functions; 0 until
       icu_chain_assign_cstr has been called */
    yaz_icu_iter_t iter;
    /* private copy of the locale name given to icu_chain_create */
    char *locale;
    /* non-zero: icu_iter_next also produces a sort key for each token */
    int sort;

    /* collator opened for locale; closed by icu_chain_destroy */
    UCollator *coll;

    /* linked list of chain steps */
    /* (points to the most recently inserted step) */
    struct icu_chain_step *csteps;
};
70
71 int icu_check_status(UErrorCode status)
72 {
73     if (U_FAILURE(status))
74     {
75         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
76         return 0;
77     }
78     return 1;
79 }
80
81 static struct icu_chain_step *icu_chain_insert_step(
82     struct icu_chain *chain, enum icu_chain_step_type type,
83     const char *rule, UErrorCode *status)
84 {
85     struct icu_chain_step *step = 0;
86
87     assert(chain);
88     assert(type);
89
90     step = (struct icu_chain_step *) xmalloc(sizeof(*step));
91     step->type = type;
92
93     switch (step->type)
94     {
95     case ICU_chain_step_type_display:
96         break;
97     case ICU_chain_step_type_casemap:
98         assert(rule);
99         step->u.casemap = icu_casemap_create(rule[0], status);
100         break;
101     case ICU_chain_step_type_transform:
102         assert(rule);
103         /* rule omitted. Only ID used */
104         step->u.transform = icu_transform_create(rule, 'f', 0, status);
105         break;
106     case ICU_chain_step_type_tokenize:
107         assert(rule);
108         step->u.tokenizer = icu_tokenizer_create(chain->locale, rule[0], status);
109         break;
110     case ICU_chain_step_type_transliterate:
111         assert(rule);
112         /* we pass a dummy ID to utrans_openU.. */
113         step->u.transform = icu_transform_create("custom", 'f', rule, status);
114         break;
115     case YAZ_chain_step_type_stemming:
116         assert(rule);
117         step->u.stemmer = yaz_stemmer_create(chain->locale, rule, status);
118         break;
119     case ICU_chain_step_type_join:
120         assert(rule);
121         step->u.join = icu_buf_utf16_create(0);
122         icu_utf16_from_utf8_cstr(step->u.join, rule, status);
123         break;
124     default:
125         break;
126     }
127     step->previous = chain->csteps;
128     chain->csteps = step;
129
130     return step;
131 }
132
133
134 static void icu_chain_step_destroy(struct icu_chain_step *step)
135 {
136     if (!step)
137         return;
138
139     icu_chain_step_destroy(step->previous);
140
141     switch (step->type)
142     {
143     case ICU_chain_step_type_display:
144         break;
145     case ICU_chain_step_type_casemap:
146         icu_casemap_destroy(step->u.casemap);
147         break;
148     case ICU_chain_step_type_transform:
149     case ICU_chain_step_type_transliterate:
150         icu_transform_destroy(step->u.transform);
151         break;
152     case ICU_chain_step_type_tokenize:
153         icu_tokenizer_destroy(step->u.tokenizer);
154         break;
155     case YAZ_chain_step_type_stemming:
156         yaz_stemmer_destroy(step->u.stemmer);
157         break;
158     case ICU_chain_step_type_join:
159         icu_buf_utf16_destroy(step->u.join);
160         break;
161     default:
162         break;
163     }
164     xfree(step);
165 }
166
167 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
168 {
169     struct icu_chain_step *step = 0;
170     struct icu_chain_step **sp = &step;
171     while (old)
172     {
173         *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
174         (*sp)->type = old->type;
175
176         switch ((*sp)->type)
177         {
178         case ICU_chain_step_type_display:
179             break;
180         case ICU_chain_step_type_casemap:
181             (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
182             break;
183         case ICU_chain_step_type_transform:
184         case ICU_chain_step_type_transliterate:
185             (*sp)->u.transform = icu_transform_clone(old->u.transform);
186             break;
187         case ICU_chain_step_type_tokenize:
188             (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
189             break;
190         case YAZ_chain_step_type_stemming:
191             (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer);
192             break;
193         case ICU_chain_step_type_none:
194             break;
195         case ICU_chain_step_type_join:
196             (*sp)->u.join = icu_buf_utf16_create(0);
197             (*sp)->u.join = icu_buf_utf16_copy((*sp)->u.join, old->u.join);
198             break;
199         }
200         old = old->previous;
201         sp = &(*sp)->previous;
202     }
203     *sp = 0;
204     return step;
205 }
206
207 struct icu_chain *icu_chain_create(const char *locale, int sort,
208                                    UErrorCode *status)
209 {
210     struct icu_chain *chain;
211     UCollator *coll = ucol_open(locale, status);
212
213     if (U_FAILURE(*status))
214         return 0;
215
216     chain = (struct icu_chain *) xmalloc(sizeof(*chain));
217     chain->iter = 0;
218     chain->locale = xstrdup(locale);
219     chain->sort = sort;
220     chain->coll = coll;
221     chain->csteps = 0;
222
223     return chain;
224 }
225
226 void icu_chain_destroy(struct icu_chain *chain)
227 {
228     if (chain)
229     {
230         if (chain->coll)
231             ucol_close(chain->coll);
232
233         if (chain->iter)
234             icu_iter_destroy(chain->iter);
235         icu_chain_step_destroy(chain->csteps);
236         xfree(chain->locale);
237         xfree(chain);
238     }
239 }
240
241 struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
242                                        int sort,
243                                        UErrorCode *status)
244 {
245     xmlNode *node = 0;
246     int no_errors = 0;
247     struct icu_chain *chain = 0;
248     NMEM nmem = 0;
249
250     *status = U_ZERO_ERROR;
251
252     if (xml_node && xml_node->type == XML_ELEMENT_NODE)
253     {
254         const char *xml_locale = yaz_xml_get_prop((xmlNode *) xml_node,
255                                                   "locale");
256         if (xml_locale)
257             chain = icu_chain_create((const char *) xml_locale, sort, status);
258     }
259
260     if (!chain)
261         return 0;
262
263     nmem = nmem_create();
264     for (node = xml_node->children; node; node = node->next)
265     {
266         char *rule = 0;
267         struct icu_chain_step *step = 0;
268         const char *attr_str;
269
270         nmem_reset(nmem);
271         if (node->type != XML_ELEMENT_NODE)
272             continue;
273         attr_str = yaz_xml_get_prop(node, "rule%s", &rule);
274         if (attr_str)
275         {
276             yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
277                     "element '%s'", attr_str, node->name);
278             no_errors++;
279         }
280         if (!rule && node->children)
281             rule = nmem_text_node_cdata(node->children, nmem);
282
283         if (!rule && strcmp((const char *) node->name, "display"))
284         {
285             yaz_log(YLOG_WARN, "Missing attribute 'rule' for element %s",
286                     (const char *) node->name);
287             no_errors++;
288             continue;
289         }
290         if (!strcmp((const char *) node->name, "casemap"))
291             step = icu_chain_insert_step(chain,
292                                          ICU_chain_step_type_casemap,
293                                          rule, status);
294         else if (!strcmp((const char *) node->name, "transform"))
295             step = icu_chain_insert_step(chain,
296                                          ICU_chain_step_type_transform,
297                                          rule, status);
298         else if (!strcmp((const char *) node->name, "transliterate"))
299             step = icu_chain_insert_step(chain,
300                                          ICU_chain_step_type_transliterate,
301                                          rule, status);
302         else if (!strcmp((const char *) node->name, "tokenize"))
303             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
304                                          rule, status);
305         else if (!strcmp((const char *) node->name, "display"))
306             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
307                                          rule, status);
308         else if (!strcmp((const char *) node->name, "stemming"))
309             step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
310                                          rule, status);
311         else if (!strcmp((const char *) node->name, "join"))
312             step = icu_chain_insert_step(chain, ICU_chain_step_type_join,
313                                          rule, status);
314         else if (!strcmp((const char *) node->name, "normalize"))
315         {
316             yaz_log(YLOG_WARN, "Element %s is deprecated. "
317                     "Use transform instead", node->name);
318             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
319                                          rule, status);
320         }
321         else if (!strcmp((const char *) node->name, "index")
322                  || !strcmp((const char *) node->name, "sortkey"))
323         {
324             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
325                     "Remove it from the configuration", node->name);
326         }
327         else
328         {
329             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
330             no_errors++;
331             continue;
332         }
333         if (!step)
334         {
335             yaz_log(YLOG_WARN, "Step not created for %s", node->name);
336             no_errors++;
337         }
338         if (step && U_FAILURE(*status))
339         {
340             yaz_log(YLOG_WARN, "ICU Error %d %s for element %s, rule %s",
341                     *status, u_errorName(*status), node->name, rule ?
342                     rule : "");
343             no_errors++;
344             break;
345         }
346     }
347     nmem_destroy(nmem);
348     if (no_errors)
349     {
350         icu_chain_destroy(chain);
351         return 0;
352     }
353     return chain;
354 }
355
/** \brief state of one pass over a string with the steps of a chain */
struct icu_iter {
    /* chain that supplies locale, collator and sort flag */
    struct icu_chain *chain;
    /* result of the latest icu_iter_invoke. After icu_iter_next has
       returned a token the buffer is already destroyed and the pointer
       only serves as a non-zero "continue" marker */
    struct icu_buf_utf16 *last;
    /* utf16 text that org_start / org_len refer to */
    struct icu_buf_utf16 *org;
    /* utf8 version of org; created on demand by icu_iter_get_org_info2 */
    struct icu_buf_utf8 *org8;
    /* ICU status shared by the steps */
    UErrorCode status;
    /* display form of the token, written by a display step */
    struct icu_buf_utf8 *display;
    /* sort key of the current token (only filled if chain->sort) */
    struct icu_buf_utf8 *sort8;
    /* normalized form of the current token */
    struct icu_buf_utf8 *result;
    /* number of tokens returned by icu_iter_next so far */
    int token_count;
    /* start and length of the current token within org, as reported by
       the tokenizer and used as utf16 lengths in icu_iter_get_org_info2 */
    size_t org_start;
    size_t org_len;
    /* cached pair of offsets: utf8_base is the utf8 offset that
       corresponds to offset utf16_base in org */
    size_t utf8_base;
    size_t utf16_base;
    /* private clone of the steps of the chain */
    struct icu_chain_step *steps;
};
372
373 void icu_utf16_print(struct icu_buf_utf16 *src16)
374 {
375     UErrorCode status = U_ZERO_ERROR;
376     const char *p;
377     struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
378     icu_utf16_to_utf8(dst8, src16, &status);
379
380     if (U_FAILURE(status))
381     {
382         printf("failure");
383     }
384     else
385     {
386         p = icu_buf_utf8_to_cstr(dst8);
387         printf("%s", p);
388     }
389     icu_buf_utf8_destroy(dst8);
390 }
391
392 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
393                                       struct icu_chain_step *step,
394                                       struct icu_buf_utf16 *src)
395 {
396     if (!step)
397         return src;
398     else
399     {
400         struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
401
402         switch (step->type)
403         {
404         case ICU_chain_step_type_casemap:
405             if (dst)
406             {
407                 struct icu_buf_utf16 *src = dst;
408
409                 dst = icu_buf_utf16_create(0);
410                 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
411                                     iter->chain->locale);
412                 icu_buf_utf16_destroy(src);
413             }
414             break;
415         case ICU_chain_step_type_tokenize:
416             if (dst)
417             {
418                 struct icu_buf_utf16 *src = dst;
419
420                 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
421                 if (step->previous)
422                 {   /* no need to copy if it's already the same */
423                     iter->utf8_base = iter->utf16_base = 0;
424                     icu_buf_utf16_copy(iter->org, src);
425                 }
426                 icu_buf_utf16_destroy(src);
427             }
428             dst = icu_buf_utf16_create(0);
429             iter->status = U_ZERO_ERROR;
430             if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
431                                           &iter->org_start, &iter->org_len))
432             {
433                 icu_buf_utf16_destroy(dst);
434                 dst = 0;
435             }
436             break;
437         case ICU_chain_step_type_transform:
438         case ICU_chain_step_type_transliterate:
439             if (dst)
440             {
441                 struct icu_buf_utf16 *src = dst;
442                 dst = icu_buf_utf16_create(0);
443                 icu_transform_trans(step->u.transform, dst, src, &iter->status);
444                 icu_buf_utf16_destroy(src);
445             }
446             break;
447         case ICU_chain_step_type_display:
448             if (dst)
449                 icu_utf16_to_utf8(iter->display, dst, &iter->status);
450             break;
451         case YAZ_chain_step_type_stemming:
452             if (dst)
453             {
454                 struct icu_buf_utf16 *src = dst;
455                 dst = icu_buf_utf16_create(0);
456                 yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
457                 icu_buf_utf16_destroy(src);
458             }
459             break;
460         case ICU_chain_step_type_join:
461             if (dst)
462             {
463                 while (1)
464                 {
465                     struct icu_buf_utf16 *dst1 =
466                         icu_iter_invoke(iter, step->previous, 0);
467
468                     if (!dst1)
469                         break; 
470                     dst = icu_buf_utf16_append(dst, step->u.join);
471                     dst = icu_buf_utf16_append(dst, dst1);
472                     icu_buf_utf16_destroy(dst1);
473                 }
474             }
475             break;
476         default:
477             assert(0);
478         }
479         return dst;
480     }
481 }
482
483 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
484 {
485     yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
486     iter->chain = chain;
487     iter->status = U_ZERO_ERROR;
488     iter->display = icu_buf_utf8_create(0);
489     iter->sort8 = icu_buf_utf8_create(0);
490     iter->result = icu_buf_utf8_create(0);
491     iter->org = icu_buf_utf16_create(0);
492     iter->org8 = 0;
493     iter->last = 0; /* no last returned string (yet) */
494     iter->steps = icu_chain_step_clone(chain->csteps);
495     iter->token_count = 0;
496
497     return iter;
498 }
499
500 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
501 {
502     struct icu_buf_utf16 *src = icu_buf_utf16_create(0);
503     icu_utf16_from_utf8_cstr(src, src8cstr, &iter->status);
504     icu_buf_utf16_copy(iter->org, src);
505     iter->token_count = 0;
506     iter->org_start = 0;
507     iter->utf8_base = iter->utf16_base = 0;
508     iter->org_len = src->utf16_len;
509     iter->last = icu_iter_invoke(iter, iter->steps, src);
510 }
511
512 void icu_iter_destroy(yaz_icu_iter_t iter)
513 {
514     if (iter)
515     {
516         icu_buf_utf8_destroy(iter->display);
517         icu_buf_utf8_destroy(iter->sort8);
518         icu_buf_utf8_destroy(iter->result);
519         icu_buf_utf16_destroy(iter->org);
520         icu_buf_utf8_destroy(iter->org8);
521         icu_chain_step_destroy(iter->steps);
522         xfree(iter);
523     }
524 }
525
526 int icu_iter_next(yaz_icu_iter_t iter)
527 {
528     if (iter->token_count && iter->last)
529         iter->last = icu_iter_invoke(iter, iter->steps, 0);
530     if (!iter->last)
531         return 0;
532     else
533     {
534         iter->token_count++;
535         if (iter->chain->sort)
536         {
537             icu_sortkey8_from_utf16(iter->chain->coll,
538                                     iter->sort8, iter->last,
539                                     &iter->status);
540         }
541         icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
542         icu_buf_utf16_destroy(iter->last);
543
544         return 1;
545     }
546 }
547
548 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
549 {
550     return icu_buf_utf8_to_cstr(iter->result);
551 }
552
553 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
554 {
555     return icu_buf_utf8_to_cstr(iter->sort8);
556 }
557
558 const char *icu_iter_get_display(yaz_icu_iter_t iter)
559 {
560     return icu_buf_utf8_to_cstr(iter->display);
561 }
562
563 int icu_iter_get_token_number(yaz_icu_iter_t iter)
564 {
565     return iter->token_count;
566 }
567
568
569 void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len,
570                             const char **cstr)
571 {
572     int32_t len1 = 0, len2 = 0;
573     UErrorCode status = U_ZERO_ERROR;
574
575     if (iter->org_start < iter->utf16_base)
576     {
577         iter->utf8_base = 0;
578         iter->utf16_base = 0;
579     }
580     u_strToUTF8(0, 0, &len1,
581                 iter->org->utf16 + iter->utf16_base,
582                 iter->org_start - iter->utf16_base,
583                 &status);
584
585     status = U_ZERO_ERROR;
586
587     *start = len1 + iter->utf8_base;
588
589     u_strToUTF8(0, 0, &len2,
590                 iter->org->utf16 + iter->utf16_base,
591                 iter->org_start - iter->utf16_base + iter->org_len,
592                 &status);
593
594     *len = len2 - len1;
595
596     if (cstr)
597     {
598         if (!iter->org8)
599             iter->org8 = icu_buf_utf8_create(0);
600         status = U_ZERO_ERROR;
601         icu_utf16_to_utf8(iter->org8, iter->org, &status);
602         *cstr = icu_buf_utf8_to_cstr(iter->org8);
603     }
604     iter->utf8_base = *start;
605     iter->utf16_base = iter->org_start;
606 }
607
608 void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
609 {
610     icu_iter_get_org_info2(iter, start, len, 0);
611 }
612
613 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
614                           UErrorCode *status)
615 {
616     if (chain->iter)
617         icu_iter_destroy(chain->iter);
618     chain->iter = icu_iter_create(chain);
619     icu_iter_first(chain->iter, src8cstr);
620     return 1;
621 }
622
623 int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
624 {
625     *status = U_ZERO_ERROR;
626     return icu_iter_next(chain->iter);
627 }
628
629 int icu_chain_token_number(struct icu_chain *chain)
630 {
631     if (chain && chain->iter)
632         return chain->iter->token_count;
633     return 0;
634 }
635
636 const char *icu_chain_token_display(struct icu_chain *chain)
637 {
638     if (chain->iter)
639         return icu_iter_get_display(chain->iter);
640     return 0;
641 }
642
643 const char *icu_chain_token_norm(struct icu_chain *chain)
644 {
645     if (chain->iter)
646         return icu_iter_get_norm(chain->iter);
647     return 0;
648 }
649
650 const char *icu_chain_token_sortkey(struct icu_chain *chain)
651 {
652     if (chain->iter)
653         return icu_iter_get_sortkey(chain->iter);
654     return 0;
655 }
656
657 void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
658 {
659     if (chain->iter)
660         icu_iter_get_org_info(chain->iter, start, len);
661 }
662
663 void icu_chain_get_org_info2(struct icu_chain *chain, size_t *start,
664                              size_t *len, const char **cstr)
665 {
666     if (chain->iter)
667         icu_iter_get_org_info2(chain->iter, start, len, cstr);
668 }
669
670
671 #endif /* YAZ_HAVE_ICU */
672
673 /*
674  * Local variables:
675  * c-basic-offset: 4
676  * c-file-style: "Stroustrup"
677  * indent-tabs-mode: nil
678  * End:
679  * vim: shiftwidth=4 tabstop=8 expandtab
680  */
681