f62e9bb39db7031c3544621b33251e20dafdc9f0
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/stemmer.h>
21
22 #include <yaz/log.h>
23 #include <yaz/nmem.h>
24 #include <yaz/nmem_xml.h>
25 #include <string.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <assert.h>
29
30 #include <unicode/ustring.h>  /* some more string fcns*/
31 #include <unicode/uchar.h>    /* char names           */
32
/* Kinds of processing steps that can be placed in an ICU chain. */
enum icu_chain_step_type {
    ICU_chain_step_type_none,           /* no step (value 0); never inserted */
    ICU_chain_step_type_display,        /* convert to utf8 display format */
    ICU_chain_step_type_casemap,        /* apply utf16 case mapping */
    ICU_chain_step_type_transform,      /* apply utf16 transform */
    ICU_chain_step_type_tokenize,       /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate,  /* apply utf16 transliteration rules */
    YAZ_chain_step_type_stemming        /* apply utf16 stemming (YAZ) */
};
42
/* One processing step; steps are linked into a list through 'previous'. */
struct icu_chain_step
{
    /* type and action object */
    enum icu_chain_step_type type;
    /* helper object for the step; 'type' tells which member is in use
       (display steps use none of them) */
    union {
        struct icu_casemap   *casemap;    /* casemap steps */
        struct icu_transform *transform;  /* transform and transliterate steps */
        struct icu_tokenizer *tokenizer;  /* tokenize steps */
        yaz_stemmer_p         stemmer;    /* stemming steps */
    } u;
    /* step that was inserted before this one; 0 for the first step */
    struct icu_chain_step *previous;
};
55
/* A configured chain: locale, collator and the list of steps to apply. */
struct icu_chain
{
    /* iterator used by the icu_chain_* token functions;
       0 until icu_chain_assign_cstr is called */
    yaz_icu_iter_t iter;
    /* private copy of the locale name given to icu_chain_create */
    char *locale;
    /* non-zero: icu_iter_next also computes a sort key for each token */
    int sort;

    /* collator opened for the locale in icu_chain_create */
    UCollator *coll;

    /* linked list of chain steps; points at the most recently inserted step */
    struct icu_chain_step *csteps;
};
67
68 int icu_check_status(UErrorCode status)
69 {
70     if (U_FAILURE(status))
71     {
72         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
73         return 0;
74     }
75     return 1;
76 }
77
78 static struct icu_chain_step *icu_chain_insert_step(
79     struct icu_chain *chain, enum icu_chain_step_type type,
80     const uint8_t *rule, UErrorCode *status)
81 {
82     struct icu_chain_step *step = 0;
83
84     if (!chain || !type || !rule)
85         return 0;
86
87     step = (struct icu_chain_step *) xmalloc(sizeof(*step));
88
89     step->type = type;
90     /* create auxilary objects */
91     switch (step->type)
92     {
93     case ICU_chain_step_type_display:
94         break;
95     case ICU_chain_step_type_casemap:
96         step->u.casemap = icu_casemap_create(rule[0], status);
97         break;
98     case ICU_chain_step_type_transform:
99         /* rule omitted. Only ID used */
100         step->u.transform = icu_transform_create((const char *) rule, 'f',
101                                                  0, status);
102         break;
103     case ICU_chain_step_type_tokenize:
104         step->u.tokenizer = icu_tokenizer_create(chain->locale,
105                                                  (char) rule[0], status);
106         break;
107     case ICU_chain_step_type_transliterate:
108         /* we pass a dummy ID to utrans_openU.. */
109         step->u.transform = icu_transform_create("custom", 'f',
110                                                  (const char *) rule, status);
111         break;
112     case YAZ_chain_step_type_stemming:
113         step->u.stemmer = yaz_stemmer_create(chain->locale,
114                                              (const char *) rule, status);
115         break;
116     default:
117         break;
118     }
119     step->previous = chain->csteps;
120     chain->csteps = step;
121
122     return step;
123 }
124
125
126 static void icu_chain_step_destroy(struct icu_chain_step *step)
127 {
128     if (!step)
129         return;
130
131     icu_chain_step_destroy(step->previous);
132
133     switch (step->type)
134     {
135     case ICU_chain_step_type_display:
136         break;
137     case ICU_chain_step_type_casemap:
138         icu_casemap_destroy(step->u.casemap);
139         break;
140     case ICU_chain_step_type_transform:
141     case ICU_chain_step_type_transliterate:
142         icu_transform_destroy(step->u.transform);
143         break;
144     case ICU_chain_step_type_tokenize:
145         icu_tokenizer_destroy(step->u.tokenizer);
146         break;
147     case YAZ_chain_step_type_stemming:
148         yaz_stemmer_destroy(step->u.stemmer);
149         break;
150     default:
151         break;
152     }
153     xfree(step);
154 }
155
156 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
157 {
158     struct icu_chain_step *step = 0;
159     struct icu_chain_step **sp = &step;
160     while (old)
161     {
162         *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
163         (*sp)->type = old->type;
164
165         switch ((*sp)->type)
166         {
167         case ICU_chain_step_type_display:
168             break;
169         case ICU_chain_step_type_casemap:
170             (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
171             break;
172         case ICU_chain_step_type_transform:
173         case ICU_chain_step_type_transliterate:
174             (*sp)->u.transform = icu_transform_clone(old->u.transform);
175             break;
176         case ICU_chain_step_type_tokenize:
177             (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
178             break;
179         case YAZ_chain_step_type_stemming:
180             (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer);
181             break;
182         case ICU_chain_step_type_none:
183             break;
184         }
185         old = old->previous;
186         sp = &(*sp)->previous;
187     }
188     *sp = 0;
189     return step;
190 }
191
192 struct icu_chain *icu_chain_create(const char *locale, int sort,
193                                    UErrorCode *status)
194 {
195     struct icu_chain *chain;
196     UCollator *coll = ucol_open(locale, status);
197
198     if (U_FAILURE(*status))
199         return 0;
200
201     chain = (struct icu_chain *) xmalloc(sizeof(*chain));
202     chain->iter = 0;
203     chain->locale = xstrdup(locale);
204     chain->sort = sort;
205     chain->coll = coll;
206     chain->csteps = 0;
207
208     return chain;
209 }
210
211 void icu_chain_destroy(struct icu_chain *chain)
212 {
213     if (chain)
214     {
215         if (chain->coll)
216             ucol_close(chain->coll);
217
218         if (chain->iter)
219             icu_iter_destroy(chain->iter);
220         icu_chain_step_destroy(chain->csteps);
221         xfree(chain->locale);
222         xfree(chain);
223     }
224 }
225
226 struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
227                                        int sort,
228                                        UErrorCode *status)
229 {
230     xmlNode *node = 0;
231     int no_errors = 0;
232     struct icu_chain *chain = 0;
233     NMEM nmem = 0;
234
235     *status = U_ZERO_ERROR;
236
237     if (xml_node && xml_node->type == XML_ELEMENT_NODE)
238     {
239         xmlChar *xml_locale = xmlGetProp((xmlNode *) xml_node,
240                                          (xmlChar *) "locale");
241         if (xml_locale)
242         {
243             chain = icu_chain_create((const char *) xml_locale, sort, status);
244             xmlFree(xml_locale);
245         }
246     }
247
248     if (!chain)
249         return 0;
250
251     nmem = nmem_create();
252     for (node = xml_node->children; node; node = node->next)
253     {
254         char *rule = 0;
255         struct icu_chain_step *step = 0;
256         struct _xmlAttr *attr;
257
258         nmem_reset(nmem);
259         if (node->type != XML_ELEMENT_NODE)
260             continue;
261
262         for (attr = node->properties; attr; attr = attr->next)
263         {
264             if (!strcmp((const char *) attr->name, "rule"))
265             {
266                 rule = nmem_text_node_cdata(attr->children, nmem);
267             }
268             else
269             {
270                 yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
271                         "element '%s'", attr->name, node->name);
272                 no_errors++;
273                 continue;
274             }
275         }
276         if (!rule && node->children)
277             rule = nmem_text_node_cdata(node->children, nmem);
278
279         if (!strcmp((const char *) node->name, "casemap"))
280             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
281                                          (const uint8_t *) rule, status);
282         else if (!strcmp((const char *) node->name, "transform"))
283             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
284                                          (const uint8_t *) rule, status);
285         else if (!strcmp((const char *) node->name, "transliterate"))
286             step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
287                                          (const uint8_t *) rule, status);
288         else if (!strcmp((const char *) node->name, "tokenize"))
289             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
290                                          (const uint8_t *) rule, status);
291         else if (!strcmp((const char *) node->name, "display"))
292             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
293                                          (const uint8_t *) "", status);
294         else if (!strcmp((const char *) node->name, "stemming"))
295             step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
296                                          (const uint8_t *) rule, status);
297         else if (!strcmp((const char *) node->name, "normalize"))
298         {
299             yaz_log(YLOG_WARN, "Element %s is deprecated. "
300                     "Use transform instead", node->name);
301             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
302                                          (const uint8_t *) rule, status);
303         }
304         else if (!strcmp((const char *) node->name, "index")
305                  || !strcmp((const char *) node->name, "sortkey"))
306         {
307             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
308                     "Remove it from the configuration", node->name);
309         }
310         else
311         {
312             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
313             no_errors++;
314             continue;
315         }
316         if (step && U_FAILURE(*status))
317         {
318             no_errors++;
319             break;
320         }
321     }
322     nmem_destroy(nmem);
323     if (no_errors)
324     {
325         icu_chain_destroy(chain);
326         return 0;
327     }
328     return chain;
329 }
330
/* State of one iteration over the tokens that a chain produces. */
struct icu_iter {
    /* chain the iterator was created for (not owned) */
    struct icu_chain *chain;
    /* value returned by the latest icu_iter_invoke; icu_iter_next destroys
       the buffer before returning, so this is only ever compared with 0 */
    struct icu_buf_utf16 *last;
    /* ICU status shared by all operations of the iterator */
    UErrorCode status;
    /* UTF-8 text stored by display steps */
    struct icu_buf_utf8 *display;
    /* sort key of the current token; only filled when chain->sort is set */
    struct icu_buf_utf8 *sort8;
    /* current token converted to UTF-8 */
    struct icu_buf_utf8 *result;
    /* input set by icu_iter_first; reset to 0 by the first icu_iter_next */
    struct icu_buf_utf16 *input;
    /* number of tokens returned since icu_iter_first */
    int token_count;
    /* the iterator's own clone of chain->csteps */
    struct icu_chain_step *steps;
};
342
343 void icu_utf16_print(struct icu_buf_utf16 *src16)
344 {
345     UErrorCode status = U_ZERO_ERROR;
346     const char *p;
347     struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
348     icu_utf16_to_utf8(dst8, src16, &status);
349
350     if (U_FAILURE(status))
351     {
352         printf("failure");
353     }
354     else
355     {
356         p = icu_buf_utf8_to_cstr(dst8);
357         printf("%s", p);
358     }
359     icu_buf_utf8_destroy(dst8);
360 }
361
362 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
363                                       struct icu_chain_step *step,
364                                       struct icu_buf_utf16 *src)
365 {
366     if (!step)
367         return src;
368     else
369     {
370         struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
371
372         switch (step->type)
373         {
374         case ICU_chain_step_type_casemap:
375             if (dst)
376             {
377                 struct icu_buf_utf16 *src = dst;
378
379                 dst = icu_buf_utf16_create(0);
380                 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
381                                     iter->chain->locale);
382                 icu_buf_utf16_destroy(src);
383             }
384             break;
385         case ICU_chain_step_type_tokenize:
386             if (dst)
387             {
388                 struct icu_buf_utf16 *src = dst;
389
390                 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
391                 icu_buf_utf16_destroy(src);
392             }
393             dst = icu_buf_utf16_create(0);
394             iter->status = U_ZERO_ERROR;
395             if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
396             {
397                 icu_buf_utf16_destroy(dst);
398                 dst = 0;
399             }
400             break;
401         case ICU_chain_step_type_transform:
402         case ICU_chain_step_type_transliterate:
403             if (dst)
404             {
405                 struct icu_buf_utf16 *src = dst;
406                 dst = icu_buf_utf16_create(0);
407                 icu_transform_trans(step->u.transform, dst, src, &iter->status);
408                 icu_buf_utf16_destroy(src);
409             }
410             break;
411         case ICU_chain_step_type_display:
412             if (dst)
413                 icu_utf16_to_utf8(iter->display, dst, &iter->status);
414             break;
415         case YAZ_chain_step_type_stemming:
416             if (dst)
417             {
418                 struct icu_buf_utf16 *src = dst;
419                 dst = icu_buf_utf16_create(0);
420                 yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
421                 icu_buf_utf16_destroy(src);
422             }
423             break;
424         default:
425             assert(0);
426         }
427         return dst;
428     }
429 }
430
431 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
432 {
433     yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
434     iter->chain = chain;
435     iter->status = U_ZERO_ERROR;
436     iter->display = icu_buf_utf8_create(0);
437     iter->sort8 = icu_buf_utf8_create(0);
438     iter->result = icu_buf_utf8_create(0);
439     iter->last = 0; /* no last returned string (yet) */
440     iter->steps = icu_chain_step_clone(chain->csteps);
441     iter->input = 0;
442
443     return iter;
444 }
445
446 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
447 {
448     if (iter->input)
449         icu_buf_utf16_destroy(iter->input);
450     iter->input = icu_buf_utf16_create(0);
451     iter->token_count = 0;
452     /* fill and assign input string.. It will be 0 after
453        first iteration */
454     icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
455 }
456
457 void icu_iter_destroy(yaz_icu_iter_t iter)
458 {
459     if (iter)
460     {
461         icu_buf_utf8_destroy(iter->display);
462         icu_buf_utf8_destroy(iter->sort8);
463         icu_buf_utf8_destroy(iter->result);
464         if (iter->input)
465             icu_buf_utf16_destroy(iter->input);
466         icu_chain_step_destroy(iter->steps);
467         xfree(iter);
468     }
469 }
470
471 int icu_iter_next(yaz_icu_iter_t iter)
472 {
473     if (!iter->input && iter->last == 0)
474         return 0;
475     else
476     {
477         /* on first call, iter->input is the input string. Thereafter: 0. */
478         assert(iter->steps || !iter->chain->csteps);
479         iter->last = icu_iter_invoke(iter, iter->steps, iter->input);
480         iter->input = 0;
481
482         if (!iter->last)
483             return 0;
484
485         iter->token_count++;
486
487         if (iter->chain->sort)
488         {
489             icu_sortkey8_from_utf16(iter->chain->coll,
490                                     iter->sort8, iter->last,
491                                     &iter->status);
492         }
493         icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
494         icu_buf_utf16_destroy(iter->last);
495
496         return 1;
497     }
498 }
499
500 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
501 {
502     return icu_buf_utf8_to_cstr(iter->result);
503 }
504
505 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
506 {
507     return icu_buf_utf8_to_cstr(iter->sort8);
508 }
509
510 const char *icu_iter_get_display(yaz_icu_iter_t iter)
511 {
512     return icu_buf_utf8_to_cstr(iter->display);
513 }
514
515 int icu_iter_get_token_number(yaz_icu_iter_t iter)
516 {
517     return iter->token_count;
518 }
519
520 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
521                           UErrorCode *status)
522 {
523     if (chain->iter)
524         icu_iter_destroy(chain->iter);
525     chain->iter = icu_iter_create(chain);
526     icu_iter_first(chain->iter, src8cstr);
527     return 1;
528 }
529
530 int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
531 {
532     *status = U_ZERO_ERROR;
533     return icu_iter_next(chain->iter);
534 }
535
536 int icu_chain_token_number(struct icu_chain *chain)
537 {
538     if (chain && chain->iter)
539         return chain->iter->token_count;
540     return 0;
541 }
542
543 const char *icu_chain_token_display(struct icu_chain *chain)
544 {
545     if (chain->iter)
546         return icu_iter_get_display(chain->iter);
547     return 0;
548 }
549
550 const char *icu_chain_token_norm(struct icu_chain *chain)
551 {
552     if (chain->iter)
553         return icu_iter_get_norm(chain->iter);
554     return 0;
555 }
556
557 const char *icu_chain_token_sortkey(struct icu_chain *chain)
558 {
559     if (chain->iter)
560         return icu_iter_get_sortkey(chain->iter);
561     return 0;
562 }
563
564 #endif /* YAZ_HAVE_ICU */
565
566 /*
567  * Local variables:
568  * c-basic-offset: 4
569  * c-file-style: "Stroustrup"
570  * indent-tabs-mode: nil
571  * End:
572  * vim: shiftwidth=4 tabstop=8 expandtab
573  */
574