195b15aeafb66115eeec3675fd3a0644586eb49c
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2012 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/stemmer.h>
21
22 #include <yaz/log.h>
23 #include <yaz/nmem.h>
24 #include <yaz/nmem_xml.h>
25 #include <string.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <assert.h>
29
30 #include <unicode/ustring.h>  /* some more string fcns*/
31 #include <unicode/uchar.h>    /* char names           */
32
/** Kinds of processing step that can be linked into an ICU chain. */
enum icu_chain_step_type {
    ICU_chain_step_type_none,           /* no step; rejected on insert */
    ICU_chain_step_type_display,        /* convert to utf8 display format */
    ICU_chain_step_type_casemap,        /* apply utf16 charmap */
    ICU_chain_step_type_transform,      /* apply utf16 transform (by ID) */
    ICU_chain_step_type_tokenize,       /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate,  /* apply utf16 transliteration rules */
    YAZ_chain_step_type_stemming        /* apply utf16 stemming (YAZ) */
};
42
43 struct icu_chain_step
44 {
45     /* type and action object */
46     enum icu_chain_step_type type;
47     union {
48         struct icu_casemap   *casemap;
49         struct icu_transform *transform;
50         struct icu_tokenizer *tokenizer;
51         yaz_stemmer_p         stemmer;
52     } u;
53     struct icu_chain_step *previous;
54 };
55
56 struct icu_chain
57 {
58     yaz_icu_iter_t iter;
59     char *locale;
60     int sort;
61
62     UCollator *coll;
63
64     /* linked list of chain steps */
65     struct icu_chain_step *csteps;
66 };
67
68 int icu_check_status(UErrorCode status)
69 {
70     if (U_FAILURE(status))
71     {
72         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
73         return 0;
74     }
75     return 1;
76 }
77
78 static struct icu_chain_step *icu_chain_insert_step(
79     struct icu_chain *chain, enum icu_chain_step_type type,
80     const uint8_t *rule, UErrorCode *status)
81 {
82     struct icu_chain_step *step = 0;
83
84     if (!chain || !type || !rule)
85         return 0;
86
87     step = (struct icu_chain_step *) xmalloc(sizeof(*step));
88
89     step->type = type;
90     /* create auxilary objects */
91     switch (step->type)
92     {
93     case ICU_chain_step_type_display:
94         break;
95     case ICU_chain_step_type_casemap:
96         step->u.casemap = icu_casemap_create(rule[0], status);
97         break;
98     case ICU_chain_step_type_transform:
99         /* rule omitted. Only ID used */
100         step->u.transform = icu_transform_create((const char *) rule, 'f',
101                                                  0, status);
102         break;
103     case ICU_chain_step_type_tokenize:
104         step->u.tokenizer = icu_tokenizer_create(chain->locale,
105                                                  (char) rule[0], status);
106         break;
107     case ICU_chain_step_type_transliterate:
108         /* we pass a dummy ID to utrans_openU.. */
109         step->u.transform = icu_transform_create("custom", 'f',
110                                                  (const char *) rule, status);
111         break;
112     case YAZ_chain_step_type_stemming:
113         step->u.stemmer = yaz_stemmer_create(chain->locale,
114                                              (const char *) rule, status);
115         break;
116     default:
117         break;
118     }
119     step->previous = chain->csteps;
120     chain->csteps = step;
121
122     return step;
123 }
124
125
126 static void icu_chain_step_destroy(struct icu_chain_step *step)
127 {
128     if (!step)
129         return;
130
131     icu_chain_step_destroy(step->previous);
132
133     switch (step->type)
134     {
135     case ICU_chain_step_type_display:
136         break;
137     case ICU_chain_step_type_casemap:
138         icu_casemap_destroy(step->u.casemap);
139         break;
140     case ICU_chain_step_type_transform:
141     case ICU_chain_step_type_transliterate:
142         icu_transform_destroy(step->u.transform);
143         break;
144     case ICU_chain_step_type_tokenize:
145         icu_tokenizer_destroy(step->u.tokenizer);
146         break;
147     case YAZ_chain_step_type_stemming:
148         yaz_stemmer_destroy(step->u.stemmer);
149         break;
150     default:
151         break;
152     }
153     xfree(step);
154 }
155
156 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
157 {
158     struct icu_chain_step *step = 0;
159     struct icu_chain_step **sp = &step;
160     while (old)
161     {
162         *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
163         (*sp)->type = old->type;
164
165         switch ((*sp)->type)
166         {
167         case ICU_chain_step_type_display:
168             break;
169         case ICU_chain_step_type_casemap:
170             (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
171             break;
172         case ICU_chain_step_type_transform:
173         case ICU_chain_step_type_transliterate:
174             (*sp)->u.transform = icu_transform_clone(old->u.transform);
175             break;
176         case ICU_chain_step_type_tokenize:
177             (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
178             break;
179         case YAZ_chain_step_type_stemming:
180             (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer);
181             break;
182         case ICU_chain_step_type_none:
183             break;
184         }
185         old = old->previous;
186         sp = &(*sp)->previous;
187     }
188     *sp = 0;
189     return step;
190 }
191
192 struct icu_chain *icu_chain_create(const char *locale, int sort,
193                                    UErrorCode *status)
194 {
195     struct icu_chain *chain;
196     UCollator *coll = ucol_open(locale, status);
197
198     if (U_FAILURE(*status))
199         return 0;
200
201     chain = (struct icu_chain *) xmalloc(sizeof(*chain));
202     chain->iter = 0;
203     chain->locale = xstrdup(locale);
204     chain->sort = sort;
205     chain->coll = coll;
206     chain->csteps = 0;
207
208     return chain;
209 }
210
211 void icu_chain_destroy(struct icu_chain *chain)
212 {
213     if (chain)
214     {
215         if (chain->coll)
216             ucol_close(chain->coll);
217
218         if (chain->iter)
219             icu_iter_destroy(chain->iter);
220         icu_chain_step_destroy(chain->csteps);
221         xfree(chain->locale);
222         xfree(chain);
223     }
224 }
225
226 struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
227                                        int sort,
228                                        UErrorCode *status)
229 {
230     xmlNode *node = 0;
231     int no_errors = 0;
232     struct icu_chain *chain = 0;
233     NMEM nmem = 0;
234
235     *status = U_ZERO_ERROR;
236
237     if (xml_node && xml_node->type == XML_ELEMENT_NODE)
238     {
239         xmlChar *xml_locale = xmlGetProp((xmlNode *) xml_node,
240                                          (xmlChar *) "locale");
241         if (xml_locale)
242         {
243             chain = icu_chain_create((const char *) xml_locale, sort, status);
244             xmlFree(xml_locale);
245         }
246     }
247
248     if (!chain)
249         return 0;
250
251     nmem = nmem_create();
252     for (node = xml_node->children; node; node = node->next)
253     {
254         char *rule = 0;
255         struct icu_chain_step *step = 0;
256         struct _xmlAttr *attr;
257
258         nmem_reset(nmem);
259         if (node->type != XML_ELEMENT_NODE)
260             continue;
261
262         for (attr = node->properties; attr; attr = attr->next)
263         {
264             if (!strcmp((const char *) attr->name, "rule"))
265             {
266                 rule = nmem_text_node_cdata(attr->children, nmem);
267             }
268             else
269             {
270                 yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
271                         "element '%s'", attr->name, node->name);
272                 no_errors++;
273                 continue;
274             }
275         }
276         if (!rule && node->children)
277             rule = nmem_text_node_cdata(node->children, nmem);
278
279         if (!strcmp((const char *) node->name, "casemap"))
280             step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
281                                          (const uint8_t *) rule, status);
282         else if (!strcmp((const char *) node->name, "transform"))
283             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
284                                          (const uint8_t *) rule, status);
285         else if (!strcmp((const char *) node->name, "transliterate"))
286             step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
287                                          (const uint8_t *) rule, status);
288         else if (!strcmp((const char *) node->name, "tokenize"))
289             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
290                                          (const uint8_t *) rule, status);
291         else if (!strcmp((const char *) node->name, "display"))
292             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
293                                          (const uint8_t *) "", status);
294         else if (!strcmp((const char *) node->name, "stemming"))
295             step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
296                                          (const uint8_t *) rule, status);
297         else if (!strcmp((const char *) node->name, "normalize"))
298         {
299             yaz_log(YLOG_WARN, "Element %s is deprecated. "
300                     "Use transform instead", node->name);
301             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
302                                          (const uint8_t *) rule, status);
303         }
304         else if (!strcmp((const char *) node->name, "index")
305                  || !strcmp((const char *) node->name, "sortkey"))
306         {
307             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
308                     "Remove it from the configuration", node->name);
309         }
310         else
311         {
312             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
313             no_errors++;
314             continue;
315         }
316         if (step && U_FAILURE(*status))
317         {
318             no_errors++;
319             break;
320         }
321     }
322     nmem_destroy(nmem);
323     if (no_errors)
324     {
325         icu_chain_destroy(chain);
326         return 0;
327     }
328     return chain;
329 }
330
331 struct icu_iter {
332     struct icu_chain *chain;
333     struct icu_buf_utf16 *last;
334     UErrorCode status;
335     struct icu_buf_utf8 *display;
336     struct icu_buf_utf8 *sort8;
337     struct icu_buf_utf8 *result;
338     struct icu_buf_utf16 *input;
339     int token_count;
340     struct icu_chain_step *steps;
341 };
342
343 void icu_utf16_print(struct icu_buf_utf16 *src16)
344 {
345     UErrorCode status = U_ZERO_ERROR;
346     const char *p;
347     struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
348     icu_utf16_to_utf8(dst8, src16, &status);
349
350     assert(status != 1234);
351     if (U_FAILURE(status))
352     {
353         printf("failure");
354     }
355     else
356     {
357         p = icu_buf_utf8_to_cstr(dst8);
358         printf("%s", p);
359     }
360     icu_buf_utf8_destroy(dst8);
361 }
362
363 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
364                                       struct icu_chain_step *step,
365                                       struct icu_buf_utf16 *src)
366 {
367     if (!step)
368         return src;
369     else
370     {
371         struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
372
373         switch (step->type)
374         {
375         case ICU_chain_step_type_casemap:
376             if (dst)
377             {
378                 struct icu_buf_utf16 *src = dst;
379
380                 dst = icu_buf_utf16_create(0);
381                 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
382                                     iter->chain->locale);
383                 icu_buf_utf16_destroy(src);
384             }
385             break;
386         case ICU_chain_step_type_tokenize:
387             if (dst)
388             {
389                 struct icu_buf_utf16 *src = dst;
390
391                 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
392                 icu_buf_utf16_destroy(src);
393             }
394             dst = icu_buf_utf16_create(0);
395             iter->status = U_ZERO_ERROR;
396             if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
397             {
398                 icu_buf_utf16_destroy(dst);
399                 dst = 0;
400             }
401             break;
402         case ICU_chain_step_type_transform:
403         case ICU_chain_step_type_transliterate:
404             if (dst)
405             {
406                 struct icu_buf_utf16 *src = dst;
407                 dst = icu_buf_utf16_create(0);
408                 icu_transform_trans(step->u.transform, dst, src, &iter->status);
409                 icu_buf_utf16_destroy(src);
410             }
411             break;
412         case ICU_chain_step_type_display:
413             if (dst)
414                 icu_utf16_to_utf8(iter->display, dst, &iter->status);
415             break;
416         case YAZ_chain_step_type_stemming:
417             if (dst)
418             {
419                 struct icu_buf_utf16 *src = dst;
420                 dst = icu_buf_utf16_create(0);
421                 yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
422                 icu_buf_utf16_destroy(src);
423             }
424             break;
425         default:
426             assert(0);
427         }
428         return dst;
429     }
430 }
431
432 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
433 {
434     yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
435     iter->chain = chain;
436     iter->status = U_ZERO_ERROR;
437     iter->display = icu_buf_utf8_create(0);
438     iter->sort8 = icu_buf_utf8_create(0);
439     iter->result = icu_buf_utf8_create(0);
440     iter->last = 0; /* no last returned string (yet) */
441     iter->steps = icu_chain_step_clone(chain->csteps);
442     iter->input = 0;
443
444     return iter;
445 }
446
447 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
448 {
449     if (iter->input)
450         icu_buf_utf16_destroy(iter->input);
451     iter->input = icu_buf_utf16_create(0);
452     iter->token_count = 0;
453     /* fill and assign input string.. It will be 0 after
454        first iteration */
455     icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
456 }
457
458 void icu_iter_destroy(yaz_icu_iter_t iter)
459 {
460     if (iter)
461     {
462         icu_buf_utf8_destroy(iter->display);
463         icu_buf_utf8_destroy(iter->sort8);
464         icu_buf_utf8_destroy(iter->result);
465         if (iter->input)
466             icu_buf_utf16_destroy(iter->input);
467         icu_chain_step_destroy(iter->steps);
468         xfree(iter);
469     }
470 }
471
472 int icu_iter_next(yaz_icu_iter_t iter)
473 {
474     if (!iter->input && iter->last == 0)
475         return 0;
476     else
477     {
478         /* on first call, iter->input is the input string. Thereafter: 0. */
479         iter->last = icu_iter_invoke(iter, iter->steps ?
480                                      iter->steps : iter->chain->csteps,
481                                      iter->input);
482         iter->input = 0;
483
484         if (!iter->last)
485             return 0;
486
487         iter->token_count++;
488
489         if (iter->chain->sort)
490         {
491             icu_sortkey8_from_utf16(iter->chain->coll,
492                                     iter->sort8, iter->last,
493                                     &iter->status);
494         }
495         icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
496         icu_buf_utf16_destroy(iter->last);
497
498         return 1;
499     }
500 }
501
502 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
503 {
504     return icu_buf_utf8_to_cstr(iter->result);
505 }
506
507 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
508 {
509     return icu_buf_utf8_to_cstr(iter->sort8);
510 }
511
512 const char *icu_iter_get_display(yaz_icu_iter_t iter)
513 {
514     return icu_buf_utf8_to_cstr(iter->display);
515 }
516
517 int icu_iter_get_token_number(yaz_icu_iter_t iter)
518 {
519     return iter->token_count;
520 }
521
522 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
523                           UErrorCode *status)
524 {
525     if (chain->iter)
526         icu_iter_destroy(chain->iter);
527     chain->iter = icu_iter_create(chain);
528     icu_iter_first(chain->iter, src8cstr);
529     return 1;
530 }
531
532 int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
533 {
534     *status = U_ZERO_ERROR;
535     return icu_iter_next(chain->iter);
536 }
537
538 int icu_chain_token_number(struct icu_chain *chain)
539 {
540     if (chain && chain->iter)
541         return chain->iter->token_count;
542     return 0;
543 }
544
545 const char *icu_chain_token_display(struct icu_chain *chain)
546 {
547     if (chain->iter)
548         return icu_iter_get_display(chain->iter);
549     return 0;
550 }
551
552 const char *icu_chain_token_norm(struct icu_chain *chain)
553 {
554     if (chain->iter)
555         return icu_iter_get_norm(chain->iter);
556     return 0;
557 }
558
559 const char *icu_chain_token_sortkey(struct icu_chain *chain)
560 {
561     if (chain->iter)
562         return icu_iter_get_sortkey(chain->iter);
563     return 0;
564 }
565
566 #endif /* YAZ_HAVE_ICU */
567
568 /*
569  * Local variables:
570  * c-basic-offset: 4
571  * c-file-style: "Stroustrup"
572  * indent-tabs-mode: nil
573  * End:
574  * vim: shiftwidth=4 tabstop=8 expandtab
575  */
576