e7b9b4e9811f005d10acb732d43adbc32d814e08
[yaz-moved-to-github.git] / src / icu_chain.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2013 Index Data
3  * See the file LICENSE for details.
4  */
5
6 /**
7  * \file
8  * \brief ICU chain
9  */
10
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17
18 #include <yaz/icu_I18N.h>
19
20 #include <yaz/stemmer.h>
21
22 #include <yaz/log.h>
23 #include <yaz/nmem.h>
24 #include <yaz/nmem_xml.h>
25 #include <string.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <assert.h>
29
30 #include <unicode/ustring.h>  /* some more string fcns*/
31 #include <unicode/uchar.h>    /* char names           */
32
/** Kinds of processing step that can be placed in an ICU chain. */
enum icu_chain_step_type {
    ICU_chain_step_type_none,           /* no step */
    ICU_chain_step_type_display,        /* convert to utf8 display format */
    ICU_chain_step_type_casemap,        /* apply utf16 charmap */
    ICU_chain_step_type_transform,      /* apply utf16 transform (by ID) */
    ICU_chain_step_type_tokenize,       /* apply utf16 tokenization */
    ICU_chain_step_type_transliterate,  /* apply utf16 transliteration rules */
    YAZ_chain_step_type_stemming,       /* apply utf16 stemming (YAZ) */
    ICU_chain_step_type_join            /* append following tokens, separated
                                           by the join string */
};
43
44 struct icu_chain_step
45 {
46     /* type and action object */
47     enum icu_chain_step_type type;
48     union {
49         struct icu_casemap   *casemap;
50         struct icu_transform *transform;
51         struct icu_tokenizer *tokenizer;
52         yaz_stemmer_p         stemmer;
53         struct icu_buf_utf16 *join;
54     } u;
55     struct icu_chain_step *previous;
56 };
57
58 struct icu_chain
59 {
60     yaz_icu_iter_t iter;
61     char *locale;
62     int sort;
63
64     UCollator *coll;
65
66     /* linked list of chain steps */
67     struct icu_chain_step *csteps;
68 };
69
70 int icu_check_status(UErrorCode status)
71 {
72     if (U_FAILURE(status))
73     {
74         yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
75         return 0;
76     }
77     return 1;
78 }
79
80 static struct icu_chain_step *icu_chain_insert_step(
81     struct icu_chain *chain, enum icu_chain_step_type type,
82     const char *rule, UErrorCode *status)
83 {
84     struct icu_chain_step *step = 0;
85
86     if (!chain || !type || !rule)
87         return 0;
88
89     step = (struct icu_chain_step *) xmalloc(sizeof(*step));
90
91     step->type = type;
92     /* create auxilary objects */
93     switch (step->type)
94     {
95     case ICU_chain_step_type_display:
96         break;
97     case ICU_chain_step_type_casemap:
98         step->u.casemap = icu_casemap_create(rule[0], status);
99         break;
100     case ICU_chain_step_type_transform:
101         /* rule omitted. Only ID used */
102         step->u.transform = icu_transform_create(rule, 'f', 0, status);
103         break;
104     case ICU_chain_step_type_tokenize:
105         step->u.tokenizer = icu_tokenizer_create(chain->locale, rule[0], status);
106         break;
107     case ICU_chain_step_type_transliterate:
108         /* we pass a dummy ID to utrans_openU.. */
109         step->u.transform = icu_transform_create("custom", 'f', rule, status);
110         break;
111     case YAZ_chain_step_type_stemming:
112         step->u.stemmer = yaz_stemmer_create(chain->locale, rule, status);
113         break;
114     case ICU_chain_step_type_join:
115         step->u.join = icu_buf_utf16_create(0);
116         icu_utf16_from_utf8_cstr(step->u.join, rule, status);
117         break;
118     default:
119         break;
120     }
121     step->previous = chain->csteps;
122     chain->csteps = step;
123
124     return step;
125 }
126
127
128 static void icu_chain_step_destroy(struct icu_chain_step *step)
129 {
130     if (!step)
131         return;
132
133     icu_chain_step_destroy(step->previous);
134
135     switch (step->type)
136     {
137     case ICU_chain_step_type_display:
138         break;
139     case ICU_chain_step_type_casemap:
140         icu_casemap_destroy(step->u.casemap);
141         break;
142     case ICU_chain_step_type_transform:
143     case ICU_chain_step_type_transliterate:
144         icu_transform_destroy(step->u.transform);
145         break;
146     case ICU_chain_step_type_tokenize:
147         icu_tokenizer_destroy(step->u.tokenizer);
148         break;
149     case YAZ_chain_step_type_stemming:
150         yaz_stemmer_destroy(step->u.stemmer);
151         break;
152     case ICU_chain_step_type_join:
153         icu_buf_utf16_destroy(step->u.join);
154         break;
155     default:
156         break;
157     }
158     xfree(step);
159 }
160
161 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
162 {
163     struct icu_chain_step *step = 0;
164     struct icu_chain_step **sp = &step;
165     while (old)
166     {
167         *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
168         (*sp)->type = old->type;
169
170         switch ((*sp)->type)
171         {
172         case ICU_chain_step_type_display:
173             break;
174         case ICU_chain_step_type_casemap:
175             (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
176             break;
177         case ICU_chain_step_type_transform:
178         case ICU_chain_step_type_transliterate:
179             (*sp)->u.transform = icu_transform_clone(old->u.transform);
180             break;
181         case ICU_chain_step_type_tokenize:
182             (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
183             break;
184         case YAZ_chain_step_type_stemming:
185             (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer);
186             break;
187         case ICU_chain_step_type_none:
188             break;
189         case ICU_chain_step_type_join:
190             (*sp)->u.join = icu_buf_utf16_create(0);
191             (*sp)->u.join = icu_buf_utf16_copy((*sp)->u.join, old->u.join);
192             break;
193         }
194         old = old->previous;
195         sp = &(*sp)->previous;
196     }
197     *sp = 0;
198     return step;
199 }
200
201 struct icu_chain *icu_chain_create(const char *locale, int sort,
202                                    UErrorCode *status)
203 {
204     struct icu_chain *chain;
205     UCollator *coll = ucol_open(locale, status);
206
207     if (U_FAILURE(*status))
208         return 0;
209
210     chain = (struct icu_chain *) xmalloc(sizeof(*chain));
211     chain->iter = 0;
212     chain->locale = xstrdup(locale);
213     chain->sort = sort;
214     chain->coll = coll;
215     chain->csteps = 0;
216
217     return chain;
218 }
219
220 void icu_chain_destroy(struct icu_chain *chain)
221 {
222     if (chain)
223     {
224         if (chain->coll)
225             ucol_close(chain->coll);
226
227         if (chain->iter)
228             icu_iter_destroy(chain->iter);
229         icu_chain_step_destroy(chain->csteps);
230         xfree(chain->locale);
231         xfree(chain);
232     }
233 }
234
235 struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
236                                        int sort,
237                                        UErrorCode *status)
238 {
239     xmlNode *node = 0;
240     int no_errors = 0;
241     struct icu_chain *chain = 0;
242     NMEM nmem = 0;
243
244     *status = U_ZERO_ERROR;
245
246     if (xml_node && xml_node->type == XML_ELEMENT_NODE)
247     {
248         xmlChar *xml_locale = xmlGetProp((xmlNode *) xml_node,
249                                          (xmlChar *) "locale");
250         if (xml_locale)
251         {
252             chain = icu_chain_create((const char *) xml_locale, sort, status);
253             xmlFree(xml_locale);
254         }
255     }
256
257     if (!chain)
258         return 0;
259
260     nmem = nmem_create();
261     for (node = xml_node->children; node; node = node->next)
262     {
263         char *rule = 0;
264         struct icu_chain_step *step = 0;
265         struct _xmlAttr *attr;
266
267         nmem_reset(nmem);
268         if (node->type != XML_ELEMENT_NODE)
269             continue;
270
271         for (attr = node->properties; attr; attr = attr->next)
272         {
273             if (!strcmp((const char *) attr->name, "rule"))
274             {
275                 rule = nmem_text_node_cdata(attr->children, nmem);
276             }
277             else
278             {
279                 yaz_log(YLOG_WARN, "Unsupported attribute '%s' for "
280                         "element '%s'", attr->name, node->name);
281                 no_errors++;
282                 continue;
283             }
284         }
285         if (!rule && node->children)
286             rule = nmem_text_node_cdata(node->children, nmem);
287
288         if (!strcmp((const char *) node->name, "casemap"))
289             step = icu_chain_insert_step(chain,
290                                          ICU_chain_step_type_casemap,
291                                          rule, status);
292         else if (!strcmp((const char *) node->name, "transform"))
293             step = icu_chain_insert_step(chain,
294                                          ICU_chain_step_type_transform,
295                                          rule, status);
296         else if (!strcmp((const char *) node->name, "transliterate"))
297             step = icu_chain_insert_step(chain,
298                                          ICU_chain_step_type_transliterate,
299                                          rule, status);
300         else if (!strcmp((const char *) node->name, "tokenize"))
301             step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
302                                          rule, status);
303         else if (!strcmp((const char *) node->name, "display"))
304             step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
305                                          "", status);
306         else if (!strcmp((const char *) node->name, "stemming"))
307             step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
308                                          rule, status);
309         else if (!strcmp((const char *) node->name, "join"))
310         {
311             step = icu_chain_insert_step(chain, ICU_chain_step_type_join,
312                                          rule, status);
313         }
314         else if (!strcmp((const char *) node->name, "normalize"))
315         {
316             yaz_log(YLOG_WARN, "Element %s is deprecated. "
317                     "Use transform instead", node->name);
318             step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
319                                          rule, status);
320         }
321         else if (!strcmp((const char *) node->name, "index")
322                  || !strcmp((const char *) node->name, "sortkey"))
323         {
324             yaz_log(YLOG_WARN, "Element %s is no longer needed. "
325                     "Remove it from the configuration", node->name);
326         }
327         else
328         {
329             yaz_log(YLOG_WARN, "Unknown element %s", node->name);
330             no_errors++;
331             continue;
332         }
333         if (!step)
334         {
335             yaz_log(YLOG_WARN, "Step not created for %s", node->name);
336             no_errors++;
337         }
338         if (step && U_FAILURE(*status))
339         {
340             no_errors++;
341             break;
342         }
343     }
344     nmem_destroy(nmem);
345     if (no_errors)
346     {
347         icu_chain_destroy(chain);
348         return 0;
349     }
350     return chain;
351 }
352
353 struct icu_iter {
354     struct icu_chain *chain;
355     struct icu_buf_utf16 *last;
356     UErrorCode status;
357     struct icu_buf_utf8 *display;
358     struct icu_buf_utf8 *sort8;
359     struct icu_buf_utf8 *result;
360     struct icu_buf_utf16 *input;
361     int token_count;
362     struct icu_chain_step *steps;
363 };
364
365 void icu_utf16_print(struct icu_buf_utf16 *src16)
366 {
367     UErrorCode status = U_ZERO_ERROR;
368     const char *p;
369     struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
370     icu_utf16_to_utf8(dst8, src16, &status);
371
372     if (U_FAILURE(status))
373     {
374         printf("failure");
375     }
376     else
377     {
378         p = icu_buf_utf8_to_cstr(dst8);
379         printf("%s", p);
380     }
381     icu_buf_utf8_destroy(dst8);
382 }
383
384 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
385                                       struct icu_chain_step *step,
386                                       struct icu_buf_utf16 *src)
387 {
388     if (!step)
389         return src;
390     else
391     {
392         struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
393
394         switch (step->type)
395         {
396         case ICU_chain_step_type_casemap:
397             if (dst)
398             {
399                 struct icu_buf_utf16 *src = dst;
400
401                 dst = icu_buf_utf16_create(0);
402                 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
403                                     iter->chain->locale);
404                 icu_buf_utf16_destroy(src);
405             }
406             break;
407         case ICU_chain_step_type_tokenize:
408             if (dst)
409             {
410                 struct icu_buf_utf16 *src = dst;
411
412                 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
413                 icu_buf_utf16_destroy(src);
414             }
415             dst = icu_buf_utf16_create(0);
416             iter->status = U_ZERO_ERROR;
417             if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
418             {
419                 icu_buf_utf16_destroy(dst);
420                 dst = 0;
421             }
422             break;
423         case ICU_chain_step_type_transform:
424         case ICU_chain_step_type_transliterate:
425             if (dst)
426             {
427                 struct icu_buf_utf16 *src = dst;
428                 dst = icu_buf_utf16_create(0);
429                 icu_transform_trans(step->u.transform, dst, src, &iter->status);
430                 icu_buf_utf16_destroy(src);
431             }
432             break;
433         case ICU_chain_step_type_display:
434             if (dst)
435                 icu_utf16_to_utf8(iter->display, dst, &iter->status);
436             break;
437         case YAZ_chain_step_type_stemming:
438             if (dst)
439             {
440                 struct icu_buf_utf16 *src = dst;
441                 dst = icu_buf_utf16_create(0);
442                 yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
443                 icu_buf_utf16_destroy(src);
444             }
445             break;
446         case ICU_chain_step_type_join:
447             if (dst)
448             {
449                 while (1)
450                 {
451                     struct icu_buf_utf16 *dst1 =
452                         icu_iter_invoke(iter, step->previous, 0);
453
454                     if (!dst1)
455                         break; 
456                     dst = icu_buf_utf16_append(dst, step->u.join);
457                     dst = icu_buf_utf16_append(dst, dst1);
458                     icu_buf_utf16_destroy(dst1);
459                 }
460             }
461             break;
462         default:
463             assert(0);
464         }
465         return dst;
466     }
467 }
468
469 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
470 {
471     yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
472     iter->chain = chain;
473     iter->status = U_ZERO_ERROR;
474     iter->display = icu_buf_utf8_create(0);
475     iter->sort8 = icu_buf_utf8_create(0);
476     iter->result = icu_buf_utf8_create(0);
477     iter->last = 0; /* no last returned string (yet) */
478     iter->steps = icu_chain_step_clone(chain->csteps);
479     iter->input = 0;
480
481     return iter;
482 }
483
484 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
485 {
486     if (iter->input)
487         icu_buf_utf16_destroy(iter->input);
488     iter->input = icu_buf_utf16_create(0);
489     iter->token_count = 0;
490     /* fill and assign input string.. It will be 0 after
491        first iteration */
492     icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
493 }
494
495 void icu_iter_destroy(yaz_icu_iter_t iter)
496 {
497     if (iter)
498     {
499         icu_buf_utf8_destroy(iter->display);
500         icu_buf_utf8_destroy(iter->sort8);
501         icu_buf_utf8_destroy(iter->result);
502         if (iter->input)
503             icu_buf_utf16_destroy(iter->input);
504         icu_chain_step_destroy(iter->steps);
505         xfree(iter);
506     }
507 }
508
509 int icu_iter_next(yaz_icu_iter_t iter)
510 {
511     if (!iter->input && iter->last == 0)
512         return 0;
513     else
514     {
515         /* on first call, iter->input is the input string. Thereafter: 0. */
516         assert(iter->steps || !iter->chain->csteps);
517         iter->last = icu_iter_invoke(iter, iter->steps, iter->input);
518         iter->input = 0;
519
520         if (!iter->last)
521             return 0;
522
523         iter->token_count++;
524
525         if (iter->chain->sort)
526         {
527             icu_sortkey8_from_utf16(iter->chain->coll,
528                                     iter->sort8, iter->last,
529                                     &iter->status);
530         }
531         icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
532         icu_buf_utf16_destroy(iter->last);
533
534         return 1;
535     }
536 }
537
538 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
539 {
540     return icu_buf_utf8_to_cstr(iter->result);
541 }
542
543 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
544 {
545     return icu_buf_utf8_to_cstr(iter->sort8);
546 }
547
548 const char *icu_iter_get_display(yaz_icu_iter_t iter)
549 {
550     return icu_buf_utf8_to_cstr(iter->display);
551 }
552
553 int icu_iter_get_token_number(yaz_icu_iter_t iter)
554 {
555     return iter->token_count;
556 }
557
558 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
559                           UErrorCode *status)
560 {
561     if (chain->iter)
562         icu_iter_destroy(chain->iter);
563     chain->iter = icu_iter_create(chain);
564     icu_iter_first(chain->iter, src8cstr);
565     return 1;
566 }
567
568 int icu_chain_next_token(struct icu_chain *chain, UErrorCode *status)
569 {
570     *status = U_ZERO_ERROR;
571     return icu_iter_next(chain->iter);
572 }
573
574 int icu_chain_token_number(struct icu_chain *chain)
575 {
576     if (chain && chain->iter)
577         return chain->iter->token_count;
578     return 0;
579 }
580
581 const char *icu_chain_token_display(struct icu_chain *chain)
582 {
583     if (chain->iter)
584         return icu_iter_get_display(chain->iter);
585     return 0;
586 }
587
588 const char *icu_chain_token_norm(struct icu_chain *chain)
589 {
590     if (chain->iter)
591         return icu_iter_get_norm(chain->iter);
592     return 0;
593 }
594
595 const char *icu_chain_token_sortkey(struct icu_chain *chain)
596 {
597     if (chain->iter)
598         return icu_iter_get_sortkey(chain->iter);
599     return 0;
600 }
601
602 #endif /* YAZ_HAVE_ICU */
603
604 /*
605  * Local variables:
606  * c-basic-offset: 4
607  * c-file-style: "Stroustrup"
608  * indent-tabs-mode: nil
609  * End:
610  * vim: shiftwidth=4 tabstop=8 expandtab
611  */
612