1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2010 Index Data
3 * See the file LICENSE for details.
16 #include <yaz/xmalloc.h>
18 #include <yaz/icu_I18N.h>
20 #include <yaz/stemmer.h>
29 #include <unicode/ustring.h> /* some more string fcns*/
30 #include <unicode/uchar.h> /* char names */
/* Kinds of processing step that a chain step node can carry. */
32 enum icu_chain_step_type {
33 ICU_chain_step_type_none, /* value 0; rejected by the !type checks in step create/insert */
34 ICU_chain_step_type_display, /* convert to utf8 display format */
35 ICU_chain_step_type_casemap, /* apply utf16 casemap */
36 ICU_chain_step_type_transform, /* apply utf16 transform */
37 ICU_chain_step_type_tokenize, /* apply utf16 tokenization */
38 ICU_chain_step_type_transliterate, /* apply utf16 transliteration rules */
39 YAZ_chain_step_type_stemming /* apply utf16 stemming (YAZ) */
44 /* type and action object */
45 enum icu_chain_step_type type;
47 struct icu_casemap * casemap;
48 struct icu_transform * transform;
49 struct icu_tokenizer * tokenizer;
50 yaz_stemmer_p stemmer;
52 struct icu_chain_step * previous;
63 /* linked list of chain steps */
64 struct icu_chain_step * csteps;
67 int icu_check_status(UErrorCode status)
69 if (U_FAILURE(status))
71 yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status));
77 static struct icu_chain_step *icu_chain_step_create(
78 struct icu_chain * chain, enum icu_chain_step_type type,
82 struct icu_chain_step * step = 0;
84 if (!chain || !type || !rule)
87 step = (struct icu_chain_step *) xmalloc(sizeof(*step));
90 /* create auxiliary objects */
93 case ICU_chain_step_type_display:
95 case ICU_chain_step_type_casemap:
96 step->u.casemap = icu_casemap_create(rule[0], status);
98 case ICU_chain_step_type_transform:
99 /* rule omitted. Only ID used */
100 step->u.transform = icu_transform_create((const char *) rule, 'f',
103 case ICU_chain_step_type_tokenize:
104 step->u.tokenizer = icu_tokenizer_create((char *) chain->locale,
105 (char) rule[0], status);
107 case ICU_chain_step_type_transliterate:
108 /* we pass a dummy ID to utrans_openU.. */
109 step->u.transform = icu_transform_create("custom", 'f',
110 (const char *) rule, status);
112 case YAZ_chain_step_type_stemming:
113 step->u.stemmer = yaz_stemmer_create((char *) chain->locale, (const char *) rule, status);
122 static void icu_chain_step_destroy(struct icu_chain_step * step)
127 icu_chain_step_destroy(step->previous);
131 case ICU_chain_step_type_display:
133 case ICU_chain_step_type_casemap:
134 icu_casemap_destroy(step->u.casemap);
136 case ICU_chain_step_type_transform:
137 case ICU_chain_step_type_transliterate:
138 icu_transform_destroy(step->u.transform);
140 case ICU_chain_step_type_tokenize:
141 icu_tokenizer_destroy(step->u.tokenizer);
143 case YAZ_chain_step_type_stemming:
144 yaz_stemmer_destroy(step->u.stemmer);
/* Clone chain steps starting at old.
 * A new node is allocated, the type of old is copied into it, and the
 * type-specific helper object (casemap / transform / tokenizer / stemmer)
 * is cloned from old into the new node. sp tracks the slot (initially
 * &step) that receives the next new node and then advances to that node's
 * previous pointer.
 *
 * Fix: the stemming case used to call yaz_stemmer_clone(step->u.stemmer)
 * and drop the result. step is the head of the NEW list, not the node being
 * copied, and the new node's u.stemmer was never set, so a later
 * icu_chain_step_destroy would hand an uninitialised pointer to
 * yaz_stemmer_destroy. It now clones from old and stores the result,
 * exactly like the sibling cases. */
152 struct icu_chain_step *icu_chain_step_clone(struct icu_chain_step *old)
154 struct icu_chain_step *step = 0;
155 struct icu_chain_step **sp = &step;
158 *sp = (struct icu_chain_step *) xmalloc(sizeof(**sp));
159 (*sp)->type = old->type;
163 case ICU_chain_step_type_display:
165 case ICU_chain_step_type_casemap:
166 (*sp)->u.casemap = icu_casemap_clone(old->u.casemap);
168 case ICU_chain_step_type_transform:
169 case ICU_chain_step_type_transliterate:
170 (*sp)->u.transform = icu_transform_clone(old->u.transform);
172 case ICU_chain_step_type_tokenize:
173 (*sp)->u.tokenizer = icu_tokenizer_clone(old->u.tokenizer);
175 case YAZ_chain_step_type_stemming:
176 (*sp)->u.stemmer = yaz_stemmer_clone(old->u.stemmer); /* store the clone of old's stemmer in the new node */
178 case ICU_chain_step_type_none:
182 sp = &(*sp)->previous;
188 struct icu_chain *icu_chain_create(const char *locale, int sort,
191 struct icu_chain * chain
192 = (struct icu_chain *) xmalloc(sizeof(*chain));
194 *status = U_ZERO_ERROR;
197 chain->locale = xstrdup(locale);
201 chain->coll = ucol_open((const char *) chain->locale, status);
203 if (U_FAILURE(*status))
211 void icu_chain_destroy(struct icu_chain * chain)
216 ucol_close(chain->coll);
219 icu_iter_destroy(chain->iter);
220 icu_chain_step_destroy(chain->csteps);
221 xfree(chain->locale);
226 static struct icu_chain_step *icu_chain_insert_step(
227 struct icu_chain * chain, enum icu_chain_step_type type,
228 const uint8_t * rule, UErrorCode *status);
/* Build a chain from an XML configuration element.
 * Resets *status, requires xml_node to be an element node, reads its
 * "locale" attribute and creates the chain with it. Each child element is
 * then mapped by name to a chain step, using the child's "rule" attribute
 * as the step rule ("display" uses an empty rule):
 *   casemap, transform, transliterate, tokenize, display, stemming
 * "normalize" is accepted as a deprecated alias of "transform"; "index"
 * and "sortkey" only produce a warning. An unknown element, or a step
 * whose creation leaves *status in a failure state, destroys the chain.
 *
 * Fix: the "stemming" branch called yaz_chain_insert_step, a name with no
 * visible declaration or definition in this file. It now calls
 * icu_chain_insert_step like every other branch; icu_chain_step_create
 * already handles YAZ_chain_step_type_stemming. */
230 struct icu_chain * icu_chain_xml_config(const xmlNode *xml_node,
235 struct icu_chain * chain = 0;
237 *status = U_ZERO_ERROR;
239 if (!xml_node ||xml_node->type != XML_ELEMENT_NODE)
243 xmlChar * xml_locale = xmlGetProp((xmlNode *) xml_node,
244 (xmlChar *) "locale");
248 chain = icu_chain_create((const char *) xml_locale, sort, status);
256 for (node = xml_node->children; node; node = node->next)
259 struct icu_chain_step * step = 0;
261 if (node->type != XML_ELEMENT_NODE)
264 xml_rule = xmlGetProp(node, (xmlChar *) "rule");
266 if (!strcmp((const char *) node->name, "casemap"))
267 step = icu_chain_insert_step(chain, ICU_chain_step_type_casemap,
268 (const uint8_t *) xml_rule, status);
269 else if (!strcmp((const char *) node->name, "transform"))
270 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
271 (const uint8_t *) xml_rule, status);
272 else if (!strcmp((const char *) node->name, "transliterate"))
273 step = icu_chain_insert_step(chain, ICU_chain_step_type_transliterate,
274 (const uint8_t *) xml_rule, status);
275 else if (!strcmp((const char *) node->name, "tokenize"))
276 step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
277 (const uint8_t *) xml_rule, status);
278 else if (!strcmp((const char *) node->name, "display"))
279 step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
280 (const uint8_t *) "", status);
281 else if (!strcmp((const char *) node->name, "stemming"))
282 step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
283 (const uint8_t *) xml_rule, status);
284 else if (!strcmp((const char *) node->name, "normalize"))
286 yaz_log(YLOG_WARN, "Element %s is deprecated. "
287 "Use transform instead", node->name);
288 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
289 (const uint8_t *) xml_rule, status);
291 else if (!strcmp((const char *) node->name, "index")
292 || !strcmp((const char *) node->name, "sortkey"))
294 yaz_log(YLOG_WARN, "Element %s is no longer needed. "
295 "Remove it from the configuration", node->name);
299 yaz_log(YLOG_WARN, "Unknown element %s", node->name);
300 icu_chain_destroy(chain);
304 if (step && U_FAILURE(*status))
306 icu_chain_destroy(chain);
314 static struct icu_chain_step *icu_chain_insert_step(
315 struct icu_chain * chain, enum icu_chain_step_type type,
316 const uint8_t * rule, UErrorCode *status)
318 struct icu_chain_step * step = 0;
319 if (!chain || !type || !rule)
322 /* create actual chain step with this buffer */
323 step = icu_chain_step_create(chain, type, rule,
326 step->previous = chain->csteps;
327 chain->csteps = step;
333 struct icu_chain *chain;
334 struct icu_buf_utf16 *last;
336 struct icu_buf_utf8 *display;
337 struct icu_buf_utf8 *sort8;
338 struct icu_buf_utf8 *result;
339 struct icu_buf_utf16 *input;
341 struct icu_chain_step *steps;
344 void icu_utf16_print(struct icu_buf_utf16 *src16)
346 UErrorCode status = U_ZERO_ERROR;
348 struct icu_buf_utf8 *dst8 = icu_buf_utf8_create(0);
349 icu_utf16_to_utf8(dst8, src16, &status);
351 assert(status != 1234);
352 if (U_FAILURE(status))
358 p = icu_buf_utf8_to_cstr(dst8);
361 icu_buf_utf8_destroy(dst8);
364 struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
365 struct icu_chain_step *step,
366 struct icu_buf_utf16 *src)
372 struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
376 case ICU_chain_step_type_casemap:
379 struct icu_buf_utf16 *src = dst;
381 dst = icu_buf_utf16_create(0);
382 icu_casemap_casemap(step->u.casemap, dst, src, &iter->status,
383 iter->chain->locale);
384 icu_buf_utf16_destroy(src);
387 case ICU_chain_step_type_tokenize:
390 struct icu_buf_utf16 *src = dst;
392 icu_tokenizer_attach(step->u.tokenizer, src, &iter->status);
393 icu_buf_utf16_destroy(src);
395 dst = icu_buf_utf16_create(0);
396 iter->status = U_ZERO_ERROR;
397 if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
399 icu_buf_utf16_destroy(dst);
403 case ICU_chain_step_type_transform:
404 case ICU_chain_step_type_transliterate:
407 struct icu_buf_utf16 *src = dst;
408 dst = icu_buf_utf16_create(0);
409 icu_transform_trans(step->u.transform, dst, src, &iter->status);
410 icu_buf_utf16_destroy(src);
413 case ICU_chain_step_type_display:
415 icu_utf16_to_utf8(iter->display, dst, &iter->status);
417 case YAZ_chain_step_type_stemming:
420 struct icu_buf_utf16 *src = dst;
421 dst = icu_buf_utf16_create(0);
422 yaz_stemmer_stem(step->u.stemmer, dst, src, &iter->status);
423 icu_buf_utf16_destroy(src);
433 yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
435 yaz_icu_iter_t iter = xmalloc(sizeof(*iter));
437 iter->status = U_ZERO_ERROR;
438 iter->display = icu_buf_utf8_create(0);
439 iter->sort8 = icu_buf_utf8_create(0);
440 iter->result = icu_buf_utf8_create(0);
441 iter->last = 0; /* no last returned string (yet) */
442 iter->steps = icu_chain_step_clone(chain->csteps);
448 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
451 icu_buf_utf16_destroy(iter->input);
452 iter->input = icu_buf_utf16_create(0);
453 iter->token_count = 0;
454 /* fill and assign input string.. It will be 0 after
456 icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
459 void icu_iter_destroy(yaz_icu_iter_t iter)
463 icu_buf_utf8_destroy(iter->display);
464 icu_buf_utf8_destroy(iter->sort8);
465 icu_buf_utf8_destroy(iter->result);
467 icu_buf_utf16_destroy(iter->input);
468 icu_chain_step_destroy(iter->steps);
473 int icu_iter_next(yaz_icu_iter_t iter)
475 if (!iter->input && iter->last == 0)
479 /* on first call, iter->input is the input string. Thereafter: 0. */
480 iter->last = icu_iter_invoke(iter, iter->steps ?
481 iter->steps : iter->chain->csteps,
490 if (iter->chain->sort)
492 icu_sortkey8_from_utf16(iter->chain->coll,
493 iter->sort8, iter->last,
496 icu_utf16_to_utf8(iter->result, iter->last, &iter->status);
497 icu_buf_utf16_destroy(iter->last);
/* Return iter->result as a C string. icu_iter_next fills that buffer with
 * the UTF-8 conversion of the last token produced by the step chain. */
503 const char *icu_iter_get_norm(yaz_icu_iter_t iter)
505 return icu_buf_utf8_to_cstr(iter->result);
/* Return iter->sort8 as a C string. icu_iter_next writes a sort key into
 * that buffer via icu_sortkey8_from_utf16 when iter->chain->sort is set. */
508 const char *icu_iter_get_sortkey(yaz_icu_iter_t iter)
510 return icu_buf_utf8_to_cstr(iter->sort8);
/* Return iter->display as a C string. The buffer is written by the
 * ICU_chain_step_type_display step in icu_iter_invoke. */
513 const char *icu_iter_get_display(yaz_icu_iter_t iter)
515 return icu_buf_utf8_to_cstr(iter->display);
/* Return the iterator's token counter. icu_iter_first resets it to 0;
 * NOTE(review): where it is incremented is not visible here. */
518 int icu_iter_get_token_number(yaz_icu_iter_t iter)
520 return iter->token_count;
523 int icu_chain_assign_cstr(struct icu_chain * chain, const char * src8cstr,
527 icu_iter_destroy(chain->iter);
528 chain->iter = icu_iter_create(chain);
529 icu_iter_first(chain->iter, src8cstr);
533 int icu_chain_next_token(struct icu_chain * chain, UErrorCode *status)
535 *status = U_ZERO_ERROR;
536 return icu_iter_next(chain->iter);
539 int icu_chain_token_number(struct icu_chain * chain)
541 if (chain && chain->iter)
542 return chain->iter->token_count;
546 const char * icu_chain_token_display(struct icu_chain * chain)
549 return icu_iter_get_display(chain->iter);
553 const char * icu_chain_token_norm(struct icu_chain * chain)
556 return icu_iter_get_norm(chain->iter);
560 const char * icu_chain_token_sortkey(struct icu_chain * chain)
563 return icu_iter_get_sortkey(chain->iter);
567 #endif /* YAZ_HAVE_ICU */
572 * c-file-style: "Stroustrup"
573 * indent-tabs-mode: nil
575 * vim: shiftwidth=4 tabstop=8 expandtab