X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=include%2Fyaz%2Ficu.h;h=973cc5d8adaa67d85bac9371704f2ee925003fb9;hp=23126abcb1a566a4ecacd0d2ff2f907af9c0ac17;hb=ad88b93c8fbb00728acd0b49b4079167304ed58d;hpb=47db800079d3df8e8adfd93b466795d0803dabe8 diff --git a/include/yaz/icu.h b/include/yaz/icu.h index 23126ab..973cc5d 100644 --- a/include/yaz/icu.h +++ b/include/yaz/icu.h @@ -1,5 +1,5 @@ /* This file is part of the YAZ toolkit. - * Copyright (C) 1995-2009 Index Data. + * Copyright (C) 1995-2013 Index Data. * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,23 +48,23 @@ typedef struct icu_chain *yaz_icu_chain_t; YAZ_EXPORT void icu_chain_destroy(yaz_icu_chain_t chain); /** \brief constructs ICU chain from XML specification - \param \param xml_node icu_chain XML node - with attribute locale in it - \param \param sort 1 if ICU chain is to deal with sort keys; 0 otherwise - \param \param status May include ICU error code on failure + \param xml_node icu_chain XML node - with attribute locale in it + \param sort 1 if ICU chain is to deal with sort keys; 0 otherwise + \param status May include ICU error code on failure \returns chain ptr or NULL on failure in which case status may hold info */ YAZ_EXPORT yaz_icu_chain_t icu_chain_xml_config(const xmlNode *xml_node, int sort, - UErrorCode * status); + UErrorCode *status); /** \brief pass string to ICU for parsing/tokenization/etc \param chain ICU chain to be used for parsing - \param src8cstr input C string (\0-terminated) + \param src8cstr input C string (null-terminated) \param status may include ICU error on failure \retval 0 failure \retval 1 success */ YAZ_EXPORT int icu_chain_assign_cstr(yaz_icu_chain_t chain, - const char * src8cstr, + const char *src8cstr, UErrorCode *status); /** \brief returns one token (if any) @@ -109,6 +109,83 @@ YAZ_EXPORT const char 
* icu_chain_token_norm(yaz_icu_chain_t chain); */ YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain); +/** \brief returns token as it relates to original text + \param chain ICU chain + \param start offset in original text + \param len number of uchars in original text +*/ +YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain, + size_t *start, size_t *len); + +/** \brief ICU tokenizer iterator type (opaque) */ +typedef struct icu_iter *yaz_icu_iter_t; + +/** \brief create ICU tokenizer iterator from chain + \param chain ICU chain + \returns ICU iterator +*/ +YAZ_EXPORT +yaz_icu_iter_t icu_iter_create(struct icu_chain *chain); + +/** \brief starts iteration over string + \param iter ICU tokenizer iterator + \param src8cstr input string (0-terminated) + + Call icu_iter_next to iterate over each token. +*/ +YAZ_EXPORT +void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr); + +/** \brief iterates over one token + \param iter ICU tokenizer iterator + \retval 0 no more tokens (EOF) + \retval 1 got one token (use icu_iter_get..-functions) +*/ +YAZ_EXPORT +int icu_iter_next(yaz_icu_iter_t iter); + +/** \brief destroy ICU tokenizer iterator + \param iter ICU tokenizer iterator +*/ +YAZ_EXPORT +void icu_iter_destroy(yaz_icu_iter_t iter); + +/** \brief returns ICU normalized token + \param iter ICU tokenizer iterator + \returns string (0-terminated) +*/ +YAZ_EXPORT +const char *icu_iter_get_norm(yaz_icu_iter_t iter); + +/** \brief returns ICU sortkey string + \param iter ICU tokenizer iterator + \returns string (0-terminated) +*/ +YAZ_EXPORT +const char *icu_iter_get_sortkey(yaz_icu_iter_t iter); + +/** \brief returns ICU display string + \param iter ICU tokenizer iterator + \returns string (0-terminated) +*/ +YAZ_EXPORT +const char *icu_iter_get_display(yaz_icu_iter_t iter); + +/** \brief returns ICU token count for iterator + \param iter ICU tokenizer iterator + \returns token count (1, 2, 3...) 
+*/ +YAZ_EXPORT +int icu_iter_get_token_number(yaz_icu_iter_t iter); + +/** \brief returns ICU original token start (offset) and length + \param iter ICU tokenizer iterator + \param start offset of last token in original text + \param len length of last token in original text +*/ +YAZ_EXPORT +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len); + YAZ_END_CDECL #endif /* YAZ_ICU_H */