From 35c32d033bf5d7201cb72a68d88e118e1851dc40 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 31 Mar 2015 14:54:12 +0200 Subject: [PATCH] Extend get_org_info (snippets) to return original string YAZ-836 This adds two new functions, icu_chain_get_org_info2 and icu_iter_get_org_info2, which take a 4th parameter that receives a pointer to the original string. --- include/yaz/icu.h | 24 ++++++++++++++++++++++-- src/icu_chain.c | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/yaz/icu.h b/include/yaz/icu.h index 637996a..176e299 100644 --- a/include/yaz/icu.h +++ b/include/yaz/icu.h @@ -109,7 +109,7 @@ YAZ_EXPORT const char * icu_chain_token_norm(yaz_icu_chain_t chain); */ YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain); -/** \brief returns token as it relates to originl text +/** \brief returns token as it relates to original text (legacy) \param chain ICU chain \param start offset in original text \param len number of uchars in original text @@ -117,6 +117,16 @@ YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain); YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain, size_t *start, size_t *len); +/** \brief returns token as it relates to original text (2nd version) + \param chain ICU chain + \param start offset in original text + \param len number of uchars in original text + \param cstr if non-null, set to point to the original string (UTF-8) +*/ +YAZ_EXPORT void icu_chain_get_org_info2(yaz_icu_chain_t chain, + size_t *start, size_t *len, + const char **cstr); + /** \brief ICU tokenizer iterator type (opaque) */ typedef struct icu_iter *yaz_icu_iter_t; @@ -178,7 +188,7 @@ const char *icu_iter_get_display(yaz_icu_iter_t iter); YAZ_EXPORT int icu_iter_get_token_number(yaz_icu_iter_t iter); -/** \brief returns ICU original token start (offset) and length +/** \brief returns ICU original token start (offset) and length (legacy) \param iter ICU tokenizer iterator \param start 
offset of last token in original text \param len length of last token in original text @@ -186,6 +196,16 @@ int icu_iter_get_token_number(yaz_icu_iter_t iter); YAZ_EXPORT void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len); +/** \brief returns ICU original token start (offset) and length (2nd version) + \param iter ICU tokenizer iterator + \param start offset of last token in original text + \param len length of last token in original text + \param cstr if non-null, set to point to the original string (UTF-8) +*/ +YAZ_EXPORT +void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len, + const char **cstr); + YAZ_END_CDECL #endif /* YAZ_ICU_H */ diff --git a/src/icu_chain.c b/src/icu_chain.c index d0a2857..fa6c96a 100644 --- a/src/icu_chain.c +++ b/src/icu_chain.c @@ -366,6 +366,7 @@ struct icu_iter { struct icu_chain *chain; struct icu_buf_utf16 *last; struct icu_buf_utf16 *org; + struct icu_buf_utf8 *org8; UErrorCode status; struct icu_buf_utf8 *display; struct icu_buf_utf8 *sort8; @@ -426,8 +427,11 @@ struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter, struct icu_buf_utf16 *src = dst; icu_tokenizer_attach(step->u.tokenizer, src, &iter->status); - iter->utf8_base = iter->utf16_base = 0; - icu_buf_utf16_copy(iter->org, src); + if (step->previous) + { /* no need to copy if it's already the same */ + iter->utf8_base = iter->utf16_base = 0; + icu_buf_utf16_copy(iter->org, src); + } icu_buf_utf16_destroy(src); } dst = icu_buf_utf16_create(0); @@ -494,6 +498,7 @@ yaz_icu_iter_t icu_iter_create(struct icu_chain *chain) iter->sort8 = icu_buf_utf8_create(0); iter->result = icu_buf_utf8_create(0); iter->org = icu_buf_utf16_create(0); + iter->org8 = 0; iter->last = 0; /* no last returned string (yet) */ iter->steps = icu_chain_step_clone(chain->csteps); iter->token_count = 0; @@ -521,6 +526,7 @@ void icu_iter_destroy(yaz_icu_iter_t iter) icu_buf_utf8_destroy(iter->sort8); icu_buf_utf8_destroy(iter->result); icu_buf_utf16_destroy(iter->org); + 
icu_buf_utf8_destroy(iter->org8); icu_chain_step_destroy(iter->steps); xfree(iter); } @@ -569,7 +575,8 @@ int icu_iter_get_token_number(yaz_icu_iter_t iter) } -void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len) +void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len, + const char **cstr) { int32_t len1 = 0, len2 = 0; UErrorCode status = U_ZERO_ERROR; @@ -595,10 +602,23 @@ void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len) *len = len2 - len1; + if (cstr) + { + if (!iter->org8) + iter->org8 = icu_buf_utf8_create(0); + status = U_ZERO_ERROR; + icu_utf16_to_utf8(iter->org8, iter->org, &status); + *cstr = icu_buf_utf8_to_cstr(iter->org8); + } iter->utf8_base = *start; iter->utf16_base = iter->org_start; } +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len) +{ + icu_iter_get_org_info2(iter, start, len, 0); +} + int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr, UErrorCode *status) { @@ -649,6 +669,13 @@ void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len) icu_iter_get_org_info(chain->iter, start, len); } +void icu_chain_get_org_info2(struct icu_chain *chain, size_t *start, + size_t *len, const char **cstr) +{ + if (chain->iter) + icu_iter_get_org_info2(chain->iter, start, len, cstr); +} + #endif /* YAZ_HAVE_ICU */ -- 1.7.10.4