From: Adam Dickmeiss Date: Mon, 10 Jun 2013 13:36:32 +0000 (+0200) Subject: Merge branch 'master' into yaz_663 X-Git-Tag: v4.2.58~2 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=e8117f5e26e3eed1f7b48e1dcb2ce8c12a31fd52;hp=e68a157030efef7b7893e2d53d53b98cd310d328 Merge branch 'master' into yaz_663 --- diff --git a/include/yaz/icu.h b/include/yaz/icu.h index 488071b..007e29b 100644 --- a/include/yaz/icu.h +++ b/include/yaz/icu.h @@ -109,6 +109,14 @@ YAZ_EXPORT const char * icu_chain_token_norm(yaz_icu_chain_t chain); */ YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain); +/** \brief returns token as it relates to original text + \param chain ICU chain + \param start offset in original text + \param len number of uchars in original text +*/ +YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain, + size_t *start, size_t *len); + /** \brief ICU tokenizer iterator type (opaque) */ typedef struct icu_iter *yaz_icu_iter_t; @@ -170,6 +178,14 @@ const char *icu_iter_get_display(yaz_icu_iter_t iter); YAZ_EXPORT int icu_iter_get_token_number(yaz_icu_iter_t iter); +/** \brief returns ICU original token start (offset) and length + \param iter ICU tokenizer iterator + \param start offset of last token in original text + \param len length of last token in original text +*/ +YAZ_EXPORT +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len); + YAZ_END_CDECL #endif /* YAZ_ICU_H */ diff --git a/include/yaz/icu_I18N.h b/include/yaz/icu_I18N.h index b26cb60..d61c007 100644 --- a/include/yaz/icu_I18N.h +++ b/include/yaz/icu_I18N.h @@ -139,7 +139,8 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, struct icu_buf_utf16 * tkn16, - UErrorCode *status); + UErrorCode *status, + size_t *start, size_t *len); int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); diff --git a/src/icu_chain.c b/src/icu_chain.c index 
2ac1960..730edde 100644 --- a/src/icu_chain.c +++ b/src/icu_chain.c @@ -362,12 +362,14 @@ struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node, struct icu_iter { struct icu_chain *chain; struct icu_buf_utf16 *last; + struct icu_buf_utf16 *org; UErrorCode status; struct icu_buf_utf8 *display; struct icu_buf_utf8 *sort8; struct icu_buf_utf8 *result; - struct icu_buf_utf16 *input; int token_count; + size_t org_start; + size_t org_len; struct icu_chain_step *steps; }; @@ -423,7 +425,8 @@ struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter, } dst = icu_buf_utf16_create(0); iter->status = U_ZERO_ERROR; - if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status)) + if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status, + &iter->org_start, &iter->org_len)) { icu_buf_utf16_destroy(dst); dst = 0; @@ -483,22 +486,23 @@ yaz_icu_iter_t icu_iter_create(struct icu_chain *chain) iter->display = icu_buf_utf8_create(0); iter->sort8 = icu_buf_utf8_create(0); iter->result = icu_buf_utf8_create(0); + iter->org = icu_buf_utf16_create(0); iter->last = 0; /* no last returned string (yet) */ iter->steps = icu_chain_step_clone(chain->csteps); - iter->input = 0; + iter->token_count = 0; return iter; } void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr) { - if (iter->input) - icu_buf_utf16_destroy(iter->input); - iter->input = icu_buf_utf16_create(0); + struct icu_buf_utf16 *src = icu_buf_utf16_create(0); + icu_utf16_from_utf8_cstr(src, src8cstr, &iter->status); + icu_buf_utf16_copy(iter->org, src); iter->token_count = 0; - /* fill and assign input string.. 
It will be 0 after - first iteration */ - icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status); + iter->org_start = 0; + iter->org_len = src->utf16_len; + iter->last = icu_iter_invoke(iter, iter->steps, src); } void icu_iter_destroy(yaz_icu_iter_t iter) @@ -508,8 +512,7 @@ void icu_iter_destroy(yaz_icu_iter_t iter) icu_buf_utf8_destroy(iter->display); icu_buf_utf8_destroy(iter->sort8); icu_buf_utf8_destroy(iter->result); - if (iter->input) - icu_buf_utf16_destroy(iter->input); + icu_buf_utf16_destroy(iter->org); icu_chain_step_destroy(iter->steps); xfree(iter); } @@ -517,20 +520,13 @@ void icu_iter_destroy(yaz_icu_iter_t iter) int icu_iter_next(yaz_icu_iter_t iter) { - if (!iter->input && iter->last == 0) + if (iter->token_count && iter->last) + iter->last = icu_iter_invoke(iter, iter->steps, 0); + if (!iter->last) return 0; else { - /* on first call, iter->input is the input string. Thereafter: 0. */ - assert(iter->steps || !iter->chain->csteps); - iter->last = icu_iter_invoke(iter, iter->steps, iter->input); - iter->input = 0; - - if (!iter->last) - return 0; - iter->token_count++; - if (iter->chain->sort) { icu_sortkey8_from_utf16(iter->chain->coll, @@ -564,6 +560,30 @@ int icu_iter_get_token_number(yaz_icu_iter_t iter) return iter->token_count; } + +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len) +{ + /* save full length of org since we're gonna cut it */ + int32_t save_len = iter->org->utf16_len; + + struct icu_buf_utf8 *tmp = icu_buf_utf8_create(0); + UErrorCode status; + + iter->org->utf16_len = iter->org_start; + icu_utf16_to_utf8(tmp, iter->org, &status); + if (U_SUCCESS(status)) + *start = tmp->utf8_len; + else + *start = 0; + iter->org->utf16_len = iter->org_start + iter->org_len; + icu_utf16_to_utf8(tmp, iter->org, &status); + if (U_SUCCESS(status)) + *len = tmp->utf8_len - *start; + else + *len = 0; + iter->org->utf16_len = save_len; +} + int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr, 
UErrorCode *status) { @@ -608,6 +628,13 @@ const char *icu_chain_token_sortkey(struct icu_chain *chain) return 0; } +void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len) +{ + if (chain->iter) + icu_iter_get_org_info(chain->iter, start, len); +} + + #endif /* YAZ_HAVE_ICU */ /* diff --git a/src/icu_tokenizer.c b/src/icu_tokenizer.c index 67246ea..7e2fc3f 100644 --- a/src/icu_tokenizer.c +++ b/src/icu_tokenizer.c @@ -155,7 +155,8 @@ int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *tkn16, - UErrorCode *status) + UErrorCode *status, + size_t *start, size_t *len) { int32_t tkn_start = 0; int32_t tkn_end = 0; @@ -202,6 +203,9 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, tokenizer->token_start = tkn_start; tokenizer->token_end = tkn_end; + *start = tkn_start; + *len = tkn_end - tkn_start; + /* copying into token buffer if it exists */ if (tkn16) { diff --git a/src/icu_utf16.c b/src/icu_utf16.c index 3036d0a..894c97d 100644 --- a/src/icu_utf16.c +++ b/src/icu_utf16.c @@ -46,21 +46,17 @@ struct icu_buf_utf16 *icu_buf_utf16_create(size_t capacity) struct icu_buf_utf16 *icu_buf_utf16_clear(struct icu_buf_utf16 *buf16) { - if (buf16) - { - if (buf16->utf16) - buf16->utf16[0] = (UChar) 0; - buf16->utf16_len = 0; - } + assert(buf16); + if (buf16->utf16) + buf16->utf16[0] = (UChar) 0; + buf16->utf16_len = 0; return buf16; } struct icu_buf_utf16 *icu_buf_utf16_resize(struct icu_buf_utf16 *buf16, size_t capacity) { - if (!buf16) - return 0; - + assert(buf16); if (capacity > 0) { if (0 == buf16->utf16) @@ -68,13 +64,8 @@ struct icu_buf_utf16 *icu_buf_utf16_resize(struct icu_buf_utf16 *buf16, else buf16->utf16 = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity); + buf16->utf16_cap = capacity; } - else - { - xfree(buf16->utf16); - buf16->utf16 = 0; - } - buf16->utf16_cap = capacity; return buf16; } diff --git a/src/icu_utf8.c 
b/src/icu_utf8.c index ee670a1..443adc8 100644 --- a/src/icu_utf8.c +++ b/src/icu_utf8.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* some more string fcns*/ #include /* char names */ @@ -45,21 +46,17 @@ struct icu_buf_utf8 *icu_buf_utf8_create(size_t capacity) struct icu_buf_utf8 *icu_buf_utf8_clear(struct icu_buf_utf8 *buf8) { - if (buf8) - { - if (buf8->utf8) - buf8->utf8[0] = (uint8_t) 0; - buf8->utf8_len = 0; - } + assert(buf8); + if (buf8->utf8) + buf8->utf8[0] = (uint8_t) 0; + buf8->utf8_len = 0; return buf8; } struct icu_buf_utf8 *icu_buf_utf8_resize(struct icu_buf_utf8 *buf8, size_t capacity) { - if (!buf8) - return 0; - + assert(buf8); if (capacity > 0) { if (0 == buf8->utf8) @@ -70,19 +67,13 @@ struct icu_buf_utf8 *icu_buf_utf8_resize(struct icu_buf_utf8 *buf8, buf8->utf8_cap = capacity; } - else - { - xfree(buf8->utf8); - buf8->utf8 = 0; - buf8->utf8_cap = 0; - } - return buf8; } const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8) { - if (!src8 || src8->utf8_len == 0) + assert(src8); + if (src8->utf8_len == 0) return ""; if (src8->utf8_len == src8->utf8_cap) diff --git a/src/nmemsdup.c b/src/nmemsdup.c index b0ea602..3126aac 100644 --- a/src/nmemsdup.c +++ b/src/nmemsdup.c @@ -33,7 +33,7 @@ char *nmem_strdup_null(NMEM mem, const char *src) char *nmem_strdupn(NMEM mem, const char *src, size_t n) { char *dst = (char *) nmem_malloc(mem, n+1); - memcpy (dst, src, n); + memcpy(dst, src, n); dst[n] = '\0'; return dst; } diff --git a/test/test_icu.c b/test/test_icu.c index d5420f3..cf9e4e8 100644 --- a/test/test_icu.c +++ b/test/test_icu.c @@ -368,6 +368,7 @@ static int test_icu_tokenizer(const char *locale, char action, struct icu_buf_utf16 *tkn16 = icu_buf_utf16_create(0); struct icu_buf_utf8 *tkn8 = icu_buf_utf8_create(0); struct icu_tokenizer *tokenizer = 0; + size_t org_start, org_len; /* transforming to UTF16 */ icu_utf16_from_utf8_cstr(src16, src8cstr, &status); @@ -383,7 +384,8 @@ static int test_icu_tokenizer(const char 
*locale, char action, icu_check_status(status); /* perform work on tokens */ - while (icu_tokenizer_next_token(tokenizer, tkn16, &status)) + while (icu_tokenizer_next_token(tokenizer, tkn16, &status, + &org_start, &org_len)) { icu_check_status(status); diff --git a/util/yaz-icu.c b/util/yaz-icu.c index 048e45e..ceff120 100644 --- a/util/yaz-icu.c +++ b/util/yaz-icu.c @@ -33,6 +33,7 @@ struct config_t { char print[1024]; int xmloutput; int sortoutput; + int org_output; yaz_icu_chain_t chain; FILE * infile; FILE * outfile; @@ -45,6 +46,7 @@ void print_option_error(const struct config_t *p_config) " -c file XML configuration\n" " -p a|c|l|t Print ICU info \n" " -s Show sort normalization key\n" + " -o Show org positions\n" " -x XML output instread of text\n" "\n" "Examples:\n" @@ -77,10 +79,11 @@ void read_params(int argc, char **argv, struct config_t *p_config) p_config->chain = 0; p_config->infile = 0; p_config->outfile = stdout; + p_config->org_output = 0; /* set up command line parameters */ - while ((ret = options("c:p:xs", argv, argc, &arg)) != -2) + while ((ret = options("c:op:sx", argv, argc, &arg)) != -2) { switch (ret) { @@ -96,6 +99,9 @@ void read_params(int argc, char **argv, struct config_t *p_config) case 'x': p_config->xmloutput = 1; break; + case 'o': + p_config->org_output = 1; + break; case 0: if (p_config->infile) { @@ -473,7 +479,10 @@ static void process_text_file(struct config_t *p_config) success = 0; else { + size_t start, len; const char *sortkey = icu_chain_token_sortkey(p_config->chain); + + icu_chain_get_org_info(p_config->chain, &start, &len); wrbuf_rewind(sw); wrbuf_puts_escaped(sw, sortkey); token_count++; @@ -513,6 +522,11 @@ static void process_text_file(struct config_t *p_config) { fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw)); } + if (p_config->org_output) + { + fprintf(p_config->outfile, " %ld+%ld", + (long) start, (long) len); + } fprintf(p_config->outfile, "\n"); } }