Merge branch 'master' into yaz_663
author Adam Dickmeiss <adam@indexdata.dk>
Mon, 10 Jun 2013 13:36:32 +0000 (15:36 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Mon, 10 Jun 2013 13:36:32 +0000 (15:36 +0200)
include/yaz/icu.h
include/yaz/icu_I18N.h
src/icu_chain.c
src/icu_tokenizer.c
src/icu_utf16.c
src/icu_utf8.c
src/nmemsdup.c
test/test_icu.c
util/yaz-icu.c

index 488071b..007e29b 100644 (file)
@@ -109,6 +109,14 @@ YAZ_EXPORT const char * icu_chain_token_norm(yaz_icu_chain_t chain);
 */
 YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain);
 
+/** \brief returns token as it relates to original text
+    \param chain ICU chain
+    \param start offset of last token in original text
+    \param len length of last token in original text (as UTF-8 length)
+*/
+YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain,
+                                       size_t *start, size_t *len);
+
 /** \brief ICU tokenizer iterator type (opaque) */
 typedef struct icu_iter *yaz_icu_iter_t;
 
@@ -170,6 +178,14 @@ const char *icu_iter_get_display(yaz_icu_iter_t iter);
 YAZ_EXPORT
 int icu_iter_get_token_number(yaz_icu_iter_t iter);
 
+/** \brief returns ICU original token start (offset) and length
+    \param iter ICU tokenizer iterator
+    \param start offset of last token in original text
+    \param len length of last token in original text
+*/
+YAZ_EXPORT
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len);
+
 YAZ_END_CDECL
 
 #endif /* YAZ_ICU_H */
index b26cb60..d61c007 100644 (file)
@@ -139,7 +139,8 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 
 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
                                  struct icu_buf_utf16 * tkn16,
-                                 UErrorCode *status);
+                                 UErrorCode *status,
+                                 size_t *start, size_t *len);
 
 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
 
index 2ac1960..730edde 100644 (file)
@@ -362,12 +362,14 @@ struct icu_chain *icu_chain_xml_config(const xmlNode *xml_node,
 struct icu_iter {
     struct icu_chain *chain;
     struct icu_buf_utf16 *last;
+    struct icu_buf_utf16 *org;
     UErrorCode status;
     struct icu_buf_utf8 *display;
     struct icu_buf_utf8 *sort8;
     struct icu_buf_utf8 *result;
-    struct icu_buf_utf16 *input;
     int token_count;
+    size_t org_start;
+    size_t org_len;
     struct icu_chain_step *steps;
 };
 
@@ -423,7 +425,8 @@ struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
             }
             dst = icu_buf_utf16_create(0);
             iter->status = U_ZERO_ERROR;
-            if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
+            if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
+                                          &iter->org_start, &iter->org_len))
             {
                 icu_buf_utf16_destroy(dst);
                 dst = 0;
@@ -483,22 +486,23 @@ yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
     iter->display = icu_buf_utf8_create(0);
     iter->sort8 = icu_buf_utf8_create(0);
     iter->result = icu_buf_utf8_create(0);
+    iter->org = icu_buf_utf16_create(0);
     iter->last = 0; /* no last returned string (yet) */
     iter->steps = icu_chain_step_clone(chain->csteps);
-    iter->input = 0;
+    iter->token_count = 0;
 
     return iter;
 }
 
 void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
 {
-    if (iter->input)
-        icu_buf_utf16_destroy(iter->input);
-    iter->input = icu_buf_utf16_create(0);
+    struct icu_buf_utf16 *src = icu_buf_utf16_create(0);
+    icu_utf16_from_utf8_cstr(src, src8cstr, &iter->status);
+    icu_buf_utf16_copy(iter->org, src);
     iter->token_count = 0;
-    /* fill and assign input string.. It will be 0 after
-       first iteration */
-    icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
+    iter->org_start = 0;
+    iter->org_len = src->utf16_len;
+    iter->last = icu_iter_invoke(iter, iter->steps, src);
 }
 
 void icu_iter_destroy(yaz_icu_iter_t iter)
@@ -508,8 +512,7 @@ void icu_iter_destroy(yaz_icu_iter_t iter)
         icu_buf_utf8_destroy(iter->display);
         icu_buf_utf8_destroy(iter->sort8);
         icu_buf_utf8_destroy(iter->result);
-        if (iter->input)
-            icu_buf_utf16_destroy(iter->input);
+        icu_buf_utf16_destroy(iter->org);
         icu_chain_step_destroy(iter->steps);
         xfree(iter);
     }
@@ -517,20 +520,13 @@ void icu_iter_destroy(yaz_icu_iter_t iter)
 
 int icu_iter_next(yaz_icu_iter_t iter)
 {
-    if (!iter->input && iter->last == 0)
+    if (iter->token_count && iter->last)
+        iter->last = icu_iter_invoke(iter, iter->steps, 0);
+    if (!iter->last)
         return 0;
     else
     {
-        /* on first call, iter->input is the input string. Thereafter: 0. */
-        assert(iter->steps || !iter->chain->csteps);
-        iter->last = icu_iter_invoke(iter, iter->steps, iter->input);
-        iter->input = 0;
-
-        if (!iter->last)
-            return 0;
-
         iter->token_count++;
-
         if (iter->chain->sort)
         {
             icu_sortkey8_from_utf16(iter->chain->coll,
@@ -564,6 +560,30 @@ int icu_iter_get_token_number(yaz_icu_iter_t iter)
     return iter->token_count;
 }
 
+
+/* Report the last token's start/len in the original text, in UTF-8 units */
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
+{
+    /* save full length of org: it is shortened temporarily below */
+    int32_t save_len = iter->org->utf16_len;
+    struct icu_buf_utf8 *tmp = icu_buf_utf8_create(0);
+    /* ICU requires the status to be cleared before the first call */
+    UErrorCode status = U_ZERO_ERROR;
+
+    /* UTF-8 length of the text before the token is the start offset */
+    iter->org->utf16_len = iter->org_start;
+    icu_utf16_to_utf8(tmp, iter->org, &status);
+    *start = U_SUCCESS(status) ? tmp->utf8_len : 0;
+
+    /* UTF-8 length up to the token's end, less start, is the length */
+    iter->org->utf16_len = iter->org_start + iter->org_len;
+    icu_utf16_to_utf8(tmp, iter->org, &status);
+    *len = U_SUCCESS(status) ? tmp->utf8_len - *start : 0;
+
+    iter->org->utf16_len = save_len; /* restore the untruncated buffer */
+    icu_buf_utf8_destroy(tmp);       /* tmp was previously leaked */
+}
+
 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
                           UErrorCode *status)
 {
@@ -608,6 +628,13 @@ const char *icu_chain_token_sortkey(struct icu_chain *chain)
     return 0;
 }
 
+/* Org start/len of the chain's last token; both are 0 without an iterator */
+void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
+{
+    *start = *len = 0;
+    if (chain->iter)
+        icu_iter_get_org_info(chain->iter, start, len);
+}
 #endif /* YAZ_HAVE_ICU */
 
 /*
index 67246ea..7e2fc3f 100644 (file)
@@ -155,7 +155,8 @@ int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
 
 int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
                                  struct icu_buf_utf16 *tkn16,
-                                 UErrorCode *status)
+                                 UErrorCode *status,
+                                 size_t *start, size_t *len)
 {
     int32_t tkn_start = 0;
     int32_t tkn_end = 0;
@@ -202,6 +203,9 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
     tokenizer->token_start = tkn_start;
     tokenizer->token_end = tkn_end;
 
+    *start = tkn_start;
+    *len = tkn_end - tkn_start;
+
     /* copying into token buffer if it exists */
     if (tkn16)
     {
index 3036d0a..894c97d 100644 (file)
@@ -46,21 +46,17 @@ struct icu_buf_utf16 *icu_buf_utf16_create(size_t capacity)
 
 struct icu_buf_utf16 *icu_buf_utf16_clear(struct icu_buf_utf16 *buf16)
 {
-    if (buf16)
-    {
-        if (buf16->utf16)
-            buf16->utf16[0] = (UChar) 0;
-        buf16->utf16_len = 0;
-    }
+    assert(buf16);
+    if (buf16->utf16)
+        buf16->utf16[0] = (UChar) 0;
+    buf16->utf16_len = 0;
     return buf16;
 }
 
 struct icu_buf_utf16 *icu_buf_utf16_resize(struct icu_buf_utf16 *buf16,
                                            size_t capacity)
 {
-    if (!buf16)
-        return 0;
-
+    assert(buf16);
     if (capacity > 0)
     {
         if (0 == buf16->utf16)
@@ -68,13 +64,8 @@ struct icu_buf_utf16 *icu_buf_utf16_resize(struct icu_buf_utf16 *buf16,
         else
             buf16->utf16
                 = (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity);
+        buf16->utf16_cap = capacity;
     }
-    else
-    {
-        xfree(buf16->utf16);
-        buf16->utf16 = 0;
-    }
-    buf16->utf16_cap = capacity;
     return buf16;
 }
 
index ee670a1..443adc8 100644 (file)
@@ -22,6 +22,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <assert.h>
 
 #include <unicode/ustring.h>  /* some more string fcns*/
 #include <unicode/uchar.h>    /* char names           */
@@ -45,21 +46,17 @@ struct icu_buf_utf8 *icu_buf_utf8_create(size_t capacity)
 
 struct icu_buf_utf8 *icu_buf_utf8_clear(struct icu_buf_utf8 *buf8)
 {
-    if (buf8)
-    {
-        if (buf8->utf8)
-            buf8->utf8[0] = (uint8_t) 0;
-        buf8->utf8_len = 0;
-    }
+    assert(buf8);
+    if (buf8->utf8)
+        buf8->utf8[0] = (uint8_t) 0;
+    buf8->utf8_len = 0;
     return buf8;
 }
 
 struct icu_buf_utf8 *icu_buf_utf8_resize(struct icu_buf_utf8 *buf8,
                                          size_t capacity)
 {
-    if (!buf8)
-        return 0;
-
+    assert(buf8);
     if (capacity > 0)
     {
         if (0 == buf8->utf8)
@@ -70,19 +67,13 @@ struct icu_buf_utf8 *icu_buf_utf8_resize(struct icu_buf_utf8 *buf8,
 
         buf8->utf8_cap = capacity;
     }
-    else
-    {
-        xfree(buf8->utf8);
-        buf8->utf8 = 0;
-        buf8->utf8_cap = 0;
-    }
-
     return buf8;
 }
 
 const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
 {
-    if (!src8 || src8->utf8_len == 0)
+    assert(src8);
+    if (src8->utf8_len == 0)
         return "";
 
     if (src8->utf8_len == src8->utf8_cap)
index b0ea602..3126aac 100644 (file)
@@ -33,7 +33,7 @@ char *nmem_strdup_null(NMEM mem, const char *src)
 char *nmem_strdupn(NMEM mem, const char *src, size_t n)
 {
     char *dst = (char *) nmem_malloc(mem, n+1);
-    memcpy (dst, src, n);
+    memcpy(dst, src, n);
     dst[n] = '\0';
     return dst;
 }
index d5420f3..cf9e4e8 100644 (file)
@@ -368,6 +368,7 @@ static int test_icu_tokenizer(const char *locale, char action,
     struct icu_buf_utf16 *tkn16 = icu_buf_utf16_create(0);
     struct icu_buf_utf8 *tkn8 = icu_buf_utf8_create(0);
     struct icu_tokenizer *tokenizer = 0;
+    size_t org_start, org_len;
 
     /* transforming to UTF16 */
     icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
@@ -383,7 +384,8 @@ static int test_icu_tokenizer(const char *locale, char action,
     icu_check_status(status);
 
     /* perform work on tokens */
-    while (icu_tokenizer_next_token(tokenizer, tkn16, &status))
+    while (icu_tokenizer_next_token(tokenizer, tkn16, &status,
+                                    &org_start, &org_len))
     {
         icu_check_status(status);
 
index 048e45e..ceff120 100644 (file)
@@ -33,6 +33,7 @@ struct config_t {
     char print[1024];
     int xmloutput;
     int sortoutput;
+    int org_output;
     yaz_icu_chain_t chain;
     FILE * infile;
     FILE * outfile;
@@ -45,6 +46,7 @@ void print_option_error(const struct config_t *p_config)
             "   -c file         XML configuration\n"
             "   -p a|c|l|t      Print ICU info \n"
             "   -s              Show sort normalization key\n"
+            "   -o              Show org positions\n"
             "   -x              XML output instread of text\n"
             "\n"
             "Examples:\n"
@@ -77,10 +79,11 @@ void read_params(int argc, char **argv, struct config_t *p_config)
     p_config->chain = 0;
     p_config->infile = 0;
     p_config->outfile = stdout;
+    p_config->org_output = 0;
 
     /* set up command line parameters */
 
-    while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
+    while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
     {
         switch (ret)
         {
@@ -96,6 +99,9 @@ void read_params(int argc, char **argv, struct config_t *p_config)
         case 'x':
             p_config->xmloutput = 1;
             break;
+        case 'o':
+            p_config->org_output = 1;
+            break;
         case 0:
             if (p_config->infile)
             {
@@ -473,7 +479,10 @@ static void process_text_file(struct config_t *p_config)
                 success = 0;
             else
             {
+                size_t start, len;
                 const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+                icu_chain_get_org_info(p_config->chain, &start, &len);
                 wrbuf_rewind(sw);
                 wrbuf_puts_escaped(sw, sortkey);
                 token_count++;
@@ -513,6 +522,11 @@ static void process_text_file(struct config_t *p_config)
                     {
                         fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
                     }
+                    if (p_config->org_output)
+                    {
+                        fprintf(p_config->outfile, " %ld+%ld",
+                                (long) start, (long) len);
+                    }
                     fprintf(p_config->outfile, "\n");
                 }
             }