First go at returning start+offset
author Adam Dickmeiss <adam@indexdata.dk>
Fri, 31 May 2013 21:05:12 +0000 (23:05 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Fri, 31 May 2013 21:05:12 +0000 (23:05 +0200)
Note, however, that the offset+size are counted in UChar (UTF-16) units, not in bytes of the original UTF-8 text.

include/yaz/icu.h
include/yaz/icu_I18N.h
src/icu_chain.c
src/icu_tokenizer.c
src/nmemsdup.c
util/yaz-icu.c

index 488071b..007e29b 100644 (file)
@@ -109,6 +109,14 @@ YAZ_EXPORT const char * icu_chain_token_norm(yaz_icu_chain_t chain);
 */
 YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain);
 
+/** \brief returns token start and length as they relate to original text
+    \param chain ICU chain
+    \param start offset in original text
+    \param len number of uchars in original text
+*/
+YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain,
+                                       size_t *start, size_t *len);
+
 /** \brief ICU tokenizer iterator type (opaque) */
 typedef struct icu_iter *yaz_icu_iter_t;
 
@@ -170,6 +178,14 @@ const char *icu_iter_get_display(yaz_icu_iter_t iter);
 YAZ_EXPORT
 int icu_iter_get_token_number(yaz_icu_iter_t iter);
 
+/** \brief returns ICU original token start (offset) and length
+    \param iter ICU tokenizer iterator
+    \param start offset of last token in original text
+    \param len length of last token in original text
+*/
+YAZ_EXPORT
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len);
+
 YAZ_END_CDECL
 
 #endif /* YAZ_ICU_H */
index b26cb60..d61c007 100644 (file)
@@ -139,7 +139,8 @@ int icu_tokenizer_attach(struct icu_tokenizer * tokenizer,
 
 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
                                  struct icu_buf_utf16 * tkn16,
-                                 UErrorCode *status);
+                                 UErrorCode *status,
+                                 size_t *start, size_t *len);
 
 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
 
index 2ac1960..de2e627 100644 (file)
@@ -368,6 +368,8 @@ struct icu_iter {
     struct icu_buf_utf8 *result;
     struct icu_buf_utf16 *input;
     int token_count;
+    size_t org_start;
+    size_t org_len;
     struct icu_chain_step *steps;
 };
 
@@ -423,7 +425,8 @@ struct icu_buf_utf16 *icu_iter_invoke(yaz_icu_iter_t iter,
             }
             dst = icu_buf_utf16_create(0);
             iter->status = U_ZERO_ERROR;
-            if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
+            if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
+                                          &iter->org_start, &iter->org_len))
             {
                 icu_buf_utf16_destroy(dst);
                 dst = 0;
@@ -499,6 +502,8 @@ void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
     /* fill and assign input string.. It will be 0 after
        first iteration */
     icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
+    iter->org_start = 0;
+    iter->org_len = iter->input->utf16_len;
 }
 
 void icu_iter_destroy(yaz_icu_iter_t iter)
@@ -564,6 +569,13 @@ int icu_iter_get_token_number(yaz_icu_iter_t iter)
     return iter->token_count;
 }
 
+
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
+{
+    *start = iter->org_start;
+    *len = iter->org_len;
+}
+
 int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
                           UErrorCode *status)
 {
@@ -608,6 +620,13 @@ const char *icu_chain_token_sortkey(struct icu_chain *chain)
     return 0;
 }
 
+void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
+{
+    if (chain->iter)
+        icu_iter_get_org_info(chain->iter, start, len);
+}
+
+
 #endif /* YAZ_HAVE_ICU */
 
 /*
index 67246ea..7e2fc3f 100644 (file)
@@ -155,7 +155,8 @@ int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
 
 int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
                                  struct icu_buf_utf16 *tkn16,
-                                 UErrorCode *status)
+                                 UErrorCode *status,
+                                 size_t *start, size_t *len)
 {
     int32_t tkn_start = 0;
     int32_t tkn_end = 0;
@@ -202,6 +203,9 @@ int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
     tokenizer->token_start = tkn_start;
     tokenizer->token_end = tkn_end;
 
+    *start = tkn_start;
+    *len = tkn_end - tkn_start;
+
     /* copying into token buffer if it exists */
     if (tkn16)
     {
index b0ea602..3126aac 100644 (file)
@@ -33,7 +33,7 @@ char *nmem_strdup_null(NMEM mem, const char *src)
 char *nmem_strdupn(NMEM mem, const char *src, size_t n)
 {
     char *dst = (char *) nmem_malloc(mem, n+1);
-    memcpy (dst, src, n);
+    memcpy(dst, src, n);
     dst[n] = '\0';
     return dst;
 }
index 048e45e..3e0c1d8 100644 (file)
@@ -473,7 +473,10 @@ static void process_text_file(struct config_t *p_config)
                 success = 0;
             else
             {
+                size_t start, len;
                 const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+                icu_chain_get_org_info(p_config->chain, &start, &len);
                 wrbuf_rewind(sw);
                 wrbuf_puts_escaped(sw, sortkey);
                 token_count++;
@@ -504,11 +507,13 @@ static void process_text_file(struct config_t *p_config)
                 }
                 else
                 {
-                    fprintf(p_config->outfile, "%lu %lu '%s' '%s'",
+                    fprintf(p_config->outfile, "%lu %lu '%s' '%s' %ld+%ld",
                             token_count,
                             line_count,
                             icu_chain_token_norm(p_config->chain),
-                            icu_chain_token_display(p_config->chain));
+                            icu_chain_token_display(p_config->chain),
+                            (long) start,
+                            (long) len);
                     if (p_config->sortoutput)
                     {
                         fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));