*/
YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain);
+/** \brief returns token as it relates to original text
+ \param chain ICU chain
+ \param start offset in original text
+ \param len length of token in original (UTF-8) text
+*/
+YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain,
+ size_t *start, size_t *len);
+
/** \brief ICU tokenizer iterator type (opaque) */
typedef struct icu_iter *yaz_icu_iter_t;
YAZ_EXPORT
int icu_iter_get_token_number(yaz_icu_iter_t iter);
+/** \brief returns ICU original token start (offset) and length
+ \param iter ICU tokenizer iterator
+ \param start offset of last token in original text
+ \param len length of last token in original text
+*/
+YAZ_EXPORT
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len);
+
YAZ_END_CDECL
#endif /* YAZ_ICU_H */
int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
struct icu_buf_utf16 * tkn16,
- UErrorCode *status);
+ UErrorCode *status,
+ size_t *start, size_t *len);
int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
struct icu_iter {
struct icu_chain *chain;
struct icu_buf_utf16 *last;
+ struct icu_buf_utf16 *org;
UErrorCode status;
struct icu_buf_utf8 *display;
struct icu_buf_utf8 *sort8;
struct icu_buf_utf8 *result;
- struct icu_buf_utf16 *input;
int token_count;
+ size_t org_start;
+ size_t org_len;
struct icu_chain_step *steps;
};
}
dst = icu_buf_utf16_create(0);
iter->status = U_ZERO_ERROR;
- if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
+ if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
+ &iter->org_start, &iter->org_len))
{
icu_buf_utf16_destroy(dst);
dst = 0;
iter->display = icu_buf_utf8_create(0);
iter->sort8 = icu_buf_utf8_create(0);
iter->result = icu_buf_utf8_create(0);
+ iter->org = icu_buf_utf16_create(0);
iter->last = 0; /* no last returned string (yet) */
iter->steps = icu_chain_step_clone(chain->csteps);
- iter->input = 0;
+ iter->token_count = 0;
return iter;
}
void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
{
- if (iter->input)
- icu_buf_utf16_destroy(iter->input);
- iter->input = icu_buf_utf16_create(0);
+ struct icu_buf_utf16 *src = icu_buf_utf16_create(0);
+ icu_utf16_from_utf8_cstr(src, src8cstr, &iter->status);
+ icu_buf_utf16_copy(iter->org, src);
iter->token_count = 0;
- /* fill and assign input string.. It will be 0 after
- first iteration */
- icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
+ iter->org_start = 0;
+ iter->org_len = src->utf16_len;
+ iter->last = icu_iter_invoke(iter, iter->steps, src);
}
void icu_iter_destroy(yaz_icu_iter_t iter)
icu_buf_utf8_destroy(iter->display);
icu_buf_utf8_destroy(iter->sort8);
icu_buf_utf8_destroy(iter->result);
- if (iter->input)
- icu_buf_utf16_destroy(iter->input);
+ icu_buf_utf16_destroy(iter->org);
icu_chain_step_destroy(iter->steps);
xfree(iter);
}
int icu_iter_next(yaz_icu_iter_t iter)
{
- if (!iter->input && iter->last == 0)
+ if (iter->token_count && iter->last)
+ iter->last = icu_iter_invoke(iter, iter->steps, 0);
+ if (!iter->last)
return 0;
else
{
- /* on first call, iter->input is the input string. Thereafter: 0. */
- assert(iter->steps || !iter->chain->csteps);
- iter->last = icu_iter_invoke(iter, iter->steps, iter->input);
- iter->input = 0;
-
- if (!iter->last)
- return 0;
-
iter->token_count++;
-
if (iter->chain->sort)
{
icu_sortkey8_from_utf16(iter->chain->coll,
return iter->token_count;
}
+
+/* Reports where the most recently tokenized piece sits in the original
+ * input handed to icu_iter_first(), measured in UTF-8 units.
+ *
+ * iter->org keeps the input as UTF-16 and org_start/org_len are UTF-16
+ * positions.  They are translated by converting two prefixes of org to
+ * UTF-8 (up to the token start, then up to the token end) and measuring
+ * the converted lengths.
+ *
+ * iter   ICU tokenizer iterator
+ * start  receives the token offset; 0 if the conversion fails
+ * len    receives the token length; 0 if the conversion fails
+ */
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
+{
+    /* remember the full length of org; it is shortened temporarily below */
+    int32_t save_len = iter->org->utf16_len;
+
+    struct icu_buf_utf8 *tmp = icu_buf_utf8_create(0);
+    /* must start out cleared: it is handed to the converter and then
+       tested with U_SUCCESS, so an indeterminate value is not acceptable */
+    UErrorCode status = U_ZERO_ERROR;
+
+    /* prefix ending where the token begins -> start offset */
+    iter->org->utf16_len = iter->org_start;
+    icu_utf16_to_utf8(tmp, iter->org, &status);
+    if (U_SUCCESS(status))
+        *start = tmp->utf8_len;
+    else
+        *start = 0;
+
+    /* prefix ending where the token ends -> length is the difference */
+    iter->org->utf16_len = iter->org_start + iter->org_len;
+    icu_utf16_to_utf8(tmp, iter->org, &status);
+    if (U_SUCCESS(status))
+        *len = tmp->utf8_len - *start;
+    else
+        *len = 0;
+
+    /* restore org and release the scratch buffer (it was leaked before) */
+    iter->org->utf16_len = save_len;
+    icu_buf_utf8_destroy(tmp);
+}
+
int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
UErrorCode *status)
{
return 0;
}
+/* Reports the original-text position of the chain's most recent token.
+ *
+ * Delegates to icu_iter_get_org_info() on the chain's iterator.  When the
+ * chain has no iterator, *start and *len are set to 0 so that callers
+ * passing uninitialized variables never read indeterminate values.
+ *
+ * chain  ICU chain
+ * start  receives the token offset in the original text
+ * len    receives the token length in the original text
+ */
+void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
+{
+    if (chain->iter)
+        icu_iter_get_org_info(chain->iter, start, len);
+    else
+    {
+        *start = 0;
+        *len = 0;
+    }
+}
+
+
#endif /* YAZ_HAVE_ICU */
/*
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
struct icu_buf_utf16 *tkn16,
- UErrorCode *status)
+ UErrorCode *status,
+ size_t *start, size_t *len)
{
int32_t tkn_start = 0;
int32_t tkn_end = 0;
tokenizer->token_start = tkn_start;
tokenizer->token_end = tkn_end;
+ *start = tkn_start;
+ *len = tkn_end - tkn_start;
+
/* copying into token buffer if it exists */
if (tkn16)
{
struct icu_buf_utf16 *icu_buf_utf16_clear(struct icu_buf_utf16 *buf16)
{
- if (buf16)
- {
- if (buf16->utf16)
- buf16->utf16[0] = (UChar) 0;
- buf16->utf16_len = 0;
- }
+ assert(buf16);
+ if (buf16->utf16)
+ buf16->utf16[0] = (UChar) 0;
+ buf16->utf16_len = 0;
return buf16;
}
struct icu_buf_utf16 *icu_buf_utf16_resize(struct icu_buf_utf16 *buf16,
size_t capacity)
{
- if (!buf16)
- return 0;
-
+ assert(buf16);
if (capacity > 0)
{
if (0 == buf16->utf16)
else
buf16->utf16
= (UChar *) xrealloc(buf16->utf16, sizeof(UChar) * capacity);
+ buf16->utf16_cap = capacity;
}
- else
- {
- xfree(buf16->utf16);
- buf16->utf16 = 0;
- }
- buf16->utf16_cap = capacity;
return buf16;
}
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
+#include <assert.h>
#include <unicode/ustring.h> /* some more string fcns*/
#include <unicode/uchar.h> /* char names */
struct icu_buf_utf8 *icu_buf_utf8_clear(struct icu_buf_utf8 *buf8)
{
- if (buf8)
- {
- if (buf8->utf8)
- buf8->utf8[0] = (uint8_t) 0;
- buf8->utf8_len = 0;
- }
+ assert(buf8);
+ if (buf8->utf8)
+ buf8->utf8[0] = (uint8_t) 0;
+ buf8->utf8_len = 0;
return buf8;
}
struct icu_buf_utf8 *icu_buf_utf8_resize(struct icu_buf_utf8 *buf8,
size_t capacity)
{
- if (!buf8)
- return 0;
-
+ assert(buf8);
if (capacity > 0)
{
if (0 == buf8->utf8)
buf8->utf8_cap = capacity;
}
- else
- {
- xfree(buf8->utf8);
- buf8->utf8 = 0;
- buf8->utf8_cap = 0;
- }
-
return buf8;
}
const char *icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
{
- if (!src8 || src8->utf8_len == 0)
+ assert(src8);
+ if (src8->utf8_len == 0)
return "";
if (src8->utf8_len == src8->utf8_cap)
char *nmem_strdupn(NMEM mem, const char *src, size_t n)
{
char *dst = (char *) nmem_malloc(mem, n+1);
- memcpy (dst, src, n);
+ memcpy(dst, src, n);
dst[n] = '\0';
return dst;
}
struct icu_buf_utf16 *tkn16 = icu_buf_utf16_create(0);
struct icu_buf_utf8 *tkn8 = icu_buf_utf8_create(0);
struct icu_tokenizer *tokenizer = 0;
+ size_t org_start, org_len;
/* transforming to UTF16 */
icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
icu_check_status(status);
/* perform work on tokens */
- while (icu_tokenizer_next_token(tokenizer, tkn16, &status))
+ while (icu_tokenizer_next_token(tokenizer, tkn16, &status,
+ &org_start, &org_len))
{
icu_check_status(status);
char print[1024];
int xmloutput;
int sortoutput;
+ int org_output;
yaz_icu_chain_t chain;
FILE * infile;
FILE * outfile;
" -c file XML configuration\n"
" -p a|c|l|t Print ICU info \n"
" -s Show sort normalization key\n"
+ " -o Show org positions\n"
" -x XML output instread of text\n"
"\n"
"Examples:\n"
p_config->chain = 0;
p_config->infile = 0;
p_config->outfile = stdout;
+ p_config->org_output = 0;
/* set up command line parameters */
- while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
+ while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
{
switch (ret)
{
case 'x':
p_config->xmloutput = 1;
break;
+ case 'o':
+ p_config->org_output = 1;
+ break;
case 0:
if (p_config->infile)
{
success = 0;
else
{
+ size_t start, len;
const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+ icu_chain_get_org_info(p_config->chain, &start, &len);
wrbuf_rewind(sw);
wrbuf_puts_escaped(sw, sortkey);
token_count++;
{
fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
}
+ if (p_config->org_output)
+ {
+ fprintf(p_config->outfile, " %ld+%ld",
+ (long) start, (long) len);
+ }
fprintf(p_config->outfile, "\n");
}
}