*/
YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain);
+/** \brief returns position of the most recent token within the original text
+ \param chain ICU chain
+ \param start receives offset of the token in original text
+ \param len receives number of uchars the token spans in original text
+*/
+YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain,
+ size_t *start, size_t *len);
+
/** \brief ICU tokenizer iterator type (opaque) */
typedef struct icu_iter *yaz_icu_iter_t;
YAZ_EXPORT
int icu_iter_get_token_number(yaz_icu_iter_t iter);
+/** \brief returns start (offset) and length of the last token in the original text
+ \param iter ICU tokenizer iterator
+ \param start receives offset of last token in original text
+ \param len receives length of last token in original text
+ NOTE(review): units look like UTF-16 code units (uchars) rather than bytes; confirm.
+*/
+YAZ_EXPORT
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len);
+
YAZ_END_CDECL
#endif /* YAZ_ICU_H */
int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer,
struct icu_buf_utf16 * tkn16,
- UErrorCode *status);
+ UErrorCode *status,
+ size_t *start, size_t *len);
int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
struct icu_buf_utf8 *result;
struct icu_buf_utf16 *input;
int token_count;
+ size_t org_start;
+ size_t org_len;
struct icu_chain_step *steps;
};
}
dst = icu_buf_utf16_create(0);
iter->status = U_ZERO_ERROR;
- if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status))
+ if (!icu_tokenizer_next_token(step->u.tokenizer, dst, &iter->status,
+ &iter->org_start, &iter->org_len))
{
icu_buf_utf16_destroy(dst);
dst = 0;
/* fill and assign input string.. It will be 0 after
first iteration */
icu_utf16_from_utf8_cstr(iter->input, src8cstr, &iter->status);
+ iter->org_start = 0;
+ iter->org_len = iter->input->utf16_len;
}
void icu_iter_destroy(yaz_icu_iter_t iter)
return iter->token_count;
}
+
+/* Report where the iterator's last token sits in the original text:
+   the recorded offset goes to *start, the recorded length to *len. */
+void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
+{
+    const size_t token_offset = iter->org_start;
+    const size_t token_length = iter->org_len;
+
+    /* write order (start, then len) matches the fields' declaration order */
+    *start = token_offset;
+    *len = token_length;
+}
+
int icu_chain_assign_cstr(struct icu_chain *chain, const char *src8cstr,
UErrorCode *status)
{
return 0;
}
+/* Report the original-text offset and length of the chain's most recent
+   token by delegating to the chain's iterator.
+   chain: ICU chain to query.
+   start: receives the token offset in the original text.
+   len:   receives the token length in the original text.
+   When the chain has no iterator yet, both outputs are set to 0 so that
+   callers passing uninitialised variables never read indeterminate values. */
+void icu_chain_get_org_info(struct icu_chain *chain, size_t *start, size_t *len)
+{
+    if (chain->iter)
+        icu_iter_get_org_info(chain->iter, start, len);
+    else
+    {
+        *start = 0;
+        *len = 0;
+    }
+}
+
+
#endif /* YAZ_HAVE_ICU */
/*
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
struct icu_buf_utf16 *tkn16,
- UErrorCode *status)
+ UErrorCode *status,
+ size_t *start, size_t *len)
{
int32_t tkn_start = 0;
int32_t tkn_end = 0;
tokenizer->token_start = tkn_start;
tokenizer->token_end = tkn_end;
+ *start = tkn_start;
+ *len = tkn_end - tkn_start;
+
/* copying into token buffer if it exists */
if (tkn16)
{
char *nmem_strdupn(NMEM mem, const char *src, size_t n) /* duplicate the first n bytes of src into memory obtained from mem */
{
char *dst = (char *) nmem_malloc(mem, n+1); /* n bytes of payload plus one for the terminator */
- memcpy (dst, src, n);
+ memcpy(dst, src, n);
dst[n] = '\0'; /* result is always NUL-terminated at index n */
return dst; /* NOTE(review): exactly n bytes are copied, so src must have at least n readable bytes */
}
struct icu_buf_utf16 *tkn16 = icu_buf_utf16_create(0);
struct icu_buf_utf8 *tkn8 = icu_buf_utf8_create(0);
struct icu_tokenizer *tokenizer = 0;
+ size_t org_start, org_len;
/* transforming to UTF16 */
icu_utf16_from_utf8_cstr(src16, src8cstr, &status);
icu_check_status(status);
/* perform work on tokens */
- while (icu_tokenizer_next_token(tokenizer, tkn16, &status))
+ while (icu_tokenizer_next_token(tokenizer, tkn16, &status,
+ &org_start, &org_len))
{
icu_check_status(status);
char print[1024];
int xmloutput;
int sortoutput;
+ int org_output;
yaz_icu_chain_t chain;
FILE * infile;
FILE * outfile;
" -c file XML configuration\n"
" -p a|c|l|t Print ICU info \n"
" -s Show sort normalization key\n"
+ " -o Show org positions\n"
" -x XML output instread of text\n"
"\n"
"Examples:\n"
p_config->chain = 0;
p_config->infile = 0;
p_config->outfile = stdout;
+ p_config->org_output = 0;
/* set up command line parameters */
- while ((ret = options("c:p:xs", argv, argc, &arg)) != -2)
+ while ((ret = options("c:op:sx", argv, argc, &arg)) != -2)
{
switch (ret)
{
case 'x':
p_config->xmloutput = 1;
break;
+ case 'o':
+ p_config->org_output = 1;
+ break;
case 0:
if (p_config->infile)
{
success = 0;
else
{
+ size_t start, len;
const char *sortkey = icu_chain_token_sortkey(p_config->chain);
+
+ icu_chain_get_org_info(p_config->chain, &start, &len);
wrbuf_rewind(sw);
wrbuf_puts_escaped(sw, sortkey);
token_count++;
{
fprintf(p_config->outfile, " '%s'", wrbuf_cstr(sw));
}
+ if (p_config->org_output)
+ {
+ fprintf(p_config->outfile, " %ld+%ld",
+ (long) start, (long) len);
+ }
fprintf(p_config->outfile, "\n");
}
}