X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=include%2Fyaz%2Ficu.h;h=973cc5d8adaa67d85bac9371704f2ee925003fb9;hp=7674293c29f9af5e6b3cdff1c25b4b398b9c3bd2;hb=ad88b93c8fbb00728acd0b49b4079167304ed58d;hpb=9b6117759c80f45b99b47fba0651e2b7912ca8dc diff --git a/include/yaz/icu.h b/include/yaz/icu.h index 7674293..973cc5d 100644 --- a/include/yaz/icu.h +++ b/include/yaz/icu.h @@ -1,5 +1,5 @@ -/* - * Copyright (c) 1995-2007, Index Data +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2013 Index Data. * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,39 +35,157 @@ #include -#include +#include #include YAZ_BEGIN_CDECL +/** \brief opaque ICU chain */ typedef struct icu_chain *yaz_icu_chain_t; -YAZ_EXPORT yaz_icu_chain_t icu_chain_create(const char * locale, - int sort, - UErrorCode * status); - +/** \brief destroys ICU chain */ YAZ_EXPORT void icu_chain_destroy(yaz_icu_chain_t chain); +/** \brief constructs ICU chain from XML specification + \param xml_node icu_chain XML node - with attribute locale in it + \param sort 1 if ICU chain is to deal with sort keys; 0 otherwise + \param status May include ICU error code on failure + \returns chain ptr or NULL on failure in which case status may hold info +*/ YAZ_EXPORT yaz_icu_chain_t icu_chain_xml_config(const xmlNode *xml_node, - int sort, - UErrorCode * status); - + int sort, + UErrorCode *status); +/** \brief pass string to ICU for parsing/tokenization/etc + \param chain ICU chain to be used for parsing + \param src8cstr input C string (null-terminated) + \param status may include ICU error on failure + \retval 0 failure + \retval 1 success +*/ YAZ_EXPORT int icu_chain_assign_cstr(yaz_icu_chain_t chain, - const char * src8cstr, - UErrorCode *status); + const char *src8cstr, + UErrorCode *status); +/** \brief returns one token (if any) + \param chain 
ICU chain + \param status may include ICU error on failure + \retval 0 error or end-of-tokens (no more tokens) + \retval >0 token number (1, 2, 3, ..) + + This function tries to move to "next" token in assigned + C-string .. Or returns 0 if no more is to be found +*/ YAZ_EXPORT int icu_chain_next_token(yaz_icu_chain_t chain, - UErrorCode *status); + UErrorCode *status); +/** \brief returns token number of last token processed + \param chain ICU chain + \returns token number (numbered from 1) +*/ YAZ_EXPORT int icu_chain_token_number(yaz_icu_chain_t chain); +/** \brief returns display token of last token processed + \param chain ICU chain + \returns display token string (C string) + This function returns display string for last token returned + by icu_chain_next_token. +*/ YAZ_EXPORT const char * icu_chain_token_display(yaz_icu_chain_t chain); +/** \brief returns normalized token of last token processed + \param chain ICU chain + \returns normalized token string (C string) + This function returns normalized string for last token returned + by icu_chain_next_token. +*/ YAZ_EXPORT const char * icu_chain_token_norm(yaz_icu_chain_t chain); +/** \brief returns sortkey token of last token processed + \param chain ICU chain + \returns sortkey token string (C string) + This function returns sortkey string for last token returned + by icu_chain_next_token. 
+*/ YAZ_EXPORT const char * icu_chain_token_sortkey(yaz_icu_chain_t chain); +/** \brief returns token as it relates to original text + \param chain ICU chain + \param start offset in original text + \param len number of uchars in original text +*/ +YAZ_EXPORT void icu_chain_get_org_info(yaz_icu_chain_t chain, + size_t *start, size_t *len); + +/** \brief ICU tokenizer iterator type (opaque) */ +typedef struct icu_iter *yaz_icu_iter_t; + +/** \brief create ICU tokenizer iterator from chain + \param chain ICU chain + \returns ICU iterator +*/ +YAZ_EXPORT +yaz_icu_iter_t icu_iter_create(struct icu_chain *chain); + +/** \brief starts iteration over string + \param iter ICU tokenizer iterator + \param src8cstr input string (0-terminated) + + Call icu_iter_next to iterate over each token. +*/ +YAZ_EXPORT +void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr); + +/** \brief iterates over one token + \param iter ICU tokenizer iterator + \retval 0 no more tokens (EOF) + \retval 1 got one token (use icu_iter_get..-functions) +*/ +YAZ_EXPORT +int icu_iter_next(yaz_icu_iter_t iter); + +/** \brief destroy ICU tokenizer iterator + \param iter ICU tokenizer iterator +*/ +YAZ_EXPORT +void icu_iter_destroy(yaz_icu_iter_t iter); + +/** \brief returns ICU normalized token + \param iter ICU tokenizer iterator + \returns string (0-terminated) +*/ +YAZ_EXPORT +const char *icu_iter_get_norm(yaz_icu_iter_t iter); + +/** \brief returns ICU sortkey string + \param iter ICU tokenizer iterator + \returns string (0-terminated) +*/ +YAZ_EXPORT +const char *icu_iter_get_sortkey(yaz_icu_iter_t iter); + +/** \brief returns ICU display string + \param iter ICU tokenizer iterator + \returns string (0-terminated) +*/ +YAZ_EXPORT +const char *icu_iter_get_display(yaz_icu_iter_t iter); + +/** \brief returns ICU token count for iterator + \param iter ICU tokenizer iterator + \returns token count (1, 2, 3...) 
+*/ +YAZ_EXPORT +int icu_iter_get_token_number(yaz_icu_iter_t iter); + +/** \brief returns ICU original token start (offset) and length + \param iter ICU tokenizer iterator + \param start offset of last token in original text + \param len length of last token in original text +*/ +YAZ_EXPORT +void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len); + YAZ_END_CDECL #endif /* YAZ_ICU_H */ @@ -75,7 +193,9 @@ YAZ_END_CDECL /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab */ +