added sortkey flag to icu_chain constructor
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
1 /*
2  * Copyright (c) 1995-2007, Index Data
3  * All rights reserved.
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Index Data nor the names of its contributors
13  *       may be used to endorse or promote products derived from this
14  *       software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #ifndef ICU_I18NL_H
29 #define ICU_I18NL_H
30
31 #include <yaz/nmem.h>
32
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35
36 #include <unicode/utypes.h>   /* Basic ICU data types */
37 #include <unicode/uchar.h>    /* char names           */
38
39 #include <unicode/ucol.h> 
40 #include <unicode/ubrk.h>
41 #include <unicode/utrans.h>
42
43
/*
 * ICU_CHAIN_SORTKEY is a compile-time switch.  The #ifdef inside
 * enum icu_chain_step_type further down only declares the index and
 * sortkey step types when this macro is defined.  The #define is kept
 * commented out and the macro is explicitly undefined, so those two step
 * types are not declared by this header.
 */
44 /* #define ICU_CHAIN_SORTKEY */
45 #undef ICU_CHAIN_SORTKEY
46
47 /* declared structs and functions */
48
/*
 * Takes an ICU UErrorCode by value and returns an int.  The meaning of the
 * result is not visible in this header -- presumably a success/failure
 * flag; TODO confirm against the implementation.
 */
49 int icu_check_status (UErrorCode status);
50
/* UTF-16 text buffer: a UChar pointer plus two int32_t bookkeeping fields. */
51 struct icu_buf_utf16
52 {
53   UChar * utf16;      /* UChar storage */
54   int32_t utf16_len;  /* presumably UChars currently in use -- TODO confirm */
55   int32_t utf16_cap;  /* presumably UChars allocated -- TODO confirm */
56 };
57
/*
 * UTF-16 buffer management.  Declarations only; the definitions are not in
 * this header.  create/clear/resize/copy each return a
 * struct icu_buf_utf16 pointer -- presumably the buffer that was operated
 * on; TODO confirm.  The unit of `capacity` (UChars vs. bytes) is not
 * stated here.
 */
58 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
59
60 struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16);
61
62 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
63                                             size_t capacity);
64
/* Copy from src16 into dest16 (argument order: destination first). */
65 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
66                                           struct icu_buf_utf16 * src16);
67
68 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
69
70
71
/* UTF-8 byte buffer: a uint8_t pointer plus two int32_t bookkeeping fields. */
72 struct icu_buf_utf8
73 {
74     uint8_t * utf8;     /* byte storage */
75     int32_t utf8_len;   /* presumably bytes currently in use -- TODO confirm */
76     int32_t utf8_cap;   /* presumably bytes allocated -- TODO confirm */
77 };
78
/*
 * UTF-8 buffer management, parallel to the icu_buf_utf16_* functions except
 * that no copy function is declared for UTF-8 buffers.  Declarations only;
 * the semantics of the returned pointers and the unit of `capacity` are not
 * visible in this header -- TODO confirm against the implementation.
 */
79 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
80
81 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8);
82
83 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
84                                           size_t capacity);
85
86 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
87
88
/*
 * Conversions between the UTF-8 and UTF-16 buffer types.  In every
 * prototype the destination is the first argument and the source the
 * second.  Each function both returns a UErrorCode and takes a
 * UErrorCode * status; how the return value relates to *status is not
 * visible in this header -- presumably they carry the same code; TODO
 * confirm.
 */
89 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
90                                struct icu_buf_utf8 * src8,
91                                UErrorCode * status);
92
/* Same direction as above, but the source is a plain C string. */
93 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
94                                     const char * src8cstr,
95                                     UErrorCode * status);
96
97
98 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
99                              struct icu_buf_utf16 * src16,
100                              UErrorCode * status);
101
/* Case-mapping step configuration: a locale name and a one-char action. */
102 struct icu_casemap
103 {
104     char locale[16];  /* locale name, fixed 16-byte storage */
105     char action;      /* single-character selector; valid values are not
                             listed in this header */
106 };
107
/*
 * Case-mapping API.  Declarations only; the set of valid `action`
 * characters and the meaning of the int return values are not visible in
 * this header -- TODO confirm against the implementation.
 */
108 struct icu_casemap * icu_casemap_create(const char *locale, char action,
109                                             UErrorCode *status);
110
111 void icu_casemap_destroy(struct icu_casemap * casemap);
112
/* Apply a struct icu_casemap: src16 is the input, dest16 the output. */
113 int icu_casemap_casemap(struct icu_casemap * casemap,
114                         struct icu_buf_utf16 * dest16,
115                         struct icu_buf_utf16 * src16,
116                         UErrorCode *status);
117
/*
 * Variant taking locale and action directly instead of a
 * struct icu_casemap.
 */
118 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
119                       struct icu_buf_utf16 * src16,
120                       const char *locale, char action,
121                       UErrorCode *status);
122
/*
 * Takes a collator, a UTF-8 destination buffer and a UTF-16 source buffer;
 * by its name it produces a sort key for src16 into dest8 -- TODO confirm
 * the exact key format in the implementation.  Returns a UErrorCode in
 * addition to the UErrorCode * status out-parameter.
 */
123 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
124                                    struct icu_buf_utf8 * dest8, 
125                                    struct icu_buf_utf16 * src16,
126                                    UErrorCode * status);
127
/*
 * Tokenizer state: locale/action configuration, an ICU break iterator, the
 * UTF-16 buffer being tokenized, and counters/offsets for the current token.
 */
128 struct icu_tokenizer
129 {
130     char locale[16];               /* locale name, fixed 16-byte storage */
131     char action;                   /* single-character selector; valid values
                                          are not listed in this header */
132     UBreakIterator* bi;            /* ICU break iterator handle */
133     struct icu_buf_utf16 * buf16;  /* buffer the token offsets refer to (see
                                          the invariant below) */
134     int32_t token_count;           /* upper bound for token_id */
135     int32_t token_id;
136     int32_t token_start;           /* bounded by token_end */
137     int32_t token_end;             /* bounded by buf16->utf16_len */
138 /*
139   Invariants (must hold at all times):
140   0 <= token_start
141   <= token_end
142   <= buf16->utf16_len
143   and
144   0 <= token_id <= token_count
145 */
146 };
147
/*
 * Tokenizer API.  Declarations only; return-value conventions are not
 * visible in this header -- TODO confirm against the implementation.
 */
148 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
149                                             UErrorCode *status);
150
151 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
152
/* Give the tokenizer a UTF-16 source buffer to work on. */
153 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
154                          struct icu_buf_utf16 * src16, UErrorCode *status);
155
/*
 * Advance to the next token; tkn16 is the caller-supplied buffer for it.
 * The meaning of the int32_t result is not stated here -- presumably the
 * token length, with 0 meaning no more tokens; TODO confirm.
 */
156 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
157                                  struct icu_buf_utf16 * tkn16, 
158                                  UErrorCode *status);
159
/*
 * Accessors named after the token_* fields of struct icu_tokenizer;
 * token_length has no matching field -- presumably token_end - token_start;
 * TODO confirm.
 */
160 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
161 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
162 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
163 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
164 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
165
166
167
/*
 * Normalizer state: an action selector, the rules in UTF-16 form, ICU parse
 * error storage and an ICU transliterator handle.
 */
168 struct icu_normalizer
169 {
170     char action;                     /* single-character selector; valid
                                            values are not listed here */
171     struct icu_buf_utf16 * rules16;  /* rule text as UTF-16 */
    /*
     * NOTE(review): this is an array of 256 UParseError structs, not one.
     * ICU's transliterator open functions report into a single
     * UParseError, so the array size looks unintended -- confirm against
     * the implementation before changing (layout and callers depend on it).
     */
172     UParseError parse_error[256];
173     UTransliterator * trans;         /* ICU transliterator handle */
174 };
175
/*
 * Normalizer API.  Declarations only; the rule syntax, the valid `action`
 * characters and the meaning of the int return value are not visible in
 * this header -- TODO confirm against the implementation.
 */
176 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
177                                               UErrorCode *status);
178
179
180 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
181
/* Apply the normalizer: src16 is the input, dest16 the output. */
182 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
183                              struct icu_buf_utf16 * dest16,
184                              struct icu_buf_utf16 * src16,
185                              UErrorCode *status);
186
/*
 * Kind of processing performed by one step of an icu_chain.
 *
 * The index and sortkey enumerators exist only when ICU_CHAIN_SORTKEY is
 * defined.  Because they sit in the middle of the list, defining the macro
 * shifts the numeric values of casemap, normalize and tokenize by two, so
 * all translation units must agree on the macro setting.
 */
187 enum icu_chain_step_type {
188     ICU_chain_step_type_none,
189     ICU_chain_step_type_display,   /* convert to utf8 display format */
190 #ifdef ICU_CHAIN_SORTKEY
191     ICU_chain_step_type_index,     /* convert to utf8 index format  */
192     ICU_chain_step_type_sortkey,   /* convert to utf8 sortkey format */
193 #endif
194     ICU_chain_step_type_casemap,   /* apply utf16 case mapping */
195     ICU_chain_step_type_normalize, /* apply utf16 normalization */
196     ICU_chain_step_type_tokenize   /* apply utf16 tokenization */
197 };
198
199
200
/*
 * One step of an icu_chain: a step type, the matching action object, a
 * scratch UTF-16 buffer, and a link to another step.
 */
201 struct icu_chain_step
202 {
203     /* type and action object */
204     enum icu_chain_step_type type;
    /* presumably `type` selects which union member is valid -- TODO confirm */
205     union {
206         struct icu_casemap * casemap;
207         struct icu_normalizer * normalizer;
208         struct icu_tokenizer * tokenizer;  
209     } u;
210     /* temporary post-action utf16 buffer */
211     struct icu_buf_utf16 * buf16;  
212     struct icu_chain_step * previous;  /* link to the previous step */
213     int more_tokens;                   /* flag; exact protocol not visible
                                              in this header */
214     int need_new_token;                /* flag; exact protocol not visible
                                              in this header */
215 };
216
217
/* Forward declaration: the step constructor below takes a chain pointer
   before struct icu_chain is defined. */
218 struct icu_chain;
219
/*
 * Create a chain step of the given type.  `rule` is passed as raw bytes and
 * `buf16` as a UTF-16 buffer; how each step type interprets them, and who
 * owns buf16 afterwards, is not visible in this header -- TODO confirm.
 */
220 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
221                                               enum icu_chain_step_type type,
222                                               const uint8_t * rule,
223                                               struct icu_buf_utf16 * buf16,
224                                               UErrorCode *status);
225
226
227 void icu_chain_step_destroy(struct icu_chain_step * step);
228
229
/*
 * A processing chain: locale, collator, the UTF-16 source text, a linked
 * list of steps, and the UTF-8 output buffers for the current token.
 */
230 struct icu_chain
231 {
232     uint8_t locale[16];  /* locale name, fixed 16-byte storage */
233     int sort;            /* sortkey flag passed to the chain constructor */
234
235     UCollator * coll;    /* ICU collator handle */
236     
237     /* number of tokens returned so far */
238     int32_t token_count;
239     
240     /* utf8 output buffers */
241     struct icu_buf_utf8 * display8;
242     struct icu_buf_utf8 * norm8;
243     struct icu_buf_utf8 * sort8;
244     
245     /* utf16 source buffer */
246     struct icu_buf_utf16 * src16;
247     
248     /* linked list of chain steps */
249     struct icu_chain_step * steps;
250 };
251
/*
 * Chain API.  Declarations only; return-value conventions and ownership of
 * returned pointers are not visible in this header -- TODO confirm against
 * the implementation.
 */

/* Construct a chain for `locale`; `sort` is the sortkey flag. */
252 struct icu_chain * icu_chain_create(const uint8_t * locale,
253                                     int sort,
254                                     UErrorCode * status);
255
256 void icu_chain_destroy(struct icu_chain * chain);
257
/* Construct a chain from a libxml2 node, with the same locale/sort
   arguments as icu_chain_create. */
258 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node,
259                                         const uint8_t * locale,
260                                         int sort,
261                                         UErrorCode * status);
262
/* Add a step of the given type and rule to the chain. */
263 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
264                                               enum icu_chain_step_type type,
265                                               const uint8_t * rule,
266                                               UErrorCode *status);
267
268 int icu_chain_step_next_token(struct icu_chain * chain,
269                               struct icu_chain_step * step,
270                               UErrorCode *status);
271
/* Give the chain a C string to process -- presumably UTF-8, judging by the
   parameter name; TODO confirm. */
272 int icu_chain_assign_cstr(struct icu_chain * chain,
273                           const char * src8cstr, 
274                           UErrorCode *status);
275
276 int icu_chain_next_token(struct icu_chain * chain,
277                          UErrorCode *status);
278
279 int icu_chain_get_token_count(struct icu_chain * chain);
280
/*
 * Accessors named after the display8 / norm8 / sort8 / coll fields of
 * struct icu_chain.  Lifetime of the returned pointers is not stated here
 * -- presumably valid only until the next token or chain destruction; TODO
 * confirm.
 */
281 const char * icu_chain_get_display(struct icu_chain * chain);
282
283 const char * icu_chain_get_norm(struct icu_chain * chain);
284
285 const char * icu_chain_get_sort(struct icu_chain * chain);
286
287 const UCollator * icu_chain_get_coll(struct icu_chain * chain);
288
289 #endif /* ICU_I18NL_H */
290
291 /*
292  * Local variables:
293  * c-basic-offset: 4
294  * indent-tabs-mode: nil
295  * End:
296  * vim: shiftwidth=4 tabstop=8 expandtab
297  */