Make ICU stuff compatible with old C comps. YAZ License.
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
1 /*
2  * Copyright (c) 1995-2007, Index Data
3  * All rights reserved.
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Index Data nor the names of its contributors
13  *       may be used to endorse or promote products derived from this
14  *       software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #ifndef ICU_I18NL_H
29 #define ICU_I18NL_H
30
31 #include <yaz/nmem.h>
32
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35
36 #include <unicode/utypes.h>   /* Basic ICU data types */
37 #include <unicode/uchar.h>    /* char names           */
38
39 #include <unicode/ucol.h> 
40 #include <unicode/ubrk.h>
41 #include <unicode/utrans.h>
42
43
44
45 /* declared structs and functions */
46
/*
 * Takes an ICU UErrorCode by value and returns an int.
 * NOTE(review): which return value means "ok" is not visible in this
 * header -- confirm against the implementation before relying on it.
 */
47 int icu_check_status (UErrorCode status);
48
/*
 * UTF-16 buffer: a pointer to UChar storage plus two int32_t counters.
 * utf16_len is the upper bound used for token offsets by
 * struct icu_tokenizer (see its invariant comment).
 * NOTE(review): utf16_len / utf16_cap presumably hold the used length and
 * the allocated capacity, counted in UChar units -- confirm against the
 * implementation.
 */
49 struct icu_buf_utf16
50 {
51   UChar * utf16;
52   int32_t utf16_len;
53   int32_t utf16_cap;
54 };
55
/*
 * Life-cycle functions for struct icu_buf_utf16.
 *
 * create, resize and copy each return a struct icu_buf_utf16 pointer;
 * destroy returns nothing. Capacities are passed as size_t although the
 * struct stores them as int32_t.
 * NOTE(review): the unit of `capacity` (UChar units vs. bytes), the value
 * returned on allocation failure, and whether resize/copy return their
 * first argument are not visible in this header -- confirm against the
 * implementation.
 */
56 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
57 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
58                                             size_t capacity);
59 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
60                                           struct icu_buf_utf16 * src16);
61 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
62
63
64
/*
 * UTF-8 buffer: a pointer to uint8_t storage plus two int32_t counters.
 * Mirrors the field layout of struct icu_buf_utf16.
 * NOTE(review): utf8_len / utf8_cap presumably hold the used length and
 * the allocated capacity in bytes; whether the data is kept
 * NUL-terminated is not visible here -- confirm against the
 * implementation.
 */
65 struct icu_buf_utf8
66 {
67     uint8_t * utf8;
68     int32_t utf8_len;
69     int32_t utf8_cap;
70 };
71
/*
 * Life-cycle functions for struct icu_buf_utf8.
 *
 * create and resize return a struct icu_buf_utf8 pointer; destroy returns
 * nothing. Unlike the UTF-16 buffer, no copy function is declared here.
 * NOTE(review): failure behaviour and ownership rules are not visible in
 * this header -- confirm against the implementation.
 */
72 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
73 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
74                                           size_t capacity);
75 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
76
77
/*
 * Conversions between the UTF-8 and UTF-16 buffer types.
 *
 * Each function takes the destination buffer first, then the source
 * (a buffer, or a C string for the _cstr variant), then a UErrorCode
 * in/out pointer. Each also returns a UErrorCode by value.
 * NOTE(review): presumably the returned code equals *status on exit, and
 * the destination is grown as needed -- neither is visible in this
 * header; confirm against the implementation.
 */
78 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
79                                struct icu_buf_utf8 * src8,
80                                UErrorCode * status);
81
82 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
83                                     const char * src8cstr,
84                                     UErrorCode * status);
85
86
87 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
88                              struct icu_buf_utf16 * src16,
89                              UErrorCode * status);
90
/*
 * Case-mapping configuration: a locale name stored inline (at most 15
 * characters plus terminator if kept NUL-terminated) and a one-character
 * action selector.
 * NOTE(review): the set of accepted `action` characters is defined by the
 * implementation, not by this header -- confirm there.
 */
91 struct icu_casemap
92 {
93     char locale[16];
94     char action;
95 };
96
/*
 * Case-mapping API.
 *
 * icu_casemap_create / icu_casemap_destroy manage a struct icu_casemap
 * built from a locale string and an action character.
 * icu_casemap_casemap takes such an object plus destination and source
 * UTF-16 buffers; icu_utf16_casemap takes the locale and action directly
 * instead of an object.
 * All except destroy report through a UErrorCode pointer.
 * NOTE(review): the meaning of the int return values, and what happens
 * when `locale` does not fit the 16-byte field of struct icu_casemap, are
 * not visible in this header -- confirm against the implementation.
 */
97 struct icu_casemap * icu_casemap_create(const char *locale, char action,
98                                             UErrorCode *status);
99
100 void icu_casemap_destroy(struct icu_casemap * casemap);
101
102 int icu_casemap_casemap(struct icu_casemap * casemap,
103                         struct icu_buf_utf16 * dest16,
104                         struct icu_buf_utf16 * src16,
105                         UErrorCode *status);
106
107 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
108                       struct icu_buf_utf16 * src16,
109                       const char *locale, char action,
110                       UErrorCode *status);
111
/*
 * Takes an ICU collator, a UTF-8 destination buffer, a UTF-16 source
 * buffer and a UErrorCode in/out pointer; returns a UErrorCode by value.
 * NOTE(review): by its name this writes the collation sort key of src16
 * into dest8 -- the exact output format and the relation between the
 * returned code and *status should be confirmed in the implementation.
 */
112 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
113                                    struct icu_buf_utf8 * dest8, 
114                                    struct icu_buf_utf16 * src16,
115                                    UErrorCode * status);
116
/*
 * Tokenizer state: locale and action selector, an ICU break iterator,
 * the UTF-16 buffer being tokenized, and counters/offsets describing the
 * current token. The offsets are bounded by buf16->utf16_len as stated in
 * the invariant comment inside the struct.
 * NOTE(review): whether `buf16` is owned by the tokenizer or only
 * borrowed from the caller of icu_tokenizer_attach is not visible in this
 * header -- confirm against the implementation.
 */
117 struct icu_tokenizer
118 {
119     char locale[16];
120     char action;
121     UBreakIterator* bi;
122     struct icu_buf_utf16 * buf16;
123     int32_t token_count;
124     int32_t token_id;
125     int32_t token_start;
126     int32_t token_end;
127 /*
128   keep always invariant
129   0 <= token_start 
130   <= token_end 
131   <= buf16->utf16_len
132   and invariant
133   0 <= token_id <= token_count
134 */
135 };
136
/*
 * Tokenizer API.
 *
 * create / destroy manage a struct icu_tokenizer built from a locale
 * string and an action character. attach takes the tokenizer and a
 * UTF-16 source buffer. next_token takes the tokenizer and a UTF-16
 * buffer `tkn16` and returns an int32_t.
 * The five icu_tokenizer_token_* functions take only the tokenizer and
 * return an int32_t each; their names match the token_id, token_start,
 * token_end and token_count fields, plus a length.
 * NOTE(review): the meaning of the return values of attach and
 * next_token (for example, what signals "no more tokens") is not visible
 * in this header -- confirm against the implementation.
 */
137 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
138                                             UErrorCode *status);
139
140 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
141
142 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
143                          struct icu_buf_utf16 * src16, UErrorCode *status);
144
145 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
146                                  struct icu_buf_utf16 * tkn16, 
147                                  UErrorCode *status);
148
149 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
150 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
151 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
152 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
153 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
154
155
156
/*
 * Normalizer state: an action selector, the rule text as a UTF-16
 * buffer, ICU parse-error storage and an ICU transliterator handle.
 * NOTE(review): parse_error is an array of 256 UParseError structs, not a
 * 256-character buffer. ICU's transliterator open call takes a pointer to
 * a single UParseError, so one element may have been intended. Do not
 * shrink it without checking the implementation, which may pass the
 * array name directly and rely on array-to-pointer decay.
 */
157 struct icu_normalizer
158 {
159     char action;
160     struct icu_buf_utf16 * rules16;
161     UParseError parse_error[256];
162     UTransliterator * trans;
163 };
164
/*
 * Normalizer API.
 *
 * create builds a struct icu_normalizer from a rules string and an action
 * character; destroy releases it. normalize takes the normalizer plus
 * destination and source UTF-16 buffers.
 * create and normalize report through a UErrorCode pointer.
 * NOTE(review): the rule syntax, the accepted `action` values and the
 * meaning of normalize's int return are not visible in this header --
 * confirm against the implementation.
 */
165 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
166                                               UErrorCode *status);
167
168
169 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
170
171 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
172                              struct icu_buf_utf16 * dest16,
173                              struct icu_buf_utf16 * src16,
174                              UErrorCode *status);
175
/*
 * Kinds of step that can appear in a chain (see struct icu_chain_step).
 * The casemap, normalize and tokenize kinds correspond to the three
 * members of the union in struct icu_chain_step; the other kinds have no
 * union member.
 */
176 enum icu_chain_step_type {
177     ICU_chain_step_type_none,
178     ICU_chain_step_type_display,   /* convert to utf8 display format */
179     ICU_chain_step_type_index,     /* convert to utf8 index format  */
180     ICU_chain_step_type_sortkey,   /* convert to utf8 sortkey format */
181     ICU_chain_step_type_casemap,   /* apply utf16 casemap */
182     ICU_chain_step_type_normalize, /* apply utf16 normalization */
183     ICU_chain_step_type_tokenize   /* apply utf16 tokenization */
184 };
185
186
187
/*
 * One step of a processing chain: a type tag, a union holding the object
 * for that step type, a UTF-16 buffer, a link to another step and two
 * int flags.
 * NOTE(review): `type` presumably selects which union member is valid,
 * and more_tokens / need_new_token presumably drive token iteration
 * across steps -- neither is visible in this header; confirm against the
 * implementation.
 */
188 struct icu_chain_step
189 {
190     /* type and action object */
191     enum icu_chain_step_type type;
192     union {
193         struct icu_casemap * casemap;
194         struct icu_normalizer * normalizer;
195         struct icu_tokenizer * tokenizer;  
196     } u;
197     /* temporary post-action utf16 buffer */
198     struct icu_buf_utf16 * buf16;  
199     struct icu_chain_step * previous;
200     int more_tokens;
201     int need_new_token;
202 };
203
204
/*
 * Forward declaration of struct icu_chain (defined further down in this
 * header) so the step functions can take a chain pointer.
 *
 * icu_chain_step_create takes the owning chain, the step type, a rule
 * given as a uint8_t string, a UTF-16 buffer and a UErrorCode pointer,
 * and returns a step pointer. icu_chain_step_destroy takes a step.
 * NOTE(review): how `rule` is interpreted for each step type, who owns
 * `buf16` afterwards, and whether destroy also releases the steps
 * reachable through `previous` are not visible in this header -- confirm
 * against the implementation.
 */
205 struct icu_chain;
206
207 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
208                                               enum icu_chain_step_type type,
209                                               const uint8_t * rule,
210                                               struct icu_buf_utf16 * buf16,
211                                               UErrorCode *status);
212
213
214 void icu_chain_step_destroy(struct icu_chain_step * step);
215
216
/*
 * A processing chain: an identifier and a locale stored inline, a running
 * token counter, three UTF-8 output buffers (display, norm, sort), the
 * UTF-16 source buffer and the list of steps.
 * The list is reached through `steps`; each struct icu_chain_step links
 * to another through its `previous` member.
 */
217 struct icu_chain
218 {
219     uint8_t identifier[128];
220     uint8_t locale[16];
221     
222     /* number of tokens returned so far */
223     int32_t token_count;
224     
225     /* utf8 output buffers */
226     struct icu_buf_utf8 * display8;
227     struct icu_buf_utf8 * norm8;
228     struct icu_buf_utf8 * sort8;
229     
230     /* utf16 source buffer */
231     struct icu_buf_utf16 * src16;
232     
233     /* linked list of chain steps */
234     struct icu_chain_step * steps;
235 };
236
/*
 * Chain API.
 *
 * icu_chain_create builds a chain from an identifier and a locale;
 * icu_chain_destroy releases it. icu_chain_xml_config builds a chain from
 * a libxml2 node. icu_chain_insert_step takes a chain, a step type and a
 * rule, and returns a step pointer.
 * icu_chain_assign_cstr takes a chain and a C string;
 * icu_chain_next_token and icu_chain_step_next_token take the chain (and
 * a step) and return an int.
 * icu_chain_get_token_count returns an int; the three icu_chain_get_*
 * string getters return const char pointers and are named after the
 * display8, norm8 and sort8 buffers of struct icu_chain.
 * NOTE(review): not visible in this header, so confirm against the
 * implementation: the XML element/attribute layout expected by
 * icu_chain_xml_config, the meaning of the int return values (including
 * how "no more tokens" is signalled), and the lifetime of the pointers
 * returned by the getters (presumably valid only until the next token is
 * fetched or the chain is destroyed).
 */
237 struct icu_chain * icu_chain_create(const uint8_t * identifier, 
238                                     const uint8_t * locale);
239
240 void icu_chain_destroy(struct icu_chain * chain);
241
242 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, 
243                                         UErrorCode * status);
244
245
246 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
247                                               enum icu_chain_step_type type,
248                                               const uint8_t * rule,
249                                               UErrorCode *status);
250
251
252 int icu_chain_step_next_token(struct icu_chain * chain,
253                               struct icu_chain_step * step,
254                               UErrorCode *status);
255
256 int icu_chain_assign_cstr(struct icu_chain * chain,
257                           const char * src8cstr, 
258                           UErrorCode *status);
259
260 int icu_chain_next_token(struct icu_chain * chain,
261                          UErrorCode *status);
262
263 int icu_chain_get_token_count(struct icu_chain * chain);
264
265 const char * icu_chain_get_display(struct icu_chain * chain);
266
267 const char * icu_chain_get_norm(struct icu_chain * chain);
268
269 const char * icu_chain_get_sort(struct icu_chain * chain);
270
271 #endif /* ICU_I18NL_H */
272
273 /*
274  * Local variables:
275  * c-basic-offset: 4
276  * indent-tabs-mode: nil
277  * End:
278  * vim: shiftwidth=4 tabstop=8 expandtab
279  */