Use __int64 type for 64-bit YAZ on Windows
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2009 Index Data.
3  * All rights reserved.
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Index Data nor the names of its contributors
13  *       may be used to endorse or promote products derived from this
14  *       software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 /** \file
29     \brief Internal header for ICU utilities
30
31     These functions, while non-static, are considered unstable and internal
32     and may be renamed for each YAZ release.
33 */
34
35 #ifndef ICU_I18NL_H
36 #define ICU_I18NL_H
37
38 #include <yaz/yconfig.h>
39
40 #include <unicode/utypes.h>   /* Basic ICU data types */
41 #include <unicode/uchar.h>    /* char names           */
42
43 #include <unicode/ucol.h> 
44 #include <unicode/ubrk.h>
45 #include <unicode/utrans.h>
46
47 #include <yaz/icu.h>
48
49 /* declared structs and functions */
50
/** Takes an ICU UErrorCode by value and returns an int.
    NOTE(review): the meaning of the returned int (e.g. which value signals
    success) is not defined in this header - confirm in the implementation. */
51 int icu_check_status (UErrorCode status);
52
/** UTF-16 buffer: a pointer to UChar data plus two int32_t bookkeeping
    fields.
    NOTE(review): by their names, utf16_len is presumably the number of
    UChars in use and utf16_cap the allocated capacity - confirm in the
    implementation. */
53 struct icu_buf_utf16
54 {
    /* UChar storage */
55     UChar * utf16;
    /* presumably the used length, in UChars - TODO confirm */
56     int32_t utf16_len;
    /* presumably the allocated capacity, in UChars - TODO confirm */
57     int32_t utf16_cap;
58 };
59
/* Management functions for struct icu_buf_utf16.
   NOTE(review): ownership and NULL handling are not specified in this
   header; verify against the implementation before relying on them.
   The capacity arguments are size_t whereas the struct stores utf16_cap as
   int32_t, so values above INT32_MAX cannot be represented in the struct. */

/** Returns a pointer to a struct icu_buf_utf16 for the requested capacity. */
60 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
61
/** Takes a buffer and returns a buffer pointer.
    NOTE(review): presumably resets the content and returns buf16 - confirm. */
62 struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16);
63
/** Takes a buffer and a new capacity and returns a buffer pointer.
    NOTE(review): whether existing content is preserved is not visible
    here - confirm. */
64 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
65                                             size_t capacity);
66
/** Takes a destination and a source buffer and returns a buffer pointer.
    Note that src16 is not const-qualified in this declaration. */
67 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
68                                           struct icu_buf_utf16 * src16);
69
/** Counterpart of icu_buf_utf16_create; returns nothing. */
70 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
71
72
73
/** UTF-8 buffer: a pointer to uint8_t data plus two int32_t bookkeeping
    fields, mirroring the layout of struct icu_buf_utf16.
    NOTE(review): by their names, utf8_len is presumably the number of bytes
    in use and utf8_cap the allocated capacity - confirm in the
    implementation. */
74 struct icu_buf_utf8
75 {
    /* byte storage */
76     uint8_t * utf8;
    /* presumably the used length, in bytes - TODO confirm */
77     int32_t utf8_len;
    /* presumably the allocated capacity, in bytes - TODO confirm */
78     int32_t utf8_cap;
79 };
80
/* Management functions for struct icu_buf_utf8. Unlike the UTF-16 buffer,
   no copy function is declared here.
   NOTE(review): ownership and NULL handling are not specified in this
   header; verify against the implementation.
   The capacity arguments are size_t whereas the struct stores utf8_cap as
   int32_t, so values above INT32_MAX cannot be represented in the struct. */

/** Returns a pointer to a struct icu_buf_utf8 for the requested capacity. */
81 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
82
/** Takes a buffer and returns a buffer pointer.
    NOTE(review): presumably resets the content and returns buf8 - confirm. */
83 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8);
84
/** Takes a buffer and a new capacity and returns a buffer pointer. */
85 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
86                                           size_t capacity);
87
/** Counterpart of icu_buf_utf8_create; returns nothing. */
88 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
89
90
/* Conversion functions between the UTF-8 and UTF-16 buffer types.
   Each one both returns a UErrorCode and takes a UErrorCode pointer.
   NOTE(review): presumably the return value equals *status on exit -
   confirm in the implementation. */

/** Converts from a UTF-8 buffer (src8) into a UTF-16 buffer (dest16).
    Note that src8 is not const-qualified in this declaration. */
91 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
92                                struct icu_buf_utf8 * src8,
93                                UErrorCode * status);
94
/** Same destination type as icu_utf16_from_utf8, but the source is a
    const char pointer (src8cstr).
    NOTE(review): the name suggests a NUL-terminated UTF-8 string - confirm. */
95 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
96                                     const char * src8cstr,
97                                     UErrorCode * status);
98
99
/** Converts from a UTF-16 buffer (src16) into a UTF-8 buffer (dest8).
    Note that src16 is not const-qualified in this declaration. */
100 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
101                              struct icu_buf_utf16 * src16,
102                              UErrorCode * status);
103
/** Case-mapping step object; holds a single action character.
    NOTE(review): the set of valid action codes is not defined in this
    header - see the implementation. */
104 struct icu_casemap
105 {
    /* action code selecting the case-mapping operation */
106     char action;
107 };
108
/** Returns a struct icu_casemap pointer for the given action character;
    errors are reported through the status pointer.
    NOTE(review): behaviour for an unrecognised action is not visible
    here - confirm. */
109 struct icu_casemap * icu_casemap_create(char action, UErrorCode *status);
110
/** Counterpart of icu_casemap_create; returns nothing. */
111 void icu_casemap_destroy(struct icu_casemap * casemap);
112
/** Applies a casemap object: takes a source (src16) and destination
    (dest16) UTF-16 buffer, a status pointer and a locale string.
    Returns an int whose meaning is not defined in this header.
    Note the parameter order: status comes before locale here, whereas
    icu_utf16_casemap below puts locale and action before status. */
113 int icu_casemap_casemap(struct icu_casemap * casemap,
114                         struct icu_buf_utf16 * dest16,
115                         struct icu_buf_utf16 * src16,
116                         UErrorCode *status,
117                         const char *locale);
118
/** Variant taking the action character directly instead of a
    struct icu_casemap object. Returns an int whose meaning is not defined
    in this header. */
119 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
120                       struct icu_buf_utf16 * src16,
121                       const char *locale, char action,
122                       UErrorCode *status);
123
/** Takes an ICU collator, a UTF-8 destination buffer and a UTF-16 source
    buffer. Returns nothing, so any error can only be reported through
    status.
    NOTE(review): presumably writes the collation sort key of src16 into
    dest8 - confirm in the implementation. */
124 void icu_sortkey8_from_utf16(UCollator *coll,
125                              struct icu_buf_utf8 * dest8, 
126                              struct icu_buf_utf16 * src16,
127                              UErrorCode * status);
128
/** Tokenizer state: an action character, an ICU break iterator, a UTF-16
    buffer pointer and four int32_t token bookkeeping fields. The invariants
    the fields must satisfy are listed at the end of the struct. */
129 struct icu_tokenizer
130 {
    /* action code; valid values are not defined in this header */
131     char action;
    /* ICU break iterator used by this tokenizer */
132     UBreakIterator* bi;
    /* UTF-16 buffer that the token offsets below refer to (see invariant) */
133     struct icu_buf_utf16 * buf16;
    /* upper bound for token_id (see invariant) */
134     int32_t token_count;
    /* bounded by 0 and token_count (see invariant) */
135     int32_t token_id;
    /* start offset; never negative, never above token_end (see invariant) */
136     int32_t token_start;
    /* end offset; never above buf16->utf16_len (see invariant) */
137     int32_t token_end;
138 /*
139   keep always invariant
140   0 <= token_start 
141   <= token_end 
142   <= buf16->utf16_len
143   and invariant
144   0 <= token_id <= token_count
145 */
146 };
147
/** Returns a struct icu_tokenizer pointer for the given locale string and
    action character; errors are reported through the status pointer. */
148 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
149                                             UErrorCode *status);
150
/** Counterpart of icu_tokenizer_create; returns nothing. */
151 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
152
/** Takes a tokenizer and a UTF-16 source buffer.
    NOTE(review): presumably makes src16 the text to be tokenized; whether
    the buffer is copied or referenced is not visible here - confirm.
    The meaning of the int return value is not defined in this header. */
153 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
154                          struct icu_buf_utf16 * src16, UErrorCode *status);
155
/** Takes a tokenizer and a UTF-16 buffer (tkn16) and returns an int32_t.
    NOTE(review): presumably stores the next token in tkn16; the meaning of
    the return value (length, or 0 at end of input?) needs confirming. */
156 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
157                                  struct icu_buf_utf16 * tkn16, 
158                                  UErrorCode *status);
159
/* Accessors returning int32_t values for a tokenizer. Four of them share
   their suffix with a field of struct icu_tokenizer (token_id, token_start,
   token_end, token_count); token_length has no matching field.
   NOTE(review): token_length is presumably token_end - token_start -
   confirm. */
160 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
161 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
162 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
163 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
164 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
165
166
167
/** Transform step object: an action character, an ICU UParseError and an
    ICU transliterator pointer. */
168 struct icu_transform
169 {
    /* action code; valid values are not defined in this header */
170     char action;
    /* ICU parse-error record; presumably filled when rules fail to parse -
       TODO confirm */
171     UParseError parse_error;
    /* ICU transliterator used by this transform */
172     UTransliterator * trans;
173 };
174
/** Returns a struct icu_transform pointer built from an id string, an
    action character and a rules string; errors are reported through the
    status pointer.
    NOTE(review): how id and rules interact (e.g. whether either may be
    NULL) is not visible in this header - confirm. */
175 struct icu_transform * icu_transform_create(const char *id, char action,
176                                             const char *rules,
177                                             UErrorCode *status);
178
/** Counterpart of icu_transform_create; returns nothing. */
179 void icu_transform_destroy(struct icu_transform * transform);
180
/** Applies a transform object: takes a source (src16) and destination
    (dest16) UTF-16 buffer and a status pointer. Returns an int whose
    meaning is not defined in this header. */
181 int icu_transform_trans(struct icu_transform * transform,
182                         struct icu_buf_utf16 * dest16,
183                         struct icu_buf_utf16 * src16,
184                         UErrorCode *status);
185
/** Kind of processing performed by one step of an ICU chain. */
186 enum icu_chain_step_type {
187     ICU_chain_step_type_none,
188     ICU_chain_step_type_display,   /* convert to utf8 display format */
189     ICU_chain_step_type_casemap,   /* apply utf16 casemap */
190     ICU_chain_step_type_transform, /* apply utf16 transform */
191     ICU_chain_step_type_tokenize,  /* apply utf16 tokenization */
192     ICU_chain_step_type_transliterate  /* NOTE(review): original comment repeated "apply utf16 tokenization"; presumably transliteration - confirm */
193 };
194
195
196
/** One step of an ICU chain: a step type, a union holding the matching
    action object, a UTF-16 work buffer and a pointer to another step. */
197 struct icu_chain_step
198 {
199     /* type and action object */
200     enum icu_chain_step_type type;
    /* The union has members for casemap, transform and tokenizer only;
       there is no member for the display or transliterate step types.
       NOTE(review): presumably the active member is selected by type -
       confirm. */
201     union {
202         struct icu_casemap * casemap;
203         struct icu_transform * transform;
204         struct icu_tokenizer * tokenizer;  
205     } u;
206     /* temporary post-action utf16 buffer */
207     struct icu_buf_utf16 * buf16;  
    /* link to another step; struct icu_chain describes its steps as a
       linked list */
208     struct icu_chain_step * previous;
    /* NOTE(review): presumably flags (nonzero = true) - confirm exact
       semantics in the implementation */
209     int more_tokens;
210     int need_new_token;
211 };
212
213
/* Forward declaration; the full definition follows further down in this
   header. */
214 struct icu_chain;
215
/** Returns a struct icu_chain_step pointer for the given chain, step type,
    rule bytes and UTF-16 buffer; errors are reported through the status
    pointer.
    NOTE(review): yaz_icu_chain_t is not declared in this header
    (presumably it comes from yaz/icu.h); the format of rule for each step
    type is not visible here - confirm. */
216 struct icu_chain_step * icu_chain_step_create(yaz_icu_chain_t chain,
217                                               enum icu_chain_step_type type,
218                                               const uint8_t * rule,
219                                               struct icu_buf_utf16 * buf16,
220                                               UErrorCode *status);
221
222
/** Counterpart of icu_chain_step_create; returns nothing.
    NOTE(review): whether linked previous steps are destroyed as well is
    not visible here - confirm. */
223 void icu_chain_step_destroy(struct icu_chain_step * step);
224
225
/** Chain object: locale, sort setting, source string, collator, token
    counter, three UTF-8 output buffers, a UTF-16 source buffer and the
    list of processing steps. */
226 struct icu_chain
227 {
    /* locale string; non-const char pointer (ownership not visible here) */
228     char *locale;
    /* sort setting, as passed to icu_chain_create */
229     int sort;
230
    /* const pointer to a UTF-8 C string; NOTE(review): presumably the
       caller's input text, referenced rather than copied - confirm */
231     const char * src8cstr;
232
    /* ICU collator; also exposed through icu_chain_get_coll */
233     UCollator * coll;
234     
235     /* number of tokens returned so far */
236     int32_t token_count;
237     
238     /* utf8 output buffers */
239     struct icu_buf_utf8 * display8;
240     struct icu_buf_utf8 * norm8;
241     struct icu_buf_utf8 * sort8;
242     
243     /* utf16 source buffer */
244     struct icu_buf_utf16 * src16;
245     
246     /* linked list of chain steps */
247     struct icu_chain_step * steps;
248 };
249
/** Takes a chain, a step type and rule bytes and returns a
    struct icu_chain_step pointer; errors are reported through the status
    pointer.
    NOTE(review): where the new step is placed in the chain's list is not
    visible here - confirm. */
250 struct icu_chain_step * icu_chain_insert_step(yaz_icu_chain_t chain,
251                                               enum icu_chain_step_type type,
252                                               const uint8_t * rule,
253                                               UErrorCode *status);
254
/** Takes a chain and one of its steps; returns an int whose meaning is not
    defined in this header.
    NOTE(review): presumably nonzero while the step yields tokens -
    confirm. */
255 int icu_chain_step_next_token(yaz_icu_chain_t chain,
256                               struct icu_chain_step * step,
257                               UErrorCode *status);
258
/** Returns an int for the chain.
    NOTE(review): presumably the token_count field of struct icu_chain
    ("number of tokens returned so far") - confirm. */
259 int icu_chain_token_number(yaz_icu_chain_t chain);
260
/** Returns a const UCollator pointer for the chain.
    NOTE(review): presumably the coll field of struct icu_chain - confirm. */
261 const UCollator * icu_chain_get_coll(yaz_icu_chain_t chain);
262
/** Returns a chain handle for the given locale string and sort setting;
    errors are reported through the status pointer.
    No matching destroy function is declared in this header. */
263 yaz_icu_chain_t icu_chain_create(const char * locale,
264                                  int sort,
265                                  UErrorCode * status);
266
267
268 #endif /* ICU_I18NL_H */
269
270 /*
271  * Local variables:
272  * c-basic-offset: 4
273  * c-file-style: "Stroustrup"
274  * indent-tabs-mode: nil
275  * End:
276  * vim: shiftwidth=4 tabstop=8 expandtab
277  */
278