Public header icu.h, private header icu_I18N.h.
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
1 /*
2  * Copyright (c) 1995-2007, Index Data
3  * All rights reserved.
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Index Data nor the names of its contributors
13  *       may be used to endorse or promote products derived from this
14  *       software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 /** \file
29     \brief Internal header for ICU utilities
30 */
31
32 #ifndef ICU_I18NL_H
33 #define ICU_I18NL_H
34
35 #include <yaz/yconfig.h>
36
37 #include <libxml/tree.h>
38
39 #include <unicode/utypes.h>   /* Basic ICU data types */
40 #include <unicode/uchar.h>    /* char names           */
41
42 #include <unicode/ucol.h> 
43 #include <unicode/ubrk.h>
44 #include <unicode/utrans.h>
45
46 #include <yaz/icu.h>
47
48 /* declared structs and functions */
49
50 int icu_check_status (UErrorCode status);
51
52 struct icu_buf_utf16
53 {
54     UChar * utf16;
55     int32_t utf16_len;
56     int32_t utf16_cap;
57 };
58
/** \brief Creates a UTF-16 buffer.
    \param capacity requested capacity (presumably in UChar units; confirm)
    \returns pointer to a struct icu_buf_utf16
*/
struct icu_buf_utf16 *icu_buf_utf16_create(size_t capacity);

/** \brief Clears a UTF-16 buffer.
    \param buf16 buffer to operate on
    \returns pointer to a struct icu_buf_utf16 (presumably buf16; confirm)
*/
struct icu_buf_utf16 *icu_buf_utf16_clear(struct icu_buf_utf16 *buf16);

/** \brief Resizes a UTF-16 buffer.
    \param buf16 buffer to operate on
    \param capacity requested capacity (presumably in UChar units; confirm)
    \returns pointer to a struct icu_buf_utf16 (presumably buf16; confirm)
*/
struct icu_buf_utf16 *icu_buf_utf16_resize(struct icu_buf_utf16 *buf16,
                                           size_t capacity);

/** \brief Copies one UTF-16 buffer into another.
    \param dest16 destination buffer
    \param src16 source buffer
    \returns pointer to a struct icu_buf_utf16 (presumably dest16; confirm)
*/
struct icu_buf_utf16 *icu_buf_utf16_copy(struct icu_buf_utf16 *dest16,
                                         struct icu_buf_utf16 *src16);

/** \brief Destroys a UTF-16 buffer.
    \param buf16 buffer to dispose of
*/
void icu_buf_utf16_destroy(struct icu_buf_utf16 *buf16);
70
71
72
/** \brief UTF-8 buffer: a byte pointer plus two int32_t bookkeeping fields.
    NOTE(review): utf8_len / utf8_cap presumably hold the used length and the
    allocated capacity in bytes; confirm against the implementation.
*/
struct icu_buf_utf8 {
    uint8_t *utf8;
    int32_t utf8_len;
    int32_t utf8_cap;
};
79
80 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
81
82 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8);
83
84 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
85                                           size_t capacity);
86
87 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
88
89
90 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
91                                struct icu_buf_utf8 * src8,
92                                UErrorCode * status);
93
94 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
95                                     const char * src8cstr,
96                                     UErrorCode * status);
97
98
99 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
100                              struct icu_buf_utf16 * src16,
101                              UErrorCode * status);
102
/** \brief Case-mapping step settings: a 16-byte locale field and a
    one-character action selector.
    NOTE(review): the set of valid action characters is not visible in this
    header; see the implementation.
*/
struct icu_casemap {
    char locale[16];
    char action;
};
108
109 struct icu_casemap * icu_casemap_create(const char *locale, char action,
110                                         UErrorCode *status);
111
112 void icu_casemap_destroy(struct icu_casemap * casemap);
113
114 int icu_casemap_casemap(struct icu_casemap * casemap,
115                         struct icu_buf_utf16 * dest16,
116                         struct icu_buf_utf16 * src16,
117                         UErrorCode *status);
118
119 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
120                       struct icu_buf_utf16 * src16,
121                       const char *locale, char action,
122                       UErrorCode *status);
123
124 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
125                                    struct icu_buf_utf8 * dest8, 
126                                    struct icu_buf_utf16 * src16,
127                                    UErrorCode * status);
128
129 struct icu_tokenizer
130 {
131     char locale[16];
132     char action;
133     UBreakIterator* bi;
134     struct icu_buf_utf16 * buf16;
135     int32_t token_count;
136     int32_t token_id;
137     int32_t token_start;
138     int32_t token_end;
139 /*
140   keep always invariant
141   0 <= token_start 
142   <= token_end 
143   <= buf16->utf16_len
144   and invariant
145   0 <= token_id <= token_count
146 */
147 };
148
149 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
150                                             UErrorCode *status);
151
152 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
153
154 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
155                          struct icu_buf_utf16 * src16, UErrorCode *status);
156
157 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
158                                  struct icu_buf_utf16 * tkn16, 
159                                  UErrorCode *status);
160
161 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
162 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
163 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
164 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
165 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
166
167
168
169 struct icu_normalizer
170 {
171     char action;
172     struct icu_buf_utf16 * rules16;
173     UParseError parse_error[256];
174     UTransliterator * trans;
175 };
176
177 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
178                                               UErrorCode *status);
179
180
181 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
182
183 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
184                              struct icu_buf_utf16 * dest16,
185                              struct icu_buf_utf16 * src16,
186                              UErrorCode *status);
187
/** \brief Kind of processing performed by one step of an ICU chain. */
enum icu_chain_step_type {
    ICU_chain_step_type_none = 0,
    /* convert to utf8 display format */
    ICU_chain_step_type_display = 1,
    /* apply utf16 case mapping */
    ICU_chain_step_type_casemap = 2,
    /* apply utf16 normalization */
    ICU_chain_step_type_normalize = 3,
    /* apply utf16 tokenization */
    ICU_chain_step_type_tokenize = 4
};
195
196
197
198 struct icu_chain_step
199 {
200     /* type and action object */
201     enum icu_chain_step_type type;
202     union {
203         struct icu_casemap * casemap;
204         struct icu_normalizer * normalizer;
205         struct icu_tokenizer * tokenizer;  
206     } u;
207     /* temprary post-action utf16 buffer */
208     struct icu_buf_utf16 * buf16;  
209     struct icu_chain_step * previous;
210     int more_tokens;
211     int need_new_token;
212 };
213
214
215 struct icu_chain;
216
217 struct icu_chain_step * icu_chain_step_create(yaz_icu_chain_t chain,
218                                               enum icu_chain_step_type type,
219                                               const uint8_t * rule,
220                                               struct icu_buf_utf16 * buf16,
221                                               UErrorCode *status);
222
223
224 void icu_chain_step_destroy(struct icu_chain_step * step);
225
226
227 struct icu_chain
228 {
229     uint8_t locale[16];
230     int sort;
231
232     const char * src8cstr;
233
234     UCollator * coll;
235     
236     /* number of tokens returned so far */
237     int32_t token_count;
238     
239     /* utf8 output buffers */
240     struct icu_buf_utf8 * display8;
241     struct icu_buf_utf8 * norm8;
242     struct icu_buf_utf8 * sort8;
243     
244     /* utf16 source buffer */
245     struct icu_buf_utf16 * src16;
246     
247     /* linked list of chain steps */
248     struct icu_chain_step * steps;
249 };
250
251 struct icu_chain_step * icu_chain_insert_step(yaz_icu_chain_t chain,
252                                               enum icu_chain_step_type type,
253                                               const uint8_t * rule,
254                                               UErrorCode *status);
255
256 int icu_chain_step_next_token(yaz_icu_chain_t chain,
257                               struct icu_chain_step * step,
258                               UErrorCode *status);
259
260 int icu_chain_token_number(yaz_icu_chain_t chain);
261
262 const UCollator * icu_chain_get_coll(yaz_icu_chain_t chain);
263
264 #endif /* ICU_I18NL_H */
265
266 /*
267  * Local variables:
268  * c-basic-offset: 4
269  * indent-tabs-mode: nil
270  * End:
271  * vim: shiftwidth=4 tabstop=8 expandtab
272  */