Added support for rule-based transliterator for ICU wrapper of YAZ
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2009 Index Data.
3  * All rights reserved.
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Index Data nor the names of its contributors
13  *       may be used to endorse or promote products derived from this
14  *       software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 /** \file
29     \brief Internal header for ICU utilities
30 */
31
32 #ifndef ICU_I18NL_H
33 #define ICU_I18NL_H
34
35 #include <yaz/yconfig.h>
36
37 #include <unicode/utypes.h>   /* Basic ICU data types */
38 #include <unicode/uchar.h>    /* char names           */
39
40 #include <unicode/ucol.h> 
41 #include <unicode/ubrk.h>
42 #include <unicode/utrans.h>
43
44 #include <yaz/icu.h>
45
46 /* declared structs and functions */
47
/** \brief Checks an ICU status code (defined outside this header).
    NOTE(review): the meaning of the int result (which value signals
    success) is not visible here -- confirm against the implementation. */
48 int icu_check_status (UErrorCode status);
49
/** \brief UTF-16 buffer: a UChar pointer plus length and capacity fields.
    NOTE(review): the units of utf16_len / utf16_cap (presumably UChar
    code units rather than bytes) are not stated in this header -- confirm. */
50 struct icu_buf_utf16
51 {
52     UChar * utf16;       /* buffer data */
53     int32_t utf16_len;   /* presumably the amount currently in use */
54     int32_t utf16_cap;   /* presumably the allocated capacity */
55 };
56
/** \brief Creates a UTF-16 buffer.
    \param capacity requested capacity (units not stated here -- confirm)
    \returns pointer to the buffer; presumably released with
             icu_buf_utf16_destroy() -- confirm ownership in the
             implementation */
57 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
58
/** \brief Clears a UTF-16 buffer and returns a buffer pointer.
    NOTE(review): whether capacity is kept and whether the result is
    always buf16 is not visible in this header. */
59 struct icu_buf_utf16 * icu_buf_utf16_clear(struct icu_buf_utf16 * buf16);
60
/** \brief Resizes a UTF-16 buffer to the given capacity.
    \param buf16 buffer to resize
    \param capacity requested new capacity
    \returns a buffer pointer (presumably buf16 -- confirm) */
61 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
62                                             size_t capacity);
63
/** \brief Copies src16 into dest16.
    \returns a buffer pointer (presumably dest16 -- confirm)
    NOTE(review): whether dest16 is grown to fit is not visible here. */
64 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
65                                           struct icu_buf_utf16 * src16);
66
/** \brief Destroys a UTF-16 buffer.
    NOTE(review): NULL tolerance is not visible in this header. */
67 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
68
69
70
/** \brief UTF-8 buffer: a byte pointer plus length and capacity fields.
    NOTE(review): whether utf8 is kept NUL-terminated is not visible in
    this header -- confirm before treating it as a C string. */
71 struct icu_buf_utf8
72 {
73     uint8_t * utf8;     /* buffer data */
74     int32_t utf8_len;   /* presumably the amount currently in use */
75     int32_t utf8_cap;   /* presumably the allocated capacity */
76 };
77
/** \brief Creates a UTF-8 buffer.
    \param capacity requested capacity (units not stated here -- confirm)
    \returns pointer to the buffer; presumably released with
             icu_buf_utf8_destroy() -- confirm ownership in the
             implementation */
78 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
79
/** \brief Clears a UTF-8 buffer and returns a buffer pointer.
    NOTE(review): whether capacity is kept is not visible in this header. */
80 struct icu_buf_utf8 * icu_buf_utf8_clear(struct icu_buf_utf8 * buf8);
81
/** \brief Resizes a UTF-8 buffer to the given capacity.
    \param buf8 buffer to resize
    \param capacity requested new capacity
    \returns a buffer pointer (presumably buf8 -- confirm) */
82 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
83                                           size_t capacity);
84
/** \brief Destroys a UTF-8 buffer.
    NOTE(review): NULL tolerance is not visible in this header. */
85 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
86
87
/** \brief Converts the UTF-8 buffer src8 into the UTF-16 buffer dest16.
    \param dest16 destination buffer
    \param src8 source buffer
    \param status ICU status pointer
    \returns a UErrorCode; presumably the same value left in *status --
             confirm against the implementation */
88 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
89                                struct icu_buf_utf8 * src8,
90                                UErrorCode * status);
91
/** \brief Same conversion as icu_utf16_from_utf8(), but the source is
    passed as a const char pointer.
    NOTE(review): src8cstr is presumably a NUL-terminated UTF-8 string
    (per the name) -- confirm. */
92 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
93                                     const char * src8cstr,
94                                     UErrorCode * status);
95
96
/** \brief Converts the UTF-16 buffer src16 into the UTF-8 buffer dest8.
    \param dest8 destination buffer
    \param src16 source buffer
    \param status ICU status pointer
    \returns a UErrorCode; presumably the same value left in *status --
             confirm against the implementation */
97 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
98                              struct icu_buf_utf16 * src16,
99                              UErrorCode * status);
100
/** \brief Case-mapping step object; holds only an action code.
    NOTE(review): the set of valid action characters is defined by the
    implementation and is not visible in this header. */
101 struct icu_casemap
102 {
103     char action;   /* case-mapping action selector */
104 };
105
/** \brief Creates a casemap object for the given action code.
    \param action action selector stored in struct icu_casemap
    \param status ICU status pointer
    NOTE(review): behaviour on an unknown action is not visible here. */
106 struct icu_casemap * icu_casemap_create(char action, UErrorCode *status);
107
/** \brief Destroys a casemap object. */
108 void icu_casemap_destroy(struct icu_casemap * casemap);
109
/** \brief Applies the casemap object's action to src16, writing dest16.
    \param casemap casemap object
    \param dest16 destination buffer
    \param src16 source buffer
    \param status ICU status pointer
    \param locale locale name
    NOTE(review): the meaning of the int result is not visible here. */
110 int icu_casemap_casemap(struct icu_casemap * casemap,
111                         struct icu_buf_utf16 * dest16,
112                         struct icu_buf_utf16 * src16,
113                         UErrorCode *status,
114                         const char *locale);
115
/** \brief Case-maps src16 into dest16 with an explicit action code,
    without a casemap object.
    NOTE(review): the parameter order differs from icu_casemap_casemap()
    (locale and action come before status); the meaning of the int result
    is not visible here. */
116 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
117                       struct icu_buf_utf16 * src16,
118                       const char *locale, char action,
119                       UErrorCode *status);
120
/** \brief Produces a sort key for src16 using the collator coll and
    stores it in the 8-bit buffer dest8 (per the name and signature).
    \param coll ICU collator
    \param dest8 destination buffer for the sort key
    \param src16 source UTF-16 buffer
    \param status ICU status pointer; no value is returned, so this is
                  presumably the only error channel -- confirm */
121 void icu_sortkey8_from_utf16(UCollator *coll,
122                              struct icu_buf_utf8 * dest8, 
123                              struct icu_buf_utf16 * src16,
124                              UErrorCode * status);
125
/** \brief Tokenizer state: an ICU break iterator, a UTF-16 buffer and the
    position/count of the current token. The invariants the fields must
    satisfy are listed in the comment at the end of the struct.
    NOTE(review): valid action codes, and whether buf16 is owned by the
    tokenizer, are not visible in this header. */
126 struct icu_tokenizer
127 {
128     char action;                    /* tokenizer action selector */
129     UBreakIterator* bi;             /* ICU break iterator */
130     struct icu_buf_utf16 * buf16;   /* text buffer; bounds token_end below */
131     int32_t token_count;            /* upper bound for token_id */
132     int32_t token_id;               /* id of the current token */
133     int32_t token_start;            /* start offset of the current token */
134     int32_t token_end;              /* end offset of the current token */
135 /*
136   keep always invariant
137   0 <= token_start 
138   <= token_end 
139   <= buf16->utf16_len
140   and invariant
141   0 <= token_id <= token_count
142 */
143 };
144
/** \brief Creates a tokenizer for the given locale and action code.
    \param locale locale name
    \param action tokenizer action selector
    \param status ICU status pointer */
145 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
146                                             UErrorCode *status);
147
/** \brief Destroys a tokenizer. */
148 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
149
/** \brief Attaches the UTF-16 text src16 to the tokenizer.
    NOTE(review): whether src16 is copied or referenced, and the meaning
    of the int result, are not visible in this header. */
150 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
151                          struct icu_buf_utf16 * src16, UErrorCode *status);
152
/** \brief Advances to the next token and delivers it in tkn16.
    NOTE(review): the meaning of the int32_t result (presumably a token
    length, with 0 when exhausted) is an assumption -- confirm. */
153 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
154                                  struct icu_buf_utf16 * tkn16, 
155                                  UErrorCode *status);
156
/* Accessors for the tokenizer's token state. NOTE(review): presumably
   these read the same-named fields of struct icu_tokenizer (with
   token_length derived from start/end) -- confirm in the implementation. */
157 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
158 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
159 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
160 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
161 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
162
163
164
/** \brief Transform step object wrapping an ICU transliterator.
    NOTE(review): valid action codes are not visible in this header. */
165 struct icu_transform
166 {
167     char action;               /* transform action selector */
168     UParseError parse_error;   /* ICU parse-error record; presumably filled when rules fail to parse */
169     UTransliterator * trans;   /* ICU transliterator handle */
170 };
171
/** \brief Creates a transform object.
    \param id transliterator id
    \param action transform action selector
    \param rules transliteration rules
    \param status ICU status pointer
    NOTE(review): whether id and rules may be NULL, and how the two
    interact, is not visible in this header -- confirm. */
172 struct icu_transform * icu_transform_create(const char *id, char action,
173                                             const char *rules,
174                                             UErrorCode *status);
175
/** \brief Destroys a transform object. */
176 void icu_transform_destroy(struct icu_transform * transform);
177
/** \brief Applies the transform to src16, writing the result to dest16.
    NOTE(review): the meaning of the int result is not visible here. */
178 int icu_transform_trans(struct icu_transform * transform,
179                         struct icu_buf_utf16 * dest16,
180                         struct icu_buf_utf16 * src16,
181                         UErrorCode *status);
182
/** \brief Kind of processing performed by one step of an ICU chain. */
183 enum icu_chain_step_type {
184     ICU_chain_step_type_none,
185     ICU_chain_step_type_display,   /* convert to utf8 display format */
186     ICU_chain_step_type_casemap,   /* apply utf16 casemap */
187     ICU_chain_step_type_transform, /* apply utf16 transform */
188     ICU_chain_step_type_tokenize,  /* apply utf16 tokenization */
189     ICU_chain_step_type_transliterate  /* apply utf16 transliteration */
190 };
191
192
193
/** \brief One step in an ICU chain: a step type, the matching action
    object, an output buffer and a link to another step.
    NOTE(review): the union has no member dedicated to
    ICU_chain_step_type_transliterate; presumably that type uses
    u.transform -- confirm in the implementation. */
194 struct icu_chain_step
195 {
196     /* type and action object */
197     enum icu_chain_step_type type;
198     union {
199         struct icu_casemap * casemap;
200         struct icu_transform * transform;
201         struct icu_tokenizer * tokenizer;  
202     } u;
203     /* temporary post-action utf16 buffer */
204     struct icu_buf_utf16 * buf16;  
205     struct icu_chain_step * previous;   /* link to the previous step */
206     int more_tokens;      /* NOTE(review): presumably a flag -- semantics not visible here */
207     int need_new_token;   /* NOTE(review): presumably a flag -- semantics not visible here */
208 };
209
210
/* Forward declaration; the full definition appears later in this header. */
211 struct icu_chain;
212
/** \brief Creates a chain step of the given type.
    \param chain chain the step is created for
    \param type kind of step
    \param rule step rule, passed as bytes (presumably UTF-8 text whose
                interpretation depends on type -- confirm)
    \param buf16 UTF-16 buffer for the step (ownership not visible here)
    \param status ICU status pointer */
213 struct icu_chain_step * icu_chain_step_create(yaz_icu_chain_t chain,
214                                               enum icu_chain_step_type type,
215                                               const uint8_t * rule,
216                                               struct icu_buf_utf16 * buf16,
217                                               UErrorCode *status);
218
219
/** \brief Destroys a chain step.
    NOTE(review): whether linked previous steps are destroyed as well is
    not visible in this header. */
220 void icu_chain_step_destroy(struct icu_chain_step * step);
221
222
/** \brief ICU processing chain: locale, collator, source text, output
    buffers and the list of steps. */
223 struct icu_chain
224 {
225     char *locale;   /* locale name */
226     int sort;       /* NOTE(review): presumably a flag enabling sort keys -- confirm */
227
228     const char * src8cstr;   /* source text pointer; const, so presumably not owned -- confirm */
229
230     UCollator * coll;   /* ICU collator */
231     
232     /* number of tokens returned so far */
233     int32_t token_count;
234     
235     /* utf8 output buffers */
236     struct icu_buf_utf8 * display8;
237     struct icu_buf_utf8 * norm8;
238     struct icu_buf_utf8 * sort8;
239     
240     /* utf16 source buffer */
241     struct icu_buf_utf16 * src16;
242     
243     /* linked list of chain steps */
244     struct icu_chain_step * steps;
245 };
246
/** \brief Inserts a step of the given type into the chain.
    \param chain chain to modify
    \param type kind of step
    \param rule step rule, passed as bytes (presumably UTF-8 text -- confirm)
    \param status ICU status pointer
    NOTE(review): the insertion position and the result on failure are not
    visible in this header. */
247 struct icu_chain_step * icu_chain_insert_step(yaz_icu_chain_t chain,
248                                               enum icu_chain_step_type type,
249                                               const uint8_t * rule,
250                                               UErrorCode *status);
251
/** \brief Advances the given step of the chain to its next token.
    NOTE(review): the meaning of the int result is not visible here. */
252 int icu_chain_step_next_token(yaz_icu_chain_t chain,
253                               struct icu_chain_step * step,
254                               UErrorCode *status);
255
/** \brief Returns a token number for the chain.
    NOTE(review): presumably the token_count field of struct icu_chain --
    confirm. */
256 int icu_chain_token_number(yaz_icu_chain_t chain);
257
/** \brief Returns a read-only collator pointer for the chain.
    NOTE(review): presumably the coll field of struct icu_chain -- confirm. */
258 const UCollator * icu_chain_get_coll(yaz_icu_chain_t chain);
259
260 #endif /* ICU_I18NL_H */
261
262 /*
263  * Local variables:
264  * c-basic-offset: 4
265  * c-file-style: "Stroustrup"
266  * indent-tabs-mode: nil
267  * End:
268  * vim: shiftwidth=4 tabstop=8 expandtab
269  */
270