Updated source file headers with new year and no CVS Id.
[pazpar2-moved-to-github.git] / src / icu_I18N.h
1 /* This file is part of Pazpar2.
2    Copyright (C) 2006-2008 Index Data
3
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20 #ifndef ICU_I18NL_H
21 #define ICU_I18NL_H
22
23 #include <yaz/nmem.h>
24
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27
28 #include <unicode/utypes.h>   /* Basic ICU data types */
29 #include <unicode/uchar.h>    /* char names           */
30
31 //#include <unicode/ustdio.h>
32 #include <unicode/ucol.h> 
33 //#include <unicode/ucnv.h>     /* C   Converter API    */
34 //#include <unicode/ustring.h>  /* some more string fcns*/
35 //#include <unicode/uloc.h>
36 #include <unicode/ubrk.h>
37 //#include <unicode/unistr.h>
38 #include <unicode/utrans.h>
39
40
41
42 // declared structs and functions
43
// Status helper: takes an ICU UErrorCode by value and returns an int.
// NOTE(review): the meaning of the returned int (success flag vs. error
// flag) is not visible in this header -- confirm against the implementation.
44 int icu_check_status (UErrorCode status);
45
// UTF-16 text buffer: a UChar pointer plus two int32_t bookkeeping fields.
46 struct icu_buf_utf16
47 {
48   UChar * utf16;       // buffer storage
49   int32_t utf16_len;   // presumably number of UChars in use -- TODO confirm
50   int32_t utf16_cap;   // presumably allocated size in UChars -- TODO confirm
51 };
52
// Create / resize / copy / destroy helpers for struct icu_buf_utf16.
// Note that create and resize take a size_t capacity while the struct keeps
// its length and capacity as int32_t.
// NOTE(review): allocation strategy, ownership, and what the returned
// pointers refer to (presumably the resulting or destination buffer) are
// defined in the implementation, not in this header -- confirm there.
53 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
54 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
55                                             size_t capacity);
56 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
57                                           struct icu_buf_utf16 * src16);
58 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
59
60
61
// UTF-8 byte buffer: a uint8_t pointer plus two int32_t bookkeeping fields.
62 struct icu_buf_utf8
63 {
64   uint8_t * utf8;      // buffer storage
65   int32_t utf8_len;    // presumably number of bytes in use -- TODO confirm
66   int32_t utf8_cap;    // presumably allocated size in bytes -- TODO confirm
67 };
68
// Create / resize / destroy helpers for struct icu_buf_utf8.
// Unlike the UTF-16 buffer, no copy function is declared for this type in
// this header. create and resize take a size_t capacity while the struct
// keeps its length and capacity as int32_t.
// NOTE(review): allocation and ownership rules live in the implementation
// -- confirm there.
69 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
70 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
71                                           size_t capacity);
72 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
73
74
// Conversion routines between the UTF-8 and UTF-16 buffer types.
// In each prototype the destination buffer is the first parameter and the
// source is the second. Every routine both returns a UErrorCode and takes a
// UErrorCode pointer.
// NOTE(review): presumably the return value mirrors *status and the
// destination buffer is grown as needed -- neither is visible here; confirm.

// Source is an icu_buf_utf8; destination is an icu_buf_utf16.
75 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
76                                struct icu_buf_utf8 * src8,
77                                UErrorCode * status);
78
// Same destination type, but the source is a const char pointer
// (presumably a NUL-terminated UTF-8 C string, going by the name).
79 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
80                                     const char * src8cstr,
81                                     UErrorCode * status);
82
83
// Reverse direction: source is an icu_buf_utf16, destination an icu_buf_utf8.
84 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
85                              struct icu_buf_utf16 * src16,
86                              UErrorCode * status);
87
// Case-mapping settings: a locale string and a one-character action code.
88 struct icu_casemap
89 {
90   char locale[16];     // locale identifier, fixed 16-byte storage
91   char action;         // action selector; valid values are not defined in
                          // this header -- see the implementation
92 };
93
// Case-mapping API. All functions report ICU errors through the trailing
// UErrorCode pointer.
// NOTE(review): the meaning of the int return values and the set of accepted
// 'action' characters are not visible here -- confirm in the implementation.

// Builds an icu_casemap from a locale string and an action character.
// The struct stores the locale in a 16-byte array, so callers should expect
// long locale strings to be limited by that size (handling not visible here).
94 struct icu_casemap * icu_casemap_create(const char *locale, char action,
95                                             UErrorCode *status);
96
97 void icu_casemap_destroy(struct icu_casemap * casemap);
98
// Takes a casemap object plus destination and source UTF-16 buffers.
99 int icu_casemap_casemap(struct icu_casemap * casemap,
100                         struct icu_buf_utf16 * dest16,
101                         struct icu_buf_utf16 * src16,
102                         UErrorCode *status);
103
// Variant that takes the locale and action directly instead of an
// icu_casemap object.
104 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
105                       struct icu_buf_utf16 * src16,
106                       const char *locale, char action,
107                       UErrorCode *status);
108
// Takes an ICU collator, a destination UTF-8-typed buffer and a source
// UTF-16 buffer; returns a UErrorCode and also takes a UErrorCode pointer.
// NOTE(review): presumably writes the collation sort key for src16 into
// dest8 (sort keys are opaque bytes, not necessarily valid UTF-8) -- confirm
// against the implementation.
109 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
110                                    struct icu_buf_utf8 * dest8, 
111                                    struct icu_buf_utf16 * src16,
112                                    UErrorCode * status);
113
// Tokenizer state: locale/action settings, an ICU break iterator, the
// UTF-16 buffer being tokenized, and counters/offsets for the current token.
114 struct icu_tokenizer
115 {
116   char locale[16];               // locale identifier, fixed 16-byte storage
117   char action;                   // action selector; values not defined here
118   UBreakIterator* bi;            // ICU break iterator handle
119   struct icu_buf_utf16 * buf16;  // text buffer the offsets below refer to
120   int32_t token_count;           // upper bound for token_id (see invariant)
121   int32_t token_id;              // id of the current token
122   int32_t token_start;           // start offset of the current token
123   int32_t token_end;             // end offset of the current token
124   // keep always invariant
125   // 0 <= token_start 
126   //   <= token_end 
127   //   <= buf16->utf16_len
128   // and invariant
129   // 0 <= token_id <= token_count
130 };
131
// Tokenizer API. Functions that can fail report ICU errors through the
// trailing UErrorCode pointer.
// NOTE(review): the meaning of the int / int32_t return values of attach and
// next_token, and the accepted 'action' characters, are not visible in this
// header -- confirm in the implementation.

// Builds a tokenizer from a locale string and an action character.
132 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
133                                             UErrorCode *status);
134
135 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
136
// Takes a tokenizer and a source UTF-16 buffer (presumably makes src16 the
// text to be tokenized; whether it is copied or referenced is not visible).
137 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
138                          struct icu_buf_utf16 * src16, UErrorCode *status);
139
// Takes a tokenizer and a UTF-16 buffer named tkn16 (presumably receives the
// next token's text).
140 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
141                                  struct icu_buf_utf16 * tkn16, 
142                                  UErrorCode *status);
143
// Accessors, each taking only the tokenizer and returning an int32_t; the
// names match the token_id / token_start / token_end / token_count fields of
// struct icu_tokenizer (token_length has no matching field and is presumably
// derived from start and end).
144 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
145 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
146 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
147 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
148 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
149
150
151
// Normalizer state: an action code, the rule text as UTF-16, ICU parse-error
// storage, and an ICU transliterator handle.
152 struct icu_normalizer
153 {
154   char action;                     // action selector; values not defined here
155   struct icu_buf_utf16 * rules16;  // rule text held as a UTF-16 buffer
  // NOTE(review): this is an array of 256 UParseError structs. ICU's
  // transliterator open call takes a pointer to a single UParseError, so the
  // array looks larger than needed -- confirm how the implementation uses it
  // before changing, since shrinking it alters the struct layout.
156   UParseError parse_error[256];
157   UTransliterator * trans;         // ICU transliterator handle
158 };
159
// Normalizer API. ICU errors are reported through the trailing UErrorCode
// pointer.
// NOTE(review): the syntax expected in 'rules', the accepted 'action'
// characters and the meaning of the int return value are not visible in this
// header -- confirm in the implementation.

// Builds a normalizer from a rule string and an action character.
160 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
161                                               UErrorCode *status);
162
163
164 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
165
// Takes a normalizer plus destination and source UTF-16 buffers.
166 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
167                              struct icu_buf_utf16 * dest16,
168                              struct icu_buf_utf16 * src16,
169                              UErrorCode *status);
170
171
// Disabled declaration (wrapped in #if 0, so it is never compiled): a token
// record with an id and three uint8_t pointers named display8, norm8, sort8.
// NOTE(review): the closing brace below has no terminating ';', so this
// would not compile if the #if 0 were removed. Either delete the block or
// add the semicolon before re-enabling it.
172 #if 0
173 struct icu_token
174 {
175   int32_t token_id;
176   uint8_t * display8;
177   uint8_t * norm8;
178   uint8_t * sort8;
179 }
180 #endif
181
182
// Kinds of processing step that can appear in an icu_chain.
// Values are implicit, so ICU_chain_step_type_none is 0 and the rest follow
// in declaration order.
183 enum icu_chain_step_type {
184     ICU_chain_step_type_none,      // no step type (value 0)
185     ICU_chain_step_type_display,   // convert to utf8 display format
186     ICU_chain_step_type_index,     // convert to utf8 index format
187     ICU_chain_step_type_sortkey,   // convert to utf8 sortkey format
188     ICU_chain_step_type_casemap,   // apply utf16 case mapping
189     ICU_chain_step_type_normalize, // apply utf16 normalization
190     ICU_chain_step_type_tokenize   // apply utf16 tokenization
191 };
192
193
194
// One step of a processing chain: a step type, the object that performs the
// step, a UTF-16 buffer for the step's output, and a link to another step.
195 struct icu_chain_step
196 {
197   // type and action object
198   enum icu_chain_step_type type;
  // Only one union member is valid at a time; presumably 'type' selects
  // which one (casemap / normalize / tokenize) -- confirm in the
  // implementation. The enum has more values than the union has members.
199   union {
200     struct icu_casemap * casemap;
201     struct icu_normalizer * normalizer;
202     struct icu_tokenizer * tokenizer;  
203   } u;
204   // temporary post-action utf16 buffer
205   struct icu_buf_utf16 * buf16;  
  // link to another step; the name suggests the step that runs before this
  // one -- TODO confirm traversal order in the implementation
206   struct icu_chain_step * previous;
  // int flags; presumably track whether further tokens are available and
  // whether a fresh token must be fetched -- TODO confirm exact semantics
207   int more_tokens;
208   int need_new_token;
209 };
210
211
// Forward declaration: icu_chain_step_create below takes a struct icu_chain
// pointer before the full definition of struct icu_chain appears.
212 struct icu_chain;
213
// Builds a chain step of the given type for a chain. 'rule' is passed as a
// const uint8_t pointer and 'buf16' as a UTF-16 buffer; ICU errors are
// reported through 'status'.
// NOTE(review): how 'rule' is interpreted for each step type, and who owns
// 'buf16' afterwards, are not visible in this header -- confirm in the
// implementation.
214 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
215                                               enum icu_chain_step_type type,
216                                               const uint8_t * rule,
217                                               struct icu_buf_utf16 * buf16,
218                                               UErrorCode *status);
219
220
// Counterpart to icu_chain_step_create; takes the step to dispose of.
// NOTE(review): whether linked 'previous' steps are also released is not
// visible here -- confirm.
221 void icu_chain_step_destroy(struct icu_chain_step * step);
222
223
// A processing chain: identifying strings, a token counter, three UTF-8
// output buffers, the UTF-16 source buffer, and the list of steps.
224 struct icu_chain
225 {
  // Fixed-size byte storage for the chain identifier and locale. Note that
  // these are uint8_t arrays, whereas struct icu_casemap and
  // struct icu_tokenizer store their locale as char[16].
226   uint8_t identifier[128];
227   uint8_t locale[16];
228
229   // number of tokens returned so far
230   int32_t token_count;
231
232   // utf8 output buffers
233   struct icu_buf_utf8 * display8;
234   struct icu_buf_utf8 * norm8;
235   struct icu_buf_utf8 * sort8;
236
237   // utf16 source buffer
238   struct icu_buf_utf16 * src16;
239
240   // linked list of chain steps
241   struct icu_chain_step * steps;
242 };
243
// Chain API.
// NOTE(review): return-value conventions for the int-returning functions,
// ownership of returned pointers and the lifetime of the strings returned by
// the getters are not visible in this header -- confirm in the
// implementation before relying on them.

// Builds a chain from an identifier and a locale, both passed as
// const uint8_t pointers. The struct stores them in 128- and 16-byte arrays.
// Unlike the other constructors in this header, no UErrorCode is taken.
244 struct icu_chain * icu_chain_create(const uint8_t * identifier, 
245                                     const uint8_t * locale);
246
247 void icu_chain_destroy(struct icu_chain * chain);
248
// Builds a chain from a libxml2 node; ICU errors are reported via 'status'.
// The expected XML element/attribute layout is defined by the implementation.
249 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, 
250                                         UErrorCode * status);
251
252
// Same parameters as icu_chain_step_create minus the UTF-16 buffer;
// presumably creates a step and links it into chain->steps -- confirm.
253 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
254                                               enum icu_chain_step_type type,
255                                               const uint8_t * rule,
256                                               UErrorCode *status);
257
258
// Per-step token advance; takes both the chain and one of its steps.
259 int icu_chain_step_next_token(struct icu_chain * chain,
260                               struct icu_chain_step * step,
261                               UErrorCode *status);
262
// Takes a chain and a const char pointer (presumably a NUL-terminated UTF-8
// string that becomes the chain's input text -- confirm).
263 int icu_chain_assign_cstr(struct icu_chain * chain,
264                           const char * src8cstr, 
265                           UErrorCode *status);
266
// Chain-level token advance; takes only the chain.
267 int icu_chain_next_token(struct icu_chain * chain,
268                          UErrorCode *status);
269
// Returns an int, although the struct's token_count field is int32_t.
270 int icu_chain_get_token_count(struct icu_chain * chain);
271
// Getters returning const char pointers; the names match the display8,
// norm8 and sort8 buffers of struct icu_chain, whose storage is uint8_t.
272 const char * icu_chain_get_display(struct icu_chain * chain);
273
274 const char * icu_chain_get_norm(struct icu_chain * chain);
275
276 const char * icu_chain_get_sort(struct icu_chain * chain);
277
278
279
280
281
282 #endif // ICU_I18NL_H