Added ICU chain component - which used to be part of Pazpar2.
[yaz-moved-to-github.git] / include / yaz / icu_I18N.h
1 /* $Id: icu_I18N.h,v 1.1 2007-10-22 12:21:39 adam Exp $
2    Copyright (c) 2006-2007, Index Data.
3
4    This file is part of Pazpar2.
5
6    Pazpar2 is free software; you can redistribute it and/or modify it under
7    the terms of the GNU General Public License as published by the Free
8    Software Foundation; either version 2, or (at your option) any later
9    version.
10
11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14    for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with Pazpar2; see the file LICENSE.  If not, write to the
18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19    02111-1307, USA.
20 */
21
22 #ifndef ICU_I18NL_H
23 #define ICU_I18NL_H
24
25 #include <yaz/nmem.h>
26
27 #include <libxml/parser.h>
28 #include <libxml/tree.h>
29
30 #include <unicode/utypes.h>   /* Basic ICU data types */
31 #include <unicode/uchar.h>    /* char names           */
32
33 //#include <unicode/ustdio.h>
34 #include <unicode/ucol.h> 
35 //#include <unicode/ucnv.h>     /* C   Converter API    */
36 //#include <unicode/ustring.h>  /* some more string fcns*/
37 //#include <unicode/uloc.h>
38 #include <unicode/ubrk.h>
39 //#include <unicode/unistr.h>
40 #include <unicode/utrans.h>
41
42
43
44 // declared structs and functions
45
// Inspects an ICU status code and reports the outcome as an int.
// NOTE(review): which return value means "ok" is defined by the
// implementation, which is not visible in this header — confirm.
46 int icu_check_status (UErrorCode status);
47
// UTF-16 buffer descriptor: a UChar pointer plus two int32_t bookkeeping
// fields.
// NOTE(review): the names suggest utf16_len is the number of UChar units in
// use and utf16_cap the number allocated — confirm against the implementation.
48 struct icu_buf_utf16
49 {
50   UChar * utf16;
51   int32_t utf16_len;
52   int32_t utf16_cap;
53 };
54
// Lifecycle helpers for struct icu_buf_utf16.
// Capacities are passed as size_t although the struct stores its length and
// capacity as int32_t.
// NOTE(review): create/resize/copy each return a buffer pointer; whether it
// is the same object as the argument, and what is returned on failure, is
// not visible here — confirm.
55 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
56 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
57                                             size_t capacity);
58 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
59                                           struct icu_buf_utf16 * src16);
// Counterpart of icu_buf_utf16_create.
60 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
61
62
63
// UTF-8 buffer descriptor: a uint8_t pointer plus two int32_t bookkeeping
// fields.
// NOTE(review): the names suggest utf8_len is the number of bytes in use and
// utf8_cap the number allocated — confirm against the implementation.
64 struct icu_buf_utf8
65 {
66   uint8_t * utf8;
67   int32_t utf8_len;
68   int32_t utf8_cap;
69 };
70
// Lifecycle helpers for struct icu_buf_utf8.
// Capacities are passed as size_t although the struct stores its length and
// capacity as int32_t.
// NOTE(review): whether resize returns the same object as its argument, and
// what is returned on failure, is not visible here — confirm.
71 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
72 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
73                                           size_t capacity);
// Counterpart of icu_buf_utf8_create.
74 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
75
76
// Conversions between the UTF-8 and UTF-16 buffer types.
// In all three functions the destination buffer is the first parameter and
// the status is both returned by value and passed as a UErrorCode pointer.
// NOTE(review): presumably the return value mirrors *status — confirm.

// UTF-8 buffer -> UTF-16 buffer.
77 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
78                                struct icu_buf_utf8 * src8,
79                                UErrorCode * status);
80
// C string (const char *) -> UTF-16 buffer.
// NOTE(review): presumably src8cstr is NUL-terminated UTF-8 — confirm.
81 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
82                                     const char * src8cstr,
83                                     UErrorCode * status);
84
85
// UTF-16 buffer -> UTF-8 buffer.
86 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
87                              struct icu_buf_utf16 * src16,
88                              UErrorCode * status);
89
// Case-mapping configuration: a locale name held in a fixed 16-byte char
// array and a single-character action selector.
// NOTE(review): the valid action characters are defined by the
// implementation, which is not visible in this header.
90 struct icu_casemap
91 {
92   char locale[16];
93   char action;
94 };
95
// Case-mapping API. Every function except destroy takes a UErrorCode
// pointer as an in/out status argument.

// Builds a struct icu_casemap from a locale string and an action character.
// NOTE(review): struct icu_casemap stores the locale in char[16]; how longer
// locale strings are handled is not visible here — confirm.
96 struct icu_casemap * icu_casemap_create(const char *locale, char action,
97                                             UErrorCode *status);
98
// Counterpart of icu_casemap_create.
99 void icu_casemap_destroy(struct icu_casemap * casemap);
100
// Applies the given casemap object, reading src16 and writing dest16.
// NOTE(review): meaning of the int return value is not visible here.
101 int icu_casemap_casemap(struct icu_casemap * casemap,
102                         struct icu_buf_utf16 * dest16,
103                         struct icu_buf_utf16 * src16,
104                         UErrorCode *status);
105
// Same operation, but takes locale and action directly instead of a
// struct icu_casemap.
106 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
107                       struct icu_buf_utf16 * src16,
108                       const char *locale, char action,
109                       UErrorCode *status);
110
// Derives a sort key from a UTF-16 buffer using the supplied ICU collator
// and stores it in a UTF-8 buffer object (dest8).
// The status is both returned by value and passed as a UErrorCode pointer.
// NOTE(review): presumably dest8 then holds raw collation key bytes rather
// than readable UTF-8 text — confirm against the implementation.
111 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
112                                    struct icu_buf_utf8 * dest8, 
113                                    struct icu_buf_utf16 * src16,
114                                    UErrorCode * status);
115
// Tokenizer state: locale and action selector, an ICU break iterator, the
// UTF-16 buffer being tokenized, and counters/offsets for the current token.
// NOTE(review): whether the tokenizer owns buf16 or merely borrows it is
// not visible in this header — confirm.
116 struct icu_tokenizer
117 {
118   char locale[16];
119   char action;
120   UBreakIterator* bi;
121   struct icu_buf_utf16 * buf16;
122   int32_t token_count;
123   int32_t token_id;
124   int32_t token_start;
125   int32_t token_end;
126   // invariants that must always hold:
127   //   0 <= token_start
128   //     <= token_end
129   //     <= buf16->utf16_len
130   // and
131   //   0 <= token_id <= token_count
132 };
133
// Tokenizer API.

// Builds a tokenizer from a locale string and an action character.
// NOTE(review): the valid action characters are defined by the
// implementation, which is not visible in this header.
134 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
135                                             UErrorCode *status);
136
// Counterpart of icu_tokenizer_create.
137 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
138
// Associates a UTF-16 source buffer with the tokenizer.
// NOTE(review): whether src16 is copied or referenced is not visible here.
139 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
140                          struct icu_buf_utf16 * src16, UErrorCode *status);
141
// Advances to the next token; tkn16 is the caller-supplied UTF-16 buffer
// for the token.
// NOTE(review): meaning of the int32_t return value (length? 0 at end?) is
// not visible here — confirm.
142 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
143                                  struct icu_buf_utf16 * tkn16, 
144                                  UErrorCode *status);
145
// Accessors named after the token_id / token_start / token_end /
// token_count fields of struct icu_tokenizer, plus a token length.
// NOTE(review): token_length is presumably token_end - token_start — confirm.
146 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
147 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
148 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
149 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
150 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
151
152
153
// Normalizer state: an action selector, the rule text as a UTF-16 buffer,
// ICU parse-error storage, and an ICU transliterator handle.
// NOTE(review): parse_error is declared as an array of 256 UParseError
// structs. ICU's rule-parsing entry points take a pointer to a single
// UParseError, so one element may have been intended. Do not change the
// declaration without checking how the implementation passes it.
154 struct icu_normalizer
155 {
156   char action;
157   struct icu_buf_utf16 * rules16;
158   UParseError parse_error[256];
159   UTransliterator * trans;
160 };
161
// Normalizer API.

// Builds a normalizer from a rule string and an action character.
// NOTE(review): presumably the rules are ICU transliterator rules/IDs
// (struct icu_normalizer holds a UTransliterator) — confirm.
162 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
163                                               UErrorCode *status);
164
165
// Counterpart of icu_normalizer_create.
166 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
167
// Applies the normalizer, reading src16 and writing dest16.
// NOTE(review): meaning of the int return value is not visible here.
168 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
169                              struct icu_buf_utf16 * dest16,
170                              struct icu_buf_utf16 * src16,
171                              UErrorCode *status);
172
173
// Disabled (compiled out by #if 0) sketch of a per-token record holding a
// token id and three uint8_t string pointers.
// NOTE(review): the closing brace is not followed by ';', so this block
// would not compile if the #if 0 guard were removed.
174 #if 0
175 struct icu_token
176 {
177   int32_t token_id;
178   uint8_t * display8;
179   uint8_t * norm8;
180   uint8_t * sort8;
181 }
182 #endif
183
184
// Kinds of step that can appear in an ICU chain. No explicit values are
// given, so the enumerators number 0..6 in declaration order.
185 enum icu_chain_step_type {
186     ICU_chain_step_type_none,      // 
187     ICU_chain_step_type_display,   // convert to utf8 display format
188     ICU_chain_step_type_index,     // convert to utf8 index format
189     ICU_chain_step_type_sortkey,   // convert to utf8 sortkey format
190     ICU_chain_step_type_casemap,   // apply utf16 case mapping
191     ICU_chain_step_type_normalize, // apply utf16 normalization
192     ICU_chain_step_type_tokenize   // apply utf16 tokenization
193 };
194
195
196
// One step of an ICU chain: a type tag, the matching action object, a
// UTF-16 buffer, a link to another step, and two int flags.
// The union only has members for the casemap, normalize and tokenize step
// types; the display, index and sortkey types have no action object here.
// NOTE(review): more_tokens and need_new_token are presumably boolean flags
// used while pulling tokens through the chain — confirm.
197 struct icu_chain_step
198 {
199   // type and action object
200   enum icu_chain_step_type type;
201   union {
202     struct icu_casemap * casemap;
203     struct icu_normalizer * normalizer;
204     struct icu_tokenizer * tokenizer;  
205   } u;
206   // temporary post-action utf16 buffer
207   struct icu_buf_utf16 * buf16;  
208   struct icu_chain_step * previous;
209   int more_tokens;
210   int need_new_token;
211 };
212
213
// Forward declaration so the step functions can take a chain pointer before
// struct icu_chain is defined.
214 struct icu_chain;
215
// Builds a chain step of the given type from a rule string and a UTF-16
// buffer.
// NOTE(review): how `rule` is interpreted for each step type, and whether
// the step takes ownership of buf16, is not visible here — confirm.
216 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
217                                               enum icu_chain_step_type type,
218                                               const uint8_t * rule,
219                                               struct icu_buf_utf16 * buf16,
220                                               UErrorCode *status);
221
222
// Counterpart of icu_chain_step_create.
// NOTE(review): whether this also destroys steps reachable through
// `previous` is not visible here — confirm.
223 void icu_chain_step_destroy(struct icu_chain_step * step);
224
225
// A processing chain: fixed-size identifier (128 bytes) and locale
// (16 bytes) arrays, a token counter, three UTF-8 output buffers, a UTF-16
// source buffer, and the list of steps.
// Note that locale is uint8_t[16] here, whereas struct icu_casemap and
// struct icu_tokenizer declare their locale as char[16].
226 struct icu_chain
227 {
228   uint8_t identifier[128];
229   uint8_t locale[16];
230
231   // number of tokens returned so far
232   int32_t token_count;
233
234   // utf8 output buffers
235   struct icu_buf_utf8 * display8;
236   struct icu_buf_utf8 * norm8;
237   struct icu_buf_utf8 * sort8;
238
239   // utf16 source buffer
240   struct icu_buf_utf16 * src16;
241
242   // linked list of chain steps
243   struct icu_chain_step * steps;
244 };
245
// Chain API.

// Builds a chain from an identifier and a locale string.
// NOTE(review): struct icu_chain stores these in 128- and 16-byte arrays;
// how longer inputs are handled is not visible here — confirm.
246 struct icu_chain * icu_chain_create(const uint8_t * identifier, 
247                                     const uint8_t * locale);
248
// Counterpart of icu_chain_create / icu_chain_xml_config.
249 void icu_chain_destroy(struct icu_chain * chain);
250
// Builds a chain from a libxml2 node.
// NOTE(review): the expected XML element and attribute names are defined by
// the implementation, which is not visible in this header.
251 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, 
252                                         UErrorCode * status);
253
254
// Adds a step of the given type, configured by `rule`, to the chain and
// returns the step.
// NOTE(review): insertion position within chain->steps is not visible here.
255 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
256                                               enum icu_chain_step_type type,
257                                               const uint8_t * rule,
258                                               UErrorCode *status);
259
260
// Advances a single step of the chain.
// NOTE(review): meaning of the int return value is not visible here.
261 int icu_chain_step_next_token(struct icu_chain * chain,
262                               struct icu_chain_step * step,
263                               UErrorCode *status);
264
// Supplies the chain with a new source string given as a C string.
// NOTE(review): presumably NUL-terminated UTF-8 that is converted into
// chain->src16 — confirm.
265 int icu_chain_assign_cstr(struct icu_chain * chain,
266                           const char * src8cstr, 
267                           UErrorCode *status);
268
// Advances the whole chain to its next token.
// NOTE(review): presumably returns 0 when no tokens remain — confirm.
269 int icu_chain_next_token(struct icu_chain * chain,
270                          UErrorCode *status);
271
// NOTE(review): presumably returns chain->token_count — confirm.
272 int icu_chain_get_token_count(struct icu_chain * chain);
273
// String accessors named after the display8 / norm8 / sort8 output buffers
// of struct icu_chain. They return const char *, whereas those buffers
// store uint8_t *.
// NOTE(review): lifetime of the returned pointers (e.g. whether they are
// invalidated by the next icu_chain_next_token call) is not visible here.
274 const char * icu_chain_get_display(struct icu_chain * chain);
275
276 const char * icu_chain_get_norm(struct icu_chain * chain);
277
278 const char * icu_chain_get_sort(struct icu_chain * chain);
279
280
281
282
283
284 #endif // ICU_I18NL_H