First ICU chain integration in relevance ranking of pazpar2.
[pazpar2-moved-to-github.git] / src / icu_I18N.h
1 /* $Id: icu_I18N.h,v 1.16 2007-05-21 10:14:08 marc Exp $
2    Copyright (c) 2006-2007, Index Data.
3
4    This file is part of Pazpar2.
5
6    Pazpar2 is free software; you can redistribute it and/or modify it under
7    the terms of the GNU General Public License as published by the Free
8    Software Foundation; either version 2, or (at your option) any later
9    version.
10
11    Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12    WARRANTY; without even the implied warranty of MERCHANTABILITY or
13    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14    for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with Pazpar2; see the file LICENSE.  If not, write to the
18    Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19    02111-1307, USA.
20 */
21
22 #ifndef ICU_I18NL_H
23 #define ICU_I18NL_H
24
25 #ifdef HAVE_ICU
26
27 #include <yaz/nmem.h>
28
29 #include <libxml/parser.h>
30 #include <libxml/tree.h>
31
32 #include <unicode/utypes.h>   /* Basic ICU data types */
33 #include <unicode/uchar.h>    /* char names           */
34
35 //#include <unicode/ustdio.h>
36 #include <unicode/ucol.h> 
37 //#include <unicode/ucnv.h>     /* C   Converter API    */
38 //#include <unicode/ustring.h>  /* some more string fcns*/
39 //#include <unicode/uloc.h>
40 #include <unicode/ubrk.h>
41 //#include <unicode/unistr.h>
42 #include <unicode/utrans.h>
43
44
45
46 // declared structs and functions
47
// Examine an ICU status code, passed by value, and return an int.
// NOTE(review): the meaning of the result is defined in the implementation,
// which is not part of this header; presumably it reports whether `status`
// denotes a failure - confirm before relying on it.
48 int icu_check_status (UErrorCode status);
49
// UTF-16 text buffer: a UChar pointer plus two int32_t bookkeeping fields.
50 struct icu_buf_utf16
51 {
52   UChar * utf16;      // UTF-16 code unit storage
53   int32_t utf16_len;  // upper bound for the tokenizer offsets (see the
                        // invariant in struct icu_tokenizer); presumably the
                        // number of UChar units in use - confirm
54   int32_t utf16_cap;  // presumably the allocated capacity in UChar units
                        // - confirm
55 };
56
// Create / resize / copy / destroy helpers for struct icu_buf_utf16.
// create and resize take the requested capacity as size_t, whereas the struct
// records its capacity as int32_t.
// NOTE(review): the unit of `capacity`, ownership of the returned pointers and
// the result on failure are decided by the implementation - confirm there.
57 struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity);
58 struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 * buf16,
59                                             size_t capacity);
60 struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 * dest16,
61                                           struct icu_buf_utf16 * src16);
62 void icu_buf_utf16_destroy(struct icu_buf_utf16 * buf16);
63
64
65
// Byte buffer counterpart of struct icu_buf_utf16: a uint8_t pointer plus two
// int32_t bookkeeping fields.
66 struct icu_buf_utf8
67 {
68   uint8_t * utf8;     // byte storage
69   int32_t utf8_len;   // presumably the number of bytes in use - confirm
70   int32_t utf8_cap;   // presumably the allocated capacity in bytes - confirm
71 };
72
// Create / resize / destroy helpers for struct icu_buf_utf8.
// Unlike the UTF-16 buffer, no copy function is declared for this type.
// NOTE(review): the unit of `capacity`, ownership of the returned pointers and
// the result on failure are decided by the implementation - confirm there.
73 struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity);
74 struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8,
75                                           size_t capacity);
76 void icu_buf_utf8_destroy(struct icu_buf_utf8 * buf8);
77
78
// Conversions between the UTF-8 and UTF-16 buffer types. The destination is
// always the first parameter. Each function receives a UErrorCode pointer and
// additionally returns a UErrorCode by value.
// NOTE(review): presumably the return value mirrors *status and the
// destination buffer is grown as needed - confirm in the implementation.
79 UErrorCode icu_utf16_from_utf8(struct icu_buf_utf16 * dest16,
80                                struct icu_buf_utf8 * src8,
81                                UErrorCode * status);
82
// Same direction as above, but the source is a plain C string.
83 UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 * dest16,
84                                     const char * src8cstr,
85                                     UErrorCode * status);
86
87
// Reverse direction: UTF-16 buffer into UTF-8 buffer.
88 UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 * dest8,
89                              struct icu_buf_utf16 * src16,
90                              UErrorCode * status);
91
// Case-mapping configuration: a locale name and a one-character action code.
92 struct icu_casemap
93 {
94   char locale[16];  // fixed-size locale name storage (16 bytes in total)
95   char action;      // action selector; the accepted values are not visible
                      // in this header - see the implementation
96 };
97
// Build a struct icu_casemap from a locale name and an action code.
// NOTE(review): struct icu_casemap stores the locale in a 16-byte array;
// how longer locale strings are handled is decided by the implementation.
98 struct icu_casemap * icu_casemap_create(const char *locale, char action,
99                                             UErrorCode *status);
100
// Release a struct icu_casemap.
101 void icu_casemap_destroy(struct icu_casemap * casemap);
102
// Case-map src16 into dest16 using the settings held in `casemap`.
// NOTE(review): the meaning of the int result is not visible here - confirm.
103 int icu_casemap_casemap(struct icu_casemap * casemap,
104                         struct icu_buf_utf16 * dest16,
105                         struct icu_buf_utf16 * src16,
106                         UErrorCode *status);
107
// Variant taking the locale and action directly instead of a
// struct icu_casemap.
108 int icu_utf16_casemap(struct icu_buf_utf16 * dest16,
109                       struct icu_buf_utf16 * src16,
110                       const char *locale, char action,
111                       UErrorCode *status);
112
// Takes an ICU collator, a byte destination buffer and a UTF-16 source buffer.
// NOTE(review): presumably stores the collation sort key of src16 in dest8 and
// returns the same code that is left in *status - confirm in the
// implementation.
113 UErrorCode icu_sortkey8_from_utf16(UCollator *coll,
114                                    struct icu_buf_utf8 * dest8, 
115                                    struct icu_buf_utf16 * src16,
116                                    UErrorCode * status);
117
// Tokenizer state: configuration (locale, action), an ICU break iterator,
// the UTF-16 buffer being tokenized and the current token bookkeeping.
118 struct icu_tokenizer
119 {
120   char locale[16];           // fixed-size locale name storage
121   char action;               // action selector; accepted values are not
                                // visible in this header
122   UBreakIterator* bi;        // ICU break iterator handle
123   struct icu_buf_utf16 * buf16;  // buffer the token offsets refer to
124   int32_t token_count;       // upper bound for token_id (see below)
125   int32_t token_id;
126   int32_t token_start;
127   int32_t token_end;
128   // The following invariants must hold at all times:
129   //   0 <= token_start 
130   //     <= token_end 
131   //     <= buf16->utf16_len
132   // and
133   //   0 <= token_id <= token_count
134 };
135
// Build a struct icu_tokenizer from a locale name and an action code.
136 struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action,
137                                             UErrorCode *status);
138
// Release a struct icu_tokenizer.
139 void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer);
140
// Hand a UTF-16 source buffer to the tokenizer.
// NOTE(review): whether src16 is copied or merely referenced, and the meaning
// of the int result, are decided by the implementation - confirm there.
141 int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, 
142                          struct icu_buf_utf16 * src16, UErrorCode *status);
143
// Advance to the next token; tkn16 is the caller-supplied token buffer.
// NOTE(review): the meaning of the int32_t result (e.g. token length, or 0 at
// the end of input) is not visible here - confirm.
144 int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, 
145                                  struct icu_buf_utf16 * tkn16, 
146                                  UErrorCode *status);
147
// Accessors named after the token bookkeeping fields of struct icu_tokenizer.
// NOTE(review): token_length has no matching field; presumably it is
// token_end - token_start - confirm.
148 int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer);
149 int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer);
150 int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer);
151 int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer);
152 int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer);
153
154
155
// Normalizer state: an action code, the rule text as UTF-16, parse error
// storage and an ICU transliterator handle.
156 struct icu_normalizer
157 {
158   char action;                     // action selector; accepted values are
                                      // not visible in this header
159   struct icu_buf_utf16 * rules16;  // rule text held as a UTF-16 buffer
      // NOTE(review): this is an array of 256 UParseError structs. ICU's
      // transliterator open call reports into a single UParseError, so one
      // element is presumably enough - confirm against the implementation
      // before changing the layout.
160   UParseError parse_error[256];
161   UTransliterator * trans;         // ICU transliterator handle
162 };
163
// Build a struct icu_normalizer from a rule string and an action code.
// NOTE(review): the rule syntax and the result on failure are decided by the
// implementation - confirm there.
164 struct icu_normalizer * icu_normalizer_create(const char *rules, char action,
165                                               UErrorCode *status);
166
167
// Release a struct icu_normalizer.
168 void icu_normalizer_destroy(struct icu_normalizer * normalizer);
169
// Normalize src16 into dest16 using the given normalizer.
// NOTE(review): the meaning of the int result is not visible here - confirm.
170 int icu_normalizer_normalize(struct icu_normalizer * normalizer,
171                              struct icu_buf_utf16 * dest16,
172                              struct icu_buf_utf16 * src16,
173                              UErrorCode *status);
174
175
// Disabled draft of a per-token record: a token id plus three byte-string
// pointers named after the display / norm / sort outputs of struct icu_chain.
// The closing brace now carries the terminating semicolon that the struct
// definition needs, so that enabling this block does not break the
// declaration that follows it.
176 #if 0
177 struct icu_token
178 {
179   int32_t token_id;
180   uint8_t * display8;
181   uint8_t * norm8;
182   uint8_t * sort8;
183 };
184 #endif
185
186
// Kinds of step that can appear in an ICU chain (see struct icu_chain_step).
187 enum icu_chain_step_type {
188     ICU_chain_step_type_none,      // no step type
189     ICU_chain_step_type_display,   // convert to utf8 display format 
190     ICU_chain_step_type_norm,      // convert to utf8 norm format 
191     ICU_chain_step_type_sort,      // convert to utf8 sort format 
192     ICU_chain_step_type_casemap,   // apply utf16 casemap
193     ICU_chain_step_type_normalize, // apply utf16 normalization
194     ICU_chain_step_type_tokenize   // apply utf16 tokenization 
195 };
196
197
198
// One step of an ICU chain: its type, the matching action object, an output
// buffer and the link to another step.
199 struct icu_chain_step
200 {
201   // type and action object
202   enum icu_chain_step_type type;
      // One member per action kind; presumably `type` selects which member is
      // valid - confirm in the implementation.
203   union {
204     struct icu_casemap * casemap;
205     struct icu_normalizer * normalizer;
206     struct icu_tokenizer * tokenizer;  
207   } u;
208   // temporary post-action utf16 buffer
209   struct icu_buf_utf16 * buf16;  
      // link to another step; struct icu_chain describes its `steps` member
      // as a linked list of chain steps
210   struct icu_chain_step * previous;
      // NOTE(review): the exact semantics of the two flags below are defined
      // by the implementation - confirm there.
211   int more_tokens;
212   int need_new_token;
213 };
214
215
// Forward declaration: icu_chain_step_create takes a struct icu_chain pointer
// before that struct is defined further down in this header.
216 struct icu_chain;
217
// Build a chain step of the given type for `chain`, from a rule string and a
// UTF-16 buffer.
// NOTE(review): how `rule` is interpreted per step type, and who owns `buf16`
// afterwards, is decided by the implementation - confirm there.
218 struct icu_chain_step * icu_chain_step_create(struct icu_chain * chain,
219                                               enum icu_chain_step_type type,
220                                               const uint8_t * rule,
221                                               struct icu_buf_utf16 * buf16,
222                                               UErrorCode *status);
223
224
// Release a chain step.
// NOTE(review): whether this also releases the steps reachable through
// `previous` is not visible here - confirm.
225 void icu_chain_step_destroy(struct icu_chain_step * step);
226
227
// An ICU processing chain: identification, a running token counter, the three
// UTF-8 output buffers, the UTF-16 source buffer and the list of steps.
228 struct icu_chain
229 {
230   uint8_t identifier[128];  // fixed-size identifier storage
231   uint8_t locale[16];       // fixed-size locale name storage
232
233   // number of tokens returned so far
234   int32_t token_count;
235
236   // utf8 output buffers
237   struct icu_buf_utf8 * display8;
238   struct icu_buf_utf8 * norm8;
239   struct icu_buf_utf8 * sort8;
240
241   // utf16 source buffer
242   struct icu_buf_utf16 * src16;
243
244   // linked list of chain steps
245   struct icu_chain_step * steps;
246 };
247
// Build a chain from an identifier and a locale name.
// NOTE(review): struct icu_chain stores these in 128-byte and 16-byte arrays;
// how longer inputs are handled is decided by the implementation.
248 struct icu_chain * icu_chain_create(const uint8_t * identifier, 
249                                     const uint8_t * locale);
250
// Release a chain.
251 void icu_chain_destroy(struct icu_chain * chain);
252
// Build a chain from a libxml2 node.
// NOTE(review): the expected XML layout is not visible in this header - see
// the implementation.
253 struct icu_chain * icu_chain_xml_config(xmlNode *xml_node, 
254                                         UErrorCode * status);
255
256
// Add a step of the given type, configured by `rule`, to `chain`.
// Unlike icu_chain_step_create, no UTF-16 buffer is passed in.
257 struct icu_chain_step * icu_chain_insert_step(struct icu_chain * chain,
258                                               enum icu_chain_step_type type,
259                                               const uint8_t * rule,
260                                               UErrorCode *status);
261
262
// Token advance for a single step of `chain`.
// NOTE(review): the meaning of the int result is not visible here - confirm.
263 int icu_chain_step_next_token(struct icu_chain * chain,
264                               struct icu_chain_step * step,
265                               UErrorCode *status);
266
// Give the chain a new source text as a C string.
// NOTE(review): presumably the text is UTF-8 and is converted into the
// chain's src16 buffer - confirm.
267 int icu_chain_assign_cstr(struct icu_chain * chain,
268                           const char * src8cstr, 
269                           UErrorCode *status);
270
// Token advance for the whole chain.
// NOTE(review): the meaning of the int result is not visible here - confirm.
271 int icu_chain_next_token(struct icu_chain * chain,
272                          UErrorCode *status);
273
// Accessor for the chain's token counter (struct icu_chain documents
// token_count as the number of tokens returned so far).
274 int icu_chain_get_token_count(struct icu_chain * chain);
275
// Accessors named after the chain's display8 / norm8 / sort8 output buffers.
// NOTE(review): the lifetime of the returned strings is not visible here;
// presumably they point into the chain's own buffers - confirm.
276 const char * icu_chain_get_display(struct icu_chain * chain);
277
278 const char * icu_chain_get_norm(struct icu_chain * chain);
279
280 const char * icu_chain_get_sort(struct icu_chain * chain);
281
282
283
284
285
286 #endif // HAVE_ICU
287 #endif // ICU_I18NL_H