Added safe cast to prevent warning
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.20 2006-04-19 23:46:15 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  */
16
17 #if HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/yaz-util.h>
33
/*
 * MARC-8 decoding tables, one function per character-set page.
 * These functions are defined outside this file.  In this file they are
 * called with a pointer to the input bytes and the number of bytes
 * available; they store the number of bytes consumed in *no_read, set
 * *combining for combining characters and return the converted character
 * (0 is treated by the callers as "no match").
 * NOTE(review): exact contract lives with the generated tables - confirm
 * there before relying on details not visible here.
 */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);


/*
 * Reverse tables: lookup_marc8() feeds them the UTF-8 encoding of a
 * character and uses the non-zero return value as the MARC-8 value for
 * the corresponding page.  Same parameter layout as the tables above.
 */
unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
72
/*
 * State of one conversion descriptor (the object behind yaz_iconv_t).
 * It holds the selected reader/writer callbacks plus the per-encoding
 * state those callbacks keep between calls.
 */
struct yaz_iconv_struct {
    int my_errno;   /* last YAZ_ICONV_* code stored by a handler */
    int init_flag;  /* set to 1 by yaz_iconv_open() */
    /* optional start-of-input hook (the UTF-8 reader uses it for the BOM) */
    size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                          size_t inbytesleft, size_t *no_read);
    /* decodes one character; stores bytes consumed in *no_read
       (0 signals failure, with my_errno set) */
    unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                                 size_t inbytesleft, size_t *no_read);
    /* encodes one character x; `last` is non-zero when pending state
       must be flushed */
    size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                           char **outbuf, size_t *outbytesleft,
                           int last);
    int marc8_esc_mode; /* final byte of the latest MARC-8 escape; picks
                           the decoding table ('B' initially) */

    /* queue of combining characters collected by yaz_read_marc8() */
    int comb_offset;            /* next queue entry to hand out */
    int comb_size;              /* number of queued entries */
    unsigned long comb_x[8];    /* queued characters */
    size_t comb_no_read[8];     /* input bytes each queued entry used */
    /* NOTE(review): no_read_x and unget_x are not referenced in the part
       of the file reviewed here - presumably used by yaz_iconv(); confirm */
    size_t no_read_x;
    unsigned long unget_x;
#if HAVE_ICONV_H
    iconv_t iconv_cd;   /* system iconv descriptor, 0 when not in use */
#endif
    unsigned long compose_char; /* character held back by the ISO-8859-1
                                   writer awaiting a combining mark */

    /* state of the MARC-8 writer */
    unsigned long write_marc8_comb_ch[8]; /* pending combining characters */
    size_t write_marc8_comb_no;           /* number of pending entries */
    unsigned long write_marc8_last;       /* pending base character, 0=none */
    const char *write_marc8_page_chr;     /* escape sequence of the page
                                             currently selected in output */
};
101
102 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
103                                          size_t inbytesleft, size_t *no_read)
104 {
105     unsigned long x = inp[0];
106     *no_read = 1;
107     return x;
108 }
109
110 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
111                              size_t inbytesleft, size_t *no_read)
112 {
113     if (inp[0] != 0xef)
114     {
115         *no_read = 0;
116         return 0;
117     }
118     if (inbytesleft < 3)
119     {
120         cd->my_errno = YAZ_ICONV_EINVAL;
121         return (size_t) -1;
122     }
123     if (inp[1] != 0xbb && inp[2] == 0xbf)
124         *no_read = 3;
125     else
126         *no_read = 0;
127     return 0;
128 }
129
130 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
131                                     size_t inbytesleft, size_t *no_read)
132 {
133     unsigned long x = 0;
134
135     if (inp[0] <= 0x7f)
136     {
137         x = inp[0];
138         *no_read = 1;
139     }
140     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
141     {
142         *no_read = 0;
143         cd->my_errno = YAZ_ICONV_EILSEQ;
144     }
145     else if (inp[0] <= 0xdf && inbytesleft >= 2)
146     {
147         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
148         if (x >= 0x80)
149             *no_read = 2;
150         else
151         {
152             *no_read = 0;
153             cd->my_errno = YAZ_ICONV_EILSEQ;
154         }
155     }
156     else if (inp[0] <= 0xef && inbytesleft >= 3)
157     {
158         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
159             (inp[2] & 0x3f);
160         if (x >= 0x800)
161             *no_read = 3;
162         else
163         {
164             *no_read = 0;
165             cd->my_errno = YAZ_ICONV_EILSEQ;
166         }
167     }
168     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
169     {
170         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
171             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
172         if (x >= 0x10000)
173             *no_read = 4;
174         else
175         {
176             *no_read = 0;
177             cd->my_errno = YAZ_ICONV_EILSEQ;
178         }
179     }
180     else if (inp[0] <= 0xfb && inbytesleft >= 5)
181     {
182         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
183             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
184             (inp[4] & 0x3f);
185         if (x >= 0x200000)
186             *no_read = 5;
187         else
188         {
189             *no_read = 0;
190             cd->my_errno = YAZ_ICONV_EILSEQ;
191         }
192     }
193     else if (inp[0] <= 0xfd && inbytesleft >= 6)
194     {
195         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
196             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
197             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
198         if (x >= 0x4000000)
199             *no_read = 6;
200         else
201         {
202             *no_read = 0;
203             cd->my_errno = YAZ_ICONV_EILSEQ;
204         }
205     }
206     else
207     {
208         *no_read = 0;
209         cd->my_errno = YAZ_ICONV_EINVAL;
210     }
211     return x;
212 }
213
214 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
215                                     size_t inbytesleft, size_t *no_read)
216 {
217     unsigned long x = 0;
218     
219     if (inbytesleft < 4)
220     {
221         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
222         *no_read = 0;
223     }
224     else
225     {
226         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
227         *no_read = 4;
228     }
229     return x;
230 }
231
232 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
233                                       size_t inbytesleft, size_t *no_read)
234 {
235     unsigned long x = 0;
236     
237     if (inbytesleft < 4)
238     {
239         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
240         *no_read = 0;
241     }
242     else
243     {
244         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
245         *no_read = 4;
246     }
247     return x;
248 }
249
250 #if HAVE_WCHAR_H
251 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
252                                        size_t inbytesleft, size_t *no_read)
253 {
254     unsigned long x = 0;
255     
256     if (inbytesleft < sizeof(wchar_t))
257     {
258         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
259         *no_read = 0;
260     }
261     else
262     {
263         wchar_t wch;
264         memcpy (&wch, inp, sizeof(wch));
265         x = wch;
266         *no_read = sizeof(wch);
267     }
268     return x;
269 }
270 #endif
271
272
273 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
274                                           size_t inbytesleft, size_t *no_read,
275                                           int *comb);
276
/*
 * MARC-8 reader.  It reads ahead over a run of combining characters,
 * queues them in cd->comb_x[] and returns the first non-combining
 * character found after them.  The queued combining characters are then
 * handed out, one per call, on the following calls.
 *
 * *no_read is the byte count belonging to the character returned by the
 * call (for queued entries: the count recorded when they were read).
 * Returns the character; 0 together with *no_read / cd->my_errno as left
 * by yaz_read_marc8_comb() when nothing could be decoded.
 *
 * NOTE(review): if eight combining characters occur in a row the loop
 * ends with the queue full and the eighth one is both returned here and
 * replayed from the queue later.
 */
static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
                                     size_t inbytesleft, size_t *no_read)
{
    unsigned long x;
    /* queue not yet drained: replay next entry, inp is not inspected */
    if (cd->comb_offset < cd->comb_size)
    {
        *no_read = cd->comb_no_read[cd->comb_offset];
        x = cd->comb_x[cd->comb_offset];

        /* special case for double-diacritic combining characters,
           INVERTED BREVE and DOUBLE TILDE.
           We'll increment the no_read counter by 1, since we want to skip over
           the processing of the closing ligature character
        */
        /* this code is no longer necessary.. our handler code in
           yaz_marc8_?_conv (generated by charconv.tcl) now returns
           0 and no_read=1 when a sequence does not match the input.
           The SECOND HALFs in codetables.xml produce a non-existent
           entry in the conversion trie.. Hence when met, the input byte is
           skipped as it should (in yaz_iconv)
        */
#if 0
        if (x == 0x0361 || x == 0x0360)
            *no_read += 1;
#endif
        cd->comb_offset++;
        return x;
    }

    /* start a fresh queue and collect up to 8 combining characters */
    cd->comb_offset = 0;
    for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
    {
        int comb = 0;
        x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
        /* stop at the first non-combining character or on failure */
        if (!comb || !x)
            break;
        cd->comb_x[cd->comb_size] = x;
        cd->comb_no_read[cd->comb_size] = *no_read;
        /* only the local view advances; the caller's pointer is untouched */
        inp += *no_read;
        inbytesleft = inbytesleft - *no_read;
    }
    return x;
}
320
321 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
322                                           size_t inbytesleft, size_t *no_read,
323                                           int *comb)
324 {
325     *no_read = 0;
326     while(inbytesleft >= 1 && inp[0] == 27)
327     {
328         size_t inbytesleft0 = inbytesleft;
329         inp++;
330         inbytesleft--;
331         while(inbytesleft > 0 && strchr("(,$!", *inp))
332         {
333             inbytesleft--;
334             inp++;
335         }
336         if (inbytesleft <= 0)
337         {
338             *no_read = 0;
339             cd->my_errno = YAZ_ICONV_EINVAL;
340             return 0;
341         }
342         cd->marc8_esc_mode = *inp++;
343         inbytesleft--;
344         (*no_read) += inbytesleft0 - inbytesleft;
345     }
346     if (inbytesleft <= 0)
347         return 0;
348     else
349     {
350         unsigned long x;
351         size_t no_read_sub = 0;
352         *comb = 0;
353
354         switch(cd->marc8_esc_mode)
355         {
356         case 'B':  /* Basic ASCII */
357         case 'E':  /* ANSEL */
358         case 's':  /* ASCII */
359             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
360             break;
361         case 'g':  /* Greek */
362             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
363             break;
364         case 'b':  /* Subscripts */
365             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
366             break;
367         case 'p':  /* Superscripts */
368             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
369             break;
370         case '2':  /* Basic Hebrew */
371             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
372             break;
373         case 'N':  /* Basic Cyrillic */
374         case 'Q':  /* Extended Cyrillic */
375             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
376             break;
377         case '3':  /* Basic Arabic */
378         case '4':  /* Extended Arabic */
379             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
380             break;
381         case 'S':  /* Greek */
382             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
383             break;
384         case '1':  /* Chinese, Japanese, Korean (EACC) */
385             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
386             break;
387         default:
388             *no_read = 0;
389             cd->my_errno = YAZ_ICONV_EILSEQ;
390             return 0;
391         }
392         *no_read += no_read_sub;
393         return x;
394     }
395 }
396
397 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
398                               char **outbuf, size_t *outbytesleft,
399                               int last)
400 {
401     unsigned char *outp = (unsigned char *) *outbuf;
402
403     if (x <= 0x7f && *outbytesleft >= 1)
404     {
405         *outp++ = (unsigned char) x;
406         (*outbytesleft)--;
407     } 
408     else if (x <= 0x7ff && *outbytesleft >= 2)
409     {
410         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
411         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
412         (*outbytesleft) -= 2;
413     }
414     else if (x <= 0xffff && *outbytesleft >= 3)
415     {
416         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
417         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
418         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
419         (*outbytesleft) -= 3;
420     }
421     else if (x <= 0x1fffff && *outbytesleft >= 4)
422     {
423         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
424         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
425         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
426         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
427         (*outbytesleft) -= 4;
428     }
429     else if (x <= 0x3ffffff && *outbytesleft >= 5)
430     {
431         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
432         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
433         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
434         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
435         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
436         (*outbytesleft) -= 5;
437     }
438     else if (*outbytesleft >= 6)
439     {
440         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
441         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
442         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
443         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
444         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
445         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
446         (*outbytesleft) -= 6;
447     }
448     else 
449     {
450         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
451         return (size_t)(-1);
452     }
453     *outbuf = (char *) outp;
454     return 0;
455 }
456
457
/*
 * Writes x as one ISO-8859-1 byte, composing "letter + combining mark"
 * pairs into a single Latin-1 character where the table below has one.
 *
 * To make composition possible a character in the range 33..126 is not
 * written at once but held in cd->compose_char (unless `last` is set).
 * On the next call the held character is either merged with x through
 * latin1_comb[] or written out unchanged before x is handled.
 *
 * Returns 0 on success.  Returns (size_t) -1 with cd->my_errno set to
 *   YAZ_ICONV_E2BIG   when the output buffer is full,
 *   YAZ_ICONV_EILSEQ  when x is 0 or above 255.
 */
static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
                                   char **outbuf, size_t *outbytesleft,
                                   int last)
{
    /* list of two char unicode sequence that, when combined, are
       equivalent to single unicode chars that can be represented in
       ISO-8859-1/Latin-1.
       Regular iconv on Linux at least does not seem to convert these,
       but since MARC-8 to UTF-8 generates these composed sequence
       we get a better chance of a successful MARC-8 -> ISO-8859-1
       conversion */
    static struct {
        unsigned long x1, x2;
        unsigned y;
    } latin1_comb[] = {
        { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
        { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
        { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
        { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
        { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
        { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
        /* no need for 0xc6      LATIN CAPITAL LETTER AE */
        { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
        { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
        { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
        { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
        { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
        { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
        { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
        { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
        { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
        { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
        { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
        { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
        { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
        { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
        { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
        /* omitted:    0xd7      MULTIPLICATION SIGN */
        /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
        { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
        { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
        { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
        { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
        { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
        /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
        /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
        { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
        { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
        { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
        { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
        { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
        { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
        /* omitted:    0xe6      LATIN SMALL LETTER AE */
        { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
        { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
        { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
        { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
        { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
        { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
        { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
        { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
        { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
        /* omitted:    0xf0      LATIN SMALL LETTER ETH */
        { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
        { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
        { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
        { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
        { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
        { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
        /* omitted:    0xf7      DIVISION SIGN */
        /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
        { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
        { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
        { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
        { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
        { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
        /* omitted:    0xfe      LATIN SMALL LETTER THORN */
        { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

        { 0, 0, 0}  /* terminator: x1 == 0 ends the search */
    };
    unsigned char *outp = (unsigned char *) *outbuf;

    /* a character is being held back: try to merge it with x */
    if (cd->compose_char)
    {
        int i;
        for (i = 0; latin1_comb[i].x1; i++)
            if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
            {
                /* pair found: x becomes the precomposed character */
                x = latin1_comb[i].y;
                break;
            }
        if (*outbytesleft < 1)
        {  /* no room. Retain compose_char and bail out */
            cd->my_errno = YAZ_ICONV_E2BIG;
            return (size_t)(-1);
        }
        /* i stopped on the terminator entry, i.e. nothing matched */
        if (!latin1_comb[i].x1)
        {   /* not found. Just write compose_char */
            *outp++ = (unsigned char) cd->compose_char;
            (*outbytesleft)--;
            *outbuf = (char *) outp;
        }
        /* compose_char used so reset it. x now holds current char */
        cd->compose_char = 0;
    }

    /* hold back a printable ASCII character: a combining mark may follow */
    if (!last && x > 32 && x < 127 && cd->compose_char == 0)
    {
        cd->compose_char = x;
        return 0;
    }
    else if (x > 255 || x < 1)
    {   /* not representable in ISO-8859-1 */
        cd->my_errno = YAZ_ICONV_EILSEQ;
        return (size_t) -1;
    }
    else if (*outbytesleft < 1)
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    *outp++ = (unsigned char) x;
    (*outbytesleft)--;
    *outbuf = (char *) outp;
    return 0;
}
585
586
587 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
588                               char **outbuf, size_t *outbytesleft,
589                               int last)
590 {
591     unsigned char *outp = (unsigned char *) *outbuf;
592     if (*outbytesleft >= 4)
593     {
594         *outp++ = (unsigned char) (x>>24);
595         *outp++ = (unsigned char) (x>>16);
596         *outp++ = (unsigned char) (x>>8);
597         *outp++ = (unsigned char) x;
598         (*outbytesleft) -= 4;
599     }
600     else
601     {
602         cd->my_errno = YAZ_ICONV_E2BIG;
603         return (size_t)(-1);
604     }
605     *outbuf = (char *) outp;
606     return 0;
607 }
608
609 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
610                                 char **outbuf, size_t *outbytesleft,
611                                 int last)
612 {
613     unsigned char *outp = (unsigned char *) *outbuf;
614     if (*outbytesleft >= 4)
615     {
616         *outp++ = (unsigned char) x;
617         *outp++ = (unsigned char) (x>>8);
618         *outp++ = (unsigned char) (x>>16);
619         *outp++ = (unsigned char) (x>>24);
620         (*outbytesleft) -= 4;
621     }
622     else
623     {
624         cd->my_errno = YAZ_ICONV_E2BIG;
625         return (size_t)(-1);
626     }
627     *outbuf = (char *) outp;
628     return 0;
629 }
630
631 static unsigned long lookup_marc8(yaz_iconv_t cd,
632                                   unsigned long x, int *comb,
633                                   const char **page_chr)
634 {
635     char utf8_buf[7];
636     char *utf8_outbuf = utf8_buf;
637     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
638
639     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
640     if (r == (size_t)(-1))
641     {
642         cd->my_errno = YAZ_ICONV_EILSEQ;
643         return 0;
644     }
645     else
646     {
647         unsigned char *inp;
648         size_t inbytesleft, no_read_sub = 0;
649         unsigned long x;
650
651         *utf8_outbuf = '\0';        
652         inp = (unsigned char *) utf8_buf;
653         inbytesleft = strlen(utf8_buf);
654         
655         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
656         if (x)
657         {
658             *page_chr = "\033(B";
659             return x;
660         }
661         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
662         if (x)
663         {
664             *page_chr = "\033g";
665             return x;
666         }
667         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
668         if (x)
669         {
670             *page_chr = "\033b";
671             return x;
672         }
673         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
674         if (x)
675         {
676             *page_chr = "\033p";
677             return x;
678         }
679         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
680         if (x)
681         {
682             *page_chr = "\033(2";
683             return x;
684         }
685         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
686         if (x)
687         {
688             *page_chr = "\033(N";
689             return x;
690         }
691         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
692         if (x)
693         {
694             *page_chr = "\033(3";
695             return x;
696         }
697         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
698         if (x)
699         {
700             *page_chr = "\033(S";
701             return x;
702         }
703         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
704         if (x)
705         {
706             *page_chr = "\033(1";
707             return x;
708         }
709         cd->my_errno = YAZ_ICONV_EILSEQ;
710         return x;
711     }
712 }
713
714 static size_t flush_combos(yaz_iconv_t cd,
715                            char **outbuf, size_t *outbytesleft)
716 {
717     unsigned long y = cd->write_marc8_last;
718     unsigned char byte, second_half = 0;
719     char out_buf[10];
720     size_t i, out_no = 0;
721
722     if (!y)
723         return 0;
724
725     byte = (unsigned char )((y>>16) & 0xff);
726     if (byte)
727         out_buf[out_no++] = byte;
728     byte = (unsigned char)((y>>8) & 0xff);
729     if (byte)
730         out_buf[out_no++] = byte;
731     byte = (unsigned char )(y & 0xff);
732     if (byte)
733         out_buf[out_no++] = byte;
734
735     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
736     {
737         cd->my_errno = YAZ_ICONV_E2BIG;
738         return (size_t) (-1);
739     }
740
741     for (i = 0; i < cd->write_marc8_comb_no; i++)
742     {
743         byte = cd->write_marc8_comb_ch[i];
744         if (byte == 0xEB)
745             second_half = 0xEC;
746         else if (byte == 0xFA)
747             second_half = 0xFB;
748
749         *(*outbuf)++ = byte;
750         (*outbytesleft)--;
751     }
752     memcpy(*outbuf, out_buf, out_no);
753     *outbuf += out_no;
754     (*outbytesleft) -= out_no;
755     if (second_half)
756     {
757         *(*outbuf)++ = second_half;
758         (*outbytesleft)--;
759     }        
760
761     cd->write_marc8_last = 0;
762     cd->write_marc8_comb_no = 0;
763     return 0;
764 }
765
/*
 * MARC-8 writer.  Output is delayed by one base character so that the
 * combining characters that follow it can be written in front of it:
 *  - a combining character is appended to cd->write_marc8_comb_ch[];
 *  - a base character first flushes what is pending, then emits a page
 *    escape sequence if its page differs from the current one and
 *    finally becomes the new pending character;
 *  - with `last` set the pending state is flushed right away.
 *
 * Returns 0 on success and (size_t) -1 on failure; cd->my_errno is set
 * by lookup_marc8() / flush_combos() or here (YAZ_ICONV_E2BIG when the
 * escape sequence does not fit).
 */
static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
                              char **outbuf, size_t *outbytesleft,
                              int last)
{
    int comb = 0;
    const char *page_chr = 0;
    unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);

    if (!y)
        return (size_t) (-1);

    if (comb)
    {
        /* NOTE(review): a seventh and later combining character is
           dropped silently although the array has room for eight */
        if (cd->write_marc8_comb_no < 6)
            cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
    }
    else
    {
        /* emit the previous base character and its combining marks */
        size_t r = flush_combos(cd, outbuf, outbytesleft);
        if (r)
            return r;
        /* switch page only when it differs from the one in effect */
        if (strcmp(page_chr, cd->write_marc8_page_chr))
        {
            size_t plen = strlen(page_chr);

            if (*outbytesleft < plen)
            {
                cd->my_errno = YAZ_ICONV_E2BIG;
                return (size_t) (-1);
            }
            memcpy(*outbuf, page_chr, plen);
            (*outbuf) += plen;
            (*outbytesleft) -= plen;
            cd->write_marc8_page_chr = page_chr;
        }
        cd->write_marc8_last = y;
    }
    if (last)
    {
        size_t r = flush_combos(cd, outbuf, outbytesleft);
        if (r)
        {
            /* flush failed: undo the registration of x made above,
               presumably so the caller can offer x again - confirm */
            if (comb)
                cd->write_marc8_comb_no--;
            else
                cd->write_marc8_last = 0;
            return r;
        }
    }
    return 0;
}
817
818 #if HAVE_WCHAR_H
819 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
820                                  char **outbuf, size_t *outbytesleft,
821                                  int last)
822 {
823     unsigned char *outp = (unsigned char *) *outbuf;
824
825     if (*outbytesleft >= sizeof(wchar_t))
826     {
827         wchar_t wch = x;
828         memcpy(outp, &wch, sizeof(wch));
829         outp += sizeof(wch);
830         (*outbytesleft) -= sizeof(wch);
831     }
832     else
833     {
834         cd->my_errno = YAZ_ICONV_E2BIG;
835         return (size_t)(-1);
836     }
837     *outbuf = (char *) outp;
838     return 0;
839 }
840 #endif
841
842 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
843 {
844     return cd->read_handle && cd->write_handle;
845 }
846
847 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
848 {
849     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
850
851     cd->write_handle = 0;
852     cd->read_handle = 0;
853     cd->init_handle = 0;
854     cd->my_errno = YAZ_ICONV_UNKNOWN;
855     cd->marc8_esc_mode = 'B';
856     cd->comb_offset = cd->comb_size = 0;
857     cd->compose_char = 0;
858
859     cd->write_marc8_comb_no = 0;
860     cd->write_marc8_last = 0;
861     cd->write_marc8_page_chr = "\033(B";
862
863     /* a useful hack: if fromcode has leading @,
864        the library not use YAZ's own conversions .. */
865     if (fromcode[0] == '@')
866         fromcode++;
867     else
868     {
869         if (!yaz_matchstr(fromcode, "UTF8"))
870         {
871             cd->read_handle = yaz_read_UTF8;
872             cd->init_handle = yaz_init_UTF8;
873         }
874         else if (!yaz_matchstr(fromcode, "ISO88591"))
875             cd->read_handle = yaz_read_ISO8859_1;
876         else if (!yaz_matchstr(fromcode, "UCS4"))
877             cd->read_handle = yaz_read_UCS4;
878         else if (!yaz_matchstr(fromcode, "UCS4LE"))
879             cd->read_handle = yaz_read_UCS4LE;
880         else if (!yaz_matchstr(fromcode, "MARC8"))
881             cd->read_handle = yaz_read_marc8;
882 #if HAVE_WCHAR_H
883         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
884             cd->read_handle = yaz_read_wchar_t;
885 #endif
886         
887         if (!yaz_matchstr(tocode, "UTF8"))
888             cd->write_handle = yaz_write_UTF8;
889         else if (!yaz_matchstr(tocode, "ISO88591"))
890             cd->write_handle = yaz_write_ISO8859_1;
891         else if (!yaz_matchstr (tocode, "UCS4"))
892             cd->write_handle = yaz_write_UCS4;
893         else if (!yaz_matchstr(tocode, "UCS4LE"))
894             cd->write_handle = yaz_write_UCS4LE;
895         else if (!yaz_matchstr(tocode, "MARC8"))
896             cd->write_handle = yaz_write_marc8;
897 #if HAVE_WCHAR_H
898         else if (!yaz_matchstr(tocode, "WCHAR_T"))
899             cd->write_handle = yaz_write_wchar_t;
900 #endif
901     }
902 #if HAVE_ICONV_H
903     cd->iconv_cd = 0;
904     if (!cd->read_handle || !cd->write_handle)
905     {
906         cd->iconv_cd = iconv_open (tocode, fromcode);
907         if (cd->iconv_cd == (iconv_t) (-1))
908         {
909             xfree (cd);
910             return 0;
911         }
912     }
913 #else
914     if (!cd->read_handle || !cd->write_handle)
915     {
916         xfree (cd);
917         return 0;
918     }
919 #endif
920     cd->init_flag = 1;
921     return cd;
922 }
923
924 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
925                  char **outbuf, size_t *outbytesleft)
926 {
927     char *inbuf0;
928     size_t r = 0;
929
930 #if HAVE_ICONV_H
931     if (cd->iconv_cd)
932     {
933         size_t r =
934             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
935         if (r == (size_t)(-1))
936         {
937             switch (yaz_errno())
938             {
939             case E2BIG:
940                 cd->my_errno = YAZ_ICONV_E2BIG;
941                 break;
942             case EINVAL:
943                 cd->my_errno = YAZ_ICONV_EINVAL;
944                 break;
945             case EILSEQ:
946                 cd->my_errno = YAZ_ICONV_EILSEQ;
947                 break;
948             default:
949                 cd->my_errno = YAZ_ICONV_UNKNOWN;
950             }
951         }
952         return r;
953     }
954 #endif
955     if (inbuf == 0 || *inbuf == 0)
956     {
957         cd->init_flag = 1;
958         cd->my_errno = YAZ_ICONV_UNKNOWN;
959         return 0;
960     }
961     inbuf0 = *inbuf;
962
963     if (cd->init_flag)
964     {
965         if (cd->init_handle)
966         {
967             size_t no_read;
968             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
969                                          *inbytesleft, &no_read);
970             if (r)
971             {
972                 if (cd->my_errno == YAZ_ICONV_EINVAL)
973                     return r;
974                 cd->init_flag = 0;
975                 return r;
976             }
977             *inbytesleft -= no_read;
978             *inbuf += no_read;
979         }
980         cd->init_flag = 0;
981         cd->unget_x = 0;
982         cd->no_read_x = 0;
983     }
984     while (1)
985     {
986         unsigned long x;
987         size_t no_read;
988
989         if (*inbytesleft == 0)
990         {
991             r = *inbuf - inbuf0;
992             break;
993         }
994         if (!cd->unget_x)
995         {
996             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
997                                   &no_read);
998             if (no_read == 0)
999             {
1000                 r = (size_t)(-1);
1001                 break;
1002             }
1003         }
1004         else
1005         {
1006             x = cd->unget_x;
1007             no_read = cd->no_read_x;
1008         }
1009         if (x)
1010         {
1011             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1012                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1013             if (r)
1014             {
1015                 /* unable to write it. save it because read_handle cannot
1016                    rewind .. */
1017                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1018                 {
1019                     cd->unget_x = x;
1020                     cd->no_read_x = no_read;
1021                     break;
1022                 }
1023             }
1024             cd->unget_x = 0;
1025         }
1026         *inbytesleft -= no_read;
1027         (*inbuf) += no_read;
1028     }
1029     return r;
1030 }
1031
1032 int yaz_iconv_error (yaz_iconv_t cd)
1033 {
1034     return cd->my_errno;
1035 }
1036
1037 int yaz_iconv_close (yaz_iconv_t cd)
1038 {
1039 #if HAVE_ICONV_H
1040     if (cd->iconv_cd)
1041         iconv_close (cd->iconv_cd);
1042 #endif
1043     xfree (cd);
1044     return 0;
1045 }
1046
1047 /*
1048  * Local variables:
1049  * c-basic-offset: 4
1050  * indent-tabs-mode: nil
1051  * End:
1052  * vim: shiftwidth=4 tabstop=8 expandtab
1053  */
1054