d3e078fe6bca4eabbcf59286e259292838a8a3d6
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.21 2006-04-19 23:48:06 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  */
16
17 #if HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/yaz-util.h>
33
/*
 * Conversion routines that are defined outside this file.
 *
 * yaz_marc8_N_conv:  used by the MARC-8 reader below; which one is called
 *                    depends on the escape mode currently in effect.
 * yaz_marc8r_N_conv: used by the MARC-8 writer below; they are tried in
 *                    order 1..9 on the UTF-8 form of a character.
 *
 * All of them share one signature: they inspect up to inbytesleft bytes
 * at inp, store the number of bytes used in *no_read, store a flag in
 * *combining, and return the converted value (0 meaning "no match").
 */
unsigned long
yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);
unsigned long
yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                 size_t *no_read, int *combining);

unsigned long
yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
unsigned long
yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                  size_t *no_read, int *combining);
72
73 struct yaz_iconv_struct {
74     int my_errno;
75     int init_flag;
76     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77                           size_t inbytesleft, size_t *no_read);
78     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79                                  size_t inbytesleft, size_t *no_read);
80     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81                            char **outbuf, size_t *outbytesleft,
82                            int last);
83     int marc8_esc_mode;
84
85     int comb_offset;
86     int comb_size;
87     unsigned long comb_x[8];
88     size_t comb_no_read[8];
89     size_t no_read_x;
90     unsigned long unget_x;
91 #if HAVE_ICONV_H
92     iconv_t iconv_cd;
93 #endif
94     unsigned long compose_char;
95
96     unsigned long write_marc8_comb_ch[8];
97     size_t write_marc8_comb_no;
98     unsigned long write_marc8_last;
99     const char *write_marc8_page_chr;
100 };
101
102 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
103                                          size_t inbytesleft, size_t *no_read)
104 {
105     unsigned long x = inp[0];
106     *no_read = 1;
107     return x;
108 }
109
110 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
111                              size_t inbytesleft, size_t *no_read)
112 {
113     if (inp[0] != 0xef)
114     {
115         *no_read = 0;
116         return 0;
117     }
118     if (inbytesleft < 3)
119     {
120         cd->my_errno = YAZ_ICONV_EINVAL;
121         return (size_t) -1;
122     }
123     if (inp[1] != 0xbb && inp[2] == 0xbf)
124         *no_read = 3;
125     else
126         *no_read = 0;
127     return 0;
128 }
129
130 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
131                                     size_t inbytesleft, size_t *no_read)
132 {
133     unsigned long x = 0;
134
135     if (inp[0] <= 0x7f)
136     {
137         x = inp[0];
138         *no_read = 1;
139     }
140     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
141     {
142         *no_read = 0;
143         cd->my_errno = YAZ_ICONV_EILSEQ;
144     }
145     else if (inp[0] <= 0xdf && inbytesleft >= 2)
146     {
147         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
148         if (x >= 0x80)
149             *no_read = 2;
150         else
151         {
152             *no_read = 0;
153             cd->my_errno = YAZ_ICONV_EILSEQ;
154         }
155     }
156     else if (inp[0] <= 0xef && inbytesleft >= 3)
157     {
158         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
159             (inp[2] & 0x3f);
160         if (x >= 0x800)
161             *no_read = 3;
162         else
163         {
164             *no_read = 0;
165             cd->my_errno = YAZ_ICONV_EILSEQ;
166         }
167     }
168     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
169     {
170         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
171             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
172         if (x >= 0x10000)
173             *no_read = 4;
174         else
175         {
176             *no_read = 0;
177             cd->my_errno = YAZ_ICONV_EILSEQ;
178         }
179     }
180     else if (inp[0] <= 0xfb && inbytesleft >= 5)
181     {
182         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
183             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
184             (inp[4] & 0x3f);
185         if (x >= 0x200000)
186             *no_read = 5;
187         else
188         {
189             *no_read = 0;
190             cd->my_errno = YAZ_ICONV_EILSEQ;
191         }
192     }
193     else if (inp[0] <= 0xfd && inbytesleft >= 6)
194     {
195         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
196             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
197             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
198         if (x >= 0x4000000)
199             *no_read = 6;
200         else
201         {
202             *no_read = 0;
203             cd->my_errno = YAZ_ICONV_EILSEQ;
204         }
205     }
206     else
207     {
208         *no_read = 0;
209         cd->my_errno = YAZ_ICONV_EINVAL;
210     }
211     return x;
212 }
213
214 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
215                                     size_t inbytesleft, size_t *no_read)
216 {
217     unsigned long x = 0;
218     
219     if (inbytesleft < 4)
220     {
221         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
222         *no_read = 0;
223     }
224     else
225     {
226         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
227         *no_read = 4;
228     }
229     return x;
230 }
231
232 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
233                                       size_t inbytesleft, size_t *no_read)
234 {
235     unsigned long x = 0;
236     
237     if (inbytesleft < 4)
238     {
239         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
240         *no_read = 0;
241     }
242     else
243     {
244         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
245         *no_read = 4;
246     }
247     return x;
248 }
249
250 #if HAVE_WCHAR_H
251 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
252                                        size_t inbytesleft, size_t *no_read)
253 {
254     unsigned long x = 0;
255     
256     if (inbytesleft < sizeof(wchar_t))
257     {
258         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
259         *no_read = 0;
260     }
261     else
262     {
263         wchar_t wch;
264         memcpy (&wch, inp, sizeof(wch));
265         x = wch;
266         *no_read = sizeof(wch);
267     }
268     return x;
269 }
270 #endif
271
272
273 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
274                                           size_t inbytesleft, size_t *no_read,
275                                           int *comb);
276
277 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
278                                      size_t inbytesleft, size_t *no_read)
279 {
280     unsigned long x;
281     if (cd->comb_offset < cd->comb_size)
282     {
283         *no_read = cd->comb_no_read[cd->comb_offset];
284         x = cd->comb_x[cd->comb_offset];
285
286         /* special case for double-diacritic combining characters, 
287            INVERTED BREVE and DOUBLE TILDE.
288            We'll increment the no_read counter by 1, since we want to skip over
289            the processing of the closing ligature character
290         */
291         /* this code is no longer necessary.. our handlers code in
292            yaz_marc8_?_conv (generated by charconv.tcl) now returns
293            0 and no_read=1 when a sequence does not match the input.
294            The SECOND HALFs in codetables.xml produces a non-existant
295            entry in the conversion trie.. Hence when met, the input byte is
296            skipped as it should (in yaz_iconv)
297         */
298 #if 0
299         if (x == 0x0361 || x == 0x0360)
300             *no_read += 1;
301 #endif
302         cd->comb_offset++;
303         return x;
304     }
305
306     cd->comb_offset = 0;
307     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
308     {
309         int comb = 0;
310         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
311         if (!comb || !x)
312             break;
313         cd->comb_x[cd->comb_size] = x;
314         cd->comb_no_read[cd->comb_size] = *no_read;
315         inp += *no_read;
316         inbytesleft = inbytesleft - *no_read;
317     }
318     return x;
319 }
320
321 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
322                                           size_t inbytesleft, size_t *no_read,
323                                           int *comb)
324 {
325     *no_read = 0;
326     while(inbytesleft >= 1 && inp[0] == 27)
327     {
328         size_t inbytesleft0 = inbytesleft;
329         inp++;
330         inbytesleft--;
331         while(inbytesleft > 0 && strchr("(,$!", *inp))
332         {
333             inbytesleft--;
334             inp++;
335         }
336         if (inbytesleft <= 0)
337         {
338             *no_read = 0;
339             cd->my_errno = YAZ_ICONV_EINVAL;
340             return 0;
341         }
342         cd->marc8_esc_mode = *inp++;
343         inbytesleft--;
344         (*no_read) += inbytesleft0 - inbytesleft;
345     }
346     if (inbytesleft <= 0)
347         return 0;
348     else
349     {
350         unsigned long x;
351         size_t no_read_sub = 0;
352         *comb = 0;
353
354         switch(cd->marc8_esc_mode)
355         {
356         case 'B':  /* Basic ASCII */
357         case 'E':  /* ANSEL */
358         case 's':  /* ASCII */
359             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
360             break;
361         case 'g':  /* Greek */
362             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
363             break;
364         case 'b':  /* Subscripts */
365             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
366             break;
367         case 'p':  /* Superscripts */
368             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
369             break;
370         case '2':  /* Basic Hebrew */
371             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
372             break;
373         case 'N':  /* Basic Cyrillic */
374         case 'Q':  /* Extended Cyrillic */
375             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
376             break;
377         case '3':  /* Basic Arabic */
378         case '4':  /* Extended Arabic */
379             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
380             break;
381         case 'S':  /* Greek */
382             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
383             break;
384         case '1':  /* Chinese, Japanese, Korean (EACC) */
385             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
386             break;
387         default:
388             *no_read = 0;
389             cd->my_errno = YAZ_ICONV_EILSEQ;
390             return 0;
391         }
392         *no_read += no_read_sub;
393         return x;
394     }
395 }
396
397 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
398                               char **outbuf, size_t *outbytesleft,
399                               int last)
400 {
401     unsigned char *outp = (unsigned char *) *outbuf;
402
403     if (x <= 0x7f && *outbytesleft >= 1)
404     {
405         *outp++ = (unsigned char) x;
406         (*outbytesleft)--;
407     } 
408     else if (x <= 0x7ff && *outbytesleft >= 2)
409     {
410         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
411         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
412         (*outbytesleft) -= 2;
413     }
414     else if (x <= 0xffff && *outbytesleft >= 3)
415     {
416         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
417         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
418         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
419         (*outbytesleft) -= 3;
420     }
421     else if (x <= 0x1fffff && *outbytesleft >= 4)
422     {
423         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
424         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
425         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
426         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
427         (*outbytesleft) -= 4;
428     }
429     else if (x <= 0x3ffffff && *outbytesleft >= 5)
430     {
431         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
432         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
433         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
434         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
435         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
436         (*outbytesleft) -= 5;
437     }
438     else if (*outbytesleft >= 6)
439     {
440         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
441         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
442         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
443         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
444         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
445         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
446         (*outbytesleft) -= 6;
447     }
448     else 
449     {
450         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
451         return (size_t)(-1);
452     }
453     *outbuf = (char *) outp;
454     return 0;
455 }
456
457
458 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
459                                    char **outbuf, size_t *outbytesleft,
460                                    int last)
461 {
462     /* list of two char unicode sequence that, when combined, are
463        equivalent to single unicode chars that can be represented in
464        ISO-8859-1/Latin-1.
465        Regular iconv on Linux at least does not seem to convert these,
466        but since MARC-8 to UTF-8 generates these composed sequence
467        we get a better chance of a successful MARC-8 -> ISO-8859-1
468        conversion */
469     static struct {
470         unsigned long x1, x2;
471         unsigned y;
472     } latin1_comb[] = {
473         { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
474         { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
475         { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
476         { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
477         { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
478         { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
479         /* no need for 0xc6      LATIN CAPITAL LETTER AE */
480         { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
481         { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
482         { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
483         { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
484         { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
485         { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
486         { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
487         { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
488         { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
489         { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
490         { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
491         { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
492         { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
493         { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
494         { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
495         /* omitted:    0xd7      MULTIPLICATION SIGN */
496         /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
497         { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
498         { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
499         { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
500         { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
501         { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
502         /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
503         /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
504         { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
505         { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
506         { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
507         { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
508         { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
509         { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
510         /* omitted:    0xe6      LATIN SMALL LETTER AE */
511         { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
512         { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
513         { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
514         { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
515         { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
516         { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
517         { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
518         { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
519         { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
520         /* omitted:    0xf0      LATIN SMALL LETTER ETH */
521         { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
522         { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
523         { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
524         { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
525         { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
526         { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
527         /* omitted:    0xf7      DIVISION SIGN */
528         /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
529         { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
530         { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
531         { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
532         { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
533         { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
534         /* omitted:    0xfe      LATIN SMALL LETTER THORN */
535         { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
536         
537         { 0, 0, 0}
538     };
539     unsigned char *outp = (unsigned char *) *outbuf;
540
541     if (cd->compose_char)
542     {
543         int i;
544         for (i = 0; latin1_comb[i].x1; i++)
545             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
546             {
547                 x = latin1_comb[i].y;
548                 break;
549             }
550         if (*outbytesleft < 1)
551         {  /* no room. Retain compose_char and bail out */
552             cd->my_errno = YAZ_ICONV_E2BIG;
553             return (size_t)(-1);
554         }
555         if (!latin1_comb[i].x1) 
556         {   /* not found. Just write compose_char */
557             *outp++ = (unsigned char) cd->compose_char;
558             (*outbytesleft)--;
559             *outbuf = (char *) outp;
560         }
561         /* compose_char used so reset it. x now holds current char */
562         cd->compose_char = 0;
563     }
564
565     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
566     {
567         cd->compose_char = x;
568         return 0;
569     }
570     else if (x > 255 || x < 1)
571     {
572         cd->my_errno = YAZ_ICONV_EILSEQ;
573         return (size_t) -1;
574     }
575     else if (*outbytesleft < 1)
576     {
577         cd->my_errno = YAZ_ICONV_E2BIG;
578         return (size_t)(-1);
579     }
580     *outp++ = (unsigned char) x;
581     (*outbytesleft)--;
582     *outbuf = (char *) outp;
583     return 0;
584 }
585
586
587 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
588                               char **outbuf, size_t *outbytesleft,
589                               int last)
590 {
591     unsigned char *outp = (unsigned char *) *outbuf;
592     if (*outbytesleft >= 4)
593     {
594         *outp++ = (unsigned char) (x>>24);
595         *outp++ = (unsigned char) (x>>16);
596         *outp++ = (unsigned char) (x>>8);
597         *outp++ = (unsigned char) x;
598         (*outbytesleft) -= 4;
599     }
600     else
601     {
602         cd->my_errno = YAZ_ICONV_E2BIG;
603         return (size_t)(-1);
604     }
605     *outbuf = (char *) outp;
606     return 0;
607 }
608
609 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
610                                 char **outbuf, size_t *outbytesleft,
611                                 int last)
612 {
613     unsigned char *outp = (unsigned char *) *outbuf;
614     if (*outbytesleft >= 4)
615     {
616         *outp++ = (unsigned char) x;
617         *outp++ = (unsigned char) (x>>8);
618         *outp++ = (unsigned char) (x>>16);
619         *outp++ = (unsigned char) (x>>24);
620         (*outbytesleft) -= 4;
621     }
622     else
623     {
624         cd->my_errno = YAZ_ICONV_E2BIG;
625         return (size_t)(-1);
626     }
627     *outbuf = (char *) outp;
628     return 0;
629 }
630
631 static unsigned long lookup_marc8(yaz_iconv_t cd,
632                                   unsigned long x, int *comb,
633                                   const char **page_chr)
634 {
635     char utf8_buf[7];
636     char *utf8_outbuf = utf8_buf;
637     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
638
639     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
640     if (r == (size_t)(-1))
641     {
642         cd->my_errno = YAZ_ICONV_EILSEQ;
643         return 0;
644     }
645     else
646     {
647         unsigned char *inp;
648         size_t inbytesleft, no_read_sub = 0;
649         unsigned long x;
650
651         *utf8_outbuf = '\0';        
652         inp = (unsigned char *) utf8_buf;
653         inbytesleft = strlen(utf8_buf);
654         
655         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
656         if (x)
657         {
658             *page_chr = "\033(B";
659             return x;
660         }
661         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
662         if (x)
663         {
664             *page_chr = "\033g";
665             return x;
666         }
667         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
668         if (x)
669         {
670             *page_chr = "\033b";
671             return x;
672         }
673         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
674         if (x)
675         {
676             *page_chr = "\033p";
677             return x;
678         }
679         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
680         if (x)
681         {
682             *page_chr = "\033(2";
683             return x;
684         }
685         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
686         if (x)
687         {
688             *page_chr = "\033(N";
689             return x;
690         }
691         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
692         if (x)
693         {
694             *page_chr = "\033(3";
695             return x;
696         }
697         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
698         if (x)
699         {
700             *page_chr = "\033(S";
701             return x;
702         }
703         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
704         if (x)
705         {
706             *page_chr = "\033(1";
707             return x;
708         }
709         cd->my_errno = YAZ_ICONV_EILSEQ;
710         return x;
711     }
712 }
713
714 static size_t flush_combos(yaz_iconv_t cd,
715                            char **outbuf, size_t *outbytesleft)
716 {
717     unsigned long y = cd->write_marc8_last;
718     unsigned char byte, second_half = 0;
719     char out_buf[10];
720     size_t i, out_no = 0;
721
722     if (!y)
723         return 0;
724
725     byte = (unsigned char )((y>>16) & 0xff);
726     if (byte)
727         out_buf[out_no++] = byte;
728     byte = (unsigned char)((y>>8) & 0xff);
729     if (byte)
730         out_buf[out_no++] = byte;
731     byte = (unsigned char )(y & 0xff);
732     if (byte)
733         out_buf[out_no++] = byte;
734
735     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
736     {
737         cd->my_errno = YAZ_ICONV_E2BIG;
738         return (size_t) (-1);
739     }
740
741     for (i = 0; i < cd->write_marc8_comb_no; i++)
742     {
743         /* all MARC-8 combined characters are simple bytes */
744         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
745         if (byte == 0xEB)
746             second_half = 0xEC;
747         else if (byte == 0xFA)
748             second_half = 0xFB;
749
750         *(*outbuf)++ = byte;
751         (*outbytesleft)--;
752     }
753     memcpy(*outbuf, out_buf, out_no);
754     *outbuf += out_no;
755     (*outbytesleft) -= out_no;
756     if (second_half)
757     {
758         *(*outbuf)++ = second_half;
759         (*outbytesleft)--;
760     }        
761
762     cd->write_marc8_last = 0;
763     cd->write_marc8_comb_no = 0;
764     return 0;
765 }
766
767 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
768                               char **outbuf, size_t *outbytesleft,
769                               int last)
770 {
771     int comb = 0;
772     const char *page_chr = 0;
773     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
774
775     if (!y)
776         return (size_t) (-1);
777
778     if (comb)
779     {
780         if (cd->write_marc8_comb_no < 6)
781             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
782     }
783     else
784     {
785         size_t r = flush_combos(cd, outbuf, outbytesleft);
786         if (r)
787             return r;
788         if (strcmp(page_chr, cd->write_marc8_page_chr))
789         {
790             size_t plen = strlen(page_chr);
791
792             if (*outbytesleft < plen)
793             {
794                 cd->my_errno = YAZ_ICONV_E2BIG;
795                 return (size_t) (-1);
796             }
797             memcpy(*outbuf, page_chr, plen);
798             (*outbuf) += plen;
799             (*outbytesleft) -= plen;
800             cd->write_marc8_page_chr = page_chr;            
801         }
802         cd->write_marc8_last = y;
803     }
804     if (last)
805     {
806         size_t r = flush_combos(cd, outbuf, outbytesleft);
807         if (r)
808         {
809             if (comb)
810                 cd->write_marc8_comb_no--;
811             else
812                 cd->write_marc8_last = 0;
813             return r;
814         }
815     }
816     return 0;
817 }
818
819 #if HAVE_WCHAR_H
820 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
821                                  char **outbuf, size_t *outbytesleft,
822                                  int last)
823 {
824     unsigned char *outp = (unsigned char *) *outbuf;
825
826     if (*outbytesleft >= sizeof(wchar_t))
827     {
828         wchar_t wch = x;
829         memcpy(outp, &wch, sizeof(wch));
830         outp += sizeof(wch);
831         (*outbytesleft) -= sizeof(wch);
832     }
833     else
834     {
835         cd->my_errno = YAZ_ICONV_E2BIG;
836         return (size_t)(-1);
837     }
838     *outbuf = (char *) outp;
839     return 0;
840 }
841 #endif
842
843 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
844 {
845     return cd->read_handle && cd->write_handle;
846 }
847
848 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
849 {
850     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
851
852     cd->write_handle = 0;
853     cd->read_handle = 0;
854     cd->init_handle = 0;
855     cd->my_errno = YAZ_ICONV_UNKNOWN;
856     cd->marc8_esc_mode = 'B';
857     cd->comb_offset = cd->comb_size = 0;
858     cd->compose_char = 0;
859
860     cd->write_marc8_comb_no = 0;
861     cd->write_marc8_last = 0;
862     cd->write_marc8_page_chr = "\033(B";
863
864     /* a useful hack: if fromcode has leading @,
865        the library not use YAZ's own conversions .. */
866     if (fromcode[0] == '@')
867         fromcode++;
868     else
869     {
870         if (!yaz_matchstr(fromcode, "UTF8"))
871         {
872             cd->read_handle = yaz_read_UTF8;
873             cd->init_handle = yaz_init_UTF8;
874         }
875         else if (!yaz_matchstr(fromcode, "ISO88591"))
876             cd->read_handle = yaz_read_ISO8859_1;
877         else if (!yaz_matchstr(fromcode, "UCS4"))
878             cd->read_handle = yaz_read_UCS4;
879         else if (!yaz_matchstr(fromcode, "UCS4LE"))
880             cd->read_handle = yaz_read_UCS4LE;
881         else if (!yaz_matchstr(fromcode, "MARC8"))
882             cd->read_handle = yaz_read_marc8;
883 #if HAVE_WCHAR_H
884         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
885             cd->read_handle = yaz_read_wchar_t;
886 #endif
887         
888         if (!yaz_matchstr(tocode, "UTF8"))
889             cd->write_handle = yaz_write_UTF8;
890         else if (!yaz_matchstr(tocode, "ISO88591"))
891             cd->write_handle = yaz_write_ISO8859_1;
892         else if (!yaz_matchstr (tocode, "UCS4"))
893             cd->write_handle = yaz_write_UCS4;
894         else if (!yaz_matchstr(tocode, "UCS4LE"))
895             cd->write_handle = yaz_write_UCS4LE;
896         else if (!yaz_matchstr(tocode, "MARC8"))
897             cd->write_handle = yaz_write_marc8;
898 #if HAVE_WCHAR_H
899         else if (!yaz_matchstr(tocode, "WCHAR_T"))
900             cd->write_handle = yaz_write_wchar_t;
901 #endif
902     }
903 #if HAVE_ICONV_H
904     cd->iconv_cd = 0;
905     if (!cd->read_handle || !cd->write_handle)
906     {
907         cd->iconv_cd = iconv_open (tocode, fromcode);
908         if (cd->iconv_cd == (iconv_t) (-1))
909         {
910             xfree (cd);
911             return 0;
912         }
913     }
914 #else
915     if (!cd->read_handle || !cd->write_handle)
916     {
917         xfree (cd);
918         return 0;
919     }
920 #endif
921     cd->init_flag = 1;
922     return cd;
923 }
924
925 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
926                  char **outbuf, size_t *outbytesleft)
927 {
928     char *inbuf0;
929     size_t r = 0;
930
931 #if HAVE_ICONV_H
932     if (cd->iconv_cd)
933     {
934         size_t r =
935             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
936         if (r == (size_t)(-1))
937         {
938             switch (yaz_errno())
939             {
940             case E2BIG:
941                 cd->my_errno = YAZ_ICONV_E2BIG;
942                 break;
943             case EINVAL:
944                 cd->my_errno = YAZ_ICONV_EINVAL;
945                 break;
946             case EILSEQ:
947                 cd->my_errno = YAZ_ICONV_EILSEQ;
948                 break;
949             default:
950                 cd->my_errno = YAZ_ICONV_UNKNOWN;
951             }
952         }
953         return r;
954     }
955 #endif
956     if (inbuf == 0 || *inbuf == 0)
957     {
958         cd->init_flag = 1;
959         cd->my_errno = YAZ_ICONV_UNKNOWN;
960         return 0;
961     }
962     inbuf0 = *inbuf;
963
964     if (cd->init_flag)
965     {
966         if (cd->init_handle)
967         {
968             size_t no_read;
969             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
970                                          *inbytesleft, &no_read);
971             if (r)
972             {
973                 if (cd->my_errno == YAZ_ICONV_EINVAL)
974                     return r;
975                 cd->init_flag = 0;
976                 return r;
977             }
978             *inbytesleft -= no_read;
979             *inbuf += no_read;
980         }
981         cd->init_flag = 0;
982         cd->unget_x = 0;
983         cd->no_read_x = 0;
984     }
985     while (1)
986     {
987         unsigned long x;
988         size_t no_read;
989
990         if (*inbytesleft == 0)
991         {
992             r = *inbuf - inbuf0;
993             break;
994         }
995         if (!cd->unget_x)
996         {
997             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
998                                   &no_read);
999             if (no_read == 0)
1000             {
1001                 r = (size_t)(-1);
1002                 break;
1003             }
1004         }
1005         else
1006         {
1007             x = cd->unget_x;
1008             no_read = cd->no_read_x;
1009         }
1010         if (x)
1011         {
1012             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1013                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1014             if (r)
1015             {
1016                 /* unable to write it. save it because read_handle cannot
1017                    rewind .. */
1018                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1019                 {
1020                     cd->unget_x = x;
1021                     cd->no_read_x = no_read;
1022                     break;
1023                 }
1024             }
1025             cd->unget_x = 0;
1026         }
1027         *inbytesleft -= no_read;
1028         (*inbuf) += no_read;
1029     }
1030     return r;
1031 }
1032
1033 int yaz_iconv_error (yaz_iconv_t cd)
1034 {
1035     return cd->my_errno;
1036 }
1037
1038 int yaz_iconv_close (yaz_iconv_t cd)
1039 {
1040 #if HAVE_ICONV_H
1041     if (cd->iconv_cd)
1042         iconv_close (cd->iconv_cd);
1043 #endif
1044     xfree (cd);
1045     return 0;
1046 }
1047
1048 /*
1049  * Local variables:
1050  * c-basic-offset: 4
1051  * indent-tabs-mode: nil
1052  * End:
1053  * vim: shiftwidth=4 tabstop=8 expandtab
1054  */
1055