00340067f6f34f4f15f982fa603c34a38ca76d73
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.25 2006-08-24 10:01:03 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  */
16
17 #if HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/yaz-util.h>
33
/* Character-set converters implemented outside this file (the
   yaz_marc8_?_conv handlers are described elsewhere in this file as
   generated by charconv.tcl).
   All share one signature: inp/inbytesleft describe the input, the
   number of bytes consumed is stored in *no_read, and *combining is the
   flag the callers in this file interpret as "combining character".
   Callers here treat a return value of 0 as "no match".
   yaz_marc8_N_conv: selected by the current MARC-8 escape mode in
   yaz_read_marc8_comb. */
34 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
35                                size_t *no_read, int *combining);
36 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
37                                size_t *no_read, int *combining);
38 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
39                                size_t *no_read, int *combining);
40 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
41                                size_t *no_read, int *combining);
42 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
43                                size_t *no_read, int *combining);
44 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
45                                size_t *no_read, int *combining);
46 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
47                                size_t *no_read, int *combining);
48 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
49                                size_t *no_read, int *combining);
50 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
51                                size_t *no_read, int *combining);
52
53
/* yaz_marc8r_N_conv: tried in order 1..9 by lookup_marc8, which feeds
   them the UTF-8 encoding of a character and uses the first non-zero
   result as the MARC-8 code. */
54 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
55                                 size_t *no_read, int *combining);
56 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
57                                 size_t *no_read, int *combining);
58 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
59                                 size_t *no_read, int *combining);
60 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
61                                 size_t *no_read, int *combining);
62 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
63                                 size_t *no_read, int *combining);
64 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
65                                 size_t *no_read, int *combining);
66 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
67                                 size_t *no_read, int *combining);
68 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
69                                 size_t *no_read, int *combining);
70 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
71                                 size_t *no_read, int *combining);
72
/* State of one conversion descriptor (the object behind yaz_iconv_t):
   the handlers chosen for the source and target encodings plus the
   per-encoding state those handlers keep between calls. */
73 struct yaz_iconv_struct {
    /* Error code recorded by the handlers (YAZ_ICONV_E2BIG, _EILSEQ,
       _EINVAL); yaz_iconv_open starts it at YAZ_ICONV_UNKNOWN. */
74     int my_errno;
    /* Not referenced in the visible part of this file. */
75     int init_flag;
    /* Optional start-of-input hook; yaz_init_UTF8 is installed for a
       UTF8 source. */
76     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77                           size_t inbytesleft, size_t *no_read);
    /* Decodes one character from inbuf; stores the bytes consumed in
       *no_read (0 when nothing could be decoded). */
78     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
79                                  size_t inbytesleft, size_t *no_read);
    /* Encodes character x into *outbuf; returns 0 or (size_t)-1 with
       my_errno set.  "last" asks writers that buffer output to flush. */
80     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
81                            char **outbuf, size_t *outbytesleft,
82                            int last);
    /* Final byte of the most recent MARC-8 escape sequence; selects the
       converter in yaz_read_marc8_comb.  Starts as 'B'. */
83     int marc8_esc_mode;
84
    /* MARC-8 reader look-ahead: combining characters read ahead of their
       base character (comb_x) with their byte counts (comb_no_read).
       comb_size entries are valid; comb_offset is the next to replay. */
85     int comb_offset;
86     int comb_size;
87     unsigned long comb_x[8];
88     size_t comb_no_read[8];
    /* no_read_x and unget_x are not referenced in the visible part of
       this file. */
89     size_t no_read_x;
90     unsigned long unget_x;
91 #if HAVE_ICONV_H
    /* System iconv descriptor; not used in the visible part of this
       file. */
92     iconv_t iconv_cd;
93 #endif
    /* ISO-8859-1 writer: character held back because the next one may
       be a combining mark that composes with it (0 = none). */
94     unsigned long compose_char;
95
    /* MARC-8 writer: pending combining bytes (yaz_write_marc8_2 stores
       at most 6), their count, the pending base character (0 = none)
       and the escape sequence of the page most recently selected. */
96     unsigned long write_marc8_comb_ch[8];
97     size_t write_marc8_comb_no;
98     unsigned long write_marc8_last;
99     const char *write_marc8_page_chr;
100 };
101
/* Latin-1 composition table.
   Each entry maps a base character (x1) followed by a Unicode combining
   mark (x2) to the single precomposed ISO-8859-1 code point (y).
   The table is only ever read, so it is const.  It is terminated by an
   all-zero entry; loops test x1 for the end. */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}            /* end-of-table marker */
};
172
173 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
174                                          size_t inbytesleft, size_t *no_read)
175 {
176     unsigned long x = inp[0];
177     *no_read = 1;
178     return x;
179 }
180
181 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
182                              size_t inbytesleft, size_t *no_read)
183 {
184     if (inp[0] != 0xef)
185     {
186         *no_read = 0;
187         return 0;
188     }
189     if (inbytesleft < 3)
190     {
191         cd->my_errno = YAZ_ICONV_EINVAL;
192         return (size_t) -1;
193     }
194     if (inp[1] != 0xbb && inp[2] == 0xbf)
195         *no_read = 3;
196     else
197         *no_read = 0;
198     return 0;
199 }
200
201 unsigned long yaz_read_UTF8_char(unsigned char *inp,
202                                  size_t inbytesleft, size_t *no_read,
203                                  int *error)
204 {
205     unsigned long x = 0;
206
207     if (inp[0] <= 0x7f)
208     {
209         x = inp[0];
210         *no_read = 1;
211     }
212     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
213     {
214         *no_read = 0;
215         *error = YAZ_ICONV_EILSEQ;
216     }
217     else if (inp[0] <= 0xdf && inbytesleft >= 2)
218     {
219         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
220         if (x >= 0x80)
221             *no_read = 2;
222         else
223         {
224             *no_read = 0;
225             *error = YAZ_ICONV_EILSEQ;
226         }
227     }
228     else if (inp[0] <= 0xef && inbytesleft >= 3)
229     {
230         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
231             (inp[2] & 0x3f);
232         if (x >= 0x800)
233             *no_read = 3;
234         else
235         {
236             *no_read = 0;
237             *error = YAZ_ICONV_EILSEQ;
238         }
239     }
240     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
241     {
242         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
243             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
244         if (x >= 0x10000)
245             *no_read = 4;
246         else
247         {
248             *no_read = 0;
249             *error = YAZ_ICONV_EILSEQ;
250         }
251     }
252     else if (inp[0] <= 0xfb && inbytesleft >= 5)
253     {
254         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
255             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
256             (inp[4] & 0x3f);
257         if (x >= 0x200000)
258             *no_read = 5;
259         else
260         {
261             *no_read = 0;
262             *error = YAZ_ICONV_EILSEQ;
263         }
264     }
265     else if (inp[0] <= 0xfd && inbytesleft >= 6)
266     {
267         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
268             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
269             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
270         if (x >= 0x4000000)
271             *no_read = 6;
272         else
273         {
274             *no_read = 0;
275             *error = YAZ_ICONV_EILSEQ;
276         }
277     }
278     else
279     {
280         *no_read = 0;
281         *error = YAZ_ICONV_EINVAL;
282     }
283     return x;
284 }
285
286 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
287                                     size_t inbytesleft, size_t *no_read)
288 {
289     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
290 }
291
292 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
293                                     size_t inbytesleft, size_t *no_read)
294 {
295     unsigned long x = 0;
296     
297     if (inbytesleft < 4)
298     {
299         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
300         *no_read = 0;
301     }
302     else
303     {
304         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
305         *no_read = 4;
306     }
307     return x;
308 }
309
310 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
311                                       size_t inbytesleft, size_t *no_read)
312 {
313     unsigned long x = 0;
314     
315     if (inbytesleft < 4)
316     {
317         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
318         *no_read = 0;
319     }
320     else
321     {
322         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
323         *no_read = 4;
324     }
325     return x;
326 }
327
328 #if HAVE_WCHAR_H
329 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
330                                        size_t inbytesleft, size_t *no_read)
331 {
332     unsigned long x = 0;
333     
334     if (inbytesleft < sizeof(wchar_t))
335     {
336         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
337         *no_read = 0;
338     }
339     else
340     {
341         wchar_t wch;
342         memcpy (&wch, inp, sizeof(wch));
343         x = wch;
344         *no_read = sizeof(wch);
345     }
346     return x;
347 }
348 #endif
349
350
351 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
352                                           size_t inbytesleft, size_t *no_read,
353                                           int *comb);
354
355 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
356                                      size_t inbytesleft, size_t *no_read)
357 {
358     unsigned long x;
359     if (cd->comb_offset < cd->comb_size)
360     {
361         *no_read = cd->comb_no_read[cd->comb_offset];
362         x = cd->comb_x[cd->comb_offset];
363
364         /* special case for double-diacritic combining characters, 
365            INVERTED BREVE and DOUBLE TILDE.
366            We'll increment the no_read counter by 1, since we want to skip over
367            the processing of the closing ligature character
368         */
369         /* this code is no longer necessary.. our handlers code in
370            yaz_marc8_?_conv (generated by charconv.tcl) now returns
371            0 and no_read=1 when a sequence does not match the input.
372            The SECOND HALFs in codetables.xml produces a non-existant
373            entry in the conversion trie.. Hence when met, the input byte is
374            skipped as it should (in yaz_iconv)
375         */
376 #if 0
377         if (x == 0x0361 || x == 0x0360)
378             *no_read += 1;
379 #endif
380         cd->comb_offset++;
381         return x;
382     }
383
384     cd->comb_offset = 0;
385     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
386     {
387         int comb = 0;
388         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
389         if (!comb || !x)
390             break;
391         cd->comb_x[cd->comb_size] = x;
392         cd->comb_no_read[cd->comb_size] = *no_read;
393         inp += *no_read;
394         inbytesleft = inbytesleft - *no_read;
395     }
396     return x;
397 }
398
399 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
400                                      size_t inbytesleft, size_t *no_read)
401 {
402     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
403     if (x && cd->comb_size == 1)
404     {
405         /* For MARC8s we try to get a Latin-1 page code out of it */
406         int i;
407         for (i = 0; latin1_comb[i].x1; i++)
408             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
409             {
410                 *no_read += cd->comb_no_read[0];
411                 cd->comb_size = 0;
412                 x = latin1_comb[i].y;
413                 break;
414             }
415     }
416     return x;
417 }
418
419 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
420                                          size_t inbytesleft, size_t *no_read,
421                                          int *comb)
422 {
423     *no_read = 0;
424     while(inbytesleft >= 1 && inp[0] == 27)
425     {
426         size_t inbytesleft0 = inbytesleft;
427         inp++;
428         inbytesleft--;
429         while(inbytesleft > 0 && strchr("(,$!", *inp))
430         {
431             inbytesleft--;
432             inp++;
433         }
434         if (inbytesleft <= 0)
435         {
436             *no_read = 0;
437             cd->my_errno = YAZ_ICONV_EINVAL;
438             return 0;
439         }
440         cd->marc8_esc_mode = *inp++;
441         inbytesleft--;
442         (*no_read) += inbytesleft0 - inbytesleft;
443     }
444     if (inbytesleft <= 0)
445         return 0;
446     else
447     {
448         unsigned long x;
449         size_t no_read_sub = 0;
450         *comb = 0;
451
452         switch(cd->marc8_esc_mode)
453         {
454         case 'B':  /* Basic ASCII */
455         case 'E':  /* ANSEL */
456         case 's':  /* ASCII */
457             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
458             break;
459         case 'g':  /* Greek */
460             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
461             break;
462         case 'b':  /* Subscripts */
463             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
464             break;
465         case 'p':  /* Superscripts */
466             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
467             break;
468         case '2':  /* Basic Hebrew */
469             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
470             break;
471         case 'N':  /* Basic Cyrillic */
472         case 'Q':  /* Extended Cyrillic */
473             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
474             break;
475         case '3':  /* Basic Arabic */
476         case '4':  /* Extended Arabic */
477             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
478             break;
479         case 'S':  /* Greek */
480             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
481             break;
482         case '1':  /* Chinese, Japanese, Korean (EACC) */
483             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
484             break;
485         default:
486             *no_read = 0;
487             cd->my_errno = YAZ_ICONV_EILSEQ;
488             return 0;
489         }
490         *no_read += no_read_sub;
491         return x;
492     }
493 }
494
495 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
496                              char **outbuf, size_t *outbytesleft,
497                              int last)
498 {
499     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
500 }
501
502 size_t yaz_write_UTF8_char(unsigned long x,
503                            char **outbuf, size_t *outbytesleft,
504                            int *error)
505 {
506     unsigned char *outp = (unsigned char *) *outbuf;
507
508     if (x <= 0x7f && *outbytesleft >= 1)
509     {
510         *outp++ = (unsigned char) x;
511         (*outbytesleft)--;
512     } 
513     else if (x <= 0x7ff && *outbytesleft >= 2)
514     {
515         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
516         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
517         (*outbytesleft) -= 2;
518     }
519     else if (x <= 0xffff && *outbytesleft >= 3)
520     {
521         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
522         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
523         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
524         (*outbytesleft) -= 3;
525     }
526     else if (x <= 0x1fffff && *outbytesleft >= 4)
527     {
528         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
529         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
530         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
531         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
532         (*outbytesleft) -= 4;
533     }
534     else if (x <= 0x3ffffff && *outbytesleft >= 5)
535     {
536         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
537         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
538         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
539         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
540         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
541         (*outbytesleft) -= 5;
542     }
543     else if (*outbytesleft >= 6)
544     {
545         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
546         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
547         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
548         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
549         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
550         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
551         (*outbytesleft) -= 6;
552     }
553     else 
554     {
555         *error = YAZ_ICONV_E2BIG;  /* not room for output */
556         return (size_t)(-1);
557     }
558     *outbuf = (char *) outp;
559     return 0;
560 }
561
562
563 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
564                                    char **outbuf, size_t *outbytesleft,
565                                    int last)
566 {
567     /* list of two char unicode sequence that, when combined, are
568        equivalent to single unicode chars that can be represented in
569        ISO-8859-1/Latin-1.
570        Regular iconv on Linux at least does not seem to convert these,
571        but since MARC-8 to UTF-8 generates these composed sequence
572        we get a better chance of a successful MARC-8 -> ISO-8859-1
573        conversion */
574     unsigned char *outp = (unsigned char *) *outbuf;
575
576     if (cd->compose_char)
577     {
578         int i;
579         for (i = 0; latin1_comb[i].x1; i++)
580             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
581             {
582                 x = latin1_comb[i].y;
583                 break;
584             }
585         if (*outbytesleft < 1)
586         {  /* no room. Retain compose_char and bail out */
587             cd->my_errno = YAZ_ICONV_E2BIG;
588             return (size_t)(-1);
589         }
590         if (!latin1_comb[i].x1) 
591         {   /* not found. Just write compose_char */
592             *outp++ = (unsigned char) cd->compose_char;
593             (*outbytesleft)--;
594             *outbuf = (char *) outp;
595         }
596         /* compose_char used so reset it. x now holds current char */
597         cd->compose_char = 0;
598     }
599
600     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
601     {
602         cd->compose_char = x;
603         return 0;
604     }
605     else if (x > 255 || x < 1)
606     {
607         cd->my_errno = YAZ_ICONV_EILSEQ;
608         return (size_t) -1;
609     }
610     else if (*outbytesleft < 1)
611     {
612         cd->my_errno = YAZ_ICONV_E2BIG;
613         return (size_t)(-1);
614     }
615     *outp++ = (unsigned char) x;
616     (*outbytesleft)--;
617     *outbuf = (char *) outp;
618     return 0;
619 }
620
621
622 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
623                               char **outbuf, size_t *outbytesleft,
624                               int last)
625 {
626     unsigned char *outp = (unsigned char *) *outbuf;
627     if (*outbytesleft >= 4)
628     {
629         *outp++ = (unsigned char) (x>>24);
630         *outp++ = (unsigned char) (x>>16);
631         *outp++ = (unsigned char) (x>>8);
632         *outp++ = (unsigned char) x;
633         (*outbytesleft) -= 4;
634     }
635     else
636     {
637         cd->my_errno = YAZ_ICONV_E2BIG;
638         return (size_t)(-1);
639     }
640     *outbuf = (char *) outp;
641     return 0;
642 }
643
644 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
645                                 char **outbuf, size_t *outbytesleft,
646                                 int last)
647 {
648     unsigned char *outp = (unsigned char *) *outbuf;
649     if (*outbytesleft >= 4)
650     {
651         *outp++ = (unsigned char) x;
652         *outp++ = (unsigned char) (x>>8);
653         *outp++ = (unsigned char) (x>>16);
654         *outp++ = (unsigned char) (x>>24);
655         (*outbytesleft) -= 4;
656     }
657     else
658     {
659         cd->my_errno = YAZ_ICONV_E2BIG;
660         return (size_t)(-1);
661     }
662     *outbuf = (char *) outp;
663     return 0;
664 }
665
666 static unsigned long lookup_marc8(yaz_iconv_t cd,
667                                   unsigned long x, int *comb,
668                                   const char **page_chr)
669 {
670     char utf8_buf[7];
671     char *utf8_outbuf = utf8_buf;
672     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
673
674     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
675     if (r == (size_t)(-1))
676     {
677         cd->my_errno = YAZ_ICONV_EILSEQ;
678         return 0;
679     }
680     else
681     {
682         unsigned char *inp;
683         size_t inbytesleft, no_read_sub = 0;
684         unsigned long x;
685
686         *utf8_outbuf = '\0';        
687         inp = (unsigned char *) utf8_buf;
688         inbytesleft = strlen(utf8_buf);
689         
690         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
691         if (x)
692         {
693             *page_chr = "\033(B";
694             return x;
695         }
696         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
697         if (x)
698         {
699             *page_chr = "\033g";
700             return x;
701         }
702         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
703         if (x)
704         {
705             *page_chr = "\033b";
706             return x;
707         }
708         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
709         if (x)
710         {
711             *page_chr = "\033p";
712             return x;
713         }
714         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
715         if (x)
716         {
717             *page_chr = "\033(2";
718             return x;
719         }
720         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
721         if (x)
722         {
723             *page_chr = "\033(N";
724             return x;
725         }
726         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
727         if (x)
728         {
729             *page_chr = "\033(3";
730             return x;
731         }
732         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
733         if (x)
734         {
735             *page_chr = "\033(S";
736             return x;
737         }
738         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
739         if (x)
740         {
741             *page_chr = "\033(1";
742             return x;
743         }
744         cd->my_errno = YAZ_ICONV_EILSEQ;
745         return x;
746     }
747 }
748
749 static size_t flush_combos(yaz_iconv_t cd,
750                            char **outbuf, size_t *outbytesleft)
751 {
752     unsigned long y = cd->write_marc8_last;
753     unsigned char byte, second_half = 0;
754     char out_buf[10];
755     size_t i, out_no = 0;
756
757     if (!y)
758         return 0;
759
760     byte = (unsigned char )((y>>16) & 0xff);
761     if (byte)
762         out_buf[out_no++] = byte;
763     byte = (unsigned char)((y>>8) & 0xff);
764     if (byte)
765         out_buf[out_no++] = byte;
766     byte = (unsigned char )(y & 0xff);
767     if (byte)
768         out_buf[out_no++] = byte;
769
770     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
771     {
772         cd->my_errno = YAZ_ICONV_E2BIG;
773         return (size_t) (-1);
774     }
775
776     for (i = 0; i < cd->write_marc8_comb_no; i++)
777     {
778         /* all MARC-8 combined characters are simple bytes */
779         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
780         if (byte == 0xEB)
781             second_half = 0xEC;
782         else if (byte == 0xFA)
783             second_half = 0xFB;
784
785         *(*outbuf)++ = byte;
786         (*outbytesleft)--;
787     }
788     memcpy(*outbuf, out_buf, out_no);
789     *outbuf += out_no;
790     (*outbytesleft) -= out_no;
791     if (second_half)
792     {
793         *(*outbuf)++ = second_half;
794         (*outbytesleft)--;
795     }        
796
797     cd->write_marc8_last = 0;
798     cd->write_marc8_comb_no = 0;
799     return 0;
800 }
801
802 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
803                                 char **outbuf, size_t *outbytesleft,
804                                 int last)
805 {
806     int comb = 0;
807     const char *page_chr = 0;
808     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
809
810     if (!y)
811         return (size_t) (-1);
812
813     if (comb)
814     {
815         if (cd->write_marc8_comb_no < 6)
816             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
817     }
818     else
819     {
820         size_t r = flush_combos(cd, outbuf, outbytesleft);
821         if (r)
822             return r;
823         if (strcmp(page_chr, cd->write_marc8_page_chr))
824         {
825             size_t plen = strlen(page_chr);
826
827             if (*outbytesleft < plen)
828             {
829                 cd->my_errno = YAZ_ICONV_E2BIG;
830                 return (size_t) (-1);
831             }
832             memcpy(*outbuf, page_chr, plen);
833             (*outbuf) += plen;
834             (*outbytesleft) -= plen;
835             cd->write_marc8_page_chr = page_chr;            
836         }
837         cd->write_marc8_last = y;
838     }
839     if (last)
840     {
841         size_t r = flush_combos(cd, outbuf, outbytesleft);
842         if (r)
843         {
844             if (comb)
845                 cd->write_marc8_comb_no--;
846             else
847                 cd->write_marc8_last = 0;
848             return r;
849         }
850     }
851     return 0;
852 }
853
854 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
855                               char **outbuf, size_t *outbytesleft,
856                               int last)
857 {
858     int i;
859     for (i = 0; latin1_comb[i].x1; i++)
860     {
861         if (x == latin1_comb[i].y)
862         {
863             size_t r ;
864             /* save the output pointers .. */
865             char *outbuf0 = *outbuf;
866             size_t outbytesleft0 = *outbytesleft;
867             int last_ch = cd->write_marc8_last;
868
869             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
870                                   outbuf, outbytesleft, 0);
871             if (r)
872                 return r;
873             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
874                                   outbuf, outbytesleft, last);
875             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
876             {
877                 /* not enough room. reset output to original values */
878                 *outbuf = outbuf0;
879                 *outbytesleft = outbytesleft0;
880                 cd->write_marc8_last = last_ch;
881             }
882             return r;
883         }
884     }
885     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
886 }
887
888
889 #if HAVE_WCHAR_H
890 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
891                                  char **outbuf, size_t *outbytesleft,
892                                  int last)
893 {
894     unsigned char *outp = (unsigned char *) *outbuf;
895
896     if (*outbytesleft >= sizeof(wchar_t))
897     {
898         wchar_t wch = x;
899         memcpy(outp, &wch, sizeof(wch));
900         outp += sizeof(wch);
901         (*outbytesleft) -= sizeof(wch);
902     }
903     else
904     {
905         cd->my_errno = YAZ_ICONV_E2BIG;
906         return (size_t)(-1);
907     }
908     *outbuf = (char *) outp;
909     return 0;
910 }
911 #endif
912
913 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
914 {
915     return cd->read_handle && cd->write_handle;
916 }
917
918 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
919 {
920     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
921
922     cd->write_handle = 0;
923     cd->read_handle = 0;
924     cd->init_handle = 0;
925     cd->my_errno = YAZ_ICONV_UNKNOWN;
926     cd->marc8_esc_mode = 'B';
927     cd->comb_offset = cd->comb_size = 0;
928     cd->compose_char = 0;
929
930     cd->write_marc8_comb_no = 0;
931     cd->write_marc8_last = 0;
932     cd->write_marc8_page_chr = "\033(B";
933
934     /* a useful hack: if fromcode has leading @,
935        the library not use YAZ's own conversions .. */
936     if (fromcode[0] == '@')
937         fromcode++;
938     else
939     {
940         if (!yaz_matchstr(fromcode, "UTF8"))
941         {
942             cd->read_handle = yaz_read_UTF8;
943             cd->init_handle = yaz_init_UTF8;
944         }
945         else if (!yaz_matchstr(fromcode, "ISO88591"))
946             cd->read_handle = yaz_read_ISO8859_1;
947         else if (!yaz_matchstr(fromcode, "UCS4"))
948             cd->read_handle = yaz_read_UCS4;
949         else if (!yaz_matchstr(fromcode, "UCS4LE"))
950             cd->read_handle = yaz_read_UCS4LE;
951         else if (!yaz_matchstr(fromcode, "MARC8"))
952             cd->read_handle = yaz_read_marc8;
953         else if (!yaz_matchstr(fromcode, "MARC8s"))
954             cd->read_handle = yaz_read_marc8s;
955 #if HAVE_WCHAR_H
956         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
957             cd->read_handle = yaz_read_wchar_t;
958 #endif
959         
960         if (!yaz_matchstr(tocode, "UTF8"))
961             cd->write_handle = yaz_write_UTF8;
962         else if (!yaz_matchstr(tocode, "ISO88591"))
963             cd->write_handle = yaz_write_ISO8859_1;
964         else if (!yaz_matchstr (tocode, "UCS4"))
965             cd->write_handle = yaz_write_UCS4;
966         else if (!yaz_matchstr(tocode, "UCS4LE"))
967             cd->write_handle = yaz_write_UCS4LE;
968         else if (!yaz_matchstr(tocode, "MARC8"))
969             cd->write_handle = yaz_write_marc8;
970         else if (!yaz_matchstr(tocode, "MARC8s"))
971             cd->write_handle = yaz_write_marc8;
972 #if HAVE_WCHAR_H
973         else if (!yaz_matchstr(tocode, "WCHAR_T"))
974             cd->write_handle = yaz_write_wchar_t;
975 #endif
976     }
977 #if HAVE_ICONV_H
978     cd->iconv_cd = 0;
979     if (!cd->read_handle || !cd->write_handle)
980     {
981         cd->iconv_cd = iconv_open (tocode, fromcode);
982         if (cd->iconv_cd == (iconv_t) (-1))
983         {
984             xfree (cd);
985             return 0;
986         }
987     }
988 #else
989     if (!cd->read_handle || !cd->write_handle)
990     {
991         xfree (cd);
992         return 0;
993     }
994 #endif
995     cd->init_flag = 1;
996     return cd;
997 }
998
999 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1000                  char **outbuf, size_t *outbytesleft)
1001 {
1002     char *inbuf0;
1003     size_t r = 0;
1004
1005 #if HAVE_ICONV_H
1006     if (cd->iconv_cd)
1007     {
1008         size_t r =
1009             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1010         if (r == (size_t)(-1))
1011         {
1012             switch (yaz_errno())
1013             {
1014             case E2BIG:
1015                 cd->my_errno = YAZ_ICONV_E2BIG;
1016                 break;
1017             case EINVAL:
1018                 cd->my_errno = YAZ_ICONV_EINVAL;
1019                 break;
1020             case EILSEQ:
1021                 cd->my_errno = YAZ_ICONV_EILSEQ;
1022                 break;
1023             default:
1024                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1025             }
1026         }
1027         return r;
1028     }
1029 #endif
1030     if (inbuf == 0 || *inbuf == 0)
1031     {
1032         cd->init_flag = 1;
1033         cd->my_errno = YAZ_ICONV_UNKNOWN;
1034         return 0;
1035     }
1036     inbuf0 = *inbuf;
1037
1038     if (cd->init_flag)
1039     {
1040         if (cd->init_handle)
1041         {
1042             size_t no_read;
1043             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1044                                          *inbytesleft, &no_read);
1045             if (r)
1046             {
1047                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1048                     return r;
1049                 cd->init_flag = 0;
1050                 return r;
1051             }
1052             *inbytesleft -= no_read;
1053             *inbuf += no_read;
1054         }
1055         cd->init_flag = 0;
1056         cd->unget_x = 0;
1057         cd->no_read_x = 0;
1058     }
1059     while (1)
1060     {
1061         unsigned long x;
1062         size_t no_read;
1063
1064         if (*inbytesleft == 0)
1065         {
1066             r = *inbuf - inbuf0;
1067             break;
1068         }
1069         if (!cd->unget_x)
1070         {
1071             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1072                                   &no_read);
1073             if (no_read == 0)
1074             {
1075                 r = (size_t)(-1);
1076                 break;
1077             }
1078         }
1079         else
1080         {
1081             x = cd->unget_x;
1082             no_read = cd->no_read_x;
1083         }
1084         if (x)
1085         {
1086             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1087                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1088             if (r)
1089             {
1090                 /* unable to write it. save it because read_handle cannot
1091                    rewind .. */
1092                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1093                 {
1094                     cd->unget_x = x;
1095                     cd->no_read_x = no_read;
1096                     break;
1097                 }
1098             }
1099             cd->unget_x = 0;
1100         }
1101         *inbytesleft -= no_read;
1102         (*inbuf) += no_read;
1103     }
1104     return r;
1105 }
1106
1107 int yaz_iconv_error (yaz_iconv_t cd)
1108 {
1109     return cd->my_errno;
1110 }
1111
1112 int yaz_iconv_close (yaz_iconv_t cd)
1113 {
1114 #if HAVE_ICONV_H
1115     if (cd->iconv_cd)
1116         iconv_close (cd->iconv_cd);
1117 #endif
1118     xfree (cd);
1119     return 0;
1120 }
1121
1122 /*
1123  * Local variables:
1124  * c-basic-offset: 4
1125  * indent-tabs-mode: nil
1126  * End:
1127  * vim: shiftwidth=4 tabstop=8 expandtab
1128  */
1129