cc902101fc6b117b5f3eafbb09c26fcc12864c43
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.26 2006-08-27 19:04:03 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/*
 * Table-driven conversion routines defined outside this file.
 *
 * All of them share one signature: they inspect at most inbytesleft
 * bytes at inp, return the converted character (0 when nothing
 * matched), store the number of input bytes consumed in *no_read and
 * flag a combining character through *combining.
 *
 * yaz_marc8_N_conv  : used by yaz_read_marc8_comb (MARC-8 input).
 * yaz_marc8r_N_conv : used by lookup_marc8 (UTF-8 bytes to MARC-8).
 */
#define YAZ_MARC8_CONV_PROTO(fn) \
    unsigned long fn(unsigned char *inp, size_t inbytesleft, \
                     size_t *no_read, int *combining)

YAZ_MARC8_CONV_PROTO(yaz_marc8_1_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_2_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_3_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_4_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_5_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_6_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_7_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_8_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8_9_conv);

YAZ_MARC8_CONV_PROTO(yaz_marc8r_1_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_2_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_3_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_4_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_5_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_6_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_7_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_8_conv);
YAZ_MARC8_CONV_PROTO(yaz_marc8r_9_conv);

#undef YAZ_MARC8_CONV_PROTO
77
78 struct yaz_iconv_struct {
79     int my_errno;
80     int init_flag;
81     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82                           size_t inbytesleft, size_t *no_read);
83     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84                                  size_t inbytesleft, size_t *no_read);
85     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86                            char **outbuf, size_t *outbytesleft,
87                            int last);
88     int marc8_esc_mode;
89
90     int comb_offset;
91     int comb_size;
92     unsigned long comb_x[8];
93     size_t comb_no_read[8];
94     size_t no_read_x;
95     unsigned long unget_x;
96 #if HAVE_ICONV_H
97     iconv_t iconv_cd;
98 #endif
99     unsigned long compose_char;
100
101     unsigned long write_marc8_comb_ch[8];
102     size_t write_marc8_comb_no;
103     unsigned long write_marc8_last;
104     const char *write_marc8_page_chr;
105 };
106
/*
 * Base letter (x1) + combining mark (x2) pairs and the single
 * ISO-8859-1 code (y) each pair is equivalent to.  The list ends with
 * an all-zero entry.
 *
 * Combining marks used: 0x0300 grave, 0x0301 acute, 0x0302 circumflex,
 * 0x0303 tilde, 0x0308 diaeresis, 0x030a ring above, 0x0327 cedilla.
 *
 * Latin-1 codes without an entry here: 0xc6 AE, 0xd7 multiplication
 * sign, 0xd8 O with stroke, 0xde THORN, 0xdf sharp s, 0xe6 ae,
 * 0xf0 eth, 0xf7 division sign, 0xf8 o with stroke, 0xfe thorn.
 */
static struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    /* capital A */
    { 'A', 0x0300, 0xc0 },  /* grave */
    { 'A', 0x0301, 0xc1 },  /* acute */
    { 'A', 0x0302, 0xc2 },  /* circumflex */
    { 'A', 0x0303, 0xc3 },  /* tilde */
    { 'A', 0x0308, 0xc4 },  /* diaeresis */
    { 'A', 0x030a, 0xc5 },  /* ring above */
    /* capital C */
    { 'C', 0x0327, 0xc7 },  /* cedilla */
    /* capital E */
    { 'E', 0x0300, 0xc8 },  /* grave */
    { 'E', 0x0301, 0xc9 },  /* acute */
    { 'E', 0x0302, 0xca },  /* circumflex */
    { 'E', 0x0308, 0xcb },  /* diaeresis */
    /* capital I */
    { 'I', 0x0300, 0xcc },  /* grave */
    { 'I', 0x0301, 0xcd },  /* acute */
    { 'I', 0x0302, 0xce },  /* circumflex */
    { 'I', 0x0308, 0xcf },  /* diaeresis */
    /* capital N */
    { 'N', 0x0303, 0xd1 },  /* tilde */
    /* capital O */
    { 'O', 0x0300, 0xd2 },  /* grave */
    { 'O', 0x0301, 0xd3 },  /* acute */
    { 'O', 0x0302, 0xd4 },  /* circumflex */
    { 'O', 0x0303, 0xd5 },  /* tilde */
    { 'O', 0x0308, 0xd6 },  /* diaeresis */
    /* capital U */
    { 'U', 0x0300, 0xd9 },  /* grave */
    { 'U', 0x0301, 0xda },  /* acute */
    { 'U', 0x0302, 0xdb },  /* circumflex */
    { 'U', 0x0308, 0xdc },  /* diaeresis */
    /* capital Y */
    { 'Y', 0x0301, 0xdd },  /* acute */

    /* small a */
    { 'a', 0x0300, 0xe0 },  /* grave */
    { 'a', 0x0301, 0xe1 },  /* acute */
    { 'a', 0x0302, 0xe2 },  /* circumflex */
    { 'a', 0x0303, 0xe3 },  /* tilde */
    { 'a', 0x0308, 0xe4 },  /* diaeresis */
    { 'a', 0x030a, 0xe5 },  /* ring above */
    /* small c */
    { 'c', 0x0327, 0xe7 },  /* cedilla */
    /* small e */
    { 'e', 0x0300, 0xe8 },  /* grave */
    { 'e', 0x0301, 0xe9 },  /* acute */
    { 'e', 0x0302, 0xea },  /* circumflex */
    { 'e', 0x0308, 0xeb },  /* diaeresis */
    /* small i */
    { 'i', 0x0300, 0xec },  /* grave */
    { 'i', 0x0301, 0xed },  /* acute */
    { 'i', 0x0302, 0xee },  /* circumflex */
    { 'i', 0x0308, 0xef },  /* diaeresis */
    /* small n */
    { 'n', 0x0303, 0xf1 },  /* tilde */
    /* small o */
    { 'o', 0x0300, 0xf2 },  /* grave */
    { 'o', 0x0301, 0xf3 },  /* acute */
    { 'o', 0x0302, 0xf4 },  /* circumflex */
    { 'o', 0x0303, 0xf5 },  /* tilde */
    { 'o', 0x0308, 0xf6 },  /* diaeresis */
    /* small u */
    { 'u', 0x0300, 0xf9 },  /* grave */
    { 'u', 0x0301, 0xfa },  /* acute */
    { 'u', 0x0302, 0xfb },  /* circumflex */
    { 'u', 0x0308, 0xfc },  /* diaeresis */
    /* small y */
    { 'y', 0x0301, 0xfd },  /* acute */
    { 'y', 0x0308, 0xff },  /* diaeresis */

    /* end marker: lookups stop at x1 == 0 */
    { 0, 0, 0 }
};
177
178 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
179                                          size_t inbytesleft, size_t *no_read)
180 {
181     unsigned long x = inp[0];
182     *no_read = 1;
183     return x;
184 }
185
186 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
187                              size_t inbytesleft, size_t *no_read)
188 {
189     if (inp[0] != 0xef)
190     {
191         *no_read = 0;
192         return 0;
193     }
194     if (inbytesleft < 3)
195     {
196         cd->my_errno = YAZ_ICONV_EINVAL;
197         return (size_t) -1;
198     }
199     if (inp[1] != 0xbb && inp[2] == 0xbf)
200         *no_read = 3;
201     else
202         *no_read = 0;
203     return 0;
204 }
205
206 unsigned long yaz_read_UTF8_char(unsigned char *inp,
207                                  size_t inbytesleft, size_t *no_read,
208                                  int *error)
209 {
210     unsigned long x = 0;
211
212     if (inp[0] <= 0x7f)
213     {
214         x = inp[0];
215         *no_read = 1;
216     }
217     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
218     {
219         *no_read = 0;
220         *error = YAZ_ICONV_EILSEQ;
221     }
222     else if (inp[0] <= 0xdf && inbytesleft >= 2)
223     {
224         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
225         if (x >= 0x80)
226             *no_read = 2;
227         else
228         {
229             *no_read = 0;
230             *error = YAZ_ICONV_EILSEQ;
231         }
232     }
233     else if (inp[0] <= 0xef && inbytesleft >= 3)
234     {
235         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
236             (inp[2] & 0x3f);
237         if (x >= 0x800)
238             *no_read = 3;
239         else
240         {
241             *no_read = 0;
242             *error = YAZ_ICONV_EILSEQ;
243         }
244     }
245     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
246     {
247         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
248             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
249         if (x >= 0x10000)
250             *no_read = 4;
251         else
252         {
253             *no_read = 0;
254             *error = YAZ_ICONV_EILSEQ;
255         }
256     }
257     else if (inp[0] <= 0xfb && inbytesleft >= 5)
258     {
259         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
260             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
261             (inp[4] & 0x3f);
262         if (x >= 0x200000)
263             *no_read = 5;
264         else
265         {
266             *no_read = 0;
267             *error = YAZ_ICONV_EILSEQ;
268         }
269     }
270     else if (inp[0] <= 0xfd && inbytesleft >= 6)
271     {
272         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
273             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
274             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
275         if (x >= 0x4000000)
276             *no_read = 6;
277         else
278         {
279             *no_read = 0;
280             *error = YAZ_ICONV_EILSEQ;
281         }
282     }
283     else
284     {
285         *no_read = 0;
286         *error = YAZ_ICONV_EINVAL;
287     }
288     return x;
289 }
290
291 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
292                                     size_t inbytesleft, size_t *no_read)
293 {
294     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
295 }
296
297 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
298                                     size_t inbytesleft, size_t *no_read)
299 {
300     unsigned long x = 0;
301     
302     if (inbytesleft < 4)
303     {
304         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
305         *no_read = 0;
306     }
307     else
308     {
309         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
310         *no_read = 4;
311     }
312     return x;
313 }
314
315 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
316                                       size_t inbytesleft, size_t *no_read)
317 {
318     unsigned long x = 0;
319     
320     if (inbytesleft < 4)
321     {
322         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
323         *no_read = 0;
324     }
325     else
326     {
327         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
328         *no_read = 4;
329     }
330     return x;
331 }
332
#if HAVE_WCHAR_H
/*
 * Reads one wchar_t stored in the platform's native representation.
 *
 * Consumes sizeof(wchar_t) bytes.  With less input than that,
 * my_errno is set to YAZ_ICONV_EINVAL, *no_read to 0 and 0 is
 * returned.
 */
static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
                                       size_t inbytesleft, size_t *no_read)
{
    wchar_t wch;

    if (inbytesleft < sizeof(wch))
    {
        cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
        *no_read = 0;
        return 0;
    }
    /* copy rather than cast: inp need not be aligned for wchar_t */
    memcpy (&wch, inp, sizeof(wch));
    *no_read = sizeof(wch);
    return (unsigned long) wch;
}
#endif
354
355
356 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
357                                           size_t inbytesleft, size_t *no_read,
358                                           int *comb);
359
360 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
361                                      size_t inbytesleft, size_t *no_read)
362 {
363     unsigned long x;
364     if (cd->comb_offset < cd->comb_size)
365     {
366         *no_read = cd->comb_no_read[cd->comb_offset];
367         x = cd->comb_x[cd->comb_offset];
368
369         /* special case for double-diacritic combining characters, 
370            INVERTED BREVE and DOUBLE TILDE.
371            We'll increment the no_read counter by 1, since we want to skip over
372            the processing of the closing ligature character
373         */
374         /* this code is no longer necessary.. our handlers code in
375            yaz_marc8_?_conv (generated by charconv.tcl) now returns
376            0 and no_read=1 when a sequence does not match the input.
377            The SECOND HALFs in codetables.xml produces a non-existant
378            entry in the conversion trie.. Hence when met, the input byte is
379            skipped as it should (in yaz_iconv)
380         */
381 #if 0
382         if (x == 0x0361 || x == 0x0360)
383             *no_read += 1;
384 #endif
385         cd->comb_offset++;
386         return x;
387     }
388
389     cd->comb_offset = 0;
390     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
391     {
392         int comb = 0;
393         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
394         if (!comb || !x)
395             break;
396         cd->comb_x[cd->comb_size] = x;
397         cd->comb_no_read[cd->comb_size] = *no_read;
398         inp += *no_read;
399         inbytesleft = inbytesleft - *no_read;
400     }
401     return x;
402 }
403
404 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
405                                      size_t inbytesleft, size_t *no_read)
406 {
407     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
408     if (x && cd->comb_size == 1)
409     {
410         /* For MARC8s we try to get a Latin-1 page code out of it */
411         int i;
412         for (i = 0; latin1_comb[i].x1; i++)
413             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
414             {
415                 *no_read += cd->comb_no_read[0];
416                 cd->comb_size = 0;
417                 x = latin1_comb[i].y;
418                 break;
419             }
420     }
421     return x;
422 }
423
424 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
425                                          size_t inbytesleft, size_t *no_read,
426                                          int *comb)
427 {
428     *no_read = 0;
429     while(inbytesleft >= 1 && inp[0] == 27)
430     {
431         size_t inbytesleft0 = inbytesleft;
432         inp++;
433         inbytesleft--;
434         while(inbytesleft > 0 && strchr("(,$!", *inp))
435         {
436             inbytesleft--;
437             inp++;
438         }
439         if (inbytesleft <= 0)
440         {
441             *no_read = 0;
442             cd->my_errno = YAZ_ICONV_EINVAL;
443             return 0;
444         }
445         cd->marc8_esc_mode = *inp++;
446         inbytesleft--;
447         (*no_read) += inbytesleft0 - inbytesleft;
448     }
449     if (inbytesleft <= 0)
450         return 0;
451     else
452     {
453         unsigned long x;
454         size_t no_read_sub = 0;
455         *comb = 0;
456
457         switch(cd->marc8_esc_mode)
458         {
459         case 'B':  /* Basic ASCII */
460         case 'E':  /* ANSEL */
461         case 's':  /* ASCII */
462             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
463             break;
464         case 'g':  /* Greek */
465             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
466             break;
467         case 'b':  /* Subscripts */
468             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
469             break;
470         case 'p':  /* Superscripts */
471             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
472             break;
473         case '2':  /* Basic Hebrew */
474             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
475             break;
476         case 'N':  /* Basic Cyrillic */
477         case 'Q':  /* Extended Cyrillic */
478             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
479             break;
480         case '3':  /* Basic Arabic */
481         case '4':  /* Extended Arabic */
482             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
483             break;
484         case 'S':  /* Greek */
485             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
486             break;
487         case '1':  /* Chinese, Japanese, Korean (EACC) */
488             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
489             break;
490         default:
491             *no_read = 0;
492             cd->my_errno = YAZ_ICONV_EILSEQ;
493             return 0;
494         }
495         *no_read += no_read_sub;
496         return x;
497     }
498 }
499
500 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
501                              char **outbuf, size_t *outbytesleft,
502                              int last)
503 {
504     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
505 }
506
507 size_t yaz_write_UTF8_char(unsigned long x,
508                            char **outbuf, size_t *outbytesleft,
509                            int *error)
510 {
511     unsigned char *outp = (unsigned char *) *outbuf;
512
513     if (x <= 0x7f && *outbytesleft >= 1)
514     {
515         *outp++ = (unsigned char) x;
516         (*outbytesleft)--;
517     } 
518     else if (x <= 0x7ff && *outbytesleft >= 2)
519     {
520         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
521         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
522         (*outbytesleft) -= 2;
523     }
524     else if (x <= 0xffff && *outbytesleft >= 3)
525     {
526         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
527         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
528         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
529         (*outbytesleft) -= 3;
530     }
531     else if (x <= 0x1fffff && *outbytesleft >= 4)
532     {
533         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
534         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
535         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
536         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
537         (*outbytesleft) -= 4;
538     }
539     else if (x <= 0x3ffffff && *outbytesleft >= 5)
540     {
541         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
542         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
543         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
544         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
545         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
546         (*outbytesleft) -= 5;
547     }
548     else if (*outbytesleft >= 6)
549     {
550         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
551         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
552         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
553         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
554         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
555         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
556         (*outbytesleft) -= 6;
557     }
558     else 
559     {
560         *error = YAZ_ICONV_E2BIG;  /* not room for output */
561         return (size_t)(-1);
562     }
563     *outbuf = (char *) outp;
564     return 0;
565 }
566
567
568 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
569                                    char **outbuf, size_t *outbytesleft,
570                                    int last)
571 {
572     /* list of two char unicode sequence that, when combined, are
573        equivalent to single unicode chars that can be represented in
574        ISO-8859-1/Latin-1.
575        Regular iconv on Linux at least does not seem to convert these,
576        but since MARC-8 to UTF-8 generates these composed sequence
577        we get a better chance of a successful MARC-8 -> ISO-8859-1
578        conversion */
579     unsigned char *outp = (unsigned char *) *outbuf;
580
581     if (cd->compose_char)
582     {
583         int i;
584         for (i = 0; latin1_comb[i].x1; i++)
585             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
586             {
587                 x = latin1_comb[i].y;
588                 break;
589             }
590         if (*outbytesleft < 1)
591         {  /* no room. Retain compose_char and bail out */
592             cd->my_errno = YAZ_ICONV_E2BIG;
593             return (size_t)(-1);
594         }
595         if (!latin1_comb[i].x1) 
596         {   /* not found. Just write compose_char */
597             *outp++ = (unsigned char) cd->compose_char;
598             (*outbytesleft)--;
599             *outbuf = (char *) outp;
600         }
601         /* compose_char used so reset it. x now holds current char */
602         cd->compose_char = 0;
603     }
604
605     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
606     {
607         cd->compose_char = x;
608         return 0;
609     }
610     else if (x > 255 || x < 1)
611     {
612         cd->my_errno = YAZ_ICONV_EILSEQ;
613         return (size_t) -1;
614     }
615     else if (*outbytesleft < 1)
616     {
617         cd->my_errno = YAZ_ICONV_E2BIG;
618         return (size_t)(-1);
619     }
620     *outp++ = (unsigned char) x;
621     (*outbytesleft)--;
622     *outbuf = (char *) outp;
623     return 0;
624 }
625
626
627 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
628                               char **outbuf, size_t *outbytesleft,
629                               int last)
630 {
631     unsigned char *outp = (unsigned char *) *outbuf;
632     if (*outbytesleft >= 4)
633     {
634         *outp++ = (unsigned char) (x>>24);
635         *outp++ = (unsigned char) (x>>16);
636         *outp++ = (unsigned char) (x>>8);
637         *outp++ = (unsigned char) x;
638         (*outbytesleft) -= 4;
639     }
640     else
641     {
642         cd->my_errno = YAZ_ICONV_E2BIG;
643         return (size_t)(-1);
644     }
645     *outbuf = (char *) outp;
646     return 0;
647 }
648
649 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
650                                 char **outbuf, size_t *outbytesleft,
651                                 int last)
652 {
653     unsigned char *outp = (unsigned char *) *outbuf;
654     if (*outbytesleft >= 4)
655     {
656         *outp++ = (unsigned char) x;
657         *outp++ = (unsigned char) (x>>8);
658         *outp++ = (unsigned char) (x>>16);
659         *outp++ = (unsigned char) (x>>24);
660         (*outbytesleft) -= 4;
661     }
662     else
663     {
664         cd->my_errno = YAZ_ICONV_E2BIG;
665         return (size_t)(-1);
666     }
667     *outbuf = (char *) outp;
668     return 0;
669 }
670
671 static unsigned long lookup_marc8(yaz_iconv_t cd,
672                                   unsigned long x, int *comb,
673                                   const char **page_chr)
674 {
675     char utf8_buf[7];
676     char *utf8_outbuf = utf8_buf;
677     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
678
679     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
680     if (r == (size_t)(-1))
681     {
682         cd->my_errno = YAZ_ICONV_EILSEQ;
683         return 0;
684     }
685     else
686     {
687         unsigned char *inp;
688         size_t inbytesleft, no_read_sub = 0;
689         unsigned long x;
690
691         *utf8_outbuf = '\0';        
692         inp = (unsigned char *) utf8_buf;
693         inbytesleft = strlen(utf8_buf);
694         
695         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
696         if (x)
697         {
698             *page_chr = "\033(B";
699             return x;
700         }
701         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
702         if (x)
703         {
704             *page_chr = "\033g";
705             return x;
706         }
707         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
708         if (x)
709         {
710             *page_chr = "\033b";
711             return x;
712         }
713         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
714         if (x)
715         {
716             *page_chr = "\033p";
717             return x;
718         }
719         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
720         if (x)
721         {
722             *page_chr = "\033(2";
723             return x;
724         }
725         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
726         if (x)
727         {
728             *page_chr = "\033(N";
729             return x;
730         }
731         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
732         if (x)
733         {
734             *page_chr = "\033(3";
735             return x;
736         }
737         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
738         if (x)
739         {
740             *page_chr = "\033(S";
741             return x;
742         }
743         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
744         if (x)
745         {
746             *page_chr = "\033(1";
747             return x;
748         }
749         cd->my_errno = YAZ_ICONV_EILSEQ;
750         return x;
751     }
752 }
753
754 static size_t flush_combos(yaz_iconv_t cd,
755                            char **outbuf, size_t *outbytesleft)
756 {
757     unsigned long y = cd->write_marc8_last;
758     unsigned char byte, second_half = 0;
759     char out_buf[10];
760     size_t i, out_no = 0;
761
762     if (!y)
763         return 0;
764
765     byte = (unsigned char )((y>>16) & 0xff);
766     if (byte)
767         out_buf[out_no++] = byte;
768     byte = (unsigned char)((y>>8) & 0xff);
769     if (byte)
770         out_buf[out_no++] = byte;
771     byte = (unsigned char )(y & 0xff);
772     if (byte)
773         out_buf[out_no++] = byte;
774
775     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
776     {
777         cd->my_errno = YAZ_ICONV_E2BIG;
778         return (size_t) (-1);
779     }
780
781     for (i = 0; i < cd->write_marc8_comb_no; i++)
782     {
783         /* all MARC-8 combined characters are simple bytes */
784         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
785         if (byte == 0xEB)
786             second_half = 0xEC;
787         else if (byte == 0xFA)
788             second_half = 0xFB;
789
790         *(*outbuf)++ = byte;
791         (*outbytesleft)--;
792     }
793     memcpy(*outbuf, out_buf, out_no);
794     *outbuf += out_no;
795     (*outbytesleft) -= out_no;
796     if (second_half)
797     {
798         *(*outbuf)++ = second_half;
799         (*outbytesleft)--;
800     }        
801
802     cd->write_marc8_last = 0;
803     cd->write_marc8_comb_no = 0;
804     return 0;
805 }
806
807 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
808                                 char **outbuf, size_t *outbytesleft,
809                                 int last)
810 {
811     int comb = 0;
812     const char *page_chr = 0;
813     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
814
815     if (!y)
816         return (size_t) (-1);
817
818     if (comb)
819     {
820         if (cd->write_marc8_comb_no < 6)
821             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
822     }
823     else
824     {
825         size_t r = flush_combos(cd, outbuf, outbytesleft);
826         const char *old_page_chr = cd->write_marc8_page_chr;
827         if (r)
828             return r;
829         if (strcmp(page_chr, old_page_chr))
830         {
831             size_t plen = 0;
832             const char *page_out = page_chr;
833
834             if (*outbytesleft < 8)
835             {
836                 cd->my_errno = YAZ_ICONV_E2BIG;
837                 
838                 return (size_t) (-1);
839             }
840             cd->write_marc8_page_chr = page_chr;
841
842             if (!strcmp(old_page_chr, "\033p") 
843                 || !strcmp(old_page_chr, "\033g")
844                 || !strcmp(old_page_chr, "\033b"))
845             {
846                 /* Technique 1 leave */
847                 page_out = "\033s";
848                 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
849                 {
850                     /* Must leave script + enter new page */
851                     plen = strlen(page_out);
852                     memcpy(*outbuf, page_out, plen);
853                     (*outbuf) += plen;
854                     (*outbytesleft) -= plen;
855                     page_out = page_chr;
856                 }
857             }
858             plen = strlen(page_out);
859             memcpy(*outbuf, page_out, plen);
860             (*outbuf) += plen;
861             (*outbytesleft) -= plen;
862         }
863         cd->write_marc8_last = y;
864     }
865     if (last)
866     {
867         size_t r = flush_combos(cd, outbuf, outbytesleft);
868         if (r)
869         {
870             if (comb)
871                 cd->write_marc8_comb_no--;
872             else
873                 cd->write_marc8_last = 0;
874             return r;
875         }
876     }
877     return 0;
878 }
879
880 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
881                               char **outbuf, size_t *outbytesleft,
882                               int last)
883 {
884     int i;
885     for (i = 0; latin1_comb[i].x1; i++)
886     {
887         if (x == latin1_comb[i].y)
888         {
889             size_t r ;
890             /* save the output pointers .. */
891             char *outbuf0 = *outbuf;
892             size_t outbytesleft0 = *outbytesleft;
893             int last_ch = cd->write_marc8_last;
894
895             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
896                                   outbuf, outbytesleft, 0);
897             if (r)
898                 return r;
899             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
900                                   outbuf, outbytesleft, last);
901             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
902             {
903                 /* not enough room. reset output to original values */
904                 *outbuf = outbuf0;
905                 *outbytesleft = outbytesleft0;
906                 cd->write_marc8_last = last_ch;
907             }
908             return r;
909         }
910     }
911     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
912 }
913
914
915 #if HAVE_WCHAR_H
916 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
917                                  char **outbuf, size_t *outbytesleft,
918                                  int last)
919 {
920     unsigned char *outp = (unsigned char *) *outbuf;
921
922     if (*outbytesleft >= sizeof(wchar_t))
923     {
924         wchar_t wch = x;
925         memcpy(outp, &wch, sizeof(wch));
926         outp += sizeof(wch);
927         (*outbytesleft) -= sizeof(wch);
928     }
929     else
930     {
931         cd->my_errno = YAZ_ICONV_E2BIG;
932         return (size_t)(-1);
933     }
934     *outbuf = (char *) outp;
935     return 0;
936 }
937 #endif
938
939 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
940 {
941     return cd->read_handle && cd->write_handle;
942 }
943
944 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
945 {
946     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
947
948     cd->write_handle = 0;
949     cd->read_handle = 0;
950     cd->init_handle = 0;
951     cd->my_errno = YAZ_ICONV_UNKNOWN;
952     cd->marc8_esc_mode = 'B';
953     cd->comb_offset = cd->comb_size = 0;
954     cd->compose_char = 0;
955
956     cd->write_marc8_comb_no = 0;
957     cd->write_marc8_last = 0;
958     cd->write_marc8_page_chr = "\033(B";
959
960     /* a useful hack: if fromcode has leading @,
961        the library not use YAZ's own conversions .. */
962     if (fromcode[0] == '@')
963         fromcode++;
964     else
965     {
966         if (!yaz_matchstr(fromcode, "UTF8"))
967         {
968             cd->read_handle = yaz_read_UTF8;
969             cd->init_handle = yaz_init_UTF8;
970         }
971         else if (!yaz_matchstr(fromcode, "ISO88591"))
972             cd->read_handle = yaz_read_ISO8859_1;
973         else if (!yaz_matchstr(fromcode, "UCS4"))
974             cd->read_handle = yaz_read_UCS4;
975         else if (!yaz_matchstr(fromcode, "UCS4LE"))
976             cd->read_handle = yaz_read_UCS4LE;
977         else if (!yaz_matchstr(fromcode, "MARC8"))
978             cd->read_handle = yaz_read_marc8;
979         else if (!yaz_matchstr(fromcode, "MARC8s"))
980             cd->read_handle = yaz_read_marc8s;
981 #if HAVE_WCHAR_H
982         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
983             cd->read_handle = yaz_read_wchar_t;
984 #endif
985         
986         if (!yaz_matchstr(tocode, "UTF8"))
987             cd->write_handle = yaz_write_UTF8;
988         else if (!yaz_matchstr(tocode, "ISO88591"))
989             cd->write_handle = yaz_write_ISO8859_1;
990         else if (!yaz_matchstr (tocode, "UCS4"))
991             cd->write_handle = yaz_write_UCS4;
992         else if (!yaz_matchstr(tocode, "UCS4LE"))
993             cd->write_handle = yaz_write_UCS4LE;
994         else if (!yaz_matchstr(tocode, "MARC8"))
995             cd->write_handle = yaz_write_marc8;
996         else if (!yaz_matchstr(tocode, "MARC8s"))
997             cd->write_handle = yaz_write_marc8;
998 #if HAVE_WCHAR_H
999         else if (!yaz_matchstr(tocode, "WCHAR_T"))
1000             cd->write_handle = yaz_write_wchar_t;
1001 #endif
1002     }
1003 #if HAVE_ICONV_H
1004     cd->iconv_cd = 0;
1005     if (!cd->read_handle || !cd->write_handle)
1006     {
1007         cd->iconv_cd = iconv_open (tocode, fromcode);
1008         if (cd->iconv_cd == (iconv_t) (-1))
1009         {
1010             xfree (cd);
1011             return 0;
1012         }
1013     }
1014 #else
1015     if (!cd->read_handle || !cd->write_handle)
1016     {
1017         xfree (cd);
1018         return 0;
1019     }
1020 #endif
1021     cd->init_flag = 1;
1022     return cd;
1023 }
1024
1025 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1026                  char **outbuf, size_t *outbytesleft)
1027 {
1028     char *inbuf0;
1029     size_t r = 0;
1030
1031 #if HAVE_ICONV_H
1032     if (cd->iconv_cd)
1033     {
1034         size_t r =
1035             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1036         if (r == (size_t)(-1))
1037         {
1038             switch (yaz_errno())
1039             {
1040             case E2BIG:
1041                 cd->my_errno = YAZ_ICONV_E2BIG;
1042                 break;
1043             case EINVAL:
1044                 cd->my_errno = YAZ_ICONV_EINVAL;
1045                 break;
1046             case EILSEQ:
1047                 cd->my_errno = YAZ_ICONV_EILSEQ;
1048                 break;
1049             default:
1050                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1051             }
1052         }
1053         return r;
1054     }
1055 #endif
1056     if (inbuf == 0 || *inbuf == 0)
1057     {
1058         cd->init_flag = 1;
1059         cd->my_errno = YAZ_ICONV_UNKNOWN;
1060         return 0;
1061     }
1062     inbuf0 = *inbuf;
1063
1064     if (cd->init_flag)
1065     {
1066         if (cd->init_handle)
1067         {
1068             size_t no_read;
1069             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1070                                          *inbytesleft, &no_read);
1071             if (r)
1072             {
1073                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1074                     return r;
1075                 cd->init_flag = 0;
1076                 return r;
1077             }
1078             *inbytesleft -= no_read;
1079             *inbuf += no_read;
1080         }
1081         cd->init_flag = 0;
1082         cd->unget_x = 0;
1083         cd->no_read_x = 0;
1084     }
1085     while (1)
1086     {
1087         unsigned long x;
1088         size_t no_read;
1089
1090         if (*inbytesleft == 0)
1091         {
1092             r = *inbuf - inbuf0;
1093             break;
1094         }
1095         if (!cd->unget_x)
1096         {
1097             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1098                                   &no_read);
1099             if (no_read == 0)
1100             {
1101                 r = (size_t)(-1);
1102                 break;
1103             }
1104         }
1105         else
1106         {
1107             x = cd->unget_x;
1108             no_read = cd->no_read_x;
1109         }
1110         if (x)
1111         {
1112             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1113                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1114             if (r)
1115             {
1116                 /* unable to write it. save it because read_handle cannot
1117                    rewind .. */
1118                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1119                 {
1120                     cd->unget_x = x;
1121                     cd->no_read_x = no_read;
1122                     break;
1123                 }
1124             }
1125             cd->unget_x = 0;
1126         }
1127         *inbytesleft -= no_read;
1128         (*inbuf) += no_read;
1129     }
1130     return r;
1131 }
1132
1133 int yaz_iconv_error (yaz_iconv_t cd)
1134 {
1135     return cd->my_errno;
1136 }
1137
1138 int yaz_iconv_close (yaz_iconv_t cd)
1139 {
1140 #if HAVE_ICONV_H
1141     if (cd->iconv_cd)
1142         iconv_close (cd->iconv_cd);
1143 #endif
1144     xfree (cd);
1145     return 0;
1146 }
1147
1148 /*
1149  * Local variables:
1150  * c-basic-offset: 4
1151  * indent-tabs-mode: nil
1152  * End:
1153  * vim: shiftwidth=4 tabstop=8 expandtab
1154  */
1155